browser/components/translation/cld2/internal/lang_script.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/lang_script.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,560 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// File: lang_script.cc
    1.20 +// ================
    1.21 +//
    1.22 +// Author: dsites@google.com (Dick Sites)
    1.23 +//
    1.24 +// This file declares language and script numbers and names for CLD2
    1.25 +//
    1.26 +
    1.27 +#include "lang_script.h"
    1.28 +
    1.29 +#include <stdlib.h>
    1.30 +#include <string.h>
    1.31 +
    1.32 +#include "generated_language.h"
    1.33 +#include "generated_ulscript.h"
    1.34 +
    1.35 +namespace CLD2 {
    1.36 +
    1.37 +// Language tables
    1.38 +// Subscripted by enum Language
    1.39 +extern const int kLanguageToNameSize;
    1.40 +extern const char* const kLanguageToName[];
    1.41 +extern const int kLanguageToCodeSize;
    1.42 +extern const char* const kLanguageToCode[];
    1.43 +extern const int kLanguageToCNameSize;
    1.44 +extern const char* const kLanguageToCName[];
    1.45 +extern const int kLanguageToScriptsSize;
    1.46 +extern const FourScripts kLanguageToScripts[];
    1.47 +
    1.48 +// Subscripted by Language
    1.49 +extern const int kLanguageToPLangSize;
    1.50 +extern const uint8 kLanguageToPLang[];
    1.51 +// Subscripted by per-script language
    1.52 +extern const uint16 kPLangToLanguageLatn[];
    1.53 +extern const uint16 kPLangToLanguageOthr[];
    1.54 +
    1.55 +// Alphabetical order for binary search
    1.56 +extern const int kNameToLanguageSize;
    1.57 +extern const CharIntPair kNameToLanguage[];
    1.58 +extern const int kCodeToLanguageSize;
    1.59 +extern const CharIntPair kCodeToLanguage[];
    1.60 +
    1.61 +// ULScript tables
    1.62 +// Subscripted by enum ULScript
    1.63 +extern const int kULScriptToNameSize;
    1.64 +extern const char* const kULScriptToName[];
    1.65 +extern const int kULScriptToCodeSize;
    1.66 +extern const char* const kULScriptToCode[];
    1.67 +extern const int kULScriptToCNameSize;
    1.68 +extern const char* const kULScriptToCName[];
    1.69 +extern const int kULScriptToRtypeSize;
    1.70 +extern const ULScriptRType kULScriptToRtype[];
    1.71 +extern const int kULScriptToDefaultLangSize;
    1.72 +extern const Language kULScriptToDefaultLang[];
    1.73 +
    1.74 +// Alphabetical order for binary search
    1.75 +extern const int kNameToULScriptSize;
    1.76 +extern const CharIntPair kNameToULScript[];
    1.77 +extern const int kCodeToULScriptSize;
    1.78 +extern const CharIntPair kCodeToULScript[];
    1.79 +
    1.80 +
    1.81 +//
    1.82 +// File: lang_script.h
    1.83 +// ================
    1.84 +//
    1.85 +// Author: dsites@google.com (Dick Sites)
    1.86 +//
    1.87 +// This file declares language and script numbers and names for CLD2
    1.88 +//
    1.89 +
    1.90 +
    1.91 +// NOTE: The script numbers and language numbers here are not guaranteed to be
    1.92 +// stable. If you want to record a result for posterity, save the ISO codes
    1.93 +// as character strings.
    1.94 +//
    1.95 +//
    1.96 +// The Unicode scripts recognized by CLD2 are numbered almost arbitrarily,
    1.97 +// specified in an enum. Each script has human-readable script name and a
    1.98 +// 4-letter ISO 15924 script code. Each has a C name (largely for use by
    1.99 +// programs that generate declarations in cld2_generated_scripts.h). Each
   1.100 +// also has a recognition type
   1.101 +//  r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK
   1.102 +//
   1.103 +// The declarations for a particular version of Unicode are machine-generated in
   1.104 +//   cld2_generated_scripts.h
   1.105 +//
   1.106 +// This file includes that one and declares the access routines. The type
   1.107 +// involved is called "ULScript" to signify Unicode Letters-Marks Scripts,
   1.108 +// which are not quite Unicode Scripts. In particular, the CJK scripts are
   1.109 +// merged into a single number because CLD2 recognizes the CJK languages from
   1.110 +// four scripts intermixed: Hani (both Hans  and Hant), Hangul, Hiragana, and
   1.111 +// Katakana.
   1.112 +
   1.113 +// Each script has one of these four recognition types.
   1.114 +// RTypeNone: There is no language associated with this script. In extended
   1.115 +//  language recognition calls, return a fake language number that maps to
   1.116 +//  xx-Cham, with literally "xx" for the language code, and with the script
   1.117 +//  code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE.
   1.118 +// RTypeOne: The script maps 1:1 to a single language. No letters are examined
   1.119 +//  during recognition and no lookups done.
   1.120 +// RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring
   1.121 +//  is done to determine the languages involved.
   1.122 +// RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the
   1.123 +//  languages involved.
   1.124 +//
   1.125 +// Note that the choice of recognition type is a function of script, not
   1.126 +// language. In particular, some languages are recognized in multiple scripts
   1.127 +// and those have different recognition types (Mongolian mn-Latn vs. mn-Mong
   1.128 +// for example).
   1.129 +
   1.130 +//----------------------------------------------------------------------------//
   1.131 +// Functions of ULScript                                                      //
   1.132 +//----------------------------------------------------------------------------//
   1.133 +
   1.134 +// Each ULScript accessor below treats an out-of-range or unrecognized input
   1.135 +// as UNKNOWN_ULSCRIPT (which never participates in language recognition).
   1.136 +// ULScriptName returns the kULScriptToName entry for the script.
   1.137 +const char* ULScriptName(ULScript ulscript) {
   1.138 +  const int raw = ulscript;
   1.139 +  const bool in_range = (0 <= raw) && (raw < NUM_ULSCRIPTS);
   1.140 +  return kULScriptToName[in_range ? raw : UNKNOWN_ULSCRIPT];
   1.141 +}
   1.142 +
   1.143 +// Returns the kULScriptToCode entry; bad input is read as UNKNOWN_ULSCRIPT.
   1.144 +const char* ULScriptCode(ULScript ulscript) {
   1.145 +  const int raw = ulscript;
   1.146 +  const bool in_range = (0 <= raw) && (raw < NUM_ULSCRIPTS);
   1.147 +  return kULScriptToCode[in_range ? raw : UNKNOWN_ULSCRIPT];
   1.148 +}
   1.149 +
   1.150 +// Returns the kULScriptToCName entry; bad input is read as UNKNOWN_ULSCRIPT.
   1.151 +const char* ULScriptDeclaredName(ULScript ulscript) {
   1.152 +  const int raw = ulscript;
   1.153 +  const bool in_range = (0 <= raw) && (raw < NUM_ULSCRIPTS);
   1.154 +  return kULScriptToCName[in_range ? raw : UNKNOWN_ULSCRIPT];
   1.155 +}
   1.156 +
   1.157 +// Returns the kULScriptToRtype entry; bad input is read as UNKNOWN_ULSCRIPT.
   1.158 +ULScriptRType ULScriptRecognitionType(ULScript ulscript) {
   1.159 +  const int raw = ulscript;
   1.160 +  const bool in_range = (0 <= raw) && (raw < NUM_ULSCRIPTS);
   1.161 +  return kULScriptToRtype[in_range ? raw : UNKNOWN_ULSCRIPT];
   1.162 +}
   1.163 +
   1.164 +
   1.165 +
   1.166 +// The languages recognized by CLD2 are numbered almost arbitrarily,
   1.167 +// specified in an enum. Each language has human-readable language name and a
   1.168 +// 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by
   1.169 +// programs that generate declarations in cld2_generated_languages.h).
   1.170 +// Each has a list of up to four scripts in which it is currently recognized.
   1.171 +//
   1.172 +// The declarations for a particular set of recognized languages are
   1.173 +// machine-generated in
   1.174 +//   cld2_generated_languages.h
   1.175 +//
   1.176 +// The Language enum is intended to match the internal Google Language enum
   1.177 +// in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional
   1.178 +// languages assigned above that. Over time, some languages may be renumbered
   1.179 +// if they are moved into the Language enum.
   1.180 +//
   1.181 +// The Language enum includes the fake language numbers for RTypeNone above.
   1.182 +//
   1.183 +// In an open-source environment, the Google-specific Language enum is not
   1.184 +// available. Language decouples the two environments while maintaining
   1.185 +// internal compatibility.
   1.186 +
   1.187 +
   1.188 +// If the input is out of range or otherwise unrecognized, it is treated
   1.189 +// as UNKNOWN_LANGUAGE
   1.190 +//
   1.191 +// LanguageCode
   1.192 +// ------------
   1.193 +// Given the Language, return the language code, e.g. "ko"
   1.194 +// This is determined by
   1.195 +// the following (in order of preference):
   1.196 +// - ISO-639-1 two-letter language code
   1.197 +//   (all except those mentioned below)
   1.198 +// - ISO-639-2 three-letter bibliographic language code
   1.199 +//   (Tibetan, Dhivehi, Cherokee, Syriac)
   1.200 +// - Google-specific language code
   1.201 +//   (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
   1.202 +//   Portuguese-Portugal, Portuguese-Brazil, Limbu)
   1.203 +// - Fake RTypeNone names.
   1.204 +
   1.205 +//----------------------------------------------------------------------------//
   1.206 +// Functions of Language                                                      //
   1.207 +//----------------------------------------------------------------------------//
   1.208 +
   1.209 +// Returns the kLanguageToName entry; bad input is read as UNKNOWN_LANGUAGE.
   1.210 +const char* LanguageName(Language lang) {
   1.211 +  const int raw = lang;
   1.212 +  const bool in_range = (0 <= raw) && (raw < NUM_LANGUAGES);
   1.213 +  return kLanguageToName[in_range ? raw : UNKNOWN_LANGUAGE];
   1.214 +}
   1.215 +// Returns the kLanguageToCode entry; bad input is read as UNKNOWN_LANGUAGE.
   1.216 +const char* LanguageCode(Language lang) {
   1.217 +  const int raw = lang;
   1.218 +  const bool in_range = (0 <= raw) && (raw < NUM_LANGUAGES);
   1.219 +  return kLanguageToCode[in_range ? raw : UNKNOWN_LANGUAGE];
   1.220 +}
   1.221 +
   1.222 +// Returns the kLanguageToCName entry; bad input is read as UNKNOWN_LANGUAGE.
   1.223 +const char* LanguageDeclaredName(Language lang) {
   1.224 +  const int raw = lang;
   1.225 +  const bool in_range = (0 <= raw) && (raw < NUM_LANGUAGES);
   1.226 +  return kLanguageToCName[in_range ? raw : UNKNOWN_LANGUAGE];
   1.227 +}
   1.228 +
   1.229 +// n is in 0..3; any other n returns UNKNOWN_ULSCRIPT rather than indexing
   1.230 +// outside the four-entry row. Unused trailing entries of a row are filler.
   1.231 +ULScript LanguageRecognizedScript(Language lang, int n) {
   1.232 +  if ((n < 0) || (n > 3)) {return static_cast<ULScript>(UNKNOWN_ULSCRIPT);}
   1.233 +  int i_lang = lang;
   1.234 +  if ((i_lang < 0) || (i_lang >= NUM_LANGUAGES)) {i_lang = UNKNOWN_LANGUAGE;}
   1.235 +  return static_cast<ULScript>(kLanguageToScripts[i_lang][n]);
   1.236 +}
   1.237 +
   1.238 +// Returns the human-readable name of lang, e.g. "Korean", exactly as
   1.239 +// LanguageName() reports it (out-of-range values map to UNKNOWN_LANGUAGE).
   1.240 +// TG_UNKNOWN_LANGUAGE is a placeholder for the "ignore me" language, used to
   1.241 +// subtract out HTML, link farms, DNA strings, and a little English porn.
   1.242 +const char* ExtLanguageName(const Language lang) {
   1.243 +  const char* const name = LanguageName(lang);
   1.244 +  return name;
   1.245 +}
   1.246 +
   1.247 +// Given the Language, return the language code, e.g. "ko"; forwards to LanguageCode.
   1.248 +const char* ExtLanguageCode(const Language lang) {
   1.249 +  const char* const code = LanguageCode(lang);
   1.250 +  return code;
   1.251 +}
   1.252 +
   1.253 +// Returns the Language enum spelling of lang, e.g. "KOREAN", for programs that
   1.254 +// create C declarations; invalid input gets the UNKNOWN_LANGUAGE entry.
   1.255 +const char* ExtLanguageDeclaredName(const Language lang) {
   1.256 +  const char* const c_name = LanguageDeclaredName(lang);
   1.257 +  return c_name;
   1.258 +}
   1.259 +
   1.260 +
   1.261 +extern const int kCloseSetSize = 10;
   1.262 +
   1.263 +// Returns the number (1..9) of the statistically-close language set that
   1.264 +// contains lang; 0 means lang is in none of them.
   1.265 +int LanguageCloseSet(Language lang) {
   1.266 +  // Scaffolding
   1.267 +  // id ms         # INDONESIAN MALAY coef=0.4698    Problematic w/o extra words
   1.268 +  // bo dz         # TIBETAN DZONGKHA coef=0.4571
   1.269 +  // cs sk         # CZECH SLOVAK coef=0.4273
   1.270 +  // zu xh         # ZULU XHOSA coef=0.3716
   1.271 +  //
   1.272 +  // bs hr sr srm  # BOSNIAN CROATIAN SERBIAN MONTENEGRIN
   1.273 +  // hi mr bh ne   # HINDI MARATHI BIHARI NEPALI
   1.274 +  // no nn da      # NORWEGIAN NORWEGIAN_N DANISH
   1.275 +  // gl es pt      # GALICIAN SPANISH PORTUGUESE
   1.276 +  // rw rn         # KINYARWANDA RUNDI
   1.277 +
   1.278 +  // Set 1: id ms
   1.279 +  if ((lang == INDONESIAN) || (lang == MALAY)) {return 1;}
   1.280 +
   1.281 +  // Set 2: bo dz
   1.282 +  if ((lang == TIBETAN) || (lang == DZONGKHA)) {return 2;}
   1.283 +
   1.284 +  // Set 3: cs sk
   1.285 +  if ((lang == CZECH) || (lang == SLOVAK)) {return 3;}
   1.286 +
   1.287 +  // Set 4: zu xh
   1.288 +  if ((lang == ZULU) || (lang == XHOSA)) {return 4;}
   1.289 +
   1.290 +  // Set 5: bs hr sr srm
   1.291 +  if ((lang == BOSNIAN) || (lang == CROATIAN) ||
   1.292 +      (lang == SERBIAN) || (lang == MONTENEGRIN)) {return 5;}
   1.293 +
   1.294 +  // Set 6: hi mr bh ne
   1.295 +  if ((lang == HINDI) || (lang == MARATHI) ||
   1.296 +      (lang == BIHARI) || (lang == NEPALI)) {return 6;}
   1.297 +
   1.298 +  // Set 7: no nn da
   1.299 +  if ((lang == NORWEGIAN) || (lang == NORWEGIAN_N) ||
   1.300 +      (lang == DANISH)) {return 7;}
   1.301 +
   1.302 +  // Set 8: gl es pt
   1.303 +  if ((lang == GALICIAN) || (lang == SPANISH) ||
   1.304 +      (lang == PORTUGUESE)) {return 8;}
   1.305 +
   1.306 +  // Set 9: rw rn
   1.307 +  if ((lang == KINYARWANDA) || (lang == RUNDI)) {return 9;}
   1.308 +
   1.309 +  // Not a member of any close set
   1.310 +  return 0;
   1.311 +}
   1.312 +
   1.313 +//----------------------------------------------------------------------------//
   1.314 +// Functions of ULScript and Language                                         //
   1.315 +//----------------------------------------------------------------------------//
   1.316 +
   1.317 +Language DefaultLanguage(ULScript ulscript) {
   1.318 +  // Scripts outside [0, NUM_ULSCRIPTS) have no entry in the table
   1.319 +  const bool valid = (0 <= ulscript) && (ulscript < NUM_ULSCRIPTS);
   1.320 +  return valid ? kULScriptToDefaultLang[ulscript] : UNKNOWN_LANGUAGE;
   1.321 +}
   1.322 +
   1.323 +// Returns 1 for RTypeNone scripts and 0 for an out-of-range script or language.
   1.324 +uint8 PerScriptNumber(ULScript ulscript, Language lang) {
   1.325 +  if ((ulscript < 0) || (ulscript >= NUM_ULSCRIPTS)) {return 0;}
   1.326 +  if (kULScriptToRtype[ulscript] == RTypeNone) {return 1;}
   1.327 +  if ((lang < 0) || (lang >= kLanguageToPLangSize)) {return 0;}  // negative too
   1.328 +  return kLanguageToPLang[lang];
   1.329 +}
   1.330 +
   1.331 +Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number) {
   1.332 +  if ((ulscript < 0) || (ulscript >= NUM_ULSCRIPTS)) {return UNKNOWN_LANGUAGE;}
   1.333 +  // RTypeNone and RTypeOne scripts answer with the script's default language
   1.334 +  const ULScriptRType rtype = kULScriptToRtype[ulscript];
   1.335 +  if ((rtype == RTypeNone) || (rtype == RTypeOne)) {
   1.336 +    return kULScriptToDefaultLang[ulscript];
   1.337 +  }
   1.338 +  // Latin has its own table; every other script shares the second one.
   1.339 +  // NOTE(review): perscript_number indexes the table unchecked -- presumably
   1.340 +  // both tables cover every uint8 value; confirm against their definitions.
   1.341 +  const uint16* const plang_table = (ulscript == ULScript_Latin) ?
   1.342 +      &kPLangToLanguageLatn[0] : &kPLangToLanguageOthr[0];
   1.343 +  return static_cast<Language>(plang_table[perscript_number]);
   1.344 +}
   1.345 +
   1.346 +// Return true if language can be in the Latin script; false if lang is out of range
   1.347 +bool IsLatnLanguage(Language lang) {
   1.348 +  if ((lang < 0) || (lang >= kLanguageToPLangSize)) {return false;}  // negative too
   1.349 +  return (lang == kPLangToLanguageLatn[kLanguageToPLang[lang]]);
   1.350 +}
   1.351 +
   1.352 +// Return true if language can be in a non-Latin script; false if lang is out of range
   1.353 +bool IsOthrLanguage(Language lang) {
   1.354 +  if ((lang < 0) || (lang >= kLanguageToPLangSize)) {return false;}  // negative too
   1.355 +  return (lang == kPLangToLanguageOthr[kLanguageToPLang[lang]]);
   1.356 +}
   1.357 +
   1.358 +
   1.359 +//----------------------------------------------------------------------------//
   1.360 +// Other                                                                      //
   1.361 +//----------------------------------------------------------------------------//
   1.362 +
   1.363 +// Returns mid if key found in lo <= mid < hi, else -1.
   1.364 +// cipair[lo..hi) must already be ordered by strcmp() on its .s member;
   1.365 +// the callers in this file pass the alphabetically sorted lookup tables.
   1.366 +int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) {
   1.367 +  while (lo < hi) {
   1.368 +    // Same midpoint as (lo + hi) >> 1, computed so that the sum of the two
   1.369 +    // bounds is never formed and so cannot overflow int
   1.370 +    const int mid = lo + ((hi - lo) >> 1);
   1.371 +    // Compare once per probe and branch on the saved result
   1.372 +    const int order = strcmp(key, cipair[mid].s);
   1.373 +    if (order == 0) {return mid;}
   1.374 +    if (order < 0) {hi = mid;} else {lo = mid + 1;}
   1.375 +  }
   1.376 +  return -1;
   1.377 +}
   1.378 +
   1.379 +// Converts a table value to Language; no range check is applied.
   1.380 +Language MakeLang(int i) {return static_cast<Language>(i);}
   1.381 +
   1.382 +// Looks code up in kCodeToLanguage via BinarySearch. On a hit stores the
   1.383 +// Language in *lang and returns true; on a miss leaves *lang untouched.
   1.384 +static bool LanguageFromCode(const char* code, Language* lang) {
   1.385 +  const int match = BinarySearch(code, 0, kCodeToLanguageSize, kCodeToLanguage);
   1.386 +  if (match < 0) {return false;}
   1.387 +  *lang = MakeLang(kCodeToLanguage[match].i);
   1.388 +  return true;
   1.389 +}
   1.390 +
   1.391 +// Name can be either full name or ISO code, or can be ISO code embedded in
   1.392 +// a language-script combination such as "ABKHAZIAN", "en", "en-Latn-GB"
   1.393 +//
   1.394 +// Lookup order, first hit wins:
   1.395 +//   no hyphen:   full-name table, then code table
   1.396 +//   one hyphen:  aa-bb, then aa
   1.397 +//   two or more: aa-bb-cc, then aa-bb, then aa-cc, then aa
   1.398 +// Hyphenated candidates are built in a 16-byte scratch buffer, so only
   1.399 +// inputs shorter than that are broken apart.
   1.400 +//
   1.401 +// Returns UNKNOWN_LANGUAGE when nothing matches or when src is NULL.
   1.402 +Language GetLanguageFromName(const char* src) {
   1.403 +  // strchr/strlen on a null pointer are undefined; treat it as "no match"
   1.404 +  if (src == NULL) {return UNKNOWN_LANGUAGE;}
   1.405 +
   1.406 +  Language lang = UNKNOWN_LANGUAGE;   // Stays this way unless a lookup hits
   1.407 +  const char* hyphen1 = strchr(src, '-');
   1.408 +  if (hyphen1 == NULL) {
   1.409 +    // Bare name. Look at full name, then code
   1.410 +    const int match = BinarySearch(src, 0, kNameToLanguageSize, kNameToLanguage);
   1.411 +    if (match >= 0) {return MakeLang(kNameToLanguage[match].i);}    // aa
   1.412 +    LanguageFromCode(src, &lang);                                     // aa
   1.413 +    return lang;
   1.414 +  }
   1.415 +
   1.416 +  // Hyphenated. Not a full name; must be code-something. Try it whole first,
   1.417 +  // e.g. zh-TW or en-Latn-US
   1.418 +  if (LanguageFromCode(src, &lang)) {return lang;}    // aa-bb or aa-bb-cc
   1.419 +
   1.420 +  // size_t keeps the length check exact; an int could wrap for huge inputs
   1.421 +  const size_t len = strlen(src);
   1.422 +  if (len >= 16) {return UNKNOWN_LANGUAGE;}   // Real codes are shorter
   1.423 +
   1.424 +  // Scratch space for the shortened candidates; len < 16 so every copy fits
   1.425 +  char temp[16];
   1.426 +  const size_t hyphen1_offset = static_cast<size_t>(hyphen1 - src);
   1.427 +
   1.428 +  // aa-bb-cc (or longer): try the two-part forms before the bare code
   1.429 +  const char* hyphen2 = strchr(hyphen1 + 1, '-');
   1.430 +  if (hyphen2 != NULL) {
   1.431 +    const size_t hyphen2_offset = static_cast<size_t>(hyphen2 - src);
   1.432 +
   1.433 +    // Take off part after hyphen2
   1.434 +    memcpy(temp, src, hyphen2_offset);
   1.435 +    temp[hyphen2_offset] = '\0';
   1.436 +    if (LanguageFromCode(temp, &lang)) {return lang;}    // aa-bb
   1.437 +
   1.438 +    // Take off part between hyphen1 and hyphen2; the tail keeps its hyphen
   1.439 +    const size_t len2 = len - hyphen2_offset;
   1.440 +    memcpy(temp, src, hyphen1_offset);
   1.441 +    memcpy(&temp[hyphen1_offset], hyphen2, len2);
   1.442 +    temp[hyphen1_offset + len2] = '\0';
   1.443 +    if (LanguageFromCode(temp, &lang)) {return lang;}    // aa-cc
   1.444 +  }
   1.445 +
   1.446 +  // Take off everything after hyphen1
   1.447 +  memcpy(temp, src, hyphen1_offset);
   1.448 +  temp[hyphen1_offset] = '\0';
   1.449 +  LanguageFromCode(temp, &lang);                        // aa
   1.450 +  // A miss here leaves lang as UNKNOWN_LANGUAGE
   1.451 +  return lang;
   1.452 +}
   1.453 +
   1.454 +
   1.455 +// Name can be either full name or ISO code, or can be ISO code embedded in
   1.456 +// a language-script combination such as "en-Latn-GB"
   1.457 +// MORE WORK to do here. also kLanguageToScripts [4] is bogus
   1.458 +// if bare language name, no script, want  zh, ja, ko to Hani, pt to Latn, etc.
   1.459 +// Something like map code to Language, then Language to kLanguageToScripts[x][0]
   1.460 +// ADD BIAS: kLanguageToScripts lists default script first
   1.461 +// If total mismatch, return Latn
   1.462 +//   if (strcmp(src, "nd") == 0) {return NDEBELE;}         // [nd was wrong]
   1.463 +//   if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;}
   1.464 +
   1.465 +ULScript MakeULScr(int i) {return static_cast<ULScript>(i);}
   1.466 +
   1.467 +// Looks code up in kCodeToULScript via BinarySearch. On a hit stores the
   1.468 +// script in *ulscript and returns true; on a miss leaves it untouched.
   1.469 +static bool ULScriptFromCode(const char* code, ULScript* ulscript) {
   1.470 +  const int match = BinarySearch(code, 0, kCodeToULScriptSize, kCodeToULScript);
   1.471 +  if (match < 0) {return false;}
   1.472 +  *ulscript = MakeULScr(kCodeToULScript[match].i);
   1.473 +  return true;
   1.474 +}
   1.475 +
   1.476 +// Maps a script name, script code, or language-script combination such as
   1.477 +// "en-Latn-GB" to a ULScript. Lookup order, first hit wins:
   1.478 +//   no hyphen:   script name, script code, then the first script listed for
   1.479 +//                the language that GetLanguageFromName finds
   1.480 +//   one hyphen:  fixed special cases, aa-bb, bb, aa
   1.481 +//   two or more: fixed special cases, aa-bb-cc, bb, cc, aa
   1.482 +// Returns ULScript_Latin when nothing matches, when src is NULL, or when a
   1.483 +// hyphenated src is 16 or more characters long and is not itself a known code.
   1.484 +ULScript GetULScriptFromName(const char* src) {
   1.485 +  // strchr/strlen on a null pointer are undefined; answer as for a mismatch
   1.486 +  if (src == NULL) {return ULScript_Latin;}
   1.487 +
   1.488 +  ULScript ulscript = ULScript_Latin;   // Stays this way unless a lookup hits
   1.489 +  const char* hyphen1 = strchr(src, '-');
   1.490 +  if (hyphen1 == NULL) {
   1.491 +    // Bare name. Look at full name, then code, then try backmapping as Language
   1.492 +    const int match = BinarySearch(src, 0, kNameToULScriptSize, kNameToULScript);
   1.493 +    if (match >= 0) {return MakeULScr(kNameToULScript[match].i);}    // aa
   1.494 +    if (ULScriptFromCode(src, &ulscript)) {return ulscript;}         // aa
   1.495 +
   1.496 +    const Language backmap_me = GetLanguageFromName(src);
   1.497 +    if (backmap_me != UNKNOWN_LANGUAGE) {
   1.498 +      return static_cast<ULScript>(kLanguageToScripts[backmap_me][0]);
   1.499 +    }
   1.500 +    return ULScript_Latin;
   1.501 +  }
   1.502 +
   1.503 +  const char* hyphen2 = strchr(hyphen1 + 1, '-');
   1.504 +  if (hyphen2 == NULL) {
   1.505 +    // aa-bb. Not a full name; must be code-something. Special cases first
   1.506 +    if (strcmp(src, "zh-TW") == 0) {return ULScript_Hani;}
   1.507 +    if (strcmp(src, "zh-CN") == 0) {return ULScript_Hani;}
   1.508 +    if (strcmp(src, "sit-NP") == 0) {return ULScript_Limbu;}
   1.509 +    if (strcmp(src, "sit-Limb") == 0) {return ULScript_Limbu;}
   1.510 +    if (strcmp(src, "sr-ME") == 0) {return ULScript_Latin;}
   1.511 +  } else {
   1.512 +    // aa-bb-cc. Must be code-something. Special cases first
   1.513 +    if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;}
   1.514 +    if (strcmp(src, "sr-ME-Latn") == 0) {return ULScript_Latin;}
   1.515 +    if (strcmp(src, "sr-ME-Cyrl") == 0) {return ULScript_Cyrillic;}
   1.516 +  }
   1.517 +  if (ULScriptFromCode(src, &ulscript)) {return ulscript;}   // aa-bb or aa-bb-cc
   1.518 +
   1.519 +  // size_t keeps the length check exact; an int could wrap for huge inputs
   1.520 +  const size_t len = strlen(src);
   1.521 +  if (len >= 16) {return ULScript_Latin;}   // Real codes are shorter
   1.522 +
   1.523 +  char temp[16];
   1.524 +  const size_t hyphen1_offset = static_cast<size_t>(hyphen1 - src);
   1.525 +  if (hyphen2 == NULL) {
   1.526 +    // Keep part after hyphen1
   1.527 +    const size_t len1 = len - hyphen1_offset - 1;    // Exclude the hyphen
   1.528 +    memcpy(temp, hyphen1 + 1, len1);
   1.529 +    temp[len1] = '\0';
   1.530 +    if (ULScriptFromCode(temp, &ulscript)) {return ulscript;}    // bb
   1.531 +  } else {
   1.532 +    const size_t hyphen2_offset = static_cast<size_t>(hyphen2 - src);
   1.533 +    // Keep part between hyphen1 and hyphen2
   1.534 +    const size_t lenmid = hyphen2_offset - hyphen1_offset - 1;   // Exclude the hyphen
   1.535 +    memcpy(temp, hyphen1 + 1, lenmid);
   1.536 +    temp[lenmid] = '\0';
   1.537 +    if (ULScriptFromCode(temp, &ulscript)) {return ulscript;}    // bb
   1.538 +
   1.539 +    // Keep part after hyphen2
   1.540 +    const size_t len2 = len - hyphen2_offset - 1;                // Exclude the hyphen
   1.541 +    memcpy(temp, hyphen2 + 1, len2);
   1.542 +    temp[len2] = '\0';
   1.543 +    if (ULScriptFromCode(temp, &ulscript)) {return ulscript;}    // cc
   1.544 +  }
   1.545 +
   1.546 +  // Keep part before hyphen1
   1.547 +  memcpy(temp, src, hyphen1_offset);
   1.548 +  temp[hyphen1_offset] = '\0';
   1.549 +  ULScriptFromCode(temp, &ulscript);                              // aa
   1.550 +  // A miss here leaves ulscript as ULScript_Latin
   1.551 +  return ulscript;
   1.552 +}
   1.553 +
   1.554 +// Buckets a script into four classes: 0 Latin, 1 Cyrillic, 2 Arabic, 3 other
   1.555 +int LScript4(ULScript ulscript) {
   1.556 +  return (ulscript == ULScript_Latin)    ? 0
   1.557 +       : (ulscript == ULScript_Cyrillic) ? 1
   1.558 +       : (ulscript == ULScript_Arabic)   ? 2
   1.559 +                                         : 3;
   1.560 +}
   1.561 +
   1.562 +}  // namespace CLD2
   1.563 +

mercurial