1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/lang_script.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,187 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// File: lang_script.h 1.20 +// ================ 1.21 +// 1.22 +// Author: dsites@google.com (Dick Sites) 1.23 +// 1.24 +// This file declares language and script numbers and names for CLD2, 1.25 +// plus routines that access side tables based on these. 1.26 +// 1.27 + 1.28 +#ifndef I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__ 1.29 +#define I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__ 1.30 + 1.31 +#include "generated_language.h" 1.32 +#include "generated_ulscript.h" 1.33 +#include "integral_types.h" 1.34 + 1.35 + 1.36 +// NOTE: The script numbers and language numbers here are not guaranteed to be 1.37 +// stable. If you want to record a result for posterity, save the 1.38 +// ULScriptCode(ULScript ulscript) result as character strings. 1.39 +// 1.40 +// The Unicode scripts recognized by CLD2 are numbered almost arbitrarily, 1.41 +// specified in an enum. Each script has a human-readable script name and a 1.42 +// 4-letter ISO 15924 script code. Each has a C name (largely for use by 1.43 +// programs that generate declarations in generated_ulscript.h).
Each 1.44 +// also has a recognition type 1.45 +// r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK 1.46 +// 1.47 +// The declarations for a particular version of Unicode are machine-generated in 1.48 +// generated_ulscript.h 1.49 +// 1.50 +// This file includes that one and declares the access routines. The type 1.51 +// involved is called "ULScript" to signify Unicode Letters-Marks Scripts, 1.52 +// which are not quite Unicode Scripts. In particular, the CJK scripts are 1.53 +// merged into a single number because CLD2 recognizes the CJK languages from 1.54 +// four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and 1.55 +// Katakana. 1.56 + 1.57 +// Each script has one of these four recognition types. 1.58 +// RTypeNone: There is no language associated with this script. In extended 1.59 +// language recognition calls, return a fake language number that maps to 1.60 +// xx-Cham, with literally "xx" for the language code, and with the script 1.61 +// code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE. 1.62 +// RTypeOne: The script maps 1:1 to a single language. No letters are examined 1.63 +// during recognition and no lookups done. 1.64 +// RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring 1.65 +// is done to determine the languages involved. 1.66 +// RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the 1.67 +// languages involved. 1.68 +// 1.69 +// Note that the choice of recognition type is a function of script, not 1.70 +// language. In particular, some languages are recognized in multiple scripts 1.71 +// and those have different recognition types (Mongolian mn-Latn vs. mn-Mong 1.72 +// for example).
1.73 + 1.74 +namespace CLD2 { 1.75 + 1.76 +//----------------------------------------------------------------------------// 1.77 +// Functions of ULScript // 1.78 +//----------------------------------------------------------------------------// 1.79 + 1.80 +// If the input is out of range or otherwise unrecognized, it is treated 1.81 +// as ULScript_Common (which never participates in language recognition). 1.82 +const char* ULScriptName(ULScript ulscript); 1.83 +const char* ULScriptCode(ULScript ulscript); 1.84 +const char* ULScriptDeclaredName(ULScript ulscript); 1.85 +ULScriptRType ULScriptRecognitionType(ULScript ulscript); 1.86 + 1.87 +// Name can be either full name or ISO code, or can be ISO code embedded in 1.88 +// a language-script combination such as "en-Latn-GB". 1.89 +ULScript GetULScriptFromName(const char* src); 1.90 + 1.91 +// Map script into Latin, Cyrillic, Arabic, Other 1.92 +int LScript4(ULScript ulscript); 1.93 + 1.94 +//----------------------------------------------------------------------------// 1.95 +// Functions of Language // 1.96 +//----------------------------------------------------------------------------// 1.97 + 1.98 +// The languages recognized by CLD2 are numbered almost arbitrarily, 1.99 +// specified in an enum. Each language has a human-readable language name and a 1.100 +// 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by 1.101 +// programs that generate declarations in generated_language.h). 1.102 +// Each has a list of up to four scripts in which it is currently recognized. 1.103 +// 1.104 +// The declarations for a particular set of recognized languages are 1.105 +// machine-generated in 1.106 +// generated_language.h 1.107 +// 1.108 +// The Language enum is intended to match the internal Google Language enum 1.109 +// in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional 1.110 +// languages assigned above that. 
Over time, some languages may be renumbered 1.111 +// if they are moved into the Language enum. 1.112 +// 1.113 +// The Language enum includes the fake language numbers for RTypeNone above. 1.114 +// 1.115 + 1.116 + 1.117 +// If the input is out of range or otherwise unrecognized, it is treated 1.118 +// as UNKNOWN_LANGUAGE 1.119 +// 1.120 +// LanguageCode 1.121 +// ------------ 1.122 +// Given the Language, return the language code, e.g. "ko" 1.123 +// This is determined by 1.124 +// the following (in order of preference): 1.125 +// - ISO-639-1 two-letter language code 1.126 +// (all except those mentioned below) 1.127 +// - ISO-639-2 three-letter bibliographic language code 1.128 +// (Tibetan, Dhivehi, Cherokee, Syriac) 1.129 +// - Google-specific language code 1.130 +// (ChineseT ("zh-TW"), Teragram Unknown, Unknown, 1.131 +// Portuguese-Portugal, Portuguese-Brazil, Limbu) 1.132 +// - Fake RTypeNone names. NOTE(review): LanguageShortCode is not described in this header; how it differs from LanguageCode must be confirmed against the implementation. 1.133 + 1.134 +const char* LanguageName(Language lang); 1.135 +const char* LanguageCode(Language lang); 1.136 +const char* LanguageShortCode(Language lang); 1.137 +const char* LanguageDeclaredName(Language lang); 1.138 + 1.139 +// n is in 0..3. Trailing entries are filled with 1.140 +// ULScript_Common (which never participates in language recognition) 1.141 +ULScript LanguageRecognizedScript(Language lang, int n); 1.142 + 1.143 +// Name can be either full name or ISO code, or can be ISO code embedded in 1.144 +// a language-script combination such as "en-Latn-GB" 1.145 +Language GetLanguageFromName(const char* src); 1.146 + 1.147 +// Returns which set of statistically-close languages lang is in. 0 means none. 
1.148 +int LanguageCloseSet(Language lang); 1.149 + 1.150 +//----------------------------------------------------------------------------// 1.151 +// Functions of ULScript and Language // 1.152 +//----------------------------------------------------------------------------// 1.153 + 1.154 +// Most common language in each script 1.155 +Language DefaultLanguage(ULScript ulscript); 1.156 + 1.157 +// For RTypeMany recognition, 1.158 +// the CLD2 lookup tables are kept small by encoding a language into one byte. 1.159 +// To avoid limiting CLD2 to at most 256 languages, a larger range of external 1.160 +// Language numbers is mapped to a smaller range of per-script numbers. At 1.161 +// the moment (January 2013) the Latin script has about 90 languages to be 1.162 +// recognized, while all the other scripts total about 50 more languages. In 1.163 +// addition, the RTypeNone scripts map to about 100 fake languages. 1.164 +// So we map all Latin-script languages to one range of 1..255 per-script 1.165 +// numbers and map all the other RTypeMany languages to an overlapping range 1.166 +// 1..255 of per-script numbers. 1.167 + 1.168 +uint8 PerScriptNumber(ULScript ulscript, Language lang); 1.169 +Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number); 1.170 + 1.171 +// While the speed-sensitive processing deals with per-script language numbers, 1.172 +// there is also a need for non-speed-critical handling of original language numbers 1.173 +// and unknown scripts, mostly for processing language hints. 1.174 +// These routines let one derive a script class from a bare language. 1.175 +// For languages written in multiple scripts, both of these can return true. 
1.176 + 1.177 +bool IsLatnLanguage(Language lang); 1.178 +bool IsOthrLanguage(Language lang); 1.179 + 1.180 + 1.181 +//----------------------------------------------------------------------------// 1.182 +// Other // 1.183 +//----------------------------------------------------------------------------// 1.184 + 1.185 +// Utility routine to search alphabetical tables. NOTE(review): CharIntPair is not declared in this header; presumably it comes from one of the included headers -- confirm. Whether lo/hi are inclusive bounds and what is returned when the key is absent are not visible here. 1.186 +int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair); 1.187 + 1.188 +} // namespace CLD2 1.189 + 1.190 +#endif // I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__