Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // File: lang_script.h |
michael@0 | 17 | // ================ |
michael@0 | 18 | // |
michael@0 | 19 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 20 | // |
michael@0 | 21 | // This file declares language and script numbers and names for CLD2, |
michael@0 | 22 | // plus routines that access side tables based on these |
michael@0 | 23 | // |
michael@0 | 24 | |
michael@0 | 25 | #ifndef I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__ |
michael@0 | 26 | #define I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__ |
michael@0 | 27 | |
michael@0 | 28 | #include "generated_language.h" |
michael@0 | 29 | #include "generated_ulscript.h" |
michael@0 | 30 | #include "integral_types.h" |
michael@0 | 31 | |
michael@0 | 32 | |
michael@0 | 33 | // NOTE: The script numbers and language numbers here are not guaranteed to be |
michael@0 | 34 | // stable. If you want to record a result for posterity, save the |
michael@0 | 35 | // ULScriptCode(ULScript ulscript) result as character strings. |
michael@0 | 36 | // |
michael@0 | 37 | // The Unicode scripts recognized by CLD2 are numbered almost arbitrarily, |
michael@0 | 38 | // specified in an enum. Each script has a human-readable script name and a |
michael@0 | 39 | // 4-letter ISO 15924 script code. Each has a C name (largely for use by |
michael@0 | 40 | // programs that generate declarations in cld2_generated_scripts.h). Each |
michael@0 | 41 | // also has a recognition type |
michael@0 | 42 | // r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK |
michael@0 | 43 | // |
michael@0 | 44 | // The declarations for a particular version of Unicode are machine-generated in |
michael@0 | 45 | // generated_ulscript.h |
michael@0 | 46 | // |
michael@0 | 47 | // This file includes that one and declares the access routines. The type |
michael@0 | 48 | // involved is called "ULScript" to signify Unicode Letters-Marks Scripts, |
michael@0 | 49 | // which are not quite Unicode Scripts. In particular, the CJK scripts are |
michael@0 | 50 | // merged into a single number because CLD2 recognizes the CJK languages from |
michael@0 | 51 | // four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and |
michael@0 | 52 | // Katakana. |
michael@0 | 53 | |
michael@0 | 54 | // Each script has one of these four recognition types. |
michael@0 | 55 | // RTypeNone: There is no language associated with this script. In extended |
michael@0 | 56 | // language recognition calls, return a fake language number that maps to |
michael@0 | 57 | // xx-Cham, with literally "xx" for the language code, and with the script |
michael@0 | 58 | // code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE. |
michael@0 | 59 | // RTypeOne: The script maps 1:1 to a single language. No letters are examined |
michael@0 | 60 | // during recognition and no lookups done. |
michael@0 | 61 | // RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring |
michael@0 | 62 | // is done to determine the languages involved. |
michael@0 | 63 | // RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the |
michael@0 | 64 | // languages involved. |
michael@0 | 65 | // |
michael@0 | 66 | // Note that the choice of recognition type is a function of script, not |
michael@0 | 67 | // language. In particular, some languages are recognized in multiple scripts |
michael@0 | 68 | // and those have different recognition types (Mongolian mn-Latn vs. mn-Mong |
michael@0 | 69 | // for example). |
michael@0 | 70 | |
michael@0 | 71 | namespace CLD2 { |
michael@0 | 72 | |
michael@0 | 73 | //----------------------------------------------------------------------------// |
michael@0 | 74 | // Functions of ULScript // |
michael@0 | 75 | //----------------------------------------------------------------------------// |
michael@0 | 76 | |
michael@0 | 77 | // If the input is out of range or otherwise unrecognized, it is treated |
michael@0 | 78 | // as ULScript_Common (which never participates in language recognition) |
michael@0 | 79 | const char* ULScriptName(ULScript ulscript); |
michael@0 | 80 | const char* ULScriptCode(ULScript ulscript); |
michael@0 | 81 | const char* ULScriptDeclaredName(ULScript ulscript); |
michael@0 | 82 | ULScriptRType ULScriptRecognitionType(ULScript ulscript); |
michael@0 | 83 | |
michael@0 | 84 | // Name can be either full name or ISO code, or can be ISO code embedded in |
michael@0 | 85 | // a language-script combination such as "en-Latn-GB" |
michael@0 | 86 | ULScript GetULScriptFromName(const char* src); |
michael@0 | 87 | |
michael@0 | 88 | // Map script into Latin, Cyrillic, Arabic, Other |
michael@0 | 89 | int LScript4(ULScript ulscript); |
michael@0 | 90 | |
michael@0 | 91 | //----------------------------------------------------------------------------// |
michael@0 | 92 | // Functions of Language // |
michael@0 | 93 | //----------------------------------------------------------------------------// |
michael@0 | 94 | |
michael@0 | 95 | // The languages recognized by CLD2 are numbered almost arbitrarily, |
michael@0 | 96 | // specified in an enum. Each language has a human-readable language name and a |
michael@0 | 97 | // 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by |
michael@0 | 98 | // programs that generate declarations in cld2_generated_languages.h). |
michael@0 | 99 | // Each has a list of up to four scripts in which it is currently recognized. |
michael@0 | 100 | // |
michael@0 | 101 | // The declarations for a particular set of recognized languages are |
michael@0 | 102 | // machine-generated in |
michael@0 | 103 | // generated_language.h |
michael@0 | 104 | // |
michael@0 | 105 | // The Language enum is intended to match the internal Google Language enum |
michael@0 | 106 | // in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional |
michael@0 | 107 | // languages assigned above that. Over time, some languages may be renumbered |
michael@0 | 108 | // if they are moved into the Language enum. |
michael@0 | 109 | // |
michael@0 | 110 | // The Language enum includes the fake language numbers for RTypeNone above. |
michael@0 | 111 | // |
michael@0 | 112 | |
michael@0 | 113 | |
michael@0 | 114 | // If the input is out of range or otherwise unrecognized, it is treated |
michael@0 | 115 | // as UNKNOWN_LANGUAGE |
michael@0 | 116 | // |
michael@0 | 117 | // LanguageCode |
michael@0 | 118 | // ------------ |
michael@0 | 119 | // Given the Language, return the language code, e.g. "ko" |
michael@0 | 120 | // This is determined by |
michael@0 | 121 | // the following (in order of preference): |
michael@0 | 122 | // - ISO-639-1 two-letter language code |
michael@0 | 123 | // (all except those mentioned below) |
michael@0 | 124 | // - ISO-639-2 three-letter bibliographic language code |
michael@0 | 125 | // (Tibetan, Dhivehi, Cherokee, Syriac) |
michael@0 | 126 | // - Google-specific language code |
michael@0 | 127 | // (ChineseT ("zh-TW"), Teragram Unknown, Unknown, |
michael@0 | 128 | // Portuguese-Portugal, Portuguese-Brazil, Limbu) |
michael@0 | 129 | // - Fake RTypeNone names. |
michael@0 | 130 | |
michael@0 | 131 | const char* LanguageName(Language lang); |
michael@0 | 132 | const char* LanguageCode(Language lang); |
michael@0 | 133 | const char* LanguageShortCode(Language lang); |
michael@0 | 134 | const char* LanguageDeclaredName(Language lang); |
michael@0 | 135 | |
michael@0 | 136 | // n is in 0..3. Trailing entries are filled with |
michael@0 | 137 | // ULScript_Common (which never participates in language recognition) |
michael@0 | 138 | ULScript LanguageRecognizedScript(Language lang, int n); |
michael@0 | 139 | |
michael@0 | 140 | // Name can be either full name or ISO code, or can be ISO code embedded in |
michael@0 | 141 | // a language-script combination such as "en-Latn-GB" |
michael@0 | 142 | Language GetLanguageFromName(const char* src); |
michael@0 | 143 | |
michael@0 | 144 | // Returns which set of statistically-close languages lang is in. 0 means none. |
michael@0 | 145 | int LanguageCloseSet(Language lang); |
michael@0 | 146 | |
michael@0 | 147 | //----------------------------------------------------------------------------// |
michael@0 | 148 | // Functions of ULScript and Language // |
michael@0 | 149 | //----------------------------------------------------------------------------// |
michael@0 | 150 | |
michael@0 | 151 | // Most common language in each script |
michael@0 | 152 | Language DefaultLanguage(ULScript ulscript); |
michael@0 | 153 | |
michael@0 | 154 | // For RTypeMany recognition, |
michael@0 | 155 | // the CLD2 lookup tables are kept small by encoding a language into one byte. |
michael@0 | 156 | // To avoid limiting CLD2 to at most 256 languages, a larger range of external |
michael@0 | 157 | // Language numbers is mapped to a smaller range of per-script numbers. At |
michael@0 | 158 | // the moment (January 2013) the Latin script has about 90 languages to be |
michael@0 | 159 | // recognized, while all the other scripts total about 50 more languages. In |
michael@0 | 160 | // addition, the RTypeNone scripts map to about 100 fake languages. |
michael@0 | 161 | // So we map all Latin-script languages to one range of 1..255 per-script |
michael@0 | 162 | // numbers and map all the other RTypeMany languages to an overlapping range |
michael@0 | 163 | // 1..255 of per-script numbers. |
michael@0 | 164 | |
michael@0 | 165 | uint8 PerScriptNumber(ULScript ulscript, Language lang); |
michael@0 | 166 | Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number); |
michael@0 | 167 | |
michael@0 | 168 | // While the speed-sensitive processing deals with per-script language numbers, |
michael@0 | 169 | // there is a need for low-performance dealing with original language numbers |
michael@0 | 170 | // and unknown scripts, mostly for processing language hints. |
michael@0 | 171 | // These routines let one derive a script class from a bare language. |
michael@0 | 172 | // For languages written in multiple scripts, both of these can return true. |
michael@0 | 173 | |
michael@0 | 174 | bool IsLatnLanguage(Language lang); |
michael@0 | 175 | bool IsOthrLanguage(Language lang); |
michael@0 | 176 | |
michael@0 | 177 | |
michael@0 | 178 | //----------------------------------------------------------------------------// |
michael@0 | 179 | // Other // |
michael@0 | 180 | //----------------------------------------------------------------------------// |
michael@0 | 181 | |
michael@0 | 182 | // Utility routine to search alphabetical tables |
michael@0 | 183 | int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair); |
michael@0 | 184 | |
michael@0 | 185 | } // namespace CLD2 |
michael@0 | 186 | |
michael@0 | 187 | #endif // I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__ |