|
1 // Copyright 2013 Google Inc. All Rights Reserved. |
|
2 // |
|
3 // Licensed under the Apache License, Version 2.0 (the "License"); |
|
4 // you may not use this file except in compliance with the License. |
|
5 // You may obtain a copy of the License at |
|
6 // |
|
7 // http://www.apache.org/licenses/LICENSE-2.0 |
|
8 // |
|
9 // Unless required by applicable law or agreed to in writing, software |
|
10 // distributed under the License is distributed on an "AS IS" BASIS, |
|
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
12 // See the License for the specific language governing permissions and |
|
13 // limitations under the License. |
|
14 |
|
15 // |
|
16 // File: lang_script.h |
|
17 // ================ |
|
18 // |
|
19 // Author: dsites@google.com (Dick Sites) |
|
20 // |
|
21 // This file declares language and script numbers and names for CLD2, |
|
22 // plus routines that access side tables based on these |
|
23 // |
|
24 |
|
25 #ifndef I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__ |
|
26 #define I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__ |
|
27 |
|
28 #include "generated_language.h" |
|
29 #include "generated_ulscript.h" |
|
30 #include "integral_types.h" |
|
31 |
|
32 |
|
33 // NOTE: The script numbers and language numbers here are not guaranteed to be |
|
34 // stable. If you want to record a result for posterity, save the |
|
35 // ULScriptCode(ULScript ulscript) result as character strings. |
|
36 // |
|
37 // The Unicode scripts recognized by CLD2 are numbered almost arbitrarily, |
|
38 // specified in an enum. Each script has human-readable script name and a |
|
39 // 4-letter ISO 15924 script code. Each has a C name (largely for use by |
|
40 // programs that generate declarations in cld2_generated_scripts.h). Each |
|
41 // also has a recognition type |
|
42 // r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK |
|
43 // |
|
44 // The declarations for a particular version of Unicode are machine-generated in |
|
45 // generated_ulscript.h |
|
46 // |
|
47 // This file includes that one and declares the access routines. The type |
|
48 // involved is called "ULScript" to signify Unicode Letters-Marks Scripts, |
|
49 // which are not quite Unicode Scripts. In particular, the CJK scripts are |
|
50 // merged into a single number because CLD2 recognizes the CJK languages from |
|
51 // four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and |
|
52 // Katakana. |
|
53 |
|
54 // Each script has one of these four recognition types. |
|
55 // RTypeNone: There is no language associated with this script. In extended |
|
56 // language recognition calls, return a fake language number that maps to |
|
57 // xx-Cham, with literally "xx" for the language code, and with the script |
|
58 // code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE. |
|
59 // RTypeOne: The script maps 1:1 to a single language. No letters are examined |
|
60 // during recognition and no lookups done. |
|
61 // RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring |
|
62 // is done to determine the languages involved. |
|
63 // RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the |
|
64 // languages involved. |
|
65 // |
|
66 // Note that the choice of recognition type is a function of script, not |
|
67 // language. In particular, some languages are recognized in multiple scripts |
|
68 // and those have different recognition types (Mongolian mn-Latn vs. mn-Mong |
|
69 // for example). |
|
70 |
|
71 namespace CLD2 { |
|
72 |
|
73 //----------------------------------------------------------------------------// |
|
74 // Functions of ULScript // |
|
75 //----------------------------------------------------------------------------// |
|
76 |
|
77 // If the input is out of range or otherwise unrecognized, it is treated |
|
78 // as ULScript_Common (which never participates in language recognition) |
|
// Accessors for the per-script side tables. Judging by the function names and
// the file-level description of what each script carries (TODO confirm against
// the implementation):
//   ULScriptName            - human-readable script name
//   ULScriptCode            - 4-letter ISO 15924 script code
//   ULScriptDeclaredName    - C name used by the generated declarations
//   ULScriptRecognitionType - recognition type (RTypeNone/One/Many/CJK)
// NOTE(review): ownership and lifetime of the returned strings are not stated
// in this header -- presumably pointers into static tables; confirm before
// freeing or storing them long-term.
79 const char* ULScriptName(ULScript ulscript); |
|
80 const char* ULScriptCode(ULScript ulscript); |
|
81 const char* ULScriptDeclaredName(ULScript ulscript); |
|
82 ULScriptRType ULScriptRecognitionType(ULScript ulscript); |
|
83 |
|
84 // Name can be either full name or ISO code, or can be ISO code embedded in |
|
85 // a language-script combination such as "en-Latn-GB" |
|
// Reverse lookup: text -> ULScript.
// NOTE(review): the result for a null or unrecognized src is not documented
// in this header -- presumably ULScript_Common; confirm in the implementation.
86 ULScript GetULScriptFromName(const char* src); |
|
87 |
|
88 // Map script into Latin, Cyrillic, Arabic, Other |
|
// Collapses a ULScript into one of the four coarse classes listed above and
// returns that class as an int.
// NOTE(review): the numeric value assigned to each class is not given in this
// header -- look it up in the implementation rather than assuming it follows
// the order in which the classes are listed.
89 int LScript4(ULScript ulscript); |
|
90 |
|
91 //----------------------------------------------------------------------------// |
|
92 // Functions of Language // |
|
93 //----------------------------------------------------------------------------// |
|
94 |
|
95 // The languages recognized by CLD2 are numbered almost arbitrarily, |
|
96 // specified in an enum. Each language has human-readable language name and a |
|
97 // 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by |
|
98 // programs that generate declarations in cld2_generated_languages.h). |
|
99 // Each has a list of up to four scripts in which it is currently recognized. |
|
100 // |
|
101 // The declarations for a particular set of recognized languages are |
|
102 // machine-generated in |
|
103 // generated_language.h |
|
104 // |
|
105 // The Language enum is intended to match the internal Google Language enum |
|
106 // in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional |
|
107 // languages assigned above that. Over time, some languages may be renumbered |
|
108 // if they are moved into the Language enum. |
|
109 // |
|
110 // The Language enum includes the fake language numbers for RTypeNone above. |
|
111 // |
|
112 |
|
113 |
|
114 // If the input is out of range or otherwise unrecognized, it is treated |
|
115 // as UNKNOWN_LANGUAGE |
|
116 // |
|
117 // LanguageCode |
|
118 // ------------ |
|
119 // Given the Language, return the language code, e.g. "ko" |
|
120 // This is determined by |
|
121 // the following (in order of preference): |
|
122 // - ISO-639-1 two-letter language code |
|
123 // (all except those mentioned below) |
|
124 // - ISO-639-2 three-letter bibliographic language code |
|
125 // (Tibetan, Dhivehi, Cherokee, Syriac) |
|
126 // - Google-specific language code |
|
127 // (ChineseT ("zh-TW"), Teragram Unknown, Unknown, |
|
128 // Portuguese-Portugal, Portuguese-Brazil, Limbu) |
|
129 // - Fake RTypeNone names. |
|
130 |
|
// Accessors for the per-language side tables:
//   LanguageName         - presumably the human-readable language name
//   LanguageCode         - the language code, e.g. "ko", chosen by the order
//                          of preference given in the LanguageCode comment
//                          that precedes these declarations
//   LanguageShortCode    - NOTE(review): not described in this header; how it
//                          differs from LanguageCode needs confirming
//   LanguageDeclaredName - presumably the C name used by the generated
//                          declarations
// NOTE(review): ownership and lifetime of the returned strings are not stated
// here -- presumably pointers into static tables; confirm.
131 const char* LanguageName(Language lang); |
|
132 const char* LanguageCode(Language lang); |
|
133 const char* LanguageShortCode(Language lang); |
|
134 const char* LanguageDeclaredName(Language lang); |
|
135 |
|
136 // n is in 0..3. Trailing entries are filled with |
|
137 // ULScript_Common (which never participates in language recognition) |
|
// Returns entry n of the list of (up to four) scripts in which lang is
// currently recognized.
// NOTE(review): behavior for n outside 0..3 is not stated in this header --
// confirm in the implementation before passing unchecked values.
138 ULScript LanguageRecognizedScript(Language lang, int n); |
|
139 |
|
140 // Name can be either full name or ISO code, or can be ISO code embedded in |
|
141 // a language-script combination such as "en-Latn-GB" |
|
// Reverse lookup: text -> Language.
// NOTE(review): the result for a null or unrecognized src is not documented
// in this header -- presumably UNKNOWN_LANGUAGE; confirm in the implementation.
142 Language GetLanguageFromName(const char* src); |
|
143 |
|
144 // Returns which set of statistically-close languages lang is in. 0 means none. |
|
// NOTE(review): the numbering and membership of the non-zero sets are not
// defined in this header; treat the value as an opaque set identifier unless
// the implementation says otherwise.
145 int LanguageCloseSet(Language lang); |
|
146 |
|
147 //----------------------------------------------------------------------------// |
|
148 // Functions of ULScript and Language // |
|
149 //----------------------------------------------------------------------------// |
|
150 |
|
151 // Most common language in each script |
|
// NOTE(review): what is returned for a script that has no associated language
// (the RTypeNone scripts) is not stated in this header -- confirm in the
// implementation.
152 Language DefaultLanguage(ULScript ulscript); |
|
153 |
|
154 // For RTypeMany recognition, |
|
155 // the CLD2 lookup tables are kept small by encoding a language into one byte. |
|
156 // To avoid limiting CLD2 to at most 256 languages, a larger range of external |
|
157 // Language numbers is mapped to a smaller range of per-script numbers. At |
|
158 // the moment (January 2013) the Latin script has about 90 languages to be |
|
159 // recognized, while all the other scripts total about 50 more languages. In |
|
160 // addition, the RTypeNone scripts map to about 100 fake languages. |
|
161 // So we map all Latin-script languages to one range of 1..255 per-script |
|
162 // numbers and map all the other RTypeMany languages to an overlapping range |
|
163 // 1..255 of per-script numbers. |
|
164 |
|
// PerScriptNumber converts an external Language number into the one-byte
// per-script number used by the lookup tables; FromPerScriptNumber converts
// back. The script is passed in both directions -- presumably because the
// Latin and non-Latin per-script ranges overlap, so a per-script number alone
// does not identify a language (TODO confirm).
// NOTE(review): the value returned for a language with no per-script number in
// the given script is not stated here -- presumably 0, since the documented
// ranges start at 1; confirm in the implementation.
165 uint8 PerScriptNumber(ULScript ulscript, Language lang); |
|
166 Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number); |
|
167 |
|
168 // While the speed-sensitive processing deals with per-script language numbers, |
|
169 // there is a need for low-performance dealing with original language numbers |
|
170 // and unknown scripts, mostly for processing language hints. |
|
171 // These routines let one derive a script class from a bare language. |
|
172 // For languages written in multiple scripts, both of these can return true. |
|
173 |
|
// Script-class predicates on a bare Language, for use when no script is known.
// Presumably IsLatnLanguage is true for languages in the Latin-script
// per-script range and IsOthrLanguage for languages in the other-script range
// (TODO confirm). The two are not mutually exclusive.
174 bool IsLatnLanguage(Language lang); |
|
175 bool IsOthrLanguage(Language lang); |
|
176 |
|
177 |
|
178 //----------------------------------------------------------------------------// |
|
179 // Other // |
|
180 //----------------------------------------------------------------------------// |
|
181 |
|
182 // Utility routine to search alphabetical tables |
|
// Looks up key in the table cipair between the positions lo and hi.
// NOTE(review): CharIntPair is not declared in this header -- presumably it
// comes from one of the generated includes. Whether hi is inclusive or
// exclusive, whether the comparison is case-sensitive, and what is returned
// when key is absent are not documented here; confirm in the implementation.
183 int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair); |
|
184 |
|
185 } // namespace CLD2 |
|
186 |
|
187 #endif // I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__ |