|
1 // Copyright 2013 Google Inc. All Rights Reserved. |
|
2 // |
|
3 // Licensed under the Apache License, Version 2.0 (the "License"); |
|
4 // you may not use this file except in compliance with the License. |
|
5 // You may obtain a copy of the License at |
|
6 // |
|
7 // http://www.apache.org/licenses/LICENSE-2.0 |
|
8 // |
|
9 // Unless required by applicable law or agreed to in writing, software |
|
10 // distributed under the License is distributed on an "AS IS" BASIS, |
|
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
12 // See the License for the specific language governing permissions and |
|
13 // limitations under the License. |
|
14 |
|
15 // |
|
16 // File: lang_script.cc |
|
17 // ================ |
|
18 // |
|
19 // Author: dsites@google.com (Dick Sites) |
|
20 // |
|
21 // This file declares language and script numbers and names for CLD2 |
|
22 // |
|
23 |
|
24 #include "lang_script.h" |
|
25 |
|
26 #include <stdlib.h> |
|
27 #include <string.h> |
|
28 |
|
29 #include "generated_language.h" |
|
30 #include "generated_ulscript.h" |
|
31 |
|
32 namespace CLD2 { |
|
33 |
|
34 // Language tables |
|
35 // Subscripted by enum Language |
|
36 extern const int kLanguageToNameSize; |
|
37 extern const char* const kLanguageToName[]; |
|
38 extern const int kLanguageToCodeSize; |
|
39 extern const char* const kLanguageToCode[]; |
|
40 extern const int kLanguageToCNameSize; |
|
41 extern const char* const kLanguageToCName[]; |
|
42 extern const int kLanguageToScriptsSize; |
|
43 extern const FourScripts kLanguageToScripts[]; |
|
44 |
|
45 // Subscripted by Language |
|
46 extern const int kLanguageToPLangSize; |
|
47 extern const uint8 kLanguageToPLang[]; |
|
48 // Subscripted by per-script language |
|
49 extern const uint16 kPLangToLanguageLatn[]; |
|
50 extern const uint16 kPLangToLanguageOthr[]; |
|
51 |
|
52 // Alphabetical order for binary search |
|
53 extern const int kNameToLanguageSize; |
|
54 extern const CharIntPair kNameToLanguage[]; |
|
55 extern const int kCodeToLanguageSize; |
|
56 extern const CharIntPair kCodeToLanguage[]; |
|
57 |
|
58 // ULScript tables |
|
59 // Subscripted by enum ULScript |
|
60 extern const int kULScriptToNameSize; |
|
61 extern const char* const kULScriptToName[]; |
|
62 extern const int kULScriptToCodeSize; |
|
63 extern const char* const kULScriptToCode[]; |
|
64 extern const int kULScriptToCNameSize; |
|
65 extern const char* const kULScriptToCName[]; |
|
66 extern const int kULScriptToRtypeSize; |
|
67 extern const ULScriptRType kULScriptToRtype[]; |
|
68 extern const int kULScriptToDefaultLangSize; |
|
69 extern const Language kULScriptToDefaultLang[]; |
|
70 |
|
71 // Alphabetical order for binary search |
|
72 extern const int kNameToULScriptSize; |
|
73 extern const CharIntPair kNameToULScript[]; |
|
74 extern const int kCodeToULScriptSize; |
|
75 extern const CharIntPair kCodeToULScript[]; |
|
76 |
|
77 |
|
78 // |
|
79 // File: lang_script.h |
|
80 // ================ |
|
81 // |
|
82 // Author: dsites@google.com (Dick Sites) |
|
83 // |
|
84 // This file declares language and script numbers and names for CLD2 |
|
85 // |
|
86 |
|
87 |
|
88 // NOTE: The script numbers and language numbers here are not guaranteed to be |
|
89 // stable. If you want to record a result for posterity, save the ISO codes |
|
90 // as character strings. |
|
91 // |
|
92 // |
|
93 // The Unicode scripts recognized by CLD2 are numbered almost arbitrarily, |
|
94 // specified in an enum. Each script has human-readable script name and a |
|
95 // 4-letter ISO 15924 script code. Each has a C name (largely for use by |
|
96 // programs that generate declarations in cld2_generated_scripts.h). Each |
|
97 // also has a recognition type |
|
98 // r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK |
|
99 // |
|
100 // The declarations for a particular version of Unicode are machine-generated in |
|
101 // cld2_generated_scripts.h |
|
102 // |
|
103 // This file includes that one and declares the access routines. The type |
|
104 // involved is called "ULScript" to signify Unicode Letters-Marks Scripts, |
|
105 // which are not quite Unicode Scripts. In particular, the CJK scripts are |
|
106 // merged into a single number because CLD2 recognizes the CJK languages from |
|
107 // four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and |
|
108 // Katakana. |
|
109 |
|
110 // Each script has one of these four recognition types. |
|
111 // RTypeNone: There is no language associated with this script. In extended |
|
112 // language recognition calls, return a fake language number that maps to |
|
113 // xx-Cham, with literally "xx" for the language code, and with the script |
|
114 // code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE. |
|
115 // RTypeOne: The script maps 1:1 to a single language. No letters are examined |
|
116 // during recognition and no lookups done. |
|
117 // RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring |
|
118 // is done to determine the languages involved. |
|
119 // RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the |
|
120 // languages involved. |
|
121 // |
|
122 // Note that the choice of recognition type is a function of script, not |
|
123 // language. In particular, some languages are recognized in multiple scripts |
|
124 // and those have different recognition types (Mongolian mn-Latn vs. mn-Mong |
|
125 // for example). |
|
126 |
|
127 //----------------------------------------------------------------------------// |
|
128 // Functions of ULScript // |
|
129 //----------------------------------------------------------------------------// |
|
130 |
|
131 // If the input is out of range or otherwise unrecognized, it is treated |
|
132 // as UNKNOWN_ULSCRIPT (which never participates in language recognition) |
|
133 const char* ULScriptName(ULScript ulscript) { |
|
134 int i_ulscript = ulscript; |
|
135 if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} |
|
136 if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} |
|
137 return kULScriptToName[i_ulscript]; |
|
138 } |
|
139 |
|
140 const char* ULScriptCode(ULScript ulscript) { |
|
141 int i_ulscript = ulscript; |
|
142 if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} |
|
143 if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} |
|
144 return kULScriptToCode[i_ulscript]; |
|
145 } |
|
146 |
|
147 const char* ULScriptDeclaredName(ULScript ulscript) { |
|
148 int i_ulscript = ulscript; |
|
149 if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} |
|
150 if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} |
|
151 return kULScriptToCName[i_ulscript]; |
|
152 } |
|
153 |
|
154 ULScriptRType ULScriptRecognitionType(ULScript ulscript) { |
|
155 int i_ulscript = ulscript; |
|
156 if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} |
|
157 if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} |
|
158 return kULScriptToRtype[i_ulscript]; |
|
159 } |
|
160 |
|
161 |
|
162 |
|
163 // The languages recognized by CLD2 are numbered almost arbitrarily, |
|
164 // specified in an enum. Each language has human-readable language name and a |
|
165 // 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by |
|
166 // programs that generate declarations in cld2_generated_languages.h). |
|
167 // Each has a list of up to four scripts in which it is currently recognized. |
|
168 // |
|
169 // The declarations for a particular set of recognized languages are |
|
170 // machine-generated in |
|
171 // cld2_generated_languages.h |
|
172 // |
|
173 // The Language enum is intended to match the internal Google Language enum |
|
174 // in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional |
|
175 // languages assigned above that. Over time, some languages may be renumbered |
|
176 // if they are moved into the Language enum. |
|
177 // |
|
178 // The Language enum includes the fake language numbers for RTypeNone above. |
|
179 // |
|
180 // In an open-source environment, the Google-specific Language enum is not |
|
181 // available. Language decouples the two environments while maintaining |
|
182 // internal compatibility. |
|
183 |
|
184 |
|
185 // If the input is out of range or otherwise unrecognized, it is treated |
|
186 // as UNKNOWN_LANGUAGE |
|
187 // |
|
188 // LanguageCode |
|
189 // ------------ |
|
190 // Given the Language, return the language code, e.g. "ko" |
|
191 // This is determined by |
|
192 // the following (in order of preference): |
|
193 // - ISO-639-1 two-letter language code |
|
194 // (all except those mentioned below) |
|
195 // - ISO-639-2 three-letter bibliographic language code |
|
196 // (Tibetan, Dhivehi, Cherokee, Syriac) |
|
197 // - Google-specific language code |
|
198 // (ChineseT ("zh-TW"), Teragram Unknown, Unknown, |
|
199 // Portuguese-Portugal, Portuguese-Brazil, Limbu) |
|
200 // - Fake RTypeNone names. |
|
201 |
|
202 //----------------------------------------------------------------------------// |
|
203 // Functions of Language // |
|
204 //----------------------------------------------------------------------------// |
|
205 |
|
206 const char* LanguageName(Language lang) { |
|
207 int i_lang = lang; |
|
208 if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} |
|
209 if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} |
|
210 return kLanguageToName[i_lang]; |
|
211 } |
|
212 const char* LanguageCode(Language lang) { |
|
213 int i_lang = lang; |
|
214 if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} |
|
215 if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} |
|
216 return kLanguageToCode[i_lang]; |
|
217 } |
|
218 |
|
219 const char* LanguageDeclaredName(Language lang) { |
|
220 int i_lang = lang; |
|
221 if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} |
|
222 if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} |
|
223 return kLanguageToCName[i_lang]; |
|
224 } |
|
225 |
|
226 // n is in 0..3. Trailing entries are filled with |
|
227 // UNKNOWN_LANGUAGE (which never participates in language recognition) |
|
228 ULScript LanguageRecognizedScript(Language lang, int n) { |
|
229 int i_lang = lang; |
|
230 if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} |
|
231 if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} |
|
232 return static_cast<ULScript>(kLanguageToScripts[i_lang][n]); |
|
233 } |
|
234 |
|
235 // Given the Language, returns its string name used as the output by |
|
236 // the lang/enc identifier, e.g. "Korean" |
|
237 // "invalid_language" if the input is invalid. |
|
238 // TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language, |
|
239 // used to subtract out HTML, link farms, DNA strings, and a little English porn |
|
240 const char* ExtLanguageName(const Language lang) { |
|
241 return LanguageName(lang); |
|
242 } |
|
243 |
|
244 // Given the Language, return the language code, e.g. "ko" |
|
245 const char* ExtLanguageCode(const Language lang) { |
|
246 return LanguageCode(lang); |
|
247 } |
|
248 |
|
249 |
|
250 // Given the Language, returns its Language enum spelling, for use by |
|
251 // programs that create C declarations, e.g. "KOREAN" |
|
252 // "UNKNOWN_LANGUAGE" if the input is invalid. |
|
253 const char* ExtLanguageDeclaredName(const Language lang) { |
|
254 return LanguageDeclaredName(lang); |
|
255 } |
|
256 |
|
257 |
|
258 extern const int kCloseSetSize = 10; |
|
259 |
|
260 // Returns which set of statistically-close languages lang is in. 0 means none. |
|
261 int LanguageCloseSet(Language lang) { |
|
262 // Scaffolding |
|
263 // id ms # INDONESIAN MALAY coef=0.4698 Problematic w/o extra words |
|
264 // bo dz # TIBETAN DZONGKHA coef=0.4571 |
|
265 // cs sk # CZECH SLOVAK coef=0.4273 |
|
266 // zu xh # ZULU XHOSA coef=0.3716 |
|
267 // |
|
268 // bs hr sr srm # BOSNIAN CROATIAN SERBIAN MONTENEGRIN |
|
269 // hi mr bh ne # HINDI MARATHI BIHARI NEPALI |
|
270 // no nn da # NORWEGIAN NORWEGIAN_N DANISH |
|
271 // gl es pt # GALICIAN SPANISH PORTUGUESE |
|
272 // rw rn # KINYARWANDA RUNDI |
|
273 |
|
274 if (lang == INDONESIAN) {return 1;} |
|
275 if (lang == MALAY) {return 1;} |
|
276 |
|
277 if (lang == TIBETAN) {return 2;} |
|
278 if (lang == DZONGKHA) {return 2;} |
|
279 |
|
280 if (lang == CZECH) {return 3;} |
|
281 if (lang == SLOVAK) {return 3;} |
|
282 |
|
283 if (lang == ZULU) {return 4;} |
|
284 if (lang == XHOSA) {return 4;} |
|
285 |
|
286 if (lang == BOSNIAN) {return 5;} |
|
287 if (lang == CROATIAN) {return 5;} |
|
288 if (lang == SERBIAN) {return 5;} |
|
289 if (lang == MONTENEGRIN) {return 5;} |
|
290 |
|
291 if (lang == HINDI) {return 6;} |
|
292 if (lang == MARATHI) {return 6;} |
|
293 if (lang == BIHARI) {return 6;} |
|
294 if (lang == NEPALI) {return 6;} |
|
295 |
|
296 if (lang == NORWEGIAN) {return 7;} |
|
297 if (lang == NORWEGIAN_N) {return 7;} |
|
298 if (lang == DANISH) {return 7;} |
|
299 |
|
300 if (lang == GALICIAN) {return 8;} |
|
301 if (lang == SPANISH) {return 8;} |
|
302 if (lang == PORTUGUESE) {return 8;} |
|
303 |
|
304 if (lang == KINYARWANDA) {return 9;} |
|
305 if (lang == RUNDI) {return 9;} |
|
306 |
|
307 return 0; |
|
308 } |
|
309 |
|
310 //----------------------------------------------------------------------------// |
|
311 // Functions of ULScript and Language // |
|
312 //----------------------------------------------------------------------------// |
|
313 |
|
314 Language DefaultLanguage(ULScript ulscript) { |
|
315 if (ulscript < 0) {return UNKNOWN_LANGUAGE;} |
|
316 if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;} |
|
317 return kULScriptToDefaultLang[ulscript]; |
|
318 } |
|
319 |
|
320 uint8 PerScriptNumber(ULScript ulscript, Language lang) { |
|
321 if (ulscript < 0) {return 0;} |
|
322 if (ulscript >= NUM_ULSCRIPTS) {return 0;} |
|
323 if (kULScriptToRtype[ulscript] == RTypeNone) {return 1;} |
|
324 if (lang >= kLanguageToPLangSize) {return 0;} |
|
325 return kLanguageToPLang[lang]; |
|
326 } |
|
327 |
|
328 Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number) { |
|
329 if (ulscript < 0) {return UNKNOWN_LANGUAGE;} |
|
330 if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;} |
|
331 if ((kULScriptToRtype[ulscript] == RTypeNone) || |
|
332 (kULScriptToRtype[ulscript] == RTypeOne)) { |
|
333 return kULScriptToDefaultLang[ulscript]; |
|
334 } |
|
335 |
|
336 if (ulscript == ULScript_Latin) { |
|
337 return static_cast<Language>(kPLangToLanguageLatn[perscript_number]); |
|
338 } else { |
|
339 return static_cast<Language>(kPLangToLanguageOthr[perscript_number]); |
|
340 } |
|
341 } |
|
342 |
|
343 // Return true if language can be in the Latin script |
|
344 bool IsLatnLanguage(Language lang) { |
|
345 if (lang >= kLanguageToPLangSize) {return false;} |
|
346 return (lang == kPLangToLanguageLatn[kLanguageToPLang[lang]]); |
|
347 } |
|
348 |
|
349 // Return true if language can be in a non-Latin script |
|
350 bool IsOthrLanguage(Language lang) { |
|
351 if (lang >= kLanguageToPLangSize) {return false;} |
|
352 return (lang == kPLangToLanguageOthr[kLanguageToPLang[lang]]); |
|
353 } |
|
354 |
|
355 |
|
356 //----------------------------------------------------------------------------// |
|
357 // Other // |
|
358 //----------------------------------------------------------------------------// |
|
359 |
|
360 // Returns mid if key found in lo <= mid < hi, else -1 |
|
361 int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) { |
|
362 // binary search |
|
363 while (lo < hi) { |
|
364 int mid = (lo + hi) >> 1; |
|
365 if (strcmp(key, cipair[mid].s) < 0) { |
|
366 hi = mid; |
|
367 } else if (strcmp(key, cipair[mid].s) > 0) { |
|
368 lo = mid + 1; |
|
369 } else { |
|
370 return mid; |
|
371 } |
|
372 } |
|
373 return -1; |
|
374 } |
|
375 |
|
376 Language MakeLang(int i) {return static_cast<Language>(i);} |
|
377 |
|
378 // Name can be either full name or ISO code, or can be ISO code embedded in |
|
379 // a language-script combination such as "ABKHAZIAN", "en", "en-Latn-GB" |
|
380 Language GetLanguageFromName(const char* src) { |
|
381 const char* hyphen1 = strchr(src, '-'); |
|
382 const char* hyphen2 = NULL; |
|
383 if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');} |
|
384 |
|
385 int match = -1; |
|
386 if (hyphen1 == NULL) { |
|
387 // Bare name. Look at full name, then code |
|
388 match = BinarySearch(src, 0, kNameToLanguageSize, kNameToLanguage); |
|
389 if (match >= 0) {return MakeLang(kNameToLanguage[match].i);} // aa |
|
390 match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage); |
|
391 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa |
|
392 return UNKNOWN_LANGUAGE; |
|
393 } |
|
394 |
|
395 if (hyphen2 == NULL) { |
|
396 // aa-bb. Not a full name; must be code-something. Try zh-TW then bare zh |
|
397 match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage); |
|
398 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb |
|
399 |
|
400 int len = strlen(src); |
|
401 if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter |
|
402 |
|
403 char temp[16]; |
|
404 int hyphen1_offset = hyphen1 - src; |
|
405 // Take off part after hyphen1 |
|
406 memcpy(temp, src, len); |
|
407 temp[hyphen1_offset] = '\0'; |
|
408 match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); |
|
409 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa |
|
410 |
|
411 return UNKNOWN_LANGUAGE; |
|
412 } |
|
413 |
|
414 // aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en |
|
415 match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage); |
|
416 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb-cc |
|
417 |
|
418 |
|
419 int len = strlen(src); |
|
420 if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter |
|
421 |
|
422 char temp[16]; |
|
423 int hyphen1_offset = hyphen1 - src; |
|
424 int hyphen2_offset = hyphen2 - src; |
|
425 // Take off part after hyphen2 |
|
426 memcpy(temp, src, len); |
|
427 temp[hyphen2_offset] = '\0'; |
|
428 match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); |
|
429 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb |
|
430 |
|
431 |
|
432 // Take off part between hyphen1 and hyphen2 |
|
433 int len2 = len - hyphen2_offset; |
|
434 memcpy(temp, src, len); |
|
435 memcpy(&temp[hyphen1_offset], hyphen2, len2); |
|
436 temp[hyphen1_offset + len2] = '\0'; |
|
437 match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); |
|
438 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-cc |
|
439 |
|
440 |
|
441 // Take off everything after hyphen1 |
|
442 memcpy(temp, src, len); |
|
443 temp[hyphen1_offset] = '\0'; |
|
444 match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); |
|
445 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa |
|
446 |
|
447 |
|
448 return UNKNOWN_LANGUAGE; |
|
449 } |
|
450 |
|
451 |
|
452 // Name can be either full name or ISO code, or can be ISO code embedded in |
|
453 // a language-script combination such as "en-Latn-GB" |
|
454 // MORE WORK to do here. also kLanguageToScripts [4] is bogus |
|
455 // if bare language name, no script, want zh, ja, ko to Hani, pt to Latn, etc. |
|
456 // Something like map code to Language, then Language to kLanguageToScripts[x][0] |
|
457 // ADD BIAS: kLanguageToScripts lists default script first |
|
458 // If total mismatch, return Latn |
|
459 // if (strcmp(src, "nd") == 0) {return NDEBELE;} // [nd was wrong] |
|
460 // if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;} |
|
461 |
|
462 ULScript MakeULScr(int i) {return static_cast<ULScript>(i);} |
|
463 |
|
464 ULScript GetULScriptFromName(const char* src) { |
|
465 const char* hyphen1 = strchr(src, '-'); |
|
466 const char* hyphen2 = NULL; |
|
467 if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');} |
|
468 |
|
469 int match = -1; |
|
470 if (hyphen1 == NULL) { |
|
471 // Bare name. Look at full name, then code, then try backmapping as Language |
|
472 match = BinarySearch(src, 0, kNameToULScriptSize, kNameToULScript); |
|
473 if (match >= 0) {return MakeULScr(kNameToULScript[match].i);} // aa |
|
474 match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript); |
|
475 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa |
|
476 |
|
477 Language backmap_me = GetLanguageFromName(src); |
|
478 if (backmap_me != UNKNOWN_LANGUAGE) { |
|
479 return static_cast<ULScript>(kLanguageToScripts[backmap_me][0]); |
|
480 } |
|
481 return ULScript_Latin; |
|
482 } |
|
483 |
|
484 if (hyphen2 == NULL) { |
|
485 // aa-bb. Not a full name; must be code-something. Try en-Latn, bare Latn |
|
486 if (strcmp(src, "zh-TW") == 0) {return ULScript_Hani;} |
|
487 if (strcmp(src, "zh-CN") == 0) {return ULScript_Hani;} |
|
488 if (strcmp(src, "sit-NP") == 0) {return ULScript_Limbu;} |
|
489 if (strcmp(src, "sit-Limb") == 0) {return ULScript_Limbu;} |
|
490 if (strcmp(src, "sr-ME") == 0) {return ULScript_Latin;} |
|
491 match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript); |
|
492 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb |
|
493 |
|
494 int len = strlen(src); |
|
495 if (len >= 16) {return ULScript_Latin;} // Real codes are shorter |
|
496 |
|
497 char temp[16]; |
|
498 int hyphen1_offset = hyphen1 - src; |
|
499 int len1 = len - hyphen1_offset - 1; // Exclude the hyphen |
|
500 // Take off part before hyphen1 |
|
501 memcpy(temp, hyphen1 + 1, len1); |
|
502 temp[len1] = '\0'; |
|
503 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); |
|
504 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb |
|
505 |
|
506 // Take off part after hyphen1 |
|
507 memcpy(temp, src, len); |
|
508 temp[hyphen1_offset] = '\0'; |
|
509 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); |
|
510 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa |
|
511 |
|
512 return ULScript_Latin; |
|
513 } |
|
514 |
|
515 // aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en |
|
516 if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;} |
|
517 if (strcmp(src, "sr-ME-Latn") == 0) {return ULScript_Latin;} |
|
518 if (strcmp(src, "sr-ME-Cyrl") == 0) {return ULScript_Cyrillic;} |
|
519 match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript); |
|
520 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb-cc |
|
521 |
|
522 int len = strlen(src); |
|
523 if (len >= 16) {return ULScript_Latin;} // Real codes are shorter |
|
524 |
|
525 char temp[16]; |
|
526 int hyphen1_offset = hyphen1 - src; |
|
527 int hyphen2_offset = hyphen2 - src; |
|
528 int len2 = len - hyphen2_offset - 1; // Exclude the hyphen |
|
529 int lenmid = hyphen2_offset - hyphen1_offset - 1; // Exclude the hyphen |
|
530 // Keep part between hyphen1 and hyphen2 |
|
531 memcpy(temp, hyphen1 + 1, lenmid); |
|
532 temp[lenmid] = '\0'; |
|
533 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); |
|
534 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb |
|
535 |
|
536 // Keep part after hyphen2 |
|
537 memcpy(temp, hyphen2 + 1, len2); |
|
538 temp[len2] = '\0'; |
|
539 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); |
|
540 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // cc |
|
541 |
|
542 // Keep part before hyphen1 |
|
543 memcpy(temp, src, len); |
|
544 temp[hyphen1_offset] = '\0'; |
|
545 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); |
|
546 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa |
|
547 |
|
548 return ULScript_Latin; |
|
549 } |
|
550 |
|
551 // Map script into Latin, Cyrillic, Arabic, Other |
|
552 int LScript4(ULScript ulscript) { |
|
553 if (ulscript == ULScript_Latin) {return 0;} |
|
554 if (ulscript == ULScript_Cyrillic) {return 1;} |
|
555 if (ulscript == ULScript_Arabic) {return 2;} |
|
556 return 3; |
|
557 } |
|
558 |
|
559 } // namespace CLD2 |
|
560 |