browser/components/translation/cld2/internal/lang_script.cc

changeset 0
6474c204b198
equal deleted inserted replaced
-1:000000000000 0:7c08be75170b
1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 //
16 // File: lang_script.cc
17 // ================
18 //
19 // Author: dsites@google.com (Dick Sites)
20 //
21 // This file declares language and script numbers and names for CLD2
22 //
23
24 #include "lang_script.h"
25
26 #include <stdlib.h>
27 #include <string.h>
28
29 #include "generated_language.h"
30 #include "generated_ulscript.h"
31
32 namespace CLD2 {
33
34 // Language tables
35 // Subscripted by enum Language
36 extern const int kLanguageToNameSize;
37 extern const char* const kLanguageToName[];
38 extern const int kLanguageToCodeSize;
39 extern const char* const kLanguageToCode[];
40 extern const int kLanguageToCNameSize;
41 extern const char* const kLanguageToCName[];
42 extern const int kLanguageToScriptsSize;
43 extern const FourScripts kLanguageToScripts[];
44
45 // Subscripted by Language
46 extern const int kLanguageToPLangSize;
47 extern const uint8 kLanguageToPLang[];
48 // Subscripted by per-script language
49 extern const uint16 kPLangToLanguageLatn[];
50 extern const uint16 kPLangToLanguageOthr[];
51
52 // Alphabetical order for binary search
53 extern const int kNameToLanguageSize;
54 extern const CharIntPair kNameToLanguage[];
55 extern const int kCodeToLanguageSize;
56 extern const CharIntPair kCodeToLanguage[];
57
58 // ULScript tables
59 // Subscripted by enum ULScript
60 extern const int kULScriptToNameSize;
61 extern const char* const kULScriptToName[];
62 extern const int kULScriptToCodeSize;
63 extern const char* const kULScriptToCode[];
64 extern const int kULScriptToCNameSize;
65 extern const char* const kULScriptToCName[];
66 extern const int kULScriptToRtypeSize;
67 extern const ULScriptRType kULScriptToRtype[];
68 extern const int kULScriptToDefaultLangSize;
69 extern const Language kULScriptToDefaultLang[];
70
71 // Alphabetical order for binary search
72 extern const int kNameToULScriptSize;
73 extern const CharIntPair kNameToULScript[];
74 extern const int kCodeToULScriptSize;
75 extern const CharIntPair kCodeToULScript[];
76
77
78 //
79 // File: lang_script.h
80 // ================
81 //
82 // Author: dsites@google.com (Dick Sites)
83 //
84 // This file declares language and script numbers and names for CLD2
85 //
86
87
88 // NOTE: The script numbers and language numbers here are not guaranteed to be
89 // stable. If you want to record a result for posterity, save the ISO codes
90 // as character strings.
91 //
92 //
93 // The Unicode scripts recognized by CLD2 are numbered almost arbitrarily,
94 // specified in an enum. Each script has human-readable script name and a
95 // 4-letter ISO 15924 script code. Each has a C name (largely for use by
96 // programs that generate declarations in cld2_generated_scripts.h). Each
97 // also has a recognition type
98 // r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK
99 //
100 // The declarations for a particular version of Unicode are machine-generated in
101 // cld2_generated_scripts.h
102 //
103 // This file includes that one and declares the access routines. The type
104 // involved is called "ULScript" to signify Unicode Letters-Marks Scripts,
105 // which are not quite Unicode Scripts. In particular, the CJK scripts are
106 // merged into a single number because CLD2 recognizes the CJK languages from
107 // four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and
108 // Katakana.
109
110 // Each script has one of these four recognition types.
111 // RTypeNone: There is no language associated with this script. In extended
112 // language recognition calls, return a fake language number that maps to
113 // xx-Cham, with literally "xx" for the language code, and with the script
114 // code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE.
115 // RTypeOne: The script maps 1:1 to a single language. No letters are examined
116 // during recognition and no lookups done.
117 // RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring
118 // is done to determine the languages involved.
119 // RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the
120 // languages involved.
121 //
122 // Note that the choice of recognition type is a function of script, not
123 // language. In particular, some languages are recognized in multiple scripts
124 // and those have different recognition types (Mongolian mn-Latn vs. mn-Mong
125 // for example).
126
127 //----------------------------------------------------------------------------//
128 // Functions of ULScript //
129 //----------------------------------------------------------------------------//
130
131 // If the input is out of range or otherwise unrecognized, it is treated
132 // as UNKNOWN_ULSCRIPT (which never participates in language recognition)
133 const char* ULScriptName(ULScript ulscript) {
134 int i_ulscript = ulscript;
135 if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
136 if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
137 return kULScriptToName[i_ulscript];
138 }
139
140 const char* ULScriptCode(ULScript ulscript) {
141 int i_ulscript = ulscript;
142 if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
143 if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
144 return kULScriptToCode[i_ulscript];
145 }
146
147 const char* ULScriptDeclaredName(ULScript ulscript) {
148 int i_ulscript = ulscript;
149 if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
150 if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
151 return kULScriptToCName[i_ulscript];
152 }
153
154 ULScriptRType ULScriptRecognitionType(ULScript ulscript) {
155 int i_ulscript = ulscript;
156 if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
157 if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
158 return kULScriptToRtype[i_ulscript];
159 }
160
161
162
163 // The languages recognized by CLD2 are numbered almost arbitrarily,
164 // specified in an enum. Each language has human-readable language name and a
165 // 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by
166 // programs that generate declarations in cld2_generated_languages.h).
167 // Each has a list of up to four scripts in which it is currently recognized.
168 //
169 // The declarations for a particular set of recognized languages are
170 // machine-generated in
171 // cld2_generated_languages.h
172 //
173 // The Language enum is intended to match the internal Google Language enum
174 // in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional
175 // languages assigned above that. Over time, some languages may be renumbered
176 // if they are moved into the Language enum.
177 //
178 // The Language enum includes the fake language numbers for RTypeNone above.
179 //
180 // In an open-source environment, the Google-specific Language enum is not
181 // available. Language decouples the two environments while maintaining
182 // internal compatibility.
183
184
185 // If the input is out of range or otherwise unrecognized, it is treated
186 // as UNKNOWN_LANGUAGE
187 //
188 // LanguageCode
189 // ------------
190 // Given the Language, return the language code, e.g. "ko"
191 // This is determined by
192 // the following (in order of preference):
193 // - ISO-639-1 two-letter language code
194 // (all except those mentioned below)
195 // - ISO-639-2 three-letter bibliographic language code
196 // (Tibetan, Dhivehi, Cherokee, Syriac)
197 // - Google-specific language code
198 // (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
199 // Portuguese-Portugal, Portuguese-Brazil, Limbu)
200 // - Fake RTypeNone names.
201
202 //----------------------------------------------------------------------------//
203 // Functions of Language //
204 //----------------------------------------------------------------------------//
205
206 const char* LanguageName(Language lang) {
207 int i_lang = lang;
208 if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
209 if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
210 return kLanguageToName[i_lang];
211 }
212 const char* LanguageCode(Language lang) {
213 int i_lang = lang;
214 if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
215 if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
216 return kLanguageToCode[i_lang];
217 }
218
219 const char* LanguageDeclaredName(Language lang) {
220 int i_lang = lang;
221 if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
222 if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
223 return kLanguageToCName[i_lang];
224 }
225
226 // n is in 0..3. Trailing entries are filled with
227 // UNKNOWN_LANGUAGE (which never participates in language recognition)
228 ULScript LanguageRecognizedScript(Language lang, int n) {
229 int i_lang = lang;
230 if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
231 if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
232 return static_cast<ULScript>(kLanguageToScripts[i_lang][n]);
233 }
234
235 // Given the Language, returns its string name used as the output by
236 // the lang/enc identifier, e.g. "Korean"
237 // "invalid_language" if the input is invalid.
238 // TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language,
239 // used to subtract out HTML, link farms, DNA strings, and alittle English porn
240 const char* ExtLanguageName(const Language lang) {
241 return LanguageName(lang);
242 }
243
244 // Given the Language, return the language code, e.g. "ko"
245 const char* ExtLanguageCode(const Language lang) {
246 return LanguageCode(lang);
247 }
248
249
250 // Given the Language, returns its Language enum spelling, for use by
251 // programs that create C declarations, e.g. "KOREAN"
252 // "UNKNOWN_LANGUAGE" if the input is invalid.
253 const char* ExtLanguageDeclaredName(const Language lang) {
254 return LanguageDeclaredName(lang);
255 }
256
257
258 extern const int kCloseSetSize = 10;
259
260 // Returns which set of statistically-close languages lang is in. 0 means none.
261 int LanguageCloseSet(Language lang) {
262 // Scaffolding
263 // id ms # INDONESIAN MALAY coef=0.4698 Problematic w/o extra words
264 // bo dz # TIBETAN DZONGKHA coef=0.4571
265 // cs sk # CZECH SLOVAK coef=0.4273
266 // zu xh # ZULU XHOSA coef=0.3716
267 //
268 // bs hr sr srm # BOSNIAN CROATIAN SERBIAN MONTENEGRIN
269 // hi mr bh ne # HINDI MARATHI BIHARI NEPALI
270 // no nn da # NORWEGIAN NORWEGIAN_N DANISH
271 // gl es pt # GALICIAN SPANISH PORTUGUESE
272 // rw rn # KINYARWANDA RUNDI
273
274 if (lang == INDONESIAN) {return 1;}
275 if (lang == MALAY) {return 1;}
276
277 if (lang == TIBETAN) {return 2;}
278 if (lang == DZONGKHA) {return 2;}
279
280 if (lang == CZECH) {return 3;}
281 if (lang == SLOVAK) {return 3;}
282
283 if (lang == ZULU) {return 4;}
284 if (lang == XHOSA) {return 4;}
285
286 if (lang == BOSNIAN) {return 5;}
287 if (lang == CROATIAN) {return 5;}
288 if (lang == SERBIAN) {return 5;}
289 if (lang == MONTENEGRIN) {return 5;}
290
291 if (lang == HINDI) {return 6;}
292 if (lang == MARATHI) {return 6;}
293 if (lang == BIHARI) {return 6;}
294 if (lang == NEPALI) {return 6;}
295
296 if (lang == NORWEGIAN) {return 7;}
297 if (lang == NORWEGIAN_N) {return 7;}
298 if (lang == DANISH) {return 7;}
299
300 if (lang == GALICIAN) {return 8;}
301 if (lang == SPANISH) {return 8;}
302 if (lang == PORTUGUESE) {return 8;}
303
304 if (lang == KINYARWANDA) {return 9;}
305 if (lang == RUNDI) {return 9;}
306
307 return 0;
308 }
309
310 //----------------------------------------------------------------------------//
311 // Functions of ULScript and Language //
312 //----------------------------------------------------------------------------//
313
314 Language DefaultLanguage(ULScript ulscript) {
315 if (ulscript < 0) {return UNKNOWN_LANGUAGE;}
316 if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;}
317 return kULScriptToDefaultLang[ulscript];
318 }
319
320 uint8 PerScriptNumber(ULScript ulscript, Language lang) {
321 if (ulscript < 0) {return 0;}
322 if (ulscript >= NUM_ULSCRIPTS) {return 0;}
323 if (kULScriptToRtype[ulscript] == RTypeNone) {return 1;}
324 if (lang >= kLanguageToPLangSize) {return 0;}
325 return kLanguageToPLang[lang];
326 }
327
328 Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number) {
329 if (ulscript < 0) {return UNKNOWN_LANGUAGE;}
330 if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;}
331 if ((kULScriptToRtype[ulscript] == RTypeNone) ||
332 (kULScriptToRtype[ulscript] == RTypeOne)) {
333 return kULScriptToDefaultLang[ulscript];
334 }
335
336 if (ulscript == ULScript_Latin) {
337 return static_cast<Language>(kPLangToLanguageLatn[perscript_number]);
338 } else {
339 return static_cast<Language>(kPLangToLanguageOthr[perscript_number]);
340 }
341 }
342
343 // Return true if language can be in the Latin script
344 bool IsLatnLanguage(Language lang) {
345 if (lang >= kLanguageToPLangSize) {return false;}
346 return (lang == kPLangToLanguageLatn[kLanguageToPLang[lang]]);
347 }
348
349 // Return true if language can be in a non-Latin script
350 bool IsOthrLanguage(Language lang) {
351 if (lang >= kLanguageToPLangSize) {return false;}
352 return (lang == kPLangToLanguageOthr[kLanguageToPLang[lang]]);
353 }
354
355
356 //----------------------------------------------------------------------------//
357 // Other //
358 //----------------------------------------------------------------------------//
359
360 // Returns mid if key found in lo <= mid < hi, else -1
361 int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) {
362 // binary search
363 while (lo < hi) {
364 int mid = (lo + hi) >> 1;
365 if (strcmp(key, cipair[mid].s) < 0) {
366 hi = mid;
367 } else if (strcmp(key, cipair[mid].s) > 0) {
368 lo = mid + 1;
369 } else {
370 return mid;
371 }
372 }
373 return -1;
374 }
375
376 Language MakeLang(int i) {return static_cast<Language>(i);}
377
378 // Name can be either full name or ISO code, or can be ISO code embedded in
379 // a language-script combination such as "ABKHAZIAN", "en", "en-Latn-GB"
380 Language GetLanguageFromName(const char* src) {
381 const char* hyphen1 = strchr(src, '-');
382 const char* hyphen2 = NULL;
383 if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');}
384
385 int match = -1;
386 if (hyphen1 == NULL) {
387 // Bare name. Look at full name, then code
388 match = BinarySearch(src, 0, kNameToLanguageSize, kNameToLanguage);
389 if (match >= 0) {return MakeLang(kNameToLanguage[match].i);} // aa
390 match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
391 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
392 return UNKNOWN_LANGUAGE;
393 }
394
395 if (hyphen2 == NULL) {
396 // aa-bb. Not a full name; must be code-something. Try zh-TW then bare zh
397 match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
398 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb
399
400 int len = strlen(src);
401 if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter
402
403 char temp[16];
404 int hyphen1_offset = hyphen1 - src;
405 // Take off part after hyphen1
406 memcpy(temp, src, len);
407 temp[hyphen1_offset] = '\0';
408 match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
409 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
410
411 return UNKNOWN_LANGUAGE;
412 }
413
414 // aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en
415 match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
416 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb-cc
417
418
419 int len = strlen(src);
420 if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter
421
422 char temp[16];
423 int hyphen1_offset = hyphen1 - src;
424 int hyphen2_offset = hyphen2 - src;
425 // Take off part after hyphen2
426 memcpy(temp, src, len);
427 temp[hyphen2_offset] = '\0';
428 match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
429 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb
430
431
432 // Take off part between hyphen1 and hyphen2
433 int len2 = len - hyphen2_offset;
434 memcpy(temp, src, len);
435 memcpy(&temp[hyphen1_offset], hyphen2, len2);
436 temp[hyphen1_offset + len2] = '\0';
437 match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
438 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-cc
439
440
441 // Take off everything after hyphen1
442 memcpy(temp, src, len);
443 temp[hyphen1_offset] = '\0';
444 match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
445 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
446
447
448 return UNKNOWN_LANGUAGE;
449 }
450
451
452 // Name can be either full name or ISO code, or can be ISO code embedded in
453 // a language-script combination such as "en-Latn-GB"
454 // MORE WORK to do here. also kLanguageToScripts [4] is bogus
455 // if bare language name, no script, want zh, ja, ko to Hani, pt to Latn, etc.
456 // Something like map code to Language, then Language to kLanguageToScripts[x][0]
457 // ADD BIAS: kLanguageToScripts lists default script first
458 // If total mismatch, return Latn
459 // if (strcmp(src, "nd") == 0) {return NDEBELE;} // [nd was wrong]
460 // if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;}
461
462 ULScript MakeULScr(int i) {return static_cast<ULScript>(i);}
463
464 ULScript GetULScriptFromName(const char* src) {
465 const char* hyphen1 = strchr(src, '-');
466 const char* hyphen2 = NULL;
467 if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');}
468
469 int match = -1;
470 if (hyphen1 == NULL) {
471 // Bare name. Look at full name, then code, then try backmapping as Language
472 match = BinarySearch(src, 0, kNameToULScriptSize, kNameToULScript);
473 if (match >= 0) {return MakeULScr(kNameToULScript[match].i);} // aa
474 match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
475 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
476
477 Language backmap_me = GetLanguageFromName(src);
478 if (backmap_me != UNKNOWN_LANGUAGE) {
479 return static_cast<ULScript>(kLanguageToScripts[backmap_me][0]);
480 }
481 return ULScript_Latin;
482 }
483
484 if (hyphen2 == NULL) {
485 // aa-bb. Not a full name; must be code-something. Try en-Latn, bare Latn
486 if (strcmp(src, "zh-TW") == 0) {return ULScript_Hani;}
487 if (strcmp(src, "zh-CN") == 0) {return ULScript_Hani;}
488 if (strcmp(src, "sit-NP") == 0) {return ULScript_Limbu;}
489 if (strcmp(src, "sit-Limb") == 0) {return ULScript_Limbu;}
490 if (strcmp(src, "sr-ME") == 0) {return ULScript_Latin;}
491 match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
492 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb
493
494 int len = strlen(src);
495 if (len >= 16) {return ULScript_Latin;} // Real codes are shorter
496
497 char temp[16];
498 int hyphen1_offset = hyphen1 - src;
499 int len1 = len - hyphen1_offset - 1; // Exclude the hyphen
500 // Take off part before hyphen1
501 memcpy(temp, hyphen1 + 1, len1);
502 temp[len1] = '\0';
503 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
504 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb
505
506 // Take off part after hyphen1
507 memcpy(temp, src, len);
508 temp[hyphen1_offset] = '\0';
509 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
510 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
511
512 return ULScript_Latin;
513 }
514
515 // aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en
516 if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;}
517 if (strcmp(src, "sr-ME-Latn") == 0) {return ULScript_Latin;}
518 if (strcmp(src, "sr-ME-Cyrl") == 0) {return ULScript_Cyrillic;}
519 match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
520 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb-cc
521
522 int len = strlen(src);
523 if (len >= 16) {return ULScript_Latin;} // Real codes are shorter
524
525 char temp[16];
526 int hyphen1_offset = hyphen1 - src;
527 int hyphen2_offset = hyphen2 - src;
528 int len2 = len - hyphen2_offset - 1; // Exclude the hyphen
529 int lenmid = hyphen2_offset - hyphen1_offset - 1; // Exclude the hyphen
530 // Keep part between hyphen1 and hyphen2
531 memcpy(temp, hyphen1 + 1, lenmid);
532 temp[lenmid] = '\0';
533 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
534 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb
535
536 // Keep part after hyphen2
537 memcpy(temp, hyphen2 + 1, len2);
538 temp[len2] = '\0';
539 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
540 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // cc
541
542 // Keep part before hyphen1
543 memcpy(temp, src, len);
544 temp[hyphen1_offset] = '\0';
545 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
546 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
547
548 return ULScript_Latin;
549 }
550
551 // Map script into Latin, Cyrillic, Arabic, Other
552 int LScript4(ULScript ulscript) {
553 if (ulscript == ULScript_Latin) {return 0;}
554 if (ulscript == ULScript_Cyrillic) {return 1;}
555 if (ulscript == ULScript_Arabic) {return 2;}
556 return 3;
557 }
558
559 } // namespace CLD2
560

mercurial