michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // File: lang_script.cc michael@0: // ================ michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // michael@0: // This file declares language and script numbers and names for CLD2 michael@0: // michael@0: michael@0: #include "lang_script.h" michael@0: michael@0: #include michael@0: #include michael@0: michael@0: #include "generated_language.h" michael@0: #include "generated_ulscript.h" michael@0: michael@0: namespace CLD2 { michael@0: michael@0: // Language tables michael@0: // Subscripted by enum Language michael@0: extern const int kLanguageToNameSize; michael@0: extern const char* const kLanguageToName[]; michael@0: extern const int kLanguageToCodeSize; michael@0: extern const char* const kLanguageToCode[]; michael@0: extern const int kLanguageToCNameSize; michael@0: extern const char* const kLanguageToCName[]; michael@0: extern const int kLanguageToScriptsSize; michael@0: extern const FourScripts kLanguageToScripts[]; michael@0: michael@0: // Subscripted by Language michael@0: extern const int kLanguageToPLangSize; michael@0: extern const uint8 kLanguageToPLang[]; michael@0: // Subscripted by per-script language michael@0: extern const uint16 
kPLangToLanguageLatn[]; michael@0: extern const uint16 kPLangToLanguageOthr[]; michael@0: michael@0: // Alphabetical order for binary search michael@0: extern const int kNameToLanguageSize; michael@0: extern const CharIntPair kNameToLanguage[]; michael@0: extern const int kCodeToLanguageSize; michael@0: extern const CharIntPair kCodeToLanguage[]; michael@0: michael@0: // ULScript tables michael@0: // Subscripted by enum ULScript michael@0: extern const int kULScriptToNameSize; michael@0: extern const char* const kULScriptToName[]; michael@0: extern const int kULScriptToCodeSize; michael@0: extern const char* const kULScriptToCode[]; michael@0: extern const int kULScriptToCNameSize; michael@0: extern const char* const kULScriptToCName[]; michael@0: extern const int kULScriptToRtypeSize; michael@0: extern const ULScriptRType kULScriptToRtype[]; michael@0: extern const int kULScriptToDefaultLangSize; michael@0: extern const Language kULScriptToDefaultLang[]; michael@0: michael@0: // Alphabetical order for binary search michael@0: extern const int kNameToULScriptSize; michael@0: extern const CharIntPair kNameToULScript[]; michael@0: extern const int kCodeToULScriptSize; michael@0: extern const CharIntPair kCodeToULScript[]; michael@0: michael@0: michael@0: // michael@0: // File: lang_script.h michael@0: // ================ michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // michael@0: // This file declares language and script numbers and names for CLD2 michael@0: // michael@0: michael@0: michael@0: // NOTE: The script numbers and language numbers here are not guaranteed to be michael@0: // stable. If you want to record a result for posterity, save the ISO codes michael@0: // as character strings. michael@0: // michael@0: // michael@0: // The Unicode scripts recognized by CLD2 are numbered almost arbitrarily, michael@0: // specified in an enum. Each script has human-readable script name and a michael@0: // 4-letter ISO 15924 script code. 
// Each has a C name (largely for use by
// programs that generate declarations in cld2_generated_scripts.h). Each
// also has a recognition type
//  r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK
//
// The declarations for a particular version of Unicode are machine-generated in
//   cld2_generated_scripts.h
//
// This file includes that one and declares the access routines. The type
// involved is called "ULScript" to signify Unicode Letters-Marks Scripts,
// which are not quite Unicode Scripts. In particular, the CJK scripts are
// merged into a single number because CLD2 recognizes the CJK languages from
// four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and
// Katakana.

// Each script has one of these four recognition types.
// RTypeNone: There is no language associated with this script. In extended
//  language recognition calls, return a fake language number that maps to
//  xx-Cham, with literally "xx" for the language code, and with the script
//  code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE.
// RTypeOne: The script maps 1:1 to a single language. No letters are examined
//  during recognition and no lookups done.
// RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring
//  is done to determine the languages involved.
// RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the
//  languages involved.
//
// Note that the choice of recognition type is a function of script, not
// language. In particular, some languages are recognized in multiple scripts
// and those have different recognition types (Mongolian mn-Latn vs. mn-Mong
// for example).
michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Functions of ULScript // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: // If the input is out of range or otherwise unrecognized, it is treated michael@0: // as UNKNOWN_ULSCRIPT (which never participates in language recognition) michael@0: const char* ULScriptName(ULScript ulscript) { michael@0: int i_ulscript = ulscript; michael@0: if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} michael@0: if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} michael@0: return kULScriptToName[i_ulscript]; michael@0: } michael@0: michael@0: const char* ULScriptCode(ULScript ulscript) { michael@0: int i_ulscript = ulscript; michael@0: if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} michael@0: if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} michael@0: return kULScriptToCode[i_ulscript]; michael@0: } michael@0: michael@0: const char* ULScriptDeclaredName(ULScript ulscript) { michael@0: int i_ulscript = ulscript; michael@0: if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} michael@0: if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} michael@0: return kULScriptToCName[i_ulscript]; michael@0: } michael@0: michael@0: ULScriptRType ULScriptRecognitionType(ULScript ulscript) { michael@0: int i_ulscript = ulscript; michael@0: if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} michael@0: if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} michael@0: return kULScriptToRtype[i_ulscript]; michael@0: } michael@0: michael@0: michael@0: michael@0: // The languages recognized by CLD2 are numbered almost arbitrarily, michael@0: // specified in an enum. Each language has human-readable language name and a michael@0: // 2- or 3-letter ISO 639 language code. 
// Each has a C name (largely for use by
// programs that generate declarations in cld2_generated_languages.h).
// Each has a list of up to four scripts in which it is currently recognized.
//
// The declarations for a particular set of recognized languages are
// machine-generated in
//   cld2_generated_languages.h
//
// The Language enum is intended to match the internal Google Language enum
// in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional
// languages assigned above that. Over time, some languages may be renumbered
// if they are moved into the Language enum.
//
// The Language enum includes the fake language numbers for RTypeNone above.
//
// In an open-source environment, the Google-specific Language enum is not
// available. Language decouples the two environments while maintaining
// internal compatibility.


// If the input is out of range or otherwise unrecognized, it is treated
// as UNKNOWN_LANGUAGE
//
// LanguageCode
// ------------
// Given the Language, return the language code, e.g. "ko"
// This is determined by
// the following (in order of preference):
//  - ISO-639-1 two-letter language code
//    (all except those mentioned below)
//  - ISO-639-2 three-letter bibliographic language code
//    (Tibetan, Dhivehi, Cherokee, Syriac)
//  - Google-specific language code
//    (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
//    Portuguese-Portugal, Portuguese-Brazil, Limbu)
//  - Fake RTypeNone names.
michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Functions of Language // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: const char* LanguageName(Language lang) { michael@0: int i_lang = lang; michael@0: if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} michael@0: if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} michael@0: return kLanguageToName[i_lang]; michael@0: } michael@0: const char* LanguageCode(Language lang) { michael@0: int i_lang = lang; michael@0: if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} michael@0: if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} michael@0: return kLanguageToCode[i_lang]; michael@0: } michael@0: michael@0: const char* LanguageDeclaredName(Language lang) { michael@0: int i_lang = lang; michael@0: if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} michael@0: if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} michael@0: return kLanguageToCName[i_lang]; michael@0: } michael@0: michael@0: // n is in 0..3. Trailing entries are filled with michael@0: // UNKNOWN_LANGUAGE (which never participates in language recognition) michael@0: ULScript LanguageRecognizedScript(Language lang, int n) { michael@0: int i_lang = lang; michael@0: if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} michael@0: if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} michael@0: return static_cast(kLanguageToScripts[i_lang][n]); michael@0: } michael@0: michael@0: // Given the Language, returns its string name used as the output by michael@0: // the lang/enc identifier, e.g. "Korean" michael@0: // "invalid_language" if the input is invalid. 
michael@0: // TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language, michael@0: // used to subtract out HTML, link farms, DNA strings, and alittle English porn michael@0: const char* ExtLanguageName(const Language lang) { michael@0: return LanguageName(lang); michael@0: } michael@0: michael@0: // Given the Language, return the language code, e.g. "ko" michael@0: const char* ExtLanguageCode(const Language lang) { michael@0: return LanguageCode(lang); michael@0: } michael@0: michael@0: michael@0: // Given the Language, returns its Language enum spelling, for use by michael@0: // programs that create C declarations, e.g. "KOREAN" michael@0: // "UNKNOWN_LANGUAGE" if the input is invalid. michael@0: const char* ExtLanguageDeclaredName(const Language lang) { michael@0: return LanguageDeclaredName(lang); michael@0: } michael@0: michael@0: michael@0: extern const int kCloseSetSize = 10; michael@0: michael@0: // Returns which set of statistically-close languages lang is in. 0 means none. 
michael@0: int LanguageCloseSet(Language lang) { michael@0: // Scaffolding michael@0: // id ms # INDONESIAN MALAY coef=0.4698 Problematic w/o extra words michael@0: // bo dz # TIBETAN DZONGKHA coef=0.4571 michael@0: // cs sk # CZECH SLOVAK coef=0.4273 michael@0: // zu xh # ZULU XHOSA coef=0.3716 michael@0: // michael@0: // bs hr sr srm # BOSNIAN CROATIAN SERBIAN MONTENEGRIN michael@0: // hi mr bh ne # HINDI MARATHI BIHARI NEPALI michael@0: // no nn da # NORWEGIAN NORWEGIAN_N DANISH michael@0: // gl es pt # GALICIAN SPANISH PORTUGUESE michael@0: // rw rn # KINYARWANDA RUNDI michael@0: michael@0: if (lang == INDONESIAN) {return 1;} michael@0: if (lang == MALAY) {return 1;} michael@0: michael@0: if (lang == TIBETAN) {return 2;} michael@0: if (lang == DZONGKHA) {return 2;} michael@0: michael@0: if (lang == CZECH) {return 3;} michael@0: if (lang == SLOVAK) {return 3;} michael@0: michael@0: if (lang == ZULU) {return 4;} michael@0: if (lang == XHOSA) {return 4;} michael@0: michael@0: if (lang == BOSNIAN) {return 5;} michael@0: if (lang == CROATIAN) {return 5;} michael@0: if (lang == SERBIAN) {return 5;} michael@0: if (lang == MONTENEGRIN) {return 5;} michael@0: michael@0: if (lang == HINDI) {return 6;} michael@0: if (lang == MARATHI) {return 6;} michael@0: if (lang == BIHARI) {return 6;} michael@0: if (lang == NEPALI) {return 6;} michael@0: michael@0: if (lang == NORWEGIAN) {return 7;} michael@0: if (lang == NORWEGIAN_N) {return 7;} michael@0: if (lang == DANISH) {return 7;} michael@0: michael@0: if (lang == GALICIAN) {return 8;} michael@0: if (lang == SPANISH) {return 8;} michael@0: if (lang == PORTUGUESE) {return 8;} michael@0: michael@0: if (lang == KINYARWANDA) {return 9;} michael@0: if (lang == RUNDI) {return 9;} michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Functions of ULScript and Language // michael@0: 
//----------------------------------------------------------------------------// michael@0: michael@0: Language DefaultLanguage(ULScript ulscript) { michael@0: if (ulscript < 0) {return UNKNOWN_LANGUAGE;} michael@0: if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;} michael@0: return kULScriptToDefaultLang[ulscript]; michael@0: } michael@0: michael@0: uint8 PerScriptNumber(ULScript ulscript, Language lang) { michael@0: if (ulscript < 0) {return 0;} michael@0: if (ulscript >= NUM_ULSCRIPTS) {return 0;} michael@0: if (kULScriptToRtype[ulscript] == RTypeNone) {return 1;} michael@0: if (lang >= kLanguageToPLangSize) {return 0;} michael@0: return kLanguageToPLang[lang]; michael@0: } michael@0: michael@0: Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number) { michael@0: if (ulscript < 0) {return UNKNOWN_LANGUAGE;} michael@0: if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;} michael@0: if ((kULScriptToRtype[ulscript] == RTypeNone) || michael@0: (kULScriptToRtype[ulscript] == RTypeOne)) { michael@0: return kULScriptToDefaultLang[ulscript]; michael@0: } michael@0: michael@0: if (ulscript == ULScript_Latin) { michael@0: return static_cast(kPLangToLanguageLatn[perscript_number]); michael@0: } else { michael@0: return static_cast(kPLangToLanguageOthr[perscript_number]); michael@0: } michael@0: } michael@0: michael@0: // Return true if language can be in the Latin script michael@0: bool IsLatnLanguage(Language lang) { michael@0: if (lang >= kLanguageToPLangSize) {return false;} michael@0: return (lang == kPLangToLanguageLatn[kLanguageToPLang[lang]]); michael@0: } michael@0: michael@0: // Return true if language can be in a non-Latin script michael@0: bool IsOthrLanguage(Language lang) { michael@0: if (lang >= kLanguageToPLangSize) {return false;} michael@0: return (lang == kPLangToLanguageOthr[kLanguageToPLang[lang]]); michael@0: } michael@0: michael@0: michael@0: //----------------------------------------------------------------------------// 
michael@0: // Other // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: // Returns mid if key found in lo <= mid < hi, else -1 michael@0: int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) { michael@0: // binary search michael@0: while (lo < hi) { michael@0: int mid = (lo + hi) >> 1; michael@0: if (strcmp(key, cipair[mid].s) < 0) { michael@0: hi = mid; michael@0: } else if (strcmp(key, cipair[mid].s) > 0) { michael@0: lo = mid + 1; michael@0: } else { michael@0: return mid; michael@0: } michael@0: } michael@0: return -1; michael@0: } michael@0: michael@0: Language MakeLang(int i) {return static_cast(i);} michael@0: michael@0: // Name can be either full name or ISO code, or can be ISO code embedded in michael@0: // a language-script combination such as "ABKHAZIAN", "en", "en-Latn-GB" michael@0: Language GetLanguageFromName(const char* src) { michael@0: const char* hyphen1 = strchr(src, '-'); michael@0: const char* hyphen2 = NULL; michael@0: if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');} michael@0: michael@0: int match = -1; michael@0: if (hyphen1 == NULL) { michael@0: // Bare name. Look at full name, then code michael@0: match = BinarySearch(src, 0, kNameToLanguageSize, kNameToLanguage); michael@0: if (match >= 0) {return MakeLang(kNameToLanguage[match].i);} // aa michael@0: match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage); michael@0: if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa michael@0: return UNKNOWN_LANGUAGE; michael@0: } michael@0: michael@0: if (hyphen2 == NULL) { michael@0: // aa-bb. Not a full name; must be code-something. 
Try zh-TW then bare zh michael@0: match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage); michael@0: if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb michael@0: michael@0: int len = strlen(src); michael@0: if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter michael@0: michael@0: char temp[16]; michael@0: int hyphen1_offset = hyphen1 - src; michael@0: // Take off part after hyphen1 michael@0: memcpy(temp, src, len); michael@0: temp[hyphen1_offset] = '\0'; michael@0: match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); michael@0: if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa michael@0: michael@0: return UNKNOWN_LANGUAGE; michael@0: } michael@0: michael@0: // aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en michael@0: match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage); michael@0: if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb-cc michael@0: michael@0: michael@0: int len = strlen(src); michael@0: if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter michael@0: michael@0: char temp[16]; michael@0: int hyphen1_offset = hyphen1 - src; michael@0: int hyphen2_offset = hyphen2 - src; michael@0: // Take off part after hyphen2 michael@0: memcpy(temp, src, len); michael@0: temp[hyphen2_offset] = '\0'; michael@0: match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); michael@0: if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb michael@0: michael@0: michael@0: // Take off part between hyphen1 and hyphen2 michael@0: int len2 = len - hyphen2_offset; michael@0: memcpy(temp, src, len); michael@0: memcpy(&temp[hyphen1_offset], hyphen2, len2); michael@0: temp[hyphen1_offset + len2] = '\0'; michael@0: match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); michael@0: if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-cc michael@0: michael@0: michael@0: // Take off 
everything after hyphen1 michael@0: memcpy(temp, src, len); michael@0: temp[hyphen1_offset] = '\0'; michael@0: match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); michael@0: if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa michael@0: michael@0: michael@0: return UNKNOWN_LANGUAGE; michael@0: } michael@0: michael@0: michael@0: // Name can be either full name or ISO code, or can be ISO code embedded in michael@0: // a language-script combination such as "en-Latn-GB" michael@0: // MORE WORK to do here. also kLanguageToScripts [4] is bogus michael@0: // if bare language name, no script, want zh, ja, ko to Hani, pt to Latn, etc. michael@0: // Something like map code to Language, then Language to kLanguageToScripts[x][0] michael@0: // ADD BIAS: kLanguageToScripts lists default script first michael@0: // If total mismatch, reutrn Latn michael@0: // if (strcmp(src, "nd") == 0) {return NDEBELE;} // [nd was wrong] michael@0: // if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;} michael@0: michael@0: ULScript MakeULScr(int i) {return static_cast(i);} michael@0: michael@0: ULScript GetULScriptFromName(const char* src) { michael@0: const char* hyphen1 = strchr(src, '-'); michael@0: const char* hyphen2 = NULL; michael@0: if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');} michael@0: michael@0: int match = -1; michael@0: if (hyphen1 == NULL) { michael@0: // Bare name. 
Look at full name, then code, then try backmapping as Language michael@0: match = BinarySearch(src, 0, kNameToULScriptSize, kNameToULScript); michael@0: if (match >= 0) {return MakeULScr(kNameToULScript[match].i);} // aa michael@0: match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript); michael@0: if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa michael@0: michael@0: Language backmap_me = GetLanguageFromName(src); michael@0: if (backmap_me != UNKNOWN_LANGUAGE) { michael@0: return static_cast(kLanguageToScripts[backmap_me][0]); michael@0: } michael@0: return ULScript_Latin; michael@0: } michael@0: michael@0: if (hyphen2 == NULL) { michael@0: // aa-bb. Not a full name; must be code-something. Try en-Latn, bare Latn michael@0: if (strcmp(src, "zh-TW") == 0) {return ULScript_Hani;} michael@0: if (strcmp(src, "zh-CN") == 0) {return ULScript_Hani;} michael@0: if (strcmp(src, "sit-NP") == 0) {return ULScript_Limbu;} michael@0: if (strcmp(src, "sit-Limb") == 0) {return ULScript_Limbu;} michael@0: if (strcmp(src, "sr-ME") == 0) {return ULScript_Latin;} michael@0: match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript); michael@0: if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb michael@0: michael@0: int len = strlen(src); michael@0: if (len >= 16) {return ULScript_Latin;} // Real codes are shorter michael@0: michael@0: char temp[16]; michael@0: int hyphen1_offset = hyphen1 - src; michael@0: int len1 = len - hyphen1_offset - 1; // Exclude the hyphen michael@0: // Take off part before hyphen1 michael@0: memcpy(temp, hyphen1 + 1, len1); michael@0: temp[len1] = '\0'; michael@0: match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); michael@0: if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb michael@0: michael@0: // Take off part after hyphen1 michael@0: memcpy(temp, src, len); michael@0: temp[hyphen1_offset] = '\0'; michael@0: match = BinarySearch(temp, 0, kCodeToULScriptSize, 
kCodeToULScript); michael@0: if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa michael@0: michael@0: return ULScript_Latin; michael@0: } michael@0: michael@0: // aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en michael@0: if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;} michael@0: if (strcmp(src, "sr-ME-Latn") == 0) {return ULScript_Latin;} michael@0: if (strcmp(src, "sr-ME-Cyrl") == 0) {return ULScript_Cyrillic;} michael@0: match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript); michael@0: if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb-cc michael@0: michael@0: int len = strlen(src); michael@0: if (len >= 16) {return ULScript_Latin;} // Real codes are shorter michael@0: michael@0: char temp[16]; michael@0: int hyphen1_offset = hyphen1 - src; michael@0: int hyphen2_offset = hyphen2 - src; michael@0: int len2 = len - hyphen2_offset - 1; // Exclude the hyphen michael@0: int lenmid = hyphen2_offset - hyphen1_offset - 1; // Exclude the hyphen michael@0: // Keep part between hyphen1 and hyphen2 michael@0: memcpy(temp, hyphen1 + 1, lenmid); michael@0: temp[lenmid] = '\0'; michael@0: match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); michael@0: if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb michael@0: michael@0: // Keep part after hyphen2 michael@0: memcpy(temp, hyphen2 + 1, len2); michael@0: temp[len2] = '\0'; michael@0: match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); michael@0: if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // cc michael@0: michael@0: // Keep part before hyphen1 michael@0: memcpy(temp, src, len); michael@0: temp[hyphen1_offset] = '\0'; michael@0: match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); michael@0: if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa michael@0: michael@0: return ULScript_Latin; michael@0: } michael@0: michael@0: // Map script into Latin, 
Cyrillic, Arabic, Other michael@0: int LScript4(ULScript ulscript) { michael@0: if (ulscript == ULScript_Latin) {return 0;} michael@0: if (ulscript == ULScript_Cyrillic) {return 1;} michael@0: if (ulscript == ULScript_Arabic) {return 2;} michael@0: return 3; michael@0: } michael@0: michael@0: } // namespace CLD2 michael@0: