michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // File: lang_script.h michael@0: // ================ michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // michael@0: // This file declares language and script numbers and names for CLD2, michael@0: // plus routines that access side tables based on these michael@0: // michael@0: michael@0: #ifndef I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__ michael@0: #define I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__ michael@0: michael@0: #include "generated_language.h" michael@0: #include "generated_ulscript.h" michael@0: #include "integral_types.h" michael@0: michael@0: michael@0: // NOTE: The script numbers and language numbers here are not guaranteed to be michael@0: // stable. If you want to record a result for posterity, save the michael@0: // ULScriptCode(ULScript ulscript) result as character strings. michael@0: // michael@0: // The Unicode scripts recognized by CLD2 are numbered almost arbitrarily, michael@0: // specified in an enum. Each script has human-readable script name and a michael@0: // 4-letter ISO 15924 script code. Each has a C name (largely for use by michael@0: // programs that generate declarations in cld2_generated_scripts.h). Each michael@0: // also has a recognition type michael@0: // r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK michael@0: // michael@0: // The declarations for a particular version of Unicode are machine-generated in michael@0: // generated_scripts.h michael@0: // michael@0: // This file includes that one and declares the access routines. The type michael@0: // involved is called "ULScript" to signify Unicode Letters-Marks Scripts, michael@0: // which are not quite Unicode Scripts. In particular, the CJK scripts are michael@0: // merged into a single number because CLD2 recognizes the CJK languages from michael@0: // four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and michael@0: // Katakana. michael@0: michael@0: // Each script has one of these four recognition types. michael@0: // RTypeNone: There is no language associated with this script. In extended michael@0: // language recognition calls, return a fake language number that maps to michael@0: // xx-Cham, with literally "xx" for the language code,and with the script michael@0: // code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE. michael@0: // RTypeOne: The script maps 1:1 to a single language. No letters are examined michael@0: // during recognition and no lookups done. michael@0: // RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring michael@0: // is done to determine the languages involved. michael@0: // RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the michael@0: // languages involved. michael@0: // michael@0: // Note that the choice of recognition type is a function of script, not michael@0: // language. In particular, some languges are recognized in multiple scripts michael@0: // and those have different recognition types (Mongolian mn-Latn vs. mn-Mong michael@0: // for example). michael@0: michael@0: namespace CLD2 { michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Functions of ULScript // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: // If the input is out of range or otherwise unrecognized, it is treated michael@0: // as ULScript_Common (which never participates in language recognition) michael@0: const char* ULScriptName(ULScript ulscript); michael@0: const char* ULScriptCode(ULScript ulscript); michael@0: const char* ULScriptDeclaredName(ULScript ulscript); michael@0: ULScriptRType ULScriptRecognitionType(ULScript ulscript); michael@0: michael@0: // Name can be either full name or ISO code, or can be ISO code embedded in michael@0: // a language-script combination such as "en-Latn-GB" michael@0: ULScript GetULScriptFromName(const char* src); michael@0: michael@0: // Map script into Latin, Cyrillic, Arabic, Other michael@0: int LScript4(ULScript ulscript); michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Functions of Language // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: // The languages recognized by CLD2 are numbered almost arbitrarily, michael@0: // specified in an enum. Each language has human-readable language name and a michael@0: // 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by michael@0: // programs that generate declarations in cld2_generated_languagess.h). michael@0: // Each has a list of up to four scripts in which it is currently recognized. michael@0: // michael@0: // The declarations for a particular set of recognized languages are michael@0: // machine-generated in michael@0: // generated_languages.h michael@0: // michael@0: // The Language enum is intended to match the internal Google Language enum michael@0: // in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional michael@0: // languages assigned above that. Over time, some languages may be renumbered michael@0: // if they are moved into the Language enum. michael@0: // michael@0: // The Language enum includes the fake language numbers for RTypeNone above. michael@0: // michael@0: michael@0: michael@0: // If the input is out of range or otherwise unrecognized, it is treated michael@0: // as UNKNOWN_LANGUAGE michael@0: // michael@0: // LanguageCode michael@0: // ------------ michael@0: // Given the Language, return the language code, e.g. "ko" michael@0: // This is determined by michael@0: // the following (in order of preference): michael@0: // - ISO-639-1 two-letter language code michael@0: // (all except those mentioned below) michael@0: // - ISO-639-2 three-letter bibliographic language code michael@0: // (Tibetan, Dhivehi, Cherokee, Syriac) michael@0: // - Google-specific language code michael@0: // (ChineseT ("zh-TW"), Teragram Unknown, Unknown, michael@0: // Portuguese-Portugal, Portuguese-Brazil, Limbu) michael@0: // - Fake RTypeNone names. michael@0: michael@0: const char* LanguageName(Language lang); michael@0: const char* LanguageCode(Language lang); michael@0: const char* LanguageShortCode(Language lang); michael@0: const char* LanguageDeclaredName(Language lang); michael@0: michael@0: // n is in 0..3. Trailing entries are filled with michael@0: // ULScript_Common (which never participates in language recognition) michael@0: ULScript LanguageRecognizedScript(Language lang, int n); michael@0: michael@0: // Name can be either full name or ISO code, or can be ISO code embedded in michael@0: // a language-script combination such as "en-Latn-GB" michael@0: Language GetLanguageFromName(const char* src); michael@0: michael@0: // Returns which set of statistically-close languages lang is in. 0 means none. michael@0: int LanguageCloseSet(Language lang); michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Functions of ULScript and Language // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: // Most common language in each script michael@0: Language DefaultLanguage(ULScript ulscript); michael@0: michael@0: // For RTypeMany recognition, michael@0: // the CLD2 lookup tables are kept small by encoding a language into one byte. michael@0: // To avoid limiting CLD2 to at most 256 languages, a larger range of external michael@0: // Language numbers is mapped to a smaller range of per-script numbers. At michael@0: // the moment (January 2013) the Latin script has about 90 languages to be michael@0: // recognized, while all the other scripts total about 50 more languages. In michael@0: // addition, the RTypeNone scripts map to about 100 fake languages. michael@0: // So we map all Latin-script languages to one range of 1..255 per-script michael@0: // numbers and map all the other RTypeMany languages to an overlapping range michael@0: // 1..255 of per-script numbers. michael@0: michael@0: uint8 PerScriptNumber(ULScript ulscript, Language lang); michael@0: Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number); michael@0: michael@0: // While the speed-sensitive processing deals with per-script language numbers, michael@0: // there is a need for low-performance dealing with original language numbers michael@0: // and unknown scripts, mostly for processing language hints. michael@0: // These routines let one derive a script class from a bare language. michael@0: // For languages written in multiple scripts, both of these can return true. michael@0: michael@0: bool IsLatnLanguage(Language lang); michael@0: bool IsOthrLanguage(Language lang); michael@0: michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Other // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: // Utility routine to search alphabetical tables michael@0: int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair); michael@0: michael@0: } // namespace CLD2 michael@0: michael@0: #endif // I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__