browser/components/translation/cld2/internal/lang_script.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // File: lang_script.h
michael@0 17 // ================
michael@0 18 //
michael@0 19 // Author: dsites@google.com (Dick Sites)
michael@0 20 //
michael@0 21 // This file declares language and script numbers and names for CLD2,
michael@0 22 // plus routines that access side tables based on these
michael@0 23 //
michael@0 24
michael@0 25 #ifndef I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__
michael@0 26 #define I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__
michael@0 27
michael@0 28 #include "generated_language.h"
michael@0 29 #include "generated_ulscript.h"
michael@0 30 #include "integral_types.h"
michael@0 31
michael@0 32
michael@0 33 // NOTE: The script numbers and language numbers here are not guaranteed to be
michael@0 34 // stable. If you want to record a result for posterity, save the
michael@0 35 // ULScriptCode(ULScript ulscript) result as character strings.
michael@0 36 //
michael@0 37 // The Unicode scripts recognized by CLD2 are numbered almost arbitrarily,
michael@0 38 // specified in an enum. Each script has human-readable script name and a
michael@0 39 // 4-letter ISO 15924 script code. Each has a C name (largely for use by
michael@0 40 // programs that generate declarations in cld2_generated_scripts.h). Each
michael@0 41 // also has a recognition type
michael@0 42 // r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK
michael@0 43 //
michael@0 44 // The declarations for a particular version of Unicode are machine-generated in
michael@0 45 // generated_ulscript.h
michael@0 46 //
michael@0 47 // This file includes that one and declares the access routines. The type
michael@0 48 // involved is called "ULScript" to signify Unicode Letters-Marks Scripts,
michael@0 49 // which are not quite Unicode Scripts. In particular, the CJK scripts are
michael@0 50 // merged into a single number because CLD2 recognizes the CJK languages from
michael@0 51 // four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and
michael@0 52 // Katakana.
michael@0 53
michael@0 54 // Each script has one of these four recognition types.
michael@0 55 // RTypeNone: There is no language associated with this script. In extended
michael@0 56 // language recognition calls, return a fake language number that maps to
michael@0 57 // xx-Cham, with literally "xx" for the language code, and with the script
michael@0 58 // code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE.
michael@0 59 // RTypeOne: The script maps 1:1 to a single language. No letters are examined
michael@0 60 // during recognition and no lookups done.
michael@0 61 // RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring
michael@0 62 // is done to determine the languages involved.
michael@0 63 // RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the
michael@0 64 // languages involved.
michael@0 65 //
michael@0 66 // Note that the choice of recognition type is a function of script, not
michael@0 67 // language. In particular, some languages are recognized in multiple scripts
michael@0 68 // and those have different recognition types (Mongolian mn-Latn vs. mn-Mong
michael@0 69 // for example).
michael@0 70
michael@0 71 namespace CLD2 {
michael@0 72
michael@0 73 //----------------------------------------------------------------------------//
michael@0 74 // Functions of ULScript //
michael@0 75 //----------------------------------------------------------------------------//
michael@0 76
michael@0 77 // If the input is out of range or otherwise unrecognized, it is treated
michael@0 78 // as ULScript_Common (which never participates in language recognition).
michael@0 79 const char* ULScriptName(ULScript ulscript);  // presumably the human-readable script name
michael@0 80 const char* ULScriptCode(ULScript ulscript);  // script code as a string; presumably the 4-letter ISO 15924 code
michael@0 81 const char* ULScriptDeclaredName(ULScript ulscript);  // presumably the script's C name
michael@0 82 ULScriptRType ULScriptRecognitionType(ULScript ulscript);  // recognition type of the script
michael@0 83
michael@0 84 // src can be a full name, an ISO code, or an ISO code embedded in
michael@0 85 // a language-script combination such as "en-Latn-GB".
michael@0 86 ULScript GetULScriptFromName(const char* src);  // NOTE(review): result for an unrecognized src is not stated here -- confirm
michael@0 87
michael@0 88 // Map a script into one of four classes: Latin, Cyrillic, Arabic, Other.
michael@0 89 int LScript4(ULScript ulscript);  // NOTE(review): the int value used for each class is not stated here -- confirm
michael@0 90
michael@0 91 //----------------------------------------------------------------------------//
michael@0 92 // Functions of Language //
michael@0 93 //----------------------------------------------------------------------------//
michael@0 94
michael@0 95 // The languages recognized by CLD2 are numbered almost arbitrarily,
michael@0 96 // specified in an enum. Each language has human-readable language name and a
michael@0 97 // 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by
michael@0 98 // programs that generate declarations in cld2_generated_languages.h).
michael@0 99 // Each has a list of up to four scripts in which it is currently recognized.
michael@0 100 //
michael@0 101 // The declarations for a particular set of recognized languages are
michael@0 102 // machine-generated in
michael@0 103 // generated_language.h
michael@0 104 //
michael@0 105 // The Language enum is intended to match the internal Google Language enum
michael@0 106 // in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional
michael@0 107 // languages assigned above that. Over time, some languages may be renumbered
michael@0 108 // if they are moved into the Language enum.
michael@0 109 //
michael@0 110 // The Language enum includes the fake language numbers for RTypeNone above.
michael@0 111 //
michael@0 112
michael@0 113
michael@0 114 // If the input is out of range or otherwise unrecognized, it is treated
michael@0 115 // as UNKNOWN_LANGUAGE
michael@0 116 //
michael@0 117 // LanguageCode
michael@0 118 // ------------
michael@0 119 // Given the Language, return the language code, e.g. "ko"
michael@0 120 // This is determined by
michael@0 121 // the following (in order of preference):
michael@0 122 // - ISO-639-1 two-letter language code
michael@0 123 // (all except those mentioned below)
michael@0 124 // - ISO-639-2 three-letter bibliographic language code
michael@0 125 // (Tibetan, Dhivehi, Cherokee, Syriac)
michael@0 126 // - Google-specific language code
michael@0 127 // (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
michael@0 128 // Portuguese-Portugal, Portuguese-Brazil, Limbu)
michael@0 129 // - Fake RTypeNone names.
michael@0 130
michael@0 131 const char* LanguageName(Language lang);  // presumably the human-readable language name
michael@0 132 const char* LanguageCode(Language lang);  // language code, e.g. "ko"
michael@0 133 const char* LanguageShortCode(Language lang);  // NOTE(review): how this differs from LanguageCode is not stated here -- confirm
michael@0 134 const char* LanguageDeclaredName(Language lang);  // presumably the language's C name
michael@0 135
michael@0 136 // Presumably returns the n-th script in which lang is recognized; n is in 0..3.
michael@0 137 // Trailing entries are filled with ULScript_Common (which never participates in language recognition).
michael@0 138 ULScript LanguageRecognizedScript(Language lang, int n);  // NOTE(review): behavior for n outside 0..3 is not stated here -- confirm
michael@0 139
michael@0 140 // src can be a full name, an ISO code, or an ISO code embedded in
michael@0 141 // a language-script combination such as "en-Latn-GB".
michael@0 142 Language GetLanguageFromName(const char* src);  // NOTE(review): result for an unrecognized src is not stated here -- confirm
michael@0 143
michael@0 144 // Returns the number of the set of statistically-close languages that lang is in; 0 means it is in none.
michael@0 145 int LanguageCloseSet(Language lang);
michael@0 146
michael@0 147 //----------------------------------------------------------------------------//
michael@0 148 // Functions of ULScript and Language //
michael@0 149 //----------------------------------------------------------------------------//
michael@0 150
michael@0 151 // Returns the most common language for the given script.
michael@0 152 Language DefaultLanguage(ULScript ulscript);  // NOTE(review): result for scripts with no associated language (RTypeNone) is not stated here -- confirm
michael@0 153
michael@0 154 // For RTypeMany recognition,
michael@0 155 // the CLD2 lookup tables are kept small by encoding a language into one byte.
michael@0 156 // To avoid limiting CLD2 to at most 256 languages, a larger range of external
michael@0 157 // Language numbers is mapped to a smaller range of per-script numbers. At
michael@0 158 // the moment (January 2013) the Latin script has about 90 languages to be
michael@0 159 // recognized, while all the other scripts total about 50 more languages. In
michael@0 160 // addition, the RTypeNone scripts map to about 100 fake languages.
michael@0 161 // So we map all Latin-script languages to one range of 1..255 per-script
michael@0 162 // numbers and map all the other RTypeMany languages to an overlapping range
michael@0 163 // 1..255 of per-script numbers.
michael@0 164
michael@0 165 uint8 PerScriptNumber(ULScript ulscript, Language lang);  // external Language number -> one-byte per-script number
michael@0 166 Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number);  // one-byte per-script number -> external Language number
michael@0 167
michael@0 168 // While the speed-sensitive processing deals with per-script language numbers,
michael@0 169 // there is a need for low-performance dealing with original language numbers
michael@0 170 // and unknown scripts, mostly for processing language hints.
michael@0 171 // These routines let one derive a script class from a bare language.
michael@0 172 // For languages written in multiple scripts, both of these can return true.
michael@0 173
michael@0 174 bool IsLatnLanguage(Language lang);  // presumably true if lang is one of the Latin-script languages -- confirm
michael@0 175 bool IsOthrLanguage(Language lang);  // presumably true if lang is one of the other (non-Latin-script) languages -- confirm
michael@0 176
michael@0 177
michael@0 178 //----------------------------------------------------------------------------//
michael@0 179 // Other //
michael@0 180 //----------------------------------------------------------------------------//
michael@0 181
michael@0 182 // Utility routine to search an alphabetical table (cipair) for key.
michael@0 183 int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair);  // NOTE(review): whether hi is inclusive, and the result when key is absent, are not stated here -- confirm
michael@0 184
michael@0 185 } // namespace CLD2
michael@0 186
michael@0 187 #endif // I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__

mercurial