The Tor Browser: browser/components/translation/cld2/internal/lang

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.

     2 //

     3 // Licensed under the Apache License, Version 2.0 (the "License");

     4 // you may not use this file except in compliance with the License.

     5 // You may obtain a copy of the License at

     6 //

     7 //     http://www.apache.org/licenses/LICENSE-2.0

     8 //

     9 // Unless required by applicable law or agreed to in writing, software

    10 // distributed under the License is distributed on an "AS IS" BASIS,

    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

    12 // See the License for the specific language governing permissions and

    13 // limitations under the License.

    15 //

    16 // File: lang_script.h

    17 // ================

    18 //

    19 // Author: dsites@google.com (Dick Sites)

    20 //

    21 // This file declares language and script numbers and names for CLD2,

    22 // plus routines that access side tables based on these

    23 //

    25 #ifndef I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__

    26 #define I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__

    28 #include "generated_language.h"

    29 #include "generated_ulscript.h"

    30 #include "integral_types.h"

    33 // NOTE: The script numbers and language numbers here are not guaranteed to be

    34 // stable. If you want to record a result for posterity, save the

    35 // ULScriptCode(ULScript ulscript) result as character strings.

    36 //

    37 // The Unicode scripts recognized by CLD2 are numbered almost arbitrarily,

    38 // specified in an enum. Each script has human-readable script name and a

    39 // 4-letter ISO 15924 script code. Each has a C name (largely for use by

    40 // programs that generate declarations in cld2_generated_scripts.h). Each

    41 // also has a recognition type

    42 //  r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK

    43 //

    44 // The declarations for a particular version of Unicode are machine-generated in

    45 //   generated_ulscript.h

    46 //

    47 // This file includes that one and declares the access routines. The type

    48 // involved is called "ULScript" to signify Unicode Letters-Marks Scripts,

    49 // which are not quite Unicode Scripts. In particular, the CJK scripts are

    50 // merged into a single number because CLD2 recognizes the CJK languages from

    51 // four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and

    52 // Katakana.

    54 // Each script has one of these four recognition types.

    55 // RTypeNone: There is no language associated with this script. In extended

    56 //  language recognition calls, return a fake language number that maps to

    57 //  xx-Cham, with literally "xx" for the language code, and with the script

    58 //  code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE.

    59 // RTypeOne: The script maps 1:1 to a single language. No letters are examined

    60 //  during recognition and no lookups done.

    61 // RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring

    62 //  is done to determine the languages involved.

    63 // RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the

    64 //  languages involved.

    65 //

    66 // Note that the choice of recognition type is a function of script, not

    67 // language. In particular, some languages are recognized in multiple scripts

    68 // and those have different recognition types (Mongolian mn-Latn vs. mn-Mong

    69 // for example).

    71 namespace CLD2 {

    73 //----------------------------------------------------------------------------//

    74 // Functions of ULScript                                                      //

    75 //----------------------------------------------------------------------------//

    77 // If the input is out of range or otherwise unrecognized, it is treated

    78 // as ULScript_Common (which never participates in language recognition)

    79 const char* ULScriptName(ULScript ulscript);

    80 const char* ULScriptCode(ULScript ulscript);

    81 const char* ULScriptDeclaredName(ULScript ulscript);

    82 ULScriptRType ULScriptRecognitionType(ULScript ulscript);

    84 // Name can be either full name or ISO code, or can be ISO code embedded in

    85 // a language-script combination such as "en-Latn-GB"

    86 ULScript GetULScriptFromName(const char* src);

    88 // Map script into Latin, Cyrillic, Arabic, Other

    89 int LScript4(ULScript ulscript);

    91 //----------------------------------------------------------------------------//

    92 // Functions of Language                                                      //

    93 //----------------------------------------------------------------------------//

    95 // The languages recognized by CLD2 are numbered almost arbitrarily,

    96 // specified in an enum. Each language has human-readable language name and a

    97 // 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by

    98 // programs that generate declarations in cld2_generated_languages.h).

    99 // Each has a list of up to four scripts in which it is currently recognized.

   100 //

   101 // The declarations for a particular set of recognized languages are

   102 // machine-generated in

   103 //   generated_language.h

   104 //

   105 // The Language enum is intended to match the internal Google Language enum

   106 // in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional

   107 // languages assigned above that. Over time, some languages may be renumbered

   108 // if they are moved into the Language enum.

   109 //

   110 // The Language enum includes the fake language numbers for RTypeNone above.

   111 //

   114 // If the input is out of range or otherwise unrecognized, it is treated

   115 // as UNKNOWN_LANGUAGE

   116 //

   117 // LanguageCode

   118 // ------------

   119 // Given the Language, return the language code, e.g. "ko"

   120 // This is determined by

   121 // the following (in order of preference):

   122 // - ISO-639-1 two-letter language code

   123 //   (all except those mentioned below)

   124 // - ISO-639-2 three-letter bibliographic language code

   125 //   (Tibetan, Dhivehi, Cherokee, Syriac)

   126 // - Google-specific language code

   127 //   (ChineseT ("zh-TW"), Teragram Unknown, Unknown,

   128 //   Portuguese-Portugal, Portuguese-Brazil, Limbu)

   129 // - Fake RTypeNone names.

   131 const char* LanguageName(Language lang);

   132 const char* LanguageCode(Language lang);

   133 const char* LanguageShortCode(Language lang);

   134 const char* LanguageDeclaredName(Language lang);

   136 // n is in 0..3. Trailing entries are filled with

   137 // ULScript_Common (which never participates in language recognition)

   138 ULScript LanguageRecognizedScript(Language lang, int n);

   140 // Name can be either full name or ISO code, or can be ISO code embedded in

   141 // a language-script combination such as "en-Latn-GB"

   142 Language GetLanguageFromName(const char* src);

   144 // Returns which set of statistically-close languages lang is in. 0 means none.

   145 int LanguageCloseSet(Language lang);

   147 //----------------------------------------------------------------------------//

   148 // Functions of ULScript and Language                                         //

   149 //----------------------------------------------------------------------------//

   151 // Most common language in each script

   152 Language DefaultLanguage(ULScript ulscript);

   154 // For RTypeMany recognition,

   155 // the CLD2 lookup tables are kept small by encoding a language into one byte.

   156 // To avoid limiting CLD2 to at most 256 languages, a larger range of external

   157 // Language numbers is mapped to a smaller range of per-script numbers. At

   158 // the moment (January 2013) the Latin script has about 90 languages to be

   159 // recognized, while all the other scripts total about 50 more languages. In

   160 // addition, the RTypeNone scripts map to about 100 fake languages.

   161 // So we map all Latin-script languages to one range of 1..255 per-script

   162 // numbers and map all the other RTypeMany languages to an overlapping range

   163 // 1..255 of per-script numbers.

   165 uint8 PerScriptNumber(ULScript ulscript, Language lang);

   166 Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number);

   168 // While the speed-sensitive processing deals with per-script language numbers,

   169 // there is a need for low-performance dealing with original language numbers

   170 // and unknown scripts, mostly for processing language hints.

   171 // These routines let one derive a script class from a bare language.

   172 // For languages written in multiple scripts, both of these can return true.

   174 bool IsLatnLanguage(Language lang);

   175 bool IsOthrLanguage(Language lang);

   178 //----------------------------------------------------------------------------//

   179 // Other                                                                      //

   180 //----------------------------------------------------------------------------//

   182 // Utility routine to search alphabetical tables

   183 int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair);

   185 }  // namespace CLD2

   187 #endif  // I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__

The Tor Browser / file revision

browser/components/translation/cld2/internal/lang_script.h@6474c204b198

browser/components/translation/cld2/internal/lang_script.h