browser/components/translation/cld2/internal/lang_script.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/lang_script.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,187 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// File: lang_script.h
    1.20 +// ================
    1.21 +//
    1.22 +// Author: dsites@google.com (Dick Sites)
    1.23 +//
    1.24 +// This file declares language and script numbers and names for CLD2,
    1.25 +// plus routines that access side tables based on these
    1.26 +//
    1.27 +
    1.28 +#ifndef I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__
    1.29 +#define I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__
    1.30 +
    1.31 +#include "generated_language.h"
    1.32 +#include "generated_ulscript.h"
    1.33 +#include "integral_types.h"
    1.34 +
    1.35 +
    1.36 +// NOTE: The script numbers and language numbers here are not guaranteed to be
    1.37 +// stable. If you want to record a result for posterity, save the
    1.38 +// ULScriptCode(ULScript ulscript) result as character strings.
    1.39 +//
    1.40 +// The Unicode scripts recognized by CLD2 are numbered almost arbitrarily,
    1.41 +// specified in an enum. Each script has human-readable script name and a
    1.42 +// 4-letter ISO 15924 script code. Each has a C name (largely for use by
    1.43 +// programs that generate declarations in cld2_generated_scripts.h). Each
    1.44 +// also has a recognition type
    1.45 +//  r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK
    1.46 +//
    1.47 +// The declarations for a particular version of Unicode are machine-generated in
    1.48 +//   generated_ulscript.h
    1.49 +//
    1.50 +// This file includes that one and declares the access routines. The type
    1.51 +// involved is called "ULScript" to signify Unicode Letters-Marks Scripts,
    1.52 +// which are not quite Unicode Scripts. In particular, the CJK scripts are
    1.53 +// merged into a single number because CLD2 recognizes the CJK languages from
    1.54 +// four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and
    1.55 +// Katakana.
    1.56 +
    1.57 +// Each script has one of these four recognition types.
    1.58 +// RTypeNone: There is no language associated with this script. In extended
    1.59 +//  language recognition calls, return a fake language number that maps to
    1.60 +//  xx-Cham, with literally "xx" for the language code, and with the script
    1.61 +//  code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE.
    1.62 +// RTypeOne: The script maps 1:1 to a single language. No letters are examined
    1.63 +//  during recognition and no lookups done.
    1.64 +// RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring
    1.65 +//  is done to determine the languages involved.
    1.66 +// RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the
    1.67 +//  languages involved.
    1.68 +//
    1.69 +// Note that the choice of recognition type is a function of script, not
    1.70 +// language. In particular, some languages are recognized in multiple scripts
    1.71 +// and those have different recognition types (Mongolian mn-Latn vs. mn-Mong
    1.72 +// for example).
    1.73 +
    1.74 +namespace CLD2 {
    1.75 +
    1.76 +//----------------------------------------------------------------------------//
    1.77 +// Functions of ULScript                                                      //
    1.78 +//----------------------------------------------------------------------------//
    1.79 +
    1.80 +// Per-script accessors. If the input is out of range or otherwise unrecognized, it is
    1.81 +// treated as ULScript_Common (which never participates in language recognition).
    1.82 +const char* ULScriptName(ULScript ulscript);  // human-readable script name
    1.83 +const char* ULScriptCode(ULScript ulscript);  // 4-letter ISO 15924 script code
    1.84 +const char* ULScriptDeclaredName(ULScript ulscript);  // C name of the script (presumably the enum identifier; confirm in the .cc)
    1.85 +ULScriptRType ULScriptRecognitionType(ULScript ulscript);  // recognition type: RTypeNone, RTypeOne, RTypeMany or RTypeCJK
    1.86 +
    1.87 +// Looks up a script by name. src can be either the full script name or its ISO code,
    1.88 +// or an ISO code embedded in a language-script combination such as "en-Latn-GB".
    1.89 +ULScript GetULScriptFromName(const char* src);  // NOTE(review): result for an unmatched name is not stated in this header; check the implementation
    1.90 +
    1.91 +// Map a script into one of four classes: Latin, Cyrillic, Arabic, Other.
    1.92 +int LScript4(ULScript ulscript);  // NOTE(review): the int value used for each class is not given in this header
    1.93 +
    1.94 +//----------------------------------------------------------------------------//
    1.95 +// Functions of Language                                                      //
    1.96 +//----------------------------------------------------------------------------//
    1.97 +
    1.98 +// The languages recognized by CLD2 are numbered almost arbitrarily,
    1.99 +// specified in an enum. Each language has human-readable language name and a
   1.100 +// 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by
   1.101 +// programs that generate declarations in cld2_generated_languages.h).
   1.102 +// Each has a list of up to four scripts in which it is currently recognized.
   1.103 +//
   1.104 +// The declarations for a particular set of recognized languages are
   1.105 +// machine-generated in
   1.106 +//   generated_language.h
   1.107 +//
   1.108 +// The Language enum is intended to match the internal Google Language enum
   1.109 +// in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional
   1.110 +// languages assigned above that. Over time, some languages may be renumbered
   1.111 +// if they are moved into the Language enum.
   1.112 +//
   1.113 +// The Language enum includes the fake language numbers for RTypeNone above.
   1.114 +//
   1.115 +
   1.116 +
   1.117 +// If the input is out of range or otherwise unrecognized, it is treated
   1.118 +// as UNKNOWN_LANGUAGE
   1.119 +//
   1.120 +// LanguageCode
   1.121 +// ------------
   1.122 +// Given the Language, return the language code, e.g. "ko"
   1.123 +// This is determined by
   1.124 +// the following (in order of preference):
   1.125 +// - ISO-639-1 two-letter language code
   1.126 +//   (all except those mentioned below)
   1.127 +// - ISO-639-2 three-letter bibliographic language code
   1.128 +//   (Tibetan, Dhivehi, Cherokee, Syriac)
   1.129 +// - Google-specific language code
   1.130 +//   (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
   1.131 +//   Portuguese-Portugal, Portuguese-Brazil, Limbu)
   1.132 +// - Fake RTypeNone names.
   1.133 +
   1.134 +const char* LanguageName(Language lang);  // human-readable language name
   1.135 +const char* LanguageCode(Language lang);  // language code, e.g. "ko"; chosen by the order of preference listed above
   1.136 +const char* LanguageShortCode(Language lang);  // NOTE(review): not described in this header; presumably a shortened form of LanguageCode, confirm in the .cc
   1.137 +const char* LanguageDeclaredName(Language lang);  // C name of the language (presumably the enum identifier; confirm in the .cc)
   1.138 +
   1.139 +// Returns the n-th of the up to four scripts in which lang is recognized; n is in 0..3.
   1.140 +// Trailing entries are filled with ULScript_Common (which never participates in language recognition).
   1.141 +ULScript LanguageRecognizedScript(Language lang, int n);  // NOTE(review): behavior for n outside 0..3 is not stated here
   1.142 +
   1.143 +// Looks up a language by name. src can be either the full language name or its ISO code,
   1.144 +// or an ISO code embedded in a language-script combination such as "en-Latn-GB".
   1.145 +Language GetLanguageFromName(const char* src);  // NOTE(review): result for an unmatched name is not stated in this header; check the implementation
   1.146 +
   1.147 +// Returns which set of statistically-close languages lang is in. 0 means lang is in no set.
   1.148 +int LanguageCloseSet(Language lang);  // NOTE(review): the numbering of the nonzero sets is not given in this header
   1.149 +
   1.150 +//----------------------------------------------------------------------------//
   1.151 +// Functions of ULScript and Language                                         //
   1.152 +//----------------------------------------------------------------------------//
   1.153 +
   1.154 +// Returns the most common language for the given script.
   1.155 +Language DefaultLanguage(ULScript ulscript);
   1.156 +
   1.157 +// For RTypeMany recognition,
   1.158 +// the CLD2 lookup tables are kept small by encoding a language into one byte.
   1.159 +// To avoid limiting CLD2 to at most 256 languages, a larger range of external
   1.160 +// Language numbers is mapped to a smaller range of per-script numbers. At
   1.161 +// the moment (January 2013) the Latin script has about 90 languages to be
   1.162 +// recognized, while all the other scripts total about 50 more languages. In
   1.163 +// addition, the RTypeNone scripts map to about 100 fake languages.
   1.164 +// So we map all Latin-script languages to one range of 1..255 per-script
   1.165 +// numbers and map all the other RTypeMany languages to an overlapping range
   1.166 +// 1..255 of per-script numbers.
   1.167 +
   1.168 +uint8 PerScriptNumber(ULScript ulscript, Language lang);  // external Language number -> one-byte per-script number
   1.169 +Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number);  // per-script number -> Language (presumably the inverse of PerScriptNumber; confirm)
   1.170 +
   1.171 +// While the speed-sensitive processing deals with per-script language numbers,
   1.172 +// there is a need for low-performance dealing with original language numbers
   1.173 +// and unknown scripts, mostly for processing language hints.
   1.174 +// These routines let one derive a script class from a bare language.
   1.175 +// For languages written in multiple scripts, both of these can return true.
   1.176 +
   1.177 +bool IsLatnLanguage(Language lang);  // presumably true when lang belongs to the Latin-script class; confirm in the .cc
   1.178 +bool IsOthrLanguage(Language lang);  // presumably true when lang belongs to the non-Latin ("other") class; confirm in the .cc
   1.179 +
   1.180 +
   1.181 +//----------------------------------------------------------------------------//
   1.182 +// Other                                                                      //
   1.183 +//----------------------------------------------------------------------------//
   1.184 +
   1.185 +// Utility routine to search alphabetical tables of CharIntPair entries for key.
   1.186 +int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair);  // NOTE(review): whether hi is inclusive, and the not-found return value, are not documented here
   1.187 +
   1.188 +}  // namespace CLD2
   1.189 +
   1.190 +#endif  // I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__

mercurial