1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/public/compact_lang_det.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,320 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// Author: dsites@google.com (Dick Sites) 1.20 +// 1.21 + 1.22 +// NOTE: 1.23 +// Baybayin (ancient script of the Philippines) is detected as TAGALOG. 1.24 +// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE. 1.25 +// HAITIAN_CREOLE is detected as such. 1.26 +// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly) 1.27 +// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE. 1.28 +// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN. 1.29 +// BOSNIAN is not detected as such, but likely scores as Croatian or Serbian. 1.30 +// MONTENEGRIN is not detected as such, but likely scores as Serbian. 1.31 +// CROATIAN is detected in the Latin script 1.32 +// SERBIAN is detected in the Cyrillic and Latin scripts 1.33 +// Zhuang is detected in the Latin script only. 1.34 +// 1.35 +// The languages X_PIG_LATIN and X_KLINGON are detected in the 1.36 +// extended calls ExtDetectLanguageSummary(). 1.37 +// 1.38 +// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure 1.39 +// is high enough.
This happens with non-text input such as the bytes of a 1.40 +// JPEG, and also with text in languages outside training set. 1.41 +// 1.42 +// The following languages are to be detected in multiple scripts: 1.43 +// AZERBAIJANI (Latin, Cyrillic*, Arabic*) 1.44 +// BURMESE (Latin, Myanmar) 1.45 +// HAUSA (Latin, Arabic) 1.46 +// KASHMIRI (Arabic, Devanagari) 1.47 +// KAZAKH (Latin, Cyrillic, Arabic) 1.48 +// KURDISH (Latin*, Arabic) 1.49 +// KYRGYZ (Cyrillic, Arabic) 1.50 +// LIMBU (Devanagari, Limbu) 1.51 +// MONGOLIAN (Cyrillic, Mongolian) 1.52 +// SANSKRIT (Latin, Devanagari) 1.53 +// SINDHI (Arabic, Devanagari) 1.54 +// TAGALOG (Latin, Tagalog) 1.55 +// TAJIK (Cyrillic, Arabic*) 1.56 +// TATAR (Latin, Cyrillic, Arabic) 1.57 +// TURKMEN (Latin, Cyrillic, Arabic) 1.58 +// UIGHUR (Latin, Cyrillic, Arabic) 1.59 +// UZBEK (Latin, Cyrillic, Arabic) 1.60 +// 1.61 +// * Due to a shortage of training text, AZERBAIJANI is not currently detected 1.62 +// in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in 1.63 +// Arabic script. 1.64 +// 1.65 + 1.66 +#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ 1.67 +#define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ 1.68 + 1.69 +#include <vector> 1.70 +#include "../internal/lang_script.h" // For Language 1.71 + 1.72 +namespace CLD2 { 1.73 + 1.74 + // Scan interchange-valid UTF-8 bytes and detect most likely language, 1.75 + // or set of languages. 1.76 + // 1.77 + // Design goals: 1.78 + // Skip over big stretches of HTML tags 1.79 + // Able to return ranges of different languages 1.80 + // Relatively small tables and relatively fast processing 1.81 + // Thread safe 1.82 + // 1.83 + // For HTML documents, tags are skipped, along with <script> ... </script> 1.84 + // and <style> ... </style> sequences, and entities are expanded. 1.85 + // 1.86 + // We distinguish between bytes of the raw input buffer and bytes of non-tag 1.87 + // text letters. 
Since tags can be over 50% of the bytes of an HTML Page, 1.88 + // and are nearly all seven-bit ASCII English, we prefer to distinguish 1.89 + // language mixture fractions based on just the non-tag text. 1.90 + // 1.91 + // Inputs: buffer and buffer_length 1.92 + // Code skips HTML tags and expands HTML entities, unless 1.93 + // is_plain_text is true 1.94 + // Outputs: 1.95 + // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE 1.96 + // percent3 is an array of the text percentages 0..100 of the top 3 languages 1.97 + // text_bytes is the amount of non-tag/letters-only text found 1.98 + // is_reliable set true if the returned Language is some amount more 1.99 + // probable than the second-best Language. Calculation is a complex function 1.100 + // of the length of the text and the different-script runs of text. 1.101 + // Return value: the most likely Language for the majority of the input text 1.102 + // Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text 1.103 + // defaults to ENGLISH. 1.104 + // 1.105 + // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for 1.106 + // backwards compatibility with a different detector. 1.107 + // 1.108 + // The third version may return UNKNOWN_LANGUAGE, and also returns extended 1.109 + // language codes from lang_script.h 1.110 + // 1.111 + 1.112 + 1.113 + // Instead of individual arguments, pass in hints as an initialized struct 1.114 + // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known. 1.115 + // 1.116 + // Pass in hints whenever possible; doing so improves detection accuracy. The 1.117 + // set of passed-in hints is all information that is external to the text 1.118 + // itself.
1.119 + // 1.120 + // The content_language_hint is intended to come from an HTTP header 1.121 + // Content-Language: field, the tld_hint from the hostname of a URL, the 1.122 + // encoding_hint from an encoding detector applied to the input 1.123 + // document, and the language_hint from any other context you might have. 1.124 + // The lang= tags inside an HTML document will be picked up as hints 1.125 + // by code within the compact language detector. 1.126 + 1.127 + typedef struct { 1.128 + const char* content_language_hint; // "mi,en" boosts Maori and English 1.129 + const char* tld_hint; // "id" boosts Indonesian 1.130 + int encoding_hint; // SJS boosts Japanese 1.131 + Language language_hint; // ITALIAN boosts it 1.132 + } CLDHints; 1.133 + 1.134 + static const int kMaxResultChunkBytes = 65535; // NOTE(review): presumably the per-chunk byte cap (65535 is the largest value ResultChunk's uint16 bytes field can hold) -- confirm 1.135 + 1.136 + // For returning a vector of per-language pieces of the input buffer 1.137 + // Unreliable and too-short chunks are mapped to UNKNOWN_LANGUAGE 1.138 + typedef struct { 1.139 + int offset; // Starting byte offset in original buffer 1.140 + uint16 bytes; // Number of bytes in chunk 1.141 + uint16 lang1; // Top lang, as full Language. Apply 1.142 + // static_cast<Language>() to this short value. 1.143 + } ResultChunk; 1.144 + typedef std::vector<ResultChunk> ResultChunkVector; 1.145 + 1.146 + 1.147 + // Scan interchange-valid UTF-8 bytes and detect most likely language 1.148 + Language DetectLanguage( 1.149 + const char* buffer, 1.150 + int buffer_length, 1.151 + bool is_plain_text, 1.152 + bool* is_reliable); 1.153 + 1.154 + // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
1.155 + // language3[0] is usually also the return value 1.156 + Language DetectLanguageSummary( 1.157 + const char* buffer, 1.158 + int buffer_length, 1.159 + bool is_plain_text, 1.160 + Language* language3, // out: array of the top 3 languages or UNKNOWN_LANGUAGE 1.161 + int* percent3, // out: array of the text percentages 0..100 of the top 3 languages 1.162 + int* text_bytes, // out: amount of non-tag/letters-only text found 1.163 + bool* is_reliable); // out: true if the result is some amount more probable than the second-best 1.164 + 1.165 + // Same as above, with hints supplied 1.166 + // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. 1.167 + // language3[0] is usually also the return value 1.168 + Language DetectLanguageSummary( 1.169 + const char* buffer, 1.170 + int buffer_length, 1.171 + bool is_plain_text, 1.172 + const char* tld_hint, // "id" boosts Indonesian 1.173 + int encoding_hint, // SJS boosts Japanese 1.174 + Language language_hint, // ITALIAN boosts it 1.175 + Language* language3, 1.176 + int* percent3, 1.177 + int* text_bytes, 1.178 + bool* is_reliable); 1.179 + 1.180 + // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended 1.181 + // languages. 1.182 + // 1.183 + // Extended languages are additional interface languages and Unicode 1.184 + // single-language scripts, from lang_script.h 1.185 + // 1.186 + // language3[0] is usually also the return value 1.187 + Language ExtDetectLanguageSummary( 1.188 + const char* buffer, 1.189 + int buffer_length, 1.190 + bool is_plain_text, 1.191 + Language* language3, 1.192 + int* percent3, 1.193 + int* text_bytes, 1.194 + bool* is_reliable); 1.195 + 1.196 + // Same as above, with hints supplied 1.197 + // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended 1.198 + // languages.
1.199 + // 1.200 + // Extended languages are additional Google interface languages and Unicode 1.201 + // single-language scripts, from lang_script.h 1.202 + // 1.203 + // language3[0] is usually also the return value 1.204 + Language ExtDetectLanguageSummary( 1.205 + const char* buffer, 1.206 + int buffer_length, 1.207 + bool is_plain_text, 1.208 + const char* tld_hint, // "id" boosts Indonesian 1.209 + int encoding_hint, // SJS boosts Japanese 1.210 + Language language_hint, // ITALIAN boosts it 1.211 + Language* language3, 1.212 + int* percent3, 1.213 + int* text_bytes, 1.214 + bool* is_reliable); 1.215 + 1.216 + // Same as above, and also returns 3 internal language scores as a ratio to 1.217 + // normal score for real text in that language. Scores close to 1.0 indicate 1.218 + // normal text, while scores far away from 1.0 indicate badly-skewed text or 1.219 + // gibberish 1.220 + // 1.221 + Language ExtDetectLanguageSummary( 1.222 + const char* buffer, 1.223 + int buffer_length, 1.224 + bool is_plain_text, 1.225 + const char* tld_hint, // "id" boosts Indonesian 1.226 + int encoding_hint, // SJS boosts Japanese 1.227 + Language language_hint, // ITALIAN boosts it 1.228 + Language* language3, 1.229 + int* percent3, 1.230 + double* normalized_score3, 1.231 + int* text_bytes, 1.232 + bool* is_reliable); 1.233 + 1.234 + 1.235 + // Use this one. 1.236 + // Hints are collected into a struct. 1.237 + // Flags are passed in (normally zero). 1.238 + // 1.239 + // Also returns 3 internal language scores as a ratio to 1.240 + // normal score for real text in that language. Scores close to 1.0 indicate 1.241 + // normal text, while scores far away from 1.0 indicate badly-skewed text or 1.242 + // gibberish 1.243 + // 1.244 + // Returns a vector of chunks in different languages, so that caller may 1.245 + // spell-check, translate, or otherwise process different parts of the input 1.246 + // buffer in language-dependent ways.
1.247 + // 1.248 + Language ExtDetectLanguageSummary( 1.249 + const char* buffer, 1.250 + int buffer_length, 1.251 + bool is_plain_text, 1.252 + const CLDHints* cld_hints, // hints collected into a CLDHints struct 1.253 + int flags, // kCLDFlag* values declared in this header; normally zero 1.254 + Language* language3, 1.255 + int* percent3, 1.256 + double* normalized_score3, // out: 3 language scores, each as a ratio to the normal score 1.257 + ResultChunkVector* resultchunkvector, // out: chunks of the input buffer in different languages 1.258 + int* text_bytes, 1.259 + bool* is_reliable); 1.260 + 1.261 + // Return version text string 1.262 + // String is "code_version - data_build_date" 1.263 + const char* DetectLanguageVersion(); 1.264 + 1.265 + 1.266 + // Public use flags, debug output controls 1.267 + static const int kCLDFlagScoreAsQuads = 0x0100; // Force Greek, etc. => quads 1.268 + static const int kCLDFlagHtml = 0x0200; // Debug HTML => stderr 1.269 + static const int kCLDFlagCr = 0x0400; // <cr> per chunk if HTML 1.270 + static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr 1.271 + static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr 1.272 + static const int kCLDFlagEcho = 0x2000; // Echo input => stderr 1.273 + 1.274 + 1.275 +/*** 1.276 + 1.277 +Flag meanings: 1.278 + kCLDFlagScoreAsQuads 1.279 + Normally, several languages are detected solely by their Unicode script. 1.280 + Combined with appropriate lookup tables, this flag forces them instead 1.281 + to be detected via quadgrams. This can be a useful refinement when looking 1.282 + for meaningful text in these languages, instead of just character sets. 1.283 + The default tables do not support this use. 1.284 + kCLDFlagHtml 1.285 + For each detection call, write an HTML file to stderr, showing the text 1.286 + chunks and their detected languages. 1.287 + kCLDFlagCr 1.288 + In that HTML file, force a new line for each chunk. 1.289 + kCLDFlagVerbose 1.290 + In that HTML file, show every lookup entry. 1.291 + kCLDFlagQuiet 1.292 + In that HTML file, suppress most of the output detail. 1.293 + kCLDFlagEcho 1.294 + Echo every input buffer to stderr.
1.295 +***/ 1.296 + 1.297 +// Debug output: Print the resultchunkvector to file f 1.298 +void DumpResultChunkVector(FILE* f, const char* src, 1.299 + ResultChunkVector* resultchunkvector); 1.300 + 1.301 +#ifdef CLD2_DYNAMIC_MODE 1.302 + 1.303 +// If compiled with dynamic mode, load data from the specified file location. 1.304 +// If other data has already been loaded, it is discarded and the data is read 1.305 +// in from the specified file location again (even if the file has not changed). 1.306 +// WARNING: Before calling this method, language detection will always fail 1.307 +// and will always return the unknown language. 1.308 +void loadData(const char* fileName); 1.309 + 1.310 +// If compiled with dynamic mode, unload the previously-loaded data. 1.311 +// WARNING: After calling this method, language detection will no longer work 1.312 +// and will always return the unknown language. 1.313 +void unloadData(); 1.314 + 1.315 +// Returns true if and only if data has been loaded via a call to loadData(...) 1.316 +// and has not been subsequently unloaded via a call to unloadData(). 1.317 +bool isDataLoaded(); 1.318 + 1.319 +#endif // #ifdef CLD2_DYNAMIC_MODE 1.320 + 1.321 +}; // End namespace CLD2 1.322 + 1.323 +#endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_