browser/components/translation/cld2/public/compact_lang_det.h

Fri, 16 Jan 2015 04:50:19 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Fri, 16 Jan 2015 04:50:19 +0100
branch
TOR_BUG_9701
changeset 13
44a2da4a2ab2
permissions
-rw-r--r--

Replace accessor implementation with direct member state manipulation, by
request https://trac.torproject.org/projects/tor/ticket/9701#comment:32

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // Author: dsites@google.com (Dick Sites)
michael@0 17 //
michael@0 18
michael@0 19 // NOTE:
michael@0 20 // Baybayin (ancient script of the Philippines) is detected as TAGALOG.
michael@0 21 // Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
michael@0 22 // HAITIAN_CREOLE is detected as such.
michael@0 23 // NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
michael@0 24 // PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
michael@0 25 // ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
michael@0 26 // BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
michael@0 27 // MONTENEGRIN is not detected as such, but likely scores as Serbian.
michael@0 28 // CROATIAN is detected in the Latin script
michael@0 29 // SERBIAN is detected in the Cyrillic and Latin scripts
michael@0 30 // Zhuang is detected in the Latin script only.
michael@0 31 //
michael@0 32 // The languages X_PIG_LATIN and X_KLINGON are detected in the
michael@0 33 // extended calls ExtDetectLanguageSummary().
michael@0 34 //
michael@0 35 // UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
michael@0 36 // is high enough. This happens with non-text input such as the bytes of a
michael@0 37 // JPEG, and also with text in languages outside training set.
michael@0 38 //
michael@0 39 // The following languages are to be detected in multiple scripts:
michael@0 40 // AZERBAIJANI (Latin, Cyrillic*, Arabic*)
michael@0 41 // BURMESE (Latin, Myanmar)
michael@0 42 // HAUSA (Latin, Arabic)
michael@0 43 // KASHMIRI (Arabic, Devanagari)
michael@0 44 // KAZAKH (Latin, Cyrillic, Arabic)
michael@0 45 // KURDISH (Latin*, Arabic)
michael@0 46 // KYRGYZ (Cyrillic, Arabic)
michael@0 47 // LIMBU (Devanagari, Limbu)
michael@0 48 // MONGOLIAN (Cyrillic, Mongolian)
michael@0 49 // SANSKRIT (Latin, Devanagari)
michael@0 50 // SINDHI (Arabic, Devanagari)
michael@0 51 // TAGALOG (Latin, Tagalog)
michael@0 52 // TAJIK (Cyrillic, Arabic*)
michael@0 53 // TATAR (Latin, Cyrillic, Arabic)
michael@0 54 // TURKMEN (Latin, Cyrillic, Arabic)
michael@0 55 // UIGHUR (Latin, Cyrillic, Arabic)
michael@0 56 // UZBEK (Latin, Cyrillic, Arabic)
michael@0 57 //
michael@0 58 // * Due to a shortage of training text, AZERBAIJANI is not currently detected
michael@0 59 // in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
michael@0 60 // Arabic script.
michael@0 61 //
michael@0 62
michael@0 63 #ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
michael@0 64 #define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
michael@0 65
michael@0 66 #include <vector>
michael@0 67 #include "../internal/lang_script.h" // For Language
michael@0 68
michael@0 69 namespace CLD2 {
michael@0 70
michael@0 71 // Scan interchange-valid UTF-8 bytes and detect most likely language,
michael@0 72 // or set of languages.
michael@0 73 //
michael@0 74 // Design goals:
michael@0 75 // Skip over big stretches of HTML tags
michael@0 76 // Able to return ranges of different languages
michael@0 77 // Relatively small tables and relatively fast processing
michael@0 78 // Thread safe
michael@0 79 //
michael@0 80 // For HTML documents, tags are skipped, along with <script> ... </script>
michael@0 81 // and <style> ... </style> sequences, and entities are expanded.
michael@0 82 //
michael@0 83 // We distinguish between bytes of the raw input buffer and bytes of non-tag
michael@0 84 // text letters. Since tags can be over 50% of the bytes of an HTML Page,
michael@0 85 // and are nearly all seven-bit ASCII English, we prefer to distinguish
michael@0 86 // language mixture fractions based on just the non-tag text.
michael@0 87 //
michael@0 88 // Inputs: text and text_length
michael@0 89 // Code skips HTML tags and expands HTML entities, unless
michael@0 90 // is_plain_text is true
michael@0 91 // Outputs:
michael@0 92 // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
michael@0 93 // percent3 is an array of the text percentages 0..100 of the top 3 languages
michael@0 94 // text_bytes is the amount of non-tag/letters-only text found
michael@0 95 // is_reliable set true if the returned Language is some amount more
michael@0 96 // probable than the second-best Language. Calculation is a complex function
michael@0 97 // of the length of the text and the different-script runs of text.
michael@0 98 // Return value: the most likely Language for the majority of the input text
michael@0 99 // Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
michael@0 100 // defaults to ENGLISH.
michael@0 101 //
michael@0 102 // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
michael@0 103 // backwards compatibility with a different detector.
michael@0 104 //
michael@0 105 // The third version may return UNKNOWN_LANGUAGE, and also returns extended
michael@0 106 // language codes from lang_script.h
michael@0 107 //
michael@0 108
michael@0 109
michael@0 110 // Instead of individual arguments, pass in hints as an initialized struct
michael@0 111 // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
michael@0 112 //
michael@0 113 // Pass in hints whenever possible; doing so improves detection accuracy. The
michael@0 114 // passed-in hints are all information that is external to the text
michael@0 115 // itself.
michael@0 116 //
michael@0 117 // The content_language_hint is intended to come from an HTTP header
michael@0 118 // Content-Language: field, the tld_hint from the hostname of a URL, the
michael@0 119 // encoding_hint from an encoding detector applied to the input
michael@0 120 // document, and the language_hint from any other context you might have.
michael@0 121 // The lang= tags inside an HTML document will be picked up as hints
michael@0 122 // by code within the compact language detector.
michael@0 123
michael@0 124 typedef struct {
michael@0 125 const char* content_language_hint; // "mi,en" boosts Maori and English
michael@0 126 const char* tld_hint; // "id" boosts Indonesian
michael@0 127 int encoding_hint; // SJS boosts Japanese
michael@0 128 Language language_hint; // ITALIAN boosts it
michael@0 129 } CLDHints;
michael@0 130
michael@0 131 static const int kMaxResultChunkBytes = 65535;
michael@0 132
michael@0 133 // For returning a vector of per-language pieces of the input buffer
michael@0 134 // Unreliable and too-short are mapped to UNKNOWN_LANGUAGE
michael@0 135 typedef struct {
michael@0 136 int offset; // Starting byte offset in original buffer
michael@0 137 uint16 bytes; // Number of bytes in chunk
michael@0 138 uint16 lang1; // Top lang, as full Language. Apply
michael@0 139 // static_cast<Language>() to this short value.
michael@0 140 } ResultChunk;
michael@0 141 typedef std::vector<ResultChunk> ResultChunkVector;
michael@0 142
michael@0 143
michael@0 144 // Scan interchange-valid UTF-8 bytes and detect most likely language
michael@0 145 Language DetectLanguage(
michael@0 146 const char* buffer,
michael@0 147 int buffer_length,
michael@0 148 bool is_plain_text,
michael@0 149 bool* is_reliable);
michael@0 150
michael@0 151 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
michael@0 152 // language3[0] is usually also the return value
michael@0 153 Language DetectLanguageSummary(
michael@0 154 const char* buffer,
michael@0 155 int buffer_length,
michael@0 156 bool is_plain_text,
michael@0 157 Language* language3,
michael@0 158 int* percent3,
michael@0 159 int* text_bytes,
michael@0 160 bool* is_reliable);
michael@0 161
michael@0 162 // Same as above, with hints supplied
michael@0 163 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
michael@0 164 // language3[0] is usually also the return value
michael@0 165 Language DetectLanguageSummary(
michael@0 166 const char* buffer,
michael@0 167 int buffer_length,
michael@0 168 bool is_plain_text,
michael@0 169 const char* tld_hint, // "id" boosts Indonesian
michael@0 170 int encoding_hint, // SJS boosts Japanese
michael@0 171 Language language_hint, // ITALIAN boosts it
michael@0 172 Language* language3,
michael@0 173 int* percent3,
michael@0 174 int* text_bytes,
michael@0 175 bool* is_reliable);
michael@0 176
michael@0 177 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
michael@0 178 // languages.
michael@0 179 //
michael@0 180 // Extended languages are additional interface languages and Unicode
michael@0 181 // single-language scripts, from lang_script.h
michael@0 182 //
michael@0 183 // language3[0] is usually also the return value
michael@0 184 Language ExtDetectLanguageSummary(
michael@0 185 const char* buffer,
michael@0 186 int buffer_length,
michael@0 187 bool is_plain_text,
michael@0 188 Language* language3,
michael@0 189 int* percent3,
michael@0 190 int* text_bytes,
michael@0 191 bool* is_reliable);
michael@0 192
michael@0 193 // Same as above, with hints supplied
michael@0 194 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
michael@0 195 // languages.
michael@0 196 //
michael@0 197 // Extended languages are additional Google interface languages and Unicode
michael@0 198 // single-language scripts, from lang_script.h
michael@0 199 //
michael@0 200 // language3[0] is usually also the return value
michael@0 201 Language ExtDetectLanguageSummary(
michael@0 202 const char* buffer,
michael@0 203 int buffer_length,
michael@0 204 bool is_plain_text,
michael@0 205 const char* tld_hint, // "id" boosts Indonesian
michael@0 206 int encoding_hint, // SJS boosts Japanese
michael@0 207 Language language_hint, // ITALIAN boosts it
michael@0 208 Language* language3,
michael@0 209 int* percent3,
michael@0 210 int* text_bytes,
michael@0 211 bool* is_reliable);
michael@0 212
michael@0 213 // Same as above, and also returns 3 internal language scores as a ratio to
michael@0 214 // normal score for real text in that language. Scores close to 1.0 indicate
michael@0 215 // normal text, while scores far away from 1.0 indicate badly-skewed text or
michael@0 216 // gibberish
michael@0 217 //
michael@0 218 Language ExtDetectLanguageSummary(
michael@0 219 const char* buffer,
michael@0 220 int buffer_length,
michael@0 221 bool is_plain_text,
michael@0 222 const char* tld_hint, // "id" boosts Indonesian
michael@0 223 int encoding_hint, // SJS boosts Japanese
michael@0 224 Language language_hint, // ITALIAN boosts it
michael@0 225 Language* language3,
michael@0 226 int* percent3,
michael@0 227 double* normalized_score3,
michael@0 228 int* text_bytes,
michael@0 229 bool* is_reliable);
michael@0 230
michael@0 231
michael@0 232 // Use this one.
michael@0 233 // Hints are collected into a struct.
michael@0 234 // Flags are passed in (normally zero).
michael@0 235 //
michael@0 236 // Also returns 3 internal language scores as a ratio to
michael@0 237 // normal score for real text in that language. Scores close to 1.0 indicate
michael@0 238 // normal text, while scores far away from 1.0 indicate badly-skewed text or
michael@0 239 // gibberish
michael@0 240 //
michael@0 241 // Returns a vector of chunks in different languages, so that caller may
michael@0 242 // spell-check, translate, or otherwise process different parts of the input
michael@0 243 // buffer in language-dependent ways.
michael@0 244 //
michael@0 245 Language ExtDetectLanguageSummary(
michael@0 246 const char* buffer,
michael@0 247 int buffer_length,
michael@0 248 bool is_plain_text,
michael@0 249 const CLDHints* cld_hints,
michael@0 250 int flags,
michael@0 251 Language* language3,
michael@0 252 int* percent3,
michael@0 253 double* normalized_score3,
michael@0 254 ResultChunkVector* resultchunkvector,
michael@0 255 int* text_bytes,
michael@0 256 bool* is_reliable);
michael@0 257
michael@0 258 // Return version text string
michael@0 259 // String is "code_version - data_build_date"
michael@0 260 const char* DetectLanguageVersion();
michael@0 261
michael@0 262
michael@0 263 // Public use flags, debug output controls
michael@0 264 static const int kCLDFlagScoreAsQuads = 0x0100; // Force Greek, etc. => quads
michael@0 265 static const int kCLDFlagHtml = 0x0200; // Debug HTML => stderr
michael@0 266 static const int kCLDFlagCr = 0x0400; // <cr> per chunk if HTML
michael@0 267 static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr
michael@0 268 static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr
michael@0 269 static const int kCLDFlagEcho = 0x2000; // Echo input => stderr
michael@0 270
michael@0 271
michael@0 272 /***
michael@0 273
michael@0 274 Flag meanings:
michael@0 275 kCLDFlagScoreAsQuads
michael@0 276 Normally, several languages are detected solely by their Unicode script.
michael@0 277 Combined with appropriate lookup tables, this flag forces them instead
michael@0 278 to be detected via quadgrams. This can be a useful refinement when looking
michael@0 279 for meaningful text in these languages, instead of just character sets.
michael@0 280 The default tables do not support this use.
michael@0 281 kCLDFlagHtml
michael@0 282 For each detection call, write an HTML file to stderr, showing the text
michael@0 283 chunks and their detected languages.
michael@0 284 kCLDFlagCr
michael@0 285 In that HTML file, force a new line for each chunk.
michael@0 286 kCLDFlagVerbose
michael@0 287 In that HTML file, show every lookup entry.
michael@0 288 kCLDFlagQuiet
michael@0 289 In that HTML file, suppress most of the output detail.
michael@0 290 kCLDFlagEcho
michael@0 291 Echo every input buffer to stderr.
michael@0 292 ***/
michael@0 293
michael@0 294 // Debug output: Print the resultchunkvector to file f
michael@0 295 void DumpResultChunkVector(FILE* f, const char* src,
michael@0 296 ResultChunkVector* resultchunkvector);
michael@0 297
michael@0 298 #ifdef CLD2_DYNAMIC_MODE
michael@0 299
michael@0 300 // If compiled with dynamic mode, load data from the specified file location.
michael@0 301 // If other data has already been loaded, it is discarded and the data is read
michael@0 302 // in from the specified file location again (even if the file has not changed).
michael@0 303 // WARNING: Before calling this method, language detection will always fail
michael@0 304 // and will always return the unknown language.
michael@0 305 void loadData(const char* fileName);
michael@0 306
michael@0 307 // If compiled with dynamic mode, unload the previously-loaded data.
michael@0 308 // WARNING: After calling this method, language detection will no longer work
michael@0 309 // and will always return the unknown language.
michael@0 310 void unloadData();
michael@0 311
michael@0 312 // Returns true if and only if data has been loaded via a call to loadData(...)
michael@0 313 // and has not been subsequently unloaded via a call to unloadData().
michael@0 314 bool isDataLoaded();
michael@0 315
michael@0 316 #endif // #ifdef CLD2_DYNAMIC_MODE
michael@0 317
michael@0 318 }; // End namespace CLD2
michael@0 319
michael@0 320 #endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_

mercurial