browser/components/translation/cld2/public/compact_lang_det.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/public/compact_lang_det.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,320 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// Author: dsites@google.com (Dick Sites)
    1.20 +//
    1.21 +
    1.22 +// NOTE:
    1.23 +// Baybayin (ancient script of the Philippines) is detected as TAGALOG.
    1.24 +// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
    1.25 +// HAITIAN_CREOLE is detected as such.
    1.26 +// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
    1.27 +// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
    1.28 +// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
    1.29 +// BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
    1.30 +// MONTENEGRIN is not detected as such, but likely scores as Serbian.
    1.31 +// CROATIAN is detected in the Latin script
    1.32 +// SERBIAN is detected in the Cyrillic and Latin scripts
    1.33 +// Zhuang is detected in the Latin script only.
    1.34 +//
    1.35 +// The languages X_PIG_LATIN and X_KLINGON are detected in the
    1.36 +//  extended calls ExtDetectLanguageSummary().
    1.37 +//
    1.38 +// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
    1.39 +//  is high enough. This happens with non-text input such as the bytes of a
    1.40 +//  JPEG, and also with text in languages outside the training set.
    1.41 +//
    1.42 +// The following languages are to be detected in multiple scripts:
    1.43 +//  AZERBAIJANI (Latin, Cyrillic*, Arabic*)
    1.44 +//  BURMESE (Latin, Myanmar)
    1.45 +//  HAUSA (Latin, Arabic)
    1.46 +//  KASHMIRI (Arabic, Devanagari)
    1.47 +//  KAZAKH (Latin, Cyrillic, Arabic)
    1.48 +//  KURDISH (Latin*, Arabic)
    1.49 +//  KYRGYZ (Cyrillic, Arabic)
    1.50 +//  LIMBU (Devanagari, Limbu)
    1.51 +//  MONGOLIAN (Cyrillic, Mongolian)
    1.52 +//  SANSKRIT (Latin, Devanagari)
    1.53 +//  SINDHI (Arabic, Devanagari)
    1.54 +//  TAGALOG (Latin, Tagalog)
    1.55 +//  TAJIK (Cyrillic, Arabic*)
    1.56 +//  TATAR (Latin, Cyrillic, Arabic)
    1.57 +//  TURKMEN (Latin, Cyrillic, Arabic)
    1.58 +//  UIGHUR (Latin, Cyrillic, Arabic)
    1.59 +//  UZBEK (Latin, Cyrillic, Arabic)
    1.60 +//
    1.61 +// * Due to a shortage of training text, AZERBAIJANI is not currently detected
    1.62 +//   in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
    1.63 +//   Arabic script.
    1.64 +//
    1.65 +
    1.66 +#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
    1.67 +#define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
    1.68 +
    1.69 +#include <vector>
    1.70 +#include "../internal/lang_script.h"  // For Language
    1.71 +
    1.72 +namespace CLD2 {
    1.73 +
    1.74 +  // Scan interchange-valid UTF-8 bytes and detect most likely language,
    1.75 +  // or set of languages.
    1.76 +  //
    1.77 +  // Design goals:
    1.78 +  //   Skip over big stretches of HTML tags
    1.79 +  //   Able to return ranges of different languages
    1.80 +  //   Relatively small tables and relatively fast processing
    1.81 +  //   Thread safe
    1.82 +  //
    1.83 +  // For HTML documents, tags are skipped, along with <script> ... </script>
    1.84 +  // and <style> ... </style> sequences, and entities are expanded.
    1.85 +  //
    1.86 +  // We distinguish between bytes of the raw input buffer and bytes of non-tag
    1.87 +  // text letters. Since tags can be over 50% of the bytes of an HTML Page,
    1.88 +  // and are nearly all seven-bit ASCII English, we prefer to distinguish
    1.89 +  // language mixture fractions based on just the non-tag text.
    1.90 +  //
    1.91 +  // Inputs: text and text_length
    1.92 +  //  Code skips HTML tags and expands HTML entities, unless
    1.93 +  //  is_plain_text is true
    1.94 +  // Outputs:
    1.95 +  //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
    1.96 +  //  percent3 is an array of the text percentages 0..100 of the top 3 languages
    1.97 +  //  text_bytes is the amount of non-tag/letters-only text found
    1.98 +  //  is_reliable set true if the returned Language is some amount more
    1.99 +  //   probable than the second-best Language. Calculation is a complex function
   1.100 +  //   of the length of the text and the different-script runs of text.
   1.101 +  // Return value: the most likely Language for the majority of the input text
   1.102 +  //  Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
   1.103 +  //  defaults to ENGLISH.
   1.104 +  //
   1.105 +  // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
   1.106 +  // backwards compatibility with a different detector.
   1.107 +  //
   1.108 +  // The third version may return UNKNOWN_LANGUAGE, and also returns extended
   1.109 +  // language codes from lang_script.h
   1.110 +  //
   1.111 +
   1.112 +
   1.113 +  // Instead of individual arguments, pass in hints as an initialized struct.
   1.114 +  // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
   1.115 +  //
   1.116 +  // Pass in hints whenever possible; doing so improves detection accuracy.
   1.117 +  // Hints are information that is external to the text
   1.118 +  // itself.
   1.119 +  //
   1.120 +  // The content_language_hint is intended to come from an HTTP header
   1.121 +  // Content-Language: field, the tld_hint from the hostname of a URL, the
   1.122 +  // encoding_hint from an encoding detector applied to the input
   1.123 +  // document, and the language_hint from any other context you might have.
   1.124 +  // The lang= tags inside an HTML document will be picked up as hints
   1.125 +  // by code within the compact language detector.
   1.126 +
   1.127 +  typedef struct {
   1.128 +    const char* content_language_hint;      // "mi,en" boosts Maori and English
   1.129 +    const char* tld_hint;                   // "id" boosts Indonesian
   1.130 +    int encoding_hint;                      // SJS boosts Japanese
   1.131 +    Language language_hint;                 // ITALIAN boosts it
   1.132 +  } CLDHints;
   1.133 +
   1.134 +  static const int kMaxResultChunkBytes = 65535;  // NOTE(review): presumably caps ResultChunk.bytes (uint16) -- confirm
   1.135 +
   1.136 +  // One per-language piece of the input buffer; returned in a vector.
   1.137 +  // Unreliable and too-short pieces are mapped to UNKNOWN_LANGUAGE.
   1.138 +  typedef struct {
   1.139 +    int offset;                 // Starting byte offset in original buffer
   1.140 +    uint16 bytes;               // Number of bytes in chunk
   1.141 +    uint16 lang1;               // Top lang, as full Language. Apply
   1.142 +                                // static_cast<Language>() to this short value.
   1.143 +  } ResultChunk;
   1.144 +  typedef std::vector<ResultChunk> ResultChunkVector;
   1.145 +
   1.146 +
   1.147 +  // Scan interchange-valid UTF-8 bytes and return the most likely language.
   1.148 +  Language DetectLanguage(
   1.149 +                          const char* buffer,         // Input text
   1.150 +                          int buffer_length,          // Input text length
   1.151 +                          bool is_plain_text,         // True: skip HTML handling
   1.152 +                          bool* is_reliable);         // Out: true if reliable
   1.153 +
   1.154 +  // Scan interchange-valid UTF-8 bytes and detect the list of top 3 languages.
   1.155 +  // language3[0] is usually also the return value.
   1.156 +  Language DetectLanguageSummary(
   1.157 +                          const char* buffer,         // Input text
   1.158 +                          int buffer_length,          // Input text length
   1.159 +                          bool is_plain_text,         // True: skip HTML handling
   1.160 +                          Language* language3,        // Out: top 3 languages
   1.161 +                          int* percent3,              // Out: their percents, 0..100
   1.162 +                          int* text_bytes,            // Out: non-tag text bytes
   1.163 +                          bool* is_reliable);         // Out: true if reliable
   1.164 +
   1.165 +  // Same as above, with hints supplied.
   1.166 +  // Scan interchange-valid UTF-8 bytes and detect the list of top 3 languages.
   1.167 +  // language3[0] is usually also the return value.
   1.168 +  Language DetectLanguageSummary(
   1.169 +                          const char* buffer,         // Input text
   1.170 +                          int buffer_length,          // Input text length
   1.171 +                          bool is_plain_text,         // True: skip HTML handling
   1.172 +                          const char* tld_hint,       // "id" boosts Indonesian
   1.173 +                          int encoding_hint,          // SJS boosts Japanese
   1.174 +                          Language language_hint,     // ITALIAN boosts it
   1.175 +                          Language* language3,        // Out: top 3 languages
   1.176 +                          int* percent3,              // Out: their percents, 0..100
   1.177 +                          int* text_bytes,            // Out: non-tag text bytes
   1.178 +                          bool* is_reliable);         // Out: true if reliable
   1.179 +
   1.180 +  // Scan interchange-valid UTF-8 bytes and detect the list of top 3 extended
   1.181 +  // languages.
   1.182 +  //
   1.183 +  // Extended languages are additional interface languages and Unicode
   1.184 +  // single-language scripts, from lang_script.h.
   1.185 +  //
   1.186 +  // language3[0] is usually also the return value.
   1.187 +  Language ExtDetectLanguageSummary(
   1.188 +                          const char* buffer,         // Input text
   1.189 +                          int buffer_length,          // Input text length
   1.190 +                          bool is_plain_text,         // True: skip HTML handling
   1.191 +                          Language* language3,        // Out: top 3 languages
   1.192 +                          int* percent3,              // Out: their percents, 0..100
   1.193 +                          int* text_bytes,            // Out: non-tag text bytes
   1.194 +                          bool* is_reliable);         // Out: true if reliable
   1.195 +
   1.196 +  // Same as above, with hints supplied.
   1.197 +  // Scan interchange-valid UTF-8 bytes and detect the list of top 3 extended
   1.198 +  // languages.
   1.199 +  //
   1.200 +  // Extended languages are additional Google interface languages and Unicode
   1.201 +  // single-language scripts, from lang_script.h.
   1.202 +  //
   1.203 +  // language3[0] is usually also the return value.
   1.204 +  Language ExtDetectLanguageSummary(
   1.205 +                          const char* buffer,         // Input text
   1.206 +                          int buffer_length,          // Input text length
   1.207 +                          bool is_plain_text,         // True: skip HTML handling
   1.208 +                          const char* tld_hint,       // "id" boosts Indonesian
   1.209 +                          int encoding_hint,          // SJS boosts Japanese
   1.210 +                          Language language_hint,     // ITALIAN boosts it
   1.211 +                          Language* language3,        // Out: top 3 languages
   1.212 +                          int* percent3,              // Out: their percents, 0..100
   1.213 +                          int* text_bytes,            // Out: non-tag text bytes
   1.214 +                          bool* is_reliable);         // Out: true if reliable
   1.215 +
   1.216 +  // Same as above, and also returns 3 internal language scores as a ratio to
   1.217 +  // normal score for real text in that language. Scores close to 1.0 indicate
   1.218 +  // normal text, while scores far away from 1.0 indicate badly-skewed text or
   1.219 +  // gibberish.
   1.220 +  //
   1.221 +  Language ExtDetectLanguageSummary(
   1.222 +                          const char* buffer,         // Input text
   1.223 +                          int buffer_length,          // Input text length
   1.224 +                          bool is_plain_text,         // True: skip HTML handling
   1.225 +                          const char* tld_hint,       // "id" boosts Indonesian
   1.226 +                          int encoding_hint,          // SJS boosts Japanese
   1.227 +                          Language language_hint,     // ITALIAN boosts it
   1.228 +                          Language* language3,        // Out: top 3 languages
   1.229 +                          int* percent3,              // Out: their percents, 0..100
   1.230 +                          double* normalized_score3,  // Out: 3 score ratios
   1.231 +                          int* text_bytes,            // Out: non-tag text bytes
   1.232 +                          bool* is_reliable);         // Out: true if reliable
   1.233 +
   1.234 +
   1.235 +  // Use this one.
   1.236 +  // Hints are collected into a struct.
   1.237 +  // Flags are passed in (normally zero).
   1.238 +  //
   1.239 +  // Also returns 3 internal language scores as a ratio to
   1.240 +  // normal score for real text in that language. Scores close to 1.0 indicate
   1.241 +  // normal text, while scores far away from 1.0 indicate badly-skewed text or
   1.242 +  // gibberish.
   1.243 +  //
   1.244 +  // Returns a vector of chunks in different languages, so that caller may
   1.245 +  // spell-check, translate, or otherwise process different parts of the input
   1.246 +  // buffer in language-dependent ways.
   1.247 +  //
   1.248 +  Language ExtDetectLanguageSummary(
   1.249 +                          const char* buffer,         // Input text
   1.250 +                          int buffer_length,          // Input text length
   1.251 +                          bool is_plain_text,         // True: skip HTML handling
   1.252 +                          const CLDHints* cld_hints,  // Hints; see CLDHints
   1.253 +                          int flags,                  // kCLDFlag* bits; normally 0
   1.254 +                          Language* language3,        // Out: top 3 languages
   1.255 +                          int* percent3,              // Out: their percents, 0..100
   1.256 +                          double* normalized_score3,  // Out: 3 score ratios
   1.257 +                          ResultChunkVector* resultchunkvector,  // Out: chunks
   1.258 +                          int* text_bytes,            // Out: non-tag text bytes
   1.259 +                          bool* is_reliable);         // Out: true if reliable
   1.260 +
   1.261 +  // Return the version text string.
   1.262 +  // The string is "code_version - data_build_date".
   1.263 +  const char* DetectLanguageVersion();
   1.264 +
   1.265 +
   1.266 +  // Public use flags, debug output controls
   1.267 +  static const int kCLDFlagScoreAsQuads = 0x0100;  // Force Greek, etc. => quads
   1.268 +  static const int kCLDFlagHtml =         0x0200;  // Debug HTML => stderr
   1.269 +  static const int kCLDFlagCr =           0x0400;  // <cr> per chunk if HTML
   1.270 +  static const int kCLDFlagVerbose =      0x0800;  // More debug HTML => stderr
   1.271 +  static const int kCLDFlagQuiet =        0x1000;  // Less debug HTML => stderr
   1.272 +  static const int kCLDFlagEcho =         0x2000;  // Echo input => stderr
   1.273 +
   1.274 +
   1.275 +/***
   1.276 +
   1.277 +Flag meanings:
   1.278 + kCLDFlagScoreAsQuads
   1.279 +   Normally, several languages are detected solely by their Unicode script.
   1.280 +   Combined with appropriate lookup tables, this flag forces them instead
   1.281 +   to be detected via quadgrams. This can be a useful refinement when looking
   1.282 +   for meaningful text in these languages, instead of just character sets.
   1.283 +   The default tables do not support this use.
   1.284 + kCLDFlagHtml
   1.285 +   For each detection call, write an HTML file to stderr, showing the text
   1.286 +   chunks and their detected languages.
   1.287 + kCLDFlagCr
   1.288 +   In that HTML file, force a new line for each chunk.
   1.289 + kCLDFlagVerbose
   1.290 +   In that HTML file, show every lookup entry.
   1.291 + kCLDFlagQuiet
   1.292 +   In that HTML file, suppress most of the output detail.
   1.293 + kCLDFlagEcho
   1.294 +   Echo every input buffer to stderr.
   1.295 +***/
   1.296 +
   1.297 +// Debug output: print the resultchunkvector to file f (FILE is from <stdio.h>)
   1.298 +void DumpResultChunkVector(FILE* f, const char* src,
   1.299 +                           ResultChunkVector* resultchunkvector);
   1.300 +
   1.301 +#ifdef CLD2_DYNAMIC_MODE
   1.302 +
   1.303 +// If compiled with dynamic mode, load data from the specified file location.
   1.304 +// If other data has already been loaded, it is discarded and the data is read
   1.305 +// in from the specified file location again (even if the file has not changed).
   1.306 +// WARNING: Before calling this method, language detection will always fail
   1.307 +// and will always return the unknown language.
   1.308 +void loadData(const char* fileName);
   1.309 +
   1.310 +// If compiled with dynamic mode, unload the previously-loaded data.
   1.311 +// WARNING: After calling this method, language detection will no longer work
   1.312 +// and will always return the unknown language.
   1.313 +void unloadData();
   1.314 +
   1.315 +// Returns true if and only if data has been loaded via a call to loadData(...)
   1.316 +// and has not been subsequently unloaded via a call to unloadData().
   1.317 +bool isDataLoaded();
   1.318 +
   1.319 +#endif // #ifdef CLD2_DYNAMIC_MODE
   1.320 +
   1.321 +};      // End namespace CLD2
   1.322 +
   1.323 +#endif  // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_

mercurial