browser/components/translation/cld2/public/compact_lang_det.h

Thu, 15 Jan 2015 15:55:04 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:55:04 +0100
branch
TOR_BUG_9701
changeset 9
a63d609f5ebe
permissions
-rw-r--r--

Back out 97036ab72558 which inappropriately compared turds to third parties.

     1 // Copyright 2013 Google Inc. All Rights Reserved.
     2 //
     3 // Licensed under the Apache License, Version 2.0 (the "License");
     4 // you may not use this file except in compliance with the License.
     5 // You may obtain a copy of the License at
     6 //
     7 //     http://www.apache.org/licenses/LICENSE-2.0
     8 //
     9 // Unless required by applicable law or agreed to in writing, software
    10 // distributed under the License is distributed on an "AS IS" BASIS,
    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12 // See the License for the specific language governing permissions and
    13 // limitations under the License.
    15 //
    16 // Author: dsites@google.com (Dick Sites)
    17 //
    19 // NOTE:
    20 // Baybayin (ancient script of the Philippines) is detected as TAGALOG.
    21 // Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
    22 // HAITIAN_CREOLE is detected as such.
    23 // NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
    24 // PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
    25 // ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
    26 // BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
    27 // MONTENEGRIN is not detected as such, but likely scores as Serbian.
    28 // CROATIAN is detected in the Latin script
    29 // SERBIAN is detected in the Cyrillic and Latin scripts
    30 // Zhuang is detected in the Latin script only.
    31 //
    32 // The languages X_PIG_LATIN and X_KLINGON are detected in the
    33 //  extended calls ExtDetectLanguageSummary().
    34 //
    35 // UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
    36 //  is high enough. This happens with non-text input such as the bytes of a
    37 //  JPEG, and also with text in languages outside the training set.
    38 //
    39 // The following languages are to be detected in multiple scripts:
    40 //  AZERBAIJANI (Latin, Cyrillic*, Arabic*)
    41 //  BURMESE (Latin, Myanmar)
    42 //  HAUSA (Latin, Arabic)
    43 //  KASHMIRI (Arabic, Devanagari)
    44 //  KAZAKH (Latin, Cyrillic, Arabic)
    45 //  KURDISH (Latin*, Arabic)
    46 //  KYRGYZ (Cyrillic, Arabic)
    47 //  LIMBU (Devanagari, Limbu)
    48 //  MONGOLIAN (Cyrillic, Mongolian)
    49 //  SANSKRIT (Latin, Devanagari)
    50 //  SINDHI (Arabic, Devanagari)
    51 //  TAGALOG (Latin, Tagalog)
    52 //  TAJIK (Cyrillic, Arabic*)
    53 //  TATAR (Latin, Cyrillic, Arabic)
    54 //  TURKMEN (Latin, Cyrillic, Arabic)
    55 //  UIGHUR (Latin, Cyrillic, Arabic)
    56 //  UZBEK (Latin, Cyrillic, Arabic)
    57 //
    58 // * Due to a shortage of training text, AZERBAIJANI is not currently detected
    59 //   in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
    60 //   Arabic script.
    61 //
    63 #ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
    64 #define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
    66 #include <vector>
    67 #include "../internal/lang_script.h"  // For Language
    69 namespace CLD2 {
    71   // Scan interchange-valid UTF-8 bytes and detect most likely language,
    72   // or set of languages.
    73   //
    74   // Design goals:
    75   //   Skip over big stretches of HTML tags
    76   //   Able to return ranges of different languages
    77   //   Relatively small tables and relatively fast processing
    78   //   Thread safe
    79   //
    80   // For HTML documents, tags are skipped, along with <script> ... </script>
    81   // and <style> ... </style> sequences, and entities are expanded.
    82   //
    83   // We distinguish between bytes of the raw input buffer and bytes of non-tag
    84   // text letters. Since tags can be over 50% of the bytes of an HTML Page,
    85   // and are nearly all seven-bit ASCII English, we prefer to distinguish
    86   // language mixture fractions based on just the non-tag text.
    87   //
    88   // Inputs: text and text_length
    89   //  Code skips HTML tags and expands HTML entities, unless
    90   //  is_plain_text is true
    91   // Outputs:
    92   //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
    93   //  percent3 is an array of the text percentages 0..100 of the top 3 languages
    94   //  text_bytes is the amount of non-tag/letters-only text found
    95   //  is_reliable set true if the returned Language is some amount more
    96   //   probable than the second-best Language. Calculation is a complex function
    97   //   of the length of the text and the different-script runs of text.
    98   // Return value: the most likely Language for the majority of the input text
    99   //  Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
   100   //  defaults to ENGLISH.
   101   //
   102   // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
   103   // backwards compatibility with a different detector.
   104   //
   105   // The third version may return UNKNOWN_LANGUAGE, and also returns extended
   106   // language codes from lang_script.h
   107   //
   110   // Instead of individual arguments, pass in hints as an initialized struct
   111   // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
   112   //
   113   // Pass in hints whenever possible; doing so improves detection accuracy. The
   114   // set of passed-in hints are all information that is external to the text
   115   // itself.
   116   //
   117   // The content_language_hint is intended to come from an HTTP header
   118   // Content-Language: field, the tld_hint from the hostname of a URL, the
   119   // encoding_hint from an encoding detector applied to the input
   120   // document, and the language_hint from any other context you might have.
   121   // The lang= tags inside an HTML document will be picked up as hints
   122   // by code within the compact language detector.
   124   typedef struct {
   125     const char* content_language_hint;      // "mi,en" boosts Maori and English
   126     const char* tld_hint;                   // "id" boosts Indonesian
   127     int encoding_hint;                      // SJS boosts Japanese
   128     Language language_hint;                 // ITALIAN boosts it
   129   } CLDHints;
   131   static const int kMaxResultChunkBytes = 65535;
   133   // For returning a vector of per-language pieces of the input buffer
   134   // Unreliable and too-short are mapped to UNKNOWN_LANGUAGE
   135   typedef struct {
   136     int offset;                 // Starting byte offset in original buffer
   137     uint16 bytes;               // Number of bytes in chunk
   138     uint16 lang1;               // Top lang, as full Language. Apply
   139                                 // static_cast<Language>() to this short value.
   140   } ResultChunk;
   141   typedef std::vector<ResultChunk> ResultChunkVector;
   144   // Scan interchange-valid UTF-8 bytes and detect most likely language
   145   Language DetectLanguage(
   146                           const char* buffer,
   147                           int buffer_length,
   148                           bool is_plain_text,
   149                           bool* is_reliable);
   151   // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
   152   // language3[0] is usually also the return value
   153   Language DetectLanguageSummary(
   154                           const char* buffer,
   155                           int buffer_length,
   156                           bool is_plain_text,
   157                           Language* language3,
   158                           int* percent3,
   159                           int* text_bytes,
   160                           bool* is_reliable);
   162   // Same as above, with hints supplied
   163   // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
   164   // language3[0] is usually also the return value
   165   Language DetectLanguageSummary(
   166                           const char* buffer,
   167                           int buffer_length,
   168                           bool is_plain_text,
   169                           const char* tld_hint,       // "id" boosts Indonesian
   170                           int encoding_hint,          // SJS boosts Japanese
   171                           Language language_hint,     // ITALIAN boosts it
   172                           Language* language3,
   173                           int* percent3,
   174                           int* text_bytes,
   175                           bool* is_reliable);
   177   // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
   178   // languages.
   179   //
   180   // Extended languages are additional interface languages and Unicode
   181   // single-language scripts, from lang_script.h
   182   //
   183   // language3[0] is usually also the return value
   184   Language ExtDetectLanguageSummary(
   185                           const char* buffer,
   186                           int buffer_length,
   187                           bool is_plain_text,
   188                           Language* language3,
   189                           int* percent3,
   190                           int* text_bytes,
   191                           bool* is_reliable);
   193   // Same as above, with hints supplied
   194   // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
   195   // languages.
   196   //
   197   // Extended languages are additional Google interface languages and Unicode
   198   // single-language scripts, from lang_script.h
   199   //
   200   // language3[0] is usually also the return value
   201   Language ExtDetectLanguageSummary(
   202                           const char* buffer,
   203                           int buffer_length,
   204                           bool is_plain_text,
   205                           const char* tld_hint,       // "id" boosts Indonesian
   206                           int encoding_hint,          // SJS boosts Japanese
   207                           Language language_hint,     // ITALIAN boosts it
   208                           Language* language3,
   209                           int* percent3,
   210                           int* text_bytes,
   211                           bool* is_reliable);
   213   // Same as above, and also returns 3 internal language scores as a ratio to
   214   // normal score for real text in that language. Scores close to 1.0 indicate
   215   // normal text, while scores far away from 1.0 indicate badly-skewed text or
   216   // gibberish
   217   //
   218   Language ExtDetectLanguageSummary(
   219                           const char* buffer,
   220                           int buffer_length,
   221                           bool is_plain_text,
   222                           const char* tld_hint,       // "id" boosts Indonesian
   223                           int encoding_hint,          // SJS boosts Japanese
   224                           Language language_hint,     // ITALIAN boosts it
   225                           Language* language3,
   226                           int* percent3,
   227                           double* normalized_score3,
   228                           int* text_bytes,
   229                           bool* is_reliable);
   232   // Use this one.
   233   // Hints are collected into a struct.
   234   // Flags are passed in (normally zero).
   235   //
   236   // Also returns 3 internal language scores as a ratio to
   237   // normal score for real text in that language. Scores close to 1.0 indicate
   238   // normal text, while scores far away from 1.0 indicate badly-skewed text or
   239   // gibberish
   240   //
   241   // Returns a vector of chunks in different languages, so that caller may
   242   // spell-check, translate, or otherwise process different parts of the input
   243   // buffer in language-dependent ways.
   244   //
   245   Language ExtDetectLanguageSummary(
   246                           const char* buffer,
   247                           int buffer_length,
   248                           bool is_plain_text,
   249                           const CLDHints* cld_hints,
   250                           int flags,
   251                           Language* language3,
   252                           int* percent3,
   253                           double* normalized_score3,
   254                           ResultChunkVector* resultchunkvector,
   255                           int* text_bytes,
   256                           bool* is_reliable);
   258   // Return version text string
   259   // String is "code_version - data_build_date"
   260   const char* DetectLanguageVersion();
   263   // Public use flags, debug output controls
   264   static const int kCLDFlagScoreAsQuads = 0x0100;  // Force Greek, etc. => quads
   265   static const int kCLDFlagHtml =         0x0200;  // Debug HTML => stderr
   266   static const int kCLDFlagCr =           0x0400;  // <cr> per chunk if HTML
   267   static const int kCLDFlagVerbose =      0x0800;  // More debug HTML => stderr
   268   static const int kCLDFlagQuiet =        0x1000;  // Less debug HTML => stderr
   269   static const int kCLDFlagEcho =         0x2000;  // Echo input => stderr
   272 /***
   274 Flag meanings:
   275  kCLDFlagScoreAsQuads
   276    Normally, several languages are detected solely by their Unicode script.
   277    Combined with appropriate lookup tables, this flag forces them instead
   278    to be detected via quadgrams. This can be a useful refinement when looking
   279    for meaningful text in these languages, instead of just character sets.
   280    The default tables do not support this use.
   281  kCLDFlagHtml
   282    For each detection call, write an HTML file to stderr, showing the text
   283    chunks and their detected languages.
   284  kCLDFlagCr
   285    In that HTML file, force a new line for each chunk.
   286  kCLDFlagVerbose
   287    In that HTML file, show every lookup entry.
   288  kCLDFlagQuiet
   289    In that HTML file, suppress most of the output detail.
   290  kCLDFlagEcho
   291   Echo every input buffer to stderr.
   292 ***/
   294 // Debug output: Print the resultchunkvector to file f
   295 void DumpResultChunkVector(FILE* f, const char* src,
   296                            ResultChunkVector* resultchunkvector);
   298 #ifdef CLD2_DYNAMIC_MODE
   300 // If compiled with dynamic mode, load data from the specified file location.
   301 // If other data has already been loaded, it is discarded and the data is read
   302 // in from the specified file location again (even if the file has not changed).
   303 // WARNING: Before calling this method, language detection will always fail
   304 // and will always return the unknown language.
   305 void loadData(const char* fileName);
   307 // If compiled with dynamic mode, unload the previously-loaded data.
   308 // WARNING: After calling this method, language detection will no longer work
   309 // and will always return the unknown language.
   310 void unloadData();
   312 // Returns true if and only if data has been loaded via a call to loadData(...)
   313 // and has not been subsequently unloaded via a call to unloadData().
   314 bool isDataLoaded();
   316 #endif // #ifdef CLD2_DYNAMIC_MODE
   318 };      // End namespace CLD2
   320 #endif  // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_

mercurial