The Tor Browser: browser/components/translation/cld2/internal/compact_lang_det

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.

     2 //

     3 // Licensed under the Apache License, Version 2.0 (the "License");

     4 // you may not use this file except in compliance with the License.

     5 // You may obtain a copy of the License at

     6 //

     7 //     http://www.apache.org/licenses/LICENSE-2.0

     8 //

     9 // Unless required by applicable law or agreed to in writing, software

    10 // distributed under the License is distributed on an "AS IS" BASIS,

    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

    12 // See the License for the specific language governing permissions and

    13 // limitations under the License.

    15 //

    16 // Author: dsites@google.com (Dick Sites)

    17 //

    19 #ifndef I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_

    20 #define I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_

    22 #include <vector>

    24 #include "../public/compact_lang_det.h"   // For CLDHints, ResultChunkVector

    25 #include "integral_types.h"

    26 #include "lang_script.h"

    28 namespace CLD2 {

    30 // Internal use flags

    31 static const int kCLDFlagFinish = 1;      // Do not recurse further; return the result even if unreliable

    32 static const int kCLDFlagSqueeze = 2;     // Cheap-squeeze each text run in place to drop repetitive/useless chunks

    33 static const int kCLDFlagRepeats = 4;     // Skip scoring a unigram/quadgram whose last character was correctly predicted

    34 static const int kCLDFlagTop40 = 8;       // Restrict the scored languages to the "Top 40" set

    35 static const int kCLDFlagShort = 16;      // Deprecated, unused

    36 static const int kCLDFlagHint = 32;       // Experimental: a language hint is supplied in parameter plus_one

    37 static const int kCLDFlagUseWords = 64;   // Also score complete words, not only quad/uni/nil-grams

    38 static const int kCLDFlagUNUSED = 128;    // No meaning documented in this file; presumably a spare bit -- TODO confirm

    40 // Public use flags, debug output controls, defined in compact_lang_det.h

    41 // 0x0100 and above

    43 /***

    45 Flag meanings:

    47 Flags are used in the context of a recursive call from Detect to itself,

    48 trying to deal in a more restrictive way with input that was not reliably

    49 identified in the top-level call.

    51 Finish -- Do not further recurse; return whatever result ensues, even if it is

    52           unreliable. Typically set in any recursive call to take a second try

    53           on unreliable text.

    55 Squeeze -- For each text run, do an inplace cheapsqueeze to remove chunks of

    56           highly repetitive text and chunks of text with too many 1- and

    57           2-letter words. This avoids scoring repetitive or useless non-text

    58           crap in large files such as bogus JPEGs within an HTML file.

    60 Repeats -- When scoring a text run, do a cheap prediction of each character

    61           and do not score a unigram/quadgram if the last character of same is

    62           correctly predicted. This is a slower, finer-grained form of

    63           cheapsqueeze, typically used when the first pass got unreliable

    64           results.

    66 Top40 -- Restrict the set of scored languages to the Google "Top 40", which is

    67           actually 38 languages. This gets rid of about 110 languages that

    68           represent about 0.7% of the web. Typically used when the first pass

    69           got unreliable results.

    71 Short -- DEPRECATED, unused

    73 Hint -- EXPERIMENTAL flag for compact_lang_det_test.cc to indicate a language

    74           hint supplied in parameter plus_one.

    76 UseWords -- In addition to scoring quad/uni/nil-grams, score complete words

    80 Tentative decision logic:

    82 In the middle of first pass -- After 4KB of text, look at the front 256 bytes

    83           of every full 4KB buffer. If it compresses very well (say 3:1) or has

    84           lots of spaces (say 1 of every 4 bytes), assume that the input is

    85           large and contains lots of bogus non-text. Recurse, passing the

    86           Squeeze flag to strip out chunks of this non-text.

    88 At the end of the first pass --

    89           If the top language is reliable and >= 70% of the document, return.

    90           Else if the top language is reliable and top+2nd >= say 94%, return.

    91           Else, either the top language is not reliable or there is a lot of

    92           other crap.

    93 ***/

    96   // Scan interchange-valid UTF-8 bytes and detect most likely language,

    97   // or set of languages.

    98   //

    99   // Design goals:

   100   //   Skip over big stretches of HTML tags

   101   //   Able to return ranges of different languages

   102   //   Relatively small tables and relatively fast processing

   103   //   Thread safe

   104   //

   106   typedef struct {  // A count paired with a pointer to const Language data; referenced by LangDetObj::kPerScriptPair

   107     int perscript_count;  // presumably the number of entries reachable through perscript_lang -- TODO confirm

   108     const Language* perscript_lang;  // presumably the languages associated with one script -- TODO confirm

   109   } PerScriptPair;

   111   typedef struct {  // Bundles the quadgram hashing constants with pointers to the lookup tables; no table lengths are stored here

   112     // Constants for hashing 4-7 byte quadgram to 32 bits

   113     const int kQuadHashB4Shift;

   114     const int kQuadHashB4bShift;

   115     const int kQuadHashB5Shift;

   116     const int kQuadHashB5bShift;

   117     // Constants for hashing 32 bits to kQuadKeyTable subscript/key

   118     const int kHashvalToSubShift;

   119     const uint32 kHashvalToSubMask;

   120     const int kHashvalToKeyShift;

   121     const uint32 kHashvalToKeyMask;

   122     const int kHashvalAssociativity;

   123     // Pointers to the actual tables

   124     const PerScriptPair* kPerScriptPair;

   125     const uint16* kQuadKeyTable;

   126     const uint32* kQuadValueTable;

   127   } LangDetObj;

   129   // For HTML documents, tags are skipped, along with <script> ... </script>

   130   // and <style> ... </style> sequences, and entities are expanded.

   131   //

   132   // We distinguish between bytes of the raw input buffer and bytes of non-tag

   133   // text letters. Since tags can be over 50% of the bytes of an HTML Page,

   134   // and are nearly all seven-bit ASCII English, we prefer to distinguish

   135   // language mixture fractions based on just the non-tag text.

   136   //

   137   // Inputs: buffer and buffer_length

   138   //  is_plain_text if true says to NOT parse/skip HTML tags nor entities

   139   // Outputs:

   140   //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE

   141   //  percent3 is an array of the text percentages 0..100 of the top 3 languages

   142   //  normalized_score3 is an array of internal scores, normalized to the

   143   //    average score for each language over a body of training text. A

   144   //    normalized score significantly away from 1.0 indicates very skewed text

   145   //    or gibberish.

   146   //

   147   //  text_bytes is the amount of non-tag/letters-only text found

   148   //  is_reliable set true if the returned Language is at least 2**30 times more

   149   //  probable than the second-best Language

   150   //

   151   // Return value: the most likely Language for the majority of the input text

   152   //  Length 0 input and text with no reliable letter sequences returns

   153   //  UNKNOWN_LANGUAGE

   154   //

   155   // Subsetting: For fast detection over large documents, these routines will

   156   // only scan up to a fixed limit (currently 160KB of non-tag letters).

   157   //

   159   Language DetectLanguageSummaryV2(  // Returns the most likely Language for the majority of the input text

   160                         const char* buffer,  // in: input text to scan

   161                         int buffer_length,  // in: length of buffer (presumably in bytes -- TODO confirm)

   162                         bool is_plain_text,  // in: if true, do NOT parse/skip HTML tags nor entities

   163                         const CLDHints* cld_hints,  // in: hints; CLDHints comes from ../public/compact_lang_det.h

   164                         bool allow_extended_lang,  // in: not described in this file -- see the implementation

   165                         int flags,  // in: kCLDFlag* bits; public flags are 0x0100 and above

   166                         Language plus_one,  // in: language hint, used with kCLDFlagHint

   167                         Language* language3,  // out: array of the top 3 languages or UNKNOWN_LANGUAGE

   168                         int* percent3,  // out: array of text percentages 0..100 of the top 3 languages

   169                         double* normalized_score3,  // out: array of internal scores normalized per language

   170                         ResultChunkVector* resultchunkvector,  // presumably receives per-range language results -- TODO confirm

   171                         int* text_bytes,  // out: amount of non-tag/letters-only text found

   172                         bool* is_reliable);  // out: true if the result is far more probable than the second-best Language

   174   // For unit testing:

   175   // Remove portions of text that have a high density of spaces, or that are

   176   // overly repetitive, squeezing the remaining text in-place to the front

   177   // of the input buffer.

   178   // Return the new, possibly-shorter length

   179   int CheapSqueezeInplace(char* isrc, int srclen, int ichunksize);  // isrc: buffer squeezed in place; srclen: its input length; ichunksize: presumably the chunk granularity -- TODO confirm

   181 }       // End namespace CLD2

   183 #endif  // I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_

The Tor Browser / file revision

browser/components/translation/cld2/internal/compact_lang_det_impl.h@6474c204b198

browser/components/translation/cld2/internal/compact_lang_det_impl.h