The Tor Browser: browser/components/translation/cld2/internal/compact_lang_det.cc

Cloned from the upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 // Copyright 2013 Google Inc. All Rights Reserved.

     2 //

     3 // Licensed under the Apache License, Version 2.0 (the "License");

     4 // you may not use this file except in compliance with the License.

     5 // You may obtain a copy of the License at

     6 //

     7 //     http://www.apache.org/licenses/LICENSE-2.0

     8 //

     9 // Unless required by applicable law or agreed to in writing, software

    10 // distributed under the License is distributed on an "AS IS" BASIS,

    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

    12 // See the License for the specific language governing permissions and

    13 // limitations under the License.

    15 //

    16 // Author: dsites@google.com (Dick Sites)

    17 //

    19 #include <stdio.h>

    20 #include <stdlib.h>

    22 #include "../public/compact_lang_det.h"

    23 #include "../public/encodings.h"

    24 #include "compact_lang_det_impl.h"

    25 #include "integral_types.h"

    26 #include "lang_script.h"

    28 namespace CLD2 {

    30 // String is "code_version - data_scrape_date"

    31 //static const char* kDetectLanguageVersion = "V2.0 - 20130715";

    34 // Large-table version for all ~160 languages

    35 // Small-table version for all ~60 languages

    37 // Scan interchange-valid UTF-8 bytes and detect most likely language

    38 Language DetectLanguage(

    39                           const char* buffer,

    40                           int buffer_length,

    41                           bool is_plain_text,

    42                           bool* is_reliable) {

    43   bool allow_extended_lang = false;

    44   Language language3[3];

    45   int percent3[3];

    46   double normalized_score3[3];

    47   int text_bytes;

    48   int flags = 0;

    49   Language plus_one = UNKNOWN_LANGUAGE;

    50   const char* tld_hint = "";

    51   int encoding_hint = UNKNOWN_ENCODING;

    52   Language language_hint = UNKNOWN_LANGUAGE;

    53   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};

    55   Language lang = DetectLanguageSummaryV2(

    56                           buffer,

    57                           buffer_length,

    58                           is_plain_text,

    59                           &cldhints,

    60                           allow_extended_lang,

    61                           flags,

    62                           plus_one,

    63                           language3,

    64                           percent3,

    65                           normalized_score3,

    66                           NULL,

    67                           &text_bytes,

    68                           is_reliable);

    69   // Default to English

    70   if (lang == UNKNOWN_LANGUAGE) {

    71     lang = ENGLISH;

    72   }

    73   return lang;

    74 }

    76 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.

    77 Language DetectLanguageSummary(

    78                           const char* buffer,

    79                           int buffer_length,

    80                           bool is_plain_text,

    81                           Language* language3,

    82                           int* percent3,

    83                           int* text_bytes,

    84                           bool* is_reliable) {

    85   double normalized_score3[3];

    86   bool allow_extended_lang = false;

    87   int flags = 0;

    88   Language plus_one = UNKNOWN_LANGUAGE;

    89   const char* tld_hint = "";

    90   int encoding_hint = UNKNOWN_ENCODING;

    91   Language language_hint = UNKNOWN_LANGUAGE;

    92   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};

    94   Language lang = DetectLanguageSummaryV2(

    95                           buffer,

    96                           buffer_length,

    97                           is_plain_text,

    98                           &cldhints,

    99                           allow_extended_lang,

   100                           flags,

   101                           plus_one,

   102                           language3,

   103                           percent3,

   104                           normalized_score3,

   105                           NULL,

   106                           text_bytes,

   107                           is_reliable);

   108   // Default to English

   109   if (lang == UNKNOWN_LANGUAGE) {

   110     lang = ENGLISH;

   111   }

   112   return lang;

   113 }

   115 // Same as above, with hints supplied

   116 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.

   117 Language DetectLanguageSummary(

   118                           const char* buffer,

   119                           int buffer_length,

   120                           bool is_plain_text,

   121                           const char* tld_hint,       // "id" boosts Indonesian

   122                           int encoding_hint,          // SJS boosts Japanese

   123                           Language language_hint,     // ITALIAN boosts it

   124                           Language* language3,

   125                           int* percent3,

   126                           int* text_bytes,

   127                           bool* is_reliable) {

   128   double normalized_score3[3];

   129   bool allow_extended_lang = false;

   130   int flags = 0;

   131   Language plus_one = UNKNOWN_LANGUAGE;

   132   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};

   134   Language lang = DetectLanguageSummaryV2(

   135                           buffer,

   136                           buffer_length,

   137                           is_plain_text,

   138                           &cldhints,

   139                           allow_extended_lang,

   140                           flags,

   141                           plus_one,

   142                           language3,

   143                           percent3,

   144                           normalized_score3,

   145                           NULL,

   146                           text_bytes,

   147                           is_reliable);

   148   // Default to English

   149   if (lang == UNKNOWN_LANGUAGE) {

   150     lang = ENGLISH;

   151   }

   152   return lang;

   153 }

   156 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended

   157 // languages.

   158 // Extended languages are additional Google interface languages and Unicode

   159 // single-language scripts, from ext_lang_enc.h

   160 Language ExtDetectLanguageSummary(

   161                           const char* buffer,

   162                           int buffer_length,

   163                           bool is_plain_text,

   164                           Language* language3,

   165                           int* percent3,

   166                           int* text_bytes,

   167                           bool* is_reliable) {

   168   double normalized_score3[3];

   169   bool allow_extended_lang = true;

   170   int flags = 0;

   171   Language plus_one = UNKNOWN_LANGUAGE;

   172   const char* tld_hint = "";

   173   int encoding_hint = UNKNOWN_ENCODING;

   174   Language language_hint = UNKNOWN_LANGUAGE;

   175   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};

   177   Language lang = DetectLanguageSummaryV2(

   178                           buffer,

   179                           buffer_length,

   180                           is_plain_text,

   181                           &cldhints,

   182                           allow_extended_lang,

   183                           flags,

   184                           plus_one,

   185                           language3,

   186                           percent3,

   187                           normalized_score3,

   188                           NULL,

   189                           text_bytes,

   190                           is_reliable);

   191   // Do not default to English

   192   return lang;

   193 }

   195 // Same as above, with hints supplied

   196 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended

   197 // languages.

   198 // Extended languages are additional Google interface languages and Unicode

   199 // single-language scripts, from ext_lang_enc.h

   200 Language ExtDetectLanguageSummary(

   201                           const char* buffer,

   202                           int buffer_length,

   203                           bool is_plain_text,

   204                           const char* tld_hint,       // "id" boosts Indonesian

   205                           int encoding_hint,          // SJS boosts Japanese

   206                           Language language_hint,     // ITALIAN boosts it

   207                           Language* language3,

   208                           int* percent3,

   209                           int* text_bytes,

   210                           bool* is_reliable) {

   211   double normalized_score3[3];

   212   bool allow_extended_lang = true;

   213   int flags = 0;

   214   Language plus_one = UNKNOWN_LANGUAGE;

   215   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};

   217   Language lang = DetectLanguageSummaryV2(

   218                           buffer,

   219                           buffer_length,

   220                           is_plain_text,

   221                           &cldhints,

   222                           allow_extended_lang,

   223                           flags,

   224                           plus_one,

   225                           language3,

   226                           percent3,

   227                           normalized_score3,

   228                           NULL,

   229                           text_bytes,

   230                           is_reliable);

   231   // Do not default to English

   232   return lang;

   233 }

   235 // Same as above, and also returns internal language scores as a ratio to

   236 // normal score for real text in that language. Scores close to 1.0 indicate

   237 // normal text, while scores far away from 1.0 indicate badly-skewed text or

   238 // gibberish

   239 //

   240 Language ExtDetectLanguageSummary(

   241                         const char* buffer,

   242                         int buffer_length,

   243                         bool is_plain_text,

   244                         const char* tld_hint,       // "id" boosts Indonesian

   245                         int encoding_hint,          // SJS boosts Japanese

   246                         Language language_hint,     // ITALIAN boosts it

   247                         Language* language3,

   248                         int* percent3,

   249                         double* normalized_score3,

   250                         int* text_bytes,

   251                         bool* is_reliable) {

   252   bool allow_extended_lang = true;

   253   int flags = 0;

   254   Language plus_one = UNKNOWN_LANGUAGE;

   255   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};

   257   Language lang = DetectLanguageSummaryV2(

   258                           buffer,

   259                           buffer_length,

   260                           is_plain_text,

   261                           &cldhints,

   262                           allow_extended_lang,

   263                           flags,

   264                           plus_one,

   265                           language3,

   266                           percent3,

   267                           normalized_score3,

   268                           NULL,

   269                           text_bytes,

   270                           is_reliable);

   271   // Do not default to English

   272   return lang;

   273 }

   275 // Use this one.

   276 // Hints are collected into a struct.

   277 // Flags are passed in (normally zero).

   278 //

   279 // Also returns 3 internal language scores as a ratio to

   280 // normal score for real text in that language. Scores close to 1.0 indicate

   281 // normal text, while scores far away from 1.0 indicate badly-skewed text or

   282 // gibberish

   283 //

   284 // Returns a vector of chunks in different languages, so that caller may

   285 // spell-check, translate, or otherwaise process different parts of the input

   286 // buffer in language-dependant ways.

   287 //

   288 Language ExtDetectLanguageSummary(

   289                         const char* buffer,

   290                         int buffer_length,

   291                         bool is_plain_text,

   292                         const CLDHints* cld_hints,

   293                         int flags,

   294                         Language* language3,

   295                         int* percent3,

   296                         double* normalized_score3,

   297                         ResultChunkVector* resultchunkvector,

   298                         int* text_bytes,

   299                         bool* is_reliable) {

   300   bool allow_extended_lang = true;

   301   Language plus_one = UNKNOWN_LANGUAGE;

   303   Language lang = DetectLanguageSummaryV2(

   304                           buffer,

   305                           buffer_length,

   306                           is_plain_text,

   307                           cld_hints,

   308                           allow_extended_lang,

   309                           flags,

   310                           plus_one,

   311                           language3,

   312                           percent3,

   313                           normalized_score3,

   314                           resultchunkvector,

   315                           text_bytes,

   316                           is_reliable);

   317   // Do not default to English

   318   return lang;

   319 }

   321 }       // End namespace CLD2

The Tor Browser / file revision

browser/components/translation/cld2/internal/compact_lang_det.cc@6474c204b198

browser/components/translation/cld2/internal/compact_lang_det.cc