browser/components/translation/cld2/internal/compact_lang_det.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.

     1 // Copyright 2013 Google Inc. All Rights Reserved.
     2 //
     3 // Licensed under the Apache License, Version 2.0 (the "License");
     4 // you may not use this file except in compliance with the License.
     5 // You may obtain a copy of the License at
     6 //
     7 //     http://www.apache.org/licenses/LICENSE-2.0
     8 //
     9 // Unless required by applicable law or agreed to in writing, software
    10 // distributed under the License is distributed on an "AS IS" BASIS,
    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12 // See the License for the specific language governing permissions and
    13 // limitations under the License.
    15 //
    16 // Author: dsites@google.com (Dick Sites)
    17 //
    19 #include <stdio.h>
    20 #include <stdlib.h>
    22 #include "../public/compact_lang_det.h"
    23 #include "../public/encodings.h"
    24 #include "compact_lang_det_impl.h"
    25 #include "integral_types.h"
    26 #include "lang_script.h"
    28 namespace CLD2 {
    30 // String is "code_version - data_scrape_date"
    31 //static const char* kDetectLanguageVersion = "V2.0 - 20130715";
    34 // Large-table version for all ~160 languages
    35 // Small-table version for all ~60 languages
    37 // Scan interchange-valid UTF-8 bytes and detect most likely language
    38 Language DetectLanguage(
    39                           const char* buffer,
    40                           int buffer_length,
    41                           bool is_plain_text,
    42                           bool* is_reliable) {
    43   bool allow_extended_lang = false;
    44   Language language3[3];
    45   int percent3[3];
    46   double normalized_score3[3];
    47   int text_bytes;
    48   int flags = 0;
    49   Language plus_one = UNKNOWN_LANGUAGE;
    50   const char* tld_hint = "";
    51   int encoding_hint = UNKNOWN_ENCODING;
    52   Language language_hint = UNKNOWN_LANGUAGE;
    53   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
    55   Language lang = DetectLanguageSummaryV2(
    56                           buffer,
    57                           buffer_length,
    58                           is_plain_text,
    59                           &cldhints,
    60                           allow_extended_lang,
    61                           flags,
    62                           plus_one,
    63                           language3,
    64                           percent3,
    65                           normalized_score3,
    66                           NULL,
    67                           &text_bytes,
    68                           is_reliable);
    69   // Default to English
    70   if (lang == UNKNOWN_LANGUAGE) {
    71     lang = ENGLISH;
    72   }
    73   return lang;
    74 }
    76 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
    77 Language DetectLanguageSummary(
    78                           const char* buffer,
    79                           int buffer_length,
    80                           bool is_plain_text,
    81                           Language* language3,
    82                           int* percent3,
    83                           int* text_bytes,
    84                           bool* is_reliable) {
    85   double normalized_score3[3];
    86   bool allow_extended_lang = false;
    87   int flags = 0;
    88   Language plus_one = UNKNOWN_LANGUAGE;
    89   const char* tld_hint = "";
    90   int encoding_hint = UNKNOWN_ENCODING;
    91   Language language_hint = UNKNOWN_LANGUAGE;
    92   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
    94   Language lang = DetectLanguageSummaryV2(
    95                           buffer,
    96                           buffer_length,
    97                           is_plain_text,
    98                           &cldhints,
    99                           allow_extended_lang,
   100                           flags,
   101                           plus_one,
   102                           language3,
   103                           percent3,
   104                           normalized_score3,
   105                           NULL,
   106                           text_bytes,
   107                           is_reliable);
   108   // Default to English
   109   if (lang == UNKNOWN_LANGUAGE) {
   110     lang = ENGLISH;
   111   }
   112   return lang;
   113 }
   115 // Same as above, with hints supplied
   116 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
   117 Language DetectLanguageSummary(
   118                           const char* buffer,
   119                           int buffer_length,
   120                           bool is_plain_text,
   121                           const char* tld_hint,       // "id" boosts Indonesian
   122                           int encoding_hint,          // SJS boosts Japanese
   123                           Language language_hint,     // ITALIAN boosts it
   124                           Language* language3,
   125                           int* percent3,
   126                           int* text_bytes,
   127                           bool* is_reliable) {
   128   double normalized_score3[3];
   129   bool allow_extended_lang = false;
   130   int flags = 0;
   131   Language plus_one = UNKNOWN_LANGUAGE;
   132   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
   134   Language lang = DetectLanguageSummaryV2(
   135                           buffer,
   136                           buffer_length,
   137                           is_plain_text,
   138                           &cldhints,
   139                           allow_extended_lang,
   140                           flags,
   141                           plus_one,
   142                           language3,
   143                           percent3,
   144                           normalized_score3,
   145                           NULL,
   146                           text_bytes,
   147                           is_reliable);
   148   // Default to English
   149   if (lang == UNKNOWN_LANGUAGE) {
   150     lang = ENGLISH;
   151   }
   152   return lang;
   153 }
   156 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
   157 // languages.
   158 // Extended languages are additional Google interface languages and Unicode
   159 // single-language scripts, from ext_lang_enc.h
   160 Language ExtDetectLanguageSummary(
   161                           const char* buffer,
   162                           int buffer_length,
   163                           bool is_plain_text,
   164                           Language* language3,
   165                           int* percent3,
   166                           int* text_bytes,
   167                           bool* is_reliable) {
   168   double normalized_score3[3];
   169   bool allow_extended_lang = true;
   170   int flags = 0;
   171   Language plus_one = UNKNOWN_LANGUAGE;
   172   const char* tld_hint = "";
   173   int encoding_hint = UNKNOWN_ENCODING;
   174   Language language_hint = UNKNOWN_LANGUAGE;
   175   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
   177   Language lang = DetectLanguageSummaryV2(
   178                           buffer,
   179                           buffer_length,
   180                           is_plain_text,
   181                           &cldhints,
   182                           allow_extended_lang,
   183                           flags,
   184                           plus_one,
   185                           language3,
   186                           percent3,
   187                           normalized_score3,
   188                           NULL,
   189                           text_bytes,
   190                           is_reliable);
   191   // Do not default to English
   192   return lang;
   193 }
   195 // Same as above, with hints supplied
   196 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
   197 // languages.
   198 // Extended languages are additional Google interface languages and Unicode
   199 // single-language scripts, from ext_lang_enc.h
   200 Language ExtDetectLanguageSummary(
   201                           const char* buffer,
   202                           int buffer_length,
   203                           bool is_plain_text,
   204                           const char* tld_hint,       // "id" boosts Indonesian
   205                           int encoding_hint,          // SJS boosts Japanese
   206                           Language language_hint,     // ITALIAN boosts it
   207                           Language* language3,
   208                           int* percent3,
   209                           int* text_bytes,
   210                           bool* is_reliable) {
   211   double normalized_score3[3];
   212   bool allow_extended_lang = true;
   213   int flags = 0;
   214   Language plus_one = UNKNOWN_LANGUAGE;
   215   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
   217   Language lang = DetectLanguageSummaryV2(
   218                           buffer,
   219                           buffer_length,
   220                           is_plain_text,
   221                           &cldhints,
   222                           allow_extended_lang,
   223                           flags,
   224                           plus_one,
   225                           language3,
   226                           percent3,
   227                           normalized_score3,
   228                           NULL,
   229                           text_bytes,
   230                           is_reliable);
   231   // Do not default to English
   232   return lang;
   233 }
   235 // Same as above, and also returns internal language scores as a ratio to
   236 // normal score for real text in that language. Scores close to 1.0 indicate
   237 // normal text, while scores far away from 1.0 indicate badly-skewed text or
   238 // gibberish
   239 //
   240 Language ExtDetectLanguageSummary(
   241                         const char* buffer,
   242                         int buffer_length,
   243                         bool is_plain_text,
   244                         const char* tld_hint,       // "id" boosts Indonesian
   245                         int encoding_hint,          // SJS boosts Japanese
   246                         Language language_hint,     // ITALIAN boosts it
   247                         Language* language3,
   248                         int* percent3,
   249                         double* normalized_score3,
   250                         int* text_bytes,
   251                         bool* is_reliable) {
   252   bool allow_extended_lang = true;
   253   int flags = 0;
   254   Language plus_one = UNKNOWN_LANGUAGE;
   255   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
   257   Language lang = DetectLanguageSummaryV2(
   258                           buffer,
   259                           buffer_length,
   260                           is_plain_text,
   261                           &cldhints,
   262                           allow_extended_lang,
   263                           flags,
   264                           plus_one,
   265                           language3,
   266                           percent3,
   267                           normalized_score3,
   268                           NULL,
   269                           text_bytes,
   270                           is_reliable);
   271   // Do not default to English
   272   return lang;
   273 }
   275 // Use this one.
   276 // Hints are collected into a struct.
   277 // Flags are passed in (normally zero).
   278 //
   279 // Also returns 3 internal language scores as a ratio to
   280 // normal score for real text in that language. Scores close to 1.0 indicate
   281 // normal text, while scores far away from 1.0 indicate badly-skewed text or
   282 // gibberish
   283 //
   284 // Returns a vector of chunks in different languages, so that caller may
   285 // spell-check, translate, or otherwaise process different parts of the input
   286 // buffer in language-dependant ways.
   287 //
   288 Language ExtDetectLanguageSummary(
   289                         const char* buffer,
   290                         int buffer_length,
   291                         bool is_plain_text,
   292                         const CLDHints* cld_hints,
   293                         int flags,
   294                         Language* language3,
   295                         int* percent3,
   296                         double* normalized_score3,
   297                         ResultChunkVector* resultchunkvector,
   298                         int* text_bytes,
   299                         bool* is_reliable) {
   300   bool allow_extended_lang = true;
   301   Language plus_one = UNKNOWN_LANGUAGE;
   303   Language lang = DetectLanguageSummaryV2(
   304                           buffer,
   305                           buffer_length,
   306                           is_plain_text,
   307                           cld_hints,
   308                           allow_extended_lang,
   309                           flags,
   310                           plus_one,
   311                           language3,
   312                           percent3,
   313                           normalized_score3,
   314                           resultchunkvector,
   315                           text_bytes,
   316                           is_reliable);
   317   // Do not default to English
   318   return lang;
   319 }
   321 }       // End namespace CLD2

mercurial