browser/components/translation/cld2/internal/compact_lang_det.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/compact_lang_det.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,322 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// Author: dsites@google.com (Dick Sites)
    1.20 +//
    1.21 +
    1.22 +#include <stdio.h>
    1.23 +#include <stdlib.h>
    1.24 +
    1.25 +#include "../public/compact_lang_det.h"
    1.26 +#include "../public/encodings.h"
    1.27 +#include "compact_lang_det_impl.h"
    1.28 +#include "integral_types.h"
    1.29 +#include "lang_script.h"
    1.30 +
    1.31 +namespace CLD2 {
    1.32 +
    1.33 +// String is "code_version - data_scrape_date"
    1.34 +//static const char* kDetectLanguageVersion = "V2.0 - 20130715";
    1.35 +
    1.36 +
    1.37 +// Large-table version for all ~160 languages
    1.38 +// Small-table version for all ~60 languages
    1.39 +
    1.40 +// Scan interchange-valid UTF-8 bytes and return the single most likely language; ENGLISH is returned in place of UNKNOWN_LANGUAGE.
    1.41 +Language DetectLanguage(
    1.42 +                          const char* buffer,
    1.43 +                          int buffer_length,
    1.44 +                          bool is_plain_text,
    1.45 +                          bool* is_reliable) {
    1.46 +  bool allow_extended_lang = false;
    1.47 +  Language language3[3];  // Local buffer handed to DetectLanguageSummaryV2; not returned to the caller
    1.48 +  int percent3[3];  // Local buffer; not returned to the caller
    1.49 +  double normalized_score3[3];  // Local buffer; not returned to the caller
    1.50 +  int text_bytes;  // Passed by address below; never read in this function
    1.51 +  int flags = 0;
    1.52 +  Language plus_one = UNKNOWN_LANGUAGE;
    1.53 +  const char* tld_hint = "";
    1.54 +  int encoding_hint = UNKNOWN_ENCODING;
    1.55 +  Language language_hint = UNKNOWN_LANGUAGE;
    1.56 +  CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};  // No hints: NULL first field, empty TLD, unknown encoding and language
    1.57 +
    1.58 +  Language lang = DetectLanguageSummaryV2(
    1.59 +                          buffer,
    1.60 +                          buffer_length,
    1.61 +                          is_plain_text,
    1.62 +                          &cldhints,
    1.63 +                          allow_extended_lang,
    1.64 +                          flags,
    1.65 +                          plus_one,
    1.66 +                          language3,
    1.67 +                          percent3,
    1.68 +                          normalized_score3,
    1.69 +                          NULL,  // Argument slot where the ResultChunkVector overload forwards its chunk vector; none here
    1.70 +                          &text_bytes,
    1.71 +                          is_reliable);
    1.72 +  // Default to English rather than returning UNKNOWN_LANGUAGE
    1.73 +  if (lang == UNKNOWN_LANGUAGE) {
    1.74 +    lang = ENGLISH;
    1.75 +  }
    1.76 +  return lang;
    1.77 +}
    1.78 +
    1.79 +// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. No hints are supplied; ENGLISH is returned in place of UNKNOWN_LANGUAGE.
    1.80 +Language DetectLanguageSummary(
    1.81 +                          const char* buffer,
    1.82 +                          int buffer_length,
    1.83 +                          bool is_plain_text,
    1.84 +                          Language* language3,
    1.85 +                          int* percent3,
    1.86 +                          int* text_bytes,
    1.87 +                          bool* is_reliable) {
    1.88 +  double normalized_score3[3];  // Local buffer handed to DetectLanguageSummaryV2; not returned to the caller
    1.89 +  bool allow_extended_lang = false;
    1.90 +  int flags = 0;
    1.91 +  Language plus_one = UNKNOWN_LANGUAGE;
    1.92 +  const char* tld_hint = "";
    1.93 +  int encoding_hint = UNKNOWN_ENCODING;
    1.94 +  Language language_hint = UNKNOWN_LANGUAGE;
    1.95 +  CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};  // No hints: NULL first field, empty TLD, unknown encoding and language
    1.96 +
    1.97 +  Language lang = DetectLanguageSummaryV2(
    1.98 +                          buffer,
    1.99 +                          buffer_length,
   1.100 +                          is_plain_text,
   1.101 +                          &cldhints,
   1.102 +                          allow_extended_lang,
   1.103 +                          flags,
   1.104 +                          plus_one,
   1.105 +                          language3,
   1.106 +                          percent3,
   1.107 +                          normalized_score3,
   1.108 +                          NULL,  // Argument slot where the ResultChunkVector overload forwards its chunk vector; none here
   1.109 +                          text_bytes,
   1.110 +                          is_reliable);
   1.111 +  // Default to English rather than returning UNKNOWN_LANGUAGE
   1.112 +  if (lang == UNKNOWN_LANGUAGE) {
   1.113 +    lang = ENGLISH;
   1.114 +  }
   1.115 +  return lang;
   1.116 +}
   1.117 +
   1.118 +// Same as the hint-less DetectLanguageSummary overload, with hints supplied by the caller
   1.119 +// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. ENGLISH is returned in place of UNKNOWN_LANGUAGE.
   1.120 +Language DetectLanguageSummary(
   1.121 +                          const char* buffer,
   1.122 +                          int buffer_length,
   1.123 +                          bool is_plain_text,
   1.124 +                          const char* tld_hint,       // "id" boosts Indonesian
   1.125 +                          int encoding_hint,          // SJS boosts Japanese
   1.126 +                          Language language_hint,     // ITALIAN boosts it
   1.127 +                          Language* language3,
   1.128 +                          int* percent3,
   1.129 +                          int* text_bytes,
   1.130 +                          bool* is_reliable) {
   1.131 +  double normalized_score3[3];  // Local buffer handed to DetectLanguageSummaryV2; not returned to the caller
   1.132 +  bool allow_extended_lang = false;
   1.133 +  int flags = 0;
   1.134 +  Language plus_one = UNKNOWN_LANGUAGE;
   1.135 +  CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};  // Caller's three hints; first field left NULL
   1.136 +
   1.137 +  Language lang = DetectLanguageSummaryV2(
   1.138 +                          buffer,
   1.139 +                          buffer_length,
   1.140 +                          is_plain_text,
   1.141 +                          &cldhints,
   1.142 +                          allow_extended_lang,
   1.143 +                          flags,
   1.144 +                          plus_one,
   1.145 +                          language3,
   1.146 +                          percent3,
   1.147 +                          normalized_score3,
   1.148 +                          NULL,  // Argument slot where the ResultChunkVector overload forwards its chunk vector; none here
   1.149 +                          text_bytes,
   1.150 +                          is_reliable);
   1.151 +  // Default to English rather than returning UNKNOWN_LANGUAGE
   1.152 +  if (lang == UNKNOWN_LANGUAGE) {
   1.153 +    lang = ENGLISH;
   1.154 +  }
   1.155 +  return lang;
   1.156 +}
   1.157 +
   1.158 +
   1.159 +// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
   1.160 +// languages. No hints are supplied, and the result is returned as-is (no ENGLISH default).
   1.161 +// Extended languages are additional Google interface languages and Unicode
   1.162 +// single-language scripts, from ext_lang_enc.h
   1.163 +Language ExtDetectLanguageSummary(
   1.164 +                          const char* buffer,
   1.165 +                          int buffer_length,
   1.166 +                          bool is_plain_text,
   1.167 +                          Language* language3,
   1.168 +                          int* percent3,
   1.169 +                          int* text_bytes,
   1.170 +                          bool* is_reliable) {
   1.171 +  double normalized_score3[3];  // Local buffer handed to DetectLanguageSummaryV2; not returned to the caller
   1.172 +  bool allow_extended_lang = true;
   1.173 +  int flags = 0;
   1.174 +  Language plus_one = UNKNOWN_LANGUAGE;
   1.175 +  const char* tld_hint = "";
   1.176 +  int encoding_hint = UNKNOWN_ENCODING;
   1.177 +  Language language_hint = UNKNOWN_LANGUAGE;
   1.178 +  CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};  // No hints: NULL first field, empty TLD, unknown encoding and language
   1.179 +
   1.180 +  Language lang = DetectLanguageSummaryV2(
   1.181 +                          buffer,
   1.182 +                          buffer_length,
   1.183 +                          is_plain_text,
   1.184 +                          &cldhints,
   1.185 +                          allow_extended_lang,
   1.186 +                          flags,
   1.187 +                          plus_one,
   1.188 +                          language3,
   1.189 +                          percent3,
   1.190 +                          normalized_score3,
   1.191 +                          NULL,  // Argument slot where the ResultChunkVector overload forwards its chunk vector; none here
   1.192 +                          text_bytes,
   1.193 +                          is_reliable);
   1.194 +  // Do not default to English; UNKNOWN_LANGUAGE is passed through to the caller
   1.195 +  return lang;
   1.196 +}
   1.197 +
   1.198 +// Same as the hint-less ExtDetectLanguageSummary overload, with hints supplied by the caller
   1.199 +// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
   1.200 +// languages. The result is returned as-is (no ENGLISH default).
   1.201 +// Extended languages are additional Google interface languages and Unicode
   1.202 +// single-language scripts, from ext_lang_enc.h
   1.203 +Language ExtDetectLanguageSummary(
   1.204 +                          const char* buffer,
   1.205 +                          int buffer_length,
   1.206 +                          bool is_plain_text,
   1.207 +                          const char* tld_hint,       // "id" boosts Indonesian
   1.208 +                          int encoding_hint,          // SJS boosts Japanese
   1.209 +                          Language language_hint,     // ITALIAN boosts it
   1.210 +                          Language* language3,
   1.211 +                          int* percent3,
   1.212 +                          int* text_bytes,
   1.213 +                          bool* is_reliable) {
   1.214 +  double normalized_score3[3];  // Local buffer handed to DetectLanguageSummaryV2; not returned to the caller
   1.215 +  bool allow_extended_lang = true;
   1.216 +  int flags = 0;
   1.217 +  Language plus_one = UNKNOWN_LANGUAGE;
   1.218 +  CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};  // Caller's three hints; first field left NULL
   1.219 +
   1.220 +  Language lang = DetectLanguageSummaryV2(
   1.221 +                          buffer,
   1.222 +                          buffer_length,
   1.223 +                          is_plain_text,
   1.224 +                          &cldhints,
   1.225 +                          allow_extended_lang,
   1.226 +                          flags,
   1.227 +                          plus_one,
   1.228 +                          language3,
   1.229 +                          percent3,
   1.230 +                          normalized_score3,
   1.231 +                          NULL,  // Argument slot where the ResultChunkVector overload forwards its chunk vector; none here
   1.232 +                          text_bytes,
   1.233 +                          is_reliable);
   1.234 +  // Do not default to English; UNKNOWN_LANGUAGE is passed through to the caller
   1.235 +  return lang;
   1.236 +}
   1.237 +
   1.238 +// Same as the hinted ExtDetectLanguageSummary overload, and also returns internal language scores as a ratio to
   1.239 +// normal score for real text in that language. Scores close to 1.0 indicate
   1.240 +// normal text, while scores far away from 1.0 indicate badly-skewed text or
   1.241 +// gibberish. Here normalized_score3 is a caller-provided pointer forwarded to DetectLanguageSummaryV2.
   1.242 +//
   1.243 +Language ExtDetectLanguageSummary(
   1.244 +                        const char* buffer,
   1.245 +                        int buffer_length,
   1.246 +                        bool is_plain_text,
   1.247 +                        const char* tld_hint,       // "id" boosts Indonesian
   1.248 +                        int encoding_hint,          // SJS boosts Japanese
   1.249 +                        Language language_hint,     // ITALIAN boosts it
   1.250 +                        Language* language3,
   1.251 +                        int* percent3,
   1.252 +                        double* normalized_score3,
   1.253 +                        int* text_bytes,
   1.254 +                        bool* is_reliable) {
   1.255 +  bool allow_extended_lang = true;
   1.256 +  int flags = 0;
   1.257 +  Language plus_one = UNKNOWN_LANGUAGE;
   1.258 +  CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};  // Caller's three hints; first field left NULL
   1.259 +
   1.260 +  Language lang = DetectLanguageSummaryV2(
   1.261 +                          buffer,
   1.262 +                          buffer_length,
   1.263 +                          is_plain_text,
   1.264 +                          &cldhints,
   1.265 +                          allow_extended_lang,
   1.266 +                          flags,
   1.267 +                          plus_one,
   1.268 +                          language3,
   1.269 +                          percent3,
   1.270 +                          normalized_score3,
   1.271 +                          NULL,  // Argument slot where the ResultChunkVector overload forwards its chunk vector; none here
   1.272 +                          text_bytes,
   1.273 +                          is_reliable);
   1.274 +  // Do not default to English; UNKNOWN_LANGUAGE is passed through to the caller
   1.275 +  return lang;
   1.276 +}
   1.277 +
   1.278 +// Use this one.
   1.279 +// Hints are collected into a struct.
   1.280 +// Flags are passed in (normally zero).
   1.281 +//
   1.282 +// Also returns 3 internal language scores as a ratio to
   1.283 +// normal score for real text in that language. Scores close to 1.0 indicate
   1.284 +// normal text, while scores far away from 1.0 indicate badly-skewed text or
   1.285 +// gibberish
   1.286 +//
   1.287 +// Returns a vector of chunks in different languages, so that caller may
   1.288 +// spell-check, translate, or otherwise process different parts of the input
   1.289 +// buffer in language-dependent ways.
   1.290 +//
   1.291 +Language ExtDetectLanguageSummary(
   1.292 +                        const char* buffer,
   1.293 +                        int buffer_length,
   1.294 +                        bool is_plain_text,
   1.295 +                        const CLDHints* cld_hints,
   1.296 +                        int flags,
   1.297 +                        Language* language3,
   1.298 +                        int* percent3,
   1.299 +                        double* normalized_score3,
   1.300 +                        ResultChunkVector* resultchunkvector,
   1.301 +                        int* text_bytes,
   1.302 +                        bool* is_reliable) {
   1.303 +  bool allow_extended_lang = true;
   1.304 +  Language plus_one = UNKNOWN_LANGUAGE;
   1.305 +
   1.306 +  Language lang = DetectLanguageSummaryV2(
   1.307 +                          buffer,
   1.308 +                          buffer_length,
   1.309 +                          is_plain_text,
   1.310 +                          cld_hints,  // Forwarded unchanged; no local CLDHints is built in this overload
   1.311 +                          allow_extended_lang,
   1.312 +                          flags,
   1.313 +                          plus_one,
   1.314 +                          language3,
   1.315 +                          percent3,
   1.316 +                          normalized_score3,
   1.317 +                          resultchunkvector,
   1.318 +                          text_bytes,
   1.319 +                          is_reliable);
   1.320 +  // Do not default to English; UNKNOWN_LANGUAGE is passed through to the caller
   1.321 +  return lang;
   1.322 +}
   1.323 +
   1.324 +}       // End namespace CLD2
   1.325 +

mercurial