browser/components/translation/cld2/internal/compact_lang_det.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // Author: dsites@google.com (Dick Sites)
michael@0 17 //
michael@0 18
michael@0 19 #include <stdio.h>
michael@0 20 #include <stdlib.h>
michael@0 21
michael@0 22 #include "../public/compact_lang_det.h"
michael@0 23 #include "../public/encodings.h"
michael@0 24 #include "compact_lang_det_impl.h"
michael@0 25 #include "integral_types.h"
michael@0 26 #include "lang_script.h"
michael@0 27
michael@0 28 namespace CLD2 {
michael@0 29
michael@0 30 // String is "code_version - data_scrape_date"
michael@0 31 //static const char* kDetectLanguageVersion = "V2.0 - 20130715";
michael@0 32
michael@0 33
michael@0 34 // Large-table version for all ~160 languages
michael@0 35 // Small-table version for all ~60 languages
michael@0 36
michael@0 37 // Scan interchange-valid UTF-8 bytes and detect most likely language
michael@0 38 Language DetectLanguage(
michael@0 39 const char* buffer,
michael@0 40 int buffer_length,
michael@0 41 bool is_plain_text,
michael@0 42 bool* is_reliable) {
michael@0 43 bool allow_extended_lang = false;
michael@0 44 Language language3[3];
michael@0 45 int percent3[3];
michael@0 46 double normalized_score3[3];
michael@0 47 int text_bytes;
michael@0 48 int flags = 0;
michael@0 49 Language plus_one = UNKNOWN_LANGUAGE;
michael@0 50 const char* tld_hint = "";
michael@0 51 int encoding_hint = UNKNOWN_ENCODING;
michael@0 52 Language language_hint = UNKNOWN_LANGUAGE;
michael@0 53 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
michael@0 54
michael@0 55 Language lang = DetectLanguageSummaryV2(
michael@0 56 buffer,
michael@0 57 buffer_length,
michael@0 58 is_plain_text,
michael@0 59 &cldhints,
michael@0 60 allow_extended_lang,
michael@0 61 flags,
michael@0 62 plus_one,
michael@0 63 language3,
michael@0 64 percent3,
michael@0 65 normalized_score3,
michael@0 66 NULL,
michael@0 67 &text_bytes,
michael@0 68 is_reliable);
michael@0 69 // Default to English
michael@0 70 if (lang == UNKNOWN_LANGUAGE) {
michael@0 71 lang = ENGLISH;
michael@0 72 }
michael@0 73 return lang;
michael@0 74 }
michael@0 75
michael@0 76 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
michael@0 77 Language DetectLanguageSummary(
michael@0 78 const char* buffer,
michael@0 79 int buffer_length,
michael@0 80 bool is_plain_text,
michael@0 81 Language* language3,
michael@0 82 int* percent3,
michael@0 83 int* text_bytes,
michael@0 84 bool* is_reliable) {
michael@0 85 double normalized_score3[3];
michael@0 86 bool allow_extended_lang = false;
michael@0 87 int flags = 0;
michael@0 88 Language plus_one = UNKNOWN_LANGUAGE;
michael@0 89 const char* tld_hint = "";
michael@0 90 int encoding_hint = UNKNOWN_ENCODING;
michael@0 91 Language language_hint = UNKNOWN_LANGUAGE;
michael@0 92 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
michael@0 93
michael@0 94 Language lang = DetectLanguageSummaryV2(
michael@0 95 buffer,
michael@0 96 buffer_length,
michael@0 97 is_plain_text,
michael@0 98 &cldhints,
michael@0 99 allow_extended_lang,
michael@0 100 flags,
michael@0 101 plus_one,
michael@0 102 language3,
michael@0 103 percent3,
michael@0 104 normalized_score3,
michael@0 105 NULL,
michael@0 106 text_bytes,
michael@0 107 is_reliable);
michael@0 108 // Default to English
michael@0 109 if (lang == UNKNOWN_LANGUAGE) {
michael@0 110 lang = ENGLISH;
michael@0 111 }
michael@0 112 return lang;
michael@0 113 }
michael@0 114
michael@0 115 // Same as above, with hints supplied
michael@0 116 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
michael@0 117 Language DetectLanguageSummary(
michael@0 118 const char* buffer,
michael@0 119 int buffer_length,
michael@0 120 bool is_plain_text,
michael@0 121 const char* tld_hint, // "id" boosts Indonesian
michael@0 122 int encoding_hint, // SJS boosts Japanese
michael@0 123 Language language_hint, // ITALIAN boosts it
michael@0 124 Language* language3,
michael@0 125 int* percent3,
michael@0 126 int* text_bytes,
michael@0 127 bool* is_reliable) {
michael@0 128 double normalized_score3[3];
michael@0 129 bool allow_extended_lang = false;
michael@0 130 int flags = 0;
michael@0 131 Language plus_one = UNKNOWN_LANGUAGE;
michael@0 132 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
michael@0 133
michael@0 134 Language lang = DetectLanguageSummaryV2(
michael@0 135 buffer,
michael@0 136 buffer_length,
michael@0 137 is_plain_text,
michael@0 138 &cldhints,
michael@0 139 allow_extended_lang,
michael@0 140 flags,
michael@0 141 plus_one,
michael@0 142 language3,
michael@0 143 percent3,
michael@0 144 normalized_score3,
michael@0 145 NULL,
michael@0 146 text_bytes,
michael@0 147 is_reliable);
michael@0 148 // Default to English
michael@0 149 if (lang == UNKNOWN_LANGUAGE) {
michael@0 150 lang = ENGLISH;
michael@0 151 }
michael@0 152 return lang;
michael@0 153 }
michael@0 154
michael@0 155
michael@0 156 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
michael@0 157 // languages.
michael@0 158 // Extended languages are additional Google interface languages and Unicode
michael@0 159 // single-language scripts, from ext_lang_enc.h
michael@0 160 Language ExtDetectLanguageSummary(
michael@0 161 const char* buffer,
michael@0 162 int buffer_length,
michael@0 163 bool is_plain_text,
michael@0 164 Language* language3,
michael@0 165 int* percent3,
michael@0 166 int* text_bytes,
michael@0 167 bool* is_reliable) {
michael@0 168 double normalized_score3[3];
michael@0 169 bool allow_extended_lang = true;
michael@0 170 int flags = 0;
michael@0 171 Language plus_one = UNKNOWN_LANGUAGE;
michael@0 172 const char* tld_hint = "";
michael@0 173 int encoding_hint = UNKNOWN_ENCODING;
michael@0 174 Language language_hint = UNKNOWN_LANGUAGE;
michael@0 175 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
michael@0 176
michael@0 177 Language lang = DetectLanguageSummaryV2(
michael@0 178 buffer,
michael@0 179 buffer_length,
michael@0 180 is_plain_text,
michael@0 181 &cldhints,
michael@0 182 allow_extended_lang,
michael@0 183 flags,
michael@0 184 plus_one,
michael@0 185 language3,
michael@0 186 percent3,
michael@0 187 normalized_score3,
michael@0 188 NULL,
michael@0 189 text_bytes,
michael@0 190 is_reliable);
michael@0 191 // Do not default to English
michael@0 192 return lang;
michael@0 193 }
michael@0 194
michael@0 195 // Same as above, with hints supplied
michael@0 196 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
michael@0 197 // languages.
michael@0 198 // Extended languages are additional Google interface languages and Unicode
michael@0 199 // single-language scripts, from ext_lang_enc.h
michael@0 200 Language ExtDetectLanguageSummary(
michael@0 201 const char* buffer,
michael@0 202 int buffer_length,
michael@0 203 bool is_plain_text,
michael@0 204 const char* tld_hint, // "id" boosts Indonesian
michael@0 205 int encoding_hint, // SJS boosts Japanese
michael@0 206 Language language_hint, // ITALIAN boosts it
michael@0 207 Language* language3,
michael@0 208 int* percent3,
michael@0 209 int* text_bytes,
michael@0 210 bool* is_reliable) {
michael@0 211 double normalized_score3[3];
michael@0 212 bool allow_extended_lang = true;
michael@0 213 int flags = 0;
michael@0 214 Language plus_one = UNKNOWN_LANGUAGE;
michael@0 215 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
michael@0 216
michael@0 217 Language lang = DetectLanguageSummaryV2(
michael@0 218 buffer,
michael@0 219 buffer_length,
michael@0 220 is_plain_text,
michael@0 221 &cldhints,
michael@0 222 allow_extended_lang,
michael@0 223 flags,
michael@0 224 plus_one,
michael@0 225 language3,
michael@0 226 percent3,
michael@0 227 normalized_score3,
michael@0 228 NULL,
michael@0 229 text_bytes,
michael@0 230 is_reliable);
michael@0 231 // Do not default to English
michael@0 232 return lang;
michael@0 233 }
michael@0 234
michael@0 235 // Same as above, and also returns internal language scores as a ratio to
michael@0 236 // normal score for real text in that language. Scores close to 1.0 indicate
michael@0 237 // normal text, while scores far away from 1.0 indicate badly-skewed text or
michael@0 238 // gibberish
michael@0 239 //
michael@0 240 Language ExtDetectLanguageSummary(
michael@0 241 const char* buffer,
michael@0 242 int buffer_length,
michael@0 243 bool is_plain_text,
michael@0 244 const char* tld_hint, // "id" boosts Indonesian
michael@0 245 int encoding_hint, // SJS boosts Japanese
michael@0 246 Language language_hint, // ITALIAN boosts it
michael@0 247 Language* language3,
michael@0 248 int* percent3,
michael@0 249 double* normalized_score3,
michael@0 250 int* text_bytes,
michael@0 251 bool* is_reliable) {
michael@0 252 bool allow_extended_lang = true;
michael@0 253 int flags = 0;
michael@0 254 Language plus_one = UNKNOWN_LANGUAGE;
michael@0 255 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
michael@0 256
michael@0 257 Language lang = DetectLanguageSummaryV2(
michael@0 258 buffer,
michael@0 259 buffer_length,
michael@0 260 is_plain_text,
michael@0 261 &cldhints,
michael@0 262 allow_extended_lang,
michael@0 263 flags,
michael@0 264 plus_one,
michael@0 265 language3,
michael@0 266 percent3,
michael@0 267 normalized_score3,
michael@0 268 NULL,
michael@0 269 text_bytes,
michael@0 270 is_reliable);
michael@0 271 // Do not default to English
michael@0 272 return lang;
michael@0 273 }
michael@0 274
michael@0 275 // Use this one.
michael@0 276 // Hints are collected into a struct.
michael@0 277 // Flags are passed in (normally zero).
michael@0 278 //
michael@0 279 // Also returns 3 internal language scores as a ratio to
michael@0 280 // normal score for real text in that language. Scores close to 1.0 indicate
michael@0 281 // normal text, while scores far away from 1.0 indicate badly-skewed text or
michael@0 282 // gibberish
michael@0 283 //
michael@0 284 // Returns a vector of chunks in different languages, so that caller may
michael@0 285 // spell-check, translate, or otherwaise process different parts of the input
michael@0 286 // buffer in language-dependant ways.
michael@0 287 //
michael@0 288 Language ExtDetectLanguageSummary(
michael@0 289 const char* buffer,
michael@0 290 int buffer_length,
michael@0 291 bool is_plain_text,
michael@0 292 const CLDHints* cld_hints,
michael@0 293 int flags,
michael@0 294 Language* language3,
michael@0 295 int* percent3,
michael@0 296 double* normalized_score3,
michael@0 297 ResultChunkVector* resultchunkvector,
michael@0 298 int* text_bytes,
michael@0 299 bool* is_reliable) {
michael@0 300 bool allow_extended_lang = true;
michael@0 301 Language plus_one = UNKNOWN_LANGUAGE;
michael@0 302
michael@0 303 Language lang = DetectLanguageSummaryV2(
michael@0 304 buffer,
michael@0 305 buffer_length,
michael@0 306 is_plain_text,
michael@0 307 cld_hints,
michael@0 308 allow_extended_lang,
michael@0 309 flags,
michael@0 310 plus_one,
michael@0 311 language3,
michael@0 312 percent3,
michael@0 313 normalized_score3,
michael@0 314 resultchunkvector,
michael@0 315 text_bytes,
michael@0 316 is_reliable);
michael@0 317 // Do not default to English
michael@0 318 return lang;
michael@0 319 }
michael@0 320
michael@0 321 } // End namespace CLD2
michael@0 322

mercurial