1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/compact_lang_det.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,322 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// Author: dsites@google.com (Dick Sites) 1.20 +// 1.21 + 1.22 +#include <stdio.h> 1.23 +#include <stdlib.h> 1.24 + 1.25 +#include "../public/compact_lang_det.h" 1.26 +#include "../public/encodings.h" 1.27 +#include "compact_lang_det_impl.h" 1.28 +#include "integral_types.h" 1.29 +#include "lang_script.h" 1.30 + 1.31 +namespace CLD2 { 1.32 + 1.33 +// String is "code_version - data_scrape_date" 1.34 +//static const char* kDetectLanguageVersion = "V2.0 - 20130715"; 1.35 + 1.36 + 1.37 +// Large-table version for all ~160 languages 1.38 +// Small-table version for all ~60 languages 1.39 + 1.40 +// Scan interchange-valid UTF-8 bytes and detect most likely language 1.41 +Language DetectLanguage( 1.42 + const char* buffer, 1.43 + int buffer_length, 1.44 + bool is_plain_text, 1.45 + bool* is_reliable) { 1.46 + bool allow_extended_lang = false; 1.47 + Language language3[3]; 1.48 + int percent3[3]; 1.49 + double normalized_score3[3]; 1.50 + int text_bytes; 1.51 + int flags = 0; 1.52 + Language plus_one = UNKNOWN_LANGUAGE; 1.53 + const char* tld_hint = ""; 1.54 + int encoding_hint = UNKNOWN_ENCODING; 1.55 + Language language_hint = UNKNOWN_LANGUAGE; 1.56 + CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; 1.57 + 1.58 + Language lang = DetectLanguageSummaryV2( 1.59 + buffer, 1.60 + buffer_length, 1.61 + is_plain_text, 1.62 + &cldhints, 1.63 + allow_extended_lang, 1.64 + flags, 1.65 + plus_one, 1.66 + language3, 1.67 + percent3, 1.68 + normalized_score3, 1.69 + NULL, 1.70 + &text_bytes, 1.71 + is_reliable); 1.72 + // Default to English 1.73 + if (lang == UNKNOWN_LANGUAGE) { 1.74 + lang = ENGLISH; 1.75 + } 1.76 + return lang; 1.77 +} 1.78 + 1.79 +// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. 1.80 +Language DetectLanguageSummary( 1.81 + const char* buffer, 1.82 + int buffer_length, 1.83 + bool is_plain_text, 1.84 + Language* language3, 1.85 + int* percent3, 1.86 + int* text_bytes, 1.87 + bool* is_reliable) { 1.88 + double normalized_score3[3]; 1.89 + bool allow_extended_lang = false; 1.90 + int flags = 0; 1.91 + Language plus_one = UNKNOWN_LANGUAGE; 1.92 + const char* tld_hint = ""; 1.93 + int encoding_hint = UNKNOWN_ENCODING; 1.94 + Language language_hint = UNKNOWN_LANGUAGE; 1.95 + CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; 1.96 + 1.97 + Language lang = DetectLanguageSummaryV2( 1.98 + buffer, 1.99 + buffer_length, 1.100 + is_plain_text, 1.101 + &cldhints, 1.102 + allow_extended_lang, 1.103 + flags, 1.104 + plus_one, 1.105 + language3, 1.106 + percent3, 1.107 + normalized_score3, 1.108 + NULL, 1.109 + text_bytes, 1.110 + is_reliable); 1.111 + // Default to English 1.112 + if (lang == UNKNOWN_LANGUAGE) { 1.113 + lang = ENGLISH; 1.114 + } 1.115 + return lang; 1.116 +} 1.117 + 1.118 +// Same as above, with hints supplied 1.119 +// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. 1.120 +Language DetectLanguageSummary( 1.121 + const char* buffer, 1.122 + int buffer_length, 1.123 + bool is_plain_text, 1.124 + const char* tld_hint, // "id" boosts Indonesian 1.125 + int encoding_hint, // SJS boosts Japanese 1.126 + Language language_hint, // ITALIAN boosts it 1.127 + Language* language3, 1.128 + int* percent3, 1.129 + int* text_bytes, 1.130 + bool* is_reliable) { 1.131 + double normalized_score3[3]; 1.132 + bool allow_extended_lang = false; 1.133 + int flags = 0; 1.134 + Language plus_one = UNKNOWN_LANGUAGE; 1.135 + CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; 1.136 + 1.137 + Language lang = DetectLanguageSummaryV2( 1.138 + buffer, 1.139 + buffer_length, 1.140 + is_plain_text, 1.141 + &cldhints, 1.142 + allow_extended_lang, 1.143 + flags, 1.144 + plus_one, 1.145 + language3, 1.146 + percent3, 1.147 + normalized_score3, 1.148 + NULL, 1.149 + text_bytes, 1.150 + is_reliable); 1.151 + // Default to English 1.152 + if (lang == UNKNOWN_LANGUAGE) { 1.153 + lang = ENGLISH; 1.154 + } 1.155 + return lang; 1.156 +} 1.157 + 1.158 + 1.159 +// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended 1.160 +// languages. 1.161 +// Extended languages are additional Google interface languages and Unicode 1.162 +// single-language scripts, from ext_lang_enc.h 1.163 +Language ExtDetectLanguageSummary( 1.164 + const char* buffer, 1.165 + int buffer_length, 1.166 + bool is_plain_text, 1.167 + Language* language3, 1.168 + int* percent3, 1.169 + int* text_bytes, 1.170 + bool* is_reliable) { 1.171 + double normalized_score3[3]; 1.172 + bool allow_extended_lang = true; 1.173 + int flags = 0; 1.174 + Language plus_one = UNKNOWN_LANGUAGE; 1.175 + const char* tld_hint = ""; 1.176 + int encoding_hint = UNKNOWN_ENCODING; 1.177 + Language language_hint = UNKNOWN_LANGUAGE; 1.178 + CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; 1.179 + 1.180 + Language lang = DetectLanguageSummaryV2( 1.181 + buffer, 1.182 + buffer_length, 1.183 + is_plain_text, 1.184 + &cldhints, 1.185 + allow_extended_lang, 1.186 + flags, 1.187 + plus_one, 1.188 + language3, 1.189 + percent3, 1.190 + normalized_score3, 1.191 + NULL, 1.192 + text_bytes, 1.193 + is_reliable); 1.194 + // Do not default to English 1.195 + return lang; 1.196 +} 1.197 + 1.198 +// Same as above, with hints supplied 1.199 +// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended 1.200 +// languages. 1.201 +// Extended languages are additional Google interface languages and Unicode 1.202 +// single-language scripts, from ext_lang_enc.h 1.203 +Language ExtDetectLanguageSummary( 1.204 + const char* buffer, 1.205 + int buffer_length, 1.206 + bool is_plain_text, 1.207 + const char* tld_hint, // "id" boosts Indonesian 1.208 + int encoding_hint, // SJS boosts Japanese 1.209 + Language language_hint, // ITALIAN boosts it 1.210 + Language* language3, 1.211 + int* percent3, 1.212 + int* text_bytes, 1.213 + bool* is_reliable) { 1.214 + double normalized_score3[3]; 1.215 + bool allow_extended_lang = true; 1.216 + int flags = 0; 1.217 + Language plus_one = UNKNOWN_LANGUAGE; 1.218 + CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; 1.219 + 1.220 + Language lang = DetectLanguageSummaryV2( 1.221 + buffer, 1.222 + buffer_length, 1.223 + is_plain_text, 1.224 + &cldhints, 1.225 + allow_extended_lang, 1.226 + flags, 1.227 + plus_one, 1.228 + language3, 1.229 + percent3, 1.230 + normalized_score3, 1.231 + NULL, 1.232 + text_bytes, 1.233 + is_reliable); 1.234 + // Do not default to English 1.235 + return lang; 1.236 +} 1.237 + 1.238 +// Same as above, and also returns internal language scores as a ratio to 1.239 +// normal score for real text in that language. Scores close to 1.0 indicate 1.240 +// normal text, while scores far away from 1.0 indicate badly-skewed text or 1.241 +// gibberish 1.242 +// 1.243 +Language ExtDetectLanguageSummary( 1.244 + const char* buffer, 1.245 + int buffer_length, 1.246 + bool is_plain_text, 1.247 + const char* tld_hint, // "id" boosts Indonesian 1.248 + int encoding_hint, // SJS boosts Japanese 1.249 + Language language_hint, // ITALIAN boosts it 1.250 + Language* language3, 1.251 + int* percent3, 1.252 + double* normalized_score3, 1.253 + int* text_bytes, 1.254 + bool* is_reliable) { 1.255 + bool allow_extended_lang = true; 1.256 + int flags = 0; 1.257 + Language plus_one = UNKNOWN_LANGUAGE; 1.258 + CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; 1.259 + 1.260 + Language lang = DetectLanguageSummaryV2( 1.261 + buffer, 1.262 + buffer_length, 1.263 + is_plain_text, 1.264 + &cldhints, 1.265 + allow_extended_lang, 1.266 + flags, 1.267 + plus_one, 1.268 + language3, 1.269 + percent3, 1.270 + normalized_score3, 1.271 + NULL, 1.272 + text_bytes, 1.273 + is_reliable); 1.274 + // Do not default to English 1.275 + return lang; 1.276 +} 1.277 + 1.278 +// Use this one. 1.279 +// Hints are collected into a struct. 1.280 +// Flags are passed in (normally zero). 1.281 +// 1.282 +// Also returns 3 internal language scores as a ratio to 1.283 +// normal score for real text in that language. Scores close to 1.0 indicate 1.284 +// normal text, while scores far away from 1.0 indicate badly-skewed text or 1.285 +// gibberish 1.286 +// 1.287 +// Returns a vector of chunks in different languages, so that caller may 1.288 +// spell-check, translate, or otherwaise process different parts of the input 1.289 +// buffer in language-dependant ways. 1.290 +// 1.291 +Language ExtDetectLanguageSummary( 1.292 + const char* buffer, 1.293 + int buffer_length, 1.294 + bool is_plain_text, 1.295 + const CLDHints* cld_hints, 1.296 + int flags, 1.297 + Language* language3, 1.298 + int* percent3, 1.299 + double* normalized_score3, 1.300 + ResultChunkVector* resultchunkvector, 1.301 + int* text_bytes, 1.302 + bool* is_reliable) { 1.303 + bool allow_extended_lang = true; 1.304 + Language plus_one = UNKNOWN_LANGUAGE; 1.305 + 1.306 + Language lang = DetectLanguageSummaryV2( 1.307 + buffer, 1.308 + buffer_length, 1.309 + is_plain_text, 1.310 + cld_hints, 1.311 + allow_extended_lang, 1.312 + flags, 1.313 + plus_one, 1.314 + language3, 1.315 + percent3, 1.316 + normalized_score3, 1.317 + resultchunkvector, 1.318 + text_bytes, 1.319 + is_reliable); 1.320 + // Do not default to English 1.321 + return lang; 1.322 +} 1.323 + 1.324 +} // End namespace CLD2 1.325 +