Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 17 | // |
michael@0 | 18 | |
michael@0 | 19 | #include <stdio.h> |
michael@0 | 20 | #include <stdlib.h> |
michael@0 | 21 | |
michael@0 | 22 | #include "../public/compact_lang_det.h" |
michael@0 | 23 | #include "../public/encodings.h" |
michael@0 | 24 | #include "compact_lang_det_impl.h" |
michael@0 | 25 | #include "integral_types.h" |
michael@0 | 26 | #include "lang_script.h" |
michael@0 | 27 | |
michael@0 | 28 | namespace CLD2 { |
michael@0 | 29 | |
michael@0 | 30 | // String is "code_version - data_scrape_date" |
michael@0 | 31 | //static const char* kDetectLanguageVersion = "V2.0 - 20130715"; |
michael@0 | 32 | |
michael@0 | 33 | |
michael@0 | 34 | // Large-table version for all ~160 languages |
michael@0 | 35 | // Small-table version for all ~60 languages |
michael@0 | 36 | |
michael@0 | 37 | // Scan interchange-valid UTF-8 bytes and return the single most likely language (no hints; top-3 details stay local) |
michael@0 | 38 | Language DetectLanguage( |
michael@0 | 39 | const char* buffer, |
michael@0 | 40 | int buffer_length, |
michael@0 | 41 | bool is_plain_text, |
michael@0 | 42 | bool* is_reliable) { |
michael@0 | 43 | bool allow_extended_lang = false; |
michael@0 | 44 | Language language3[3]; |
michael@0 | 45 | int percent3[3]; |
michael@0 | 46 | double normalized_score3[3]; |
michael@0 | 47 | int text_bytes; |
michael@0 | 48 | int flags = 0; |
michael@0 | 49 | Language plus_one = UNKNOWN_LANGUAGE; |
michael@0 | 50 | const char* tld_hint = ""; |
michael@0 | 51 | int encoding_hint = UNKNOWN_ENCODING; |
michael@0 | 52 | Language language_hint = UNKNOWN_LANGUAGE; |
michael@0 | 53 | CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; |
michael@0 | 54 | |
michael@0 | 55 | Language lang = DetectLanguageSummaryV2( |
michael@0 | 56 | buffer, |
michael@0 | 57 | buffer_length, |
michael@0 | 58 | is_plain_text, |
michael@0 | 59 | &cldhints, |
michael@0 | 60 | allow_extended_lang, |
michael@0 | 61 | flags, |
michael@0 | 62 | plus_one, |
michael@0 | 63 | language3, |
michael@0 | 64 | percent3, |
michael@0 | 65 | normalized_score3, |
michael@0 | 66 | NULL, |
michael@0 | 67 | &text_bytes, |
michael@0 | 68 | is_reliable); |
michael@0 | 69 | // Default to English when the detector returns UNKNOWN_LANGUAGE |
michael@0 | 70 | if (lang == UNKNOWN_LANGUAGE) { |
michael@0 | 71 | lang = ENGLISH; |
michael@0 | 72 | } |
michael@0 | 73 | return lang; |
michael@0 | 74 | } |
michael@0 | 75 | |
michael@0 | 76 | // Scan interchange-valid UTF-8 bytes and detect the list of top 3 languages, with no hints supplied. |
michael@0 | 77 | Language DetectLanguageSummary( |
michael@0 | 78 | const char* buffer, |
michael@0 | 79 | int buffer_length, |
michael@0 | 80 | bool is_plain_text, |
michael@0 | 81 | Language* language3, |
michael@0 | 82 | int* percent3, |
michael@0 | 83 | int* text_bytes, |
michael@0 | 84 | bool* is_reliable) { |
michael@0 | 85 | double normalized_score3[3]; |
michael@0 | 86 | bool allow_extended_lang = false; |
michael@0 | 87 | int flags = 0; |
michael@0 | 88 | Language plus_one = UNKNOWN_LANGUAGE; |
michael@0 | 89 | const char* tld_hint = ""; |
michael@0 | 90 | int encoding_hint = UNKNOWN_ENCODING; |
michael@0 | 91 | Language language_hint = UNKNOWN_LANGUAGE; |
michael@0 | 92 | CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; |
michael@0 | 93 | |
michael@0 | 94 | Language lang = DetectLanguageSummaryV2( |
michael@0 | 95 | buffer, |
michael@0 | 96 | buffer_length, |
michael@0 | 97 | is_plain_text, |
michael@0 | 98 | &cldhints, |
michael@0 | 99 | allow_extended_lang, |
michael@0 | 100 | flags, |
michael@0 | 101 | plus_one, |
michael@0 | 102 | language3, |
michael@0 | 103 | percent3, |
michael@0 | 104 | normalized_score3, |
michael@0 | 105 | NULL, |
michael@0 | 106 | text_bytes, |
michael@0 | 107 | is_reliable); |
michael@0 | 108 | // Default to English when the detector returns UNKNOWN_LANGUAGE |
michael@0 | 109 | if (lang == UNKNOWN_LANGUAGE) { |
michael@0 | 110 | lang = ENGLISH; |
michael@0 | 111 | } |
michael@0 | 112 | return lang; |
michael@0 | 113 | } |
michael@0 | 114 | |
michael@0 | 115 | // Same as the hint-less DetectLanguageSummary, but with caller-supplied hints packed into a CLDHints |
michael@0 | 116 | // Scan interchange-valid UTF-8 bytes and detect the list of top 3 languages. |
michael@0 | 117 | Language DetectLanguageSummary( |
michael@0 | 118 | const char* buffer, |
michael@0 | 119 | int buffer_length, |
michael@0 | 120 | bool is_plain_text, |
michael@0 | 121 | const char* tld_hint, // e.g. "id" boosts Indonesian |
michael@0 | 122 | int encoding_hint, // e.g. SJS boosts Japanese |
michael@0 | 123 | Language language_hint, // e.g. ITALIAN boosts Italian |
michael@0 | 124 | Language* language3, |
michael@0 | 125 | int* percent3, |
michael@0 | 126 | int* text_bytes, |
michael@0 | 127 | bool* is_reliable) { |
michael@0 | 128 | double normalized_score3[3]; |
michael@0 | 129 | bool allow_extended_lang = false; |
michael@0 | 130 | int flags = 0; |
michael@0 | 131 | Language plus_one = UNKNOWN_LANGUAGE; |
michael@0 | 132 | CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; |
michael@0 | 133 | |
michael@0 | 134 | Language lang = DetectLanguageSummaryV2( |
michael@0 | 135 | buffer, |
michael@0 | 136 | buffer_length, |
michael@0 | 137 | is_plain_text, |
michael@0 | 138 | &cldhints, |
michael@0 | 139 | allow_extended_lang, |
michael@0 | 140 | flags, |
michael@0 | 141 | plus_one, |
michael@0 | 142 | language3, |
michael@0 | 143 | percent3, |
michael@0 | 144 | normalized_score3, |
michael@0 | 145 | NULL, |
michael@0 | 146 | text_bytes, |
michael@0 | 147 | is_reliable); |
michael@0 | 148 | // Default to English when the detector returns UNKNOWN_LANGUAGE |
michael@0 | 149 | if (lang == UNKNOWN_LANGUAGE) { |
michael@0 | 150 | lang = ENGLISH; |
michael@0 | 151 | } |
michael@0 | 152 | return lang; |
michael@0 | 153 | } |
michael@0 | 154 | |
michael@0 | 155 | |
michael@0 | 156 | // Scan interchange-valid UTF-8 bytes and detect the list of top 3 extended |
michael@0 | 157 | // languages, with no hints supplied. |
michael@0 | 158 | // Extended languages are additional Google interface languages and Unicode |
michael@0 | 159 | // single-language scripts, from ext_lang_enc.h |
michael@0 | 160 | Language ExtDetectLanguageSummary( |
michael@0 | 161 | const char* buffer, |
michael@0 | 162 | int buffer_length, |
michael@0 | 163 | bool is_plain_text, |
michael@0 | 164 | Language* language3, |
michael@0 | 165 | int* percent3, |
michael@0 | 166 | int* text_bytes, |
michael@0 | 167 | bool* is_reliable) { |
michael@0 | 168 | double normalized_score3[3]; |
michael@0 | 169 | bool allow_extended_lang = true; |
michael@0 | 170 | int flags = 0; |
michael@0 | 171 | Language plus_one = UNKNOWN_LANGUAGE; |
michael@0 | 172 | const char* tld_hint = ""; |
michael@0 | 173 | int encoding_hint = UNKNOWN_ENCODING; |
michael@0 | 174 | Language language_hint = UNKNOWN_LANGUAGE; |
michael@0 | 175 | CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; |
michael@0 | 176 | |
michael@0 | 177 | Language lang = DetectLanguageSummaryV2( |
michael@0 | 178 | buffer, |
michael@0 | 179 | buffer_length, |
michael@0 | 180 | is_plain_text, |
michael@0 | 181 | &cldhints, |
michael@0 | 182 | allow_extended_lang, |
michael@0 | 183 | flags, |
michael@0 | 184 | plus_one, |
michael@0 | 185 | language3, |
michael@0 | 186 | percent3, |
michael@0 | 187 | normalized_score3, |
michael@0 | 188 | NULL, |
michael@0 | 189 | text_bytes, |
michael@0 | 190 | is_reliable); |
michael@0 | 191 | // Do not default to English: the detector's result is returned unchanged |
michael@0 | 192 | return lang; |
michael@0 | 193 | } |
michael@0 | 194 | |
michael@0 | 195 | // Same as the hint-less ExtDetectLanguageSummary, but with caller-supplied hints packed into a CLDHints |
michael@0 | 196 | // Scan interchange-valid UTF-8 bytes and detect the list of top 3 extended |
michael@0 | 197 | // languages. |
michael@0 | 198 | // Extended languages are additional Google interface languages and Unicode |
michael@0 | 199 | // single-language scripts, from ext_lang_enc.h |
michael@0 | 200 | Language ExtDetectLanguageSummary( |
michael@0 | 201 | const char* buffer, |
michael@0 | 202 | int buffer_length, |
michael@0 | 203 | bool is_plain_text, |
michael@0 | 204 | const char* tld_hint, // e.g. "id" boosts Indonesian |
michael@0 | 205 | int encoding_hint, // e.g. SJS boosts Japanese |
michael@0 | 206 | Language language_hint, // e.g. ITALIAN boosts Italian |
michael@0 | 207 | Language* language3, |
michael@0 | 208 | int* percent3, |
michael@0 | 209 | int* text_bytes, |
michael@0 | 210 | bool* is_reliable) { |
michael@0 | 211 | double normalized_score3[3]; |
michael@0 | 212 | bool allow_extended_lang = true; |
michael@0 | 213 | int flags = 0; |
michael@0 | 214 | Language plus_one = UNKNOWN_LANGUAGE; |
michael@0 | 215 | CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; |
michael@0 | 216 | |
michael@0 | 217 | Language lang = DetectLanguageSummaryV2( |
michael@0 | 218 | buffer, |
michael@0 | 219 | buffer_length, |
michael@0 | 220 | is_plain_text, |
michael@0 | 221 | &cldhints, |
michael@0 | 222 | allow_extended_lang, |
michael@0 | 223 | flags, |
michael@0 | 224 | plus_one, |
michael@0 | 225 | language3, |
michael@0 | 226 | percent3, |
michael@0 | 227 | normalized_score3, |
michael@0 | 228 | NULL, |
michael@0 | 229 | text_bytes, |
michael@0 | 230 | is_reliable); |
michael@0 | 231 | // Do not default to English: the detector's result is returned unchanged |
michael@0 | 232 | return lang; |
michael@0 | 233 | } |
michael@0 | 234 | |
michael@0 | 235 | // Same as the hinted ExtDetectLanguageSummary, and also returns internal language scores as a ratio to |
michael@0 | 236 | // normal score for real text in that language. Scores close to 1.0 indicate |
michael@0 | 237 | // normal text, while scores far away from 1.0 indicate badly-skewed text or |
michael@0 | 238 | // gibberish. The caller's normalized_score3 is forwarded to the detector instead of a local array. |
michael@0 | 239 | // |
michael@0 | 240 | Language ExtDetectLanguageSummary( |
michael@0 | 241 | const char* buffer, |
michael@0 | 242 | int buffer_length, |
michael@0 | 243 | bool is_plain_text, |
michael@0 | 244 | const char* tld_hint, // e.g. "id" boosts Indonesian |
michael@0 | 245 | int encoding_hint, // e.g. SJS boosts Japanese |
michael@0 | 246 | Language language_hint, // e.g. ITALIAN boosts Italian |
michael@0 | 247 | Language* language3, |
michael@0 | 248 | int* percent3, |
michael@0 | 249 | double* normalized_score3, |
michael@0 | 250 | int* text_bytes, |
michael@0 | 251 | bool* is_reliable) { |
michael@0 | 252 | bool allow_extended_lang = true; |
michael@0 | 253 | int flags = 0; |
michael@0 | 254 | Language plus_one = UNKNOWN_LANGUAGE; |
michael@0 | 255 | CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; |
michael@0 | 256 | |
michael@0 | 257 | Language lang = DetectLanguageSummaryV2( |
michael@0 | 258 | buffer, |
michael@0 | 259 | buffer_length, |
michael@0 | 260 | is_plain_text, |
michael@0 | 261 | &cldhints, |
michael@0 | 262 | allow_extended_lang, |
michael@0 | 263 | flags, |
michael@0 | 264 | plus_one, |
michael@0 | 265 | language3, |
michael@0 | 266 | percent3, |
michael@0 | 267 | normalized_score3, |
michael@0 | 268 | NULL, |
michael@0 | 269 | text_bytes, |
michael@0 | 270 | is_reliable); |
michael@0 | 271 | // Do not default to English: the detector's result is returned unchanged |
michael@0 | 272 | return lang; |
michael@0 | 273 | } |
michael@0 | 274 | |
michael@0 | 275 | // Use this one. |
michael@0 | 276 | // Hints are collected into a CLDHints struct. |
michael@0 | 277 | // Flags are passed in (normally zero). |
michael@0 | 278 | // |
michael@0 | 279 | // Also returns 3 internal language scores as a ratio to |
michael@0 | 280 | // normal score for real text in that language. Scores close to 1.0 indicate |
michael@0 | 281 | // normal text, while scores far away from 1.0 indicate badly-skewed text or |
michael@0 | 282 | // gibberish |
michael@0 | 283 | // |
michael@0 | 284 | // Returns, via resultchunkvector, a vector of chunks in different languages, so that caller may |
michael@0 | 285 | // spell-check, translate, or otherwise process different parts of the input |
michael@0 | 286 | // buffer in language-dependent ways. |
michael@0 | 287 | // |
michael@0 | 288 | Language ExtDetectLanguageSummary( |
michael@0 | 289 | const char* buffer, |
michael@0 | 290 | int buffer_length, |
michael@0 | 291 | bool is_plain_text, |
michael@0 | 292 | const CLDHints* cld_hints, |
michael@0 | 293 | int flags, |
michael@0 | 294 | Language* language3, |
michael@0 | 295 | int* percent3, |
michael@0 | 296 | double* normalized_score3, |
michael@0 | 297 | ResultChunkVector* resultchunkvector, |
michael@0 | 298 | int* text_bytes, |
michael@0 | 299 | bool* is_reliable) { |
michael@0 | 300 | bool allow_extended_lang = true; |
michael@0 | 301 | Language plus_one = UNKNOWN_LANGUAGE; |
michael@0 | 302 | |
michael@0 | 303 | Language lang = DetectLanguageSummaryV2( |
michael@0 | 304 | buffer, |
michael@0 | 305 | buffer_length, |
michael@0 | 306 | is_plain_text, |
michael@0 | 307 | cld_hints, |
michael@0 | 308 | allow_extended_lang, |
michael@0 | 309 | flags, |
michael@0 | 310 | plus_one, |
michael@0 | 311 | language3, |
michael@0 | 312 | percent3, |
michael@0 | 313 | normalized_score3, |
michael@0 | 314 | resultchunkvector, |
michael@0 | 315 | text_bytes, |
michael@0 | 316 | is_reliable); |
michael@0 | 317 | // Do not default to English: the detector's result is returned unchanged |
michael@0 | 318 | return lang; |
michael@0 | 319 | } |
michael@0 | 320 | |
michael@0 | 321 | } // End namespace CLD2 |
michael@0 | 322 |