Wed, 31 Dec 2014 07:53:36 +0100
Correct small whitespace inconsistency, lost while renaming variables.
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 17 | // |
michael@0 | 18 | |
michael@0 | 19 | // NOTE: |
michael@0 | 20 | // Baybayin (ancient script of the Philippines) is detected as TAGALOG. |
michael@0 | 21 | // Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE. |
michael@0 | 22 | // HAITIAN_CREOLE is detected as such. |
michael@0 | 23 | // NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly) |
michael@0 | 24 | // PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE. |
michael@0 | 25 | // ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN. |
michael@0 | 26 | // BOSNIAN is not detected as such, but likely scores as Croatian or Serbian. |
michael@0 | 27 | // MONTENEGRIN is not detected as such, but likely scores as Serbian. |
michael@0 | 28 | // CROATIAN is detected in the Latin script |
michael@0 | 29 | // SERBIAN is detected in the Cyrillic and Latin scripts |
michael@0 | 30 | // Zhuang is detected in the Latin script only. |
michael@0 | 31 | // |
michael@0 | 32 | // The languages X_PIG_LATIN and X_KLINGON are detected in the |
michael@0 | 33 | // extended calls ExtDetectLanguageSummary(). |
michael@0 | 34 | // |
michael@0 | 35 | // UNKNOWN_LANGUAGE is returned if no language's internal reliability measure |
michael@0 | 36 | // is high enough. This happens with non-text input such as the bytes of a |
michael@0 | 37 | // JPEG, and also with text in languages outside the training set. |
michael@0 | 38 | // |
michael@0 | 39 | // The following languages are to be detected in multiple scripts: |
michael@0 | 40 | // AZERBAIJANI (Latin, Cyrillic*, Arabic*) |
michael@0 | 41 | // BURMESE (Latin, Myanmar) |
michael@0 | 42 | // HAUSA (Latin, Arabic) |
michael@0 | 43 | // KASHMIRI (Arabic, Devanagari) |
michael@0 | 44 | // KAZAKH (Latin, Cyrillic, Arabic) |
michael@0 | 45 | // KURDISH (Latin*, Arabic) |
michael@0 | 46 | // KYRGYZ (Cyrillic, Arabic) |
michael@0 | 47 | // LIMBU (Devanagari, Limbu) |
michael@0 | 48 | // MONGOLIAN (Cyrillic, Mongolian) |
michael@0 | 49 | // SANSKRIT (Latin, Devanagari) |
michael@0 | 50 | // SINDHI (Arabic, Devanagari) |
michael@0 | 51 | // TAGALOG (Latin, Tagalog) |
michael@0 | 52 | // TAJIK (Cyrillic, Arabic*) |
michael@0 | 53 | // TATAR (Latin, Cyrillic, Arabic) |
michael@0 | 54 | // TURKMEN (Latin, Cyrillic, Arabic) |
michael@0 | 55 | // UIGHUR (Latin, Cyrillic, Arabic) |
michael@0 | 56 | // UZBEK (Latin, Cyrillic, Arabic) |
michael@0 | 57 | // |
michael@0 | 58 | // * Due to a shortage of training text, AZERBAIJANI is not currently detected |
michael@0 | 59 | // in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in |
michael@0 | 60 | // Arabic script. |
michael@0 | 61 | // |
michael@0 | 62 | |
michael@0 | 63 | #ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ |
michael@0 | 64 | #define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ |
michael@0 | 65 | |
michael@0 | 66 | #include <vector> |
michael@0 | 67 | #include "../internal/lang_script.h" // For Language |
michael@0 | 68 | |
michael@0 | 69 | namespace CLD2 { |
michael@0 | 70 | |
michael@0 | 71 | // Scan interchange-valid UTF-8 bytes and detect most likely language, |
michael@0 | 72 | // or set of languages. |
michael@0 | 73 | // |
michael@0 | 74 | // Design goals: |
michael@0 | 75 | // Skip over big stretches of HTML tags |
michael@0 | 76 | // Able to return ranges of different languages |
michael@0 | 77 | // Relatively small tables and relatively fast processing |
michael@0 | 78 | // Thread safe |
michael@0 | 79 | // |
michael@0 | 80 | // For HTML documents, tags are skipped, along with <script> ... </script> |
michael@0 | 81 | // and <style> ... </style> sequences, and entities are expanded. |
michael@0 | 82 | // |
michael@0 | 83 | // We distinguish between bytes of the raw input buffer and bytes of non-tag |
michael@0 | 84 | // text letters. Since tags can be over 50% of the bytes of an HTML Page, |
michael@0 | 85 | // and are nearly all seven-bit ASCII English, we prefer to distinguish |
michael@0 | 86 | // language mixture fractions based on just the non-tag text. |
michael@0 | 87 | // |
michael@0 | 88 | // Inputs: text and text_length |
michael@0 | 89 | // Code skips HTML tags and expands HTML entities, unless |
michael@0 | 90 | // is_plain_text is true |
michael@0 | 91 | // Outputs: |
michael@0 | 92 | // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE |
michael@0 | 93 | // percent3 is an array of the text percentages 0..100 of the top 3 languages |
michael@0 | 94 | // text_bytes is the amount of non-tag/letters-only text found |
michael@0 | 95 | // is_reliable set true if the returned Language is some amount more |
michael@0 | 96 | // probable than the second-best Language. Calculation is a complex function |
michael@0 | 97 | // of the length of the text and the different-script runs of text. |
michael@0 | 98 | // Return value: the most likely Language for the majority of the input text |
michael@0 | 99 | // Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text |
michael@0 | 100 | // defaults to ENGLISH. |
michael@0 | 101 | // |
michael@0 | 102 | // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for |
michael@0 | 103 | // backwards compatibility with a different detector. |
michael@0 | 104 | // |
michael@0 | 105 | // The third version may return UNKNOWN_LANGUAGE, and also returns extended |
michael@0 | 106 | // language codes from lang_script.h |
michael@0 | 107 | // |
michael@0 | 108 | |
michael@0 | 109 | |
michael@0 | 110 | // Instead of individual arguments, pass in hints as an initialized struct |
michael@0 | 111 | // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known. |
michael@0 | 112 | // |
michael@0 | 113 | // Pass in hints whenever possible; doing so improves detection accuracy. The |
michael@0 | 114 | // set of passed-in hints are all information that is external to the text |
michael@0 | 115 | // itself. |
michael@0 | 116 | // |
michael@0 | 117 | // The content_language_hint is intended to come from an HTTP header |
michael@0 | 118 | // Content-Language: field, the tld_hint from the hostname of a URL, the |
michael@0 | 119 | // encoding_hint from an encoding detector applied to the input |
michael@0 | 120 | // document, and the language_hint from any other context you might have. |
michael@0 | 121 | // The lang= tags inside an HTML document will be picked up as hints |
michael@0 | 122 | // by code within the compact language detector. |
michael@0 | 123 | |
michael@0 | 124 | typedef struct { |
michael@0 | 125 | const char* content_language_hint; // "mi,en" boosts Maori and English |
michael@0 | 126 | const char* tld_hint; // "id" boosts Indonesian |
michael@0 | 127 | int encoding_hint; // SJS boosts Japanese |
michael@0 | 128 | Language language_hint; // ITALIAN boosts it |
michael@0 | 129 | } CLDHints; |
michael@0 | 130 | |
michael@0 | 131 | static const int kMaxResultChunkBytes = 65535; |
michael@0 | 132 | |
michael@0 | 133 | // For returning a vector of per-language pieces of the input buffer. |
michael@0 | 134 | // Unreliable and too-short chunks are mapped to UNKNOWN_LANGUAGE. |
michael@0 | 135 | typedef struct { |
michael@0 | 136 | int offset; // Starting byte offset in original buffer |
michael@0 | 137 | uint16 bytes; // Number of bytes in chunk |
michael@0 | 138 | uint16 lang1; // Top lang, as full Language. Apply |
michael@0 | 139 | // static_cast<Language>() to this short value. |
michael@0 | 140 | } ResultChunk; |
michael@0 | 141 | typedef std::vector<ResultChunk> ResultChunkVector; |
michael@0 | 142 | |
michael@0 | 143 | |
michael@0 | 144 | // Scan interchange-valid UTF-8 bytes and detect most likely language |
michael@0 | 145 | Language DetectLanguage( |
michael@0 | 146 | const char* buffer, |
michael@0 | 147 | int buffer_length, |
michael@0 | 148 | bool is_plain_text, |
michael@0 | 149 | bool* is_reliable); |
michael@0 | 150 | |
michael@0 | 151 | // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. |
michael@0 | 152 | // language3[0] is usually also the return value |
michael@0 | 153 | Language DetectLanguageSummary( |
michael@0 | 154 | const char* buffer, |
michael@0 | 155 | int buffer_length, |
michael@0 | 156 | bool is_plain_text, |
michael@0 | 157 | Language* language3, |
michael@0 | 158 | int* percent3, |
michael@0 | 159 | int* text_bytes, |
michael@0 | 160 | bool* is_reliable); |
michael@0 | 161 | |
michael@0 | 162 | // Same as above, with hints supplied |
michael@0 | 163 | // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. |
michael@0 | 164 | // language3[0] is usually also the return value |
michael@0 | 165 | Language DetectLanguageSummary( |
michael@0 | 166 | const char* buffer, |
michael@0 | 167 | int buffer_length, |
michael@0 | 168 | bool is_plain_text, |
michael@0 | 169 | const char* tld_hint, // "id" boosts Indonesian |
michael@0 | 170 | int encoding_hint, // SJS boosts Japanese |
michael@0 | 171 | Language language_hint, // ITALIAN boosts it |
michael@0 | 172 | Language* language3, |
michael@0 | 173 | int* percent3, |
michael@0 | 174 | int* text_bytes, |
michael@0 | 175 | bool* is_reliable); |
michael@0 | 176 | |
michael@0 | 177 | // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended |
michael@0 | 178 | // languages. |
michael@0 | 179 | // |
michael@0 | 180 | // Extended languages are additional interface languages and Unicode |
michael@0 | 181 | // single-language scripts, from lang_script.h |
michael@0 | 182 | // |
michael@0 | 183 | // language3[0] is usually also the return value |
michael@0 | 184 | Language ExtDetectLanguageSummary( |
michael@0 | 185 | const char* buffer, |
michael@0 | 186 | int buffer_length, |
michael@0 | 187 | bool is_plain_text, |
michael@0 | 188 | Language* language3, |
michael@0 | 189 | int* percent3, |
michael@0 | 190 | int* text_bytes, |
michael@0 | 191 | bool* is_reliable); |
michael@0 | 192 | |
michael@0 | 193 | // Same as above, with hints supplied |
michael@0 | 194 | // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended |
michael@0 | 195 | // languages. |
michael@0 | 196 | // |
michael@0 | 197 | // Extended languages are additional Google interface languages and Unicode |
michael@0 | 198 | // single-language scripts, from lang_script.h |
michael@0 | 199 | // |
michael@0 | 200 | // language3[0] is usually also the return value |
michael@0 | 201 | Language ExtDetectLanguageSummary( |
michael@0 | 202 | const char* buffer, |
michael@0 | 203 | int buffer_length, |
michael@0 | 204 | bool is_plain_text, |
michael@0 | 205 | const char* tld_hint, // "id" boosts Indonesian |
michael@0 | 206 | int encoding_hint, // SJS boosts Japanese |
michael@0 | 207 | Language language_hint, // ITALIAN boosts it |
michael@0 | 208 | Language* language3, |
michael@0 | 209 | int* percent3, |
michael@0 | 210 | int* text_bytes, |
michael@0 | 211 | bool* is_reliable); |
michael@0 | 212 | |
michael@0 | 213 | // Same as above, and also returns 3 internal language scores as a ratio to |
michael@0 | 214 | // normal score for real text in that language. Scores close to 1.0 indicate |
michael@0 | 215 | // normal text, while scores far away from 1.0 indicate badly-skewed text or |
michael@0 | 216 | // gibberish |
michael@0 | 217 | // |
michael@0 | 218 | Language ExtDetectLanguageSummary( |
michael@0 | 219 | const char* buffer, |
michael@0 | 220 | int buffer_length, |
michael@0 | 221 | bool is_plain_text, |
michael@0 | 222 | const char* tld_hint, // "id" boosts Indonesian |
michael@0 | 223 | int encoding_hint, // SJS boosts Japanese |
michael@0 | 224 | Language language_hint, // ITALIAN boosts it |
michael@0 | 225 | Language* language3, |
michael@0 | 226 | int* percent3, |
michael@0 | 227 | double* normalized_score3, |
michael@0 | 228 | int* text_bytes, |
michael@0 | 229 | bool* is_reliable); |
michael@0 | 230 | |
michael@0 | 231 | |
michael@0 | 232 | // Use this one. |
michael@0 | 233 | // Hints are collected into a struct. |
michael@0 | 234 | // Flags are passed in (normally zero). |
michael@0 | 235 | // |
michael@0 | 236 | // Also returns 3 internal language scores as a ratio to |
michael@0 | 237 | // normal score for real text in that language. Scores close to 1.0 indicate |
michael@0 | 238 | // normal text, while scores far away from 1.0 indicate badly-skewed text or |
michael@0 | 239 | // gibberish |
michael@0 | 240 | // |
michael@0 | 241 | // Returns a vector of chunks in different languages, so that caller may |
michael@0 | 242 | // spell-check, translate, or otherwise process different parts of the input |
michael@0 | 243 | // buffer in language-dependent ways. |
michael@0 | 244 | // |
michael@0 | 245 | Language ExtDetectLanguageSummary( |
michael@0 | 246 | const char* buffer, |
michael@0 | 247 | int buffer_length, |
michael@0 | 248 | bool is_plain_text, |
michael@0 | 249 | const CLDHints* cld_hints, |
michael@0 | 250 | int flags, |
michael@0 | 251 | Language* language3, |
michael@0 | 252 | int* percent3, |
michael@0 | 253 | double* normalized_score3, |
michael@0 | 254 | ResultChunkVector* resultchunkvector, |
michael@0 | 255 | int* text_bytes, |
michael@0 | 256 | bool* is_reliable); |
michael@0 | 257 | |
michael@0 | 258 | // Return version text string |
michael@0 | 259 | // String is "code_version - data_build_date" |
michael@0 | 260 | const char* DetectLanguageVersion(); |
michael@0 | 261 | |
michael@0 | 262 | |
michael@0 | 263 | // Public use flags, debug output controls |
michael@0 | 264 | static const int kCLDFlagScoreAsQuads = 0x0100; // Force Greek, etc. => quads |
michael@0 | 265 | static const int kCLDFlagHtml = 0x0200; // Debug HTML => stderr |
michael@0 | 266 | static const int kCLDFlagCr = 0x0400; // <cr> per chunk if HTML |
michael@0 | 267 | static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr |
michael@0 | 268 | static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr |
michael@0 | 269 | static const int kCLDFlagEcho = 0x2000; // Echo input => stderr |
michael@0 | 270 | |
michael@0 | 271 | |
michael@0 | 272 | /*** |
michael@0 | 273 | |
michael@0 | 274 | Flag meanings: |
michael@0 | 275 | kCLDFlagScoreAsQuads |
michael@0 | 276 | Normally, several languages are detected solely by their Unicode script. |
michael@0 | 277 | Combined with appropriate lookup tables, this flag forces them instead |
michael@0 | 278 | to be detected via quadgrams. This can be a useful refinement when looking |
michael@0 | 279 | for meaningful text in these languages, instead of just character sets. |
michael@0 | 280 | The default tables do not support this use. |
michael@0 | 281 | kCLDFlagHtml |
michael@0 | 282 | For each detection call, write an HTML file to stderr, showing the text |
michael@0 | 283 | chunks and their detected languages. |
michael@0 | 284 | kCLDFlagCr |
michael@0 | 285 | In that HTML file, force a new line for each chunk. |
michael@0 | 286 | kCLDFlagVerbose |
michael@0 | 287 | In that HTML file, show every lookup entry. |
michael@0 | 288 | kCLDFlagQuiet |
michael@0 | 289 | In that HTML file, suppress most of the output detail. |
michael@0 | 290 | kCLDFlagEcho |
michael@0 | 291 | Echo every input buffer to stderr. |
michael@0 | 292 | ***/ |
michael@0 | 293 | |
michael@0 | 294 | // Debug output: Print the resultchunkvector to file f |
michael@0 | 295 | void DumpResultChunkVector(FILE* f, const char* src, |
michael@0 | 296 | ResultChunkVector* resultchunkvector); |
michael@0 | 297 | |
michael@0 | 298 | #ifdef CLD2_DYNAMIC_MODE |
michael@0 | 299 | |
michael@0 | 300 | // If compiled with dynamic mode, load data from the specified file location. |
michael@0 | 301 | // If other data has already been loaded, it is discarded and the data is read |
michael@0 | 302 | // in from the specified file location again (even if the file has not changed). |
michael@0 | 303 | // WARNING: Before calling this method, language detection will always fail |
michael@0 | 304 | // and will always return the unknown language. |
michael@0 | 305 | void loadData(const char* fileName); |
michael@0 | 306 | |
michael@0 | 307 | // If compiled with dynamic mode, unload the previously-loaded data. |
michael@0 | 308 | // WARNING: After calling this method, language detection will no longer work |
michael@0 | 309 | // and will always return the unknown language. |
michael@0 | 310 | void unloadData(); |
michael@0 | 311 | |
michael@0 | 312 | // Returns true if and only if data has been loaded via a call to loadData(...) |
michael@0 | 313 | // and has not been subsequently unloaded via a call to unloadData(). |
michael@0 | 314 | bool isDataLoaded(); |
michael@0 | 315 | |
michael@0 | 316 | #endif // #ifdef CLD2_DYNAMIC_MODE |
michael@0 | 317 | |
michael@0 | 318 | }; // End namespace CLD2 |
michael@0 | 319 | |
michael@0 | 320 | #endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ |