|
1 // Copyright 2013 Google Inc. All Rights Reserved. |
|
2 // |
|
3 // Licensed under the Apache License, Version 2.0 (the "License"); |
|
4 // you may not use this file except in compliance with the License. |
|
5 // You may obtain a copy of the License at |
|
6 // |
|
7 // http://www.apache.org/licenses/LICENSE-2.0 |
|
8 // |
|
9 // Unless required by applicable law or agreed to in writing, software |
|
10 // distributed under the License is distributed on an "AS IS" BASIS, |
|
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
12 // See the License for the specific language governing permissions and |
|
13 // limitations under the License. |
|
14 |
|
15 // |
|
16 // Author: dsites@google.com (Dick Sites) |
|
17 // |
|
18 |
|
19 // NOTE: |
|
20 // Baybayin (ancient script of the Philippines) is detected as TAGALOG. |
|
21 // Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE. |
|
22 // HAITIAN_CREOLE is detected as such. |
|
23 // NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly) |
|
24 // PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE. |
|
25 // ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN. |
|
26 // BOSNIAN is not detected as such, but likely scores as Croatian or Serbian. |
|
27 // MONTENEGRIN is not detected as such, but likely scores as Serbian. |
|
28 // CROATIAN is detected in the Latin script |
|
// SERBIAN is detected in the Cyrillic and Latin scripts
|
30 // Zhuang is detected in the Latin script only. |
|
31 // |
|
32 // The languages X_PIG_LATIN and X_KLINGON are detected in the |
|
33 // extended calls ExtDetectLanguageSummary(). |
|
34 // |
|
// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
// is high enough. This happens with non-text input such as the bytes of a
// JPEG, and also with text in languages outside the training set.
|
38 // |
|
39 // The following languages are to be detected in multiple scripts: |
|
40 // AZERBAIJANI (Latin, Cyrillic*, Arabic*) |
|
41 // BURMESE (Latin, Myanmar) |
|
42 // HAUSA (Latin, Arabic) |
|
43 // KASHMIRI (Arabic, Devanagari) |
|
44 // KAZAKH (Latin, Cyrillic, Arabic) |
|
45 // KURDISH (Latin*, Arabic) |
|
46 // KYRGYZ (Cyrillic, Arabic) |
|
47 // LIMBU (Devanagari, Limbu) |
|
48 // MONGOLIAN (Cyrillic, Mongolian) |
|
49 // SANSKRIT (Latin, Devanagari) |
|
50 // SINDHI (Arabic, Devanagari) |
|
51 // TAGALOG (Latin, Tagalog) |
|
52 // TAJIK (Cyrillic, Arabic*) |
|
53 // TATAR (Latin, Cyrillic, Arabic) |
|
54 // TURKMEN (Latin, Cyrillic, Arabic) |
|
55 // UIGHUR (Latin, Cyrillic, Arabic) |
|
56 // UZBEK (Latin, Cyrillic, Arabic) |
|
57 // |
|
58 // * Due to a shortage of training text, AZERBAIJANI is not currently detected |
|
59 // in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in |
|
60 // Arabic script. |
|
61 // |
|
62 |
|
63 #ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ |
|
64 #define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ |
|
65 |
|
#include <stdio.h>                    // For FILE
#include <vector>
#include "../internal/lang_script.h"  // For Language
|
68 |
|
69 namespace CLD2 { |
|
70 |
|
71 // Scan interchange-valid UTF-8 bytes and detect most likely language, |
|
72 // or set of languages. |
|
73 // |
|
74 // Design goals: |
|
75 // Skip over big stretches of HTML tags |
|
76 // Able to return ranges of different languages |
|
77 // Relatively small tables and relatively fast processing |
|
78 // Thread safe |
|
79 // |
|
80 // For HTML documents, tags are skipped, along with <script> ... </script> |
|
81 // and <style> ... </style> sequences, and entities are expanded. |
|
82 // |
|
83 // We distinguish between bytes of the raw input buffer and bytes of non-tag |
|
84 // text letters. Since tags can be over 50% of the bytes of an HTML Page, |
|
85 // and are nearly all seven-bit ASCII English, we prefer to distinguish |
|
86 // language mixture fractions based on just the non-tag text. |
|
87 // |
|
88 // Inputs: text and text_length |
|
89 // Code skips HTML tags and expands HTML entities, unless |
|
90 // is_plain_text is true |
|
91 // Outputs: |
|
92 // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE |
|
93 // percent3 is an array of the text percentages 0..100 of the top 3 languages |
|
94 // text_bytes is the amount of non-tag/letters-only text found |
|
//  is_reliable set true if the returned Language is some amount more
//   probable than the second-best Language. Calculation is a complex function
//   of the length of the text and the different-script runs of text.
|
98 // Return value: the most likely Language for the majority of the input text |
|
99 // Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text |
|
100 // defaults to ENGLISH. |
|
101 // |
|
102 // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for |
|
103 // backwards compatibility with a different detector. |
|
104 // |
|
105 // The third version may return UNKNOWN_LANGUAGE, and also returns extended |
|
106 // language codes from lang_script.h |
|
107 // |
|
108 |
|
109 |
|
110 // Instead of individual arguments, pass in hints as an initialized struct |
|
111 // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known. |
|
112 // |
|
113 // Pass in hints whenever possible; doing so improves detection accuracy. The |
|
114 // set of passed-in hints are all information that is external to the text |
|
115 // itself. |
|
116 // |
|
117 // The content_language_hint is intended to come from an HTTP header |
|
118 // Content-Language: field, the tld_hint from the hostname of a URL, the |
|
119 // encoding-hint from an encoding detector applied to the input |
|
120 // document, and the language hint from any other context you might have. |
|
121 // The lang= tags inside an HTML document will be picked up as hints |
|
122 // by code within the compact language detector. |
|
123 |
|
124 typedef struct { |
|
125 const char* content_language_hint; // "mi,en" boosts Maori and English |
|
126 const char* tld_hint; // "id" boosts Indonesian |
|
127 int encoding_hint; // SJS boosts Japanese |
|
128 Language language_hint; // ITALIAN boosts it |
|
129 } CLDHints; |
|
130 |
|
// Largest byte count for a single result chunk.
// NOTE(review): 65535 is the maximum of a 16-bit unsigned value, so this
// presumably matches the width of ResultChunk::bytes -- confirm against the
// implementation that fills in the chunks.
static const int kMaxResultChunkBytes = 65535;
|
132 |
|
133 // For returning a vector of per-language pieces of the input buffer |
|
134 // Unreliable and too-short are mapped to UNKNOWN_LANGUAGE |
|
135 typedef struct { |
|
136 int offset; // Starting byte offset in original buffer |
|
137 uint16 bytes; // Number of bytes in chunk |
|
138 uint16 lang1; // Top lang, as full Language. Apply |
|
139 // static_cast<Language>() to this short value. |
|
140 } ResultChunk; |
|
141 typedef std::vector<ResultChunk> ResultChunkVector; |
|
142 |
|
143 |
|
144 // Scan interchange-valid UTF-8 bytes and detect most likely language |
|
145 Language DetectLanguage( |
|
146 const char* buffer, |
|
147 int buffer_length, |
|
148 bool is_plain_text, |
|
149 bool* is_reliable); |
|
150 |
|
151 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. |
|
152 // language3[0] is usually also the return value |
|
153 Language DetectLanguageSummary( |
|
154 const char* buffer, |
|
155 int buffer_length, |
|
156 bool is_plain_text, |
|
157 Language* language3, |
|
158 int* percent3, |
|
159 int* text_bytes, |
|
160 bool* is_reliable); |
|
161 |
|
162 // Same as above, with hints supplied |
|
163 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. |
|
164 // language3[0] is usually also the return value |
|
165 Language DetectLanguageSummary( |
|
166 const char* buffer, |
|
167 int buffer_length, |
|
168 bool is_plain_text, |
|
169 const char* tld_hint, // "id" boosts Indonesian |
|
170 int encoding_hint, // SJS boosts Japanese |
|
171 Language language_hint, // ITALIAN boosts it |
|
172 Language* language3, |
|
173 int* percent3, |
|
174 int* text_bytes, |
|
175 bool* is_reliable); |
|
176 |
|
177 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended |
|
178 // languages. |
|
179 // |
|
180 // Extended languages are additional interface languages and Unicode |
|
181 // single-language scripts, from lang_script.h |
|
182 // |
|
183 // language3[0] is usually also the return value |
|
184 Language ExtDetectLanguageSummary( |
|
185 const char* buffer, |
|
186 int buffer_length, |
|
187 bool is_plain_text, |
|
188 Language* language3, |
|
189 int* percent3, |
|
190 int* text_bytes, |
|
191 bool* is_reliable); |
|
192 |
|
193 // Same as above, with hints supplied |
|
194 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended |
|
195 // languages. |
|
196 // |
|
197 // Extended languages are additional Google interface languages and Unicode |
|
198 // single-language scripts, from lang_script.h |
|
199 // |
|
200 // language3[0] is usually also the return value |
|
201 Language ExtDetectLanguageSummary( |
|
202 const char* buffer, |
|
203 int buffer_length, |
|
204 bool is_plain_text, |
|
205 const char* tld_hint, // "id" boosts Indonesian |
|
206 int encoding_hint, // SJS boosts Japanese |
|
207 Language language_hint, // ITALIAN boosts it |
|
208 Language* language3, |
|
209 int* percent3, |
|
210 int* text_bytes, |
|
211 bool* is_reliable); |
|
212 |
|
213 // Same as above, and also returns 3 internal language scores as a ratio to |
|
214 // normal score for real text in that language. Scores close to 1.0 indicate |
|
215 // normal text, while scores far away from 1.0 indicate badly-skewed text or |
|
216 // gibberish |
|
217 // |
|
218 Language ExtDetectLanguageSummary( |
|
219 const char* buffer, |
|
220 int buffer_length, |
|
221 bool is_plain_text, |
|
222 const char* tld_hint, // "id" boosts Indonesian |
|
223 int encoding_hint, // SJS boosts Japanese |
|
224 Language language_hint, // ITALIAN boosts it |
|
225 Language* language3, |
|
226 int* percent3, |
|
227 double* normalized_score3, |
|
228 int* text_bytes, |
|
229 bool* is_reliable); |
|
230 |
|
231 |
|
232 // Use this one. |
|
233 // Hints are collected into a struct. |
|
234 // Flags are passed in (normally zero). |
|
235 // |
|
236 // Also returns 3 internal language scores as a ratio to |
|
237 // normal score for real text in that language. Scores close to 1.0 indicate |
|
238 // normal text, while scores far away from 1.0 indicate badly-skewed text or |
|
239 // gibberish |
|
240 // |
|
241 // Returns a vector of chunks in different languages, so that caller may |
|
242 // spell-check, translate, or otherwaise process different parts of the input |
|
243 // buffer in language-dependant ways. |
|
244 // |
|
245 Language ExtDetectLanguageSummary( |
|
246 const char* buffer, |
|
247 int buffer_length, |
|
248 bool is_plain_text, |
|
249 const CLDHints* cld_hints, |
|
250 int flags, |
|
251 Language* language3, |
|
252 int* percent3, |
|
253 double* normalized_score3, |
|
254 ResultChunkVector* resultchunkvector, |
|
255 int* text_bytes, |
|
256 bool* is_reliable); |
|
257 |
|
// Return version text string.
// String is "code_version - data_build_date"
const char* DetectLanguageVersion();
|
261 |
|
262 |
|
// Public use flags, debug output controls.
// Each flag is a distinct single bit; see "Flag meanings" below.
static const int kCLDFlagScoreAsQuads = 0x0100;  // Force Greek, etc. => quads
static const int kCLDFlagHtml =         0x0200;  // Debug HTML => stderr
static const int kCLDFlagCr =           0x0400;  // <cr> per chunk if HTML
static const int kCLDFlagVerbose =      0x0800;  // More debug HTML => stderr
static const int kCLDFlagQuiet =        0x1000;  // Less debug HTML => stderr
static const int kCLDFlagEcho =         0x2000;  // Echo input => stderr


/***

Flag meanings:
 kCLDFlagScoreAsQuads
   Normally, several languages are detected solely by their Unicode script.
   Combined with appropriate lookup tables, this flag forces them instead
   to be detected via quadgrams. This can be a useful refinement when looking
   for meaningful text in these languages, instead of just character sets.
   The default tables do not support this use.
 kCLDFlagHtml
   For each detection call, write an HTML file to stderr, showing the text
   chunks and their detected languages.
 kCLDFlagCr
   In that HTML file, force a new line for each chunk.
 kCLDFlagVerbose
   In that HTML file, show every lookup entry.
 kCLDFlagQuiet
   In that HTML file, suppress most of the output detail.
 kCLDFlagEcho
   Echo every input buffer to stderr.
***/
|
293 |
|
294 // Debug output: Print the resultchunkvector to file f |
|
295 void DumpResultChunkVector(FILE* f, const char* src, |
|
296 ResultChunkVector* resultchunkvector); |
|
297 |
|
298 #ifdef CLD2_DYNAMIC_MODE |
|
299 |
|
// If compiled with dynamic mode, load data from the specified file location.
// If other data has already been loaded, it is discarded and the data is read
// in from the specified file location again (even if the file has not changed).
// WARNING: Before calling this method, language detection will always fail
// and will always return the unknown language.
void loadData(const char* fileName);
|
306 |
|
// If compiled with dynamic mode, unload the previously-loaded data.
// WARNING: After calling this method, language detection will no longer work
// and will always return the unknown language.
void unloadData();
|
311 |
|
// Returns true if and only if data has been loaded via a call to loadData(...)
// and has not been subsequently unloaded via a call to unloadData().
bool isDataLoaded();
|
315 |
|
316 #endif // #ifdef CLD2_DYNAMIC_MODE |
|
317 |
|
318 }; // End namespace CLD2 |
|
319 |
|
320 #endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ |