|
1 // Copyright 2013 Google Inc. All Rights Reserved. |
|
2 // |
|
3 // Licensed under the Apache License, Version 2.0 (the "License"); |
|
4 // you may not use this file except in compliance with the License. |
|
5 // You may obtain a copy of the License at |
|
6 // |
|
7 // http://www.apache.org/licenses/LICENSE-2.0 |
|
8 // |
|
9 // Unless required by applicable law or agreed to in writing, software |
|
10 // distributed under the License is distributed on an "AS IS" BASIS, |
|
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
12 // See the License for the specific language governing permissions and |
|
13 // limitations under the License. |
|
14 |
|
15 // |
|
16 // Author: dsites@google.com (Dick Sites) |
|
17 // |
|
18 |
|
19 #include <stdio.h> |
|
20 #include <stdlib.h> |
|
21 |
|
22 #include "../public/compact_lang_det.h" |
|
23 #include "../public/encodings.h" |
|
24 #include "compact_lang_det_impl.h" |
|
25 #include "integral_types.h" |
|
26 #include "lang_script.h" |
|
27 |
|
28 namespace CLD2 { |
|
29 |
|
30 // String is "code_version - data_scrape_date" |
|
31 //static const char* kDetectLanguageVersion = "V2.0 - 20130715"; |
|
32 |
|
33 |
|
34 // Large-table version for all ~160 languages |
|
35 // Small-table version for all ~60 languages |
|
36 |
|
37 // Scan interchange-valid UTF-8 bytes and detect most likely language |
|
38 Language DetectLanguage( |
|
39 const char* buffer, |
|
40 int buffer_length, |
|
41 bool is_plain_text, |
|
42 bool* is_reliable) { |
|
43 bool allow_extended_lang = false; |
|
44 Language language3[3]; |
|
45 int percent3[3]; |
|
46 double normalized_score3[3]; |
|
47 int text_bytes; |
|
48 int flags = 0; |
|
49 Language plus_one = UNKNOWN_LANGUAGE; |
|
50 const char* tld_hint = ""; |
|
51 int encoding_hint = UNKNOWN_ENCODING; |
|
52 Language language_hint = UNKNOWN_LANGUAGE; |
|
53 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; |
|
54 |
|
55 Language lang = DetectLanguageSummaryV2( |
|
56 buffer, |
|
57 buffer_length, |
|
58 is_plain_text, |
|
59 &cldhints, |
|
60 allow_extended_lang, |
|
61 flags, |
|
62 plus_one, |
|
63 language3, |
|
64 percent3, |
|
65 normalized_score3, |
|
66 NULL, |
|
67 &text_bytes, |
|
68 is_reliable); |
|
69 // Default to English |
|
70 if (lang == UNKNOWN_LANGUAGE) { |
|
71 lang = ENGLISH; |
|
72 } |
|
73 return lang; |
|
74 } |
|
75 |
|
76 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. |
|
77 Language DetectLanguageSummary( |
|
78 const char* buffer, |
|
79 int buffer_length, |
|
80 bool is_plain_text, |
|
81 Language* language3, |
|
82 int* percent3, |
|
83 int* text_bytes, |
|
84 bool* is_reliable) { |
|
85 double normalized_score3[3]; |
|
86 bool allow_extended_lang = false; |
|
87 int flags = 0; |
|
88 Language plus_one = UNKNOWN_LANGUAGE; |
|
89 const char* tld_hint = ""; |
|
90 int encoding_hint = UNKNOWN_ENCODING; |
|
91 Language language_hint = UNKNOWN_LANGUAGE; |
|
92 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; |
|
93 |
|
94 Language lang = DetectLanguageSummaryV2( |
|
95 buffer, |
|
96 buffer_length, |
|
97 is_plain_text, |
|
98 &cldhints, |
|
99 allow_extended_lang, |
|
100 flags, |
|
101 plus_one, |
|
102 language3, |
|
103 percent3, |
|
104 normalized_score3, |
|
105 NULL, |
|
106 text_bytes, |
|
107 is_reliable); |
|
108 // Default to English |
|
109 if (lang == UNKNOWN_LANGUAGE) { |
|
110 lang = ENGLISH; |
|
111 } |
|
112 return lang; |
|
113 } |
|
114 |
|
115 // Same as above, with hints supplied |
|
116 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. |
|
117 Language DetectLanguageSummary( |
|
118 const char* buffer, |
|
119 int buffer_length, |
|
120 bool is_plain_text, |
|
121 const char* tld_hint, // "id" boosts Indonesian |
|
122 int encoding_hint, // SJS boosts Japanese |
|
123 Language language_hint, // ITALIAN boosts it |
|
124 Language* language3, |
|
125 int* percent3, |
|
126 int* text_bytes, |
|
127 bool* is_reliable) { |
|
128 double normalized_score3[3]; |
|
129 bool allow_extended_lang = false; |
|
130 int flags = 0; |
|
131 Language plus_one = UNKNOWN_LANGUAGE; |
|
132 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; |
|
133 |
|
134 Language lang = DetectLanguageSummaryV2( |
|
135 buffer, |
|
136 buffer_length, |
|
137 is_plain_text, |
|
138 &cldhints, |
|
139 allow_extended_lang, |
|
140 flags, |
|
141 plus_one, |
|
142 language3, |
|
143 percent3, |
|
144 normalized_score3, |
|
145 NULL, |
|
146 text_bytes, |
|
147 is_reliable); |
|
148 // Default to English |
|
149 if (lang == UNKNOWN_LANGUAGE) { |
|
150 lang = ENGLISH; |
|
151 } |
|
152 return lang; |
|
153 } |
|
154 |
|
155 |
|
156 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended |
|
157 // languages. |
|
158 // Extended languages are additional Google interface languages and Unicode |
|
159 // single-language scripts, from ext_lang_enc.h |
|
160 Language ExtDetectLanguageSummary( |
|
161 const char* buffer, |
|
162 int buffer_length, |
|
163 bool is_plain_text, |
|
164 Language* language3, |
|
165 int* percent3, |
|
166 int* text_bytes, |
|
167 bool* is_reliable) { |
|
168 double normalized_score3[3]; |
|
169 bool allow_extended_lang = true; |
|
170 int flags = 0; |
|
171 Language plus_one = UNKNOWN_LANGUAGE; |
|
172 const char* tld_hint = ""; |
|
173 int encoding_hint = UNKNOWN_ENCODING; |
|
174 Language language_hint = UNKNOWN_LANGUAGE; |
|
175 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; |
|
176 |
|
177 Language lang = DetectLanguageSummaryV2( |
|
178 buffer, |
|
179 buffer_length, |
|
180 is_plain_text, |
|
181 &cldhints, |
|
182 allow_extended_lang, |
|
183 flags, |
|
184 plus_one, |
|
185 language3, |
|
186 percent3, |
|
187 normalized_score3, |
|
188 NULL, |
|
189 text_bytes, |
|
190 is_reliable); |
|
191 // Do not default to English |
|
192 return lang; |
|
193 } |
|
194 |
|
195 // Same as above, with hints supplied |
|
196 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended |
|
197 // languages. |
|
198 // Extended languages are additional Google interface languages and Unicode |
|
199 // single-language scripts, from ext_lang_enc.h |
|
200 Language ExtDetectLanguageSummary( |
|
201 const char* buffer, |
|
202 int buffer_length, |
|
203 bool is_plain_text, |
|
204 const char* tld_hint, // "id" boosts Indonesian |
|
205 int encoding_hint, // SJS boosts Japanese |
|
206 Language language_hint, // ITALIAN boosts it |
|
207 Language* language3, |
|
208 int* percent3, |
|
209 int* text_bytes, |
|
210 bool* is_reliable) { |
|
211 double normalized_score3[3]; |
|
212 bool allow_extended_lang = true; |
|
213 int flags = 0; |
|
214 Language plus_one = UNKNOWN_LANGUAGE; |
|
215 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; |
|
216 |
|
217 Language lang = DetectLanguageSummaryV2( |
|
218 buffer, |
|
219 buffer_length, |
|
220 is_plain_text, |
|
221 &cldhints, |
|
222 allow_extended_lang, |
|
223 flags, |
|
224 plus_one, |
|
225 language3, |
|
226 percent3, |
|
227 normalized_score3, |
|
228 NULL, |
|
229 text_bytes, |
|
230 is_reliable); |
|
231 // Do not default to English |
|
232 return lang; |
|
233 } |
|
234 |
|
235 // Same as above, and also returns internal language scores as a ratio to |
|
236 // normal score for real text in that language. Scores close to 1.0 indicate |
|
237 // normal text, while scores far away from 1.0 indicate badly-skewed text or |
|
238 // gibberish |
|
239 // |
|
240 Language ExtDetectLanguageSummary( |
|
241 const char* buffer, |
|
242 int buffer_length, |
|
243 bool is_plain_text, |
|
244 const char* tld_hint, // "id" boosts Indonesian |
|
245 int encoding_hint, // SJS boosts Japanese |
|
246 Language language_hint, // ITALIAN boosts it |
|
247 Language* language3, |
|
248 int* percent3, |
|
249 double* normalized_score3, |
|
250 int* text_bytes, |
|
251 bool* is_reliable) { |
|
252 bool allow_extended_lang = true; |
|
253 int flags = 0; |
|
254 Language plus_one = UNKNOWN_LANGUAGE; |
|
255 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint}; |
|
256 |
|
257 Language lang = DetectLanguageSummaryV2( |
|
258 buffer, |
|
259 buffer_length, |
|
260 is_plain_text, |
|
261 &cldhints, |
|
262 allow_extended_lang, |
|
263 flags, |
|
264 plus_one, |
|
265 language3, |
|
266 percent3, |
|
267 normalized_score3, |
|
268 NULL, |
|
269 text_bytes, |
|
270 is_reliable); |
|
271 // Do not default to English |
|
272 return lang; |
|
273 } |
|
274 |
|
275 // Use this one. |
|
276 // Hints are collected into a struct. |
|
277 // Flags are passed in (normally zero). |
|
278 // |
|
279 // Also returns 3 internal language scores as a ratio to |
|
280 // normal score for real text in that language. Scores close to 1.0 indicate |
|
281 // normal text, while scores far away from 1.0 indicate badly-skewed text or |
|
282 // gibberish |
|
283 // |
|
284 // Returns a vector of chunks in different languages, so that caller may |
|
285 // spell-check, translate, or otherwaise process different parts of the input |
|
286 // buffer in language-dependant ways. |
|
287 // |
|
288 Language ExtDetectLanguageSummary( |
|
289 const char* buffer, |
|
290 int buffer_length, |
|
291 bool is_plain_text, |
|
292 const CLDHints* cld_hints, |
|
293 int flags, |
|
294 Language* language3, |
|
295 int* percent3, |
|
296 double* normalized_score3, |
|
297 ResultChunkVector* resultchunkvector, |
|
298 int* text_bytes, |
|
299 bool* is_reliable) { |
|
300 bool allow_extended_lang = true; |
|
301 Language plus_one = UNKNOWN_LANGUAGE; |
|
302 |
|
303 Language lang = DetectLanguageSummaryV2( |
|
304 buffer, |
|
305 buffer_length, |
|
306 is_plain_text, |
|
307 cld_hints, |
|
308 allow_extended_lang, |
|
309 flags, |
|
310 plus_one, |
|
311 language3, |
|
312 percent3, |
|
313 normalized_score3, |
|
314 resultchunkvector, |
|
315 text_bytes, |
|
316 is_reliable); |
|
317 // Do not default to English |
|
318 return lang; |
|
319 } |
|
320 |
|
321 } // End namespace CLD2 |
|
322 |