|
1 /* |
|
2 ****************************************************************************** |
|
3 * |
|
4 * Copyright (C) 2008-2013, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ****************************************************************************** |
|
8 * file name: uspoof_wsconf.cpp |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2009Jan05 (refactoring earlier files) |
|
14 * created by: Andy Heninger |
|
15 * |
|
16 * Internal functions for compiling Whole Script confusable source data |
|
17 * into its binary (runtime) form. The binary data format is described |
|
18 * in uspoof_impl.h |
|
19 */ |
|
20 |
|
21 #include "unicode/utypes.h" |
|
22 #include "unicode/uspoof.h" |
|
23 |
|
24 #if !UCONFIG_NO_NORMALIZATION |
|
25 |
|
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
|
27 |
|
28 #include "unicode/unorm.h" |
|
29 #include "unicode/uregex.h" |
|
30 #include "unicode/ustring.h" |
|
31 #include "cmemory.h" |
|
32 #include "scriptset.h" |
|
33 #include "uspoof_impl.h" |
|
34 #include "uhash.h" |
|
35 #include "uvector.h" |
|
36 #include "uassert.h" |
|
37 #include "uspoof_wsconf.h" |
|
38 |
|
39 U_NAMESPACE_USE |
|
40 |
|
41 |
|
// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
// Example Lines:
//   006F       ; Latn; Deva; A # (o) LATIN SMALL LETTER O
//   0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
//    |           |     |     |
//    |           |     |     |---- Which table, Any Case or Lower Case (A or L)
//    |           |     |---------- Target script.  We need this.
//    |           |---------------- Src script.  Should match the script of the source
//    |                             code points.  Beyond checking that, we don't keep it.
//    |---------------------------- Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups:
//    Group 1      - set for a blank or comment line.
//    Groups 2, 3  - first and (optional) last code point of the range, in hex.
//    Group 4      - source script name.
//    Group 5      - target script name.
//    Group 6 or 7 - set for table A (any case) or table L (lower case) respectively.
//    Group 8      - set for any line that matched none of the above (a syntax error).
//
// The range separator is written as an escaped "\\.\\." so that only a literal ".."
// is accepted between the two code points; an unescaped ".." would match any two
// characters and let malformed ranges through.
static const char *parseExp =
        "(?m)"                                         // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
        "|^(?:"                                        //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                             // Trailing comment
        ")$|"                                          //   OR
        "^(.*?)$";                                     // An error line.      Group 8.
                                                       //  Any line not matching the preceding
                                                       //  parts of the expression will match
                                                       //  this, and thus be flagged as an error
|
69 |
|
70 |
|
71 // Extract a regular expression match group into a char * string. |
|
72 // The group must contain only invariant characters. |
|
73 // Used for script names |
|
74 // |
|
75 static void extractGroup( |
|
76 URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) { |
|
77 |
|
78 UChar ubuf[50]; |
|
79 ubuf[0] = 0; |
|
80 destBuf[0] = 0; |
|
81 int32_t len = uregex_group(e, group, ubuf, 50, &status); |
|
82 if (U_FAILURE(status) || len == -1 || len >= destCapacity) { |
|
83 return; |
|
84 } |
|
85 UnicodeString s(FALSE, ubuf, len); // Aliasing constructor |
|
86 s.extract(0, len, destBuf, destCapacity, US_INV); |
|
87 } |
|
88 |
|
89 |
|
90 |
|
91 U_NAMESPACE_BEGIN |
|
92 |
|
93 // Build the Whole Script Confusable data |
|
94 // |
|
95 // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, |
|
96 // because everything is local to this one build function anyhow, |
|
97 // OR |
|
98 // break this function into more reasonably sized pieces, with |
|
99 // state in WSConfusableDataBuilder. |
|
100 // |
|
101 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, |
|
102 int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) |
|
103 { |
|
104 if (U_FAILURE(status)) { |
|
105 return; |
|
106 } |
|
107 URegularExpression *parseRegexp = NULL; |
|
108 int32_t inputLen = 0; |
|
109 UChar *input = NULL; |
|
110 int32_t lineNum = 0; |
|
111 |
|
112 UVector *scriptSets = NULL; |
|
113 uint32_t rtScriptSetsCount = 2; |
|
114 |
|
115 UTrie2 *anyCaseTrie = NULL; |
|
116 UTrie2 *lowerCaseTrie = NULL; |
|
117 |
|
118 anyCaseTrie = utrie2_open(0, 0, &status); |
|
119 lowerCaseTrie = utrie2_open(0, 0, &status); |
|
120 |
|
121 UnicodeString pattern(parseExp, -1, US_INV); |
|
122 |
|
123 // The scriptSets vector provides a mapping from TRIE values to the set of scripts. |
|
124 // |
|
125 // Reserved TRIE values: |
|
126 // 0: Code point has no whole script confusables. |
|
127 // 1: Code point is of script Common or Inherited. |
|
128 // These code points do not participate in whole script confusable detection. |
|
129 // (This is logically equivalent to saying that they contain confusables in |
|
130 // all scripts) |
|
131 // |
|
132 // Because Trie values are indexes into the ScriptSets vector, pre-fill |
|
133 // vector positions 0 and 1 to avoid conflicts with the reserved values. |
|
134 |
|
135 scriptSets = new UVector(status); |
|
136 if (scriptSets == NULL) { |
|
137 status = U_MEMORY_ALLOCATION_ERROR; |
|
138 goto cleanup; |
|
139 } |
|
140 scriptSets->addElement((void *)NULL, status); |
|
141 scriptSets->addElement((void *)NULL, status); |
|
142 |
|
143 // Convert the user input data from UTF-8 to UChar (UTF-16) |
|
144 u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); |
|
145 if (status != U_BUFFER_OVERFLOW_ERROR) { |
|
146 goto cleanup; |
|
147 } |
|
148 status = U_ZERO_ERROR; |
|
149 input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); |
|
150 if (input == NULL) { |
|
151 status = U_MEMORY_ALLOCATION_ERROR; |
|
152 goto cleanup; |
|
153 } |
|
154 u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); |
|
155 |
|
156 parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); |
|
157 |
|
158 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign |
|
159 // given the syntax of the input. |
|
160 if (*input == 0xfeff) { |
|
161 *input = 0x20; |
|
162 } |
|
163 |
|
164 // Parse the input, one line per iteration of this loop. |
|
165 uregex_setText(parseRegexp, input, inputLen, &status); |
|
166 while (uregex_findNext(parseRegexp, &status)) { |
|
167 lineNum++; |
|
168 if (uregex_start(parseRegexp, 1, &status) >= 0) { |
|
169 // this was a blank or comment line. |
|
170 continue; |
|
171 } |
|
172 if (uregex_start(parseRegexp, 8, &status) >= 0) { |
|
173 // input file syntax error. |
|
174 status = U_PARSE_ERROR; |
|
175 goto cleanup; |
|
176 } |
|
177 if (U_FAILURE(status)) { |
|
178 goto cleanup; |
|
179 } |
|
180 |
|
181 // Pick up the start and optional range end code points from the parsed line. |
|
182 UChar32 startCodePoint = SpoofImpl::ScanHex( |
|
183 input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); |
|
184 UChar32 endCodePoint = startCodePoint; |
|
185 if (uregex_start(parseRegexp, 3, &status) >=0) { |
|
186 endCodePoint = SpoofImpl::ScanHex( |
|
187 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); |
|
188 } |
|
189 |
|
190 // Extract the two script names from the source line. We need these in an 8 bit |
|
191 // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on |
|
192 // to the ICU u_getPropertyValueEnum() function. Ugh. |
|
193 char srcScriptName[20]; |
|
194 char targScriptName[20]; |
|
195 extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); |
|
196 extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); |
|
197 UScriptCode srcScript = |
|
198 static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); |
|
199 UScriptCode targScript = |
|
200 static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); |
|
201 if (U_FAILURE(status)) { |
|
202 goto cleanup; |
|
203 } |
|
204 if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { |
|
205 status = U_INVALID_FORMAT_ERROR; |
|
206 goto cleanup; |
|
207 } |
|
208 |
|
209 // select the table - (A) any case or (L) lower case only |
|
210 UTrie2 *table = anyCaseTrie; |
|
211 if (uregex_start(parseRegexp, 7, &status) >= 0) { |
|
212 table = lowerCaseTrie; |
|
213 } |
|
214 |
|
215 // Build the set of scripts containing confusable characters for |
|
216 // the code point(s) specified in this input line. |
|
217 // Sanity check that the script of the source code point is the same |
|
218 // as the source script indicated in the input file. Failure of this check is |
|
219 // an error in the input file. |
|
220 // Include the source script in the set (needed for Mixed Script Confusable detection). |
|
221 // |
|
222 UChar32 cp; |
|
223 for (cp=startCodePoint; cp<=endCodePoint; cp++) { |
|
224 int32_t setIndex = utrie2_get32(table, cp); |
|
225 BuilderScriptSet *bsset = NULL; |
|
226 if (setIndex > 0) { |
|
227 U_ASSERT(setIndex < scriptSets->size()); |
|
228 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); |
|
229 } else { |
|
230 bsset = new BuilderScriptSet(); |
|
231 if (bsset == NULL) { |
|
232 status = U_MEMORY_ALLOCATION_ERROR; |
|
233 goto cleanup; |
|
234 } |
|
235 bsset->codePoint = cp; |
|
236 bsset->trie = table; |
|
237 bsset->sset = new ScriptSet(); |
|
238 setIndex = scriptSets->size(); |
|
239 bsset->index = setIndex; |
|
240 bsset->rindex = 0; |
|
241 if (bsset->sset == NULL) { |
|
242 status = U_MEMORY_ALLOCATION_ERROR; |
|
243 goto cleanup; |
|
244 } |
|
245 scriptSets->addElement(bsset, status); |
|
246 utrie2_set32(table, cp, setIndex, &status); |
|
247 } |
|
248 bsset->sset->set(targScript, status); |
|
249 bsset->sset->set(srcScript, status); |
|
250 |
|
251 if (U_FAILURE(status)) { |
|
252 goto cleanup; |
|
253 } |
|
254 UScriptCode cpScript = uscript_getScript(cp, &status); |
|
255 if (cpScript != srcScript) { |
|
256 status = U_INVALID_FORMAT_ERROR; |
|
257 goto cleanup; |
|
258 } |
|
259 } |
|
260 } |
|
261 |
|
262 // Eliminate duplicate script sets. At this point we have a separate |
|
263 // script set for every code point that had data in the input file. |
|
264 // |
|
265 // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them |
|
266 // |
|
267 // printf("Number of scriptSets: %d\n", scriptSets->size()); |
|
268 { |
|
269 int32_t duplicateCount = 0; |
|
270 rtScriptSetsCount = 2; |
|
271 for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { |
|
272 BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); |
|
273 if (outerSet->index != static_cast<uint32_t>(outeri)) { |
|
274 // This set was already identified as a duplicate. |
|
275 // It will not be allocated a position in the runtime array of ScriptSets. |
|
276 continue; |
|
277 } |
|
278 outerSet->rindex = rtScriptSetsCount++; |
|
279 for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { |
|
280 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); |
|
281 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { |
|
282 delete innerSet->sset; |
|
283 innerSet->scriptSetOwned = FALSE; |
|
284 innerSet->sset = outerSet->sset; |
|
285 innerSet->index = outeri; |
|
286 innerSet->rindex = outerSet->rindex; |
|
287 duplicateCount++; |
|
288 } |
|
289 // But this doesn't get all. We need to fix the TRIE. |
|
290 } |
|
291 } |
|
292 // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); |
|
293 } |
|
294 |
|
295 |
|
296 |
|
297 // Update the Trie values to be reflect the run time script indexes (after duplicate merging). |
|
298 // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets |
|
299 // are unused, which is why the loop index starts at 2.) |
|
300 { |
|
301 for (int32_t i=2; i<scriptSets->size(); i++) { |
|
302 BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); |
|
303 if (bSet->rindex != (uint32_t)i) { |
|
304 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); |
|
305 } |
|
306 } |
|
307 } |
|
308 |
|
309 // For code points with script==Common or script==Inherited, |
|
310 // Set the reserved value of 1 into both Tries. These characters do not participate |
|
311 // in Whole Script Confusable detection; this reserved value is the means |
|
312 // by which they are detected. |
|
313 { |
|
314 UnicodeSet ignoreSet; |
|
315 ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); |
|
316 UnicodeSet inheritedSet; |
|
317 inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); |
|
318 ignoreSet.addAll(inheritedSet); |
|
319 for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { |
|
320 UChar32 rangeStart = ignoreSet.getRangeStart(rn); |
|
321 UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); |
|
322 utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); |
|
323 utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); |
|
324 } |
|
325 } |
|
326 |
|
327 // Serialize the data to the Spoof Detector |
|
328 { |
|
329 utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); |
|
330 int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); |
|
331 // printf("Any case Trie size: %d\n", size); |
|
332 if (status != U_BUFFER_OVERFLOW_ERROR) { |
|
333 goto cleanup; |
|
334 } |
|
335 status = U_ZERO_ERROR; |
|
336 spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; |
|
337 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; |
|
338 spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; |
|
339 void *where = spImpl->fSpoofData->reserveSpace(size, status); |
|
340 utrie2_serialize(anyCaseTrie, where, size, &status); |
|
341 |
|
342 utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); |
|
343 size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); |
|
344 // printf("Lower case Trie size: %d\n", size); |
|
345 if (status != U_BUFFER_OVERFLOW_ERROR) { |
|
346 goto cleanup; |
|
347 } |
|
348 status = U_ZERO_ERROR; |
|
349 spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; |
|
350 spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; |
|
351 spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; |
|
352 where = spImpl->fSpoofData->reserveSpace(size, status); |
|
353 utrie2_serialize(lowerCaseTrie, where, size, &status); |
|
354 |
|
355 spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; |
|
356 spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; |
|
357 ScriptSet *rtScriptSets = static_cast<ScriptSet *> |
|
358 (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); |
|
359 uint32_t rindex = 2; |
|
360 for (int32_t i=2; i<scriptSets->size(); i++) { |
|
361 BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); |
|
362 if (bSet->rindex < rindex) { |
|
363 // We have already copied this script set to the serialized data. |
|
364 continue; |
|
365 } |
|
366 U_ASSERT(rindex == bSet->rindex); |
|
367 rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. |
|
368 rindex++; |
|
369 } |
|
370 } |
|
371 |
|
372 // Open new utrie2s from the serialized data. We don't want to keep the ones |
|
373 // we just built because we would then have two copies of the data, one internal to |
|
374 // the utries that we have already constructed, and one in the serialized data area. |
|
375 // An alternative would be to not pre-serialize the Trie data, but that makes the |
|
376 // spoof detector data different, depending on how the detector was constructed. |
|
377 // It's simpler to keep the data always the same. |
|
378 |
|
379 spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( |
|
380 UTRIE2_16_VALUE_BITS, |
|
381 (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, |
|
382 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, |
|
383 NULL, |
|
384 &status); |
|
385 |
|
386 spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( |
|
387 UTRIE2_16_VALUE_BITS, |
|
388 (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, |
|
389 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, |
|
390 NULL, |
|
391 &status); |
|
392 |
|
393 |
|
394 |
|
395 cleanup: |
|
396 if (U_FAILURE(status)) { |
|
397 pe->line = lineNum; |
|
398 } |
|
399 uregex_close(parseRegexp); |
|
400 uprv_free(input); |
|
401 |
|
402 int32_t i; |
|
403 if (scriptSets != NULL) { |
|
404 for (i=0; i<scriptSets->size(); i++) { |
|
405 BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); |
|
406 delete bsset; |
|
407 } |
|
408 delete scriptSets; |
|
409 } |
|
410 utrie2_close(anyCaseTrie); |
|
411 utrie2_close(lowerCaseTrie); |
|
412 return; |
|
413 } |
|
414 |
|
415 U_NAMESPACE_END |
|
416 |
|
417 |
|
418 |
|
419 BuilderScriptSet::BuilderScriptSet() { |
|
420 codePoint = -1; |
|
421 trie = NULL; |
|
422 sset = NULL; |
|
423 index = 0; |
|
424 rindex = 0; |
|
425 scriptSetOwned = TRUE; |
|
426 } |
|
427 |
|
428 BuilderScriptSet::~BuilderScriptSet() { |
|
429 if (scriptSetOwned) { |
|
430 delete sset; |
|
431 } |
|
432 } |
|
433 |
|
434 #endif |
|
435 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
|
436 |