|
1 /* |
|
2 ****************************************************************************** |
|
3 * |
|
4 * Copyright (C) 1999-2013, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ****************************************************************************** |
|
8 * file name: unames.c |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 1999oct04 |
|
14 * created by: Markus W. Scherer |
|
15 */ |
|
16 |
|
17 #include "unicode/utypes.h" |
|
18 #include "unicode/putil.h" |
|
19 #include "unicode/uchar.h" |
|
20 #include "unicode/udata.h" |
|
21 #include "unicode/utf.h" |
|
22 #include "unicode/utf16.h" |
|
23 #include "uassert.h" |
|
24 #include "ustr_imp.h" |
|
25 #include "umutex.h" |
|
26 #include "cmemory.h" |
|
27 #include "cstring.h" |
|
28 #include "ucln_cmn.h" |
|
29 #include "udataswp.h" |
|
30 #include "uprops.h" |
|
31 |
|
32 U_NAMESPACE_BEGIN |
|
33 |
|
34 /* prototypes ------------------------------------------------------------- */ |
|
35 |
|
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that it behaves as a single
 * primary expression wherever it is used (e.g. as a sizeof operand or
 * followed by a postfix operator).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
|
37 |
|
/* Name of the data item; passed as the "name" argument to udata_openChoice() in loadCharNames(). */
static const char DATA_NAME[] = "unames";
/* Type of the data item; passed as the "type" argument to udata_openChoice() in loadCharNames(). */
static const char DATA_TYPE[] = "icu";

/* A code point is split into a group number (code>>GROUP_SHIFT) and a line within the group. */
#define GROUP_SHIFT 5
/* Number of names ("lines") stored per group. */
#define LINES_PER_GROUP (1L<<GROUP_SHIFT)
/* Mask that extracts the line number within a group from a code point (code&GROUP_MASK). */
#define GROUP_MASK (LINES_PER_GROUP-1)
|
44 |
|
45 /* |
|
46 * This struct was replaced by explicitly accessing equivalent |
|
47 * fields from triples of uint16_t. |
|
48 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs, |
|
49 * which broke the assumption that sizeof(Group)==6 and that the ++ operator |
|
50 * would advance by 6 bytes (3 uint16_t). |
|
51 * |
|
52 * We can't just change the data structure because it's loaded from a data file, |
|
53 * and we don't want to make it less compact, so we changed the access code. |
|
54 * |
|
55 * For details see ICU tickets 6331 and 6008. |
|
56 typedef struct { |
|
57 uint16_t groupMSB, |
|
58 offsetHigh, offsetLow; / * avoid padding * / |
|
59 } Group; |
|
60 */ |
|
/*
 * Indexes of the uint16_t fields inside one group record of the groups table.
 * A record is GROUP_LENGTH consecutive uint16_t values.
 */
enum {
    GROUP_MSB,          /* group number: code point >> GROUP_SHIFT */
    GROUP_OFFSET_HIGH,  /* upper 16 bits of the group's string offset */
    GROUP_OFFSET_LOW,   /* lower 16 bits of the group's string offset */
    GROUP_LENGTH        /* number of uint16_t per group record */
};

/*
 * Get the 32-bit group offset.
 * The high half is widened to uint32_t before it is shifted so that a value
 * of 0x8000 or more is not shifted into the sign bit of a signed integer.
 *
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)(((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16)|(group)[GROUP_OFFSET_LOW]))

/* Step a group pointer forward/backward by one whole group record. */
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
|
77 |
|
/*
 * Header of one algorithmic name range.
 * The range-specific data (prefix string, factors, element strings) follows
 * this header directly in memory; getAlgName() reads it starting at (range+1).
 */
typedef struct {
    uint32_t start;     /* first code point; getAlgName() subtracts it from the code for type 1 */
    uint32_t end;       /* NOTE(review): presumably the last code point of the range - confirm */
    uint8_t type;       /* 0: prefix + hex digits, 1: prefix + factorized elements */
    uint8_t variant;    /* type 0: number of hex digits, type 1: number of factors */
    uint16_t size;      /* NOTE(review): presumably the byte size of the whole range record - confirm */
} AlgorithmicRange;

/*
 * Header of the loaded names data.
 * The offsets are added to the address of this struct itself.
 * The token table is read starting at uint16_t index 8 from the struct start.
 */
typedef struct {
    uint32_t tokenStringOffset;     /* start of the token strings */
    uint32_t groupsOffset;          /* start of the groups table, see GET_GROUPS() */
    uint32_t groupStringOffset;     /* base for the per-group string offsets */
    uint32_t algNamesOffset;        /* NOTE(review): presumably the algorithmic ranges - confirm */
} UCharNames;
|
87 |
|
/*
 * Get the groups table from a UCharNames struct.
 * The groups table consists of one uint16_t groupCount followed by
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
 * and the comment for the old struct Group above.
 *
 * The argument and the whole expansion are parenthesized so that the macro
 * works with any pointer expression (for example &object) and can be
 * indexed or dereferenced directly.
 *
 * @param names (const UCharNames *) pointer to the UCharNames indexes
 * @return (const uint16_t *) pointer to the groups table
 */
#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
|
98 |
|
/*
 * Context object for a name lookup (fn==DO_FIND_NAME) in enumGroupNames().
 */
typedef struct {
    const char *otherName;  /* input: the name that the stored names are compared against */
    UChar32 code;           /* output: set to the code point whose name matched */
} FindName;

/*
 * Sentinel "enumerator function": when the enum functions get this value as fn,
 * they search for a name (enumGroupNames) or do nothing (enumExtNames)
 * instead of calling a callback.
 */
#define DO_FIND_NAME NULL
|
105 |
|
/* Data handle from udata_openChoice() in loadCharNames(); closed in unames_cleanup(). */
static UDataMemory *uCharNamesData=NULL;
/* Pointer to the loaded names data (udata_getMemory(uCharNamesData)); NULL while not loaded. */
static UCharNames *uCharNames=NULL;
/* Guards the one-time call of loadCharNames() via umtx_initOnce(); reset in unames_cleanup(). */
static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER;

/*
 * Maximum length of character names (regular & 1.0).
 * Reset to 0 in unames_cleanup().
 */
static int32_t gMaxNameLength=0;

/*
 * Set of chars used in character names (regular & 1.0).
 * Chars are platform-dependent (can be EBCDIC).
 * NOTE(review): 8*32 bits, presumably one bit per char value - confirm against the code that fills it.
 */
static uint32_t gNameSet[8]={ 0 };
|
120 |
|
/*
 * Extra "categories" appended after the regular general categories.
 * getCharCat() returns them, and they index the last entries of charCatNames[].
 * Each replacement list is parenthesized so that the macro keeps its value
 * inside a larger expression.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

/* Number of regular plus extended categories; the size of charCatNames[]. */
#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
|
126 |
|
/*
 * Category names used for extended character names.
 * Indexed by the value that getCharCat() returns; getCharCatName() falls back
 * to "unknown" for an index outside of this table.
 * The last three entries belong to U_NONCHARACTER_CODE_POINT, U_LEAD_SURROGATE
 * and U_TRAIL_SURROGATE.
 * NOTE(review): the other entries are presumably in the order of the
 * general category constants returned by u_charType() - confirm.
 */
static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
    "unassigned",
    "uppercase letter",
    "lowercase letter",
    "titlecase letter",
    "modifier letter",
    "other letter",
    "non spacing mark",
    "enclosing mark",
    "combining spacing mark",
    "decimal digit number",
    "letter number",
    "other number",
    "space separator",
    "line separator",
    "paragraph separator",
    "control",
    "format",
    "private use area",
    "surrogate",
    "dash punctuation",
    "start punctuation",
    "end punctuation",
    "connector punctuation",
    "other punctuation",
    "math symbol",
    "currency symbol",
    "modifier symbol",
    "other symbol",
    "initial punctuation",
    "final punctuation",
    "noncharacter",
    "lead surrogate",
    "trail surrogate"
};
|
162 |
|
163 /* implementation ----------------------------------------------------------- */ |
|
164 |
|
165 static UBool U_CALLCONV unames_cleanup(void) |
|
166 { |
|
167 if(uCharNamesData) { |
|
168 udata_close(uCharNamesData); |
|
169 uCharNamesData = NULL; |
|
170 } |
|
171 if(uCharNames) { |
|
172 uCharNames = NULL; |
|
173 } |
|
174 gCharNamesInitOnce.reset(); |
|
175 gMaxNameLength=0; |
|
176 return TRUE; |
|
177 } |
|
178 |
|
179 static UBool U_CALLCONV |
|
180 isAcceptable(void * /*context*/, |
|
181 const char * /*type*/, const char * /*name*/, |
|
182 const UDataInfo *pInfo) { |
|
183 return (UBool)( |
|
184 pInfo->size>=20 && |
|
185 pInfo->isBigEndian==U_IS_BIG_ENDIAN && |
|
186 pInfo->charsetFamily==U_CHARSET_FAMILY && |
|
187 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ |
|
188 pInfo->dataFormat[1]==0x6e && |
|
189 pInfo->dataFormat[2]==0x61 && |
|
190 pInfo->dataFormat[3]==0x6d && |
|
191 pInfo->formatVersion[0]==1); |
|
192 } |
|
193 |
|
194 static void U_CALLCONV |
|
195 loadCharNames(UErrorCode &status) { |
|
196 U_ASSERT(uCharNamesData == NULL); |
|
197 U_ASSERT(uCharNames == NULL); |
|
198 |
|
199 uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status); |
|
200 if(U_FAILURE(status)) { |
|
201 uCharNamesData = NULL; |
|
202 } else { |
|
203 uCharNames = (UCharNames *)udata_getMemory(uCharNamesData); |
|
204 } |
|
205 ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup); |
|
206 } |
|
207 |
|
208 |
|
209 static UBool |
|
210 isDataLoaded(UErrorCode *pErrorCode) { |
|
211 umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode); |
|
212 return U_SUCCESS(*pErrorCode); |
|
213 } |
|
214 |
|
/*
 * Append one char to an output buffer with preflighting:
 * c is stored and buffer/bufferLength are advanced only while there is room,
 * but bufferPos is always incremented so that it ends up as the full length
 * that the output would need.
 *
 * buffer, bufferLength and bufferPos must be modifiable lvalues.
 * The body is wrapped in do { } while(0) so that "WRITE_CHAR(...);" is a
 * single statement and is safe in an unbraced if/else.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) do { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
} while(0)

/* Internal name choice value for ISO comments, one past the public UCharNameChoice values. */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
|
224 |
|
225 /* |
|
226 * Important: expandName() and compareName() are almost the same - |
|
227 * apply fixes to both. |
|
228 * |
|
229 * UnicodeData.txt uses ';' as a field separator, so no |
|
230 * field can contain ';' as part of its contents. |
|
231 * In unames.dat, it is marked as token[';']==-1 only if the |
|
232 * semicolon is used in the data file - which is iff we |
|
233 * have Unicode 1.0 names or ISO comments or aliases. |
|
234 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases |
|
235 * although we know that it will never be part of a name. |
|
236 */ |
|
237 static uint16_t |
|
238 expandName(UCharNames *names, |
|
239 const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, |
|
240 char *buffer, uint16_t bufferLength) { |
|
241 uint16_t *tokens=(uint16_t *)names+8; |
|
242 uint16_t token, tokenCount=*tokens++, bufferPos=0; |
|
243 uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; |
|
244 uint8_t c; |
|
245 |
|
246 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
|
247 /* |
|
248 * skip the modern name if it is not requested _and_ |
|
249 * if the semicolon byte value is a character, not a token number |
|
250 */ |
|
251 if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { |
|
252 int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; |
|
253 do { |
|
254 while(nameLength>0) { |
|
255 --nameLength; |
|
256 if(*name++==';') { |
|
257 break; |
|
258 } |
|
259 } |
|
260 } while(--fieldIndex>0); |
|
261 } else { |
|
262 /* |
|
263 * the semicolon byte value is a token number, therefore |
|
264 * only modern names are stored in unames.dat and there is no |
|
265 * such requested alternate name here |
|
266 */ |
|
267 nameLength=0; |
|
268 } |
|
269 } |
|
270 |
|
271 /* write each letter directly, and write a token word per token */ |
|
272 while(nameLength>0) { |
|
273 --nameLength; |
|
274 c=*name++; |
|
275 |
|
276 if(c>=tokenCount) { |
|
277 if(c!=';') { |
|
278 /* implicit letter */ |
|
279 WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
|
280 } else { |
|
281 /* finished */ |
|
282 break; |
|
283 } |
|
284 } else { |
|
285 token=tokens[c]; |
|
286 if(token==(uint16_t)(-2)) { |
|
287 /* this is a lead byte for a double-byte token */ |
|
288 token=tokens[c<<8|*name++]; |
|
289 --nameLength; |
|
290 } |
|
291 if(token==(uint16_t)(-1)) { |
|
292 if(c!=';') { |
|
293 /* explicit letter */ |
|
294 WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
|
295 } else { |
|
296 /* stop, but skip the semicolon if we are seeking |
|
297 extended names and there was no 2.0 name but there |
|
298 is a 1.0 name. */ |
|
299 if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) { |
|
300 if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { |
|
301 continue; |
|
302 } |
|
303 } |
|
304 /* finished */ |
|
305 break; |
|
306 } |
|
307 } else { |
|
308 /* write token word */ |
|
309 uint8_t *tokenString=tokenStrings+token; |
|
310 while((c=*tokenString++)!=0) { |
|
311 WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
|
312 } |
|
313 } |
|
314 } |
|
315 } |
|
316 |
|
317 /* zero-terminate */ |
|
318 if(bufferLength>0) { |
|
319 *buffer=0; |
|
320 } |
|
321 |
|
322 return bufferPos; |
|
323 } |
|
324 |
|
325 /* |
|
326 * compareName() is almost the same as expandName() except that it compares |
|
327 * the currently expanded name to an input name. |
|
328 * It returns the match/no match result as soon as possible. |
|
329 */ |
|
330 static UBool |
|
331 compareName(UCharNames *names, |
|
332 const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, |
|
333 const char *otherName) { |
|
334 uint16_t *tokens=(uint16_t *)names+8; |
|
335 uint16_t token, tokenCount=*tokens++; |
|
336 uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; |
|
337 uint8_t c; |
|
338 const char *origOtherName = otherName; |
|
339 |
|
340 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
|
341 /* |
|
342 * skip the modern name if it is not requested _and_ |
|
343 * if the semicolon byte value is a character, not a token number |
|
344 */ |
|
345 if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { |
|
346 int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; |
|
347 do { |
|
348 while(nameLength>0) { |
|
349 --nameLength; |
|
350 if(*name++==';') { |
|
351 break; |
|
352 } |
|
353 } |
|
354 } while(--fieldIndex>0); |
|
355 } else { |
|
356 /* |
|
357 * the semicolon byte value is a token number, therefore |
|
358 * only modern names are stored in unames.dat and there is no |
|
359 * such requested alternate name here |
|
360 */ |
|
361 nameLength=0; |
|
362 } |
|
363 } |
|
364 |
|
365 /* compare each letter directly, and compare a token word per token */ |
|
366 while(nameLength>0) { |
|
367 --nameLength; |
|
368 c=*name++; |
|
369 |
|
370 if(c>=tokenCount) { |
|
371 if(c!=';') { |
|
372 /* implicit letter */ |
|
373 if((char)c!=*otherName++) { |
|
374 return FALSE; |
|
375 } |
|
376 } else { |
|
377 /* finished */ |
|
378 break; |
|
379 } |
|
380 } else { |
|
381 token=tokens[c]; |
|
382 if(token==(uint16_t)(-2)) { |
|
383 /* this is a lead byte for a double-byte token */ |
|
384 token=tokens[c<<8|*name++]; |
|
385 --nameLength; |
|
386 } |
|
387 if(token==(uint16_t)(-1)) { |
|
388 if(c!=';') { |
|
389 /* explicit letter */ |
|
390 if((char)c!=*otherName++) { |
|
391 return FALSE; |
|
392 } |
|
393 } else { |
|
394 /* stop, but skip the semicolon if we are seeking |
|
395 extended names and there was no 2.0 name but there |
|
396 is a 1.0 name. */ |
|
397 if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) { |
|
398 if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { |
|
399 continue; |
|
400 } |
|
401 } |
|
402 /* finished */ |
|
403 break; |
|
404 } |
|
405 } else { |
|
406 /* write token word */ |
|
407 uint8_t *tokenString=tokenStrings+token; |
|
408 while((c=*tokenString++)!=0) { |
|
409 if((char)c!=*otherName++) { |
|
410 return FALSE; |
|
411 } |
|
412 } |
|
413 } |
|
414 } |
|
415 } |
|
416 |
|
417 /* complete match? */ |
|
418 return (UBool)(*otherName==0); |
|
419 } |
|
420 |
|
421 static uint8_t getCharCat(UChar32 cp) { |
|
422 uint8_t cat; |
|
423 |
|
424 if (U_IS_UNICODE_NONCHAR(cp)) { |
|
425 return U_NONCHARACTER_CODE_POINT; |
|
426 } |
|
427 |
|
428 if ((cat = u_charType(cp)) == U_SURROGATE) { |
|
429 cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE; |
|
430 } |
|
431 |
|
432 return cat; |
|
433 } |
|
434 |
|
435 static const char *getCharCatName(UChar32 cp) { |
|
436 uint8_t cat = getCharCat(cp); |
|
437 |
|
438 /* Return unknown if the table of names above is not up to |
|
439 date. */ |
|
440 |
|
441 if (cat >= LENGTHOF(charCatNames)) { |
|
442 return "unknown"; |
|
443 } else { |
|
444 return charCatNames[cat]; |
|
445 } |
|
446 } |
|
447 |
|
448 static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) { |
|
449 const char *catname = getCharCatName(code); |
|
450 uint16_t length = 0; |
|
451 |
|
452 UChar32 cp; |
|
453 int ndigits, i; |
|
454 |
|
455 WRITE_CHAR(buffer, bufferLength, length, '<'); |
|
456 while (catname[length - 1]) { |
|
457 WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]); |
|
458 } |
|
459 WRITE_CHAR(buffer, bufferLength, length, '-'); |
|
460 for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4) |
|
461 ; |
|
462 if (ndigits < 4) |
|
463 ndigits = 4; |
|
464 for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) { |
|
465 uint8_t v = (uint8_t)(cp & 0xf); |
|
466 buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10); |
|
467 } |
|
468 buffer += ndigits; |
|
469 length += ndigits; |
|
470 WRITE_CHAR(buffer, bufferLength, length, '>'); |
|
471 |
|
472 return length; |
|
473 } |
|
474 |
|
475 /* |
|
476 * getGroup() does a binary search for the group that contains the |
|
477 * Unicode code point "code". |
|
478 * The return value is always a valid Group* that may contain "code" |
|
479 * or else is the highest group before "code". |
|
480 * If the lowest group is after "code", then that one is returned. |
|
481 */ |
|
482 static const uint16_t * |
|
483 getGroup(UCharNames *names, uint32_t code) { |
|
484 const uint16_t *groups=GET_GROUPS(names); |
|
485 uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT), |
|
486 start=0, |
|
487 limit=*groups++, |
|
488 number; |
|
489 |
|
490 /* binary search for the group of names that contains the one for code */ |
|
491 while(start<limit-1) { |
|
492 number=(uint16_t)((start+limit)/2); |
|
493 if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) { |
|
494 limit=number; |
|
495 } else { |
|
496 start=number; |
|
497 } |
|
498 } |
|
499 |
|
500 /* return this regardless of whether it is an exact match */ |
|
501 return groups+start*GROUP_LENGTH; |
|
502 } |
|
503 |
|
504 /* |
|
505 * expandGroupLengths() reads a block of compressed lengths of 32 strings and |
|
506 * expands them into offsets and lengths for each string. |
|
507 * Lengths are stored with a variable-width encoding in consecutive nibbles: |
|
508 * If a nibble<0xc, then it is the length itself (0=empty string). |
|
509 * If a nibble>=0xc, then it forms a length value with the following nibble. |
|
510 * Calculation see below. |
|
511 * The offsets and lengths arrays must be at least 33 (one more) long because |
|
512 * there is no check here at the end if the last nibble is still used. |
|
513 */ |
|
514 static const uint8_t * |
|
515 expandGroupLengths(const uint8_t *s, |
|
516 uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) { |
|
517 /* read the lengths of the 32 strings in this group and get each string's offset */ |
|
518 uint16_t i=0, offset=0, length=0; |
|
519 uint8_t lengthByte; |
|
520 |
|
521 /* all 32 lengths must be read to get the offset of the first group string */ |
|
522 while(i<LINES_PER_GROUP) { |
|
523 lengthByte=*s++; |
|
524 |
|
525 /* read even nibble - MSBs of lengthByte */ |
|
526 if(length>=12) { |
|
527 /* double-nibble length spread across two bytes */ |
|
528 length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12); |
|
529 lengthByte&=0xf; |
|
530 } else if((lengthByte /* &0xf0 */)>=0xc0) { |
|
531 /* double-nibble length spread across this one byte */ |
|
532 length=(uint16_t)((lengthByte&0x3f)+12); |
|
533 } else { |
|
534 /* single-nibble length in MSBs */ |
|
535 length=(uint16_t)(lengthByte>>4); |
|
536 lengthByte&=0xf; |
|
537 } |
|
538 |
|
539 *offsets++=offset; |
|
540 *lengths++=length; |
|
541 |
|
542 offset+=length; |
|
543 ++i; |
|
544 |
|
545 /* read odd nibble - LSBs of lengthByte */ |
|
546 if((lengthByte&0xf0)==0) { |
|
547 /* this nibble was not consumed for a double-nibble length above */ |
|
548 length=lengthByte; |
|
549 if(length<12) { |
|
550 /* single-nibble length in LSBs */ |
|
551 *offsets++=offset; |
|
552 *lengths++=length; |
|
553 |
|
554 offset+=length; |
|
555 ++i; |
|
556 } |
|
557 } else { |
|
558 length=0; /* prevent double-nibble detection in the next iteration */ |
|
559 } |
|
560 } |
|
561 |
|
562 /* now, s is at the first group string */ |
|
563 return s; |
|
564 } |
|
565 |
|
566 static uint16_t |
|
567 expandGroupName(UCharNames *names, const uint16_t *group, |
|
568 uint16_t lineNumber, UCharNameChoice nameChoice, |
|
569 char *buffer, uint16_t bufferLength) { |
|
570 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; |
|
571 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); |
|
572 s=expandGroupLengths(s, offsets, lengths); |
|
573 return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice, |
|
574 buffer, bufferLength); |
|
575 } |
|
576 |
|
577 static uint16_t |
|
578 getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice, |
|
579 char *buffer, uint16_t bufferLength) { |
|
580 const uint16_t *group=getGroup(names, code); |
|
581 if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) { |
|
582 return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice, |
|
583 buffer, bufferLength); |
|
584 } else { |
|
585 /* group not found */ |
|
586 /* zero-terminate */ |
|
587 if(bufferLength>0) { |
|
588 *buffer=0; |
|
589 } |
|
590 return 0; |
|
591 } |
|
592 } |
|
593 |
|
594 /* |
|
595 * enumGroupNames() enumerates all the names in a 32-group |
|
596 * and either calls the enumerator function or finds a given input name. |
|
597 */ |
|
598 static UBool |
|
599 enumGroupNames(UCharNames *names, const uint16_t *group, |
|
600 UChar32 start, UChar32 end, |
|
601 UEnumCharNamesFn *fn, void *context, |
|
602 UCharNameChoice nameChoice) { |
|
603 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; |
|
604 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); |
|
605 |
|
606 s=expandGroupLengths(s, offsets, lengths); |
|
607 if(fn!=DO_FIND_NAME) { |
|
608 char buffer[200]; |
|
609 uint16_t length; |
|
610 |
|
611 while(start<=end) { |
|
612 length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer)); |
|
613 if (!length && nameChoice == U_EXTENDED_CHAR_NAME) { |
|
614 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; |
|
615 } |
|
616 /* here, we assume that the buffer is large enough */ |
|
617 if(length>0) { |
|
618 if(!fn(context, start, nameChoice, buffer, length)) { |
|
619 return FALSE; |
|
620 } |
|
621 } |
|
622 ++start; |
|
623 } |
|
624 } else { |
|
625 const char *otherName=((FindName *)context)->otherName; |
|
626 while(start<=end) { |
|
627 if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) { |
|
628 ((FindName *)context)->code=start; |
|
629 return FALSE; |
|
630 } |
|
631 ++start; |
|
632 } |
|
633 } |
|
634 return TRUE; |
|
635 } |
|
636 |
|
637 /* |
|
638 * enumExtNames enumerate extended names. |
|
639 * It only needs to do it if it is called with a real function and not |
|
640 * with the dummy DO_FIND_NAME, because u_charFromName() does a check |
|
641 * for extended names by itself. |
|
642 */ |
|
643 static UBool |
|
644 enumExtNames(UChar32 start, UChar32 end, |
|
645 UEnumCharNamesFn *fn, void *context) |
|
646 { |
|
647 if(fn!=DO_FIND_NAME) { |
|
648 char buffer[200]; |
|
649 uint16_t length; |
|
650 |
|
651 while(start<=end) { |
|
652 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; |
|
653 /* here, we assume that the buffer is large enough */ |
|
654 if(length>0) { |
|
655 if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) { |
|
656 return FALSE; |
|
657 } |
|
658 } |
|
659 ++start; |
|
660 } |
|
661 } |
|
662 |
|
663 return TRUE; |
|
664 } |
|
665 |
|
/*
 * enumNames() enumerates the names of the code points in [start, limit).
 * Stored names are handled group by group with enumGroupNames().
 * If nameChoice is U_EXTENDED_CHAR_NAME, then code points that are not
 * covered by a stored group are handled with enumExtNames().
 *
 * fn and context are passed through to those functions; fn may be
 * DO_FIND_NAME (see enumGroupNames() and enumExtNames()).
 *
 * @return FALSE as soon as one of the called functions returns FALSE,
 *         otherwise TRUE
 */
static UBool
enumNames(UCharNames *names,
          UChar32 start, UChar32 limit,
          UEnumCharNamesFn *fn, void *context,
          UCharNameChoice nameChoice) {
    uint16_t startGroupMSB, endGroupMSB, groupCount;
    const uint16_t *group, *groupLimit;

    /* group numbers of the first and of the last (limit-1) code point */
    startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
    endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);

    /* find the group that contains start, or the highest before it */
    group=getGroup(names, start);

    if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
        /* enumerate synthetic names between start and the group start */
        UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
        if(extLimit>limit) {
            extLimit=limit;
        }
        if(!enumExtNames(start, extLimit-1, fn, context)) {
            return FALSE;
        }
        start=extLimit;
    }

    if(startGroupMSB==endGroupMSB) {
        if(startGroupMSB==group[GROUP_MSB]) {
            /* if start and limit-1 are in the same group, then enumerate only in that one */
            return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
        }
        /* otherwise fall through to the extended names at the end of the function */
    } else {
        const uint16_t *groups=GET_GROUPS(names);
        groupCount=*groups++;
        /* groupLimit points just behind the last group record */
        groupLimit=groups+groupCount*GROUP_LENGTH;

        if(startGroupMSB==group[GROUP_MSB]) {
            /* enumerate characters in the partial start group */
            if((start&GROUP_MASK)!=0) {
                if(!enumGroupNames(names, group,
                                   start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
                                   fn, context, nameChoice)) {
                    return FALSE;
                }
                group=NEXT_GROUP(group); /* continue with the next group */
            }
        } else if(startGroupMSB>group[GROUP_MSB]) {
            /* make sure that we start enumerating with the first group after start */
            const uint16_t *nextGroup=NEXT_GROUP(group);
            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
                /* extended names from start up to the next stored group (or limit) */
                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
                if (end > limit) {
                    end = limit;
                }
                if (!enumExtNames(start, end - 1, fn, context)) {
                    return FALSE;
                }
            }
            group=nextGroup;
        }

        /* enumerate entire groups between the start- and end-groups */
        while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
            const uint16_t *nextGroup;
            start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
            if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
                return FALSE;
            }
            nextGroup=NEXT_GROUP(group);
            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
                /* extended names for the gap between this group and the next one (or limit) */
                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
                if (end > limit) {
                    end = limit;
                }
                if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
                    return FALSE;
                }
            }
            group=nextGroup;
        }

        /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
        if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
            return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
        } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
            /*
             * All groups were used up: move start behind the last stored group
             * (if it is not there yet) and fall through to the extended names below.
             */
            UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
            if (next > start) {
                start = next;
            }
        } else {
            return TRUE;
        }
    }

    /* we have not found a group, which means everything is made of
       extended names. */
    if (nameChoice == U_EXTENDED_CHAR_NAME) {
        /* do not go beyond the last code point */
        if (limit > UCHAR_MAX_VALUE + 1) {
            limit = UCHAR_MAX_VALUE + 1;
        }
        return enumExtNames(start, limit - 1, fn, context);
    }

    return TRUE;
}
|
771 |
|
772 static uint16_t |
|
773 writeFactorSuffix(const uint16_t *factors, uint16_t count, |
|
774 const char *s, /* suffix elements */ |
|
775 uint32_t code, |
|
776 uint16_t indexes[8], /* output fields from here */ |
|
777 const char *elementBases[8], const char *elements[8], |
|
778 char *buffer, uint16_t bufferLength) { |
|
779 uint16_t i, factor, bufferPos=0; |
|
780 char c; |
|
781 |
|
782 /* write elements according to the factors */ |
|
783 |
|
784 /* |
|
785 * the factorized elements are determined by modulo arithmetic |
|
786 * with the factors of this algorithm |
|
787 * |
|
788 * note that for fewer operations, count is decremented here |
|
789 */ |
|
790 --count; |
|
791 for(i=count; i>0; --i) { |
|
792 factor=factors[i]; |
|
793 indexes[i]=(uint16_t)(code%factor); |
|
794 code/=factor; |
|
795 } |
|
796 /* |
|
797 * we don't need to calculate the last modulus because start<=code<=end |
|
798 * guarantees here that code<=factors[0] |
|
799 */ |
|
800 indexes[0]=(uint16_t)code; |
|
801 |
|
802 /* write each element */ |
|
803 for(;;) { |
|
804 if(elementBases!=NULL) { |
|
805 *elementBases++=s; |
|
806 } |
|
807 |
|
808 /* skip indexes[i] strings */ |
|
809 factor=indexes[i]; |
|
810 while(factor>0) { |
|
811 while(*s++!=0) {} |
|
812 --factor; |
|
813 } |
|
814 if(elements!=NULL) { |
|
815 *elements++=s; |
|
816 } |
|
817 |
|
818 /* write element */ |
|
819 while((c=*s++)!=0) { |
|
820 WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
|
821 } |
|
822 |
|
823 /* we do not need to perform the rest of this loop for i==count - break here */ |
|
824 if(i>=count) { |
|
825 break; |
|
826 } |
|
827 |
|
828 /* skip the rest of the strings for this factors[i] */ |
|
829 factor=(uint16_t)(factors[i]-indexes[i]-1); |
|
830 while(factor>0) { |
|
831 while(*s++!=0) {} |
|
832 --factor; |
|
833 } |
|
834 |
|
835 ++i; |
|
836 } |
|
837 |
|
838 /* zero-terminate */ |
|
839 if(bufferLength>0) { |
|
840 *buffer=0; |
|
841 } |
|
842 |
|
843 return bufferPos; |
|
844 } |
|
845 |
|
846 /* |
|
847 * Important: |
|
848 * Parts of findAlgName() are almost the same as some of getAlgName(). |
|
849 * Fixes must be applied to both. |
|
850 */ |
|
851 static uint16_t |
|
852 getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice, |
|
853 char *buffer, uint16_t bufferLength) { |
|
854 uint16_t bufferPos=0; |
|
855 |
|
856 /* Only the normative character name can be algorithmic. */ |
|
857 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
|
858 /* zero-terminate */ |
|
859 if(bufferLength>0) { |
|
860 *buffer=0; |
|
861 } |
|
862 return 0; |
|
863 } |
|
864 |
|
865 switch(range->type) { |
|
866 case 0: { |
|
867 /* name = prefix hex-digits */ |
|
868 const char *s=(const char *)(range+1); |
|
869 char c; |
|
870 |
|
871 uint16_t i, count; |
|
872 |
|
873 /* copy prefix */ |
|
874 while((c=*s++)!=0) { |
|
875 WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
|
876 } |
|
877 |
|
878 /* write hexadecimal code point value */ |
|
879 count=range->variant; |
|
880 |
|
881 /* zero-terminate */ |
|
882 if(count<bufferLength) { |
|
883 buffer[count]=0; |
|
884 } |
|
885 |
|
886 for(i=count; i>0;) { |
|
887 if(--i<bufferLength) { |
|
888 c=(char)(code&0xf); |
|
889 if(c<10) { |
|
890 c+='0'; |
|
891 } else { |
|
892 c+='A'-10; |
|
893 } |
|
894 buffer[i]=c; |
|
895 } |
|
896 code>>=4; |
|
897 } |
|
898 |
|
899 bufferPos+=count; |
|
900 break; |
|
901 } |
|
902 case 1: { |
|
903 /* name = prefix factorized-elements */ |
|
904 uint16_t indexes[8]; |
|
905 const uint16_t *factors=(const uint16_t *)(range+1); |
|
906 uint16_t count=range->variant; |
|
907 const char *s=(const char *)(factors+count); |
|
908 char c; |
|
909 |
|
910 /* copy prefix */ |
|
911 while((c=*s++)!=0) { |
|
912 WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
|
913 } |
|
914 |
|
915 bufferPos+=writeFactorSuffix(factors, count, |
|
916 s, code-range->start, indexes, NULL, NULL, buffer, bufferLength); |
|
917 break; |
|
918 } |
|
919 default: |
|
920 /* undefined type */ |
|
921 /* zero-terminate */ |
|
922 if(bufferLength>0) { |
|
923 *buffer=0; |
|
924 } |
|
925 break; |
|
926 } |
|
927 |
|
928 return bufferPos; |
|
929 } |
|
930 |
|
931 /* |
|
932 * Important: enumAlgNames() and findAlgName() are almost the same. |
|
933 * Any fix must be applied to both. |
|
934 */ |
|
935 static UBool |
|
936 enumAlgNames(AlgorithmicRange *range, |
|
937 UChar32 start, UChar32 limit, |
|
938 UEnumCharNamesFn *fn, void *context, |
|
939 UCharNameChoice nameChoice) { |
|
940 char buffer[200]; |
|
941 uint16_t length; |
|
942 |
|
943 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
|
944 return TRUE; |
|
945 } |
|
946 |
|
947 switch(range->type) { |
|
948 case 0: { |
|
949 char *s, *end; |
|
950 char c; |
|
951 |
|
952 /* get the full name of the start character */ |
|
953 length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer)); |
|
954 if(length<=0) { |
|
955 return TRUE; |
|
956 } |
|
957 |
|
958 /* call the enumerator function with this first character */ |
|
959 if(!fn(context, start, nameChoice, buffer, length)) { |
|
960 return FALSE; |
|
961 } |
|
962 |
|
963 /* go to the end of the name; all these names have the same length */ |
|
964 end=buffer; |
|
965 while(*end!=0) { |
|
966 ++end; |
|
967 } |
|
968 |
|
969 /* enumerate the rest of the names */ |
|
970 while(++start<limit) { |
|
971 /* increment the hexadecimal number on a character-basis */ |
|
972 s=end; |
|
973 for (;;) { |
|
974 c=*--s; |
|
975 if(('0'<=c && c<'9') || ('A'<=c && c<'F')) { |
|
976 *s=(char)(c+1); |
|
977 break; |
|
978 } else if(c=='9') { |
|
979 *s='A'; |
|
980 break; |
|
981 } else if(c=='F') { |
|
982 *s='0'; |
|
983 } |
|
984 } |
|
985 |
|
986 if(!fn(context, start, nameChoice, buffer, length)) { |
|
987 return FALSE; |
|
988 } |
|
989 } |
|
990 break; |
|
991 } |
|
992 case 1: { |
|
993 uint16_t indexes[8]; |
|
994 const char *elementBases[8], *elements[8]; |
|
995 const uint16_t *factors=(const uint16_t *)(range+1); |
|
996 uint16_t count=range->variant; |
|
997 const char *s=(const char *)(factors+count); |
|
998 char *suffix, *t; |
|
999 uint16_t prefixLength, i, idx; |
|
1000 |
|
1001 char c; |
|
1002 |
|
1003 /* name = prefix factorized-elements */ |
|
1004 |
|
1005 /* copy prefix */ |
|
1006 suffix=buffer; |
|
1007 prefixLength=0; |
|
1008 while((c=*s++)!=0) { |
|
1009 *suffix++=c; |
|
1010 ++prefixLength; |
|
1011 } |
|
1012 |
|
1013 /* append the suffix of the start character */ |
|
1014 length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count, |
|
1015 s, (uint32_t)start-range->start, |
|
1016 indexes, elementBases, elements, |
|
1017 suffix, (uint16_t)(sizeof(buffer)-prefixLength))); |
|
1018 |
|
1019 /* call the enumerator function with this first character */ |
|
1020 if(!fn(context, start, nameChoice, buffer, length)) { |
|
1021 return FALSE; |
|
1022 } |
|
1023 |
|
1024 /* enumerate the rest of the names */ |
|
1025 while(++start<limit) { |
|
1026 /* increment the indexes in lexical order bound by the factors */ |
|
1027 i=count; |
|
1028 for (;;) { |
|
1029 idx=(uint16_t)(indexes[--i]+1); |
|
1030 if(idx<factors[i]) { |
|
1031 /* skip one index and its element string */ |
|
1032 indexes[i]=idx; |
|
1033 s=elements[i]; |
|
1034 while(*s++!=0) { |
|
1035 } |
|
1036 elements[i]=s; |
|
1037 break; |
|
1038 } else { |
|
1039 /* reset this index to 0 and its element string to the first one */ |
|
1040 indexes[i]=0; |
|
1041 elements[i]=elementBases[i]; |
|
1042 } |
|
1043 } |
|
1044 |
|
1045 /* to make matters a little easier, just append all elements to the suffix */ |
|
1046 t=suffix; |
|
1047 length=prefixLength; |
|
1048 for(i=0; i<count; ++i) { |
|
1049 s=elements[i]; |
|
1050 while((c=*s++)!=0) { |
|
1051 *t++=c; |
|
1052 ++length; |
|
1053 } |
|
1054 } |
|
1055 /* zero-terminate */ |
|
1056 *t=0; |
|
1057 |
|
1058 if(!fn(context, start, nameChoice, buffer, length)) { |
|
1059 return FALSE; |
|
1060 } |
|
1061 } |
|
1062 break; |
|
1063 } |
|
1064 default: |
|
1065 /* undefined type */ |
|
1066 break; |
|
1067 } |
|
1068 |
|
1069 return TRUE; |
|
1070 } |
|
1071 |
|
1072 /* |
|
1073 * findAlgName() is almost the same as enumAlgNames() except that it |
|
1074 * returns the code point for a name if it fits into the range. |
|
1075 * It returns 0xffff otherwise. |
|
1076 */ |
|
1077 static UChar32 |
|
1078 findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) { |
|
1079 UChar32 code; |
|
1080 |
|
1081 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
|
1082 return 0xffff; |
|
1083 } |
|
1084 |
|
1085 switch(range->type) { |
|
1086 case 0: { |
|
1087 /* name = prefix hex-digits */ |
|
1088 const char *s=(const char *)(range+1); |
|
1089 char c; |
|
1090 |
|
1091 uint16_t i, count; |
|
1092 |
|
1093 /* compare prefix */ |
|
1094 while((c=*s++)!=0) { |
|
1095 if((char)c!=*otherName++) { |
|
1096 return 0xffff; |
|
1097 } |
|
1098 } |
|
1099 |
|
1100 /* read hexadecimal code point value */ |
|
1101 count=range->variant; |
|
1102 code=0; |
|
1103 for(i=0; i<count; ++i) { |
|
1104 c=*otherName++; |
|
1105 if('0'<=c && c<='9') { |
|
1106 code=(code<<4)|(c-'0'); |
|
1107 } else if('A'<=c && c<='F') { |
|
1108 code=(code<<4)|(c-'A'+10); |
|
1109 } else { |
|
1110 return 0xffff; |
|
1111 } |
|
1112 } |
|
1113 |
|
1114 /* does it fit into the range? */ |
|
1115 if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) { |
|
1116 return code; |
|
1117 } |
|
1118 break; |
|
1119 } |
|
1120 case 1: { |
|
1121 char buffer[64]; |
|
1122 uint16_t indexes[8]; |
|
1123 const char *elementBases[8], *elements[8]; |
|
1124 const uint16_t *factors=(const uint16_t *)(range+1); |
|
1125 uint16_t count=range->variant; |
|
1126 const char *s=(const char *)(factors+count), *t; |
|
1127 UChar32 start, limit; |
|
1128 uint16_t i, idx; |
|
1129 |
|
1130 char c; |
|
1131 |
|
1132 /* name = prefix factorized-elements */ |
|
1133 |
|
1134 /* compare prefix */ |
|
1135 while((c=*s++)!=0) { |
|
1136 if((char)c!=*otherName++) { |
|
1137 return 0xffff; |
|
1138 } |
|
1139 } |
|
1140 |
|
1141 start=(UChar32)range->start; |
|
1142 limit=(UChar32)(range->end+1); |
|
1143 |
|
1144 /* initialize the suffix elements for enumeration; indexes should all be set to 0 */ |
|
1145 writeFactorSuffix(factors, count, s, 0, |
|
1146 indexes, elementBases, elements, buffer, sizeof(buffer)); |
|
1147 |
|
1148 /* compare the first suffix */ |
|
1149 if(0==uprv_strcmp(otherName, buffer)) { |
|
1150 return start; |
|
1151 } |
|
1152 |
|
1153 /* enumerate and compare the rest of the suffixes */ |
|
1154 while(++start<limit) { |
|
1155 /* increment the indexes in lexical order bound by the factors */ |
|
1156 i=count; |
|
1157 for (;;) { |
|
1158 idx=(uint16_t)(indexes[--i]+1); |
|
1159 if(idx<factors[i]) { |
|
1160 /* skip one index and its element string */ |
|
1161 indexes[i]=idx; |
|
1162 s=elements[i]; |
|
1163 while(*s++!=0) {} |
|
1164 elements[i]=s; |
|
1165 break; |
|
1166 } else { |
|
1167 /* reset this index to 0 and its element string to the first one */ |
|
1168 indexes[i]=0; |
|
1169 elements[i]=elementBases[i]; |
|
1170 } |
|
1171 } |
|
1172 |
|
1173 /* to make matters a little easier, just compare all elements of the suffix */ |
|
1174 t=otherName; |
|
1175 for(i=0; i<count; ++i) { |
|
1176 s=elements[i]; |
|
1177 while((c=*s++)!=0) { |
|
1178 if(c!=*t++) { |
|
1179 s=""; /* does not match */ |
|
1180 i=99; |
|
1181 } |
|
1182 } |
|
1183 } |
|
1184 if(i<99 && *t==0) { |
|
1185 return start; |
|
1186 } |
|
1187 } |
|
1188 break; |
|
1189 } |
|
1190 default: |
|
1191 /* undefined type */ |
|
1192 break; |
|
1193 } |
|
1194 |
|
1195 return 0xffff; |
|
1196 } |
|
1197 |
|
1198 /* sets of name characters, maximum name lengths ---------------------------- */ |
|
1199 |
|
/*
 * A character set is an array of 8 uint32_t words (256 bits).
 * The low 8 bits of c select the bit: bits 7..5 pick the word, bits 4..0 the bit in it.
 * The argument c is parenthesized so that an expression argument such as
 * (x & 0xff) is converted as a whole; note that c is still evaluated twice,
 * so it must not have side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
|
1202 |
|
/*
 * Adds every character of the NUL-terminated string s to the set
 * and returns the length of s (not counting the NUL).
 */
static int32_t
calcStringSetLength(uint32_t set[8], const char *s) {
    const char *p;
    char ch;

    for(p=s; (ch=*p)!=0; ++p) {
        SET_ADD(set, ch);
    }
    return (int32_t)(p-s);
}
|
1214 |
|
1215 static int32_t |
|
1216 calcAlgNameSetsLengths(int32_t maxNameLength) { |
|
1217 AlgorithmicRange *range; |
|
1218 uint32_t *p; |
|
1219 uint32_t rangeCount; |
|
1220 int32_t length; |
|
1221 |
|
1222 /* enumerate algorithmic ranges */ |
|
1223 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); |
|
1224 rangeCount=*p; |
|
1225 range=(AlgorithmicRange *)(p+1); |
|
1226 while(rangeCount>0) { |
|
1227 switch(range->type) { |
|
1228 case 0: |
|
1229 /* name = prefix + (range->variant times) hex-digits */ |
|
1230 /* prefix */ |
|
1231 length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant; |
|
1232 if(length>maxNameLength) { |
|
1233 maxNameLength=length; |
|
1234 } |
|
1235 break; |
|
1236 case 1: { |
|
1237 /* name = prefix factorized-elements */ |
|
1238 const uint16_t *factors=(const uint16_t *)(range+1); |
|
1239 const char *s; |
|
1240 int32_t i, count=range->variant, factor, factorLength, maxFactorLength; |
|
1241 |
|
1242 /* prefix length */ |
|
1243 s=(const char *)(factors+count); |
|
1244 length=calcStringSetLength(gNameSet, s); |
|
1245 s+=length+1; /* start of factor suffixes */ |
|
1246 |
|
1247 /* get the set and maximum factor suffix length for each factor */ |
|
1248 for(i=0; i<count; ++i) { |
|
1249 maxFactorLength=0; |
|
1250 for(factor=factors[i]; factor>0; --factor) { |
|
1251 factorLength=calcStringSetLength(gNameSet, s); |
|
1252 s+=factorLength+1; |
|
1253 if(factorLength>maxFactorLength) { |
|
1254 maxFactorLength=factorLength; |
|
1255 } |
|
1256 } |
|
1257 length+=maxFactorLength; |
|
1258 } |
|
1259 |
|
1260 if(length>maxNameLength) { |
|
1261 maxNameLength=length; |
|
1262 } |
|
1263 break; |
|
1264 } |
|
1265 default: |
|
1266 /* unknown type */ |
|
1267 break; |
|
1268 } |
|
1269 |
|
1270 range=(AlgorithmicRange *)((uint8_t *)range+range->size); |
|
1271 --rangeCount; |
|
1272 } |
|
1273 return maxNameLength; |
|
1274 } |
|
1275 |
|
1276 static int32_t |
|
1277 calcExtNameSetsLengths(int32_t maxNameLength) { |
|
1278 int32_t i, length; |
|
1279 |
|
1280 for(i=0; i<LENGTHOF(charCatNames); ++i) { |
|
1281 /* |
|
1282 * for each category, count the length of the category name |
|
1283 * plus 9= |
|
1284 * 2 for <> |
|
1285 * 1 for - |
|
1286 * 6 for most hex digits per code point |
|
1287 */ |
|
1288 length=9+calcStringSetLength(gNameSet, charCatNames[i]); |
|
1289 if(length>maxNameLength) { |
|
1290 maxNameLength=length; |
|
1291 } |
|
1292 } |
|
1293 return maxNameLength; |
|
1294 } |
|
1295 |
|
/*
 * Measures one compressed name field of a group line.
 * Reading starts at *pLine and stops at lineLimit or behind the next ';'.
 * All characters of the expanded name are added to the set.
 * If tokenLengths is not NULL it caches the lengths of token words,
 * where an entry of 0 means "not measured yet".
 *
 * On return *pLine points to where reading stopped.
 * Returns the length of the expanded name.
 */
static int32_t
calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
                  uint32_t set[8],
                  const uint8_t **pLine, const uint8_t *lineLimit) {
    const uint8_t *cursor=*pLine;
    int32_t total=0, wordLength;
    uint16_t b, token;

    for(;;) {
        if(cursor==lineLimit) {
            break;
        }
        b=*cursor++;
        if(b==(uint8_t)';') {
            break;
        }

        if(b>=tokenCount) {
            /* implicit letter */
            SET_ADD(set, b);
            ++total;
            continue;
        }

        token=tokens[b];
        if(token==(uint16_t)(-2)) {
            /* lead byte of a double-byte token: combine it with the trail byte */
            b=(b<<8)|*cursor++;
            token=tokens[b];
        }
        if(token==(uint16_t)(-1)) {
            /* explicit letter */
            SET_ADD(set, b);
            ++total;
            continue;
        }

        /* a token word */
        if(tokenLengths==NULL) {
            /* no cache available: measure the word every time */
            total+=calcStringSetLength(set, (const char *)tokenStrings+token);
            continue;
        }
        wordLength=tokenLengths[b];
        if(wordLength==0) {
            /* first use of this token: measure it and remember the result */
            wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
            tokenLengths[b]=(int8_t)wordLength;
        }
        total+=wordLength;
    }

    *pLine=cursor;
    return total;
}
|
1340 |
|
1341 static void |
|
1342 calcGroupNameSetsLengths(int32_t maxNameLength) { |
|
1343 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; |
|
1344 |
|
1345 uint16_t *tokens=(uint16_t *)uCharNames+8; |
|
1346 uint16_t tokenCount=*tokens++; |
|
1347 uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset; |
|
1348 |
|
1349 int8_t *tokenLengths; |
|
1350 |
|
1351 const uint16_t *group; |
|
1352 const uint8_t *s, *line, *lineLimit; |
|
1353 |
|
1354 int32_t groupCount, lineNumber, length; |
|
1355 |
|
1356 tokenLengths=(int8_t *)uprv_malloc(tokenCount); |
|
1357 if(tokenLengths!=NULL) { |
|
1358 uprv_memset(tokenLengths, 0, tokenCount); |
|
1359 } |
|
1360 |
|
1361 group=GET_GROUPS(uCharNames); |
|
1362 groupCount=*group++; |
|
1363 |
|
1364 /* enumerate all groups */ |
|
1365 while(groupCount>0) { |
|
1366 s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group); |
|
1367 s=expandGroupLengths(s, offsets, lengths); |
|
1368 |
|
1369 /* enumerate all lines in each group */ |
|
1370 for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) { |
|
1371 line=s+offsets[lineNumber]; |
|
1372 length=lengths[lineNumber]; |
|
1373 if(length==0) { |
|
1374 continue; |
|
1375 } |
|
1376 |
|
1377 lineLimit=line+length; |
|
1378 |
|
1379 /* read regular name */ |
|
1380 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); |
|
1381 if(length>maxNameLength) { |
|
1382 maxNameLength=length; |
|
1383 } |
|
1384 if(line==lineLimit) { |
|
1385 continue; |
|
1386 } |
|
1387 |
|
1388 /* read Unicode 1.0 name */ |
|
1389 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); |
|
1390 if(length>maxNameLength) { |
|
1391 maxNameLength=length; |
|
1392 } |
|
1393 if(line==lineLimit) { |
|
1394 continue; |
|
1395 } |
|
1396 |
|
1397 /* read ISO comment */ |
|
1398 /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/ |
|
1399 } |
|
1400 |
|
1401 group=NEXT_GROUP(group); |
|
1402 --groupCount; |
|
1403 } |
|
1404 |
|
1405 if(tokenLengths!=NULL) { |
|
1406 uprv_free(tokenLengths); |
|
1407 } |
|
1408 |
|
1409 /* set gMax... - name length last for threading */ |
|
1410 gMaxNameLength=maxNameLength; |
|
1411 } |
|
1412 |
|
1413 static UBool |
|
1414 calcNameSetsLengths(UErrorCode *pErrorCode) { |
|
1415 static const char extChars[]="0123456789ABCDEF<>-"; |
|
1416 int32_t i, maxNameLength; |
|
1417 |
|
1418 if(gMaxNameLength!=0) { |
|
1419 return TRUE; |
|
1420 } |
|
1421 |
|
1422 if(!isDataLoaded(pErrorCode)) { |
|
1423 return FALSE; |
|
1424 } |
|
1425 |
|
1426 /* set hex digits, used in various names, and <>-, used in extended names */ |
|
1427 for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) { |
|
1428 SET_ADD(gNameSet, extChars[i]); |
|
1429 } |
|
1430 |
|
1431 /* set sets and lengths from algorithmic names */ |
|
1432 maxNameLength=calcAlgNameSetsLengths(0); |
|
1433 |
|
1434 /* set sets and lengths from extended names */ |
|
1435 maxNameLength=calcExtNameSetsLengths(maxNameLength); |
|
1436 |
|
1437 /* set sets and lengths from group names, set global maximum values */ |
|
1438 calcGroupNameSetsLengths(maxNameLength); |
|
1439 |
|
1440 return TRUE; |
|
1441 } |
|
1442 |
|
1443 /* public API --------------------------------------------------------------- */ |
|
1444 |
|
1445 U_CAPI int32_t U_EXPORT2 |
|
1446 u_charName(UChar32 code, UCharNameChoice nameChoice, |
|
1447 char *buffer, int32_t bufferLength, |
|
1448 UErrorCode *pErrorCode) { |
|
1449 AlgorithmicRange *algRange; |
|
1450 uint32_t *p; |
|
1451 uint32_t i; |
|
1452 int32_t length; |
|
1453 |
|
1454 /* check the argument values */ |
|
1455 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
|
1456 return 0; |
|
1457 } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || |
|
1458 bufferLength<0 || (bufferLength>0 && buffer==NULL) |
|
1459 ) { |
|
1460 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
1461 return 0; |
|
1462 } |
|
1463 |
|
1464 if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) { |
|
1465 return u_terminateChars(buffer, bufferLength, 0, pErrorCode); |
|
1466 } |
|
1467 |
|
1468 length=0; |
|
1469 |
|
1470 /* try algorithmic names first */ |
|
1471 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); |
|
1472 i=*p; |
|
1473 algRange=(AlgorithmicRange *)(p+1); |
|
1474 while(i>0) { |
|
1475 if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) { |
|
1476 length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); |
|
1477 break; |
|
1478 } |
|
1479 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); |
|
1480 --i; |
|
1481 } |
|
1482 |
|
1483 if(i==0) { |
|
1484 if (nameChoice == U_EXTENDED_CHAR_NAME) { |
|
1485 length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength); |
|
1486 if (!length) { |
|
1487 /* extended character name */ |
|
1488 length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength); |
|
1489 } |
|
1490 } else { |
|
1491 /* normal character name */ |
|
1492 length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); |
|
1493 } |
|
1494 } |
|
1495 |
|
1496 return u_terminateChars(buffer, bufferLength, length, pErrorCode); |
|
1497 } |
|
1498 |
|
1499 U_CAPI int32_t U_EXPORT2 |
|
1500 u_getISOComment(UChar32 /*c*/, |
|
1501 char *dest, int32_t destCapacity, |
|
1502 UErrorCode *pErrorCode) { |
|
1503 /* check the argument values */ |
|
1504 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
|
1505 return 0; |
|
1506 } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) { |
|
1507 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
1508 return 0; |
|
1509 } |
|
1510 |
|
1511 return u_terminateChars(dest, destCapacity, 0, pErrorCode); |
|
1512 } |
|
1513 |
|
1514 U_CAPI UChar32 U_EXPORT2 |
|
1515 u_charFromName(UCharNameChoice nameChoice, |
|
1516 const char *name, |
|
1517 UErrorCode *pErrorCode) { |
|
1518 char upper[120], lower[120]; |
|
1519 FindName findName; |
|
1520 AlgorithmicRange *algRange; |
|
1521 uint32_t *p; |
|
1522 uint32_t i; |
|
1523 UChar32 cp = 0; |
|
1524 char c0; |
|
1525 UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */ |
|
1526 |
|
1527 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
|
1528 return error; |
|
1529 } |
|
1530 |
|
1531 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) { |
|
1532 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
1533 return error; |
|
1534 } |
|
1535 |
|
1536 if(!isDataLoaded(pErrorCode)) { |
|
1537 return error; |
|
1538 } |
|
1539 |
|
1540 /* construct the uppercase and lowercase of the name first */ |
|
1541 for(i=0; i<sizeof(upper); ++i) { |
|
1542 if((c0=*name++)!=0) { |
|
1543 upper[i]=uprv_toupper(c0); |
|
1544 lower[i]=uprv_tolower(c0); |
|
1545 } else { |
|
1546 upper[i]=lower[i]=0; |
|
1547 break; |
|
1548 } |
|
1549 } |
|
1550 if(i==sizeof(upper)) { |
|
1551 /* name too long, there is no such character */ |
|
1552 *pErrorCode = U_ILLEGAL_CHAR_FOUND; |
|
1553 return error; |
|
1554 } |
|
1555 |
|
1556 /* try extended names first */ |
|
1557 if (lower[0] == '<') { |
|
1558 if (nameChoice == U_EXTENDED_CHAR_NAME) { |
|
1559 if (lower[--i] == '>') { |
|
1560 for (--i; lower[i] && lower[i] != '-'; --i) { |
|
1561 } |
|
1562 |
|
1563 if (lower[i] == '-') { /* We've got a category. */ |
|
1564 uint32_t cIdx; |
|
1565 |
|
1566 lower[i] = 0; |
|
1567 |
|
1568 for (++i; lower[i] != '>'; ++i) { |
|
1569 if (lower[i] >= '0' && lower[i] <= '9') { |
|
1570 cp = (cp << 4) + lower[i] - '0'; |
|
1571 } else if (lower[i] >= 'a' && lower[i] <= 'f') { |
|
1572 cp = (cp << 4) + lower[i] - 'a' + 10; |
|
1573 } else { |
|
1574 *pErrorCode = U_ILLEGAL_CHAR_FOUND; |
|
1575 return error; |
|
1576 } |
|
1577 } |
|
1578 |
|
1579 /* Now validate the category name. |
|
1580 We could use a binary search, or a trie, if |
|
1581 we really wanted to. */ |
|
1582 |
|
1583 for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) { |
|
1584 |
|
1585 if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) { |
|
1586 if (getCharCat(cp) == cIdx) { |
|
1587 return cp; |
|
1588 } |
|
1589 break; |
|
1590 } |
|
1591 } |
|
1592 } |
|
1593 } |
|
1594 } |
|
1595 |
|
1596 *pErrorCode = U_ILLEGAL_CHAR_FOUND; |
|
1597 return error; |
|
1598 } |
|
1599 |
|
1600 /* try algorithmic names now */ |
|
1601 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); |
|
1602 i=*p; |
|
1603 algRange=(AlgorithmicRange *)(p+1); |
|
1604 while(i>0) { |
|
1605 if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) { |
|
1606 return cp; |
|
1607 } |
|
1608 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); |
|
1609 --i; |
|
1610 } |
|
1611 |
|
1612 /* normal character name */ |
|
1613 findName.otherName=upper; |
|
1614 findName.code=error; |
|
1615 enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice); |
|
1616 if (findName.code == error) { |
|
1617 *pErrorCode = U_ILLEGAL_CHAR_FOUND; |
|
1618 } |
|
1619 return findName.code; |
|
1620 } |
|
1621 |
|
1622 U_CAPI void U_EXPORT2 |
|
1623 u_enumCharNames(UChar32 start, UChar32 limit, |
|
1624 UEnumCharNamesFn *fn, |
|
1625 void *context, |
|
1626 UCharNameChoice nameChoice, |
|
1627 UErrorCode *pErrorCode) { |
|
1628 AlgorithmicRange *algRange; |
|
1629 uint32_t *p; |
|
1630 uint32_t i; |
|
1631 |
|
1632 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
|
1633 return; |
|
1634 } |
|
1635 |
|
1636 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) { |
|
1637 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
1638 return; |
|
1639 } |
|
1640 |
|
1641 if((uint32_t) limit > UCHAR_MAX_VALUE + 1) { |
|
1642 limit = UCHAR_MAX_VALUE + 1; |
|
1643 } |
|
1644 if((uint32_t)start>=(uint32_t)limit) { |
|
1645 return; |
|
1646 } |
|
1647 |
|
1648 if(!isDataLoaded(pErrorCode)) { |
|
1649 return; |
|
1650 } |
|
1651 |
|
1652 /* interleave the data-driven ones with the algorithmic ones */ |
|
1653 /* iterate over all algorithmic ranges; assume that they are in ascending order */ |
|
1654 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); |
|
1655 i=*p; |
|
1656 algRange=(AlgorithmicRange *)(p+1); |
|
1657 while(i>0) { |
|
1658 /* enumerate the character names before the current algorithmic range */ |
|
1659 /* here: start<limit */ |
|
1660 if((uint32_t)start<algRange->start) { |
|
1661 if((uint32_t)limit<=algRange->start) { |
|
1662 enumNames(uCharNames, start, limit, fn, context, nameChoice); |
|
1663 return; |
|
1664 } |
|
1665 if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) { |
|
1666 return; |
|
1667 } |
|
1668 start=(UChar32)algRange->start; |
|
1669 } |
|
1670 /* enumerate the character names in the current algorithmic range */ |
|
1671 /* here: algRange->start<=start<limit */ |
|
1672 if((uint32_t)start<=algRange->end) { |
|
1673 if((uint32_t)limit<=(algRange->end+1)) { |
|
1674 enumAlgNames(algRange, start, limit, fn, context, nameChoice); |
|
1675 return; |
|
1676 } |
|
1677 if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) { |
|
1678 return; |
|
1679 } |
|
1680 start=(UChar32)algRange->end+1; |
|
1681 } |
|
1682 /* continue to the next algorithmic range (here: start<limit) */ |
|
1683 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); |
|
1684 --i; |
|
1685 } |
|
1686 /* enumerate the character names after the last algorithmic range */ |
|
1687 enumNames(uCharNames, start, limit, fn, context, nameChoice); |
|
1688 } |
|
1689 |
|
1690 U_CAPI int32_t U_EXPORT2 |
|
1691 uprv_getMaxCharNameLength() { |
|
1692 UErrorCode errorCode=U_ZERO_ERROR; |
|
1693 if(calcNameSetsLengths(&errorCode)) { |
|
1694 return gMaxNameLength; |
|
1695 } else { |
|
1696 return 0; |
|
1697 } |
|
1698 } |
|
1699 |
|
1700 /** |
|
1701 * Converts the char set cset into a Unicode set uset. |
|
1702 * @param cset Set of 256 bit flags corresponding to a set of chars. |
|
1703 * @param uset USet to receive characters. Existing contents are deleted. |
|
1704 */ |
|
1705 static void |
|
1706 charSetToUSet(uint32_t cset[8], const USetAdder *sa) { |
|
1707 UChar us[256]; |
|
1708 char cs[256]; |
|
1709 |
|
1710 int32_t i, length; |
|
1711 UErrorCode errorCode; |
|
1712 |
|
1713 errorCode=U_ZERO_ERROR; |
|
1714 |
|
1715 if(!calcNameSetsLengths(&errorCode)) { |
|
1716 return; |
|
1717 } |
|
1718 |
|
1719 /* build a char string with all chars that are used in character names */ |
|
1720 length=0; |
|
1721 for(i=0; i<256; ++i) { |
|
1722 if(SET_CONTAINS(cset, i)) { |
|
1723 cs[length++]=(char)i; |
|
1724 } |
|
1725 } |
|
1726 |
|
1727 /* convert the char string to a UChar string */ |
|
1728 u_charsToUChars(cs, us, length); |
|
1729 |
|
1730 /* add each UChar to the USet */ |
|
1731 for(i=0; i<length; ++i) { |
|
1732 if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */ |
|
1733 sa->add(sa->set, us[i]); |
|
1734 } |
|
1735 } |
|
1736 } |
|
1737 |
|
1738 /** |
|
1739 * Fills set with characters that are used in Unicode character names. |
|
1740 * @param set USet to receive characters. |
|
1741 */ |
|
1742 U_CAPI void U_EXPORT2 |
|
1743 uprv_getCharNameCharacters(const USetAdder *sa) { |
|
1744 charSetToUSet(gNameSet, sa); |
|
1745 } |
|
1746 |
|
1747 /* data swapping ------------------------------------------------------------ */ |
|
1748 |
|
1749 /* |
|
1750 * The token table contains non-negative entries for token bytes, |
|
1751 * and -1 for bytes that represent themselves in the data file's charset. |
|
1752 * -2 entries are used for lead bytes. |
|
1753 * |
|
1754 * Direct bytes (-1 entries) must be translated from the input charset family |
|
1755 * to the output charset family. |
|
1756 * makeTokenMap() writes a permutation mapping for this. |
|
1757 * Use it once for single-/lead-byte tokens and once more for all trail byte |
|
1758 * tokens. (';' is an unused trail byte marked with -1.) |
|
1759 */ |
|
1760 static void |
|
1761 makeTokenMap(const UDataSwapper *ds, |
|
1762 int16_t tokens[], uint16_t tokenCount, |
|
1763 uint8_t map[256], |
|
1764 UErrorCode *pErrorCode) { |
|
1765 UBool usedOutChar[256]; |
|
1766 uint16_t i, j; |
|
1767 uint8_t c1, c2; |
|
1768 |
|
1769 if(U_FAILURE(*pErrorCode)) { |
|
1770 return; |
|
1771 } |
|
1772 |
|
1773 if(ds->inCharset==ds->outCharset) { |
|
1774 /* Same charset family: identity permutation */ |
|
1775 for(i=0; i<256; ++i) { |
|
1776 map[i]=(uint8_t)i; |
|
1777 } |
|
1778 } else { |
|
1779 uprv_memset(map, 0, 256); |
|
1780 uprv_memset(usedOutChar, 0, 256); |
|
1781 |
|
1782 if(tokenCount>256) { |
|
1783 tokenCount=256; |
|
1784 } |
|
1785 |
|
1786 /* set the direct bytes (byte 0 always maps to itself) */ |
|
1787 for(i=1; i<tokenCount; ++i) { |
|
1788 if(tokens[i]==-1) { |
|
1789 /* convert the direct byte character */ |
|
1790 c1=(uint8_t)i; |
|
1791 ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode); |
|
1792 if(U_FAILURE(*pErrorCode)) { |
|
1793 udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n", |
|
1794 i, ds->inCharset); |
|
1795 return; |
|
1796 } |
|
1797 |
|
1798 /* enter the converted character into the map and mark it used */ |
|
1799 map[c1]=c2; |
|
1800 usedOutChar[c2]=TRUE; |
|
1801 } |
|
1802 } |
|
1803 |
|
1804 /* set the mappings for the rest of the permutation */ |
|
1805 for(i=j=1; i<tokenCount; ++i) { |
|
1806 /* set mappings that were not set for direct bytes */ |
|
1807 if(map[i]==0) { |
|
1808 /* set an output byte value that was not used as an output byte above */ |
|
1809 while(usedOutChar[j]) { |
|
1810 ++j; |
|
1811 } |
|
1812 map[i]=(uint8_t)j++; |
|
1813 } |
|
1814 } |
|
1815 |
|
1816 /* |
|
1817 * leave mappings at tokenCount and above unset if tokenCount<256 |
|
1818 * because they won't be used |
|
1819 */ |
|
1820 } |
|
1821 } |
|
1822 |
|
1823 U_CAPI int32_t U_EXPORT2 |
|
1824 uchar_swapNames(const UDataSwapper *ds, |
|
1825 const void *inData, int32_t length, void *outData, |
|
1826 UErrorCode *pErrorCode) { |
|
1827 const UDataInfo *pInfo; |
|
1828 int32_t headerSize; |
|
1829 |
|
1830 const uint8_t *inBytes; |
|
1831 uint8_t *outBytes; |
|
1832 |
|
1833 uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset, |
|
1834 offset, i, count, stringsCount; |
|
1835 |
|
1836 const AlgorithmicRange *inRange; |
|
1837 AlgorithmicRange *outRange; |
|
1838 |
|
1839 /* udata_swapDataHeader checks the arguments */ |
|
1840 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); |
|
1841 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
|
1842 return 0; |
|
1843 } |
|
1844 |
|
1845 /* check data format and format version */ |
|
1846 pInfo=(const UDataInfo *)((const char *)inData+4); |
|
1847 if(!( |
|
1848 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ |
|
1849 pInfo->dataFormat[1]==0x6e && |
|
1850 pInfo->dataFormat[2]==0x61 && |
|
1851 pInfo->dataFormat[3]==0x6d && |
|
1852 pInfo->formatVersion[0]==1 |
|
1853 )) { |
|
1854 udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n", |
|
1855 pInfo->dataFormat[0], pInfo->dataFormat[1], |
|
1856 pInfo->dataFormat[2], pInfo->dataFormat[3], |
|
1857 pInfo->formatVersion[0]); |
|
1858 *pErrorCode=U_UNSUPPORTED_ERROR; |
|
1859 return 0; |
|
1860 } |
|
1861 |
|
1862 inBytes=(const uint8_t *)inData+headerSize; |
|
1863 outBytes=(uint8_t *)outData+headerSize; |
|
1864 if(length<0) { |
|
1865 algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]); |
|
1866 } else { |
|
1867 length-=headerSize; |
|
1868 if( length<20 || |
|
1869 (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3])) |
|
1870 ) { |
|
1871 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n", |
|
1872 length); |
|
1873 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
1874 return 0; |
|
1875 } |
|
1876 } |
|
1877 |
|
1878 if(length<0) { |
|
1879 /* preflighting: iterate through algorithmic ranges */ |
|
1880 offset=algNamesOffset; |
|
1881 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); |
|
1882 offset+=4; |
|
1883 |
|
1884 for(i=0; i<count; ++i) { |
|
1885 inRange=(const AlgorithmicRange *)(inBytes+offset); |
|
1886 offset+=ds->readUInt16(inRange->size); |
|
1887 } |
|
1888 } else { |
|
1889 /* swap data */ |
|
1890 const uint16_t *p; |
|
1891 uint16_t *q, *temp; |
|
1892 |
|
1893 int16_t tokens[512]; |
|
1894 uint16_t tokenCount; |
|
1895 |
|
1896 uint8_t map[256], trailMap[256]; |
|
1897 |
|
1898 /* copy the data for inaccessible bytes */ |
|
1899 if(inBytes!=outBytes) { |
|
1900 uprv_memcpy(outBytes, inBytes, length); |
|
1901 } |
|
1902 |
|
1903 /* the initial 4 offsets first */ |
|
1904 tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]); |
|
1905 groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]); |
|
1906 groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]); |
|
1907 ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode); |
|
1908 |
|
1909 /* |
|
1910 * now the tokens table |
|
1911 * it needs to be permutated along with the compressed name strings |
|
1912 */ |
|
1913 p=(const uint16_t *)(inBytes+16); |
|
1914 q=(uint16_t *)(outBytes+16); |
|
1915 |
|
1916 /* read and swap the tokenCount */ |
|
1917 tokenCount=ds->readUInt16(*p); |
|
1918 ds->swapArray16(ds, p, 2, q, pErrorCode); |
|
1919 ++p; |
|
1920 ++q; |
|
1921 |
|
1922 /* read the first 512 tokens and make the token maps */ |
|
1923 if(tokenCount<=512) { |
|
1924 count=tokenCount; |
|
1925 } else { |
|
1926 count=512; |
|
1927 } |
|
1928 for(i=0; i<count; ++i) { |
|
1929 tokens[i]=udata_readInt16(ds, p[i]); |
|
1930 } |
|
1931 for(; i<512; ++i) { |
|
1932 tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */ |
|
1933 } |
|
1934 makeTokenMap(ds, tokens, tokenCount, map, pErrorCode); |
|
1935 makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode); |
|
1936 if(U_FAILURE(*pErrorCode)) { |
|
1937 return 0; |
|
1938 } |
|
1939 |
|
1940 /* |
|
1941 * swap and permutate the tokens |
|
1942 * go through a temporary array to support in-place swapping |
|
1943 */ |
|
1944 temp=(uint16_t *)uprv_malloc(tokenCount*2); |
|
1945 if(temp==NULL) { |
|
1946 udata_printError(ds, "out of memory swapping %u unames.icu tokens\n", |
|
1947 tokenCount); |
|
1948 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
|
1949 return 0; |
|
1950 } |
|
1951 |
|
1952 /* swap and permutate single-/lead-byte tokens */ |
|
1953 for(i=0; i<tokenCount && i<256; ++i) { |
|
1954 ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode); |
|
1955 } |
|
1956 |
|
1957 /* swap and permutate trail-byte tokens */ |
|
1958 for(; i<tokenCount; ++i) { |
|
1959 ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode); |
|
1960 } |
|
1961 |
|
1962 /* copy the result into the output and free the temporary array */ |
|
1963 uprv_memcpy(q, temp, tokenCount*2); |
|
1964 uprv_free(temp); |
|
1965 |
|
1966 /* |
|
1967 * swap the token strings but not a possible padding byte after |
|
1968 * the terminating NUL of the last string |
|
1969 */ |
|
1970 udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset), |
|
1971 outBytes+tokenStringOffset, pErrorCode); |
|
1972 if(U_FAILURE(*pErrorCode)) { |
|
1973 udata_printError(ds, "uchar_swapNames(token strings) failed\n"); |
|
1974 return 0; |
|
1975 } |
|
1976 |
|
1977 /* swap the group table */ |
|
1978 count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset))); |
|
1979 ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2), |
|
1980 outBytes+groupsOffset, pErrorCode); |
|
1981 |
|
1982 /* |
|
1983 * swap the group strings |
|
1984 * swap the string bytes but not the nibble-encoded string lengths |
|
1985 */ |
|
1986 if(ds->inCharset!=ds->outCharset) { |
|
1987 uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1]; |
|
1988 |
|
1989 const uint8_t *inStrings, *nextInStrings; |
|
1990 uint8_t *outStrings; |
|
1991 |
|
1992 uint8_t c; |
|
1993 |
|
1994 inStrings=inBytes+groupStringOffset; |
|
1995 outStrings=outBytes+groupStringOffset; |
|
1996 |
|
1997 stringsCount=algNamesOffset-groupStringOffset; |
|
1998 |
|
1999 /* iterate through string groups until only a few padding bytes are left */ |
|
2000 while(stringsCount>32) { |
|
2001 nextInStrings=expandGroupLengths(inStrings, offsets, lengths); |
|
2002 |
|
2003 /* move past the length bytes */ |
|
2004 stringsCount-=(uint32_t)(nextInStrings-inStrings); |
|
2005 outStrings+=nextInStrings-inStrings; |
|
2006 inStrings=nextInStrings; |
|
2007 |
|
2008 count=offsets[31]+lengths[31]; /* total number of string bytes in this group */ |
|
2009 stringsCount-=count; |
|
2010 |
|
2011 /* swap the string bytes using map[] and trailMap[] */ |
|
2012 while(count>0) { |
|
2013 c=*inStrings++; |
|
2014 *outStrings++=map[c]; |
|
2015 if(tokens[c]!=-2) { |
|
2016 --count; |
|
2017 } else { |
|
2018 /* token lead byte: swap the trail byte, too */ |
|
2019 *outStrings++=trailMap[*inStrings++]; |
|
2020 count-=2; |
|
2021 } |
|
2022 } |
|
2023 } |
|
2024 } |
|
2025 |
|
2026 /* swap the algorithmic ranges */ |
|
2027 offset=algNamesOffset; |
|
2028 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); |
|
2029 ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode); |
|
2030 offset+=4; |
|
2031 |
|
2032 for(i=0; i<count; ++i) { |
|
2033 if(offset>(uint32_t)length) { |
|
2034 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n", |
|
2035 length, i); |
|
2036 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
2037 return 0; |
|
2038 } |
|
2039 |
|
2040 inRange=(const AlgorithmicRange *)(inBytes+offset); |
|
2041 outRange=(AlgorithmicRange *)(outBytes+offset); |
|
2042 offset+=ds->readUInt16(inRange->size); |
|
2043 |
|
2044 ds->swapArray32(ds, inRange, 8, outRange, pErrorCode); |
|
2045 ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode); |
|
2046 switch(inRange->type) { |
|
2047 case 0: |
|
2048 /* swap prefix string */ |
|
2049 ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)), |
|
2050 outRange+1, pErrorCode); |
|
2051 if(U_FAILURE(*pErrorCode)) { |
|
2052 udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n", |
|
2053 i); |
|
2054 return 0; |
|
2055 } |
|
2056 break; |
|
2057 case 1: |
|
2058 { |
|
2059 /* swap factors and the prefix and factor strings */ |
|
2060 uint32_t factorsCount; |
|
2061 |
|
2062 factorsCount=inRange->variant; |
|
2063 p=(const uint16_t *)(inRange+1); |
|
2064 q=(uint16_t *)(outRange+1); |
|
2065 ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode); |
|
2066 |
|
2067 /* swap the strings, up to the last terminating NUL */ |
|
2068 p+=factorsCount; |
|
2069 q+=factorsCount; |
|
2070 stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p); |
|
2071 while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) { |
|
2072 --stringsCount; |
|
2073 } |
|
2074 ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode); |
|
2075 } |
|
2076 break; |
|
2077 default: |
|
2078 udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n", |
|
2079 inRange->type, i); |
|
2080 *pErrorCode=U_UNSUPPORTED_ERROR; |
|
2081 return 0; |
|
2082 } |
|
2083 } |
|
2084 } |
|
2085 |
|
2086 return headerSize+(int32_t)offset; |
|
2087 } |
|
2088 |
|
2089 U_NAMESPACE_END |
|
2090 |
|
2091 /* |
|
2092 * Hey, Emacs, please set the following: |
|
2093 * |
|
2094 * Local Variables: |
|
2095 * indent-tabs-mode: nil |
|
2096 * End: |
|
2097 * |
|
2098 */ |