1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/unames.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,2098 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* 1.7 +* Copyright (C) 1999-2013, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +****************************************************************************** 1.11 +* file name: unames.c 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 1999oct04 1.17 +* created by: Markus W. Scherer 1.18 +*/ 1.19 + 1.20 +#include "unicode/utypes.h" 1.21 +#include "unicode/putil.h" 1.22 +#include "unicode/uchar.h" 1.23 +#include "unicode/udata.h" 1.24 +#include "unicode/utf.h" 1.25 +#include "unicode/utf16.h" 1.26 +#include "uassert.h" 1.27 +#include "ustr_imp.h" 1.28 +#include "umutex.h" 1.29 +#include "cmemory.h" 1.30 +#include "cstring.h" 1.31 +#include "ucln_cmn.h" 1.32 +#include "udataswp.h" 1.33 +#include "uprops.h" 1.34 + 1.35 +U_NAMESPACE_BEGIN 1.36 + 1.37 +/* prototypes ------------------------------------------------------------- */ 1.38 + 1.39 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.40 + 1.41 +static const char DATA_NAME[] = "unames"; 1.42 +static const char DATA_TYPE[] = "icu"; 1.43 + 1.44 +#define GROUP_SHIFT 5 1.45 +#define LINES_PER_GROUP (1L<<GROUP_SHIFT) 1.46 +#define GROUP_MASK (LINES_PER_GROUP-1) 1.47 + 1.48 +/* 1.49 + * This struct was replaced by explicitly accessing equivalent 1.50 + * fields from triples of uint16_t. 1.51 + * The Group struct was padded to 8 bytes on compilers for early ARM CPUs, 1.52 + * which broke the assumption that sizeof(Group)==6 and that the ++ operator 1.53 + * would advance by 6 bytes (3 uint16_t). 1.54 + * 1.55 + * We can't just change the data structure because it's loaded from a data file, 1.56 + * and we don't want to make it less compact, so we changed the access code. 1.57 + * 1.58 + * For details see ICU tickets 6331 and 6008. 1.59 +typedef struct { 1.60 + uint16_t groupMSB, 1.61 + offsetHigh, offsetLow; / * avoid padding * / 1.62 +} Group; 1.63 + */ 1.64 +enum { 1.65 + GROUP_MSB, 1.66 + GROUP_OFFSET_HIGH, 1.67 + GROUP_OFFSET_LOW, 1.68 + GROUP_LENGTH 1.69 +}; 1.70 + 1.71 +/* 1.72 + * Get the 32-bit group offset. 1.73 + * @param group (const uint16_t *) pointer to a Group triple of uint16_t 1.74 + * @return group offset (int32_t) 1.75 + */ 1.76 +#define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW]) 1.77 + 1.78 +#define NEXT_GROUP(group) ((group)+GROUP_LENGTH) 1.79 +#define PREV_GROUP(group) ((group)-GROUP_LENGTH) 1.80 + 1.81 +typedef struct { 1.82 + uint32_t start, end; 1.83 + uint8_t type, variant; 1.84 + uint16_t size; 1.85 +} AlgorithmicRange; 1.86 + 1.87 +typedef struct { 1.88 + uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset; 1.89 +} UCharNames; 1.90 + 1.91 +/* 1.92 + * Get the groups table from a UCharNames struct. 1.93 + * The groups table consists of one uint16_t groupCount followed by 1.94 + * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH 1.95 + * and the comment for the old struct Group above. 1.96 + * 1.97 + * @param names (const UCharNames *) pointer to the UCharNames indexes 1.98 + * @return (const uint16_t *) pointer to the groups table 1.99 + */ 1.100 +#define GET_GROUPS(names) (const uint16_t *)((const char *)names+names->groupsOffset) 1.101 + 1.102 +typedef struct { 1.103 + const char *otherName; 1.104 + UChar32 code; 1.105 +} FindName; 1.106 + 1.107 +#define DO_FIND_NAME NULL 1.108 + 1.109 +static UDataMemory *uCharNamesData=NULL; 1.110 +static UCharNames *uCharNames=NULL; 1.111 +static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER; 1.112 + 1.113 +/* 1.114 + * Maximum length of character names (regular & 1.0). 1.115 + */ 1.116 +static int32_t gMaxNameLength=0; 1.117 + 1.118 +/* 1.119 + * Set of chars used in character names (regular & 1.0). 1.120 + * Chars are platform-dependent (can be EBCDIC). 1.121 + */ 1.122 +static uint32_t gNameSet[8]={ 0 }; 1.123 + 1.124 +#define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT 1.125 +#define U_LEAD_SURROGATE U_CHAR_CATEGORY_COUNT + 1 1.126 +#define U_TRAIL_SURROGATE U_CHAR_CATEGORY_COUNT + 2 1.127 + 1.128 +#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3) 1.129 + 1.130 +static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = { 1.131 + "unassigned", 1.132 + "uppercase letter", 1.133 + "lowercase letter", 1.134 + "titlecase letter", 1.135 + "modifier letter", 1.136 + "other letter", 1.137 + "non spacing mark", 1.138 + "enclosing mark", 1.139 + "combining spacing mark", 1.140 + "decimal digit number", 1.141 + "letter number", 1.142 + "other number", 1.143 + "space separator", 1.144 + "line separator", 1.145 + "paragraph separator", 1.146 + "control", 1.147 + "format", 1.148 + "private use area", 1.149 + "surrogate", 1.150 + "dash punctuation", 1.151 + "start punctuation", 1.152 + "end punctuation", 1.153 + "connector punctuation", 1.154 + "other punctuation", 1.155 + "math symbol", 1.156 + "currency symbol", 1.157 + "modifier symbol", 1.158 + "other symbol", 1.159 + "initial punctuation", 1.160 + "final punctuation", 1.161 + "noncharacter", 1.162 + "lead surrogate", 1.163 + "trail surrogate" 1.164 +}; 1.165 + 1.166 +/* implementation ----------------------------------------------------------- */ 1.167 + 1.168 +static UBool U_CALLCONV unames_cleanup(void) 1.169 +{ 1.170 + if(uCharNamesData) { 1.171 + udata_close(uCharNamesData); 1.172 + uCharNamesData = NULL; 1.173 + } 1.174 + if(uCharNames) { 1.175 + uCharNames = NULL; 1.176 + } 1.177 + gCharNamesInitOnce.reset(); 1.178 + gMaxNameLength=0; 1.179 + return TRUE; 1.180 +} 1.181 + 1.182 +static UBool U_CALLCONV 1.183 +isAcceptable(void * /*context*/, 1.184 + const char * /*type*/, const char * /*name*/, 1.185 + const UDataInfo *pInfo) { 1.186 + return (UBool)( 1.187 + pInfo->size>=20 && 1.188 + pInfo->isBigEndian==U_IS_BIG_ENDIAN && 1.189 + pInfo->charsetFamily==U_CHARSET_FAMILY && 1.190 + pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ 1.191 + pInfo->dataFormat[1]==0x6e && 1.192 + pInfo->dataFormat[2]==0x61 && 1.193 + pInfo->dataFormat[3]==0x6d && 1.194 + pInfo->formatVersion[0]==1); 1.195 +} 1.196 + 1.197 +static void U_CALLCONV 1.198 +loadCharNames(UErrorCode &status) { 1.199 + U_ASSERT(uCharNamesData == NULL); 1.200 + U_ASSERT(uCharNames == NULL); 1.201 + 1.202 + uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status); 1.203 + if(U_FAILURE(status)) { 1.204 + uCharNamesData = NULL; 1.205 + } else { 1.206 + uCharNames = (UCharNames *)udata_getMemory(uCharNamesData); 1.207 + } 1.208 + ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup); 1.209 +} 1.210 + 1.211 + 1.212 +static UBool 1.213 +isDataLoaded(UErrorCode *pErrorCode) { 1.214 + umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode); 1.215 + return U_SUCCESS(*pErrorCode); 1.216 +} 1.217 + 1.218 +#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \ 1.219 + if((bufferLength)>0) { \ 1.220 + *(buffer)++=c; \ 1.221 + --(bufferLength); \ 1.222 + } \ 1.223 + ++(bufferPos); \ 1.224 +} 1.225 + 1.226 +#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT 1.227 + 1.228 +/* 1.229 + * Important: expandName() and compareName() are almost the same - 1.230 + * apply fixes to both. 1.231 + * 1.232 + * UnicodeData.txt uses ';' as a field separator, so no 1.233 + * field can contain ';' as part of its contents. 1.234 + * In unames.dat, it is marked as token[';']==-1 only if the 1.235 + * semicolon is used in the data file - which is iff we 1.236 + * have Unicode 1.0 names or ISO comments or aliases. 1.237 + * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases 1.238 + * although we know that it will never be part of a name. 1.239 + */ 1.240 +static uint16_t 1.241 +expandName(UCharNames *names, 1.242 + const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, 1.243 + char *buffer, uint16_t bufferLength) { 1.244 + uint16_t *tokens=(uint16_t *)names+8; 1.245 + uint16_t token, tokenCount=*tokens++, bufferPos=0; 1.246 + uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; 1.247 + uint8_t c; 1.248 + 1.249 + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { 1.250 + /* 1.251 + * skip the modern name if it is not requested _and_ 1.252 + * if the semicolon byte value is a character, not a token number 1.253 + */ 1.254 + if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { 1.255 + int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; 1.256 + do { 1.257 + while(nameLength>0) { 1.258 + --nameLength; 1.259 + if(*name++==';') { 1.260 + break; 1.261 + } 1.262 + } 1.263 + } while(--fieldIndex>0); 1.264 + } else { 1.265 + /* 1.266 + * the semicolon byte value is a token number, therefore 1.267 + * only modern names are stored in unames.dat and there is no 1.268 + * such requested alternate name here 1.269 + */ 1.270 + nameLength=0; 1.271 + } 1.272 + } 1.273 + 1.274 + /* write each letter directly, and write a token word per token */ 1.275 + while(nameLength>0) { 1.276 + --nameLength; 1.277 + c=*name++; 1.278 + 1.279 + if(c>=tokenCount) { 1.280 + if(c!=';') { 1.281 + /* implicit letter */ 1.282 + WRITE_CHAR(buffer, bufferLength, bufferPos, c); 1.283 + } else { 1.284 + /* finished */ 1.285 + break; 1.286 + } 1.287 + } else { 1.288 + token=tokens[c]; 1.289 + if(token==(uint16_t)(-2)) { 1.290 + /* this is a lead byte for a double-byte token */ 1.291 + token=tokens[c<<8|*name++]; 1.292 + --nameLength; 1.293 + } 1.294 + if(token==(uint16_t)(-1)) { 1.295 + if(c!=';') { 1.296 + /* explicit letter */ 1.297 + WRITE_CHAR(buffer, bufferLength, bufferPos, c); 1.298 + } else { 1.299 + /* stop, but skip the semicolon if we are seeking 1.300 + extended names and there was no 2.0 name but there 1.301 + is a 1.0 name. */ 1.302 + if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) { 1.303 + if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { 1.304 + continue; 1.305 + } 1.306 + } 1.307 + /* finished */ 1.308 + break; 1.309 + } 1.310 + } else { 1.311 + /* write token word */ 1.312 + uint8_t *tokenString=tokenStrings+token; 1.313 + while((c=*tokenString++)!=0) { 1.314 + WRITE_CHAR(buffer, bufferLength, bufferPos, c); 1.315 + } 1.316 + } 1.317 + } 1.318 + } 1.319 + 1.320 + /* zero-terminate */ 1.321 + if(bufferLength>0) { 1.322 + *buffer=0; 1.323 + } 1.324 + 1.325 + return bufferPos; 1.326 +} 1.327 + 1.328 +/* 1.329 + * compareName() is almost the same as expandName() except that it compares 1.330 + * the currently expanded name to an input name. 1.331 + * It returns the match/no match result as soon as possible. 1.332 + */ 1.333 +static UBool 1.334 +compareName(UCharNames *names, 1.335 + const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, 1.336 + const char *otherName) { 1.337 + uint16_t *tokens=(uint16_t *)names+8; 1.338 + uint16_t token, tokenCount=*tokens++; 1.339 + uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; 1.340 + uint8_t c; 1.341 + const char *origOtherName = otherName; 1.342 + 1.343 + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { 1.344 + /* 1.345 + * skip the modern name if it is not requested _and_ 1.346 + * if the semicolon byte value is a character, not a token number 1.347 + */ 1.348 + if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { 1.349 + int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; 1.350 + do { 1.351 + while(nameLength>0) { 1.352 + --nameLength; 1.353 + if(*name++==';') { 1.354 + break; 1.355 + } 1.356 + } 1.357 + } while(--fieldIndex>0); 1.358 + } else { 1.359 + /* 1.360 + * the semicolon byte value is a token number, therefore 1.361 + * only modern names are stored in unames.dat and there is no 1.362 + * such requested alternate name here 1.363 + */ 1.364 + nameLength=0; 1.365 + } 1.366 + } 1.367 + 1.368 + /* compare each letter directly, and compare a token word per token */ 1.369 + while(nameLength>0) { 1.370 + --nameLength; 1.371 + c=*name++; 1.372 + 1.373 + if(c>=tokenCount) { 1.374 + if(c!=';') { 1.375 + /* implicit letter */ 1.376 + if((char)c!=*otherName++) { 1.377 + return FALSE; 1.378 + } 1.379 + } else { 1.380 + /* finished */ 1.381 + break; 1.382 + } 1.383 + } else { 1.384 + token=tokens[c]; 1.385 + if(token==(uint16_t)(-2)) { 1.386 + /* this is a lead byte for a double-byte token */ 1.387 + token=tokens[c<<8|*name++]; 1.388 + --nameLength; 1.389 + } 1.390 + if(token==(uint16_t)(-1)) { 1.391 + if(c!=';') { 1.392 + /* explicit letter */ 1.393 + if((char)c!=*otherName++) { 1.394 + return FALSE; 1.395 + } 1.396 + } else { 1.397 + /* stop, but skip the semicolon if we are seeking 1.398 + extended names and there was no 2.0 name but there 1.399 + is a 1.0 name. */ 1.400 + if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) { 1.401 + if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { 1.402 + continue; 1.403 + } 1.404 + } 1.405 + /* finished */ 1.406 + break; 1.407 + } 1.408 + } else { 1.409 + /* write token word */ 1.410 + uint8_t *tokenString=tokenStrings+token; 1.411 + while((c=*tokenString++)!=0) { 1.412 + if((char)c!=*otherName++) { 1.413 + return FALSE; 1.414 + } 1.415 + } 1.416 + } 1.417 + } 1.418 + } 1.419 + 1.420 + /* complete match? */ 1.421 + return (UBool)(*otherName==0); 1.422 +} 1.423 + 1.424 +static uint8_t getCharCat(UChar32 cp) { 1.425 + uint8_t cat; 1.426 + 1.427 + if (U_IS_UNICODE_NONCHAR(cp)) { 1.428 + return U_NONCHARACTER_CODE_POINT; 1.429 + } 1.430 + 1.431 + if ((cat = u_charType(cp)) == U_SURROGATE) { 1.432 + cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE; 1.433 + } 1.434 + 1.435 + return cat; 1.436 +} 1.437 + 1.438 +static const char *getCharCatName(UChar32 cp) { 1.439 + uint8_t cat = getCharCat(cp); 1.440 + 1.441 + /* Return unknown if the table of names above is not up to 1.442 + date. */ 1.443 + 1.444 + if (cat >= LENGTHOF(charCatNames)) { 1.445 + return "unknown"; 1.446 + } else { 1.447 + return charCatNames[cat]; 1.448 + } 1.449 +} 1.450 + 1.451 +static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) { 1.452 + const char *catname = getCharCatName(code); 1.453 + uint16_t length = 0; 1.454 + 1.455 + UChar32 cp; 1.456 + int ndigits, i; 1.457 + 1.458 + WRITE_CHAR(buffer, bufferLength, length, '<'); 1.459 + while (catname[length - 1]) { 1.460 + WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]); 1.461 + } 1.462 + WRITE_CHAR(buffer, bufferLength, length, '-'); 1.463 + for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4) 1.464 + ; 1.465 + if (ndigits < 4) 1.466 + ndigits = 4; 1.467 + for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) { 1.468 + uint8_t v = (uint8_t)(cp & 0xf); 1.469 + buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10); 1.470 + } 1.471 + buffer += ndigits; 1.472 + length += ndigits; 1.473 + WRITE_CHAR(buffer, bufferLength, length, '>'); 1.474 + 1.475 + return length; 1.476 +} 1.477 + 1.478 +/* 1.479 + * getGroup() does a binary search for the group that contains the 1.480 + * Unicode code point "code". 1.481 + * The return value is always a valid Group* that may contain "code" 1.482 + * or else is the highest group before "code". 1.483 + * If the lowest group is after "code", then that one is returned. 1.484 + */ 1.485 +static const uint16_t * 1.486 +getGroup(UCharNames *names, uint32_t code) { 1.487 + const uint16_t *groups=GET_GROUPS(names); 1.488 + uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT), 1.489 + start=0, 1.490 + limit=*groups++, 1.491 + number; 1.492 + 1.493 + /* binary search for the group of names that contains the one for code */ 1.494 + while(start<limit-1) { 1.495 + number=(uint16_t)((start+limit)/2); 1.496 + if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) { 1.497 + limit=number; 1.498 + } else { 1.499 + start=number; 1.500 + } 1.501 + } 1.502 + 1.503 + /* return this regardless of whether it is an exact match */ 1.504 + return groups+start*GROUP_LENGTH; 1.505 +} 1.506 + 1.507 +/* 1.508 + * expandGroupLengths() reads a block of compressed lengths of 32 strings and 1.509 + * expands them into offsets and lengths for each string. 1.510 + * Lengths are stored with a variable-width encoding in consecutive nibbles: 1.511 + * If a nibble<0xc, then it is the length itself (0=empty string). 1.512 + * If a nibble>=0xc, then it forms a length value with the following nibble. 1.513 + * Calculation see below. 1.514 + * The offsets and lengths arrays must be at least 33 (one more) long because 1.515 + * there is no check here at the end if the last nibble is still used. 1.516 + */ 1.517 +static const uint8_t * 1.518 +expandGroupLengths(const uint8_t *s, 1.519 + uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) { 1.520 + /* read the lengths of the 32 strings in this group and get each string's offset */ 1.521 + uint16_t i=0, offset=0, length=0; 1.522 + uint8_t lengthByte; 1.523 + 1.524 + /* all 32 lengths must be read to get the offset of the first group string */ 1.525 + while(i<LINES_PER_GROUP) { 1.526 + lengthByte=*s++; 1.527 + 1.528 + /* read even nibble - MSBs of lengthByte */ 1.529 + if(length>=12) { 1.530 + /* double-nibble length spread across two bytes */ 1.531 + length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12); 1.532 + lengthByte&=0xf; 1.533 + } else if((lengthByte /* &0xf0 */)>=0xc0) { 1.534 + /* double-nibble length spread across this one byte */ 1.535 + length=(uint16_t)((lengthByte&0x3f)+12); 1.536 + } else { 1.537 + /* single-nibble length in MSBs */ 1.538 + length=(uint16_t)(lengthByte>>4); 1.539 + lengthByte&=0xf; 1.540 + } 1.541 + 1.542 + *offsets++=offset; 1.543 + *lengths++=length; 1.544 + 1.545 + offset+=length; 1.546 + ++i; 1.547 + 1.548 + /* read odd nibble - LSBs of lengthByte */ 1.549 + if((lengthByte&0xf0)==0) { 1.550 + /* this nibble was not consumed for a double-nibble length above */ 1.551 + length=lengthByte; 1.552 + if(length<12) { 1.553 + /* single-nibble length in LSBs */ 1.554 + *offsets++=offset; 1.555 + *lengths++=length; 1.556 + 1.557 + offset+=length; 1.558 + ++i; 1.559 + } 1.560 + } else { 1.561 + length=0; /* prevent double-nibble detection in the next iteration */ 1.562 + } 1.563 + } 1.564 + 1.565 + /* now, s is at the first group string */ 1.566 + return s; 1.567 +} 1.568 + 1.569 +static uint16_t 1.570 +expandGroupName(UCharNames *names, const uint16_t *group, 1.571 + uint16_t lineNumber, UCharNameChoice nameChoice, 1.572 + char *buffer, uint16_t bufferLength) { 1.573 + uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; 1.574 + const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); 1.575 + s=expandGroupLengths(s, offsets, lengths); 1.576 + return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice, 1.577 + buffer, bufferLength); 1.578 +} 1.579 + 1.580 +static uint16_t 1.581 +getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice, 1.582 + char *buffer, uint16_t bufferLength) { 1.583 + const uint16_t *group=getGroup(names, code); 1.584 + if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) { 1.585 + return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice, 1.586 + buffer, bufferLength); 1.587 + } else { 1.588 + /* group not found */ 1.589 + /* zero-terminate */ 1.590 + if(bufferLength>0) { 1.591 + *buffer=0; 1.592 + } 1.593 + return 0; 1.594 + } 1.595 +} 1.596 + 1.597 +/* 1.598 + * enumGroupNames() enumerates all the names in a 32-group 1.599 + * and either calls the enumerator function or finds a given input name. 1.600 + */ 1.601 +static UBool 1.602 +enumGroupNames(UCharNames *names, const uint16_t *group, 1.603 + UChar32 start, UChar32 end, 1.604 + UEnumCharNamesFn *fn, void *context, 1.605 + UCharNameChoice nameChoice) { 1.606 + uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; 1.607 + const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); 1.608 + 1.609 + s=expandGroupLengths(s, offsets, lengths); 1.610 + if(fn!=DO_FIND_NAME) { 1.611 + char buffer[200]; 1.612 + uint16_t length; 1.613 + 1.614 + while(start<=end) { 1.615 + length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer)); 1.616 + if (!length && nameChoice == U_EXTENDED_CHAR_NAME) { 1.617 + buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; 1.618 + } 1.619 + /* here, we assume that the buffer is large enough */ 1.620 + if(length>0) { 1.621 + if(!fn(context, start, nameChoice, buffer, length)) { 1.622 + return FALSE; 1.623 + } 1.624 + } 1.625 + ++start; 1.626 + } 1.627 + } else { 1.628 + const char *otherName=((FindName *)context)->otherName; 1.629 + while(start<=end) { 1.630 + if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) { 1.631 + ((FindName *)context)->code=start; 1.632 + return FALSE; 1.633 + } 1.634 + ++start; 1.635 + } 1.636 + } 1.637 + return TRUE; 1.638 +} 1.639 + 1.640 +/* 1.641 + * enumExtNames enumerate extended names. 1.642 + * It only needs to do it if it is called with a real function and not 1.643 + * with the dummy DO_FIND_NAME, because u_charFromName() does a check 1.644 + * for extended names by itself. 1.645 + */ 1.646 +static UBool 1.647 +enumExtNames(UChar32 start, UChar32 end, 1.648 + UEnumCharNamesFn *fn, void *context) 1.649 +{ 1.650 + if(fn!=DO_FIND_NAME) { 1.651 + char buffer[200]; 1.652 + uint16_t length; 1.653 + 1.654 + while(start<=end) { 1.655 + buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; 1.656 + /* here, we assume that the buffer is large enough */ 1.657 + if(length>0) { 1.658 + if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) { 1.659 + return FALSE; 1.660 + } 1.661 + } 1.662 + ++start; 1.663 + } 1.664 + } 1.665 + 1.666 + return TRUE; 1.667 +} 1.668 + 1.669 +static UBool 1.670 +enumNames(UCharNames *names, 1.671 + UChar32 start, UChar32 limit, 1.672 + UEnumCharNamesFn *fn, void *context, 1.673 + UCharNameChoice nameChoice) { 1.674 + uint16_t startGroupMSB, endGroupMSB, groupCount; 1.675 + const uint16_t *group, *groupLimit; 1.676 + 1.677 + startGroupMSB=(uint16_t)(start>>GROUP_SHIFT); 1.678 + endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT); 1.679 + 1.680 + /* find the group that contains start, or the highest before it */ 1.681 + group=getGroup(names, start); 1.682 + 1.683 + if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) { 1.684 + /* enumerate synthetic names between start and the group start */ 1.685 + UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT); 1.686 + if(extLimit>limit) { 1.687 + extLimit=limit; 1.688 + } 1.689 + if(!enumExtNames(start, extLimit-1, fn, context)) { 1.690 + return FALSE; 1.691 + } 1.692 + start=extLimit; 1.693 + } 1.694 + 1.695 + if(startGroupMSB==endGroupMSB) { 1.696 + if(startGroupMSB==group[GROUP_MSB]) { 1.697 + /* if start and limit-1 are in the same group, then enumerate only in that one */ 1.698 + return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice); 1.699 + } 1.700 + } else { 1.701 + const uint16_t *groups=GET_GROUPS(names); 1.702 + groupCount=*groups++; 1.703 + groupLimit=groups+groupCount*GROUP_LENGTH; 1.704 + 1.705 + if(startGroupMSB==group[GROUP_MSB]) { 1.706 + /* enumerate characters in the partial start group */ 1.707 + if((start&GROUP_MASK)!=0) { 1.708 + if(!enumGroupNames(names, group, 1.709 + start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1, 1.710 + fn, context, nameChoice)) { 1.711 + return FALSE; 1.712 + } 1.713 + group=NEXT_GROUP(group); /* continue with the next group */ 1.714 + } 1.715 + } else if(startGroupMSB>group[GROUP_MSB]) { 1.716 + /* make sure that we start enumerating with the first group after start */ 1.717 + const uint16_t *nextGroup=NEXT_GROUP(group); 1.718 + if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) { 1.719 + UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; 1.720 + if (end > limit) { 1.721 + end = limit; 1.722 + } 1.723 + if (!enumExtNames(start, end - 1, fn, context)) { 1.724 + return FALSE; 1.725 + } 1.726 + } 1.727 + group=nextGroup; 1.728 + } 1.729 + 1.730 + /* enumerate entire groups between the start- and end-groups */ 1.731 + while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) { 1.732 + const uint16_t *nextGroup; 1.733 + start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT; 1.734 + if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) { 1.735 + return FALSE; 1.736 + } 1.737 + nextGroup=NEXT_GROUP(group); 1.738 + if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) { 1.739 + UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; 1.740 + if (end > limit) { 1.741 + end = limit; 1.742 + } 1.743 + if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) { 1.744 + return FALSE; 1.745 + } 1.746 + } 1.747 + group=nextGroup; 1.748 + } 1.749 + 1.750 + /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */ 1.751 + if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) { 1.752 + return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice); 1.753 + } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) { 1.754 + UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT; 1.755 + if (next > start) { 1.756 + start = next; 1.757 + } 1.758 + } else { 1.759 + return TRUE; 1.760 + } 1.761 + } 1.762 + 1.763 + /* we have not found a group, which means everything is made of 1.764 + extended names. */ 1.765 + if (nameChoice == U_EXTENDED_CHAR_NAME) { 1.766 + if (limit > UCHAR_MAX_VALUE + 1) { 1.767 + limit = UCHAR_MAX_VALUE + 1; 1.768 + } 1.769 + return enumExtNames(start, limit - 1, fn, context); 1.770 + } 1.771 + 1.772 + return TRUE; 1.773 +} 1.774 + 1.775 +static uint16_t 1.776 +writeFactorSuffix(const uint16_t *factors, uint16_t count, 1.777 + const char *s, /* suffix elements */ 1.778 + uint32_t code, 1.779 + uint16_t indexes[8], /* output fields from here */ 1.780 + const char *elementBases[8], const char *elements[8], 1.781 + char *buffer, uint16_t bufferLength) { 1.782 + uint16_t i, factor, bufferPos=0; 1.783 + char c; 1.784 + 1.785 + /* write elements according to the factors */ 1.786 + 1.787 + /* 1.788 + * the factorized elements are determined by modulo arithmetic 1.789 + * with the factors of this algorithm 1.790 + * 1.791 + * note that for fewer operations, count is decremented here 1.792 + */ 1.793 + --count; 1.794 + for(i=count; i>0; --i) { 1.795 + factor=factors[i]; 1.796 + indexes[i]=(uint16_t)(code%factor); 1.797 + code/=factor; 1.798 + } 1.799 + /* 1.800 + * we don't need to calculate the last modulus because start<=code<=end 1.801 + * guarantees here that code<=factors[0] 1.802 + */ 1.803 + indexes[0]=(uint16_t)code; 1.804 + 1.805 + /* write each element */ 1.806 + for(;;) { 1.807 + if(elementBases!=NULL) { 1.808 + *elementBases++=s; 1.809 + } 1.810 + 1.811 + /* skip indexes[i] strings */ 1.812 + factor=indexes[i]; 1.813 + while(factor>0) { 1.814 + while(*s++!=0) {} 1.815 + --factor; 1.816 + } 1.817 + if(elements!=NULL) { 1.818 + *elements++=s; 1.819 + } 1.820 + 1.821 + /* write element */ 1.822 + while((c=*s++)!=0) { 1.823 + WRITE_CHAR(buffer, bufferLength, bufferPos, c); 1.824 + } 1.825 + 1.826 + /* we do not need to perform the rest of this loop for i==count - break here */ 1.827 + if(i>=count) { 1.828 + break; 1.829 + } 1.830 + 1.831 + /* skip the rest of the strings for this factors[i] */ 1.832 + factor=(uint16_t)(factors[i]-indexes[i]-1); 1.833 + while(factor>0) { 1.834 + while(*s++!=0) {} 1.835 + --factor; 1.836 + } 1.837 + 1.838 + ++i; 1.839 + } 1.840 + 1.841 + /* zero-terminate */ 1.842 + if(bufferLength>0) { 1.843 + *buffer=0; 1.844 + } 1.845 + 1.846 + return bufferPos; 1.847 +} 1.848 + 1.849 +/* 1.850 + * Important: 1.851 + * Parts of findAlgName() are almost the same as some of getAlgName(). 1.852 + * Fixes must be applied to both. 1.853 + */ 1.854 +static uint16_t 1.855 +getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice, 1.856 + char *buffer, uint16_t bufferLength) { 1.857 + uint16_t bufferPos=0; 1.858 + 1.859 + /* Only the normative character name can be algorithmic. */ 1.860 + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { 1.861 + /* zero-terminate */ 1.862 + if(bufferLength>0) { 1.863 + *buffer=0; 1.864 + } 1.865 + return 0; 1.866 + } 1.867 + 1.868 + switch(range->type) { 1.869 + case 0: { 1.870 + /* name = prefix hex-digits */ 1.871 + const char *s=(const char *)(range+1); 1.872 + char c; 1.873 + 1.874 + uint16_t i, count; 1.875 + 1.876 + /* copy prefix */ 1.877 + while((c=*s++)!=0) { 1.878 + WRITE_CHAR(buffer, bufferLength, bufferPos, c); 1.879 + } 1.880 + 1.881 + /* write hexadecimal code point value */ 1.882 + count=range->variant; 1.883 + 1.884 + /* zero-terminate */ 1.885 + if(count<bufferLength) { 1.886 + buffer[count]=0; 1.887 + } 1.888 + 1.889 + for(i=count; i>0;) { 1.890 + if(--i<bufferLength) { 1.891 + c=(char)(code&0xf); 1.892 + if(c<10) { 1.893 + c+='0'; 1.894 + } else { 1.895 + c+='A'-10; 1.896 + } 1.897 + buffer[i]=c; 1.898 + } 1.899 + code>>=4; 1.900 + } 1.901 + 1.902 + bufferPos+=count; 1.903 + break; 1.904 + } 1.905 + case 1: { 1.906 + /* name = prefix factorized-elements */ 1.907 + uint16_t indexes[8]; 1.908 + const uint16_t *factors=(const uint16_t *)(range+1); 1.909 + uint16_t count=range->variant; 1.910 + const char *s=(const char *)(factors+count); 1.911 + char c; 1.912 + 1.913 + /* copy prefix */ 1.914 + while((c=*s++)!=0) { 1.915 + WRITE_CHAR(buffer, bufferLength, bufferPos, c); 1.916 + } 1.917 + 1.918 + bufferPos+=writeFactorSuffix(factors, count, 1.919 + s, code-range->start, indexes, NULL, NULL, buffer, bufferLength); 1.920 + break; 1.921 + } 1.922 + default: 1.923 + /* undefined type */ 1.924 + /* zero-terminate */ 1.925 + if(bufferLength>0) { 1.926 + *buffer=0; 1.927 + } 1.928 + break; 1.929 + } 1.930 + 1.931 + return bufferPos; 1.932 +} 1.933 + 1.934 +/* 1.935 + * Important: enumAlgNames() and findAlgName() are almost the same. 1.936 + * Any fix must be applied to both. 1.937 + */ 1.938 +static UBool 1.939 +enumAlgNames(AlgorithmicRange *range, 1.940 + UChar32 start, UChar32 limit, 1.941 + UEnumCharNamesFn *fn, void *context, 1.942 + UCharNameChoice nameChoice) { 1.943 + char buffer[200]; 1.944 + uint16_t length; 1.945 + 1.946 + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { 1.947 + return TRUE; 1.948 + } 1.949 + 1.950 + switch(range->type) { 1.951 + case 0: { 1.952 + char *s, *end; 1.953 + char c; 1.954 + 1.955 + /* get the full name of the start character */ 1.956 + length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer)); 1.957 + if(length<=0) { 1.958 + return TRUE; 1.959 + } 1.960 + 1.961 + /* call the enumerator function with this first character */ 1.962 + if(!fn(context, start, nameChoice, buffer, length)) { 1.963 + return FALSE; 1.964 + } 1.965 + 1.966 + /* go to the end of the name; all these names have the same length */ 1.967 + end=buffer; 1.968 + while(*end!=0) { 1.969 + ++end; 1.970 + } 1.971 + 1.972 + /* enumerate the rest of the names */ 1.973 + while(++start<limit) { 1.974 + /* increment the hexadecimal number on a character-basis */ 1.975 + s=end; 1.976 + for (;;) { 1.977 + c=*--s; 1.978 + if(('0'<=c && c<'9') || ('A'<=c && c<'F')) { 1.979 + *s=(char)(c+1); 1.980 + break; 1.981 + } else if(c=='9') { 1.982 + *s='A'; 1.983 + break; 1.984 + } else if(c=='F') { 1.985 + *s='0'; 1.986 + } 1.987 + } 1.988 + 1.989 + if(!fn(context, start, nameChoice, buffer, length)) { 1.990 + return FALSE; 1.991 + } 1.992 + } 1.993 + break; 1.994 + } 1.995 + case 1: { 1.996 + uint16_t indexes[8]; 1.997 + const char *elementBases[8], *elements[8]; 1.998 + const uint16_t *factors=(const uint16_t *)(range+1); 1.999 + uint16_t count=range->variant; 1.1000 + const char *s=(const char *)(factors+count); 1.1001 + char *suffix, *t; 1.1002 + uint16_t prefixLength, i, idx; 1.1003 + 1.1004 + char c; 1.1005 + 1.1006 + /* name = prefix factorized-elements */ 1.1007 + 1.1008 + /* copy prefix */ 1.1009 + suffix=buffer; 1.1010 + prefixLength=0; 1.1011 + while((c=*s++)!=0) { 1.1012 + *suffix++=c; 1.1013 + ++prefixLength; 1.1014 + } 1.1015 + 1.1016 + /* append the suffix of the start character */ 1.1017 + length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count, 1.1018 + s, (uint32_t)start-range->start, 1.1019 + indexes, elementBases, elements, 1.1020 + suffix, (uint16_t)(sizeof(buffer)-prefixLength))); 1.1021 + 1.1022 + /* call the enumerator function with this first character */ 1.1023 + if(!fn(context, start, nameChoice, buffer, length)) { 1.1024 + return FALSE; 1.1025 + } 1.1026 + 1.1027 + /* enumerate the rest of the names */ 1.1028 + while(++start<limit) { 1.1029 + /* increment the indexes in lexical order bound by the factors */ 1.1030 + i=count; 1.1031 + for (;;) { 1.1032 + idx=(uint16_t)(indexes[--i]+1); 1.1033 + if(idx<factors[i]) { 1.1034 + /* skip one index and its element string */ 1.1035 + indexes[i]=idx; 1.1036 + s=elements[i]; 1.1037 + while(*s++!=0) { 1.1038 + } 1.1039 + elements[i]=s; 1.1040 + break; 1.1041 + } else { 1.1042 + /* reset this index to 0 and its element string to the first one */ 1.1043 + indexes[i]=0; 1.1044 + elements[i]=elementBases[i]; 1.1045 + } 1.1046 + } 1.1047 + 1.1048 + /* to make matters a little easier, just append all elements to the suffix */ 1.1049 + t=suffix; 1.1050 + length=prefixLength; 1.1051 + for(i=0; i<count; ++i) { 1.1052 + s=elements[i]; 1.1053 + while((c=*s++)!=0) { 1.1054 + *t++=c; 1.1055 + ++length; 1.1056 + } 1.1057 + } 1.1058 + /* zero-terminate */ 1.1059 + *t=0; 1.1060 + 1.1061 + if(!fn(context, start, nameChoice, buffer, length)) { 1.1062 + return FALSE; 1.1063 + } 1.1064 + } 1.1065 + break; 1.1066 + } 1.1067 + default: 1.1068 + /* undefined type */ 1.1069 + break; 1.1070 + } 1.1071 + 1.1072 + return TRUE; 1.1073 +} 1.1074 + 1.1075 +/* 1.1076 + * findAlgName() is almost the same as enumAlgNames() except that it 1.1077 + * returns the code point for a name if it fits into the range. 1.1078 + * It returns 0xffff otherwise. 1.1079 + */ 1.1080 +static UChar32 1.1081 +findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) { 1.1082 + UChar32 code; 1.1083 + 1.1084 + if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { 1.1085 + return 0xffff; 1.1086 + } 1.1087 + 1.1088 + switch(range->type) { 1.1089 + case 0: { 1.1090 + /* name = prefix hex-digits */ 1.1091 + const char *s=(const char *)(range+1); 1.1092 + char c; 1.1093 + 1.1094 + uint16_t i, count; 1.1095 + 1.1096 + /* compare prefix */ 1.1097 + while((c=*s++)!=0) { 1.1098 + if((char)c!=*otherName++) { 1.1099 + return 0xffff; 1.1100 + } 1.1101 + } 1.1102 + 1.1103 + /* read hexadecimal code point value */ 1.1104 + count=range->variant; 1.1105 + code=0; 1.1106 + for(i=0; i<count; ++i) { 1.1107 + c=*otherName++; 1.1108 + if('0'<=c && c<='9') { 1.1109 + code=(code<<4)|(c-'0'); 1.1110 + } else if('A'<=c && c<='F') { 1.1111 + code=(code<<4)|(c-'A'+10); 1.1112 + } else { 1.1113 + return 0xffff; 1.1114 + } 1.1115 + } 1.1116 + 1.1117 + /* does it fit into the range? */ 1.1118 + if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) { 1.1119 + return code; 1.1120 + } 1.1121 + break; 1.1122 + } 1.1123 + case 1: { 1.1124 + char buffer[64]; 1.1125 + uint16_t indexes[8]; 1.1126 + const char *elementBases[8], *elements[8]; 1.1127 + const uint16_t *factors=(const uint16_t *)(range+1); 1.1128 + uint16_t count=range->variant; 1.1129 + const char *s=(const char *)(factors+count), *t; 1.1130 + UChar32 start, limit; 1.1131 + uint16_t i, idx; 1.1132 + 1.1133 + char c; 1.1134 + 1.1135 + /* name = prefix factorized-elements */ 1.1136 + 1.1137 + /* compare prefix */ 1.1138 + while((c=*s++)!=0) { 1.1139 + if((char)c!=*otherName++) { 1.1140 + return 0xffff; 1.1141 + } 1.1142 + } 1.1143 + 1.1144 + start=(UChar32)range->start; 1.1145 + limit=(UChar32)(range->end+1); 1.1146 + 1.1147 + /* initialize the suffix elements for enumeration; indexes should all be set to 0 */ 1.1148 + writeFactorSuffix(factors, count, s, 0, 1.1149 + indexes, elementBases, elements, buffer, sizeof(buffer)); 1.1150 + 1.1151 + /* compare the first suffix */ 1.1152 + if(0==uprv_strcmp(otherName, buffer)) { 1.1153 + return start; 1.1154 + } 1.1155 + 1.1156 + /* enumerate and compare the rest of the suffixes */ 1.1157 + while(++start<limit) { 1.1158 + /* increment the indexes in lexical order bound by the factors */ 1.1159 + i=count; 1.1160 + for (;;) { 1.1161 + idx=(uint16_t)(indexes[--i]+1); 1.1162 + if(idx<factors[i]) { 1.1163 + /* skip one index and its element string */ 1.1164 + indexes[i]=idx; 1.1165 + s=elements[i]; 1.1166 + while(*s++!=0) {} 1.1167 + elements[i]=s; 1.1168 + break; 1.1169 + } else { 1.1170 + /* reset this index to 0 and its element string to the first one */ 1.1171 + indexes[i]=0; 1.1172 + elements[i]=elementBases[i]; 1.1173 + } 1.1174 + } 1.1175 + 1.1176 + /* to make matters a little easier, just compare all elements of the suffix */ 1.1177 + t=otherName; 1.1178 + for(i=0; i<count; ++i) { 1.1179 + s=elements[i]; 1.1180 + while((c=*s++)!=0) { 1.1181 + if(c!=*t++) { 1.1182 + s=""; /* does not match */ 1.1183 + i=99; 1.1184 + } 1.1185 + } 1.1186 + } 1.1187 + if(i<99 && *t==0) { 1.1188 + return start; 1.1189 + } 1.1190 + } 1.1191 + break; 1.1192 + } 1.1193 + default: 1.1194 + /* undefined type */ 1.1195 + break; 1.1196 + } 1.1197 + 1.1198 + return 0xffff; 1.1199 +} 1.1200 + 1.1201 +/* sets of name characters, maximum name lengths ---------------------------- */ 1.1202 + 1.1203 +#define SET_ADD(set, c) ((set)[(uint8_t)c>>5]|=((uint32_t)1<<((uint8_t)c&0x1f))) 1.1204 +#define SET_CONTAINS(set, c) (((set)[(uint8_t)c>>5]&((uint32_t)1<<((uint8_t)c&0x1f)))!=0) 1.1205 + 1.1206 +static int32_t 1.1207 +calcStringSetLength(uint32_t set[8], const char *s) { 1.1208 + int32_t length=0; 1.1209 + char c; 1.1210 + 1.1211 + while((c=*s++)!=0) { 1.1212 + SET_ADD(set, c); 1.1213 + ++length; 1.1214 + } 1.1215 + return length; 1.1216 +} 1.1217 + 1.1218 +static int32_t 1.1219 +calcAlgNameSetsLengths(int32_t maxNameLength) { 1.1220 + AlgorithmicRange *range; 1.1221 + uint32_t *p; 1.1222 + uint32_t rangeCount; 1.1223 + int32_t length; 1.1224 + 1.1225 + /* enumerate algorithmic ranges */ 1.1226 + p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); 1.1227 + rangeCount=*p; 1.1228 + range=(AlgorithmicRange *)(p+1); 1.1229 + while(rangeCount>0) { 1.1230 + switch(range->type) { 1.1231 + case 0: 1.1232 + /* name = prefix + (range->variant times) hex-digits */ 1.1233 + /* prefix */ 1.1234 + length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant; 1.1235 + if(length>maxNameLength) { 1.1236 + maxNameLength=length; 1.1237 + } 1.1238 + break; 1.1239 + case 1: { 1.1240 + /* name = prefix factorized-elements */ 1.1241 + const uint16_t *factors=(const uint16_t *)(range+1); 1.1242 + const char *s; 1.1243 + int32_t i, count=range->variant, factor, factorLength, maxFactorLength; 1.1244 + 1.1245 + /* prefix length */ 1.1246 + s=(const char *)(factors+count); 1.1247 + length=calcStringSetLength(gNameSet, s); 1.1248 + s+=length+1; /* start of factor suffixes */ 1.1249 + 1.1250 + /* get the set and maximum factor suffix length for each factor */ 1.1251 + for(i=0; i<count; ++i) { 1.1252 + maxFactorLength=0; 1.1253 + for(factor=factors[i]; factor>0; --factor) { 1.1254 + factorLength=calcStringSetLength(gNameSet, s); 1.1255 + s+=factorLength+1; 1.1256 + if(factorLength>maxFactorLength) { 1.1257 + maxFactorLength=factorLength; 1.1258 + } 1.1259 + } 1.1260 + length+=maxFactorLength; 1.1261 + } 1.1262 + 1.1263 + if(length>maxNameLength) { 1.1264 + maxNameLength=length; 1.1265 + } 1.1266 + break; 1.1267 + } 1.1268 + default: 1.1269 + /* unknown type */ 1.1270 + break; 1.1271 + } 1.1272 + 1.1273 + range=(AlgorithmicRange *)((uint8_t *)range+range->size); 1.1274 + --rangeCount; 1.1275 + } 1.1276 + return maxNameLength; 1.1277 +} 1.1278 + 1.1279 +static int32_t 1.1280 +calcExtNameSetsLengths(int32_t maxNameLength) { 1.1281 + int32_t i, length; 1.1282 + 1.1283 + for(i=0; i<LENGTHOF(charCatNames); ++i) { 1.1284 + /* 1.1285 + * for each category, count the length of the category name 1.1286 + * plus 9= 1.1287 + * 2 for <> 1.1288 + * 1 for - 1.1289 + * 6 for most hex digits per code point 1.1290 + */ 1.1291 + length=9+calcStringSetLength(gNameSet, charCatNames[i]); 1.1292 + if(length>maxNameLength) { 1.1293 + maxNameLength=length; 1.1294 + } 1.1295 + } 1.1296 + return maxNameLength; 1.1297 +} 1.1298 + 1.1299 +static int32_t 1.1300 +calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths, 1.1301 + uint32_t set[8], 1.1302 + const uint8_t **pLine, const uint8_t *lineLimit) { 1.1303 + const uint8_t *line=*pLine; 1.1304 + int32_t length=0, tokenLength; 1.1305 + uint16_t c, token; 1.1306 + 1.1307 + while(line!=lineLimit && (c=*line++)!=(uint8_t)';') { 1.1308 + if(c>=tokenCount) { 1.1309 + /* implicit letter */ 1.1310 + SET_ADD(set, c); 1.1311 + ++length; 1.1312 + } else { 1.1313 + token=tokens[c]; 1.1314 + if(token==(uint16_t)(-2)) { 1.1315 + /* this is a lead byte for a double-byte token */ 1.1316 + c=c<<8|*line++; 1.1317 + token=tokens[c]; 1.1318 + } 1.1319 + if(token==(uint16_t)(-1)) { 1.1320 + /* explicit letter */ 1.1321 + SET_ADD(set, c); 1.1322 + ++length; 1.1323 + } else { 1.1324 + /* count token word */ 1.1325 + if(tokenLengths!=NULL) { 1.1326 + /* use cached token length */ 1.1327 + tokenLength=tokenLengths[c]; 1.1328 + if(tokenLength==0) { 1.1329 + tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); 1.1330 + tokenLengths[c]=(int8_t)tokenLength; 1.1331 + } 1.1332 + } else { 1.1333 + tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); 1.1334 + } 1.1335 + length+=tokenLength; 1.1336 + } 1.1337 + } 1.1338 + } 1.1339 + 1.1340 + *pLine=line; 1.1341 + return length; 1.1342 +} 1.1343 + 1.1344 +static void 1.1345 +calcGroupNameSetsLengths(int32_t maxNameLength) { 1.1346 + uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; 1.1347 + 1.1348 + uint16_t *tokens=(uint16_t *)uCharNames+8; 1.1349 + uint16_t tokenCount=*tokens++; 1.1350 + uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset; 1.1351 + 1.1352 + int8_t *tokenLengths; 1.1353 + 1.1354 + const uint16_t *group; 1.1355 + const uint8_t *s, *line, *lineLimit; 1.1356 + 1.1357 + int32_t groupCount, lineNumber, length; 1.1358 + 1.1359 + tokenLengths=(int8_t *)uprv_malloc(tokenCount); 1.1360 + if(tokenLengths!=NULL) { 1.1361 + uprv_memset(tokenLengths, 0, tokenCount); 1.1362 + } 1.1363 + 1.1364 + group=GET_GROUPS(uCharNames); 1.1365 + groupCount=*group++; 1.1366 + 1.1367 + /* enumerate all groups */ 1.1368 + while(groupCount>0) { 1.1369 + s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group); 1.1370 + s=expandGroupLengths(s, offsets, lengths); 1.1371 + 1.1372 + /* enumerate all lines in each group */ 1.1373 + for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) { 1.1374 + line=s+offsets[lineNumber]; 1.1375 + length=lengths[lineNumber]; 1.1376 + if(length==0) { 1.1377 + continue; 1.1378 + } 1.1379 + 1.1380 + lineLimit=line+length; 1.1381 + 1.1382 + /* read regular name */ 1.1383 + length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); 1.1384 + if(length>maxNameLength) { 1.1385 + maxNameLength=length; 1.1386 + } 1.1387 + if(line==lineLimit) { 1.1388 + continue; 1.1389 + } 1.1390 + 1.1391 + /* read Unicode 1.0 name */ 1.1392 + length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); 1.1393 + if(length>maxNameLength) { 1.1394 + maxNameLength=length; 1.1395 + } 1.1396 + if(line==lineLimit) { 1.1397 + continue; 1.1398 + } 1.1399 + 1.1400 + /* read ISO comment */ 1.1401 + /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/ 1.1402 + } 1.1403 + 1.1404 + group=NEXT_GROUP(group); 1.1405 + --groupCount; 1.1406 + } 1.1407 + 1.1408 + if(tokenLengths!=NULL) { 1.1409 + uprv_free(tokenLengths); 1.1410 + } 1.1411 + 1.1412 + /* set gMax... - name length last for threading */ 1.1413 + gMaxNameLength=maxNameLength; 1.1414 +} 1.1415 + 1.1416 +static UBool 1.1417 +calcNameSetsLengths(UErrorCode *pErrorCode) { 1.1418 + static const char extChars[]="0123456789ABCDEF<>-"; 1.1419 + int32_t i, maxNameLength; 1.1420 + 1.1421 + if(gMaxNameLength!=0) { 1.1422 + return TRUE; 1.1423 + } 1.1424 + 1.1425 + if(!isDataLoaded(pErrorCode)) { 1.1426 + return FALSE; 1.1427 + } 1.1428 + 1.1429 + /* set hex digits, used in various names, and <>-, used in extended names */ 1.1430 + for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) { 1.1431 + SET_ADD(gNameSet, extChars[i]); 1.1432 + } 1.1433 + 1.1434 + /* set sets and lengths from algorithmic names */ 1.1435 + maxNameLength=calcAlgNameSetsLengths(0); 1.1436 + 1.1437 + /* set sets and lengths from extended names */ 1.1438 + maxNameLength=calcExtNameSetsLengths(maxNameLength); 1.1439 + 1.1440 + /* set sets and lengths from group names, set global maximum values */ 1.1441 + calcGroupNameSetsLengths(maxNameLength); 1.1442 + 1.1443 + return TRUE; 1.1444 +} 1.1445 + 1.1446 +/* public API --------------------------------------------------------------- */ 1.1447 + 1.1448 +U_CAPI int32_t U_EXPORT2 1.1449 +u_charName(UChar32 code, UCharNameChoice nameChoice, 1.1450 + char *buffer, int32_t bufferLength, 1.1451 + UErrorCode *pErrorCode) { 1.1452 + AlgorithmicRange *algRange; 1.1453 + uint32_t *p; 1.1454 + uint32_t i; 1.1455 + int32_t length; 1.1456 + 1.1457 + /* check the argument values */ 1.1458 + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1.1459 + return 0; 1.1460 + } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || 1.1461 + bufferLength<0 || (bufferLength>0 && buffer==NULL) 1.1462 + ) { 1.1463 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.1464 + return 0; 1.1465 + } 1.1466 + 1.1467 + if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) { 1.1468 + return u_terminateChars(buffer, bufferLength, 0, pErrorCode); 1.1469 + } 1.1470 + 1.1471 + length=0; 1.1472 + 1.1473 + /* try algorithmic names first */ 1.1474 + p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); 1.1475 + i=*p; 1.1476 + algRange=(AlgorithmicRange *)(p+1); 1.1477 + while(i>0) { 1.1478 + if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) { 1.1479 + length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); 1.1480 + break; 1.1481 + } 1.1482 + algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); 1.1483 + --i; 1.1484 + } 1.1485 + 1.1486 + if(i==0) { 1.1487 + if (nameChoice == U_EXTENDED_CHAR_NAME) { 1.1488 + length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength); 1.1489 + if (!length) { 1.1490 + /* extended character name */ 1.1491 + length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength); 1.1492 + } 1.1493 + } else { 1.1494 + /* normal character name */ 1.1495 + length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); 1.1496 + } 1.1497 + } 1.1498 + 1.1499 + return u_terminateChars(buffer, bufferLength, length, pErrorCode); 1.1500 +} 1.1501 + 1.1502 +U_CAPI int32_t U_EXPORT2 1.1503 +u_getISOComment(UChar32 /*c*/, 1.1504 + char *dest, int32_t destCapacity, 1.1505 + UErrorCode *pErrorCode) { 1.1506 + /* check the argument values */ 1.1507 + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1.1508 + return 0; 1.1509 + } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) { 1.1510 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.1511 + return 0; 1.1512 + } 1.1513 + 1.1514 + return u_terminateChars(dest, destCapacity, 0, pErrorCode); 1.1515 +} 1.1516 + 1.1517 +U_CAPI UChar32 U_EXPORT2 1.1518 +u_charFromName(UCharNameChoice nameChoice, 1.1519 + const char *name, 1.1520 + UErrorCode *pErrorCode) { 1.1521 + char upper[120], lower[120]; 1.1522 + FindName findName; 1.1523 + AlgorithmicRange *algRange; 1.1524 + uint32_t *p; 1.1525 + uint32_t i; 1.1526 + UChar32 cp = 0; 1.1527 + char c0; 1.1528 + UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */ 1.1529 + 1.1530 + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1.1531 + return error; 1.1532 + } 1.1533 + 1.1534 + if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) { 1.1535 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.1536 + return error; 1.1537 + } 1.1538 + 1.1539 + if(!isDataLoaded(pErrorCode)) { 1.1540 + return error; 1.1541 + } 1.1542 + 1.1543 + /* construct the uppercase and lowercase of the name first */ 1.1544 + for(i=0; i<sizeof(upper); ++i) { 1.1545 + if((c0=*name++)!=0) { 1.1546 + upper[i]=uprv_toupper(c0); 1.1547 + lower[i]=uprv_tolower(c0); 1.1548 + } else { 1.1549 + upper[i]=lower[i]=0; 1.1550 + break; 1.1551 + } 1.1552 + } 1.1553 + if(i==sizeof(upper)) { 1.1554 + /* name too long, there is no such character */ 1.1555 + *pErrorCode = U_ILLEGAL_CHAR_FOUND; 1.1556 + return error; 1.1557 + } 1.1558 + 1.1559 + /* try extended names first */ 1.1560 + if (lower[0] == '<') { 1.1561 + if (nameChoice == U_EXTENDED_CHAR_NAME) { 1.1562 + if (lower[--i] == '>') { 1.1563 + for (--i; lower[i] && lower[i] != '-'; --i) { 1.1564 + } 1.1565 + 1.1566 + if (lower[i] == '-') { /* We've got a category. */ 1.1567 + uint32_t cIdx; 1.1568 + 1.1569 + lower[i] = 0; 1.1570 + 1.1571 + for (++i; lower[i] != '>'; ++i) { 1.1572 + if (lower[i] >= '0' && lower[i] <= '9') { 1.1573 + cp = (cp << 4) + lower[i] - '0'; 1.1574 + } else if (lower[i] >= 'a' && lower[i] <= 'f') { 1.1575 + cp = (cp << 4) + lower[i] - 'a' + 10; 1.1576 + } else { 1.1577 + *pErrorCode = U_ILLEGAL_CHAR_FOUND; 1.1578 + return error; 1.1579 + } 1.1580 + } 1.1581 + 1.1582 + /* Now validate the category name. 1.1583 + We could use a binary search, or a trie, if 1.1584 + we really wanted to. */ 1.1585 + 1.1586 + for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) { 1.1587 + 1.1588 + if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) { 1.1589 + if (getCharCat(cp) == cIdx) { 1.1590 + return cp; 1.1591 + } 1.1592 + break; 1.1593 + } 1.1594 + } 1.1595 + } 1.1596 + } 1.1597 + } 1.1598 + 1.1599 + *pErrorCode = U_ILLEGAL_CHAR_FOUND; 1.1600 + return error; 1.1601 + } 1.1602 + 1.1603 + /* try algorithmic names now */ 1.1604 + p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); 1.1605 + i=*p; 1.1606 + algRange=(AlgorithmicRange *)(p+1); 1.1607 + while(i>0) { 1.1608 + if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) { 1.1609 + return cp; 1.1610 + } 1.1611 + algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); 1.1612 + --i; 1.1613 + } 1.1614 + 1.1615 + /* normal character name */ 1.1616 + findName.otherName=upper; 1.1617 + findName.code=error; 1.1618 + enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice); 1.1619 + if (findName.code == error) { 1.1620 + *pErrorCode = U_ILLEGAL_CHAR_FOUND; 1.1621 + } 1.1622 + return findName.code; 1.1623 +} 1.1624 + 1.1625 +U_CAPI void U_EXPORT2 1.1626 +u_enumCharNames(UChar32 start, UChar32 limit, 1.1627 + UEnumCharNamesFn *fn, 1.1628 + void *context, 1.1629 + UCharNameChoice nameChoice, 1.1630 + UErrorCode *pErrorCode) { 1.1631 + AlgorithmicRange *algRange; 1.1632 + uint32_t *p; 1.1633 + uint32_t i; 1.1634 + 1.1635 + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1.1636 + return; 1.1637 + } 1.1638 + 1.1639 + if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) { 1.1640 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.1641 + return; 1.1642 + } 1.1643 + 1.1644 + if((uint32_t) limit > UCHAR_MAX_VALUE + 1) { 1.1645 + limit = UCHAR_MAX_VALUE + 1; 1.1646 + } 1.1647 + if((uint32_t)start>=(uint32_t)limit) { 1.1648 + return; 1.1649 + } 1.1650 + 1.1651 + if(!isDataLoaded(pErrorCode)) { 1.1652 + return; 1.1653 + } 1.1654 + 1.1655 + /* interleave the data-driven ones with the algorithmic ones */ 1.1656 + /* iterate over all algorithmic ranges; assume that they are in ascending order */ 1.1657 + p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); 1.1658 + i=*p; 1.1659 + algRange=(AlgorithmicRange *)(p+1); 1.1660 + while(i>0) { 1.1661 + /* enumerate the character names before the current algorithmic range */ 1.1662 + /* here: start<limit */ 1.1663 + if((uint32_t)start<algRange->start) { 1.1664 + if((uint32_t)limit<=algRange->start) { 1.1665 + enumNames(uCharNames, start, limit, fn, context, nameChoice); 1.1666 + return; 1.1667 + } 1.1668 + if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) { 1.1669 + return; 1.1670 + } 1.1671 + start=(UChar32)algRange->start; 1.1672 + } 1.1673 + /* enumerate the character names in the current algorithmic range */ 1.1674 + /* here: algRange->start<=start<limit */ 1.1675 + if((uint32_t)start<=algRange->end) { 1.1676 + if((uint32_t)limit<=(algRange->end+1)) { 1.1677 + enumAlgNames(algRange, start, limit, fn, context, nameChoice); 1.1678 + return; 1.1679 + } 1.1680 + if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) { 1.1681 + return; 1.1682 + } 1.1683 + start=(UChar32)algRange->end+1; 1.1684 + } 1.1685 + /* continue to the next algorithmic range (here: start<limit) */ 1.1686 + algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); 1.1687 + --i; 1.1688 + } 1.1689 + /* enumerate the character names after the last algorithmic range */ 1.1690 + enumNames(uCharNames, start, limit, fn, context, nameChoice); 1.1691 +} 1.1692 + 1.1693 +U_CAPI int32_t U_EXPORT2 1.1694 +uprv_getMaxCharNameLength() { 1.1695 + UErrorCode errorCode=U_ZERO_ERROR; 1.1696 + if(calcNameSetsLengths(&errorCode)) { 1.1697 + return gMaxNameLength; 1.1698 + } else { 1.1699 + return 0; 1.1700 + } 1.1701 +} 1.1702 + 1.1703 +/** 1.1704 + * Converts the char set cset into a Unicode set uset. 1.1705 + * @param cset Set of 256 bit flags corresponding to a set of chars. 1.1706 + * @param uset USet to receive characters. Existing contents are deleted. 1.1707 + */ 1.1708 +static void 1.1709 +charSetToUSet(uint32_t cset[8], const USetAdder *sa) { 1.1710 + UChar us[256]; 1.1711 + char cs[256]; 1.1712 + 1.1713 + int32_t i, length; 1.1714 + UErrorCode errorCode; 1.1715 + 1.1716 + errorCode=U_ZERO_ERROR; 1.1717 + 1.1718 + if(!calcNameSetsLengths(&errorCode)) { 1.1719 + return; 1.1720 + } 1.1721 + 1.1722 + /* build a char string with all chars that are used in character names */ 1.1723 + length=0; 1.1724 + for(i=0; i<256; ++i) { 1.1725 + if(SET_CONTAINS(cset, i)) { 1.1726 + cs[length++]=(char)i; 1.1727 + } 1.1728 + } 1.1729 + 1.1730 + /* convert the char string to a UChar string */ 1.1731 + u_charsToUChars(cs, us, length); 1.1732 + 1.1733 + /* add each UChar to the USet */ 1.1734 + for(i=0; i<length; ++i) { 1.1735 + if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */ 1.1736 + sa->add(sa->set, us[i]); 1.1737 + } 1.1738 + } 1.1739 +} 1.1740 + 1.1741 +/** 1.1742 + * Fills set with characters that are used in Unicode character names. 1.1743 + * @param set USet to receive characters. 1.1744 + */ 1.1745 +U_CAPI void U_EXPORT2 1.1746 +uprv_getCharNameCharacters(const USetAdder *sa) { 1.1747 + charSetToUSet(gNameSet, sa); 1.1748 +} 1.1749 + 1.1750 +/* data swapping ------------------------------------------------------------ */ 1.1751 + 1.1752 +/* 1.1753 + * The token table contains non-negative entries for token bytes, 1.1754 + * and -1 for bytes that represent themselves in the data file's charset. 1.1755 + * -2 entries are used for lead bytes. 1.1756 + * 1.1757 + * Direct bytes (-1 entries) must be translated from the input charset family 1.1758 + * to the output charset family. 1.1759 + * makeTokenMap() writes a permutation mapping for this. 1.1760 + * Use it once for single-/lead-byte tokens and once more for all trail byte 1.1761 + * tokens. (';' is an unused trail byte marked with -1.) 1.1762 + */ 1.1763 +static void 1.1764 +makeTokenMap(const UDataSwapper *ds, 1.1765 + int16_t tokens[], uint16_t tokenCount, 1.1766 + uint8_t map[256], 1.1767 + UErrorCode *pErrorCode) { 1.1768 + UBool usedOutChar[256]; 1.1769 + uint16_t i, j; 1.1770 + uint8_t c1, c2; 1.1771 + 1.1772 + if(U_FAILURE(*pErrorCode)) { 1.1773 + return; 1.1774 + } 1.1775 + 1.1776 + if(ds->inCharset==ds->outCharset) { 1.1777 + /* Same charset family: identity permutation */ 1.1778 + for(i=0; i<256; ++i) { 1.1779 + map[i]=(uint8_t)i; 1.1780 + } 1.1781 + } else { 1.1782 + uprv_memset(map, 0, 256); 1.1783 + uprv_memset(usedOutChar, 0, 256); 1.1784 + 1.1785 + if(tokenCount>256) { 1.1786 + tokenCount=256; 1.1787 + } 1.1788 + 1.1789 + /* set the direct bytes (byte 0 always maps to itself) */ 1.1790 + for(i=1; i<tokenCount; ++i) { 1.1791 + if(tokens[i]==-1) { 1.1792 + /* convert the direct byte character */ 1.1793 + c1=(uint8_t)i; 1.1794 + ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode); 1.1795 + if(U_FAILURE(*pErrorCode)) { 1.1796 + udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n", 1.1797 + i, ds->inCharset); 1.1798 + return; 1.1799 + } 1.1800 + 1.1801 + /* enter the converted character into the map and mark it used */ 1.1802 + map[c1]=c2; 1.1803 + usedOutChar[c2]=TRUE; 1.1804 + } 1.1805 + } 1.1806 + 1.1807 + /* set the mappings for the rest of the permutation */ 1.1808 + for(i=j=1; i<tokenCount; ++i) { 1.1809 + /* set mappings that were not set for direct bytes */ 1.1810 + if(map[i]==0) { 1.1811 + /* set an output byte value that was not used as an output byte above */ 1.1812 + while(usedOutChar[j]) { 1.1813 + ++j; 1.1814 + } 1.1815 + map[i]=(uint8_t)j++; 1.1816 + } 1.1817 + } 1.1818 + 1.1819 + /* 1.1820 + * leave mappings at tokenCount and above unset if tokenCount<256 1.1821 + * because they won't be used 1.1822 + */ 1.1823 + } 1.1824 +} 1.1825 + 1.1826 +U_CAPI int32_t U_EXPORT2 1.1827 +uchar_swapNames(const UDataSwapper *ds, 1.1828 + const void *inData, int32_t length, void *outData, 1.1829 + UErrorCode *pErrorCode) { 1.1830 + const UDataInfo *pInfo; 1.1831 + int32_t headerSize; 1.1832 + 1.1833 + const uint8_t *inBytes; 1.1834 + uint8_t *outBytes; 1.1835 + 1.1836 + uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset, 1.1837 + offset, i, count, stringsCount; 1.1838 + 1.1839 + const AlgorithmicRange *inRange; 1.1840 + AlgorithmicRange *outRange; 1.1841 + 1.1842 + /* udata_swapDataHeader checks the arguments */ 1.1843 + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 1.1844 + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1.1845 + return 0; 1.1846 + } 1.1847 + 1.1848 + /* check data format and format version */ 1.1849 + pInfo=(const UDataInfo *)((const char *)inData+4); 1.1850 + if(!( 1.1851 + pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ 1.1852 + pInfo->dataFormat[1]==0x6e && 1.1853 + pInfo->dataFormat[2]==0x61 && 1.1854 + pInfo->dataFormat[3]==0x6d && 1.1855 + pInfo->formatVersion[0]==1 1.1856 + )) { 1.1857 + udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n", 1.1858 + pInfo->dataFormat[0], pInfo->dataFormat[1], 1.1859 + pInfo->dataFormat[2], pInfo->dataFormat[3], 1.1860 + pInfo->formatVersion[0]); 1.1861 + *pErrorCode=U_UNSUPPORTED_ERROR; 1.1862 + return 0; 1.1863 + } 1.1864 + 1.1865 + inBytes=(const uint8_t *)inData+headerSize; 1.1866 + outBytes=(uint8_t *)outData+headerSize; 1.1867 + if(length<0) { 1.1868 + algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]); 1.1869 + } else { 1.1870 + length-=headerSize; 1.1871 + if( length<20 || 1.1872 + (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3])) 1.1873 + ) { 1.1874 + udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n", 1.1875 + length); 1.1876 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.1877 + return 0; 1.1878 + } 1.1879 + } 1.1880 + 1.1881 + if(length<0) { 1.1882 + /* preflighting: iterate through algorithmic ranges */ 1.1883 + offset=algNamesOffset; 1.1884 + count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); 1.1885 + offset+=4; 1.1886 + 1.1887 + for(i=0; i<count; ++i) { 1.1888 + inRange=(const AlgorithmicRange *)(inBytes+offset); 1.1889 + offset+=ds->readUInt16(inRange->size); 1.1890 + } 1.1891 + } else { 1.1892 + /* swap data */ 1.1893 + const uint16_t *p; 1.1894 + uint16_t *q, *temp; 1.1895 + 1.1896 + int16_t tokens[512]; 1.1897 + uint16_t tokenCount; 1.1898 + 1.1899 + uint8_t map[256], trailMap[256]; 1.1900 + 1.1901 + /* copy the data for inaccessible bytes */ 1.1902 + if(inBytes!=outBytes) { 1.1903 + uprv_memcpy(outBytes, inBytes, length); 1.1904 + } 1.1905 + 1.1906 + /* the initial 4 offsets first */ 1.1907 + tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]); 1.1908 + groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]); 1.1909 + groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]); 1.1910 + ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode); 1.1911 + 1.1912 + /* 1.1913 + * now the tokens table 1.1914 + * it needs to be permutated along with the compressed name strings 1.1915 + */ 1.1916 + p=(const uint16_t *)(inBytes+16); 1.1917 + q=(uint16_t *)(outBytes+16); 1.1918 + 1.1919 + /* read and swap the tokenCount */ 1.1920 + tokenCount=ds->readUInt16(*p); 1.1921 + ds->swapArray16(ds, p, 2, q, pErrorCode); 1.1922 + ++p; 1.1923 + ++q; 1.1924 + 1.1925 + /* read the first 512 tokens and make the token maps */ 1.1926 + if(tokenCount<=512) { 1.1927 + count=tokenCount; 1.1928 + } else { 1.1929 + count=512; 1.1930 + } 1.1931 + for(i=0; i<count; ++i) { 1.1932 + tokens[i]=udata_readInt16(ds, p[i]); 1.1933 + } 1.1934 + for(; i<512; ++i) { 1.1935 + tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */ 1.1936 + } 1.1937 + makeTokenMap(ds, tokens, tokenCount, map, pErrorCode); 1.1938 + makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode); 1.1939 + if(U_FAILURE(*pErrorCode)) { 1.1940 + return 0; 1.1941 + } 1.1942 + 1.1943 + /* 1.1944 + * swap and permutate the tokens 1.1945 + * go through a temporary array to support in-place swapping 1.1946 + */ 1.1947 + temp=(uint16_t *)uprv_malloc(tokenCount*2); 1.1948 + if(temp==NULL) { 1.1949 + udata_printError(ds, "out of memory swapping %u unames.icu tokens\n", 1.1950 + tokenCount); 1.1951 + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1.1952 + return 0; 1.1953 + } 1.1954 + 1.1955 + /* swap and permutate single-/lead-byte tokens */ 1.1956 + for(i=0; i<tokenCount && i<256; ++i) { 1.1957 + ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode); 1.1958 + } 1.1959 + 1.1960 + /* swap and permutate trail-byte tokens */ 1.1961 + for(; i<tokenCount; ++i) { 1.1962 + ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode); 1.1963 + } 1.1964 + 1.1965 + /* copy the result into the output and free the temporary array */ 1.1966 + uprv_memcpy(q, temp, tokenCount*2); 1.1967 + uprv_free(temp); 1.1968 + 1.1969 + /* 1.1970 + * swap the token strings but not a possible padding byte after 1.1971 + * the terminating NUL of the last string 1.1972 + */ 1.1973 + udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset), 1.1974 + outBytes+tokenStringOffset, pErrorCode); 1.1975 + if(U_FAILURE(*pErrorCode)) { 1.1976 + udata_printError(ds, "uchar_swapNames(token strings) failed\n"); 1.1977 + return 0; 1.1978 + } 1.1979 + 1.1980 + /* swap the group table */ 1.1981 + count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset))); 1.1982 + ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2), 1.1983 + outBytes+groupsOffset, pErrorCode); 1.1984 + 1.1985 + /* 1.1986 + * swap the group strings 1.1987 + * swap the string bytes but not the nibble-encoded string lengths 1.1988 + */ 1.1989 + if(ds->inCharset!=ds->outCharset) { 1.1990 + uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1]; 1.1991 + 1.1992 + const uint8_t *inStrings, *nextInStrings; 1.1993 + uint8_t *outStrings; 1.1994 + 1.1995 + uint8_t c; 1.1996 + 1.1997 + inStrings=inBytes+groupStringOffset; 1.1998 + outStrings=outBytes+groupStringOffset; 1.1999 + 1.2000 + stringsCount=algNamesOffset-groupStringOffset; 1.2001 + 1.2002 + /* iterate through string groups until only a few padding bytes are left */ 1.2003 + while(stringsCount>32) { 1.2004 + nextInStrings=expandGroupLengths(inStrings, offsets, lengths); 1.2005 + 1.2006 + /* move past the length bytes */ 1.2007 + stringsCount-=(uint32_t)(nextInStrings-inStrings); 1.2008 + outStrings+=nextInStrings-inStrings; 1.2009 + inStrings=nextInStrings; 1.2010 + 1.2011 + count=offsets[31]+lengths[31]; /* total number of string bytes in this group */ 1.2012 + stringsCount-=count; 1.2013 + 1.2014 + /* swap the string bytes using map[] and trailMap[] */ 1.2015 + while(count>0) { 1.2016 + c=*inStrings++; 1.2017 + *outStrings++=map[c]; 1.2018 + if(tokens[c]!=-2) { 1.2019 + --count; 1.2020 + } else { 1.2021 + /* token lead byte: swap the trail byte, too */ 1.2022 + *outStrings++=trailMap[*inStrings++]; 1.2023 + count-=2; 1.2024 + } 1.2025 + } 1.2026 + } 1.2027 + } 1.2028 + 1.2029 + /* swap the algorithmic ranges */ 1.2030 + offset=algNamesOffset; 1.2031 + count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); 1.2032 + ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode); 1.2033 + offset+=4; 1.2034 + 1.2035 + for(i=0; i<count; ++i) { 1.2036 + if(offset>(uint32_t)length) { 1.2037 + udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n", 1.2038 + length, i); 1.2039 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.2040 + return 0; 1.2041 + } 1.2042 + 1.2043 + inRange=(const AlgorithmicRange *)(inBytes+offset); 1.2044 + outRange=(AlgorithmicRange *)(outBytes+offset); 1.2045 + offset+=ds->readUInt16(inRange->size); 1.2046 + 1.2047 + ds->swapArray32(ds, inRange, 8, outRange, pErrorCode); 1.2048 + ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode); 1.2049 + switch(inRange->type) { 1.2050 + case 0: 1.2051 + /* swap prefix string */ 1.2052 + ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)), 1.2053 + outRange+1, pErrorCode); 1.2054 + if(U_FAILURE(*pErrorCode)) { 1.2055 + udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n", 1.2056 + i); 1.2057 + return 0; 1.2058 + } 1.2059 + break; 1.2060 + case 1: 1.2061 + { 1.2062 + /* swap factors and the prefix and factor strings */ 1.2063 + uint32_t factorsCount; 1.2064 + 1.2065 + factorsCount=inRange->variant; 1.2066 + p=(const uint16_t *)(inRange+1); 1.2067 + q=(uint16_t *)(outRange+1); 1.2068 + ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode); 1.2069 + 1.2070 + /* swap the strings, up to the last terminating NUL */ 1.2071 + p+=factorsCount; 1.2072 + q+=factorsCount; 1.2073 + stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p); 1.2074 + while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) { 1.2075 + --stringsCount; 1.2076 + } 1.2077 + ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode); 1.2078 + } 1.2079 + break; 1.2080 + default: 1.2081 + udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n", 1.2082 + inRange->type, i); 1.2083 + *pErrorCode=U_UNSUPPORTED_ERROR; 1.2084 + return 0; 1.2085 + } 1.2086 + } 1.2087 + } 1.2088 + 1.2089 + return headerSize+(int32_t)offset; 1.2090 +} 1.2091 + 1.2092 +U_NAMESPACE_END 1.2093 + 1.2094 +/* 1.2095 + * Hey, Emacs, please set the following: 1.2096 + * 1.2097 + * Local Variables: 1.2098 + * indent-tabs-mode: nil 1.2099 + * End: 1.2100 + * 1.2101 + */