michael@0: /* michael@0: ****************************************************************************** michael@0: * michael@0: * Copyright (C) 1999-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ****************************************************************************** michael@0: * file name: unames.c michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 1999oct04 michael@0: * created by: Markus W. Scherer michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/putil.h" michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/udata.h" michael@0: #include "unicode/utf.h" michael@0: #include "unicode/utf16.h" michael@0: #include "uassert.h" michael@0: #include "ustr_imp.h" michael@0: #include "umutex.h" michael@0: #include "cmemory.h" michael@0: #include "cstring.h" michael@0: #include "ucln_cmn.h" michael@0: #include "udataswp.h" michael@0: #include "uprops.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: /* prototypes ------------------------------------------------------------- */ michael@0: michael@0: #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) michael@0: michael@0: static const char DATA_NAME[] = "unames"; michael@0: static const char DATA_TYPE[] = "icu"; michael@0: michael@0: #define GROUP_SHIFT 5 michael@0: #define LINES_PER_GROUP (1L<groupsOffset) michael@0: michael@0: typedef struct { michael@0: const char *otherName; michael@0: UChar32 code; michael@0: } FindName; michael@0: michael@0: #define DO_FIND_NAME NULL michael@0: michael@0: static UDataMemory *uCharNamesData=NULL; michael@0: static UCharNames *uCharNames=NULL; michael@0: static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER; michael@0: michael@0: /* michael@0: * Maximum length of character names (regular & 1.0). michael@0: */ michael@0: static int32_t gMaxNameLength=0; michael@0: michael@0: /* michael@0: * Set of chars used in character names (regular & 1.0). michael@0: * Chars are platform-dependent (can be EBCDIC). michael@0: */ michael@0: static uint32_t gNameSet[8]={ 0 }; michael@0: michael@0: #define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT michael@0: #define U_LEAD_SURROGATE U_CHAR_CATEGORY_COUNT + 1 michael@0: #define U_TRAIL_SURROGATE U_CHAR_CATEGORY_COUNT + 2 michael@0: michael@0: #define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3) michael@0: michael@0: static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = { michael@0: "unassigned", michael@0: "uppercase letter", michael@0: "lowercase letter", michael@0: "titlecase letter", michael@0: "modifier letter", michael@0: "other letter", michael@0: "non spacing mark", michael@0: "enclosing mark", michael@0: "combining spacing mark", michael@0: "decimal digit number", michael@0: "letter number", michael@0: "other number", michael@0: "space separator", michael@0: "line separator", michael@0: "paragraph separator", michael@0: "control", michael@0: "format", michael@0: "private use area", michael@0: "surrogate", michael@0: "dash punctuation", michael@0: "start punctuation", michael@0: "end punctuation", michael@0: "connector punctuation", michael@0: "other punctuation", michael@0: "math symbol", michael@0: "currency symbol", michael@0: "modifier symbol", michael@0: "other symbol", michael@0: "initial punctuation", michael@0: "final punctuation", michael@0: "noncharacter", michael@0: "lead surrogate", michael@0: "trail surrogate" michael@0: }; michael@0: michael@0: /* implementation ----------------------------------------------------------- */ michael@0: michael@0: static UBool U_CALLCONV unames_cleanup(void) michael@0: { michael@0: if(uCharNamesData) { michael@0: udata_close(uCharNamesData); michael@0: uCharNamesData = NULL; michael@0: } michael@0: if(uCharNames) { michael@0: uCharNames = NULL; michael@0: } michael@0: gCharNamesInitOnce.reset(); michael@0: gMaxNameLength=0; michael@0: return TRUE; michael@0: } michael@0: michael@0: static UBool U_CALLCONV michael@0: isAcceptable(void * /*context*/, michael@0: const char * /*type*/, const char * /*name*/, michael@0: const UDataInfo *pInfo) { michael@0: return (UBool)( michael@0: pInfo->size>=20 && michael@0: pInfo->isBigEndian==U_IS_BIG_ENDIAN && michael@0: pInfo->charsetFamily==U_CHARSET_FAMILY && michael@0: pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ michael@0: pInfo->dataFormat[1]==0x6e && michael@0: pInfo->dataFormat[2]==0x61 && michael@0: pInfo->dataFormat[3]==0x6d && michael@0: pInfo->formatVersion[0]==1); michael@0: } michael@0: michael@0: static void U_CALLCONV michael@0: loadCharNames(UErrorCode &status) { michael@0: U_ASSERT(uCharNamesData == NULL); michael@0: U_ASSERT(uCharNames == NULL); michael@0: michael@0: uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status); michael@0: if(U_FAILURE(status)) { michael@0: uCharNamesData = NULL; michael@0: } else { michael@0: uCharNames = (UCharNames *)udata_getMemory(uCharNamesData); michael@0: } michael@0: ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup); michael@0: } michael@0: michael@0: michael@0: static UBool michael@0: isDataLoaded(UErrorCode *pErrorCode) { michael@0: umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode); michael@0: return U_SUCCESS(*pErrorCode); michael@0: } michael@0: michael@0: #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \ michael@0: if((bufferLength)>0) { \ michael@0: *(buffer)++=c; \ michael@0: --(bufferLength); \ michael@0: } \ michael@0: ++(bufferPos); \ michael@0: } michael@0: michael@0: #define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT michael@0: michael@0: /* michael@0: * Important: expandName() and compareName() are almost the same - michael@0: * apply fixes to both. michael@0: * michael@0: * UnicodeData.txt uses ';' as a field separator, so no michael@0: * field can contain ';' as part of its contents. michael@0: * In unames.dat, it is marked as token[';']==-1 only if the michael@0: * semicolon is used in the data file - which is iff we michael@0: * have Unicode 1.0 names or ISO comments or aliases. michael@0: * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases michael@0: * although we know that it will never be part of a name. michael@0: */ michael@0: static uint16_t michael@0: expandName(UCharNames *names, michael@0: const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, michael@0: char *buffer, uint16_t bufferLength) { michael@0: uint16_t *tokens=(uint16_t *)names+8; michael@0: uint16_t token, tokenCount=*tokens++, bufferPos=0; michael@0: uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; michael@0: uint8_t c; michael@0: michael@0: if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { michael@0: /* michael@0: * skip the modern name if it is not requested _and_ michael@0: * if the semicolon byte value is a character, not a token number michael@0: */ michael@0: if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { michael@0: int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; michael@0: do { michael@0: while(nameLength>0) { michael@0: --nameLength; michael@0: if(*name++==';') { michael@0: break; michael@0: } michael@0: } michael@0: } while(--fieldIndex>0); michael@0: } else { michael@0: /* michael@0: * the semicolon byte value is a token number, therefore michael@0: * only modern names are stored in unames.dat and there is no michael@0: * such requested alternate name here michael@0: */ michael@0: nameLength=0; michael@0: } michael@0: } michael@0: michael@0: /* write each letter directly, and write a token word per token */ michael@0: while(nameLength>0) { michael@0: --nameLength; michael@0: c=*name++; michael@0: michael@0: if(c>=tokenCount) { michael@0: if(c!=';') { michael@0: /* implicit letter */ michael@0: WRITE_CHAR(buffer, bufferLength, bufferPos, c); michael@0: } else { michael@0: /* finished */ michael@0: break; michael@0: } michael@0: } else { michael@0: token=tokens[c]; michael@0: if(token==(uint16_t)(-2)) { michael@0: /* this is a lead byte for a double-byte token */ michael@0: token=tokens[c<<8|*name++]; michael@0: --nameLength; michael@0: } michael@0: if(token==(uint16_t)(-1)) { michael@0: if(c!=';') { michael@0: /* explicit letter */ michael@0: WRITE_CHAR(buffer, bufferLength, bufferPos, c); michael@0: } else { michael@0: /* stop, but skip the semicolon if we are seeking michael@0: extended names and there was no 2.0 name but there michael@0: is a 1.0 name. */ michael@0: if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) { michael@0: if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { michael@0: continue; michael@0: } michael@0: } michael@0: /* finished */ michael@0: break; michael@0: } michael@0: } else { michael@0: /* write token word */ michael@0: uint8_t *tokenString=tokenStrings+token; michael@0: while((c=*tokenString++)!=0) { michael@0: WRITE_CHAR(buffer, bufferLength, bufferPos, c); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* zero-terminate */ michael@0: if(bufferLength>0) { michael@0: *buffer=0; michael@0: } michael@0: michael@0: return bufferPos; michael@0: } michael@0: michael@0: /* michael@0: * compareName() is almost the same as expandName() except that it compares michael@0: * the currently expanded name to an input name. michael@0: * It returns the match/no match result as soon as possible. michael@0: */ michael@0: static UBool michael@0: compareName(UCharNames *names, michael@0: const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, michael@0: const char *otherName) { michael@0: uint16_t *tokens=(uint16_t *)names+8; michael@0: uint16_t token, tokenCount=*tokens++; michael@0: uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; michael@0: uint8_t c; michael@0: const char *origOtherName = otherName; michael@0: michael@0: if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { michael@0: /* michael@0: * skip the modern name if it is not requested _and_ michael@0: * if the semicolon byte value is a character, not a token number michael@0: */ michael@0: if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { michael@0: int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; michael@0: do { michael@0: while(nameLength>0) { michael@0: --nameLength; michael@0: if(*name++==';') { michael@0: break; michael@0: } michael@0: } michael@0: } while(--fieldIndex>0); michael@0: } else { michael@0: /* michael@0: * the semicolon byte value is a token number, therefore michael@0: * only modern names are stored in unames.dat and there is no michael@0: * such requested alternate name here michael@0: */ michael@0: nameLength=0; michael@0: } michael@0: } michael@0: michael@0: /* compare each letter directly, and compare a token word per token */ michael@0: while(nameLength>0) { michael@0: --nameLength; michael@0: c=*name++; michael@0: michael@0: if(c>=tokenCount) { michael@0: if(c!=';') { michael@0: /* implicit letter */ michael@0: if((char)c!=*otherName++) { michael@0: return FALSE; michael@0: } michael@0: } else { michael@0: /* finished */ michael@0: break; michael@0: } michael@0: } else { michael@0: token=tokens[c]; michael@0: if(token==(uint16_t)(-2)) { michael@0: /* this is a lead byte for a double-byte token */ michael@0: token=tokens[c<<8|*name++]; michael@0: --nameLength; michael@0: } michael@0: if(token==(uint16_t)(-1)) { michael@0: if(c!=';') { michael@0: /* explicit letter */ michael@0: if((char)c!=*otherName++) { michael@0: return FALSE; michael@0: } michael@0: } else { michael@0: /* stop, but skip the semicolon if we are seeking michael@0: extended names and there was no 2.0 name but there michael@0: is a 1.0 name. */ michael@0: if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) { michael@0: if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { michael@0: continue; michael@0: } michael@0: } michael@0: /* finished */ michael@0: break; michael@0: } michael@0: } else { michael@0: /* write token word */ michael@0: uint8_t *tokenString=tokenStrings+token; michael@0: while((c=*tokenString++)!=0) { michael@0: if((char)c!=*otherName++) { michael@0: return FALSE; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* complete match? */ michael@0: return (UBool)(*otherName==0); michael@0: } michael@0: michael@0: static uint8_t getCharCat(UChar32 cp) { michael@0: uint8_t cat; michael@0: michael@0: if (U_IS_UNICODE_NONCHAR(cp)) { michael@0: return U_NONCHARACTER_CODE_POINT; michael@0: } michael@0: michael@0: if ((cat = u_charType(cp)) == U_SURROGATE) { michael@0: cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE; michael@0: } michael@0: michael@0: return cat; michael@0: } michael@0: michael@0: static const char *getCharCatName(UChar32 cp) { michael@0: uint8_t cat = getCharCat(cp); michael@0: michael@0: /* Return unknown if the table of names above is not up to michael@0: date. */ michael@0: michael@0: if (cat >= LENGTHOF(charCatNames)) { michael@0: return "unknown"; michael@0: } else { michael@0: return charCatNames[cat]; michael@0: } michael@0: } michael@0: michael@0: static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) { michael@0: const char *catname = getCharCatName(code); michael@0: uint16_t length = 0; michael@0: michael@0: UChar32 cp; michael@0: int ndigits, i; michael@0: michael@0: WRITE_CHAR(buffer, bufferLength, length, '<'); michael@0: while (catname[length - 1]) { michael@0: WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]); michael@0: } michael@0: WRITE_CHAR(buffer, bufferLength, length, '-'); michael@0: for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4) michael@0: ; michael@0: if (ndigits < 4) michael@0: ndigits = 4; michael@0: for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) { michael@0: uint8_t v = (uint8_t)(cp & 0xf); michael@0: buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10); michael@0: } michael@0: buffer += ndigits; michael@0: length += ndigits; michael@0: WRITE_CHAR(buffer, bufferLength, length, '>'); michael@0: michael@0: return length; michael@0: } michael@0: michael@0: /* michael@0: * getGroup() does a binary search for the group that contains the michael@0: * Unicode code point "code". michael@0: * The return value is always a valid Group* that may contain "code" michael@0: * or else is the highest group before "code". michael@0: * If the lowest group is after "code", then that one is returned. michael@0: */ michael@0: static const uint16_t * michael@0: getGroup(UCharNames *names, uint32_t code) { michael@0: const uint16_t *groups=GET_GROUPS(names); michael@0: uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT), michael@0: start=0, michael@0: limit=*groups++, michael@0: number; michael@0: michael@0: /* binary search for the group of names that contains the one for code */ michael@0: while(start=0xc, then it forms a length value with the following nibble. michael@0: * Calculation see below. michael@0: * The offsets and lengths arrays must be at least 33 (one more) long because michael@0: * there is no check here at the end if the last nibble is still used. michael@0: */ michael@0: static const uint8_t * michael@0: expandGroupLengths(const uint8_t *s, michael@0: uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) { michael@0: /* read the lengths of the 32 strings in this group and get each string's offset */ michael@0: uint16_t i=0, offset=0, length=0; michael@0: uint8_t lengthByte; michael@0: michael@0: /* all 32 lengths must be read to get the offset of the first group string */ michael@0: while(i=12) { michael@0: /* double-nibble length spread across two bytes */ michael@0: length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12); michael@0: lengthByte&=0xf; michael@0: } else if((lengthByte /* &0xf0 */)>=0xc0) { michael@0: /* double-nibble length spread across this one byte */ michael@0: length=(uint16_t)((lengthByte&0x3f)+12); michael@0: } else { michael@0: /* single-nibble length in MSBs */ michael@0: length=(uint16_t)(lengthByte>>4); michael@0: lengthByte&=0xf; michael@0: } michael@0: michael@0: *offsets++=offset; michael@0: *lengths++=length; michael@0: michael@0: offset+=length; michael@0: ++i; michael@0: michael@0: /* read odd nibble - LSBs of lengthByte */ michael@0: if((lengthByte&0xf0)==0) { michael@0: /* this nibble was not consumed for a double-nibble length above */ michael@0: length=lengthByte; michael@0: if(length<12) { michael@0: /* single-nibble length in LSBs */ michael@0: *offsets++=offset; michael@0: *lengths++=length; michael@0: michael@0: offset+=length; michael@0: ++i; michael@0: } michael@0: } else { michael@0: length=0; /* prevent double-nibble detection in the next iteration */ michael@0: } michael@0: } michael@0: michael@0: /* now, s is at the first group string */ michael@0: return s; michael@0: } michael@0: michael@0: static uint16_t michael@0: expandGroupName(UCharNames *names, const uint16_t *group, michael@0: uint16_t lineNumber, UCharNameChoice nameChoice, michael@0: char *buffer, uint16_t bufferLength) { michael@0: uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; michael@0: const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); michael@0: s=expandGroupLengths(s, offsets, lengths); michael@0: return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice, michael@0: buffer, bufferLength); michael@0: } michael@0: michael@0: static uint16_t michael@0: getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice, michael@0: char *buffer, uint16_t bufferLength) { michael@0: const uint16_t *group=getGroup(names, code); michael@0: if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) { michael@0: return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice, michael@0: buffer, bufferLength); michael@0: } else { michael@0: /* group not found */ michael@0: /* zero-terminate */ michael@0: if(bufferLength>0) { michael@0: *buffer=0; michael@0: } michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * enumGroupNames() enumerates all the names in a 32-group michael@0: * and either calls the enumerator function or finds a given input name. michael@0: */ michael@0: static UBool michael@0: enumGroupNames(UCharNames *names, const uint16_t *group, michael@0: UChar32 start, UChar32 end, michael@0: UEnumCharNamesFn *fn, void *context, michael@0: UCharNameChoice nameChoice) { michael@0: uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; michael@0: const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); michael@0: michael@0: s=expandGroupLengths(s, offsets, lengths); michael@0: if(fn!=DO_FIND_NAME) { michael@0: char buffer[200]; michael@0: uint16_t length; michael@0: michael@0: while(start<=end) { michael@0: length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer)); michael@0: if (!length && nameChoice == U_EXTENDED_CHAR_NAME) { michael@0: buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; michael@0: } michael@0: /* here, we assume that the buffer is large enough */ michael@0: if(length>0) { michael@0: if(!fn(context, start, nameChoice, buffer, length)) { michael@0: return FALSE; michael@0: } michael@0: } michael@0: ++start; michael@0: } michael@0: } else { michael@0: const char *otherName=((FindName *)context)->otherName; michael@0: while(start<=end) { michael@0: if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) { michael@0: ((FindName *)context)->code=start; michael@0: return FALSE; michael@0: } michael@0: ++start; michael@0: } michael@0: } michael@0: return TRUE; michael@0: } michael@0: michael@0: /* michael@0: * enumExtNames enumerate extended names. michael@0: * It only needs to do it if it is called with a real function and not michael@0: * with the dummy DO_FIND_NAME, because u_charFromName() does a check michael@0: * for extended names by itself. michael@0: */ michael@0: static UBool michael@0: enumExtNames(UChar32 start, UChar32 end, michael@0: UEnumCharNamesFn *fn, void *context) michael@0: { michael@0: if(fn!=DO_FIND_NAME) { michael@0: char buffer[200]; michael@0: uint16_t length; michael@0: michael@0: while(start<=end) { michael@0: buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; michael@0: /* here, we assume that the buffer is large enough */ michael@0: if(length>0) { michael@0: if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) { michael@0: return FALSE; michael@0: } michael@0: } michael@0: ++start; michael@0: } michael@0: } michael@0: michael@0: return TRUE; michael@0: } michael@0: michael@0: static UBool michael@0: enumNames(UCharNames *names, michael@0: UChar32 start, UChar32 limit, michael@0: UEnumCharNamesFn *fn, void *context, michael@0: UCharNameChoice nameChoice) { michael@0: uint16_t startGroupMSB, endGroupMSB, groupCount; michael@0: const uint16_t *group, *groupLimit; michael@0: michael@0: startGroupMSB=(uint16_t)(start>>GROUP_SHIFT); michael@0: endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT); michael@0: michael@0: /* find the group that contains start, or the highest before it */ michael@0: group=getGroup(names, start); michael@0: michael@0: if(startGroupMSBlimit) { michael@0: extLimit=limit; michael@0: } michael@0: if(!enumExtNames(start, extLimit-1, fn, context)) { michael@0: return FALSE; michael@0: } michael@0: start=extLimit; michael@0: } michael@0: michael@0: if(startGroupMSB==endGroupMSB) { michael@0: if(startGroupMSB==group[GROUP_MSB]) { michael@0: /* if start and limit-1 are in the same group, then enumerate only in that one */ michael@0: return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice); michael@0: } michael@0: } else { michael@0: const uint16_t *groups=GET_GROUPS(names); michael@0: groupCount=*groups++; michael@0: groupLimit=groups+groupCount*GROUP_LENGTH; michael@0: michael@0: if(startGroupMSB==group[GROUP_MSB]) { michael@0: /* enumerate characters in the partial start group */ michael@0: if((start&GROUP_MASK)!=0) { michael@0: if(!enumGroupNames(names, group, michael@0: start, ((UChar32)startGroupMSB<group[GROUP_MSB]) { michael@0: /* make sure that we start enumerating with the first group after start */ michael@0: const uint16_t *nextGroup=NEXT_GROUP(group); michael@0: if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) { michael@0: UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; michael@0: if (end > limit) { michael@0: end = limit; michael@0: } michael@0: if (!enumExtNames(start, end - 1, fn, context)) { michael@0: return FALSE; michael@0: } michael@0: } michael@0: group=nextGroup; michael@0: } michael@0: michael@0: /* enumerate entire groups between the start- and end-groups */ michael@0: while(group group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) { michael@0: UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; michael@0: if (end > limit) { michael@0: end = limit; michael@0: } michael@0: if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) { michael@0: return FALSE; michael@0: } michael@0: } michael@0: group=nextGroup; michael@0: } michael@0: michael@0: /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */ michael@0: if(group start) { michael@0: start = next; michael@0: } michael@0: } else { michael@0: return TRUE; michael@0: } michael@0: } michael@0: michael@0: /* we have not found a group, which means everything is made of michael@0: extended names. */ michael@0: if (nameChoice == U_EXTENDED_CHAR_NAME) { michael@0: if (limit > UCHAR_MAX_VALUE + 1) { michael@0: limit = UCHAR_MAX_VALUE + 1; michael@0: } michael@0: return enumExtNames(start, limit - 1, fn, context); michael@0: } michael@0: michael@0: return TRUE; michael@0: } michael@0: michael@0: static uint16_t michael@0: writeFactorSuffix(const uint16_t *factors, uint16_t count, michael@0: const char *s, /* suffix elements */ michael@0: uint32_t code, michael@0: uint16_t indexes[8], /* output fields from here */ michael@0: const char *elementBases[8], const char *elements[8], michael@0: char *buffer, uint16_t bufferLength) { michael@0: uint16_t i, factor, bufferPos=0; michael@0: char c; michael@0: michael@0: /* write elements according to the factors */ michael@0: michael@0: /* michael@0: * the factorized elements are determined by modulo arithmetic michael@0: * with the factors of this algorithm michael@0: * michael@0: * note that for fewer operations, count is decremented here michael@0: */ michael@0: --count; michael@0: for(i=count; i>0; --i) { michael@0: factor=factors[i]; michael@0: indexes[i]=(uint16_t)(code%factor); michael@0: code/=factor; michael@0: } michael@0: /* michael@0: * we don't need to calculate the last modulus because start<=code<=end michael@0: * guarantees here that code<=factors[0] michael@0: */ michael@0: indexes[0]=(uint16_t)code; michael@0: michael@0: /* write each element */ michael@0: for(;;) { michael@0: if(elementBases!=NULL) { michael@0: *elementBases++=s; michael@0: } michael@0: michael@0: /* skip indexes[i] strings */ michael@0: factor=indexes[i]; michael@0: while(factor>0) { michael@0: while(*s++!=0) {} michael@0: --factor; michael@0: } michael@0: if(elements!=NULL) { michael@0: *elements++=s; michael@0: } michael@0: michael@0: /* write element */ michael@0: while((c=*s++)!=0) { michael@0: WRITE_CHAR(buffer, bufferLength, bufferPos, c); michael@0: } michael@0: michael@0: /* we do not need to perform the rest of this loop for i==count - break here */ michael@0: if(i>=count) { michael@0: break; michael@0: } michael@0: michael@0: /* skip the rest of the strings for this factors[i] */ michael@0: factor=(uint16_t)(factors[i]-indexes[i]-1); michael@0: while(factor>0) { michael@0: while(*s++!=0) {} michael@0: --factor; michael@0: } michael@0: michael@0: ++i; michael@0: } michael@0: michael@0: /* zero-terminate */ michael@0: if(bufferLength>0) { michael@0: *buffer=0; michael@0: } michael@0: michael@0: return bufferPos; michael@0: } michael@0: michael@0: /* michael@0: * Important: michael@0: * Parts of findAlgName() are almost the same as some of getAlgName(). michael@0: * Fixes must be applied to both. michael@0: */ michael@0: static uint16_t michael@0: getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice, michael@0: char *buffer, uint16_t bufferLength) { michael@0: uint16_t bufferPos=0; michael@0: michael@0: /* Only the normative character name can be algorithmic. */ michael@0: if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { michael@0: /* zero-terminate */ michael@0: if(bufferLength>0) { michael@0: *buffer=0; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: switch(range->type) { michael@0: case 0: { michael@0: /* name = prefix hex-digits */ michael@0: const char *s=(const char *)(range+1); michael@0: char c; michael@0: michael@0: uint16_t i, count; michael@0: michael@0: /* copy prefix */ michael@0: while((c=*s++)!=0) { michael@0: WRITE_CHAR(buffer, bufferLength, bufferPos, c); michael@0: } michael@0: michael@0: /* write hexadecimal code point value */ michael@0: count=range->variant; michael@0: michael@0: /* zero-terminate */ michael@0: if(count0;) { michael@0: if(--i>=4; michael@0: } michael@0: michael@0: bufferPos+=count; michael@0: break; michael@0: } michael@0: case 1: { michael@0: /* name = prefix factorized-elements */ michael@0: uint16_t indexes[8]; michael@0: const uint16_t *factors=(const uint16_t *)(range+1); michael@0: uint16_t count=range->variant; michael@0: const char *s=(const char *)(factors+count); michael@0: char c; michael@0: michael@0: /* copy prefix */ michael@0: while((c=*s++)!=0) { michael@0: WRITE_CHAR(buffer, bufferLength, bufferPos, c); michael@0: } michael@0: michael@0: bufferPos+=writeFactorSuffix(factors, count, michael@0: s, code-range->start, indexes, NULL, NULL, buffer, bufferLength); michael@0: break; michael@0: } michael@0: default: michael@0: /* undefined type */ michael@0: /* zero-terminate */ michael@0: if(bufferLength>0) { michael@0: *buffer=0; michael@0: } michael@0: break; michael@0: } michael@0: michael@0: return bufferPos; michael@0: } michael@0: michael@0: /* michael@0: * Important: enumAlgNames() and findAlgName() are almost the same. michael@0: * Any fix must be applied to both. michael@0: */ michael@0: static UBool michael@0: enumAlgNames(AlgorithmicRange *range, michael@0: UChar32 start, UChar32 limit, michael@0: UEnumCharNamesFn *fn, void *context, michael@0: UCharNameChoice nameChoice) { michael@0: char buffer[200]; michael@0: uint16_t length; michael@0: michael@0: if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { michael@0: return TRUE; michael@0: } michael@0: michael@0: switch(range->type) { michael@0: case 0: { michael@0: char *s, *end; michael@0: char c; michael@0: michael@0: /* get the full name of the start character */ michael@0: length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer)); michael@0: if(length<=0) { michael@0: return TRUE; michael@0: } michael@0: michael@0: /* call the enumerator function with this first character */ michael@0: if(!fn(context, start, nameChoice, buffer, length)) { michael@0: return FALSE; michael@0: } michael@0: michael@0: /* go to the end of the name; all these names have the same length */ michael@0: end=buffer; michael@0: while(*end!=0) { michael@0: ++end; michael@0: } michael@0: michael@0: /* enumerate the rest of the names */ michael@0: while(++startvariant; michael@0: const char *s=(const char *)(factors+count); michael@0: char *suffix, *t; michael@0: uint16_t prefixLength, i, idx; michael@0: michael@0: char c; michael@0: michael@0: /* name = prefix factorized-elements */ michael@0: michael@0: /* copy prefix */ michael@0: suffix=buffer; michael@0: prefixLength=0; michael@0: while((c=*s++)!=0) { michael@0: *suffix++=c; michael@0: ++prefixLength; michael@0: } michael@0: michael@0: /* append the suffix of the start character */ michael@0: length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count, michael@0: s, (uint32_t)start-range->start, michael@0: indexes, elementBases, elements, michael@0: suffix, (uint16_t)(sizeof(buffer)-prefixLength))); michael@0: michael@0: /* call the enumerator function with this first character */ michael@0: if(!fn(context, start, nameChoice, buffer, length)) { michael@0: return FALSE; michael@0: } michael@0: michael@0: /* enumerate the rest of the names */ michael@0: while(++starttype) { michael@0: case 0: { michael@0: /* name = prefix hex-digits */ michael@0: const char *s=(const char *)(range+1); michael@0: char c; michael@0: michael@0: uint16_t i, count; michael@0: michael@0: /* compare prefix */ michael@0: while((c=*s++)!=0) { michael@0: if((char)c!=*otherName++) { michael@0: return 0xffff; michael@0: } michael@0: } michael@0: michael@0: /* read hexadecimal code point value */ michael@0: count=range->variant; michael@0: code=0; michael@0: for(i=0; istart<=(uint32_t)code && (uint32_t)code<=range->end) { michael@0: return code; michael@0: } michael@0: break; michael@0: } michael@0: case 1: { michael@0: char buffer[64]; michael@0: uint16_t indexes[8]; michael@0: const char *elementBases[8], *elements[8]; michael@0: const uint16_t *factors=(const uint16_t *)(range+1); michael@0: uint16_t count=range->variant; michael@0: const char *s=(const char *)(factors+count), *t; michael@0: UChar32 start, limit; michael@0: uint16_t i, idx; michael@0: michael@0: char c; michael@0: michael@0: /* name = prefix factorized-elements */ michael@0: michael@0: /* compare prefix */ michael@0: while((c=*s++)!=0) { michael@0: if((char)c!=*otherName++) { michael@0: return 0xffff; michael@0: } michael@0: } michael@0: michael@0: start=(UChar32)range->start; michael@0: limit=(UChar32)(range->end+1); michael@0: michael@0: /* initialize the suffix elements for enumeration; indexes should all be set to 0 */ michael@0: writeFactorSuffix(factors, count, s, 0, michael@0: indexes, elementBases, elements, buffer, sizeof(buffer)); michael@0: michael@0: /* compare the first suffix */ michael@0: if(0==uprv_strcmp(otherName, buffer)) { michael@0: return start; michael@0: } michael@0: michael@0: /* enumerate and compare the rest of the suffixes */ michael@0: while(++start>5]|=((uint32_t)1<<((uint8_t)c&0x1f))) michael@0: #define SET_CONTAINS(set, c) (((set)[(uint8_t)c>>5]&((uint32_t)1<<((uint8_t)c&0x1f)))!=0) michael@0: michael@0: static int32_t michael@0: calcStringSetLength(uint32_t set[8], const char *s) { michael@0: int32_t length=0; michael@0: char c; michael@0: michael@0: while((c=*s++)!=0) { michael@0: SET_ADD(set, c); michael@0: ++length; michael@0: } michael@0: return length; michael@0: } michael@0: michael@0: static int32_t michael@0: calcAlgNameSetsLengths(int32_t maxNameLength) { michael@0: AlgorithmicRange *range; michael@0: uint32_t *p; michael@0: uint32_t rangeCount; michael@0: int32_t length; michael@0: michael@0: /* enumerate algorithmic ranges */ michael@0: p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); michael@0: rangeCount=*p; michael@0: range=(AlgorithmicRange *)(p+1); michael@0: while(rangeCount>0) { michael@0: switch(range->type) { michael@0: case 0: michael@0: /* name = prefix + (range->variant times) hex-digits */ michael@0: /* prefix */ michael@0: length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant; michael@0: if(length>maxNameLength) { michael@0: maxNameLength=length; michael@0: } michael@0: break; michael@0: case 1: { michael@0: /* name = prefix factorized-elements */ michael@0: const uint16_t *factors=(const uint16_t *)(range+1); michael@0: const char *s; michael@0: int32_t i, count=range->variant, factor, factorLength, maxFactorLength; michael@0: michael@0: /* prefix length */ michael@0: s=(const char *)(factors+count); michael@0: length=calcStringSetLength(gNameSet, s); michael@0: s+=length+1; /* start of factor suffixes */ michael@0: michael@0: /* get the set and maximum factor suffix length for each factor */ michael@0: for(i=0; i0; --factor) { michael@0: factorLength=calcStringSetLength(gNameSet, s); michael@0: s+=factorLength+1; michael@0: if(factorLength>maxFactorLength) { michael@0: maxFactorLength=factorLength; michael@0: } michael@0: } michael@0: length+=maxFactorLength; michael@0: } michael@0: michael@0: if(length>maxNameLength) { michael@0: maxNameLength=length; michael@0: } michael@0: break; michael@0: } michael@0: default: michael@0: /* unknown type */ michael@0: break; michael@0: } michael@0: michael@0: range=(AlgorithmicRange *)((uint8_t *)range+range->size); michael@0: --rangeCount; michael@0: } michael@0: return maxNameLength; michael@0: } michael@0: michael@0: static int32_t michael@0: calcExtNameSetsLengths(int32_t maxNameLength) { michael@0: int32_t i, length; michael@0: michael@0: for(i=0; i michael@0: * 1 for - michael@0: * 6 for most hex digits per code point michael@0: */ michael@0: length=9+calcStringSetLength(gNameSet, charCatNames[i]); michael@0: if(length>maxNameLength) { michael@0: maxNameLength=length; michael@0: } michael@0: } michael@0: return maxNameLength; michael@0: } michael@0: michael@0: static int32_t michael@0: calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths, michael@0: uint32_t set[8], michael@0: const uint8_t **pLine, const uint8_t *lineLimit) { michael@0: const uint8_t *line=*pLine; michael@0: int32_t length=0, tokenLength; michael@0: uint16_t c, token; michael@0: michael@0: while(line!=lineLimit && (c=*line++)!=(uint8_t)';') { michael@0: if(c>=tokenCount) { michael@0: /* implicit letter */ michael@0: SET_ADD(set, c); michael@0: ++length; michael@0: } else { michael@0: token=tokens[c]; michael@0: if(token==(uint16_t)(-2)) { michael@0: /* this is a lead byte for a double-byte token */ michael@0: c=c<<8|*line++; michael@0: token=tokens[c]; michael@0: } michael@0: if(token==(uint16_t)(-1)) { michael@0: /* explicit letter */ michael@0: SET_ADD(set, c); michael@0: ++length; michael@0: } else { michael@0: /* count token word */ michael@0: if(tokenLengths!=NULL) { michael@0: /* use cached token length */ michael@0: tokenLength=tokenLengths[c]; michael@0: if(tokenLength==0) { michael@0: tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); michael@0: tokenLengths[c]=(int8_t)tokenLength; michael@0: } michael@0: } else { michael@0: tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); michael@0: } michael@0: length+=tokenLength; michael@0: } michael@0: } michael@0: } michael@0: michael@0: *pLine=line; michael@0: return length; michael@0: } michael@0: michael@0: static void michael@0: calcGroupNameSetsLengths(int32_t maxNameLength) { michael@0: uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; michael@0: michael@0: uint16_t *tokens=(uint16_t *)uCharNames+8; michael@0: uint16_t tokenCount=*tokens++; michael@0: uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset; michael@0: michael@0: int8_t *tokenLengths; michael@0: michael@0: const uint16_t *group; michael@0: const uint8_t *s, *line, *lineLimit; michael@0: michael@0: int32_t groupCount, lineNumber, length; michael@0: michael@0: tokenLengths=(int8_t *)uprv_malloc(tokenCount); michael@0: if(tokenLengths!=NULL) { michael@0: uprv_memset(tokenLengths, 0, tokenCount); michael@0: } michael@0: michael@0: group=GET_GROUPS(uCharNames); michael@0: groupCount=*group++; michael@0: michael@0: /* enumerate all groups */ michael@0: while(groupCount>0) { michael@0: s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group); michael@0: s=expandGroupLengths(s, offsets, lengths); michael@0: michael@0: /* enumerate all lines in each group */ michael@0: for(lineNumber=0; lineNumbermaxNameLength) { michael@0: maxNameLength=length; michael@0: } michael@0: if(line==lineLimit) { michael@0: continue; michael@0: } michael@0: michael@0: /* read Unicode 1.0 name */ michael@0: length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); michael@0: if(length>maxNameLength) { michael@0: maxNameLength=length; michael@0: } michael@0: if(line==lineLimit) { michael@0: continue; michael@0: } michael@0: michael@0: /* read ISO comment */ michael@0: /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/ michael@0: } michael@0: michael@0: group=NEXT_GROUP(group); michael@0: --groupCount; michael@0: } michael@0: michael@0: if(tokenLengths!=NULL) { michael@0: uprv_free(tokenLengths); michael@0: } michael@0: michael@0: /* set gMax... - name length last for threading */ michael@0: gMaxNameLength=maxNameLength; michael@0: } michael@0: michael@0: static UBool michael@0: calcNameSetsLengths(UErrorCode *pErrorCode) { michael@0: static const char extChars[]="0123456789ABCDEF<>-"; michael@0: int32_t i, maxNameLength; michael@0: michael@0: if(gMaxNameLength!=0) { michael@0: return TRUE; michael@0: } michael@0: michael@0: if(!isDataLoaded(pErrorCode)) { michael@0: return FALSE; michael@0: } michael@0: michael@0: /* set hex digits, used in various names, and <>-, used in extended names */ michael@0: for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) { michael@0: SET_ADD(gNameSet, extChars[i]); michael@0: } michael@0: michael@0: /* set sets and lengths from algorithmic names */ michael@0: maxNameLength=calcAlgNameSetsLengths(0); michael@0: michael@0: /* set sets and lengths from extended names */ michael@0: maxNameLength=calcExtNameSetsLengths(maxNameLength); michael@0: michael@0: /* set sets and lengths from group names, set global maximum values */ michael@0: calcGroupNameSetsLengths(maxNameLength); michael@0: michael@0: return TRUE; michael@0: } michael@0: michael@0: /* public API --------------------------------------------------------------- */ michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_charName(UChar32 code, UCharNameChoice nameChoice, michael@0: char *buffer, int32_t bufferLength, michael@0: UErrorCode *pErrorCode) { michael@0: AlgorithmicRange *algRange; michael@0: uint32_t *p; michael@0: uint32_t i; michael@0: int32_t length; michael@0: michael@0: /* check the argument values */ michael@0: if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || michael@0: bufferLength<0 || (bufferLength>0 && buffer==NULL) michael@0: ) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) { michael@0: return u_terminateChars(buffer, bufferLength, 0, pErrorCode); michael@0: } michael@0: michael@0: length=0; michael@0: michael@0: /* try algorithmic names first */ michael@0: p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); michael@0: i=*p; michael@0: algRange=(AlgorithmicRange *)(p+1); michael@0: while(i>0) { michael@0: if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) { michael@0: length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); michael@0: break; michael@0: } michael@0: algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); michael@0: --i; michael@0: } michael@0: michael@0: if(i==0) { michael@0: if (nameChoice == U_EXTENDED_CHAR_NAME) { michael@0: length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength); michael@0: if (!length) { michael@0: /* extended character name */ michael@0: length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength); michael@0: } michael@0: } else { michael@0: /* normal character name */ michael@0: length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); michael@0: } michael@0: } michael@0: michael@0: return u_terminateChars(buffer, bufferLength, length, pErrorCode); michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_getISOComment(UChar32 /*c*/, michael@0: char *dest, int32_t destCapacity, michael@0: UErrorCode *pErrorCode) { michael@0: /* check the argument values */ michael@0: if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: return u_terminateChars(dest, destCapacity, 0, pErrorCode); michael@0: } michael@0: michael@0: U_CAPI UChar32 U_EXPORT2 michael@0: u_charFromName(UCharNameChoice nameChoice, michael@0: const char *name, michael@0: UErrorCode *pErrorCode) { michael@0: char upper[120], lower[120]; michael@0: FindName findName; michael@0: AlgorithmicRange *algRange; michael@0: uint32_t *p; michael@0: uint32_t i; michael@0: UChar32 cp = 0; michael@0: char c0; michael@0: UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */ michael@0: michael@0: if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { michael@0: return error; michael@0: } michael@0: michael@0: if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return error; michael@0: } michael@0: michael@0: if(!isDataLoaded(pErrorCode)) { michael@0: return error; michael@0: } michael@0: michael@0: /* construct the uppercase and lowercase of the name first */ michael@0: for(i=0; i') { michael@0: for (--i; lower[i] && lower[i] != '-'; --i) { michael@0: } michael@0: michael@0: if (lower[i] == '-') { /* We've got a category. */ michael@0: uint32_t cIdx; michael@0: michael@0: lower[i] = 0; michael@0: michael@0: for (++i; lower[i] != '>'; ++i) { michael@0: if (lower[i] >= '0' && lower[i] <= '9') { michael@0: cp = (cp << 4) + lower[i] - '0'; michael@0: } else if (lower[i] >= 'a' && lower[i] <= 'f') { michael@0: cp = (cp << 4) + lower[i] - 'a' + 10; michael@0: } else { michael@0: *pErrorCode = U_ILLEGAL_CHAR_FOUND; michael@0: return error; michael@0: } michael@0: } michael@0: michael@0: /* Now validate the category name. michael@0: We could use a binary search, or a trie, if michael@0: we really wanted to. */ michael@0: michael@0: for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) { michael@0: michael@0: if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) { michael@0: if (getCharCat(cp) == cIdx) { michael@0: return cp; michael@0: } michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: *pErrorCode = U_ILLEGAL_CHAR_FOUND; michael@0: return error; michael@0: } michael@0: michael@0: /* try algorithmic names now */ michael@0: p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); michael@0: i=*p; michael@0: algRange=(AlgorithmicRange *)(p+1); michael@0: while(i>0) { michael@0: if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) { michael@0: return cp; michael@0: } michael@0: algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); michael@0: --i; michael@0: } michael@0: michael@0: /* normal character name */ michael@0: findName.otherName=upper; michael@0: findName.code=error; michael@0: enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice); michael@0: if (findName.code == error) { michael@0: *pErrorCode = U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: return findName.code; michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: u_enumCharNames(UChar32 start, UChar32 limit, michael@0: UEnumCharNamesFn *fn, michael@0: void *context, michael@0: UCharNameChoice nameChoice, michael@0: UErrorCode *pErrorCode) { michael@0: AlgorithmicRange *algRange; michael@0: uint32_t *p; michael@0: uint32_t i; michael@0: michael@0: if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: michael@0: if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: michael@0: if((uint32_t) limit > UCHAR_MAX_VALUE + 1) { michael@0: limit = UCHAR_MAX_VALUE + 1; michael@0: } michael@0: if((uint32_t)start>=(uint32_t)limit) { michael@0: return; michael@0: } michael@0: michael@0: if(!isDataLoaded(pErrorCode)) { michael@0: return; michael@0: } michael@0: michael@0: /* interleave the data-driven ones with the algorithmic ones */ michael@0: /* iterate over all algorithmic ranges; assume that they are in ascending order */ michael@0: p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); michael@0: i=*p; michael@0: algRange=(AlgorithmicRange *)(p+1); michael@0: while(i>0) { michael@0: /* enumerate the character names before the current algorithmic range */ michael@0: /* here: startstart) { michael@0: if((uint32_t)limit<=algRange->start) { michael@0: enumNames(uCharNames, start, limit, fn, context, nameChoice); michael@0: return; michael@0: } michael@0: if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) { michael@0: return; michael@0: } michael@0: start=(UChar32)algRange->start; michael@0: } michael@0: /* enumerate the character names in the current algorithmic range */ michael@0: /* here: algRange->start<=startend) { michael@0: if((uint32_t)limit<=(algRange->end+1)) { michael@0: enumAlgNames(algRange, start, limit, fn, context, nameChoice); michael@0: return; michael@0: } michael@0: if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) { michael@0: return; michael@0: } michael@0: start=(UChar32)algRange->end+1; michael@0: } michael@0: /* continue to the next algorithmic range (here: startsize); michael@0: --i; michael@0: } michael@0: /* enumerate the character names after the last algorithmic range */ michael@0: enumNames(uCharNames, start, limit, fn, context, nameChoice); michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: uprv_getMaxCharNameLength() { michael@0: UErrorCode errorCode=U_ZERO_ERROR; michael@0: if(calcNameSetsLengths(&errorCode)) { michael@0: return gMaxNameLength; michael@0: } else { michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * Converts the char set cset into a Unicode set uset. michael@0: * @param cset Set of 256 bit flags corresponding to a set of chars. michael@0: * @param uset USet to receive characters. Existing contents are deleted. michael@0: */ michael@0: static void michael@0: charSetToUSet(uint32_t cset[8], const USetAdder *sa) { michael@0: UChar us[256]; michael@0: char cs[256]; michael@0: michael@0: int32_t i, length; michael@0: UErrorCode errorCode; michael@0: michael@0: errorCode=U_ZERO_ERROR; michael@0: michael@0: if(!calcNameSetsLengths(&errorCode)) { michael@0: return; michael@0: } michael@0: michael@0: /* build a char string with all chars that are used in character names */ michael@0: length=0; michael@0: for(i=0; i<256; ++i) { michael@0: if(SET_CONTAINS(cset, i)) { michael@0: cs[length++]=(char)i; michael@0: } michael@0: } michael@0: michael@0: /* convert the char string to a UChar string */ michael@0: u_charsToUChars(cs, us, length); michael@0: michael@0: /* add each UChar to the USet */ michael@0: for(i=0; iadd(sa->set, us[i]); michael@0: } michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * Fills set with characters that are used in Unicode character names. michael@0: * @param set USet to receive characters. michael@0: */ michael@0: U_CAPI void U_EXPORT2 michael@0: uprv_getCharNameCharacters(const USetAdder *sa) { michael@0: charSetToUSet(gNameSet, sa); michael@0: } michael@0: michael@0: /* data swapping ------------------------------------------------------------ */ michael@0: michael@0: /* michael@0: * The token table contains non-negative entries for token bytes, michael@0: * and -1 for bytes that represent themselves in the data file's charset. michael@0: * -2 entries are used for lead bytes. michael@0: * michael@0: * Direct bytes (-1 entries) must be translated from the input charset family michael@0: * to the output charset family. michael@0: * makeTokenMap() writes a permutation mapping for this. michael@0: * Use it once for single-/lead-byte tokens and once more for all trail byte michael@0: * tokens. (';' is an unused trail byte marked with -1.) michael@0: */ michael@0: static void michael@0: makeTokenMap(const UDataSwapper *ds, michael@0: int16_t tokens[], uint16_t tokenCount, michael@0: uint8_t map[256], michael@0: UErrorCode *pErrorCode) { michael@0: UBool usedOutChar[256]; michael@0: uint16_t i, j; michael@0: uint8_t c1, c2; michael@0: michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return; michael@0: } michael@0: michael@0: if(ds->inCharset==ds->outCharset) { michael@0: /* Same charset family: identity permutation */ michael@0: for(i=0; i<256; ++i) { michael@0: map[i]=(uint8_t)i; michael@0: } michael@0: } else { michael@0: uprv_memset(map, 0, 256); michael@0: uprv_memset(usedOutChar, 0, 256); michael@0: michael@0: if(tokenCount>256) { michael@0: tokenCount=256; michael@0: } michael@0: michael@0: /* set the direct bytes (byte 0 always maps to itself) */ michael@0: for(i=1; iswapInvChars(ds, &c1, 1, &c2, pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n", michael@0: i, ds->inCharset); michael@0: return; michael@0: } michael@0: michael@0: /* enter the converted character into the map and mark it used */ michael@0: map[c1]=c2; michael@0: usedOutChar[c2]=TRUE; michael@0: } michael@0: } michael@0: michael@0: /* set the mappings for the rest of the permutation */ michael@0: for(i=j=1; idataFormat[0]==0x75 && /* dataFormat="unam" */ michael@0: pInfo->dataFormat[1]==0x6e && michael@0: pInfo->dataFormat[2]==0x61 && michael@0: pInfo->dataFormat[3]==0x6d && michael@0: pInfo->formatVersion[0]==1 michael@0: )) { michael@0: udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n", michael@0: pInfo->dataFormat[0], pInfo->dataFormat[1], michael@0: pInfo->dataFormat[2], pInfo->dataFormat[3], michael@0: pInfo->formatVersion[0]); michael@0: *pErrorCode=U_UNSUPPORTED_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: inBytes=(const uint8_t *)inData+headerSize; michael@0: outBytes=(uint8_t *)outData+headerSize; michael@0: if(length<0) { michael@0: algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]); michael@0: } else { michael@0: length-=headerSize; michael@0: if( length<20 || michael@0: (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3])) michael@0: ) { michael@0: udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n", michael@0: length); michael@0: *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: if(length<0) { michael@0: /* preflighting: iterate through algorithmic ranges */ michael@0: offset=algNamesOffset; michael@0: count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); michael@0: offset+=4; michael@0: michael@0: for(i=0; ireadUInt16(inRange->size); michael@0: } michael@0: } else { michael@0: /* swap data */ michael@0: const uint16_t *p; michael@0: uint16_t *q, *temp; michael@0: michael@0: int16_t tokens[512]; michael@0: uint16_t tokenCount; michael@0: michael@0: uint8_t map[256], trailMap[256]; michael@0: michael@0: /* copy the data for inaccessible bytes */ michael@0: if(inBytes!=outBytes) { michael@0: uprv_memcpy(outBytes, inBytes, length); michael@0: } michael@0: michael@0: /* the initial 4 offsets first */ michael@0: tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]); michael@0: groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]); michael@0: groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]); michael@0: ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode); michael@0: michael@0: /* michael@0: * now the tokens table michael@0: * it needs to be permutated along with the compressed name strings michael@0: */ michael@0: p=(const uint16_t *)(inBytes+16); michael@0: q=(uint16_t *)(outBytes+16); michael@0: michael@0: /* read and swap the tokenCount */ michael@0: tokenCount=ds->readUInt16(*p); michael@0: ds->swapArray16(ds, p, 2, q, pErrorCode); michael@0: ++p; michael@0: ++q; michael@0: michael@0: /* read the first 512 tokens and make the token maps */ michael@0: if(tokenCount<=512) { michael@0: count=tokenCount; michael@0: } else { michael@0: count=512; michael@0: } michael@0: for(i=0; i256 ? tokenCount-256 : 0), trailMap, pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: michael@0: /* michael@0: * swap and permutate the tokens michael@0: * go through a temporary array to support in-place swapping michael@0: */ michael@0: temp=(uint16_t *)uprv_malloc(tokenCount*2); michael@0: if(temp==NULL) { michael@0: udata_printError(ds, "out of memory swapping %u unames.icu tokens\n", michael@0: tokenCount); michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: /* swap and permutate single-/lead-byte tokens */ michael@0: for(i=0; iswapArray16(ds, p+i, 2, temp+map[i], pErrorCode); michael@0: } michael@0: michael@0: /* swap and permutate trail-byte tokens */ michael@0: for(; iswapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode); michael@0: } michael@0: michael@0: /* copy the result into the output and free the temporary array */ michael@0: uprv_memcpy(q, temp, tokenCount*2); michael@0: uprv_free(temp); michael@0: michael@0: /* michael@0: * swap the token strings but not a possible padding byte after michael@0: * the terminating NUL of the last string michael@0: */ michael@0: udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset), michael@0: outBytes+tokenStringOffset, pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: udata_printError(ds, "uchar_swapNames(token strings) failed\n"); michael@0: return 0; michael@0: } michael@0: michael@0: /* swap the group table */ michael@0: count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset))); michael@0: ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2), michael@0: outBytes+groupsOffset, pErrorCode); michael@0: michael@0: /* michael@0: * swap the group strings michael@0: * swap the string bytes but not the nibble-encoded string lengths michael@0: */ michael@0: if(ds->inCharset!=ds->outCharset) { michael@0: uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1]; michael@0: michael@0: const uint8_t *inStrings, *nextInStrings; michael@0: uint8_t *outStrings; michael@0: michael@0: uint8_t c; michael@0: michael@0: inStrings=inBytes+groupStringOffset; michael@0: outStrings=outBytes+groupStringOffset; michael@0: michael@0: stringsCount=algNamesOffset-groupStringOffset; michael@0: michael@0: /* iterate through string groups until only a few padding bytes are left */ michael@0: while(stringsCount>32) { michael@0: nextInStrings=expandGroupLengths(inStrings, offsets, lengths); michael@0: michael@0: /* move past the length bytes */ michael@0: stringsCount-=(uint32_t)(nextInStrings-inStrings); michael@0: outStrings+=nextInStrings-inStrings; michael@0: inStrings=nextInStrings; michael@0: michael@0: count=offsets[31]+lengths[31]; /* total number of string bytes in this group */ michael@0: stringsCount-=count; michael@0: michael@0: /* swap the string bytes using map[] and trailMap[] */ michael@0: while(count>0) { michael@0: c=*inStrings++; michael@0: *outStrings++=map[c]; michael@0: if(tokens[c]!=-2) { michael@0: --count; michael@0: } else { michael@0: /* token lead byte: swap the trail byte, too */ michael@0: *outStrings++=trailMap[*inStrings++]; michael@0: count-=2; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* swap the algorithmic ranges */ michael@0: offset=algNamesOffset; michael@0: count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); michael@0: ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode); michael@0: offset+=4; michael@0: michael@0: for(i=0; i(uint32_t)length) { michael@0: udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n", michael@0: length, i); michael@0: *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: inRange=(const AlgorithmicRange *)(inBytes+offset); michael@0: outRange=(AlgorithmicRange *)(outBytes+offset); michael@0: offset+=ds->readUInt16(inRange->size); michael@0: michael@0: ds->swapArray32(ds, inRange, 8, outRange, pErrorCode); michael@0: ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode); michael@0: switch(inRange->type) { michael@0: case 0: michael@0: /* swap prefix string */ michael@0: ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)), michael@0: outRange+1, pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n", michael@0: i); michael@0: return 0; michael@0: } michael@0: break; michael@0: case 1: michael@0: { michael@0: /* swap factors and the prefix and factor strings */ michael@0: uint32_t factorsCount; michael@0: michael@0: factorsCount=inRange->variant; michael@0: p=(const uint16_t *)(inRange+1); michael@0: q=(uint16_t *)(outRange+1); michael@0: ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode); michael@0: michael@0: /* swap the strings, up to the last terminating NUL */ michael@0: p+=factorsCount; michael@0: q+=factorsCount; michael@0: stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p); michael@0: while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) { michael@0: --stringsCount; michael@0: } michael@0: ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode); michael@0: } michael@0: break; michael@0: default: michael@0: udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n", michael@0: inRange->type, i); michael@0: *pErrorCode=U_UNSUPPORTED_ERROR; michael@0: return 0; michael@0: } michael@0: } michael@0: } michael@0: michael@0: return headerSize+(int32_t)offset; michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: /* michael@0: * Hey, Emacs, please set the following: michael@0: * michael@0: * Local Variables: michael@0: * indent-tabs-mode: nil michael@0: * End: michael@0: * michael@0: */