michael@0: /*
michael@0: ******************************************************************************
michael@0: *
michael@0: *   Copyright (C) 1999-2013, International Business Machines
michael@0: *   Corporation and others.  All Rights Reserved.
michael@0: *
michael@0: ******************************************************************************
michael@0: *   file name:  unames.c
michael@0: *   encoding:   US-ASCII
michael@0: *   tab size:   8 (not used)
michael@0: *   indentation:4
michael@0: *
michael@0: *   created on: 1999oct04
michael@0: *   created by: Markus W. Scherer
michael@0: */
michael@0: 
michael@0: #include "unicode/utypes.h"
michael@0: #include "unicode/putil.h"
michael@0: #include "unicode/uchar.h"
michael@0: #include "unicode/udata.h"
michael@0: #include "unicode/utf.h"
michael@0: #include "unicode/utf16.h"
michael@0: #include "uassert.h"
michael@0: #include "ustr_imp.h"
michael@0: #include "umutex.h"
michael@0: #include "cmemory.h"
michael@0: #include "cstring.h"
michael@0: #include "ucln_cmn.h"
michael@0: #include "udataswp.h"
michael@0: #include "uprops.h"
michael@0: 
michael@0: U_NAMESPACE_BEGIN
michael@0: 
michael@0: /* prototypes ------------------------------------------------------------- */
michael@0: 
michael@0: /* Number of elements of a real array (not a pointer), cast to int32_t. */
michael@0: #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
michael@0: 
michael@0: /* Name and type of the data item that loadCharNames() passes to udata_openChoice(). */
michael@0: static const char DATA_NAME[] = "unames";
michael@0: static const char DATA_TYPE[] = "icu";
michael@0: 
michael@0: /*
michael@0:  * Names are stored in groups of LINES_PER_GROUP (32) lines:
michael@0:  * code>>GROUP_SHIFT is compared with a group's GROUP_MSB field, and
michael@0:  * code&GROUP_MASK is the line number within the group (see getName()).
michael@0:  * Note that LINES_PER_GROUP and GROUP_MASK have type long.
michael@0:  */
michael@0: #define GROUP_SHIFT 5
michael@0: #define LINES_PER_GROUP (1L<<GROUP_SHIFT)
michael@0: #define GROUP_MASK (LINES_PER_GROUP-1)
michael@0: 
michael@0: /*
michael@0:  * This struct was replaced by explicitly accessing equivalent
michael@0:  * fields from triples of uint16_t.
michael@0:  * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
michael@0:  * which broke the assumption that sizeof(Group)==6 and that the ++ operator
michael@0:  * would advance by 6 bytes (3 uint16_t).
michael@0:  *
michael@0:  * We can't just change the data structure because it's loaded from a data file,
michael@0:  * and we don't want to make it less compact, so we changed the access code.
michael@0:  *
michael@0:  * For details see ICU tickets 6331 and 6008.
michael@0: typedef struct {
michael@0:     uint16_t groupMSB,
michael@0:              offsetHigh, offsetLow; / * avoid padding * /
michael@0: } Group;
michael@0:  */
michael@0: /* Indexes of the uint16_t fields inside one group triple. */
michael@0: enum {
michael@0:     /* compared with code>>GROUP_SHIFT to identify the group */
michael@0:     GROUP_MSB,
michael@0:     /* upper 16 bits of the group offset, see GET_GROUP_OFFSET() */
michael@0:     GROUP_OFFSET_HIGH,
michael@0:     /* lower 16 bits of the group offset, see GET_GROUP_OFFSET() */
michael@0:     GROUP_OFFSET_LOW,
michael@0:     /* number of uint16_t per group; used to step from group to group */
michael@0:     GROUP_LENGTH
michael@0: };
michael@0: 
michael@0: /*
michael@0:  * Get the 32-bit group offset.
michael@0:  * The value is assembled from the GROUP_OFFSET_HIGH field (shifted left by 16)
michael@0:  * and the GROUP_OFFSET_LOW field of the triple.
michael@0:  * Callers add it to names->groupStringOffset to locate the group's data.
michael@0:  * @param group (const uint16_t *) pointer to a Group triple of uint16_t
michael@0:  * @return group offset (int32_t)
michael@0:  */
michael@0: #define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW])
michael@0: 
michael@0: /* Step a group pointer forward or backward by one triple (GROUP_LENGTH uint16_t). */
michael@0: #define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
michael@0: #define PREV_GROUP(group) ((group)-GROUP_LENGTH)
michael@0: 
michael@0: /*
michael@0:  * NOTE(review): not used in this part of the file; presumably one entry of the
michael@0:  * table found via UCharNames.algNamesOffset, describing a code point range
michael@0:  * [start, end] whose names are generated algorithmically - confirm against
michael@0:  * the code that reads it.
michael@0:  */
michael@0: typedef struct {
michael@0:     uint32_t start, end;
michael@0:     uint8_t type, variant;
michael@0:     uint16_t size;
michael@0: } AlgorithmicRange;
michael@0: 
michael@0: /*
michael@0:  * Header at the start of the loaded names data.
michael@0:  * tokenStringOffset, groupsOffset and groupStringOffset are added as byte
michael@0:  * offsets to the address of this struct (see expandName(), GET_GROUPS()
michael@0:  * and expandGroupName()).
michael@0:  * The token table (a uint16_t count followed by the tokens) is read starting
michael@0:  * 8 uint16_t after the start of this struct, i.e. right behind these fields.
michael@0:  * algNamesOffset is not used in this part of the file.
michael@0:  */
michael@0: typedef struct {
michael@0:     uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
michael@0: } UCharNames;
michael@0: 
michael@0: /*
michael@0:  * Get the groups table from a UCharNames struct.
michael@0:  * The groups table consists of one uint16_t groupCount followed by
michael@0:  * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
michael@0:  * and the comment for the old struct Group above.
michael@0:  *
michael@0:  * The table is located groupsOffset bytes after the start of *names.
michael@0:  * The argument is evaluated twice, so it must not have side effects.
michael@0:  *
michael@0:  * @param names (const UCharNames *) pointer to the UCharNames indexes
michael@0:  * @return (const uint16_t *) pointer to the groups table
michael@0:  */
michael@0: #define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
michael@0: 
michael@0: /*
michael@0:  * Context for a name lookup (used when the enumeration callback is DO_FIND_NAME):
michael@0:  * otherName is the name being searched for, and code receives the
michael@0:  * code point whose name matched (see enumGroupNames()).
michael@0:  */
michael@0: typedef struct {
michael@0:     const char *otherName;
michael@0:     UChar32 code;
michael@0: } FindName;
michael@0: 
michael@0: /*
michael@0:  * Passed instead of a real UEnumCharNamesFn to make the enumeration
michael@0:  * functions compare names against a FindName context rather than call back.
michael@0:  */
michael@0: #define DO_FIND_NAME NULL
michael@0: 
michael@0: /* Loaded data and its memory; set by loadCharNames(), reset by unames_cleanup(). */
michael@0: static UDataMemory *uCharNamesData=NULL;
michael@0: static UCharNames *uCharNames=NULL;
michael@0: /* Makes loadCharNames() run once, see isDataLoaded(). */
michael@0: static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER;
michael@0: 
michael@0: /*
michael@0:  * Maximum length of character names (regular & 1.0).
michael@0:  * Reset to 0 by unames_cleanup().
michael@0:  */
michael@0: static int32_t gMaxNameLength=0;
michael@0: 
michael@0: /*
michael@0:  * Set of chars used in character names (regular & 1.0).
michael@0:  * Chars are platform-dependent (can be EBCDIC).
michael@0:  * 8 uint32_t provide 256 bits.
michael@0:  */
michael@0: static uint32_t gNameSet[8]={ 0 };
michael@0: 
michael@0: /*
michael@0:  * Extra pseudo-categories appended after the regular general categories.
michael@0:  * getCharCat() returns them, and they index the last three entries of charCatNames[].
michael@0:  * The expansions are parenthesized so that they are safe inside larger expressions.
michael@0:  */
michael@0: #define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT
michael@0: #define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
michael@0: #define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)
michael@0: 
michael@0: /* Number of categories including the three pseudo-categories above. */
michael@0: #define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
michael@0: 
michael@0: /*
michael@0:  * Category names used by getExtName() for synthetic "<category-XXXX>" names.
michael@0:  * Indexed by the value that getCharCat() returns; the last three entries
michael@0:  * belong to U_NONCHARACTER_CODE_POINT, U_LEAD_SURROGATE and U_TRAIL_SURROGATE.
michael@0:  */
michael@0: static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
michael@0:     "unassigned",
michael@0:     "uppercase letter",
michael@0:     "lowercase letter",
michael@0:     "titlecase letter",
michael@0:     "modifier letter",
michael@0:     "other letter",
michael@0:     "non spacing mark",
michael@0:     "enclosing mark",
michael@0:     "combining spacing mark",
michael@0:     "decimal digit number",
michael@0:     "letter number",
michael@0:     "other number",
michael@0:     "space separator",
michael@0:     "line separator",
michael@0:     "paragraph separator",
michael@0:     "control",
michael@0:     "format",
michael@0:     "private use area",
michael@0:     "surrogate",
michael@0:     "dash punctuation",   
michael@0:     "start punctuation",
michael@0:     "end punctuation",
michael@0:     "connector punctuation",
michael@0:     "other punctuation",
michael@0:     "math symbol",
michael@0:     "currency symbol",
michael@0:     "modifier symbol",
michael@0:     "other symbol",
michael@0:     "initial punctuation",
michael@0:     "final punctuation",
michael@0:     /* pseudo-categories, see U_NONCHARACTER_CODE_POINT etc. */
michael@0:     "noncharacter",
michael@0:     "lead surrogate",
michael@0:     "trail surrogate"
michael@0: };
michael@0: 
michael@0: /* implementation ----------------------------------------------------------- */
michael@0: 
michael@0: /*
michael@0:  * Cleanup function registered by loadCharNames():
michael@0:  * closes the data, forgets the pointer into it and resets the file-level
michael@0:  * state so that the data can be loaded again.
michael@0:  * @return always TRUE
michael@0:  */
michael@0: static UBool U_CALLCONV unames_cleanup(void)
michael@0: {
michael@0:     if(uCharNamesData!=NULL) {
michael@0:         udata_close(uCharNamesData);
michael@0:         uCharNamesData=NULL;
michael@0:     }
michael@0:     /* uCharNames was obtained from uCharNamesData, so it is only forgotten here */
michael@0:     uCharNames=NULL;
michael@0:     gMaxNameLength=0;
michael@0:     gCharNamesInitOnce.reset();
michael@0:     return TRUE;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Acceptance callback handed to udata_openChoice() by loadCharNames().
michael@0:  * Only the data header in pInfo is examined; the other parameters are unused.
michael@0:  * @param pInfo header information of the candidate data item
michael@0:  * @return TRUE if the header describes "unam" data of format version 1
michael@0:  *         with this platform's endianness and charset family
michael@0:  */
michael@0: static UBool U_CALLCONV
michael@0: isAcceptable(void * /*context*/,
michael@0:              const char * /*type*/, const char * /*name*/,
michael@0:              const UDataInfo *pInfo) {
michael@0:     /* reject headers that report a size below 20 */
michael@0:     if(pInfo->size<20) {
michael@0:         return FALSE;
michael@0:     }
michael@0:     /* endianness and charset family must be those of this platform */
michael@0:     if(pInfo->isBigEndian!=U_IS_BIG_ENDIAN || pInfo->charsetFamily!=U_CHARSET_FAMILY) {
michael@0:         return FALSE;
michael@0:     }
michael@0:     /* dataFormat="unam" */
michael@0:     if( pInfo->dataFormat[0]!=0x75 ||
michael@0:         pInfo->dataFormat[1]!=0x6e ||
michael@0:         pInfo->dataFormat[2]!=0x61 ||
michael@0:         pInfo->dataFormat[3]!=0x6d
michael@0:     ) {
michael@0:         return FALSE;
michael@0:     }
michael@0:     /* only format version 1 is accepted */
michael@0:     return (UBool)(pInfo->formatVersion[0]==1);
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Initialization function run through gCharNamesInitOnce (see isDataLoaded()).
michael@0:  * Opens the names data and, on success, stores the data memory and the
michael@0:  * pointer to its contents in uCharNamesData/uCharNames.
michael@0:  * The cleanup function is registered regardless of the outcome.
michael@0:  * @param status in/out error code, passed on to udata_openChoice()
michael@0:  */
michael@0: static void U_CALLCONV
michael@0: loadCharNames(UErrorCode &status) {
michael@0:     U_ASSERT(uCharNamesData == NULL);
michael@0:     U_ASSERT(uCharNames == NULL);
michael@0: 
michael@0:     UDataMemory *data = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status);
michael@0:     if(U_SUCCESS(status)) {
michael@0:         uCharNamesData = data;
michael@0:         uCharNames = (UCharNames *)udata_getMemory(data);
michael@0:     } else {
michael@0:         uCharNamesData = NULL;
michael@0:     }
michael@0:     ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
michael@0: }
michael@0: 
michael@0: 
michael@0: /*
michael@0:  * Make sure that loadCharNames() has been run (once) and report the result.
michael@0:  * @param pErrorCode in/out error code, handed to umtx_initOnce()
michael@0:  * @return TRUE if *pErrorCode indicates success afterwards
michael@0:  */
michael@0: static UBool
michael@0: isDataLoaded(UErrorCode *pErrorCode) {
michael@0:     umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
michael@0:     if(U_FAILURE(*pErrorCode)) {
michael@0:         return FALSE;
michael@0:     }
michael@0:     return TRUE;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Append one char to an output buffer with preflighting:
michael@0:  * c is stored, buffer advanced and bufferLength decremented only while
michael@0:  * bufferLength>0, but bufferPos is always incremented, so bufferPos counts
michael@0:  * the full, untruncated output length.
michael@0:  * buffer, bufferLength and bufferPos must be modifiable lvalues.
michael@0:  * c is used unparenthesized in an assignment and is evaluated only if there is room.
michael@0:  */
michael@0: #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
michael@0:     if((bufferLength)>0) { \
michael@0:         *(buffer)++=c; \
michael@0:         --(bufferLength); \
michael@0:     } \
michael@0:     ++(bufferPos); \
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Internal name choice: expandName() and compareName() map it to
michael@0:  * field index 2 of the semicolon-separated name data.
michael@0:  */
michael@0: #define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
michael@0: 
michael@0: /*
michael@0:  * Important: expandName() and compareName() are almost the same -
michael@0:  * apply fixes to both.
michael@0:  *
michael@0:  * UnicodeData.txt uses ';' as a field separator, so no
michael@0:  * field can contain ';' as part of its contents.
michael@0:  * In unames.dat, it is marked as token[';']==-1 only if the
michael@0:  * semicolon is used in the data file - which is iff we
michael@0:  * have Unicode 1.0 names or ISO comments or aliases.
michael@0:  * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
michael@0:  * although we know that it will never be part of a name.
michael@0:  *
michael@0:  * expandName() expands one compressed name into buffer.
michael@0:  * Each byte of the compressed name is either a literal character
michael@0:  * (byte value >= tokenCount, or token value -1), the lead byte of a
michael@0:  * double-byte token (token value -2), or a token whose value is the
michael@0:  * offset of a NUL-terminated word in the token strings.
michael@0:  *
michael@0:  * @param names        the loaded names data
michael@0:  * @param name         the compressed bytes of one line (may contain
michael@0:  *                     several ';'-separated fields)
michael@0:  * @param nameLength   number of bytes available at name
michael@0:  * @param nameChoice   which field to expand; choices other than
michael@0:  *                     U_UNICODE_CHAR_NAME/U_EXTENDED_CHAR_NAME skip leading fields
michael@0:  * @param buffer       output buffer
michael@0:  * @param bufferLength capacity of buffer
michael@0:  * @return the length of the expanded name, not counting the terminating NUL;
michael@0:  *         this is the full length even if the output was truncated.
michael@0:  *         A NUL is written only if there is room left for it.
michael@0:  */
michael@0: static uint16_t
michael@0: expandName(UCharNames *names,
michael@0:            const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
michael@0:            char *buffer, uint16_t bufferLength) {
michael@0:     /* the token table starts 8 uint16_t into the data: the count, then the tokens */
michael@0:     uint16_t *tokens=(uint16_t *)names+8;
michael@0:     uint16_t token, tokenCount=*tokens++, bufferPos=0;
michael@0:     uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
michael@0:     uint8_t c;
michael@0: 
michael@0:     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
michael@0:         /*
michael@0:          * skip the modern name if it is not requested _and_
michael@0:          * if the semicolon byte value is a character, not a token number
michael@0:          */
michael@0:         if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
michael@0:             /* number of leading ';'-terminated fields to skip */
michael@0:             int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
michael@0:             do {
michael@0:                 /* skip up to and including the next semicolon */
michael@0:                 while(nameLength>0) {
michael@0:                     --nameLength;
michael@0:                     if(*name++==';') {
michael@0:                         break;
michael@0:                     }
michael@0:                 }
michael@0:             } while(--fieldIndex>0);
michael@0:         } else {
michael@0:             /*
michael@0:              * the semicolon byte value is a token number, therefore
michael@0:              * only modern names are stored in unames.dat and there is no
michael@0:              * such requested alternate name here
michael@0:              */
michael@0:             nameLength=0;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     /* write each letter directly, and write a token word per token */
michael@0:     while(nameLength>0) {
michael@0:         --nameLength;
michael@0:         c=*name++;
michael@0: 
michael@0:         if(c>=tokenCount) {
michael@0:             if(c!=';') {
michael@0:                 /* implicit letter */
michael@0:                 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
michael@0:             } else {
michael@0:                 /* finished */
michael@0:                 break;
michael@0:             }
michael@0:         } else {
michael@0:             token=tokens[c];
michael@0:             if(token==(uint16_t)(-2)) {
michael@0:                 /* this is a lead byte for a double-byte token */
michael@0:                 /* the trail byte is consumed without checking nameLength first */
michael@0:                 token=tokens[c<<8|*name++];
michael@0:                 --nameLength;
michael@0:             }
michael@0:             if(token==(uint16_t)(-1)) {
michael@0:                 if(c!=';') {
michael@0:                     /* explicit letter */
michael@0:                     WRITE_CHAR(buffer, bufferLength, bufferPos, c);
michael@0:                 } else {
michael@0:                     /* stop, but skip the semicolon if we are seeking
michael@0:                        extended names and there was no 2.0 name but there
michael@0:                        is a 1.0 name. */
michael@0:                     if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
michael@0:                         if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
michael@0:                             continue;
michael@0:                         }
michael@0:                     }
michael@0:                     /* finished */
michael@0:                     break;
michael@0:                 }
michael@0:             } else {
michael@0:                 /* write token word */
michael@0:                 uint8_t *tokenString=tokenStrings+token;
michael@0:                 while((c=*tokenString++)!=0) {
michael@0:                     WRITE_CHAR(buffer, bufferLength, bufferPos, c);
michael@0:                 }
michael@0:             }
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     /* zero-terminate */
michael@0:     if(bufferLength>0) {
michael@0:         *buffer=0;
michael@0:     }
michael@0: 
michael@0:     return bufferPos;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * compareName() is almost the same as expandName() except that it compares
michael@0:  * the currently expanded name to an input name.
michael@0:  * It returns the match/no match result as soon as possible.
michael@0:  *
michael@0:  * @param names      the loaded names data
michael@0:  * @param name       the compressed bytes of one line (may contain
michael@0:  *                   several ';'-separated fields)
michael@0:  * @param nameLength number of bytes available at name
michael@0:  * @param nameChoice which field to compare; choices other than
michael@0:  *                   U_UNICODE_CHAR_NAME/U_EXTENDED_CHAR_NAME skip leading fields
michael@0:  * @param otherName  NUL-terminated name to compare with
michael@0:  * @return TRUE if the expanded name is equal to otherName
michael@0:  *         (so an empty expansion matches only an empty otherName)
michael@0:  */
michael@0: static UBool
michael@0: compareName(UCharNames *names,
michael@0:             const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
michael@0:             const char *otherName) {
michael@0:     /* the token table starts 8 uint16_t into the data: the count, then the tokens */
michael@0:     uint16_t *tokens=(uint16_t *)names+8;
michael@0:     uint16_t token, tokenCount=*tokens++;
michael@0:     uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
michael@0:     uint8_t c;
michael@0:     /* remembered to detect whether anything has been matched yet */
michael@0:     const char *origOtherName = otherName;
michael@0: 
michael@0:     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
michael@0:         /*
michael@0:          * skip the modern name if it is not requested _and_
michael@0:          * if the semicolon byte value is a character, not a token number
michael@0:          */
michael@0:         if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
michael@0:             /* number of leading ';'-terminated fields to skip */
michael@0:             int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
michael@0:             do {
michael@0:                 /* skip up to and including the next semicolon */
michael@0:                 while(nameLength>0) {
michael@0:                     --nameLength;
michael@0:                     if(*name++==';') {
michael@0:                         break;
michael@0:                     }
michael@0:                 }
michael@0:             } while(--fieldIndex>0);
michael@0:         } else {
michael@0:             /*
michael@0:              * the semicolon byte value is a token number, therefore
michael@0:              * only modern names are stored in unames.dat and there is no
michael@0:              * such requested alternate name here
michael@0:              */
michael@0:             nameLength=0;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     /* compare each letter directly, and compare a token word per token */
michael@0:     while(nameLength>0) {
michael@0:         --nameLength;
michael@0:         c=*name++;
michael@0: 
michael@0:         if(c>=tokenCount) {
michael@0:             if(c!=';') {
michael@0:                 /* implicit letter */
michael@0:                 if((char)c!=*otherName++) {
michael@0:                     return FALSE;
michael@0:                 }
michael@0:             } else {
michael@0:                 /* finished */
michael@0:                 break;
michael@0:             }
michael@0:         } else {
michael@0:             token=tokens[c];
michael@0:             if(token==(uint16_t)(-2)) {
michael@0:                 /* this is a lead byte for a double-byte token */
michael@0:                 /* the trail byte is consumed without checking nameLength first */
michael@0:                 token=tokens[c<<8|*name++];
michael@0:                 --nameLength;
michael@0:             }
michael@0:             if(token==(uint16_t)(-1)) {
michael@0:                 if(c!=';') {
michael@0:                     /* explicit letter */
michael@0:                     if((char)c!=*otherName++) {
michael@0:                         return FALSE;
michael@0:                     }
michael@0:                 } else {
michael@0:                     /* stop, but skip the semicolon if we are seeking
michael@0:                        extended names and there was no 2.0 name but there
michael@0:                        is a 1.0 name. */
michael@0:                     if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
michael@0:                         if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
michael@0:                             continue;
michael@0:                         }
michael@0:                     }
michael@0:                     /* finished */
michael@0:                     break;
michael@0:                 }
michael@0:             } else {
michael@0:                 /* compare token word; a mismatch includes otherName ending early */
michael@0:                 uint8_t *tokenString=tokenStrings+token;
michael@0:                 while((c=*tokenString++)!=0) {
michael@0:                     if((char)c!=*otherName++) {
michael@0:                         return FALSE;
michael@0:                     }
michael@0:                 }
michael@0:             }
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     /* complete match? */
michael@0:     return (UBool)(*otherName==0);
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Get the extended category of a code point: the u_charType() value,
michael@0:  * except that noncharacters and lead/trail surrogates get their own
michael@0:  * pseudo-categories. The noncharacter check takes precedence.
michael@0:  * @param cp the code point
michael@0:  * @return index suitable for charCatNames[]
michael@0:  */
michael@0: static uint8_t getCharCat(UChar32 cp) {
michael@0:     if (U_IS_UNICODE_NONCHAR(cp)) {
michael@0:         return U_NONCHARACTER_CODE_POINT;
michael@0:     }
michael@0: 
michael@0:     uint8_t category = u_charType(cp);
michael@0:     if (category != U_SURROGATE) {
michael@0:         return category;
michael@0:     }
michael@0: 
michael@0:     /* distinguish the two kinds of surrogates */
michael@0:     return (uint8_t)(U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE);
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Get the display name of the extended category of a code point.
michael@0:  * @param cp the code point
michael@0:  * @return the charCatNames[] entry for getCharCat(cp), or "unknown"
michael@0:  *         if the table of names above is not up to date
michael@0:  */
michael@0: static const char *getCharCatName(UChar32 cp) {
michael@0:     uint8_t index = getCharCat(cp);
michael@0: 
michael@0:     return index < LENGTHOF(charCatNames) ? charCatNames[index] : "unknown";
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Write the synthetic extended name "<category-XXXX>" for a code point,
michael@0:  * where category comes from getCharCatName() and XXXX is the code point
michael@0:  * in uppercase hexadecimal with at least 4 digits.
michael@0:  *
michael@0:  * All output goes through WRITE_CHAR, so nothing is ever stored beyond
michael@0:  * bufferLength chars; when the buffer is too small, the output is the
michael@0:  * leading part of the name. The result is not NUL-terminated here -
michael@0:  * callers terminate it themselves.
michael@0:  *
michael@0:  * @param code         the code point
michael@0:  * @param buffer       output buffer
michael@0:  * @param bufferLength capacity of buffer
michael@0:  * @return the full length of the name, even if the output was truncated
michael@0:  */
michael@0: static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
michael@0:     const char *catname = getCharCatName(code);
michael@0:     uint16_t length = 0;
michael@0: 
michael@0:     uint32_t rest;
michael@0:     int ndigits;
michael@0: 
michael@0:     WRITE_CHAR(buffer, bufferLength, length, '<');
michael@0:     for (; *catname != 0; ++catname) {
michael@0:         WRITE_CHAR(buffer, bufferLength, length, *catname);
michael@0:     }
michael@0:     WRITE_CHAR(buffer, bufferLength, length, '-');
michael@0: 
michael@0:     /* count the hex digits of code (unsigned, so that the loop always ends) */
michael@0:     for (rest = code, ndigits = 0; rest != 0; ++ndigits, rest >>= 4) {
michael@0:     }
michael@0:     if (ndigits < 4) {
michael@0:         ndigits = 4;
michael@0:     }
michael@0: 
michael@0:     /*
michael@0:      * Write the digits from the most significant one down.
michael@0:      * ndigits is at most 8, so the shift distance is at most 28.
michael@0:      */
michael@0:     while (ndigits > 0) {
michael@0:         uint8_t v = (uint8_t)((code >> (4 * --ndigits)) & 0xf);
michael@0:         char digit = (char)(v < 10 ? '0' + v : 'A' + v - 10);
michael@0:         WRITE_CHAR(buffer, bufferLength, length, digit);
michael@0:     }
michael@0:     WRITE_CHAR(buffer, bufferLength, length, '>');
michael@0: 
michael@0:     return length;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * getGroup() performs a binary search over the groups table for the group
michael@0:  * whose GROUP_MSB covers the code point "code".
michael@0:  * The result always points at a group: either the one that may contain
michael@0:  * "code", or the highest group before it, or - if even the lowest group
michael@0:  * is after "code" - that lowest group.
michael@0:  * Callers compare the result's GROUP_MSB with code>>GROUP_SHIFT to
michael@0:  * find out whether it is an exact match.
michael@0:  */
michael@0: static const uint16_t *
michael@0: getGroup(UCharNames *names, uint32_t code) {
michael@0:     const uint16_t *table=GET_GROUPS(names);
michael@0:     /* the table starts with the group count, followed by the triples */
michael@0:     const uint16_t groupCount=*table;
michael@0:     const uint16_t *groups=table+1;
michael@0:     const uint16_t wantedMSB=(uint16_t)(code>>GROUP_SHIFT);
michael@0:     uint16_t low=0, high=groupCount;
michael@0: 
michael@0:     /* narrow [low, high) down to a single group */
michael@0:     while(low<high-1) {
michael@0:         const uint16_t middle=(uint16_t)((low+high)/2);
michael@0:         if(groups[middle*GROUP_LENGTH+GROUP_MSB]>wantedMSB) {
michael@0:             high=middle;
michael@0:         } else {
michael@0:             low=middle;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     /* no check for an exact match here */
michael@0:     return groups+low*GROUP_LENGTH;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * expandGroupLengths() decodes the compressed lengths of the 32 strings of
michael@0:  * one group and fills in the offset and length of each string.
michael@0:  * The lengths are stored as consecutive nibbles, most significant nibble
michael@0:  * of each byte first:
michael@0:  * A nibble<0xc is the length itself (0=empty string).
michael@0:  * A nibble>=0xc is combined with the following nibble into
michael@0:  * ((nibble&3)<<4|nextNibble)+12.
michael@0:  * The offsets and lengths arrays must have room for at least 33 entries
michael@0:  * (one more than the number of strings) because the low nibble of the
michael@0:  * last byte is stored without checking whether it is still needed.
michael@0:  *
michael@0:  * @return pointer behind the length bytes, which is the first group string
michael@0:  */
michael@0: static const uint8_t *
michael@0: expandGroupLengths(const uint8_t *s,
michael@0:                    uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
michael@0:     uint16_t count=0, nextOffset=0;
michael@0:     /* last decoded nibble/length; a value>=12 is a pending lead nibble */
michael@0:     uint16_t pending=0;
michael@0: 
michael@0:     /* every length has to be decoded before the first string can be located */
michael@0:     while(count<LINES_PER_GROUP) {
michael@0:         const uint8_t lengthByte=*s++;
michael@0:         const uint8_t highNibble=(uint8_t)(lengthByte>>4);
michael@0:         const uint8_t lowNibble=(uint8_t)(lengthByte&0xf);
michael@0:         UBool lowNibbleFree;
michael@0: 
michael@0:         /* high nibble */
michael@0:         if(pending>=12) {
michael@0:             /* completes a double-nibble length started in the previous byte */
michael@0:             pending=(uint16_t)(((pending&0x3)<<4|highNibble)+12);
michael@0:             lowNibbleFree=TRUE;
michael@0:         } else if(highNibble>=0xc) {
michael@0:             /* double-nibble length contained in this one byte */
michael@0:             pending=(uint16_t)((lengthByte&0x3f)+12);
michael@0:             lowNibbleFree=FALSE;
michael@0:         } else {
michael@0:             /* single-nibble length */
michael@0:             pending=highNibble;
michael@0:             lowNibbleFree=TRUE;
michael@0:         }
michael@0: 
michael@0:         offsets[count]=nextOffset;
michael@0:         lengths[count]=pending;
michael@0:         nextOffset=(uint16_t)(nextOffset+pending);
michael@0:         ++count;
michael@0: 
michael@0:         /* low nibble */
michael@0:         if(!lowNibbleFree) {
michael@0:             /* already used up above; nothing is pending for the next byte */
michael@0:             pending=0;
michael@0:             continue;
michael@0:         }
michael@0:         pending=lowNibble;
michael@0:         if(lowNibble<12) {
michael@0:             /* single-nibble length; otherwise it stays pending as a lead nibble */
michael@0:             offsets[count]=nextOffset;
michael@0:             lengths[count]=lowNibble;
michael@0:             nextOffset=(uint16_t)(nextOffset+lowNibble);
michael@0:             ++count;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     /* s has arrived at the first group string */
michael@0:     return s;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Expand the name on line lineNumber of the given group into buffer.
michael@0:  * The group's length table is decoded first to find the line's compressed
michael@0:  * bytes, which are then handed to expandName().
michael@0:  * @return the result of expandName()
michael@0:  */
michael@0: static uint16_t
michael@0: expandGroupName(UCharNames *names, const uint16_t *group,
michael@0:                 uint16_t lineNumber, UCharNameChoice nameChoice,
michael@0:                 char *buffer, uint16_t bufferLength) {
michael@0:     uint16_t lineOffsets[LINES_PER_GROUP+2], lineLengths[LINES_PER_GROUP+2];
michael@0:     const uint8_t *groupData=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
michael@0:     const uint8_t *strings=expandGroupLengths(groupData, lineOffsets, lineLengths);
michael@0:     const uint8_t *line=strings+lineOffsets[lineNumber];
michael@0: 
michael@0:     return expandName(names, line, lineLengths[lineNumber], nameChoice,
michael@0:                       buffer, bufferLength);
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Get the stored name of a code point.
michael@0:  * If there is no group for the code point, then the output is an empty
michael@0:  * string (NUL-terminated if there is room) and the result is 0.
michael@0:  * @return the result of expandGroupName(), or 0 if there is no group
michael@0:  */
michael@0: static uint16_t
michael@0: getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
michael@0:         char *buffer, uint16_t bufferLength) {
michael@0:     const uint16_t *group=getGroup(names, code);
michael@0: 
michael@0:     if(group[GROUP_MSB]!=(uint16_t)(code>>GROUP_SHIFT)) {
michael@0:         /* getGroup() returned a neighboring group: no name, empty output */
michael@0:         if(bufferLength>0) {
michael@0:             *buffer=0;
michael@0:         }
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
michael@0:                            buffer, bufferLength);
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * enumGroupNames() walks the code points start..end (inclusive) of one
michael@0:  * group of 32 names.
michael@0:  * With a real callback, each non-empty name is expanded and passed to fn;
michael@0:  * for U_EXTENDED_CHAR_NAME an empty name is replaced by the synthetic one
michael@0:  * from getExtName().
michael@0:  * With fn==DO_FIND_NAME, context is a FindName and each name is compared
michael@0:  * with its otherName; on a match, the code point is stored in its code field.
michael@0:  *
michael@0:  * @return FALSE if the callback returned FALSE or the name was found,
michael@0:  *         TRUE if the whole range was processed
michael@0:  */
michael@0: static UBool
michael@0: enumGroupNames(UCharNames *names, const uint16_t *group,
michael@0:                UChar32 start, UChar32 end,
michael@0:                UEnumCharNamesFn *fn, void *context,
michael@0:                UCharNameChoice nameChoice) {
michael@0:     uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
michael@0:     const uint8_t *strings=expandGroupLengths(
michael@0:         (uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group),
michael@0:         offsets, lengths);
michael@0: 
michael@0:     if(fn==DO_FIND_NAME) {
michael@0:         /* search mode */
michael@0:         FindName *findName=(FindName *)context;
michael@0:         const char *otherName=findName->otherName;
michael@0: 
michael@0:         for(; start<=end; ++start) {
michael@0:             const int32_t line=(int32_t)(start&GROUP_MASK);
michael@0:             if(compareName(names, strings+offsets[line], lengths[line], nameChoice, otherName)) {
michael@0:                 findName->code=start;
michael@0:                 return FALSE;
michael@0:             }
michael@0:         }
michael@0:         return TRUE;
michael@0:     }
michael@0: 
michael@0:     /* enumeration mode; the buffer is assumed to be large enough */
michael@0:     char buffer[200];
michael@0: 
michael@0:     for(; start<=end; ++start) {
michael@0:         const int32_t line=(int32_t)(start&GROUP_MASK);
michael@0:         uint16_t length=expandName(names, strings+offsets[line], lengths[line], nameChoice, buffer, sizeof(buffer));
michael@0: 
michael@0:         if(length==0 && nameChoice==U_EXTENDED_CHAR_NAME) {
michael@0:             length=getExtName(start, buffer, sizeof(buffer));
michael@0:             buffer[length]=0;
michael@0:         }
michael@0:         if(length>0 && !fn(context, start, nameChoice, buffer, length)) {
michael@0:             return FALSE;
michael@0:         }
michael@0:     }
michael@0:     return TRUE;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * enumExtNames() enumerates the synthetic extended names of the code
michael@0:  * points start..end (inclusive).
michael@0:  * This is only done for a real callback function and not for the dummy
michael@0:  * DO_FIND_NAME, because u_charFromName() checks for extended names
michael@0:  * by itself.
michael@0:  *
michael@0:  * @return FALSE if the callback returned FALSE, otherwise TRUE
michael@0:  */
michael@0: static UBool
michael@0: enumExtNames(UChar32 start, UChar32 end,
michael@0:              UEnumCharNamesFn *fn, void *context)
michael@0: {
michael@0:     if(fn==DO_FIND_NAME) {
michael@0:         return TRUE;
michael@0:     }
michael@0: 
michael@0:     /* the buffer is assumed to be large enough */
michael@0:     char buffer[200];
michael@0: 
michael@0:     for(UChar32 c=start; c<=end; ++c) {
michael@0:         uint16_t length=getExtName(c, buffer, sizeof(buffer));
michael@0:         buffer[length]=0;
michael@0:         if(length>0 && !fn(context, c, U_EXTENDED_CHAR_NAME, buffer, length)) {
michael@0:             return FALSE;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     return TRUE;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * enumNames() enumerates the names of the code points start..limit-1.
michael@0:  * Stored names are handled group by group via enumGroupNames(), which either
michael@0:  * calls fn or - for fn==DO_FIND_NAME - compares against the FindName context.
michael@0:  * For U_EXTENDED_CHAR_NAME, the code points that lie between groups, before
michael@0:  * the first group or after the last one are enumerated with synthetic
michael@0:  * names via enumExtNames().
michael@0:  *
michael@0:  * @param names      the loaded names data
michael@0:  * @param start      first code point
michael@0:  * @param limit      one past the last code point
michael@0:  * @param fn         callback function, or DO_FIND_NAME
michael@0:  * @param context    passed to fn, or a FindName * for DO_FIND_NAME
michael@0:  * @param nameChoice which names to enumerate
michael@0:  * @return FALSE as soon as enumGroupNames() or enumExtNames() returns FALSE
michael@0:  *         (the callback stopped the enumeration, or the name was found),
michael@0:  *         otherwise TRUE
michael@0:  */
michael@0: static UBool
michael@0: enumNames(UCharNames *names,
michael@0:           UChar32 start, UChar32 limit,
michael@0:           UEnumCharNamesFn *fn, void *context,
michael@0:           UCharNameChoice nameChoice) {
michael@0:     uint16_t startGroupMSB, endGroupMSB, groupCount;
michael@0:     const uint16_t *group, *groupLimit;
michael@0: 
michael@0:     /* group numbers of the first and the last code point of the range */
michael@0:     startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
michael@0:     endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
michael@0: 
michael@0:     /* find the group that contains start, or the highest before it */
michael@0:     group=getGroup(names, start);
michael@0: 
michael@0:     /* getGroup() returns the lowest group if all groups are after start */
michael@0:     if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
michael@0:         /* enumerate synthetic names between start and the group start */
michael@0:         UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
michael@0:         if(extLimit>limit) {
michael@0:             extLimit=limit;
michael@0:         }
michael@0:         if(!enumExtNames(start, extLimit-1, fn, context)) {
michael@0:             return FALSE;
michael@0:         }
michael@0:         start=extLimit;
michael@0:     }
michael@0: 
michael@0:     if(startGroupMSB==endGroupMSB) {
michael@0:         if(startGroupMSB==group[GROUP_MSB]) {
michael@0:             /* if start and limit-1 are in the same group, then enumerate only in that one */
michael@0:             return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
michael@0:         }
michael@0:         /* otherwise fall through to the extended-names handling at the end */
michael@0:     } else {
michael@0:         const uint16_t *groups=GET_GROUPS(names);
michael@0:         groupCount=*groups++;
michael@0:         /* points behind the last group triple */
michael@0:         groupLimit=groups+groupCount*GROUP_LENGTH;
michael@0: 
michael@0:         if(startGroupMSB==group[GROUP_MSB]) {
michael@0:             /* enumerate characters in the partial start group */
michael@0:             /* (a start at the beginning of a group is handled by the loop below) */
michael@0:             if((start&GROUP_MASK)!=0) {
michael@0:                 if(!enumGroupNames(names, group,
michael@0:                                    start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
michael@0:                                    fn, context, nameChoice)) {
michael@0:                     return FALSE;
michael@0:                 }
michael@0:                 group=NEXT_GROUP(group); /* continue with the next group */
michael@0:             }
michael@0:         } else if(startGroupMSB>group[GROUP_MSB]) {
michael@0:             /* make sure that we start enumerating with the first group after start */
michael@0:             const uint16_t *nextGroup=NEXT_GROUP(group);
michael@0:             if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
michael@0:                 /* synthetic names from start up to the next group (or limit) */
michael@0:                 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
michael@0:                 if (end > limit) {
michael@0:                     end = limit;
michael@0:                 }
michael@0:                 if (!enumExtNames(start, end - 1, fn, context)) {
michael@0:                     return FALSE;
michael@0:                 }
michael@0:             }
michael@0:             group=nextGroup;
michael@0:         }
michael@0: 
michael@0:         /* enumerate entire groups between the start- and end-groups */
michael@0:         while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
michael@0:             const uint16_t *nextGroup;
michael@0:             start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
michael@0:             if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
michael@0:                 return FALSE;
michael@0:             }
michael@0:             nextGroup=NEXT_GROUP(group);
michael@0:             if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
michael@0:                 /* synthetic names for the gap between this group and the next one (or limit) */
michael@0:                 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
michael@0:                 if (end > limit) {
michael@0:                     end = limit;
michael@0:                 }
michael@0:                 if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
michael@0:                     return FALSE;
michael@0:                 }
michael@0:             }
michael@0:             group=nextGroup;
michael@0:         }
michael@0: 
michael@0:         /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
michael@0:         if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
michael@0:             return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
michael@0:         } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
michael@0:             /* all groups are used up: continue behind the last group with synthetic names below */
michael@0:             UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
michael@0:             if (next > start) {
michael@0:                 start = next;
michael@0:             }
michael@0:         } else {
michael@0:             return TRUE;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     /* we have not found a group, which means everything is made of
michael@0:        extended names. */
michael@0:     if (nameChoice == U_EXTENDED_CHAR_NAME) {
michael@0:         /* do not go beyond the last code point */
michael@0:         if (limit > UCHAR_MAX_VALUE + 1) {
michael@0:             limit = UCHAR_MAX_VALUE + 1;
michael@0:         }
michael@0:         return enumExtNames(start, limit - 1, fn, context);
michael@0:     }
michael@0:     
michael@0:     return TRUE;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Writes the factorized suffix of an algorithmic name.
michael@0:  *
michael@0:  * code is split into one index per factor with modulo arithmetic; the last
michael@0:  * factor is the least significant "digit". s points at the suffix strings:
michael@0:  * for each factor in turn there are factors[i] consecutive NUL-terminated
michael@0:  * strings, and the one selected by indexes[i] is appended to the output.
michael@0:  *
michael@0:  * Outputs:
michael@0:  *   indexes[]       the index computed for each factor
michael@0:  *   elementBases[]  (optional, may be NULL) first string of each factor
michael@0:  *   elements[]      (optional, may be NULL) selected string of each factor
michael@0:  *
michael@0:  * Characters are emitted through WRITE_CHAR, which is defined elsewhere in
michael@0:  * this file; it is expected to advance buffer, shrink bufferLength and
michael@0:  * count every character in its third argument - confirm against the macro.
michael@0:  * The return value is that count.
michael@0:  */
michael@0: static uint16_t
michael@0: writeFactorSuffix(const uint16_t *factors, uint16_t count,
michael@0:                   const char *s, /* suffix elements */
michael@0:                   uint32_t code,
michael@0:                   uint16_t indexes[8], /* output fields from here */
michael@0:                   const char *elementBases[8], const char *elements[8],
michael@0:                   char *buffer, uint16_t bufferLength) {
michael@0:     uint16_t written=0;
michael@0:     uint16_t last, pos, skip;
michael@0:     char ch;
michael@0: 
michael@0:     /* index of the last factor; all loops below work with it */
michael@0:     last=(uint16_t)(count-1);
michael@0: 
michael@0:     /* peel off one index per factor, least significant factor first */
michael@0:     pos=last;
michael@0:     while(pos>0) {
michael@0:         uint16_t radix=factors[pos];
michael@0:         indexes[pos]=(uint16_t)(code%radix);
michael@0:         code/=radix;
michael@0:         --pos;
michael@0:     }
michael@0:     /*
michael@0:      * no modulus for the first factor: the original note here says that
michael@0:      * start<=code<=end guarantees that the remainder fits factors[0]
michael@0:      */
michael@0:     indexes[0]=(uint16_t)code;
michael@0: 
michael@0:     /* walk through the string lists, one factor at a time */
michael@0:     for(pos=0;; ++pos) {
michael@0:         if(elementBases!=NULL) {
michael@0:             *elementBases++=s;
michael@0:         }
michael@0: 
michael@0:         /* step over the strings in front of the selected one */
michael@0:         for(skip=indexes[pos]; skip>0; --skip) {
michael@0:             while(*s++!=0) {}
michael@0:         }
michael@0:         if(elements!=NULL) {
michael@0:             *elements++=s;
michael@0:         }
michael@0: 
michael@0:         /* emit the selected string */
michael@0:         while((ch=*s++)!=0) {
michael@0:             WRITE_CHAR(buffer, bufferLength, written, ch);
michael@0:         }
michael@0: 
michael@0:         /* nothing follows the last factor, so its remaining strings are not skipped */
michael@0:         if(pos>=last) {
michael@0:             break;
michael@0:         }
michael@0: 
michael@0:         /* step over the strings behind the selected one to reach the next factor */
michael@0:         for(skip=(uint16_t)(factors[pos]-indexes[pos]-1); skip>0; --skip) {
michael@0:             while(*s++!=0) {}
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     /* zero-terminate if there is still room */
michael@0:     if(bufferLength>0) {
michael@0:         *buffer=0;
michael@0:     }
michael@0: 
michael@0:     return written;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Important:
michael@0:  * Parts of findAlgName() are almost the same as some of getAlgName().
michael@0:  * Fixes must be applied to both.
michael@0:  */
michael@0: static uint16_t
michael@0: getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
michael@0:         char *buffer, uint16_t bufferLength) {
michael@0:     uint16_t bufferPos=0;
michael@0: 
michael@0:     /* Only the normative character name can be algorithmic. */
michael@0:     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
michael@0:         /* zero-terminate */
michael@0:         if(bufferLength>0) {
michael@0:             *buffer=0;
michael@0:         }
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     switch(range->type) {
michael@0:     case 0: {
michael@0:         /* name = prefix hex-digits */
michael@0:         const char *s=(const char *)(range+1);
michael@0:         char c;
michael@0: 
michael@0:         uint16_t i, count;
michael@0: 
michael@0:         /* copy prefix */
michael@0:         while((c=*s++)!=0) {
michael@0:             WRITE_CHAR(buffer, bufferLength, bufferPos, c);
michael@0:         }
michael@0: 
michael@0:         /* write hexadecimal code point value */
michael@0:         count=range->variant;
michael@0: 
michael@0:         /* zero-terminate */
michael@0:         if(count<bufferLength) {
michael@0:             buffer[count]=0;
michael@0:         }
michael@0: 
michael@0:         for(i=count; i>0;) {
michael@0:             if(--i<bufferLength) {
michael@0:                 c=(char)(code&0xf);
michael@0:                 if(c<10) {
michael@0:                     c+='0';
michael@0:                 } else {
michael@0:                     c+='A'-10;
michael@0:                 }
michael@0:                 buffer[i]=c;
michael@0:             }
michael@0:             code>>=4;
michael@0:         }
michael@0: 
michael@0:         bufferPos+=count;
michael@0:         break;
michael@0:     }
michael@0:     case 1: {
michael@0:         /* name = prefix factorized-elements */
michael@0:         uint16_t indexes[8];
michael@0:         const uint16_t *factors=(const uint16_t *)(range+1);
michael@0:         uint16_t count=range->variant;
michael@0:         const char *s=(const char *)(factors+count);
michael@0:         char c;
michael@0: 
michael@0:         /* copy prefix */
michael@0:         while((c=*s++)!=0) {
michael@0:             WRITE_CHAR(buffer, bufferLength, bufferPos, c);
michael@0:         }
michael@0: 
michael@0:         bufferPos+=writeFactorSuffix(factors, count,
michael@0:                                      s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
michael@0:         break;
michael@0:     }
michael@0:     default:
michael@0:         /* undefined type */
michael@0:         /* zero-terminate */
michael@0:         if(bufferLength>0) {
michael@0:             *buffer=0;
michael@0:         }
michael@0:         break;
michael@0:     }
michael@0: 
michael@0:     return bufferPos;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Enumerates the algorithmic names of the code points start..limit-1 of
michael@0:  * one algorithmic range and calls
michael@0:  * fn(context, code, nameChoice, name, length) for each of them.
michael@0:  *
michael@0:  * Returns FALSE as soon as fn returns FALSE. Returns TRUE otherwise; this
michael@0:  * includes the cases where nothing is enumerated because nameChoice is
michael@0:  * neither U_UNICODE_CHAR_NAME nor U_EXTENDED_CHAR_NAME or because
michael@0:  * range->type is neither 0 nor 1.
michael@0:  *
michael@0:  * The name of start is reported before start is compared with limit, so
michael@0:  * the caller has to pass start<limit.
michael@0:  *
michael@0:  * Important: enumAlgNames() and findAlgName() are almost the same.
michael@0:  * Any fix must be applied to both.
michael@0:  */
michael@0: static UBool
michael@0: enumAlgNames(AlgorithmicRange *range,
michael@0:              UChar32 start, UChar32 limit,
michael@0:              UEnumCharNamesFn *fn, void *context,
michael@0:              UCharNameChoice nameChoice) {
michael@0:     char buffer[200]; /* holds the current name; it is modified in place from one code point to the next */
michael@0:     uint16_t length;
michael@0: 
michael@0:     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
michael@0:         return TRUE;
michael@0:     }
michael@0: 
michael@0:     switch(range->type) {
michael@0:     case 0: {
michael@0:         char *s, *end;
michael@0:         char c;
michael@0: 
michael@0:         /* get the full name of the start character (prefix plus hex digits);
michael@0:          * only the hex digits at its end change for the following code points */
michael@0:         length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
michael@0:         if(length<=0) { /* length is unsigned: this tests for an empty name */
michael@0:             return TRUE;
michael@0:         }
michael@0: 
michael@0:         /* call the enumerator function with this first character */
michael@0:         if(!fn(context, start, nameChoice, buffer, length)) {
michael@0:             return FALSE;
michael@0:         }
michael@0: 
michael@0:         /* go to the end of the name; all these names have the same length */
michael@0:         end=buffer;
michael@0:         while(*end!=0) {
michael@0:             ++end;
michael@0:         }
michael@0: 
michael@0:         /* enumerate the rest of the names */
michael@0:         while(++start<limit) {
michael@0:             /* increment the hexadecimal number on a character-basis:
michael@0:              * walk backwards from the last digit; an 'F' becomes '0' and the
michael@0:              * carry moves one position to the left, '9' becomes 'A', any
michael@0:              * other hex digit is bumped by one. Characters that are not
michael@0:              * uppercase hex digits match no branch and are stepped over. */
michael@0:             s=end;
michael@0:             for (;;) {
michael@0:                 c=*--s;
michael@0:                 if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
michael@0:                     *s=(char)(c+1);
michael@0:                     break;
michael@0:                 } else if(c=='9') {
michael@0:                     *s='A';
michael@0:                     break;
michael@0:                 } else if(c=='F') {
michael@0:                     *s='0';
michael@0:                 }
michael@0:             }
michael@0: 
michael@0:             if(!fn(context, start, nameChoice, buffer, length)) {
michael@0:                 return FALSE;
michael@0:             }
michael@0:         }
michael@0:         break;
michael@0:     }
michael@0:     case 1: {
michael@0:         uint16_t indexes[8];
michael@0:         const char *elementBases[8], *elements[8];
michael@0:         const uint16_t *factors=(const uint16_t *)(range+1);
michael@0:         uint16_t count=range->variant;
michael@0:         const char *s=(const char *)(factors+count);
michael@0:         char *suffix, *t;
michael@0:         uint16_t prefixLength, i, idx;
michael@0: 
michael@0:         char c;
michael@0: 
michael@0:         /* name = prefix factorized-elements */
michael@0: 
michael@0:         /* copy prefix (there is no check against sizeof(buffer) here);
michael@0:          * afterwards suffix points at the first character behind the prefix */
michael@0:         suffix=buffer;
michael@0:         prefixLength=0;
michael@0:         while((c=*s++)!=0) {
michael@0:             *suffix++=c;
michael@0:             ++prefixLength;
michael@0:         }
michael@0: 
michael@0:         /* append the suffix of the start character;
michael@0:          * this also fills indexes[], elementBases[] and elements[],
michael@0:          * which the loop below advances from one code point to the next */
michael@0:         length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
michael@0:                                               s, (uint32_t)start-range->start,
michael@0:                                               indexes, elementBases, elements,
michael@0:                                               suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
michael@0: 
michael@0:         /* call the enumerator function with this first character */
michael@0:         if(!fn(context, start, nameChoice, buffer, length)) {
michael@0:             return FALSE;
michael@0:         }
michael@0: 
michael@0:         /* enumerate the rest of the names */
michael@0:         while(++start<limit) {
michael@0:             /* increment the indexes in lexical order bound by the factors:
michael@0:              * begin with the last factor; either move on to its next element
michael@0:              * string and stop, or wrap around to its first element string
michael@0:              * and continue with the factor in front of it */
michael@0:             i=count;
michael@0:             for (;;) {
michael@0:                 idx=(uint16_t)(indexes[--i]+1);
michael@0:                 if(idx<factors[i]) {
michael@0:                     /* skip one index and its element string */
michael@0:                     indexes[i]=idx;
michael@0:                     s=elements[i];
michael@0:                     while(*s++!=0) {
michael@0:                     }
michael@0:                     elements[i]=s;
michael@0:                     break;
michael@0:                 } else {
michael@0:                     /* reset this index to 0 and its element string to the first one */
michael@0:                     indexes[i]=0;
michael@0:                     elements[i]=elementBases[i];
michael@0:                 }
michael@0:             }
michael@0: 
michael@0:             /* to make matters a little easier, just append all elements to the suffix;
michael@0:              * the whole suffix is rewritten behind the prefix and length is recounted
michael@0:              * (again without a check against sizeof(buffer)) */
michael@0:             t=suffix;
michael@0:             length=prefixLength;
michael@0:             for(i=0; i<count; ++i) {
michael@0:                 s=elements[i];
michael@0:                 while((c=*s++)!=0) {
michael@0:                     *t++=c;
michael@0:                     ++length;
michael@0:                 }
michael@0:             }
michael@0:             /* zero-terminate */
michael@0:             *t=0;
michael@0: 
michael@0:             if(!fn(context, start, nameChoice, buffer, length)) {
michael@0:                 return FALSE;
michael@0:             }
michael@0:         }
michael@0:         break;
michael@0:     }
michael@0:     default:
michael@0:         /* undefined type: nothing to enumerate */
michael@0:         break;
michael@0:     }
michael@0: 
michael@0:     return TRUE;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * findAlgName() is almost the same as enumAlgNames() except that it
michael@0:  * returns the code point for a name if it fits into the range.
michael@0:  * It returns 0xffff otherwise.
michael@0:  *
michael@0:  * otherName is compared character by character, without any case folding
michael@0:  * in this function. 0xffff is also the result when nameChoice is neither
michael@0:  * U_UNICODE_CHAR_NAME nor U_EXTENDED_CHAR_NAME and when range->type is
michael@0:  * neither 0 nor 1.
michael@0:  *
michael@0:  * range type 0: otherName must be the prefix followed by exactly
michael@0:  *               range->variant uppercase hex digits whose value lies in
michael@0:  *               range->start..range->end
michael@0:  * range type 1: otherName must be the prefix followed by the factorized
michael@0:  *               suffix of one of the code points range->start..range->end;
michael@0:  *               the suffixes are generated and compared one by one
michael@0:  */
michael@0: static UChar32
michael@0: findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
michael@0:     UChar32 code;
michael@0: 
michael@0:     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
michael@0:         return 0xffff;
michael@0:     }
michael@0: 
michael@0:     switch(range->type) {
michael@0:     case 0: {
michael@0:         /* name = prefix hex-digits */
michael@0:         const char *s=(const char *)(range+1);
michael@0:         char c;
michael@0: 
michael@0:         uint16_t i, count;
michael@0: 
michael@0:         /* compare prefix; a shorter otherName fails at its terminating NUL */
michael@0:         while((c=*s++)!=0) {
michael@0:             if((char)c!=*otherName++) {
michael@0:                 return 0xffff;
michael@0:             }
michael@0:         }
michael@0: 
michael@0:         /* read hexadecimal code point value; only 0-9 and A-F are accepted */
michael@0:         count=range->variant;
michael@0:         code=0;
michael@0:         for(i=0; i<count; ++i) {
michael@0:             c=*otherName++;
michael@0:             if('0'<=c && c<='9') {
michael@0:                 code=(code<<4)|(c-'0');
michael@0:             } else if('A'<=c && c<='F') {
michael@0:                 code=(code<<4)|(c-'A'+10);
michael@0:             } else {
michael@0:                 return 0xffff;
michael@0:             }
michael@0:         }
michael@0: 
michael@0:         /* does it fit into the range? (the name must also end right behind the digits) */
michael@0:         if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
michael@0:             return code;
michael@0:         }
michael@0:         break;
michael@0:     }
michael@0:     case 1: {
michael@0:         char buffer[64];
michael@0:         uint16_t indexes[8];
michael@0:         const char *elementBases[8], *elements[8];
michael@0:         const uint16_t *factors=(const uint16_t *)(range+1);
michael@0:         uint16_t count=range->variant;
michael@0:         const char *s=(const char *)(factors+count), *t;
michael@0:         UChar32 start, limit;
michael@0:         uint16_t i, idx;
michael@0: 
michael@0:         char c;
michael@0: 
michael@0:         /* name = prefix factorized-elements */
michael@0: 
michael@0:         /* compare prefix; afterwards otherName points at the suffix to be matched */
michael@0:         while((c=*s++)!=0) {
michael@0:             if((char)c!=*otherName++) {
michael@0:                 return 0xffff;
michael@0:             }
michael@0:         }
michael@0: 
michael@0:         start=(UChar32)range->start;
michael@0:         limit=(UChar32)(range->end+1);
michael@0: 
michael@0:         /* initialize the suffix elements for enumeration; indexes should all be set to 0
michael@0:          * (code 0 is passed, so buffer receives the suffix of range->start) */
michael@0:         writeFactorSuffix(factors, count, s, 0,
michael@0:                           indexes, elementBases, elements, buffer, sizeof(buffer));
michael@0: 
michael@0:         /* compare the first suffix */
michael@0:         if(0==uprv_strcmp(otherName, buffer)) {
michael@0:             return start;
michael@0:         }
michael@0: 
michael@0:         /* enumerate and compare the rest of the suffixes */
michael@0:         while(++start<limit) {
michael@0:             /* increment the indexes in lexical order bound by the factors:
michael@0:              * begin with the last factor; either move on to its next element
michael@0:              * string and stop, or wrap around to its first element string
michael@0:              * and continue with the factor in front of it */
michael@0:             i=count;
michael@0:             for (;;) {
michael@0:                 idx=(uint16_t)(indexes[--i]+1);
michael@0:                 if(idx<factors[i]) {
michael@0:                     /* skip one index and its element string */
michael@0:                     indexes[i]=idx;
michael@0:                     s=elements[i];
michael@0:                     while(*s++!=0) {}
michael@0:                     elements[i]=s;
michael@0:                     break;
michael@0:                 } else {
michael@0:                     /* reset this index to 0 and its element string to the first one */
michael@0:                     indexes[i]=0;
michael@0:                     elements[i]=elementBases[i];
michael@0:                 }
michael@0:             }
michael@0: 
michael@0:             /* to make matters a little easier, just compare all elements of the suffix;
michael@0:              * nothing is copied, the element strings are matched directly against otherName */
michael@0:             t=otherName;
michael@0:             for(i=0; i<count; ++i) {
michael@0:                 s=elements[i];
michael@0:                 while((c=*s++)!=0) {
michael@0:                     if(c!=*t++) {
michael@0:                         s=""; /* does not match; the empty string ends the inner loop */
michael@0:                         i=99; /* ends the outer loop and makes the i<99 test below fail */
michael@0:                     }
michael@0:                 }
michael@0:             }
michael@0:             if(i<99 && *t==0) { /* every element matched and otherName ends here */
michael@0:                 return start;
michael@0:             }
michael@0:         }
michael@0:         break;
michael@0:     }
michael@0:     default:
michael@0:         /* undefined type */
michael@0:         break;
michael@0:     }
michael@0: 
michael@0:     return 0xffff;
michael@0: }
michael@0: 
michael@0: /* sets of name characters, maximum name lengths ---------------------------- */
michael@0: 
michael@0: /*
michael@0:  * A set of byte values is kept as 256 bits in a uint32_t[8]:
michael@0:  * the top 3 bits of the byte select the word, the low 5 bits select the bit.
michael@0:  * c is converted to uint8_t first, so negative char values are fine.
michael@0:  * The argument c is evaluated twice; do not pass expressions with side effects.
michael@0:  * c is parenthesized so that operator expressions (for example x|0x20)
michael@0:  * are converted as a whole instead of only their first operand.
michael@0:  */
michael@0: #define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
michael@0: #define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
michael@0: 
michael@0: /*
michael@0:  * Adds every character of the NUL-terminated string s to set
michael@0:  * and returns the number of characters in s.
michael@0:  */
michael@0: static int32_t
michael@0: calcStringSetLength(uint32_t set[8], const char *s) {
michael@0:     int32_t length;
michael@0: 
michael@0:     for(length=0; s[length]!=0; ++length) {
michael@0:         SET_ADD(set, s[length]);
michael@0:     }
michael@0:     return length;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Walks all algorithmic ranges of the loaded names data, adds the characters
michael@0:  * of their prefixes and suffix elements to gNameSet and returns the larger
michael@0:  * of maxNameLength and the longest algorithmic name length found.
michael@0:  * Ranges of an unknown type are passed over without contributing anything.
michael@0:  */
michael@0: static int32_t
michael@0: calcAlgNameSetsLengths(int32_t maxNameLength) {
michael@0:     uint32_t *algHeader;
michael@0:     uint32_t rangesLeft;
michael@0:     AlgorithmicRange *range;
michael@0: 
michael@0:     /* the range count comes first, the ranges follow it */
michael@0:     algHeader=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
michael@0:     rangesLeft=*algHeader;
michael@0:     range=(AlgorithmicRange *)(algHeader+1);
michael@0: 
michael@0:     for(; rangesLeft>0; --rangesLeft) {
michael@0:         if(range->type==0) {
michael@0:             /* name = prefix + (range->variant times) hex-digits */
michael@0:             int32_t nameLength=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
michael@0: 
michael@0:             if(maxNameLength<nameLength) {
michael@0:                 maxNameLength=nameLength;
michael@0:             }
michael@0:         } else if(range->type==1) {
michael@0:             /* name = prefix factorized-elements */
michael@0:             const uint16_t *factors=(const uint16_t *)(range+1);
michael@0:             int32_t factorCount=range->variant;
michael@0:             const char *text=(const char *)(factors+factorCount);
michael@0:             int32_t nameLength, factorIndex;
michael@0: 
michael@0:             /* the prefix, then on to the first suffix element string */
michael@0:             nameLength=calcStringSetLength(gNameSet, text);
michael@0:             text+=nameLength+1;
michael@0: 
michael@0:             /* each factor contributes the length of its longest element */
michael@0:             for(factorIndex=0; factorIndex<factorCount; ++factorIndex) {
michael@0:                 int32_t longest=0;
michael@0:                 int32_t stringsLeft;
michael@0: 
michael@0:                 for(stringsLeft=factors[factorIndex]; stringsLeft>0; --stringsLeft) {
michael@0:                     int32_t elementLength=calcStringSetLength(gNameSet, text);
michael@0: 
michael@0:                     text+=elementLength+1;
michael@0:                     if(longest<elementLength) {
michael@0:                         longest=elementLength;
michael@0:                     }
michael@0:                 }
michael@0:                 nameLength+=longest;
michael@0:             }
michael@0: 
michael@0:             if(maxNameLength<nameLength) {
michael@0:                 maxNameLength=nameLength;
michael@0:             }
michael@0:         }
michael@0:         /* else: unknown type, nothing to add */
michael@0: 
michael@0:         /* ranges have variable sizes; range->size leads to the next one */
michael@0:         range=(AlgorithmicRange *)((uint8_t *)range+range->size);
michael@0:     }
michael@0:     return maxNameLength;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Adds the characters of all category names in charCatNames to gNameSet and
michael@0:  * returns the larger of maxNameLength and the longest extended name length.
michael@0:  *
michael@0:  * An extended name is counted as the category name plus 9 characters:
michael@0:  *   2 for <>
michael@0:  *   1 for -
michael@0:  *   6 for most hex digits per code point
michael@0:  */
michael@0: static int32_t
michael@0: calcExtNameSetsLengths(int32_t maxNameLength) {
michael@0:     int32_t category=0;
michael@0: 
michael@0:     while(category<LENGTHOF(charCatNames)) {
michael@0:         int32_t candidate=calcStringSetLength(gNameSet, charCatNames[category])+9;
michael@0: 
michael@0:         if(maxNameLength<candidate) {
michael@0:             maxNameLength=candidate;
michael@0:         }
michael@0:         ++category;
michael@0:     }
michael@0:     return maxNameLength;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Scans one name field of a compressed line: from *pLine up to lineLimit or
michael@0:  * up to and including the next ';' separator, whichever comes first.
michael@0:  * Adds all characters of the expanded name to set and returns the expanded
michael@0:  * length. On return, *pLine points behind what was consumed.
michael@0:  *
michael@0:  * A byte >= tokenCount stands for itself. Otherwise tokens[] decides:
michael@0:  *   (uint16_t)-2  lead byte of a double-byte token; the next byte is read too
michael@0:  *   (uint16_t)-1  the byte stands for itself
michael@0:  *   anything else offset of the token word in tokenStrings
michael@0:  *
michael@0:  * tokenLengths may be NULL; otherwise it caches word lengths per token
michael@0:  * (0 = not yet computed), which also means that the characters of a cached
michael@0:  * word are not added to set again.
michael@0:  */
michael@0: static int32_t
michael@0: calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
michael@0:                   uint32_t set[8],
michael@0:                   const uint8_t **pLine, const uint8_t *lineLimit) {
michael@0:     const uint8_t *cursor=*pLine;
michael@0:     int32_t total=0;
michael@0: 
michael@0:     while(cursor!=lineLimit) {
michael@0:         uint16_t unit=*cursor++;
michael@0:         uint16_t token;
michael@0:         int32_t wordLength;
michael@0: 
michael@0:         if(unit==(uint8_t)';') {
michael@0:             /* end of this field; the separator has been consumed */
michael@0:             break;
michael@0:         }
michael@0: 
michael@0:         if(unit>=tokenCount) {
michael@0:             /* implicit letter */
michael@0:             SET_ADD(set, unit);
michael@0:             ++total;
michael@0:             continue;
michael@0:         }
michael@0: 
michael@0:         token=tokens[unit];
michael@0:         if(token==(uint16_t)(-2)) {
michael@0:             /* this is a lead byte for a double-byte token */
michael@0:             unit=(uint16_t)(unit<<8|*cursor++);
michael@0:             token=tokens[unit];
michael@0:         }
michael@0: 
michael@0:         if(token==(uint16_t)(-1)) {
michael@0:             /* explicit letter */
michael@0:             SET_ADD(set, unit);
michael@0:             ++total;
michael@0:             continue;
michael@0:         }
michael@0: 
michael@0:         /* token word: take its length from the cache if there is one */
michael@0:         if(tokenLengths==NULL) {
michael@0:             wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
michael@0:         } else {
michael@0:             wordLength=tokenLengths[unit];
michael@0:             if(wordLength==0) {
michael@0:                 wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
michael@0:                 tokenLengths[unit]=(int8_t)wordLength;
michael@0:             }
michael@0:         }
michael@0:         total+=wordLength;
michael@0:     }
michael@0: 
michael@0:     *pLine=cursor;
michael@0:     return total;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Scans the names of all lines of all groups in the loaded names data, adds
michael@0:  * their characters to gNameSet, and finally stores the larger of
michael@0:  * maxNameLength and the longest name found in gMaxNameLength.
michael@0:  *
michael@0:  * Per line, at most the first two ';'-separated fields are scanned (the
michael@0:  * regular name and the Unicode 1.0 name); a third field (ISO comment) is
michael@0:  * not looked at.
michael@0:  *
michael@0:  * A per-token cache of word lengths is allocated for the duration of the
michael@0:  * call; if the allocation fails the scan works without the cache.
michael@0:  */
michael@0: static void
michael@0: calcGroupNameSetsLengths(int32_t maxNameLength) {
michael@0:     uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
michael@0: 
michael@0:     uint16_t *tokens=(uint16_t *)uCharNames+8;
michael@0:     uint16_t tokenCount=*tokens++;
michael@0:     uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
michael@0: 
michael@0:     int8_t *tokenLengths;
michael@0:     const uint16_t *group;
michael@0:     int32_t groupsLeft;
michael@0: 
michael@0:     /* optional cache, zero means "length not yet known" */
michael@0:     tokenLengths=(int8_t *)uprv_malloc(tokenCount);
michael@0:     if(tokenLengths!=NULL) {
michael@0:         uprv_memset(tokenLengths, 0, tokenCount);
michael@0:     }
michael@0: 
michael@0:     /* the group count comes first, the groups follow it */
michael@0:     group=GET_GROUPS(uCharNames);
michael@0:     groupsLeft=*group++;
michael@0: 
michael@0:     for(; groupsLeft>0; --groupsLeft) {
michael@0:         const uint8_t *groupStrings;
michael@0:         int32_t lineNumber;
michael@0: 
michael@0:         /* unpack where each line of this group starts and how long it is */
michael@0:         groupStrings=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
michael@0:         groupStrings=expandGroupLengths(groupStrings, offsets, lengths);
michael@0: 
michael@0:         for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
michael@0:             const uint8_t *line, *lineLimit;
michael@0:             int32_t field, nameLength;
michael@0: 
michael@0:             if(lengths[lineNumber]==0) {
michael@0:                 /* no names for this code point */
michael@0:                 continue;
michael@0:             }
michael@0: 
michael@0:             line=groupStrings+offsets[lineNumber];
michael@0:             lineLimit=line+lengths[lineNumber];
michael@0: 
michael@0:             /* field 0: regular name, field 1: Unicode 1.0 name */
michael@0:             for(field=0; field<2 && line!=lineLimit; ++field) {
michael@0:                 nameLength=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
michael@0:                 if(nameLength>maxNameLength) {
michael@0:                     maxNameLength=nameLength;
michael@0:                 }
michael@0:             }
michael@0:         }
michael@0: 
michael@0:         group=NEXT_GROUP(group);
michael@0:     }
michael@0: 
michael@0:     if(tokenLengths!=NULL) {
michael@0:         uprv_free(tokenLengths);
michael@0:     }
michael@0: 
michael@0:     /* set gMax... - name length last for threading */
michael@0:     gMaxNameLength=maxNameLength;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Makes sure that gNameSet and gMaxNameLength have been computed.
michael@0:  * A non-zero gMaxNameLength is taken to mean that this was done before.
michael@0:  * Returns FALSE only if the names data cannot be loaded (see *pErrorCode),
michael@0:  * TRUE otherwise.
michael@0:  */
michael@0: static UBool
michael@0: calcNameSetsLengths(UErrorCode *pErrorCode) {
michael@0:     static const char extChars[]="0123456789ABCDEF<>-";
michael@0:     const char *extChar;
michael@0: 
michael@0:     if(gMaxNameLength!=0) {
michael@0:         return TRUE;
michael@0:     }
michael@0: 
michael@0:     if(!isDataLoaded(pErrorCode)) {
michael@0:         return FALSE;
michael@0:     }
michael@0: 
michael@0:     /* set hex digits, used in various names, and <>-, used in extended names */
michael@0:     for(extChar=extChars; *extChar!=0; ++extChar) {
michael@0:         SET_ADD(gNameSet, *extChar);
michael@0:     }
michael@0: 
michael@0:     /*
michael@0:      * algorithmic names first, then extended names, then group names;
michael@0:      * each step receives the maximum so far and the last one stores the
michael@0:      * global maximum
michael@0:      */
michael@0:     calcGroupNameSetsLengths(calcExtNameSetsLengths(calcAlgNameSetsLengths(0)));
michael@0: 
michael@0:     return TRUE;
michael@0: }
michael@0: 
michael@0: /* public API --------------------------------------------------------------- */
michael@0: 
michael@0: /*
michael@0:  * Public API: gets the name of the code point `code` for the given
michael@0:  * nameChoice.
michael@0:  *
michael@0:  * buffer/bufferLength receive the name; bufferLength may be 0 (then buffer
michael@0:  * may be NULL). The result is passed through u_terminateChars() (declared
michael@0:  * elsewhere), whose return value is returned.
michael@0:  *
michael@0:  * Errors:
michael@0:  *   - pErrorCode NULL or already a failure: returns 0, nothing is touched
michael@0:  *   - nameChoice out of range, bufferLength<0, or a NULL buffer with
michael@0:  *     bufferLength>0: sets U_ILLEGAL_ARGUMENT_ERROR and returns 0
michael@0:  *
michael@0:  * A code point above UCHAR_MAX_VALUE, or names data that cannot be loaded,
michael@0:  * yields an empty name.
michael@0:  *
michael@0:  * Lookup order: algorithmic ranges first; if code is in none of them, the
michael@0:  * regular names, and for U_EXTENDED_CHAR_NAME the generated extended name
michael@0:  * when there is no regular one.
michael@0:  */
michael@0: U_CAPI int32_t U_EXPORT2
michael@0: u_charName(UChar32 code, UCharNameChoice nameChoice,
michael@0:            char *buffer, int32_t bufferLength,
michael@0:            UErrorCode *pErrorCode) {
michael@0:     AlgorithmicRange *algRange;
michael@0:     uint32_t *p;
michael@0:     uint32_t i;
michael@0:     int32_t length;
michael@0:     uint16_t capacity;
michael@0: 
michael@0:     /* check the argument values */
michael@0:     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
michael@0:         return 0;
michael@0:     } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
michael@0:               bufferLength<0 || (bufferLength>0 && buffer==NULL)
michael@0:     ) {
michael@0:         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
michael@0:         return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
michael@0:     }
michael@0: 
michael@0:     /*
michael@0:      * The internal name writers take a 16-bit capacity. Clamp instead of
michael@0:      * just casting: a plain (uint16_t) cast turns a capacity like 0x10000
michael@0:      * into 0, nothing would be written, and the caller would get a length
michael@0:      * without an error for a buffer that was never filled.
michael@0:      * bufferLength is known to be >=0 here.
michael@0:      */
michael@0:     capacity=(uint16_t)(bufferLength>0xffff ? 0xffff : bufferLength);
michael@0: 
michael@0:     length=0;
michael@0: 
michael@0:     /* try algorithmic names first */
michael@0:     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
michael@0:     i=*p;
michael@0:     algRange=(AlgorithmicRange *)(p+1);
michael@0:     while(i>0) {
michael@0:         if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
michael@0:             length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, capacity);
michael@0:             break;
michael@0:         }
michael@0:         algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
michael@0:         --i;
michael@0:     }
michael@0: 
michael@0:     /* i==0: the loop ran to its end, code is not in any algorithmic range */
michael@0:     if(i==0) {
michael@0:         if (nameChoice == U_EXTENDED_CHAR_NAME) {
michael@0:             length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, capacity);
michael@0:             if (!length) {
michael@0:                 /* extended character name */
michael@0:                 length = getExtName((uint32_t) code, buffer, capacity);
michael@0:             }
michael@0:         } else {
michael@0:             /* normal character name */
michael@0:             length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, capacity);
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     return u_terminateChars(buffer, bufferLength, length, pErrorCode);
michael@0: }
michael@0: 
michael@0: U_CAPI int32_t U_EXPORT2
michael@0: u_getISOComment(UChar32 /*c*/,
michael@0:                 char *dest, int32_t destCapacity,
michael@0:                 UErrorCode *pErrorCode) {
michael@0:     /* check the argument values */
michael@0:     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
michael@0:         return 0;
michael@0:     } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
michael@0:         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     return u_terminateChars(dest, destCapacity, 0, pErrorCode);
michael@0: }
michael@0: 
michael@0: U_CAPI UChar32 U_EXPORT2
michael@0: u_charFromName(UCharNameChoice nameChoice,
michael@0:                const char *name,
michael@0:                UErrorCode *pErrorCode) {
michael@0:     char upper[120], lower[120];
michael@0:     FindName findName;
michael@0:     AlgorithmicRange *algRange;
michael@0:     uint32_t *p;
michael@0:     uint32_t i;
michael@0:     UChar32 cp = 0;
michael@0:     char c0;
michael@0:     UChar32 error = 0xffff;     /* Undefined, but use this for backwards compatibility. */
michael@0: 
michael@0:     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
michael@0:         return error;
michael@0:     }
michael@0: 
michael@0:     if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
michael@0:         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0:         return error;
michael@0:     }
michael@0: 
michael@0:     if(!isDataLoaded(pErrorCode)) {
michael@0:         return error;
michael@0:     }
michael@0: 
michael@0:     /* construct the uppercase and lowercase of the name first */
michael@0:     for(i=0; i<sizeof(upper); ++i) {
michael@0:         if((c0=*name++)!=0) {
michael@0:             upper[i]=uprv_toupper(c0);
michael@0:             lower[i]=uprv_tolower(c0);
michael@0:         } else {
michael@0:             upper[i]=lower[i]=0;
michael@0:             break;
michael@0:         }
michael@0:     }
michael@0:     if(i==sizeof(upper)) {
michael@0:         /* name too long, there is no such character */
michael@0:         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
michael@0:         return error;
michael@0:     }
michael@0: 
michael@0:     /* try extended names first */
michael@0:     if (lower[0] == '<') {
michael@0:         if (nameChoice == U_EXTENDED_CHAR_NAME) {
michael@0:             if (lower[--i] == '>') {
michael@0:                 for (--i; lower[i] && lower[i] != '-'; --i) {
michael@0:                 }
michael@0: 
michael@0:                 if (lower[i] == '-') { /* We've got a category. */
michael@0:                     uint32_t cIdx;
michael@0: 
michael@0:                     lower[i] = 0;
michael@0: 
michael@0:                     for (++i; lower[i] != '>'; ++i) {
michael@0:                         if (lower[i] >= '0' && lower[i] <= '9') {
michael@0:                             cp = (cp << 4) + lower[i] - '0';
michael@0:                         } else if (lower[i] >= 'a' && lower[i] <= 'f') {
michael@0:                             cp = (cp << 4) + lower[i] - 'a' + 10;
michael@0:                         } else {
michael@0:                             *pErrorCode = U_ILLEGAL_CHAR_FOUND;
michael@0:                             return error;
michael@0:                         }
michael@0:                     }
michael@0: 
michael@0:                     /* Now validate the category name.
michael@0:                        We could use a binary search, or a trie, if
michael@0:                        we really wanted to. */
michael@0: 
michael@0:                     for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
michael@0: 
michael@0:                         if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
michael@0:                             if (getCharCat(cp) == cIdx) {
michael@0:                                 return cp;
michael@0:                             }
michael@0:                             break;
michael@0:                         }
michael@0:                     }
michael@0:                 }
michael@0:             }
michael@0:         }
michael@0: 
michael@0:         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
michael@0:         return error;
michael@0:     }
michael@0: 
michael@0:     /* try algorithmic names now */
michael@0:     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
michael@0:     i=*p;
michael@0:     algRange=(AlgorithmicRange *)(p+1);
michael@0:     while(i>0) {
michael@0:         if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
michael@0:             return cp;
michael@0:         }
michael@0:         algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
michael@0:         --i;
michael@0:     }
michael@0: 
michael@0:     /* normal character name */
michael@0:     findName.otherName=upper;
michael@0:     findName.code=error;
michael@0:     enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
michael@0:     if (findName.code == error) {
michael@0:          *pErrorCode = U_ILLEGAL_CHAR_FOUND;
michael@0:     }
michael@0:     return findName.code;
michael@0: }
michael@0: 
michael@0: U_CAPI void U_EXPORT2
michael@0: u_enumCharNames(UChar32 start, UChar32 limit,
michael@0:                 UEnumCharNamesFn *fn,
michael@0:                 void *context,
michael@0:                 UCharNameChoice nameChoice,
michael@0:                 UErrorCode *pErrorCode) {
michael@0:     AlgorithmicRange *algRange;
michael@0:     uint32_t *p;
michael@0:     uint32_t i;
michael@0: 
michael@0:     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
michael@0:         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
michael@0:         limit = UCHAR_MAX_VALUE + 1;
michael@0:     }
michael@0:     if((uint32_t)start>=(uint32_t)limit) {
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     if(!isDataLoaded(pErrorCode)) {
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     /* interleave the data-driven ones with the algorithmic ones */
michael@0:     /* iterate over all algorithmic ranges; assume that they are in ascending order */
michael@0:     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
michael@0:     i=*p;
michael@0:     algRange=(AlgorithmicRange *)(p+1);
michael@0:     while(i>0) {
michael@0:         /* enumerate the character names before the current algorithmic range */
michael@0:         /* here: start<limit */
michael@0:         if((uint32_t)start<algRange->start) {
michael@0:             if((uint32_t)limit<=algRange->start) {
michael@0:                 enumNames(uCharNames, start, limit, fn, context, nameChoice);
michael@0:                 return;
michael@0:             }
michael@0:             if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
michael@0:                 return;
michael@0:             }
michael@0:             start=(UChar32)algRange->start;
michael@0:         }
michael@0:         /* enumerate the character names in the current algorithmic range */
michael@0:         /* here: algRange->start<=start<limit */
michael@0:         if((uint32_t)start<=algRange->end) {
michael@0:             if((uint32_t)limit<=(algRange->end+1)) {
michael@0:                 enumAlgNames(algRange, start, limit, fn, context, nameChoice);
michael@0:                 return;
michael@0:             }
michael@0:             if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
michael@0:                 return;
michael@0:             }
michael@0:             start=(UChar32)algRange->end+1;
michael@0:         }
michael@0:         /* continue to the next algorithmic range (here: start<limit) */
michael@0:         algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
michael@0:         --i;
michael@0:     }
michael@0:     /* enumerate the character names after the last algorithmic range */
michael@0:     enumNames(uCharNames, start, limit, fn, context, nameChoice);
michael@0: }
michael@0: 
michael@0: U_CAPI int32_t U_EXPORT2
michael@0: uprv_getMaxCharNameLength() {
michael@0:     UErrorCode errorCode=U_ZERO_ERROR;
michael@0:     if(calcNameSetsLengths(&errorCode)) {
michael@0:         return gMaxNameLength;
michael@0:     } else {
michael@0:         return 0;
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Converts the char set cset into a Unicode set uset.
michael@0:  * @param cset Set of 256 bit flags corresponding to a set of chars.
michael@0:  * @param uset USet to receive characters. Existing contents are deleted.
michael@0:  */
michael@0: static void
michael@0: charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
michael@0:     UChar us[256];
michael@0:     char cs[256];
michael@0: 
michael@0:     int32_t i, length;
michael@0:     UErrorCode errorCode;
michael@0: 
michael@0:     errorCode=U_ZERO_ERROR;
michael@0: 
michael@0:     if(!calcNameSetsLengths(&errorCode)) {
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     /* build a char string with all chars that are used in character names */
michael@0:     length=0;
michael@0:     for(i=0; i<256; ++i) {
michael@0:         if(SET_CONTAINS(cset, i)) {
michael@0:             cs[length++]=(char)i;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     /* convert the char string to a UChar string */
michael@0:     u_charsToUChars(cs, us, length);
michael@0: 
michael@0:     /* add each UChar to the USet */
michael@0:     for(i=0; i<length; ++i) {
michael@0:         if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
michael@0:             sa->add(sa->set, us[i]);
michael@0:         }
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Fills set with characters that are used in Unicode character names.
michael@0:  * @param set USet to receive characters.
michael@0:  */
michael@0: U_CAPI void U_EXPORT2
michael@0: uprv_getCharNameCharacters(const USetAdder *sa) {
michael@0:     charSetToUSet(gNameSet, sa);
michael@0: }
michael@0: 
michael@0: /* data swapping ------------------------------------------------------------ */
michael@0: 
michael@0: /*
michael@0:  * The token table contains non-negative entries for token bytes,
michael@0:  * and -1 for bytes that represent themselves in the data file's charset.
michael@0:  * -2 entries are used for lead bytes.
michael@0:  *
michael@0:  * Direct bytes (-1 entries) must be translated from the input charset family
michael@0:  * to the output charset family.
michael@0:  * makeTokenMap() writes a permutation mapping for this.
michael@0:  * Use it once for single-/lead-byte tokens and once more for all trail byte
michael@0:  * tokens. (';' is an unused trail byte marked with -1.)
michael@0:  */
michael@0: static void
michael@0: makeTokenMap(const UDataSwapper *ds,
michael@0:              int16_t tokens[], uint16_t tokenCount,
michael@0:              uint8_t map[256],
michael@0:              UErrorCode *pErrorCode) {
michael@0:     UBool usedOutChar[256];
michael@0:     uint16_t i, j;
michael@0:     uint8_t c1, c2;
michael@0: 
michael@0:     if(U_FAILURE(*pErrorCode)) {
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     if(ds->inCharset==ds->outCharset) {
michael@0:         /* Same charset family: identity permutation */
michael@0:         for(i=0; i<256; ++i) {
michael@0:             map[i]=(uint8_t)i;
michael@0:         }
michael@0:     } else {
michael@0:         uprv_memset(map, 0, 256);
michael@0:         uprv_memset(usedOutChar, 0, 256);
michael@0: 
michael@0:         if(tokenCount>256) {
michael@0:             tokenCount=256;
michael@0:         }
michael@0: 
michael@0:         /* set the direct bytes (byte 0 always maps to itself) */
michael@0:         for(i=1; i<tokenCount; ++i) {
michael@0:             if(tokens[i]==-1) {
michael@0:                 /* convert the direct byte character */
michael@0:                 c1=(uint8_t)i;
michael@0:                 ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
michael@0:                 if(U_FAILURE(*pErrorCode)) {
michael@0:                     udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
michael@0:                                      i, ds->inCharset);
michael@0:                     return;
michael@0:                 }
michael@0: 
michael@0:                 /* enter the converted character into the map and mark it used */
michael@0:                 map[c1]=c2;
michael@0:                 usedOutChar[c2]=TRUE;
michael@0:             }
michael@0:         }
michael@0: 
michael@0:         /* set the mappings for the rest of the permutation */
michael@0:         for(i=j=1; i<tokenCount; ++i) {
michael@0:             /* set mappings that were not set for direct bytes */
michael@0:             if(map[i]==0) {
michael@0:                 /* set an output byte value that was not used as an output byte above */
michael@0:                 while(usedOutChar[j]) {
michael@0:                     ++j;
michael@0:                 }
michael@0:                 map[i]=(uint8_t)j++;
michael@0:             }
michael@0:         }
michael@0: 
michael@0:         /*
michael@0:          * leave mappings at tokenCount and above unset if tokenCount<256
michael@0:          * because they won't be used
michael@0:          */
michael@0:     }
michael@0: }
michael@0: 
michael@0: U_CAPI int32_t U_EXPORT2
michael@0: uchar_swapNames(const UDataSwapper *ds,
michael@0:                 const void *inData, int32_t length, void *outData,
michael@0:                 UErrorCode *pErrorCode) {
michael@0:     const UDataInfo *pInfo;
michael@0:     int32_t headerSize;
michael@0: 
michael@0:     const uint8_t *inBytes;
michael@0:     uint8_t *outBytes;
michael@0: 
michael@0:     uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
michael@0:              offset, i, count, stringsCount;
michael@0: 
michael@0:     const AlgorithmicRange *inRange;
michael@0:     AlgorithmicRange *outRange;
michael@0: 
michael@0:     /* udata_swapDataHeader checks the arguments */
michael@0:     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
michael@0:     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     /* check data format and format version */
michael@0:     pInfo=(const UDataInfo *)((const char *)inData+4);
michael@0:     if(!(
michael@0:         pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
michael@0:         pInfo->dataFormat[1]==0x6e &&
michael@0:         pInfo->dataFormat[2]==0x61 &&
michael@0:         pInfo->dataFormat[3]==0x6d &&
michael@0:         pInfo->formatVersion[0]==1
michael@0:     )) {
michael@0:         udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
michael@0:                          pInfo->dataFormat[0], pInfo->dataFormat[1],
michael@0:                          pInfo->dataFormat[2], pInfo->dataFormat[3],
michael@0:                          pInfo->formatVersion[0]);
michael@0:         *pErrorCode=U_UNSUPPORTED_ERROR;
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     inBytes=(const uint8_t *)inData+headerSize;
michael@0:     outBytes=(uint8_t *)outData+headerSize;
michael@0:     if(length<0) {
michael@0:         algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
michael@0:     } else {
michael@0:         length-=headerSize;
michael@0:         if( length<20 ||
michael@0:             (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
michael@0:         ) {
michael@0:             udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
michael@0:                              length);
michael@0:             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0:             return 0;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     if(length<0) {
michael@0:         /* preflighting: iterate through algorithmic ranges */
michael@0:         offset=algNamesOffset;
michael@0:         count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
michael@0:         offset+=4;
michael@0: 
michael@0:         for(i=0; i<count; ++i) {
michael@0:             inRange=(const AlgorithmicRange *)(inBytes+offset);
michael@0:             offset+=ds->readUInt16(inRange->size);
michael@0:         }
michael@0:     } else {
michael@0:         /* swap data */
michael@0:         const uint16_t *p;
michael@0:         uint16_t *q, *temp;
michael@0: 
michael@0:         int16_t tokens[512];
michael@0:         uint16_t tokenCount;
michael@0: 
michael@0:         uint8_t map[256], trailMap[256];
michael@0: 
michael@0:         /* copy the data for inaccessible bytes */
michael@0:         if(inBytes!=outBytes) {
michael@0:             uprv_memcpy(outBytes, inBytes, length);
michael@0:         }
michael@0: 
michael@0:         /* the initial 4 offsets first */
michael@0:         tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
michael@0:         groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
michael@0:         groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
michael@0:         ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
michael@0: 
michael@0:         /*
michael@0:          * now the tokens table
michael@0:          * it needs to be permutated along with the compressed name strings
michael@0:          */
michael@0:         p=(const uint16_t *)(inBytes+16);
michael@0:         q=(uint16_t *)(outBytes+16);
michael@0: 
michael@0:         /* read and swap the tokenCount */
michael@0:         tokenCount=ds->readUInt16(*p);
michael@0:         ds->swapArray16(ds, p, 2, q, pErrorCode);
michael@0:         ++p;
michael@0:         ++q;
michael@0: 
michael@0:         /* read the first 512 tokens and make the token maps */
michael@0:         if(tokenCount<=512) {
michael@0:             count=tokenCount;
michael@0:         } else {
michael@0:             count=512;
michael@0:         }
michael@0:         for(i=0; i<count; ++i) {
michael@0:             tokens[i]=udata_readInt16(ds, p[i]);
michael@0:         }
michael@0:         for(; i<512; ++i) {
michael@0:             tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
michael@0:         }
michael@0:         makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
michael@0:         makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
michael@0:         if(U_FAILURE(*pErrorCode)) {
michael@0:             return 0;
michael@0:         }
michael@0: 
michael@0:         /*
michael@0:          * swap and permutate the tokens
michael@0:          * go through a temporary array to support in-place swapping
michael@0:          */
michael@0:         temp=(uint16_t *)uprv_malloc(tokenCount*2);
michael@0:         if(temp==NULL) {
michael@0:             udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
michael@0:                              tokenCount);
michael@0:             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0:             return 0;
michael@0:         }
michael@0: 
michael@0:         /* swap and permutate single-/lead-byte tokens */
michael@0:         for(i=0; i<tokenCount && i<256; ++i) {
michael@0:             ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
michael@0:         }
michael@0: 
michael@0:         /* swap and permutate trail-byte tokens */
michael@0:         for(; i<tokenCount; ++i) {
michael@0:             ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
michael@0:         }
michael@0: 
michael@0:         /* copy the result into the output and free the temporary array */
michael@0:         uprv_memcpy(q, temp, tokenCount*2);
michael@0:         uprv_free(temp);
michael@0: 
michael@0:         /*
michael@0:          * swap the token strings but not a possible padding byte after
michael@0:          * the terminating NUL of the last string
michael@0:          */
michael@0:         udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
michael@0:                                     outBytes+tokenStringOffset, pErrorCode);
michael@0:         if(U_FAILURE(*pErrorCode)) {
michael@0:             udata_printError(ds, "uchar_swapNames(token strings) failed\n");
michael@0:             return 0;
michael@0:         }
michael@0: 
michael@0:         /* swap the group table */
michael@0:         count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
michael@0:         ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
michael@0:                            outBytes+groupsOffset, pErrorCode);
michael@0: 
michael@0:         /*
michael@0:          * swap the group strings
michael@0:          * swap the string bytes but not the nibble-encoded string lengths
michael@0:          */
michael@0:         if(ds->inCharset!=ds->outCharset) {
michael@0:             uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
michael@0: 
michael@0:             const uint8_t *inStrings, *nextInStrings;
michael@0:             uint8_t *outStrings;
michael@0: 
michael@0:             uint8_t c;
michael@0: 
michael@0:             inStrings=inBytes+groupStringOffset;
michael@0:             outStrings=outBytes+groupStringOffset;
michael@0: 
michael@0:             stringsCount=algNamesOffset-groupStringOffset;
michael@0: 
michael@0:             /* iterate through string groups until only a few padding bytes are left */
michael@0:             while(stringsCount>32) {
michael@0:                 nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
michael@0: 
michael@0:                 /* move past the length bytes */
michael@0:                 stringsCount-=(uint32_t)(nextInStrings-inStrings);
michael@0:                 outStrings+=nextInStrings-inStrings;
michael@0:                 inStrings=nextInStrings;
michael@0: 
michael@0:                 count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
michael@0:                 stringsCount-=count;
michael@0: 
michael@0:                 /* swap the string bytes using map[] and trailMap[] */
michael@0:                 while(count>0) {
michael@0:                     c=*inStrings++;
michael@0:                     *outStrings++=map[c];
michael@0:                     if(tokens[c]!=-2) {
michael@0:                         --count;
michael@0:                     } else {
michael@0:                         /* token lead byte: swap the trail byte, too */
michael@0:                         *outStrings++=trailMap[*inStrings++];
michael@0:                         count-=2;
michael@0:                     }
michael@0:                 }
michael@0:             }
michael@0:         }
michael@0: 
michael@0:         /* swap the algorithmic ranges */
michael@0:         offset=algNamesOffset;
michael@0:         count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
michael@0:         ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
michael@0:         offset+=4;
michael@0: 
michael@0:         for(i=0; i<count; ++i) {
michael@0:             if(offset>(uint32_t)length) {
michael@0:                 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
michael@0:                                  length, i);
michael@0:                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0:                 return 0;
michael@0:             }
michael@0: 
michael@0:             inRange=(const AlgorithmicRange *)(inBytes+offset);
michael@0:             outRange=(AlgorithmicRange *)(outBytes+offset);
michael@0:             offset+=ds->readUInt16(inRange->size);
michael@0: 
michael@0:             ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
michael@0:             ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
michael@0:             switch(inRange->type) {
michael@0:             case 0:
michael@0:                 /* swap prefix string */
michael@0:                 ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
michael@0:                                     outRange+1, pErrorCode);
michael@0:                 if(U_FAILURE(*pErrorCode)) {
michael@0:                     udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
michael@0:                                      i);
michael@0:                     return 0;
michael@0:                 }
michael@0:                 break;
michael@0:             case 1:
michael@0:                 {
michael@0:                     /* swap factors and the prefix and factor strings */
michael@0:                     uint32_t factorsCount;
michael@0: 
michael@0:                     factorsCount=inRange->variant;
michael@0:                     p=(const uint16_t *)(inRange+1);
michael@0:                     q=(uint16_t *)(outRange+1);
michael@0:                     ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
michael@0: 
michael@0:                     /* swap the strings, up to the last terminating NUL */
michael@0:                     p+=factorsCount;
michael@0:                     q+=factorsCount;
michael@0:                     stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
michael@0:                     while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
michael@0:                         --stringsCount;
michael@0:                     }
michael@0:                     ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
michael@0:                 }
michael@0:                 break;
michael@0:             default:
michael@0:                 udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
michael@0:                                  inRange->type, i);
michael@0:                 *pErrorCode=U_UNSUPPORTED_ERROR;
michael@0:                 return 0;
michael@0:             }
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     return headerSize+(int32_t)offset;
michael@0: }
michael@0: 
michael@0: U_NAMESPACE_END
michael@0: 
michael@0: /*
michael@0:  * Hey, Emacs, please set the following:
michael@0:  *
michael@0:  * Local Variables:
michael@0:  * indent-tabs-mode: nil
michael@0:  * End:
michael@0:  *
michael@0:  */