The Tor Browser: comparison intl/icu/source/common/unames.cpp

--1:000000000000
+:892c1de9d9fe
+/*
+******************************************************************************
+*
+*   Copyright (C) 1999-2013, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
+******************************************************************************
+*   file name:  unames.cpp
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 1999oct04
+*   created by: Markus W. Scherer
+*/
+#include "unicode/utypes.h"
+#include "unicode/putil.h"
+#include "unicode/uchar.h"
+#include "unicode/udata.h"
+#include "unicode/utf.h"
+#include "unicode/utf16.h"
+#include "uassert.h"
+#include "ustr_imp.h"
+#include "umutex.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "ucln_cmn.h"
+#include "udataswp.h"
+#include "uprops.h"
+U_NAMESPACE_BEGIN
+/* prototypes ------------------------------------------------------------- */
+/* Number of elements of a true array (not a pointer), as an int32_t. */
+#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
+/* Name and type of the data item requested from udata_openChoice(). */
+static const char DATA_NAME[] = "unames";
+static const char DATA_TYPE[] = "icu";
+/* Names are stored in groups of (1<<GROUP_SHIFT)==32 consecutive code points. */
+#define GROUP_SHIFT 5
+#define LINES_PER_GROUP (1L<<GROUP_SHIFT)
+/* Mask that extracts a code point's line number inside its group. */
+#define GROUP_MASK (LINES_PER_GROUP-1)
+/*
+* This struct was replaced by explicitly accessing equivalent
+* fields from triples of uint16_t.
+* The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
+* which broke the assumption that sizeof(Group)==6 and that the ++ operator
+* would advance by 6 bytes (3 uint16_t).
+*
+* We can't just change the data structure because it's loaded from a data file,
+* and we don't want to make it less compact, so we changed the access code.
+*
+* For details see ICU tickets 6331 and 6008.
+typedef struct {
+uint16_t groupMSB,
+offsetHigh, offsetLow; / * avoid padding * /
+} Group;
+*/
+/* Indexes of the uint16_t fields inside one group triple. */
+enum {
+    GROUP_MSB,          /* code point >> GROUP_SHIFT for the code points of this group */
+    GROUP_OFFSET_HIGH,  /* upper 16 bits of the group's string offset */
+    GROUP_OFFSET_LOW,   /* lower 16 bits of the group's string offset */
+    GROUP_LENGTH        /* number of uint16_t per group; not a field itself */
+};
+/*
+* Get the 32-bit group offset.
+* The high half is widened to uint32_t before the shift so that a set bit 15
+* is never shifted into the sign bit of a signed integer.
+* @param group (const uint16_t *) pointer to a Group triple of uint16_t
+* @return group offset (int32_t)
+*/
+#define GET_GROUP_OFFSET(group) ((int32_t)((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW]))
+/* Step forward or backward by one group triple. */
+#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
+#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
+typedef struct { /* header of one algorithmic-name range; its prefix/factor data follows directly after it */
+uint32_t start, end; /* first and last code point of the range, both inclusive */
+uint8_t type, variant; /* type 0: prefix + hex digits (variant = digit count); type 1: prefix + factorized elements (variant = factor count) */
+uint16_t size; /* byte distance from this range header to the next one */
+} AlgorithmicRange;
+typedef struct { /* header at the start of the loaded names data */
+uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset; /* byte offsets, each relative to the start of this struct */
+} UCharNames;
+/*
+* Get the groups table from a UCharNames struct.
+* The groups table consists of one uint16_t groupCount followed by
+* groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
+* and the comment for the old struct Group above.
+*
+* The argument is evaluated twice, so it must not have side effects.
+*
+* @param names (const UCharNames *) pointer to the UCharNames indexes
+* @return (const uint16_t *) pointer to the groups table
+*/
+#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
+typedef struct { /* search state passed as "context" when a name is looked up instead of enumerated */
+const char *otherName; /* the name that is being searched for */
+UChar32 code; /* receives the code point whose name matched */
+} FindName;
+#define DO_FIND_NAME NULL /* passed in place of an enumerator callback to request a name search */
+static UDataMemory *uCharNamesData=NULL; /* handle returned by udata_openChoice(); closed in unames_cleanup() */
+static UCharNames *uCharNames=NULL; /* memory of uCharNamesData, viewed as the names header */
+static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER; /* guards the one-time call of loadCharNames() */
+/*
+* Maximum length of character names (regular & 1.0).
+*/
+static int32_t gMaxNameLength=0;
+/*
+* Set of chars used in character names (regular & 1.0).
+* Chars are platform-dependent (can be EBCDIC).
+* Stored as a 256-bit set: 8 words of 32 bits, see SET_ADD().
+*/
+static uint32_t gNameSet[8]={ 0 };
+/*
+* Extended categories used only for synthetic "extended" names.
+* They continue the numbering after the regular general categories; every
+* replacement list is parenthesized so it expands safely inside expressions.
+*/
+#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
+#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
+#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)
+#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
+/* Category labels for extended names, indexed by the value from getCharCat(). */
+static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
+    "unassigned",
+    "uppercase letter",
+    "lowercase letter",
+    "titlecase letter",
+    "modifier letter",
+    "other letter",
+    "non spacing mark",
+    "enclosing mark",
+    "combining spacing mark",
+    "decimal digit number",
+    "letter number",
+    "other number",
+    "space separator",
+    "line separator",
+    "paragraph separator",
+    "control",
+    "format",
+    "private use area",
+    "surrogate",
+    "dash punctuation",
+    "start punctuation",
+    "end punctuation",
+    "connector punctuation",
+    "other punctuation",
+    "math symbol",
+    "currency symbol",
+    "modifier symbol",
+    "other symbol",
+    "initial punctuation",
+    "final punctuation",
+    /* extended categories, in the order of the macros above */
+    "noncharacter",
+    "lead surrogate",
+    "trail surrogate"
+};
+/* implementation ----------------------------------------------------------- */
+/*
+* Cleanup hook registered by loadCharNames(): closes the names data and
+* resets the cached state of this file so that the data can be loaded again.
+* Always returns TRUE.
+*/
+static UBool U_CALLCONV unames_cleanup(void)
+{
+    if(uCharNamesData!=NULL) {
+        udata_close(uCharNamesData);
+    }
+    uCharNamesData=NULL;
+    /* uCharNames only pointed into the data that was just closed */
+    uCharNames=NULL;
+    gMaxNameLength=0;
+    gCharNamesInitOnce.reset();
+    return TRUE;
+}
+/*
+* Acceptance callback given to udata_openChoice(): returns TRUE only for data
+* whose header has at least 20 bytes, matches this platform's endianness and
+* charset family, has the data format "unam" and format version 1.
+*/
+static UBool U_CALLCONV
+isAcceptable(void * /*context*/,
+             const char * /*type*/, const char * /*name*/,
+             const UDataInfo *pInfo) {
+    if(pInfo->size<20) {
+        return FALSE;
+    }
+    if(pInfo->isBigEndian!=U_IS_BIG_ENDIAN || pInfo->charsetFamily!=U_CHARSET_FAMILY) {
+        return FALSE;
+    }
+    /* dataFormat="unam" */
+    if(pInfo->dataFormat[0]!=0x75 ||
+       pInfo->dataFormat[1]!=0x6e ||
+       pInfo->dataFormat[2]!=0x61 ||
+       pInfo->dataFormat[3]!=0x6d) {
+        return FALSE;
+    }
+    return (UBool)(pInfo->formatVersion[0]==1);
+}
+/*
+* One-time initializer (run through gCharNamesInitOnce): opens the names data
+* and caches the pointer to its contents. On failure both globals stay NULL
+* and the error is left in status. The cleanup hook is registered either way.
+*/
+static void U_CALLCONV
+loadCharNames(UErrorCode &status) {
+    U_ASSERT(uCharNamesData == NULL);
+    U_ASSERT(uCharNames == NULL);
+    uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status);
+    if(U_SUCCESS(status)) {
+        uCharNames = (UCharNames *)udata_getMemory(uCharNamesData);
+    } else {
+        uCharNamesData = NULL;
+    }
+    ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
+}
+/*
+* Makes sure that the names data load has been attempted exactly once.
+* @param pErrorCode in/out error code handed to umtx_initOnce()
+* @return TRUE if *pErrorCode indicates success afterwards
+*/
+static UBool
+isDataLoaded(UErrorCode *pErrorCode) {
+    umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
+    if(U_FAILURE(*pErrorCode)) {
+        return FALSE;
+    }
+    return TRUE;
+}
+/*
+* Append one char to an output buffer with preflighting semantics:
+* c is stored (and buffer/bufferLength are advanced) only while capacity
+* remains, but bufferPos is always incremented, so it ends up as the full
+* length that the output needs.
+* buffer, bufferLength and bufferPos must be modifiable lvalues.
+*/
+#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
+    if((bufferLength)>0) { \
+        *(buffer)++=(c); \
+        --(bufferLength); \
+    } \
+    ++(bufferPos); \
+}
+/* Internal name choice for the ISO comment field; numbered after the public UCharNameChoice values. */
+#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
+/*
+* expandName() expands one compressed name line into buffer.
+* Each input byte is either a literal character or a token number that
+* selects a zero-terminated word from the token string table; some tokens
+* are selected by two bytes (lead byte marked with -2 in the token table).
+*
+* @param names pointer to the names data header
+* @param name compressed bytes of the line; may hold several ';'-separated fields
+* @param nameLength number of bytes in name
+* @param nameChoice which name field is requested
+* @param buffer output buffer; zero-terminated only if capacity is left
+* @param bufferLength capacity of buffer
+* @return length of the expanded name; chars that did not fit into buffer
+*         are counted too
+*
+* Important: expandName() and compareName() are almost the same -
+* apply fixes to both.
+*
+* UnicodeData.txt uses ';' as a field separator, so no
+* field can contain ';' as part of its contents.
+* In unames.dat, it is marked as token[';']==-1 only if the
+* semicolon is used in the data file - which is iff we
+* have Unicode 1.0 names or ISO comments or aliases.
+* So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
+* although we know that it will never be part of a name.
+*/
+static uint16_t
+expandName(UCharNames *names,
+const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
+char *buffer, uint16_t bufferLength) {
+uint16_t *tokens=(uint16_t *)names+8; /* skip the four uint32_t header offsets (8 uint16_t) */
+uint16_t token, tokenCount=*tokens++, bufferPos=0; /* the token table starts with its own count */
+uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
+uint8_t c;
+if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
+/*
+* skip the modern name if it is not requested _and_
+* if the semicolon byte value is a character, not a token number
+*/
+if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
+int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; /* number of ';'-terminated fields to skip */
+do {
+while(nameLength>0) {
+--nameLength;
+if(*name++==';') {
+break;
+}
+}
+} while(--fieldIndex>0);
+} else {
+/*
+* the semicolon byte value is a token number, therefore
+* only modern names are stored in unames.dat and there is no
+* such requested alternate name here
+*/
+nameLength=0;
+}
+}
+/* write each letter directly, and write a token word per token */
+while(nameLength>0) {
+--nameLength;
+c=*name++;
+if(c>=tokenCount) {
+if(c!=';') {
+/* implicit letter */
+WRITE_CHAR(buffer, bufferLength, bufferPos, c);
+} else {
+/* finished */
+break;
+}
+} else {
+token=tokens[c];
+if(token==(uint16_t)(-2)) {
+/* this is a lead byte for a double-byte token */
+token=tokens[c<<8|*name++]; /* the trail byte is consumed without checking nameLength first */
+--nameLength;
+}
+if(token==(uint16_t)(-1)) {
+if(c!=';') {
+/* explicit letter */
+WRITE_CHAR(buffer, bufferLength, bufferPos, c);
+} else {
+/* stop, but skip the semicolon if we are seeking
+extended names and there was no 2.0 name but there
+is a 1.0 name. */
+if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
+if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
+continue;
+}
+}
+/* finished */
+break;
+}
+} else {
+/* write token word */
+uint8_t *tokenString=tokenStrings+token; /* token is the offset of the word in the token strings */
+while((c=*tokenString++)!=0) {
+WRITE_CHAR(buffer, bufferLength, bufferPos, c);
+}
+}
+}
+}
+/* zero-terminate */
+if(bufferLength>0) {
+*buffer=0;
+}
+return bufferPos;
+}
+/*
+* compareName() is almost the same as expandName() except that it compares
+* the currently expanded name to an input name.
+* It returns the match/no match result as soon as possible.
+*
+* @param names pointer to the names data header
+* @param name compressed bytes of the line; may hold several ';'-separated fields
+* @param nameLength number of bytes in name
+* @param nameChoice which name field is compared
+* @param otherName zero-terminated name to compare with
+* @return TRUE if the expanded name equals otherName completely
+*/
+static UBool
+compareName(UCharNames *names,
+const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
+const char *otherName) {
+uint16_t *tokens=(uint16_t *)names+8; /* skip the four uint32_t header offsets (8 uint16_t) */
+uint16_t token, tokenCount=*tokens++; /* the token table starts with its own count */
+uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
+uint8_t c;
+const char *origOtherName = otherName; /* kept to detect that nothing has been matched yet */
+if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
+/*
+* skip the modern name if it is not requested _and_
+* if the semicolon byte value is a character, not a token number
+*/
+if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
+int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; /* number of ';'-terminated fields to skip */
+do {
+while(nameLength>0) {
+--nameLength;
+if(*name++==';') {
+break;
+}
+}
+} while(--fieldIndex>0);
+} else {
+/*
+* the semicolon byte value is a token number, therefore
+* only modern names are stored in unames.dat and there is no
+* such requested alternate name here
+*/
+nameLength=0;
+}
+}
+/* compare each letter directly, and compare a token word per token */
+while(nameLength>0) {
+--nameLength;
+c=*name++;
+if(c>=tokenCount) {
+if(c!=';') {
+/* implicit letter */
+if((char)c!=*otherName++) {
+return FALSE;
+}
+} else {
+/* finished */
+break;
+}
+} else {
+token=tokens[c];
+if(token==(uint16_t)(-2)) {
+/* this is a lead byte for a double-byte token */
+token=tokens[c<<8|*name++]; /* the trail byte is consumed without checking nameLength first */
+--nameLength;
+}
+if(token==(uint16_t)(-1)) {
+if(c!=';') {
+/* explicit letter */
+if((char)c!=*otherName++) {
+return FALSE;
+}
+} else {
+/* stop, but skip the semicolon if we are seeking
+extended names and there was no 2.0 name but there
+is a 1.0 name. */
+if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
+if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
+continue;
+}
+}
+/* finished */
+break;
+}
+} else {
+/* compare token word */
+uint8_t *tokenString=tokenStrings+token; /* token is the offset of the word in the token strings */
+while((c=*tokenString++)!=0) {
+if((char)c!=*otherName++) {
+return FALSE;
+}
+}
+}
+}
+}
+/* complete match? */
+return (UBool)(*otherName==0);
+}
+/*
+* Returns the category used for extended names: the general category from
+* u_charType(), except that noncharacters and lead/trail surrogates are
+* mapped to the extended category values defined in this file.
+*/
+static uint8_t getCharCat(UChar32 cp) {
+    uint8_t cat;
+    if (U_IS_UNICODE_NONCHAR(cp)) {
+        return U_NONCHARACTER_CODE_POINT;
+    }
+    cat = u_charType(cp);
+    if (cat == U_SURROGATE) {
+        /* split the surrogate category into lead and trail */
+        if (U_IS_LEAD(cp)) {
+            cat = U_LEAD_SURROGATE;
+        } else {
+            cat = U_TRAIL_SURROGATE;
+        }
+    }
+    return cat;
+}
+/*
+* Returns the label of the extended category of cp, or "unknown" when the
+* category value lies outside of the charCatNames table (that is, when the
+* table is not up to date).
+*/
+static const char *getCharCatName(UChar32 cp) {
+    const uint8_t cat = getCharCat(cp);
+    return cat < LENGTHOF(charCatNames) ? charCatNames[cat] : "unknown";
+}
+/*
+* Writes the synthetic extended name "<category-HHHH>" for code.
+* HHHH is the code point in uppercase hexadecimal with at least 4 digits.
+*
+* The output is NOT zero-terminated here; callers terminate it themselves.
+* Like the other writers in this file, this function preflights: chars that
+* do not fit are dropped but still counted.
+*
+* @param code the code point
+* @param buffer output buffer
+* @param bufferLength capacity of buffer
+* @return the full length of the extended name
+*/
+static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
+    const char *catname = getCharCatName(code);
+    uint16_t length = 0;
+    uint32_t cp; /* unsigned so that the right shifts below always reach 0 */
+    int ndigits, i;
+    WRITE_CHAR(buffer, bufferLength, length, '<');
+    /* length is 1 after the '<', so catname[length - 1] walks the category name */
+    while (catname[length - 1]) {
+        WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
+    }
+    WRITE_CHAR(buffer, bufferLength, length, '-');
+    /* count the hex digits of code, but use at least 4 */
+    for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4) {
+    }
+    if (ndigits < 4) {
+        ndigits = 4;
+    }
+    /*
+     * The digits are produced from the least significant one, that is from
+     * right to left. Store only those whose position lies inside of the
+     * remaining capacity, so that a short buffer is never overrun and
+     * receives the leading digits.
+     */
+    for (cp = code, i = ndigits; i > 0; cp >>= 4) {
+        uint8_t v = (uint8_t)(cp & 0xf);
+        if (--i < (int)bufferLength) {
+            buffer[i] = (char)(v < 10 ? '0' + v : 'A' + v - 10);
+        }
+    }
+    /* advance past the digits that were actually stored */
+    if (ndigits <= (int)bufferLength) {
+        buffer += ndigits;
+        bufferLength = (uint16_t)(bufferLength - ndigits);
+    } else {
+        buffer += bufferLength;
+        bufferLength = 0;
+    }
+    length += ndigits;
+    WRITE_CHAR(buffer, bufferLength, length, '>');
+    return length;
+}
+/*
+* getGroup() looks up, by binary search over the groups table, the group
+* that would hold the name of the code point "code".
+* The result always points at a group of the table: either the group whose
+* GROUP_MSB equals code>>GROUP_SHIFT, or else the highest group before "code".
+* If even the lowest group is after "code", then that lowest group is returned.
+* Callers must compare GROUP_MSB themselves to detect an exact match.
+*/
+static const uint16_t *
+getGroup(UCharNames *names, uint32_t code) {
+    const uint16_t *table=GET_GROUPS(names);
+    const uint16_t wanted=(uint16_t)(code>>GROUP_SHIFT);
+    uint16_t lo=0;
+    uint16_t hi=*table++;   /* the group count precedes the first group */
+    /* narrow [lo, hi) down to a single candidate group */
+    while(lo+1<hi) {
+        const uint16_t mid=(uint16_t)((lo+hi)/2);
+        if(wanted<table[mid*GROUP_LENGTH+GROUP_MSB]) {
+            hi=mid;
+        } else {
+            lo=mid;
+        }
+    }
+    /* return this regardless of whether it is an exact match */
+    return table+lo*GROUP_LENGTH;
+}
+/*
+* expandGroupLengths() reads a block of compressed lengths of 32 strings and
+* expands them into offsets and lengths for each string.
+* Lengths are stored with a variable-width encoding in consecutive nibbles:
+* If a nibble<0xc, then it is the length itself (0=empty string).
+* If a nibble>=0xc, then it forms a length value with the following nibble.
+* Calculation see below.
+* The offsets and lengths arrays must be at least 33 (one more) long because
+* there is no check here at the end if the last nibble is still used.
+*
+* @param s pointer to the first length byte of the group
+* @param offsets output: offset of each string, relative to the returned pointer
+* @param lengths output: length of each string
+* @return pointer to the first string of the group, just after the length bytes
+*/
+static const uint8_t *
+expandGroupLengths(const uint8_t *s,
+uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
+/* read the lengths of the 32 strings in this group and get each string's offset */
+uint16_t i=0, offset=0, length=0; /* length>=12 between iterations marks a pending double-nibble lead */
+uint8_t lengthByte;
+/* all 32 lengths must be read to get the offset of the first group string */
+while(i<LINES_PER_GROUP) {
+lengthByte=*s++;
+/* read even nibble - MSBs of lengthByte */
+if(length>=12) {
+/* double-nibble length spread across two bytes */
+length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
+lengthByte&=0xf;
+} else if((lengthByte /* &0xf0 */)>=0xc0) {
+/* double-nibble length spread across this one byte */
+length=(uint16_t)((lengthByte&0x3f)+12);
+} else {
+/* single-nibble length in MSBs */
+length=(uint16_t)(lengthByte>>4);
+lengthByte&=0xf;
+}
+*offsets++=offset;
+*lengths++=length;
+offset+=length;
+++i;
+/* read odd nibble - LSBs of lengthByte */
+if((lengthByte&0xf0)==0) {
+/* this nibble was not consumed for a double-nibble length above */
+length=lengthByte;
+if(length<12) {
+/* single-nibble length in LSBs */
+*offsets++=offset;
+*lengths++=length;
+offset+=length;
+++i;
+} /* else: length stays >=12 and is completed by the next byte */
+} else {
+length=0;   /* prevent double-nibble detection in the next iteration */
+}
+}
+/* now, s is at the first group string */
+return s;
+}
+/*
+* Expands the name of line lineNumber (0..31) of the given group into buffer.
+* Decodes the length table of the group first and then lets expandName()
+* do the work; the return value is the one from expandName().
+*/
+static uint16_t
+expandGroupName(UCharNames *names, const uint16_t *group,
+                uint16_t lineNumber, UCharNameChoice nameChoice,
+                char *buffer, uint16_t bufferLength) {
+    uint16_t lineOffsets[LINES_PER_GROUP+2], lineLengths[LINES_PER_GROUP+2];
+    const uint8_t *groupStrings=(uint8_t *)names+names->groupStringOffset;
+    /* the strings of the group begin right after its compressed lengths */
+    const uint8_t *lines=expandGroupLengths(groupStrings+GET_GROUP_OFFSET(group),
+                                            lineOffsets, lineLengths);
+    return expandName(names,
+                      lines+lineOffsets[lineNumber], lineLengths[lineNumber],
+                      nameChoice, buffer, bufferLength);
+}
+/*
+* Gets the stored (non-algorithmic) name of code.
+* If no group exists for code, then buffer receives an empty string
+* (when it has room for the terminator) and 0 is returned.
+* Otherwise the result is the one from expandGroupName().
+*/
+static uint16_t
+getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
+        char *buffer, uint16_t bufferLength) {
+    const uint16_t *group=getGroup(names, code);
+    if((uint16_t)(code>>GROUP_SHIFT)!=group[GROUP_MSB]) {
+        /* group not found: zero-terminate an empty result */
+        if(bufferLength>0) {
+            *buffer=0;
+        }
+        return 0;
+    }
+    return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
+                           buffer, bufferLength);
+}
+/*
+* enumGroupNames() walks the code points start..end (inclusive) of one
+* 32-name group.
+* With a real callback, fn is called for every code point that has a name;
+* for U_EXTENDED_CHAR_NAME a synthetic name is used where none is stored.
+* With fn==DO_FIND_NAME, context is a FindName and the names are compared
+* with its otherName instead; a match is stored in its code field.
+*
+* @return FALSE if the callback returned FALSE or if the name was found,
+*         TRUE if the whole range was processed
+*/
+static UBool
+enumGroupNames(UCharNames *names, const uint16_t *group,
+               UChar32 start, UChar32 end,
+               UEnumCharNamesFn *fn, void *context,
+               UCharNameChoice nameChoice) {
+    uint16_t lineOffsets[LINES_PER_GROUP+2], lineLengths[LINES_PER_GROUP+2];
+    const uint8_t *lines=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
+    UChar32 c;
+    lines=expandGroupLengths(lines, lineOffsets, lineLengths);
+    if(fn==DO_FIND_NAME) {
+        /* search mode */
+        FindName *find=(FindName *)context;
+        const char *wanted=find->otherName;
+        for(c=start; c<=end; ++c) {
+            const int32_t line=(int32_t)(c&GROUP_MASK);
+            if(compareName(names, lines+lineOffsets[line], lineLengths[line], nameChoice, wanted)) {
+                find->code=c;
+                return FALSE;
+            }
+        }
+        return TRUE;
+    }
+    /* enumeration mode */
+    for(c=start; c<=end; ++c) {
+        char buffer[200];
+        const int32_t line=(int32_t)(c&GROUP_MASK);
+        uint16_t length=expandName(names, lines+lineOffsets[line], lineLengths[line],
+                                   nameChoice, buffer, sizeof(buffer));
+        if(length==0 && nameChoice==U_EXTENDED_CHAR_NAME) {
+            length=getExtName(c, buffer, sizeof(buffer));
+            buffer[length]=0;
+        }
+        /* here, we assume that the buffer is large enough */
+        if(length>0 && !fn(context, c, nameChoice, buffer, length)) {
+            return FALSE;
+        }
+    }
+    return TRUE;
+}
+/*
+* enumExtNames() passes the synthetic extended name of every code point in
+* start..end (inclusive) to fn.
+* Nothing is done for the dummy DO_FIND_NAME, because u_charFromName()
+* does a check for extended names by itself.
+*
+* @return FALSE as soon as fn returns FALSE, otherwise TRUE
+*/
+static UBool
+enumExtNames(UChar32 start, UChar32 end,
+             UEnumCharNamesFn *fn, void *context)
+{
+    UChar32 c;
+    if(fn==DO_FIND_NAME) {
+        return TRUE;
+    }
+    for(c=start; c<=end; ++c) {
+        char buffer[200];
+        /* here, we assume that the buffer is large enough */
+        uint16_t length=getExtName(c, buffer, sizeof(buffer));
+        buffer[length]=0;
+        if(length>0 && !fn(context, c, U_EXTENDED_CHAR_NAME, buffer, length)) {
+            return FALSE;
+        }
+    }
+    return TRUE;
+}
+/*
+* enumNames() enumerates (or, with fn==DO_FIND_NAME, searches) the stored
+* names of the code points start..limit-1; limit is exclusive.
+* It handles the partial first group, all whole groups in between and the
+* partial last group. For U_EXTENDED_CHAR_NAME the gaps before, between and
+* after the stored groups are filled via enumExtNames().
+*
+* @return FALSE as soon as enumGroupNames() or enumExtNames() returns FALSE
+*         (the callback asked to stop, or the name was found), otherwise TRUE
+*/ static UBool
+enumNames(UCharNames *names,
+UChar32 start, UChar32 limit,
+UEnumCharNamesFn *fn, void *context,
+UCharNameChoice nameChoice) {
+uint16_t startGroupMSB, endGroupMSB, groupCount;
+const uint16_t *group, *groupLimit;
+startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
+endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT); /* group number of the last code point, limit-1 */
+/* find the group that contains start, or the highest before it */
+group=getGroup(names, start);
+if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
+/* enumerate synthetic names between start and the group start */
+UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
+if(extLimit>limit) {
+extLimit=limit;
+}
+if(!enumExtNames(start, extLimit-1, fn, context)) {
+return FALSE;
+}
+start=extLimit;
+}
+if(startGroupMSB==endGroupMSB) {
+if(startGroupMSB==group[GROUP_MSB]) {
+/* if start and limit-1 are in the same group, then enumerate only in that one */
+return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
+}
+} else {
+const uint16_t *groups=GET_GROUPS(names);
+groupCount=*groups++;
+groupLimit=groups+groupCount*GROUP_LENGTH; /* one past the last group of the table */
+if(startGroupMSB==group[GROUP_MSB]) {
+/* enumerate characters in the partial start group */
+if((start&GROUP_MASK)!=0) {
+if(!enumGroupNames(names, group,
+start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
+fn, context, nameChoice)) {
+return FALSE;
+}
+group=NEXT_GROUP(group); /* continue with the next group */
+}
+} else if(startGroupMSB>group[GROUP_MSB]) {
+/* make sure that we start enumerating with the first group after start */
+const uint16_t *nextGroup=NEXT_GROUP(group);
+if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
+UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
+if (end > limit) {
+end = limit;
+}
+if (!enumExtNames(start, end - 1, fn, context)) {
+return FALSE;
+}
+}
+group=nextGroup;
+}
+/* enumerate entire groups between the start- and end-groups */
+while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
+const uint16_t *nextGroup;
+start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
+if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
+return FALSE;
+}
+nextGroup=NEXT_GROUP(group);
+if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) { /* gap between two stored groups */
+UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
+if (end > limit) {
+end = limit;
+}
+if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
+return FALSE;
+}
+}
+group=nextGroup;
+}
+/* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
+if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
+return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
+} else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
+UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT; /* first code point after the last stored group */
+if (next > start) {
+start = next;
+}
+} else {
+return TRUE;
+}
+}
+/* we have not found a group, which means everything is made of
+extended names. */
+if (nameChoice == U_EXTENDED_CHAR_NAME) {
+if (limit > UCHAR_MAX_VALUE + 1) {
+limit = UCHAR_MAX_VALUE + 1;
+}
+return enumExtNames(start, limit - 1, fn, context);
+}
+return TRUE;
+}
+/*
+* writeFactorSuffix() writes the factorized suffix of an algorithmic name.
+* code is split into one index per factor (mixed-radix digits, with the
+* last factor as the least significant one), and for each factor the
+* element string selected by its index is appended to buffer.
+* The element strings are zero-terminated and stored back to back:
+* factors[0] strings for the first factor, then factors[1] strings, etc.
+*
+* @param factors the count factors
+* @param count number of factors
+* @param s start of the element strings
+* @param code value to factorize (offset of the code point in its range)
+* @param indexes output: the index chosen for each factor
+* @param elementBases output, may be NULL: first element string of each factor
+* @param elements output, may be NULL: selected element string of each factor
+* @param buffer output buffer; zero-terminated only if capacity is left
+* @param bufferLength capacity of buffer
+* @return length of the suffix; chars that did not fit are counted too
+*/ static uint16_t
+writeFactorSuffix(const uint16_t *factors, uint16_t count,
+const char *s, /* suffix elements */
+uint32_t code,
+uint16_t indexes[8], /* output fields from here */
+const char *elementBases[8], const char *elements[8],
+char *buffer, uint16_t bufferLength) {
+uint16_t i, factor, bufferPos=0;
+char c;
+/* write elements according to the factors */
+/*
+* the factorized elements are determined by modulo arithmetic
+* with the factors of this algorithm
+*
+* note that for fewer operations, count is decremented here
+*/
+--count; /* from here on, count is the index of the last factor */
+for(i=count; i>0; --i) {
+factor=factors[i];
+indexes[i]=(uint16_t)(code%factor);
+code/=factor;
+}
+/*
+* we don't need to calculate the last modulus because start<=code<=end
+* guarantees here that code<factors[0]
+*/
+indexes[0]=(uint16_t)code;
+/* write each element */
+for(;;) { /* i is 0 when this loop is entered */
+if(elementBases!=NULL) {
+*elementBases++=s;
+}
+/* skip indexes[i] strings */
+factor=indexes[i];
+while(factor>0) {
+while(*s++!=0) {}
+--factor;
+}
+if(elements!=NULL) {
+*elements++=s;
+}
+/* write element */
+while((c=*s++)!=0) {
+WRITE_CHAR(buffer, bufferLength, bufferPos, c);
+}
+/* we do not need to perform the rest of this loop for i==count - break here */
+if(i>=count) {
+break;
+}
+/* skip the rest of the strings for this factors[i] */
+factor=(uint16_t)(factors[i]-indexes[i]-1);
+while(factor>0) {
+while(*s++!=0) {}
+--factor;
+}
+++i;
+}
+/* zero-terminate */
+if(bufferLength>0) {
+*buffer=0;
+}
+return bufferPos;
+}
+/*
+* getAlgName() writes the algorithmic name of code, which must lie in range.
+* Range type 0: the name is a prefix followed by range->variant hex digits.
+* Range type 1: the name is a prefix followed by factorized elements,
+* see writeFactorSuffix().
+* For any other name choice than the regular or extended name, and for
+* unknown range types, the result is the empty string.
+*
+* @param range the algorithmic range; its prefix/factor data follows the struct
+* @param code the code point
+* @param nameChoice which name is requested
+* @param buffer output buffer; zero-terminated only if capacity is left
+* @param bufferLength capacity of buffer
+* @return length of the name; chars that did not fit are counted too
+*
+* Important:
+* Parts of findAlgName() are almost the same as some of getAlgName().
+* Fixes must be applied to both.
+*/
+static uint16_t
+getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
+char *buffer, uint16_t bufferLength) {
+uint16_t bufferPos=0;
+/* Only the normative character name can be algorithmic. */
+if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
+/* zero-terminate */
+if(bufferLength>0) {
+*buffer=0;
+}
+return 0;
+}
+switch(range->type) {
+case 0: {
+/* name = prefix hex-digits */
+const char *s=(const char *)(range+1); /* the prefix string follows the range struct */
+char c;
+uint16_t i, count;
+/* copy prefix */
+while((c=*s++)!=0) {
+WRITE_CHAR(buffer, bufferLength, bufferPos, c);
+}
+/* write hexadecimal code point value */
+count=range->variant;
+/* zero-terminate */
+if(count<bufferLength) {
+buffer[count]=0;
+}
+for(i=count; i>0;) { /* digits are produced from right to left */
+if(--i<bufferLength) { /* store only digits that fit */
+c=(char)(code&0xf);
+if(c<10) {
+c+='0';
+} else {
+c+='A'-10;
+}
+buffer[i]=c;
+}
+code>>=4;
+}
+bufferPos+=count;
+break;
+}
+case 1: {
+/* name = prefix factorized-elements */
+uint16_t indexes[8];
+const uint16_t *factors=(const uint16_t *)(range+1); /* the factors follow the range struct */
+uint16_t count=range->variant;
+const char *s=(const char *)(factors+count); /* the prefix follows the factors */
+char c;
+/* copy prefix */
+while((c=*s++)!=0) {
+WRITE_CHAR(buffer, bufferLength, bufferPos, c);
+}
+bufferPos+=writeFactorSuffix(factors, count,
+s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
+break;
+}
+default:
+/* undefined type */
+/* zero-terminate */
+if(bufferLength>0) {
+*buffer=0;
+}
+break;
+}
+return bufferPos;
+}
+/*
+* enumAlgNames() calls fn with the algorithmic name of each code point in
+* start..limit-1; limit is exclusive.
+* The first name is built completely; each following name is derived from
+* the previous one by incrementing it in place (the hex digits for type 0,
+* the element indexes for type 1).
+* Nothing is enumerated for other name choices than the regular or extended
+* name, nor for unknown range types.
+*
+* @return FALSE as soon as fn returns FALSE, otherwise TRUE
+*
+* Important: enumAlgNames() and findAlgName() are almost the same.
+* Any fix must be applied to both.
+*/
+static UBool
+enumAlgNames(AlgorithmicRange *range,
+UChar32 start, UChar32 limit,
+UEnumCharNamesFn *fn, void *context,
+UCharNameChoice nameChoice) {
+char buffer[200]; /* the code below assumes that every name fits */
+uint16_t length;
+if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
+return TRUE;
+}
+switch(range->type) {
+case 0: {
+char *s, *end;
+char c;
+/* get the full name of the start character */
+length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
+if(length<=0) {
+return TRUE;
+}
+/* call the enumerator function with this first character */
+if(!fn(context, start, nameChoice, buffer, length)) {
+return FALSE;
+}
+/* go to the end of the name; all these names have the same length */
+end=buffer;
+while(*end!=0) {
+++end;
+}
+/* enumerate the rest of the names */
+while(++start<limit) {
+/* increment the hexadecimal number on a character-basis */
+s=end;
+for (;;) {
+c=*--s;
+if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
+*s=(char)(c+1);
+break;
+} else if(c=='9') {
+*s='A';
+break;
+} else if(c=='F') {
+*s='0'; /* carry: continue with the next digit to the left */
+}
+}
+if(!fn(context, start, nameChoice, buffer, length)) {
+return FALSE;
+}
+}
+break;
+}
+case 1: {
+uint16_t indexes[8];
+const char *elementBases[8], *elements[8];
+const uint16_t *factors=(const uint16_t *)(range+1); /* the factors follow the range struct */
+uint16_t count=range->variant;
+const char *s=(const char *)(factors+count); /* the prefix follows the factors */
+char *suffix, *t;
+uint16_t prefixLength, i, idx;
+char c;
+/* name = prefix factorized-elements */
+/* copy prefix */
+suffix=buffer;
+prefixLength=0;
+while((c=*s++)!=0) {
+*suffix++=c;
+++prefixLength;
+}
+/* append the suffix of the start character */
+length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
+s, (uint32_t)start-range->start,
+indexes, elementBases, elements,
+suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
+/* call the enumerator function with this first character */
+if(!fn(context, start, nameChoice, buffer, length)) {
+return FALSE;
+}
+/* enumerate the rest of the names */
+while(++start<limit) {
+/* increment the indexes in lexical order bound by the factors */
+i=count;
+for (;;) {
+idx=(uint16_t)(indexes[--i]+1);
+if(idx<factors[i]) {
+/* skip one index and its element string */
+indexes[i]=idx;
+s=elements[i];
+while(*s++!=0) {
+}
+elements[i]=s;
+break;
+} else {
+/* reset this index to 0 and its element string to the first one */
+indexes[i]=0;
+elements[i]=elementBases[i];
+}
+}
+/* to make matters a little easier, just append all elements to the suffix */
+t=suffix;
+length=prefixLength;
+for(i=0; i<count; ++i) {
+s=elements[i];
+while((c=*s++)!=0) {
+*t++=c;
+++length;
+}
+}
+/* zero-terminate */
+*t=0;
+if(!fn(context, start, nameChoice, buffer, length)) {
+return FALSE;
+}
+}
+break;
+}
+default:
+/* undefined type */
+break;
+}
+return TRUE;
+}
+/*
+* findAlgName() is almost the same as enumAlgNames() except that it
+* returns the code point for a name if it fits into the range.
+* It returns 0xffff otherwise.
+*
+* Range type 0: after the prefix, exactly range->variant uppercase hex digits
+* are parsed and the value is checked against the range.
+* Range type 1: after the prefix, the suffixes of all code points of the
+* range are generated one after the other and compared with the input.
+*
+* @param range the algorithmic range; its prefix/factor data follows the struct
+* @param nameChoice only the regular and the extended name can match
+* @param otherName zero-terminated name to look for
+* @return the code point with that name, or 0xffff if there is none in range
+*/
+static UChar32
+findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
+UChar32 code;
+if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
+return 0xffff;
+}
+switch(range->type) {
+case 0: {
+/* name = prefix hex-digits */
+const char *s=(const char *)(range+1); /* the prefix string follows the range struct */
+char c;
+uint16_t i, count;
+/* compare prefix */
+while((c=*s++)!=0) {
+if((char)c!=*otherName++) {
+return 0xffff;
+}
+}
+/* read hexadecimal code point value */
+count=range->variant;
+code=0;
+for(i=0; i<count; ++i) {
+c=*otherName++;
+if('0'<=c && c<='9') {
+code=(code<<4)|(c-'0');
+} else if('A'<=c && c<='F') {
+code=(code<<4)|(c-'A'+10);
+} else {
+return 0xffff; /* also reached when otherName ends early */
+}
+}
+/* does it fit into the range? */
+if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
+return code;
+}
+break;
+}
+case 1: {
+char buffer[64];
+uint16_t indexes[8];
+const char *elementBases[8], *elements[8];
+const uint16_t *factors=(const uint16_t *)(range+1); /* the factors follow the range struct */
+uint16_t count=range->variant;
+const char *s=(const char *)(factors+count), *t; /* the prefix follows the factors */
+UChar32 start, limit;
+uint16_t i, idx;
+char c;
+/* name = prefix factorized-elements */
+/* compare prefix */
+while((c=*s++)!=0) {
+if((char)c!=*otherName++) {
+return 0xffff;
+}
+}
+start=(UChar32)range->start;
+limit=(UChar32)(range->end+1);
+/* initialize the suffix elements for enumeration; indexes should all be set to 0 */
+writeFactorSuffix(factors, count, s, 0,
+indexes, elementBases, elements, buffer, sizeof(buffer));
+/* compare the first suffix */
+if(0==uprv_strcmp(otherName, buffer)) {
+return start;
+}
+/* enumerate and compare the rest of the suffixes */
+while(++start<limit) {
+/* increment the indexes in lexical order bound by the factors */
+i=count;
+for (;;) {
+idx=(uint16_t)(indexes[--i]+1);
+if(idx<factors[i]) {
+/* skip one index and its element string */
+indexes[i]=idx;
+s=elements[i];
+while(*s++!=0) {}
+elements[i]=s;
+break;
+} else {
+/* reset this index to 0 and its element string to the first one */
+indexes[i]=0;
+elements[i]=elementBases[i];
+}
+}
+/* to make matters a little easier, just compare all elements of the suffix */
+t=otherName;
+for(i=0; i<count; ++i) {
+s=elements[i];
+while((c=*s++)!=0) {
+if(c!=*t++) {
+s=""; /* does not match */
+i=99; /* ends both loops; i stays >=99 as the mismatch marker */
+}
+}
+}
+if(i<99 && *t==0) {
+return start;
+}
+}
+break;
+}
+default:
+/* undefined type */
+break;
+}
+return 0xffff;
+}
+/* sets of name characters, maximum name lengths ---------------------------- */
+/*
+* 256-bit set of byte values, stored as 8 words of 32 bits:
+* the byte value's bits 7..5 select the word, its bits 4..0 the bit in it.
+* c is converted to uint8_t first; it is parenthesized so that any
+* expression can be passed, and it is evaluated twice.
+*/
+#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
+#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
+/*
+* Adds every char of the zero-terminated string s to set.
+* @return the length of s
+*/
+static int32_t
+calcStringSetLength(uint32_t set[8], const char *s) {
+    const char *p;
+    for(p=s; *p!=0; ++p) {
+        const char c=*p;
+        SET_ADD(set, c);
+    }
+    return (int32_t)(p-s);
+}
+/*
+* calcAlgNameSetsLengths() walks all algorithmic ranges of the loaded data,
+* adds the chars of their prefixes and elements to gNameSet and computes the
+* longest name that each range can produce.
+*
+* @param maxNameLength the maximum name length found so far
+* @return the larger of maxNameLength and the longest algorithmic name
+*/ static int32_t
+calcAlgNameSetsLengths(int32_t maxNameLength) {
+AlgorithmicRange *range;
+uint32_t *p;
+uint32_t rangeCount;
+int32_t length;
+/* enumerate algorithmic ranges */
+p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
+rangeCount=*p; /* the range count precedes the first range */
+range=(AlgorithmicRange *)(p+1);
+while(rangeCount>0) {
+switch(range->type) {
+case 0:
+/* name = prefix + (range->variant times) hex-digits */
+/* prefix */
+length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
+if(length>maxNameLength) {
+maxNameLength=length;
+}
+break;
+case 1: {
+/* name = prefix factorized-elements */
+const uint16_t *factors=(const uint16_t *)(range+1);
+const char *s;
+int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
+/* prefix length */
+s=(const char *)(factors+count);
+length=calcStringSetLength(gNameSet, s);
+s+=length+1; /* start of factor suffixes */
+/* get the set and maximum factor suffix length for each factor */
+for(i=0; i<count; ++i) {
+maxFactorLength=0;
+for(factor=factors[i]; factor>0; --factor) {
+factorLength=calcStringSetLength(gNameSet, s);
+s+=factorLength+1;
+if(factorLength>maxFactorLength) {
+maxFactorLength=factorLength;
+}
+}
+length+=maxFactorLength; /* the longest name uses the longest element of every factor */
+}
+if(length>maxNameLength) {
+maxNameLength=length;
+}
+break;
+}
+default:
+/* unknown type */
+break;
+}
+range=(AlgorithmicRange *)((uint8_t *)range+range->size); /* size is the byte distance to the next range */
+--rangeCount;
+}
+return maxNameLength;
+}
+/*
+* Adds the chars of all extended category names to gNameSet and accounts for
+* the longest synthetic extended name.
+*
+* @param maxNameLength the maximum name length found so far
+* @return the larger of maxNameLength and the longest extended name
+*/
+static int32_t
+calcExtNameSetsLengths(int32_t maxNameLength) {
+    int32_t longest=maxNameLength;
+    int32_t cat;
+    for(cat=0; cat<LENGTHOF(charCatNames); ++cat) {
+        /*
+         * for each category, count the length of the category name
+         * plus 9=
+         * 2 for <>
+         * 1 for -
+         * 6 for most hex digits per code point
+         */
+        const int32_t nameLength=calcStringSetLength(gNameSet, charCatNames[cat])+9;
+        if(nameLength>longest) {
+            longest=nameLength;
+        }
+    }
+    return longest;
+}
+/*
+* calcNameSetLength() measures one field of a compressed name line without
+* expanding it into a buffer, and adds the chars of the field to set.
+* The field ends at the next ';' or at lineLimit.
+*
+* @param tokens the token table (after its count)
+* @param tokenCount number of tokens
+* @param tokenStrings start of the token strings
+* @param tokenLengths cache of token word lengths, or NULL for no caching;
+*        a cached value of 0 means "not computed yet"
+* @param set the set that receives the chars of the name
+* @param pLine in/out: start of the field; afterwards the position behind
+*        the field, past its ';' if there was one
+* @param lineLimit end of the line
+* @return the length of the expanded field
+*/ static int32_t
+calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
+uint32_t set[8],
+const uint8_t **pLine, const uint8_t *lineLimit) {
+const uint8_t *line=*pLine;
+int32_t length=0, tokenLength;
+uint16_t c, token;
+while(line!=lineLimit && (c=*line++)!=(uint8_t)';') {
+if(c>=tokenCount) {
+/* implicit letter */
+SET_ADD(set, c);
+++length;
+} else {
+token=tokens[c];
+if(token==(uint16_t)(-2)) {
+/* this is a lead byte for a double-byte token */
+c=c<<8|*line++; /* the trail byte is read without checking lineLimit */
+token=tokens[c];
+}
+if(token==(uint16_t)(-1)) {
+/* explicit letter */
+SET_ADD(set, c);
+++length;
+} else {
+/* count token word */
+if(tokenLengths!=NULL) {
+/* use cached token length */
+tokenLength=tokenLengths[c];
+if(tokenLength==0) {
+tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
+tokenLengths[c]=(int8_t)tokenLength;
+}
+} else {
+tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
+}
+length+=tokenLength;
+}
+}
+}
+*pLine=line;
+return length;
+}
+/*
+* Walk every line of every name group, feed the regular name and the
+* Unicode 1.0 name fields through calcNameSetLength() with gNameSet,
+* and finally store the overall maximum in gMaxNameLength.
+*
+* @param maxNameLength the longest name length found so far
+*/
+static void
+calcGroupNameSetsLengths(int32_t maxNameLength) {
+    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
+    uint16_t *tokens=(uint16_t *)uCharNames+8;
+    uint16_t tokenCount=*tokens++;
+    uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
+    /* the token length cache is optional: a failed allocation only disables it */
+    int8_t *tokenLengths=(int8_t *)uprv_malloc(tokenCount);
+    const uint16_t *group;
+    int32_t groupsLeft;
+    if(tokenLengths!=NULL) {
+        uprv_memset(tokenLengths, 0, tokenCount);
+    }
+    group=GET_GROUPS(uCharNames);
+    /* enumerate all groups */
+    for(groupsLeft=*group++; groupsLeft>0; --groupsLeft, group=NEXT_GROUP(group)) {
+        const uint8_t *strings=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
+        int32_t lineNumber;
+        strings=expandGroupLengths(strings, offsets, lengths);
+        /* enumerate all lines in each group */
+        for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
+            const uint8_t *line=strings+offsets[lineNumber];
+            const uint8_t *lineLimit=line+lengths[lineNumber];
+            int32_t field;
+            if(lengths[lineNumber]==0) {
+                continue;
+            }
+            /*
+             * field 0 is the regular name, field 1 the Unicode 1.0 name;
+             * a following ISO comment field is not read
+             */
+            for(field=0; field<2; ++field) {
+                const int32_t fieldLength=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
+                if(fieldLength>maxNameLength) {
+                    maxNameLength=fieldLength;
+                }
+                if(line==lineLimit) {
+                    break;
+                }
+            }
+        }
+    }
+    if(tokenLengths!=NULL) {
+        uprv_free(tokenLengths);
+    }
+    /* set gMax... - name length last for threading */
+    gMaxNameLength=maxNameLength;
+}
+/*
+* Make sure that gNameSet and gMaxNameLength have been computed.
+* A non-zero gMaxNameLength is taken to mean that the work was already done.
+*
+* @param pErrorCode passed to isDataLoaded()
+* @return TRUE if the values are available, FALSE if the data is not loaded
+*/
+static UBool
+calcNameSetsLengths(UErrorCode *pErrorCode) {
+    static const char extChars[]="0123456789ABCDEF<>-";
+    const char *next;
+    int32_t longest;
+    if(gMaxNameLength!=0) {
+        return TRUE;
+    }
+    if(!isDataLoaded(pErrorCode)) {
+        return FALSE;
+    }
+    /* hex digits are used in various names, and <>- in extended names */
+    for(next=extChars; *next!=0; ++next) {
+        const char extChar=*next;
+        SET_ADD(gNameSet, extChar);
+    }
+    /* algorithmic names first, then extended names */
+    longest=calcAlgNameSetsLengths(0);
+    longest=calcExtNameSetsLengths(longest);
+    /* the group names come last; this call also sets the global maximum */
+    calcGroupNameSetsLengths(longest);
+    return TRUE;
+}
+/* public API --------------------------------------------------------------- */
+/*
+* Write the name of a code point into a caller-supplied char buffer.
+*
+* @param code         the code point; values above UCHAR_MAX_VALUE yield an empty name
+* @param nameChoice   which kind of name to get; must be below U_CHAR_NAME_CHOICE_COUNT
+* @param buffer       destination; may be NULL only if bufferLength is 0
+* @param bufferLength capacity of buffer, must not be negative
+* @param pErrorCode   in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for
+*                     bad arguments, otherwise updated by u_terminateChars()
+* @return the name length as passed through u_terminateChars(),
+*         or 0 if the arguments are rejected
+*/
+U_CAPI int32_t U_EXPORT2
+u_charName(UChar32 code, UCharNameChoice nameChoice,
+char *buffer, int32_t bufferLength,
+UErrorCode *pErrorCode) {
+    AlgorithmicRange *algRange;
+    uint32_t *p;
+    uint32_t i;
+    int32_t length;
+    uint16_t capacity;
+    /* check the argument values */
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0;
+    } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
+        bufferLength<0 || (bufferLength>0 && buffer==NULL)
+    ) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
+        return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
+    }
+    /*
+     * The name writers take a 16-bit capacity.
+     * Clamp instead of truncating: a plain (uint16_t) cast would turn a
+     * capacity of 65536 into 0, so the name would not be written although
+     * its full length would be returned without an error.
+     */
+    capacity=(uint16_t)(bufferLength>0xffff ? 0xffff : bufferLength);
+    length=0;
+    /* try algorithmic names first */
+    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
+    i=*p;
+    algRange=(AlgorithmicRange *)(p+1);
+    while(i>0) {
+        if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
+            length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, capacity);
+            break;
+        }
+        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
+        --i;
+    }
+    /* i==0 means that no algorithmic range contains the code point */
+    if(i==0) {
+        if (nameChoice == U_EXTENDED_CHAR_NAME) {
+            length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, capacity);
+            if (!length) {
+                /* extended character name */
+                length = getExtName((uint32_t) code, buffer, capacity);
+            }
+        } else {
+            /* normal character name */
+            length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, capacity);
+        }
+    }
+    return u_terminateChars(buffer, bufferLength, length, pErrorCode);
+}
+/*
+* Always produces an empty string: the code point argument is ignored and
+* the destination is only NUL-terminated via u_terminateChars().
+*
+* @param dest         destination; may be NULL only if destCapacity is 0
+* @param destCapacity capacity of dest, must not be negative
+* @param pErrorCode   in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for
+*                     bad arguments
+* @return the result of u_terminateChars() for length 0,
+*         or 0 if the arguments are rejected
+*/
+U_CAPI int32_t U_EXPORT2
+u_getISOComment(UChar32 /*c*/,
+char *dest, int32_t destCapacity,
+UErrorCode *pErrorCode) {
+    if(pErrorCode!=NULL && !U_FAILURE(*pErrorCode)) {
+        /* the destination is usable if it is empty or actually present */
+        if(destCapacity>=0 && (destCapacity==0 || dest!=NULL)) {
+            return u_terminateChars(dest, destCapacity, 0, pErrorCode);
+        }
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+    }
+    return 0;
+}
+/*
+* Find the code point that has the given name.
+*
+* The name is copied into an uppercase and a lowercase buffer of 120 chars
+* each; a name that does not fit is rejected.
+* Names starting with '<' are only accepted for U_EXTENDED_CHAR_NAME and
+* must have the form "<category-HHHH>" with 1 to 8 hex digits that denote
+* a code point of that category.
+* All other names are tried against the algorithmic ranges and then against
+* the stored names.
+*
+* @param nameChoice which kind of name to look up; must be below U_CHAR_NAME_CHOICE_COUNT
+* @param name       NUL-terminated name, must not be NULL or empty
+* @param pErrorCode in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for
+*                   bad arguments and to U_ILLEGAL_CHAR_FOUND if no code
+*                   point has this name
+* @return the code point, or 0xffff on failure
+*/
+U_CAPI UChar32 U_EXPORT2
+u_charFromName(UCharNameChoice nameChoice,
+const char *name,
+UErrorCode *pErrorCode) {
+    char upper[120], lower[120];
+    FindName findName;
+    AlgorithmicRange *algRange;
+    uint32_t *p;
+    uint32_t i;
+    UChar32 cp = 0;
+    char c0;
+    UChar32 error = 0xffff;     /* Undefined, but use this for backwards compatibility. */
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return error;
+    }
+    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return error;
+    }
+    if(!isDataLoaded(pErrorCode)) {
+        return error;
+    }
+    /* construct the uppercase and lowercase of the name first */
+    for(i=0; i<sizeof(upper); ++i) {
+        if((c0=*name++)!=0) {
+            upper[i]=uprv_toupper(c0);
+            lower[i]=uprv_tolower(c0);
+        } else {
+            upper[i]=lower[i]=0;
+            break;
+        }
+    }
+    if(i==sizeof(upper)) {
+        /* name too long, there is no such character */
+        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
+        return error;
+    }
+    /* here: i is the length of the name, and i>0 */
+    /* try extended names first */
+    if (lower[0] == '<') {
+        if (nameChoice == U_EXTENDED_CHAR_NAME && lower[--i] == '>') {
+            /* Parse a string like "<category-HHHH>" where HHHH is a hex code point. */
+            uint32_t limit = i; /* index of the closing '>' */
+            int32_t hexLength;
+            /*
+             * Search backwards for the '-' but stop at index 2:
+             * index 0 is '<' and the category needs at least one char.
+             * Without the bound, a name like "<abc>" would decrement the
+             * unsigned index past 0 and read outside of lower[].
+             */
+            while (i >= 3 && lower[--i] != '-') {
+            }
+            /* There should be 1 to 8 hex digits. */
+            hexLength = (int32_t)(limit - (i + 1));
+            if (i >= 2 && lower[i] == '-' && 1 <= hexLength && hexLength <= 8) {
+                uint32_t cIdx;
+                /* terminate the category name at the '-' */
+                lower[i] = 0;
+                for (++i; i < limit; ++i) {
+                    if (lower[i] >= '0' && lower[i] <= '9') {
+                        cp = (cp << 4) + lower[i] - '0';
+                    } else if (lower[i] >= 'a' && lower[i] <= 'f') {
+                        cp = (cp << 4) + lower[i] - 'a' + 10;
+                    } else {
+                        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
+                        return error;
+                    }
+                    /* Prevent signed-integer overflow and out-of-range code points. */
+                    if (cp > UCHAR_MAX_VALUE) {
+                        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
+                        return error;
+                    }
+                }
+                /* Now validate the category name.
+                   We could use a binary search, or a trie, if
+                   we really wanted to. */
+                for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
+                    if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
+                        if (getCharCat(cp) == cIdx) {
+                            return cp;
+                        }
+                        break;
+                    }
+                }
+            }
+        }
+        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
+        return error;
+    }
+    /* try algorithmic names now */
+    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
+    i=*p;
+    algRange=(AlgorithmicRange *)(p+1);
+    while(i>0) {
+        if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
+            return cp;
+        }
+        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
+        --i;
+    }
+    /* normal character name */
+    findName.otherName=upper;
+    findName.code=error;
+    enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
+    if (findName.code == error) {
+        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
+    }
+    return findName.code;
+}
+U_CAPI void U_EXPORT2
+u_enumCharNames(UChar32 start, UChar32 limit,
+UEnumCharNamesFn *fn,
+void *context,
+UCharNameChoice nameChoice,
+UErrorCode *pErrorCode) { /* hands fn and context to enumNames()/enumAlgNames() for sub-ranges of [start, limit) */
+AlgorithmicRange *algRange;
+uint32_t *p;
+uint32_t i;
+if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+return; /* nothing is enumerated if the caller already has an error */
+}
+if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
+*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+return;
+}
+if((uint32_t) limit > UCHAR_MAX_VALUE + 1) { /* the unsigned compare also catches a negative limit */
+limit = UCHAR_MAX_VALUE + 1;
+}
+if((uint32_t)start>=(uint32_t)limit) {
+return; /* empty range, not an error */
+}
+if(!isDataLoaded(pErrorCode)) {
+return;
+}
+/*
+* interleave the data-driven ones with the algorithmic ones:
+* for each algorithmic range, first enumerate the stored names below the
+* range with enumNames(), then the names inside the range with enumAlgNames();
+* start is moved forward after each piece
+*/
+/* iterate over all algorithmic ranges; assume that they are in ascending order */
+p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
+i=*p; /* number of algorithmic ranges */
+algRange=(AlgorithmicRange *)(p+1);
+while(i>0) {
+/* enumerate the character names before the current algorithmic range */
+/* here: start<limit */
+if((uint32_t)start<algRange->start) {
+if((uint32_t)limit<=algRange->start) {
+/* the rest of the requested range ends before this algorithmic range */
+enumNames(uCharNames, start, limit, fn, context, nameChoice);
+return;
+}
+if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
+return; /* a FALSE result from enumNames() ends the whole enumeration */
+}
+start=(UChar32)algRange->start;
+}
+/* enumerate the character names in the current algorithmic range */
+/* here: algRange->start<=start<limit */
+if((uint32_t)start<=algRange->end) {
+if((uint32_t)limit<=(algRange->end+1)) {
+/* the rest of the requested range ends inside this algorithmic range */
+enumAlgNames(algRange, start, limit, fn, context, nameChoice);
+return;
+}
+if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
+return; /* a FALSE result from enumAlgNames() ends the whole enumeration */
+}
+start=(UChar32)algRange->end+1;
+}
+/* continue to the next algorithmic range (here: start<limit) */
+algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); /* ranges have variable size in bytes */
+--i;
+}
+/* enumerate the character names after the last algorithmic range */
+enumNames(uCharNames, start, limit, fn, context, nameChoice);
+}
+/*
+* Get the maximum name length that calcNameSetsLengths() stores in
+* gMaxNameLength.
+*
+* @return gMaxNameLength, or 0 if calcNameSetsLengths() fails
+*/
+U_CAPI int32_t U_EXPORT2
+uprv_getMaxCharNameLength() {
+    UErrorCode status=U_ZERO_ERROR;
+    if(!calcNameSetsLengths(&status)) {
+        return 0;
+    }
+    return gMaxNameLength;
+}
+/**
+* Adds the chars that are flagged in the char set cset to a set,
+* going through a USetAdder.
+* Does nothing if calcNameSetsLengths() fails.
+* @param cset Set of 256 bit flags corresponding to a set of chars.
+* @param sa USetAdder; sa->add(sa->set, ...) is called for each converted char.
+*/
+static void
+charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
+    UErrorCode status=U_ZERO_ERROR;
+    char used[256];
+    UChar converted[256];
+    int32_t usedCount=0, byteValue, k;
+    if(!calcNameSetsLengths(&status)) {
+        return;
+    }
+    /* collect all chars whose flag is set in cset */
+    for(byteValue=0; byteValue<256; ++byteValue) {
+        if(SET_CONTAINS(cset, byteValue)) {
+            used[usedCount]=(char)byteValue;
+            ++usedCount;
+        }
+    }
+    /* convert the chars to UChars in one call */
+    u_charsToUChars(used, converted, usedCount);
+    for(k=0; k<usedCount; ++k) {
+        /* non-invariant chars become (UChar)0: skip them, but keep a real NUL */
+        if(converted[k]==0 && used[k]!=0) {
+            continue;
+        }
+        sa->add(sa->set, converted[k]);
+    }
+}
+/**
+* Fills a set with characters that are used in Unicode character names.
+* Forwards the file-level gNameSet bit set to charSetToUSet().
+* @param sa USetAdder through which the characters are added.
+*/
+U_CAPI void U_EXPORT2
+uprv_getCharNameCharacters(const USetAdder *sa) {
+charSetToUSet(gNameSet, sa);
+}
+/* data swapping ------------------------------------------------------------ */
+/*
+* The token table contains non-negative entries for token bytes,
+* and -1 for bytes that represent themselves in the data file's charset.
+* -2 entries are used for lead bytes.
+*
+* Direct bytes (-1 entries) must be translated from the input charset family
+* to the output charset family.
+* makeTokenMap() writes a permutation mapping for this.
+* Use it once for single-/lead-byte tokens and once more for all trail byte
+* tokens. (';' is an unused trail byte marked with -1.)
+*
+* @param ds         swapper; its inCharset/outCharset and swapInvChars() are used
+* @param tokens     token table entries as described above
+* @param tokenCount number of entries in tokens[]; only the first 256 are looked at
+* @param map        output: map[input byte]=output byte
+* @param pErrorCode in/out error code; nothing is done if it already indicates
+*                   a failure; a failure from swapInvChars() is left in it
+*/
+static void
+makeTokenMap(const UDataSwapper *ds,
+int16_t tokens[], uint16_t tokenCount,
+uint8_t map[256],
+UErrorCode *pErrorCode) {
+UBool usedOutChar[256]; /* usedOutChar[b] is set once b is the image of a direct byte */
+uint16_t i, j;
+uint8_t c1, c2;
+if(U_FAILURE(*pErrorCode)) {
+return;
+}
+if(ds->inCharset==ds->outCharset) {
+/* Same charset family: identity permutation */
+for(i=0; i<256; ++i) {
+map[i]=(uint8_t)i;
+}
+} else {
+uprv_memset(map, 0, 256); /* 0 doubles as the "not set yet" marker in the second loop below */
+uprv_memset(usedOutChar, 0, 256);
+if(tokenCount>256) {
+tokenCount=256;
+}
+/* set the direct bytes (byte 0 always maps to itself) */
+for(i=1; i<tokenCount; ++i) {
+if(tokens[i]==-1) {
+/* convert the direct byte character */
+c1=(uint8_t)i;
+ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
+if(U_FAILURE(*pErrorCode)) {
+udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
+i, ds->inCharset);
+return; /* map[] is left incomplete; the caller has to check pErrorCode */
+}
+/* enter the converted character into the map and mark it used */
+map[c1]=c2;
+usedOutChar[c2]=TRUE;
+}
+}
+/* set the mappings for the rest of the permutation */
+for(i=j=1; i<tokenCount; ++i) {
+/* set mappings that were not set for direct bytes */
+if(map[i]==0) {
+/* set an output byte value that was not used as an output byte above */
+while(usedOutChar[j]) {
+++j;
+}
+map[i]=(uint8_t)j++; /* j only moves forward, so each output byte is handed out at most once here */
+}
+}
+/*
+* leave mappings at tokenCount and above unset if tokenCount<256
+* because they won't be used
+*/
+}
+}
+U_CAPI int32_t U_EXPORT2
+uchar_swapNames(const UDataSwapper *ds,
+const void *inData, int32_t length, void *outData,
+UErrorCode *pErrorCode) { /* swaps unames.icu data from inData to outData; a negative length only preflights the size */
+const UDataInfo *pInfo;
+int32_t headerSize;
+const uint8_t *inBytes;
+uint8_t *outBytes;
+uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
+offset, i, count, stringsCount;
+const AlgorithmicRange *inRange;
+AlgorithmicRange *outRange;
+/*
+* udata_swapDataHeader checks the arguments
+* every error path below returns 0; success returns the header size plus
+* the offset just behind the last algorithmic range
+*/
+headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+return 0;
+}
+/* check data format and format version */
+pInfo=(const UDataInfo *)((const char *)inData+4);
+if(!(
+pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
+pInfo->dataFormat[1]==0x6e &&
+pInfo->dataFormat[2]==0x61 &&
+pInfo->dataFormat[3]==0x6d &&
+pInfo->formatVersion[0]==1
+)) {
+udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
+pInfo->dataFormat[0], pInfo->dataFormat[1],
+pInfo->dataFormat[2], pInfo->dataFormat[3],
+pInfo->formatVersion[0]);
+*pErrorCode=U_UNSUPPORTED_ERROR;
+return 0;
+}
+inBytes=(const uint8_t *)inData+headerSize;
+outBytes=(uint8_t *)outData+headerSize;
+if(length<0) {
+algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]); /* the 4th of the initial 4 offsets */
+} else {
+length-=headerSize; /* from here on, length counts the bytes after the header */
+if( length<20 ||
+(uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
+) {
+udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
+length);
+*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+return 0;
+}
+}
+if(length<0) {
+/* preflighting: iterate through algorithmic ranges */
+offset=algNamesOffset;
+count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
+offset+=4;
+for(i=0; i<count; ++i) {
+inRange=(const AlgorithmicRange *)(inBytes+offset);
+offset+=ds->readUInt16(inRange->size); /* each range carries its own byte size */
+}
+} else {
+/* swap data */
+const uint16_t *p;
+uint16_t *q, *temp;
+int16_t tokens[512];
+uint16_t tokenCount;
+uint8_t map[256], trailMap[256]; /* byte permutations for single-/lead bytes and for trail bytes */
+/* copy the data for inaccessible bytes */
+if(inBytes!=outBytes) {
+uprv_memcpy(outBytes, inBytes, length);
+}
+/* the initial 4 offsets first */
+tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
+groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
+groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
+ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
+/*
+* now the tokens table
+* it needs to be permutated along with the compressed name strings
+*/
+p=(const uint16_t *)(inBytes+16);
+q=(uint16_t *)(outBytes+16);
+/* read and swap the tokenCount */
+tokenCount=ds->readUInt16(*p);
+ds->swapArray16(ds, p, 2, q, pErrorCode);
+++p;
+++q;
+/* read the first 512 tokens and make the token maps */
+if(tokenCount<=512) {
+count=tokenCount;
+} else {
+count=512;
+}
+for(i=0; i<count; ++i) {
+tokens[i]=udata_readInt16(ds, p[i]);
+}
+for(; i<512; ++i) {
+tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
+}
+makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
+makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode); /* trail-byte map from tokens[256..511] */
+if(U_FAILURE(*pErrorCode)) {
+return 0;
+}
+/*
+* swap and permutate the tokens
+* go through a temporary array to support in-place swapping
+*/
+temp=(uint16_t *)uprv_malloc(tokenCount*2);
+if(temp==NULL) {
+udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
+tokenCount);
+*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+return 0;
+}
+/* swap and permutate single-/lead-byte tokens */
+for(i=0; i<tokenCount && i<256; ++i) {
+ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
+}
+/* swap and permutate trail-byte tokens */
+for(; i<tokenCount; ++i) {
+ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode); /* keep the upper bits of the index, permute only its low byte */
+}
+/* copy the result into the output and free the temporary array */
+uprv_memcpy(q, temp, tokenCount*2);
+uprv_free(temp);
+/*
+* swap the token strings but not a possible padding byte after
+* the terminating NUL of the last string
+*/
+udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
+outBytes+tokenStringOffset, pErrorCode);
+if(U_FAILURE(*pErrorCode)) {
+udata_printError(ds, "uchar_swapNames(token strings) failed\n");
+return 0;
+}
+/* swap the group table */
+count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
+ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2), /* 1 count unit plus 3 16-bit units per group */
+outBytes+groupsOffset, pErrorCode);
+/*
+* swap the group strings
+* swap the string bytes but not the nibble-encoded string lengths
+*/
+if(ds->inCharset!=ds->outCharset) {
+uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
+const uint8_t *inStrings, *nextInStrings;
+uint8_t *outStrings;
+uint8_t c;
+inStrings=inBytes+groupStringOffset;
+outStrings=outBytes+groupStringOffset;
+stringsCount=algNamesOffset-groupStringOffset;
+/* iterate through string groups until only a few padding bytes are left */
+while(stringsCount>32) {
+nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
+/* move past the length bytes */
+stringsCount-=(uint32_t)(nextInStrings-inStrings);
+outStrings+=nextInStrings-inStrings;
+inStrings=nextInStrings;
+count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
+stringsCount-=count;
+/* swap the string bytes using map[] and trailMap[] */
+while(count>0) {
+c=*inStrings++;
+*outStrings++=map[c];
+if(tokens[c]!=-2) { /* not a lead byte: only one byte was consumed */
+--count;
+} else {
+/* token lead byte: swap the trail byte, too */
+*outStrings++=trailMap[*inStrings++];
+count-=2;
+}
+}
+}
+}
+/* swap the algorithmic ranges */
+offset=algNamesOffset;
+count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
+ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
+offset+=4;
+for(i=0; i<count; ++i) {
+if(offset>(uint32_t)length) {
+udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
+length, i);
+*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+return 0;
+}
+inRange=(const AlgorithmicRange *)(inBytes+offset);
+outRange=(AlgorithmicRange *)(outBytes+offset);
+offset+=ds->readUInt16(inRange->size); /* offset now points at the next range */
+ds->swapArray32(ds, inRange, 8, outRange, pErrorCode); /* first 8 bytes of the range; presumably its 32-bit start and end - TODO confirm against the struct */
+ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
+switch(inRange->type) {
+case 0:
+/* swap prefix string */
+ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
+outRange+1, pErrorCode);
+if(U_FAILURE(*pErrorCode)) {
+udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
+i);
+return 0;
+}
+break;
+case 1:
+{
+/* swap factors and the prefix and factor strings */
+uint32_t factorsCount;
+factorsCount=inRange->variant; /* for type 1, variant is used as the number of 16-bit factors */
+p=(const uint16_t *)(inRange+1);
+q=(uint16_t *)(outRange+1);
+ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
+/* swap the strings, up to the last terminating NUL */
+p+=factorsCount;
+q+=factorsCount;
+stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p); /* bytes from behind the factors to the end of this range */
+while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
+--stringsCount;
+}
+ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
+}
+break;
+default:
+udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
+inRange->type, i);
+*pErrorCode=U_UNSUPPORTED_ERROR;
+return 0;
+}
+}
+}
+return headerSize+(int32_t)offset;
+}
+U_NAMESPACE_END
+/*
+* Hey, Emacs, please set the following:
+*
+* Local Variables:
+* indent-tabs-mode: nil
+* End:
+*
+*/

The Tor Browser / file comparison

comparison: intl/icu/source/common/unames.cpp

intl/icu/source/common/unames.cpp