michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2001-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: ucol_tok.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created 02/22/2001 michael@0: * created by: Vladimir Weinstein michael@0: * michael@0: * This module reads a tailoring rule string and produces a list of michael@0: * tokens that will be turned into collation elements michael@0: * michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_COLLATION michael@0: michael@0: #include "unicode/uscript.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/uniset.h" michael@0: michael@0: #include "cmemory.h" michael@0: #include "cstring.h" michael@0: #include "patternprops.h" michael@0: #include "ucol_bld.h" michael@0: #include "ucol_tok.h" michael@0: #include "ulocimp.h" michael@0: #include "uresimp.h" michael@0: michael@0: // Define this only for debugging. 
michael@0: // #define DEBUG_FOR_COLL_RULES 1 michael@0: michael@0: #ifdef DEBUG_FOR_COLL_RULES michael@0: #include michael@0: #endif michael@0: michael@0: U_NAMESPACE_USE michael@0: michael@0: U_CDECL_BEGIN michael@0: static int32_t U_CALLCONV michael@0: uhash_hashTokens(const UHashTok k) michael@0: { michael@0: int32_t hash = 0; michael@0: //uint32_t key = (uint32_t)k.integer; michael@0: UColToken *key = (UColToken *)k.pointer; michael@0: if (key != 0) { michael@0: int32_t len = (key->source & 0xFF000000)>>24; michael@0: int32_t inc = ((len - 32) / 32) + 1; michael@0: michael@0: const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl); michael@0: const UChar *limit = p + len; michael@0: michael@0: while (psource & 0x00FFFFFF) + *(p1->rulesToParseHdl); michael@0: const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl); michael@0: uint32_t s1L = ((p1->source & 0xFF000000) >> 24); michael@0: uint32_t s2L = ((p2->source & 0xFF000000) >> 24); michael@0: const UChar *end = s1+s1L-1; michael@0: michael@0: if (p1 == p2) { michael@0: return TRUE; michael@0: } michael@0: if (p1->source == 0 || p2->source == 0) { michael@0: return FALSE; michael@0: } michael@0: if(s1L != s2L) { michael@0: return FALSE; michael@0: } michael@0: if(p1->source == p2->source) { michael@0: return TRUE; michael@0: } michael@0: while((s1 < end) && *s1 == *s2) { michael@0: ++s1; michael@0: ++s2; michael@0: } michael@0: if(*s1 == *s2) { michael@0: return TRUE; michael@0: } else { michael@0: return FALSE; michael@0: } michael@0: } michael@0: U_CDECL_END michael@0: michael@0: /* michael@0: * Debug messages used to pinpoint where a format error occurred. michael@0: * A better way is to include context-sensitive information in syntaxError() function. michael@0: * michael@0: * To turn this debugging on, either uncomment the following line, or define use -DDEBUG_FOR_FORMAT_ERROR michael@0: * in the compile line. 
michael@0: */ michael@0: /* #define DEBUG_FOR_FORMAT_ERROR 1 */ michael@0: michael@0: #ifdef DEBUG_FOR_FORMAT_ERROR michael@0: #define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);} michael@0: #else michael@0: #define DBG_FORMAT_ERROR michael@0: #endif michael@0: michael@0: michael@0: /* michael@0: * Controls debug messages so that the output can be compared before and after a michael@0: * big change. Prints the information of every code point that comes out of the michael@0: * collation parser and its strength into a file. When a big change in format michael@0: * happens, the files before and after the change should be identical. michael@0: * michael@0: * To turn this debugging on, either uncomment the following line, or define use -DDEBUG_FOR_CODE_POINTS michael@0: * in the compile line. michael@0: */ michael@0: // #define DEBUG_FOR_CODE_POINTS 1 michael@0: michael@0: #ifdef DEBUG_FOR_CODE_POINTS michael@0: FILE* dfcp_fp = NULL; michael@0: #endif michael@0: michael@0: michael@0: typedef struct { michael@0: uint32_t startCE; michael@0: uint32_t startContCE; michael@0: uint32_t limitCE; michael@0: uint32_t limitContCE; michael@0: } indirectBoundaries; michael@0: michael@0: /* these values are used for finding CE values for indirect positioning. */ michael@0: /* Indirect positioning is a mechanism for allowing resets on symbolic */ michael@0: /* values. It only works for resets and you cannot tailor indirect names */ michael@0: /* An indirect name can define either an anchor point or a range. An */ michael@0: /* anchor point behaves in exactly the same way as a code point in reset */ michael@0: /* would, except that it cannot be tailored. A range (we currently only */ michael@0: /* know for the [top] range will explicitly set the upper bound for */ michael@0: /* generated CEs, thus allowing for better control over how many CEs can */ michael@0: /* be squeezed between in the range without performance penalty. 
*/ michael@0: /* In that respect, we use [top] for tailoring of locales that use CJK */ michael@0: /* characters. Other indirect values are currently a pure convenience, */ michael@0: /* they can be used to assure that the CEs will be always positioned in */ michael@0: /* the same place relative to a point with known properties (e.g. first */ michael@0: /* primary ignorable). */ michael@0: static indirectBoundaries ucolIndirectBoundaries[15]; michael@0: /* michael@0: static indirectBoundaries ucolIndirectBoundaries[11] = { michael@0: { UCOL_RESET_TOP_VALUE, 0, michael@0: UCOL_NEXT_TOP_VALUE, 0 }, michael@0: { UCOL_FIRST_PRIMARY_IGNORABLE, 0, michael@0: 0, 0 }, michael@0: { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT, michael@0: 0, 0 }, michael@0: { UCOL_FIRST_SECONDARY_IGNORABLE, 0, michael@0: 0, 0 }, michael@0: { UCOL_LAST_SECONDARY_IGNORABLE, 0, michael@0: 0, 0 }, michael@0: { UCOL_FIRST_TERTIARY_IGNORABLE, 0, michael@0: 0, 0 }, michael@0: { UCOL_LAST_TERTIARY_IGNORABLE, 0, michael@0: 0, 0 }, michael@0: { UCOL_FIRST_VARIABLE, 0, michael@0: 0, 0 }, michael@0: { UCOL_LAST_VARIABLE, 0, michael@0: 0, 0 }, michael@0: { UCOL_FIRST_NON_VARIABLE, 0, michael@0: 0, 0 }, michael@0: { UCOL_LAST_NON_VARIABLE, 0, michael@0: 0, 0 }, michael@0: }; michael@0: */ michael@0: michael@0: static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) { michael@0: michael@0: // Set values for the top - TODO: once we have values for all the indirects, we are going michael@0: // to initalize here. 
michael@0: ucolIndirectBoundaries[indexR].startCE = start[0]; michael@0: ucolIndirectBoundaries[indexR].startContCE = start[1]; michael@0: if(end) { michael@0: ucolIndirectBoundaries[indexR].limitCE = end[0]; michael@0: ucolIndirectBoundaries[indexR].limitContCE = end[1]; michael@0: } else { michael@0: ucolIndirectBoundaries[indexR].limitCE = 0; michael@0: ucolIndirectBoundaries[indexR].limitContCE = 0; michael@0: } michael@0: } michael@0: michael@0: michael@0: static inline michael@0: void syntaxError(const UChar* rules, michael@0: int32_t pos, michael@0: int32_t rulesLen, michael@0: UParseError* parseError) michael@0: { michael@0: parseError->offset = pos; michael@0: parseError->line = 0 ; /* we are not using line numbers */ michael@0: michael@0: // for pre-context michael@0: int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1)); michael@0: int32_t stop = pos; michael@0: michael@0: u_memcpy(parseError->preContext,rules+start,stop-start); michael@0: //null terminate the buffer michael@0: parseError->preContext[stop-start] = 0; michael@0: michael@0: //for post-context michael@0: start = pos+1; michael@0: stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? 
(pos+(U_PARSE_CONTEXT_LEN-1)) : michael@0: rulesLen; michael@0: michael@0: if(start < stop) { michael@0: u_memcpy(parseError->postContext,rules+start,stop-start); michael@0: //null terminate the buffer michael@0: parseError->postContext[stop-start]= 0; michael@0: } else { michael@0: parseError->postContext[0] = 0; michael@0: } michael@0: } michael@0: michael@0: static michael@0: void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) { michael@0: switch(attrib) { michael@0: case UCOL_HIRAGANA_QUATERNARY_MODE: michael@0: opts->hiraganaQ = value; michael@0: break; michael@0: case UCOL_FRENCH_COLLATION: michael@0: opts->frenchCollation = value; michael@0: break; michael@0: case UCOL_ALTERNATE_HANDLING: michael@0: opts->alternateHandling = value; michael@0: break; michael@0: case UCOL_CASE_FIRST: michael@0: opts->caseFirst = value; michael@0: break; michael@0: case UCOL_CASE_LEVEL: michael@0: opts->caseLevel = value; michael@0: break; michael@0: case UCOL_NORMALIZATION_MODE: michael@0: opts->normalizationMode = value; michael@0: break; michael@0: case UCOL_STRENGTH: michael@0: opts->strength = value; michael@0: break; michael@0: case UCOL_NUMERIC_COLLATION: michael@0: opts->numericCollation = value; michael@0: break; michael@0: case UCOL_ATTRIBUTE_COUNT: michael@0: default: michael@0: break; michael@0: } michael@0: } michael@0: michael@0: #define UTOK_OPTION_COUNT 22 michael@0: michael@0: static UBool didInit = FALSE; michael@0: /* we can be strict, or we can be lenient */ michael@0: /* I'd surely be lenient with the option arguments */ michael@0: /* maybe even with options */ michael@0: U_STRING_DECL(suboption_00, "non-ignorable", 13); michael@0: U_STRING_DECL(suboption_01, "shifted", 7); michael@0: michael@0: U_STRING_DECL(suboption_02, "lower", 5); michael@0: U_STRING_DECL(suboption_03, "upper", 5); michael@0: U_STRING_DECL(suboption_04, "off", 3); michael@0: U_STRING_DECL(suboption_05, "on", 2); michael@0: 
U_STRING_DECL(suboption_06, "1", 1); michael@0: U_STRING_DECL(suboption_07, "2", 1); michael@0: U_STRING_DECL(suboption_08, "3", 1); michael@0: U_STRING_DECL(suboption_09, "4", 1); michael@0: U_STRING_DECL(suboption_10, "I", 1); michael@0: michael@0: U_STRING_DECL(suboption_11, "primary", 7); michael@0: U_STRING_DECL(suboption_12, "secondary", 9); michael@0: U_STRING_DECL(suboption_13, "tertiary", 8); michael@0: U_STRING_DECL(suboption_14, "variable", 8); michael@0: U_STRING_DECL(suboption_15, "regular", 7); michael@0: U_STRING_DECL(suboption_16, "implicit", 8); michael@0: U_STRING_DECL(suboption_17, "trailing", 8); michael@0: michael@0: michael@0: U_STRING_DECL(option_00, "undefined", 9); michael@0: U_STRING_DECL(option_01, "rearrange", 9); michael@0: U_STRING_DECL(option_02, "alternate", 9); michael@0: U_STRING_DECL(option_03, "backwards", 9); michael@0: U_STRING_DECL(option_04, "variable top", 12); michael@0: U_STRING_DECL(option_05, "top", 3); michael@0: U_STRING_DECL(option_06, "normalization", 13); michael@0: U_STRING_DECL(option_07, "caseLevel", 9); michael@0: U_STRING_DECL(option_08, "caseFirst", 9); michael@0: U_STRING_DECL(option_09, "scriptOrder", 11); michael@0: U_STRING_DECL(option_10, "charsetname", 11); michael@0: U_STRING_DECL(option_11, "charset", 7); michael@0: U_STRING_DECL(option_12, "before", 6); michael@0: U_STRING_DECL(option_13, "hiraganaQ", 9); michael@0: U_STRING_DECL(option_14, "strength", 8); michael@0: U_STRING_DECL(option_15, "first", 5); michael@0: U_STRING_DECL(option_16, "last", 4); michael@0: U_STRING_DECL(option_17, "optimize", 8); michael@0: U_STRING_DECL(option_18, "suppressContractions", 20); michael@0: U_STRING_DECL(option_19, "numericOrdering", 15); michael@0: U_STRING_DECL(option_20, "import", 6); michael@0: U_STRING_DECL(option_21, "reorder", 7); michael@0: michael@0: /* michael@0: [last variable] last variable value michael@0: [last primary ignorable] largest CE for primary ignorable michael@0: [last secondary ignorable] 
largest CE for secondary ignorable michael@0: [last tertiary ignorable] largest CE for tertiary ignorable michael@0: [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8) michael@0: */ michael@0: michael@0: michael@0: static const ucolTokSuboption alternateSub[2] = { michael@0: {suboption_00, 13, UCOL_NON_IGNORABLE}, michael@0: {suboption_01, 7, UCOL_SHIFTED} michael@0: }; michael@0: michael@0: static const ucolTokSuboption caseFirstSub[3] = { michael@0: {suboption_02, 5, UCOL_LOWER_FIRST}, michael@0: {suboption_03, 5, UCOL_UPPER_FIRST}, michael@0: {suboption_04, 3, UCOL_OFF}, michael@0: }; michael@0: michael@0: static const ucolTokSuboption onOffSub[2] = { michael@0: {suboption_04, 3, UCOL_OFF}, michael@0: {suboption_05, 2, UCOL_ON} michael@0: }; michael@0: michael@0: static const ucolTokSuboption frenchSub[1] = { michael@0: {suboption_07, 1, UCOL_ON} michael@0: }; michael@0: michael@0: static const ucolTokSuboption beforeSub[3] = { michael@0: {suboption_06, 1, UCOL_PRIMARY}, michael@0: {suboption_07, 1, UCOL_SECONDARY}, michael@0: {suboption_08, 1, UCOL_TERTIARY} michael@0: }; michael@0: michael@0: static const ucolTokSuboption strengthSub[5] = { michael@0: {suboption_06, 1, UCOL_PRIMARY}, michael@0: {suboption_07, 1, UCOL_SECONDARY}, michael@0: {suboption_08, 1, UCOL_TERTIARY}, michael@0: {suboption_09, 1, UCOL_QUATERNARY}, michael@0: {suboption_10, 1, UCOL_IDENTICAL}, michael@0: }; michael@0: michael@0: static const ucolTokSuboption firstLastSub[7] = { michael@0: {suboption_11, 7, UCOL_PRIMARY}, michael@0: {suboption_12, 9, UCOL_PRIMARY}, michael@0: {suboption_13, 8, UCOL_PRIMARY}, michael@0: {suboption_14, 8, UCOL_PRIMARY}, michael@0: {suboption_15, 7, UCOL_PRIMARY}, michael@0: {suboption_16, 8, UCOL_PRIMARY}, michael@0: {suboption_17, 8, UCOL_PRIMARY}, michael@0: }; michael@0: michael@0: enum OptionNumber { michael@0: OPTION_ALTERNATE_HANDLING = 0, michael@0: OPTION_FRENCH_COLLATION, michael@0: OPTION_CASE_LEVEL, michael@0: 
OPTION_CASE_FIRST, michael@0: OPTION_NORMALIZATION_MODE, michael@0: OPTION_HIRAGANA_QUATERNARY, michael@0: OPTION_STRENGTH, michael@0: OPTION_NUMERIC_COLLATION, michael@0: OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION, michael@0: OPTION_VARIABLE_TOP, michael@0: OPTION_REARRANGE, michael@0: OPTION_BEFORE, michael@0: OPTION_TOP, michael@0: OPTION_FIRST, michael@0: OPTION_LAST, michael@0: OPTION_OPTIMIZE, michael@0: OPTION_SUPPRESS_CONTRACTIONS, michael@0: OPTION_UNDEFINED, michael@0: OPTION_SCRIPT_ORDER, michael@0: OPTION_CHARSET_NAME, michael@0: OPTION_CHARSET, michael@0: OPTION_IMPORT, michael@0: OPTION_SCRIPTREORDER michael@0: } ; michael@0: michael@0: static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = { michael@0: /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */ michael@0: /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */ michael@0: /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */ michael@0: /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */ michael@0: /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */ michael@0: /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */ michael@0: /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */ michael@0: /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/ michael@0: /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */ michael@0: /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */ michael@0: /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */ michael@0: /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */ michael@0: /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */ michael@0: /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */ michael@0: /*14*/ {option_17, 8, NULL, 0, 
UCOL_ATTRIBUTE_COUNT}, /*"optimize" */ michael@0: /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */ michael@0: /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */ michael@0: /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */ michael@0: /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */ michael@0: /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charset" */ michael@0: /*20*/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"import" */ michael@0: /*21*/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"reorder" */ michael@0: }; michael@0: michael@0: static michael@0: int32_t u_strncmpNoCase(const UChar *s1, michael@0: const UChar *s2, michael@0: int32_t n) michael@0: { michael@0: if(n > 0) { michael@0: int32_t rc; michael@0: for(;;) { michael@0: rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2); michael@0: if(rc != 0 || *s1 == 0 || --n == 0) { michael@0: return rc; michael@0: } michael@0: ++s1; michael@0: ++s2; michael@0: } michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: static michael@0: void ucol_uprv_tok_initData() { michael@0: if(!didInit) { michael@0: U_STRING_INIT(suboption_00, "non-ignorable", 13); michael@0: U_STRING_INIT(suboption_01, "shifted", 7); michael@0: michael@0: U_STRING_INIT(suboption_02, "lower", 5); michael@0: U_STRING_INIT(suboption_03, "upper", 5); michael@0: U_STRING_INIT(suboption_04, "off", 3); michael@0: U_STRING_INIT(suboption_05, "on", 2); michael@0: michael@0: U_STRING_INIT(suboption_06, "1", 1); michael@0: U_STRING_INIT(suboption_07, "2", 1); michael@0: U_STRING_INIT(suboption_08, "3", 1); michael@0: U_STRING_INIT(suboption_09, "4", 1); michael@0: U_STRING_INIT(suboption_10, "I", 1); michael@0: michael@0: U_STRING_INIT(suboption_11, "primary", 7); michael@0: U_STRING_INIT(suboption_12, "secondary", 9); michael@0: U_STRING_INIT(suboption_13, "tertiary", 8); michael@0: U_STRING_INIT(suboption_14, "variable", 
8); michael@0: U_STRING_INIT(suboption_15, "regular", 7); michael@0: U_STRING_INIT(suboption_16, "implicit", 8); michael@0: U_STRING_INIT(suboption_17, "trailing", 8); michael@0: michael@0: michael@0: U_STRING_INIT(option_00, "undefined", 9); michael@0: U_STRING_INIT(option_01, "rearrange", 9); michael@0: U_STRING_INIT(option_02, "alternate", 9); michael@0: U_STRING_INIT(option_03, "backwards", 9); michael@0: U_STRING_INIT(option_04, "variable top", 12); michael@0: U_STRING_INIT(option_05, "top", 3); michael@0: U_STRING_INIT(option_06, "normalization", 13); michael@0: U_STRING_INIT(option_07, "caseLevel", 9); michael@0: U_STRING_INIT(option_08, "caseFirst", 9); michael@0: U_STRING_INIT(option_09, "scriptOrder", 11); michael@0: U_STRING_INIT(option_10, "charsetname", 11); michael@0: U_STRING_INIT(option_11, "charset", 7); michael@0: U_STRING_INIT(option_12, "before", 6); michael@0: U_STRING_INIT(option_13, "hiraganaQ", 9); michael@0: U_STRING_INIT(option_14, "strength", 8); michael@0: U_STRING_INIT(option_15, "first", 5); michael@0: U_STRING_INIT(option_16, "last", 4); michael@0: U_STRING_INIT(option_17, "optimize", 8); michael@0: U_STRING_INIT(option_18, "suppressContractions", 20); michael@0: U_STRING_INIT(option_19, "numericOrdering", 15); michael@0: U_STRING_INIT(option_20, "import ", 6); michael@0: U_STRING_INIT(option_21, "reorder", 7); michael@0: didInit = TRUE; michael@0: } michael@0: } michael@0: michael@0: michael@0: // This function reads basic options to set in the runtime collator michael@0: // used by data driven tests. 
Should not support build time options michael@0: U_CAPI const UChar * U_EXPORT2 michael@0: ucol_tok_getNextArgument(const UChar *start, const UChar *end, michael@0: UColAttribute *attrib, UColAttributeValue *value, michael@0: UErrorCode *status) michael@0: { michael@0: uint32_t i = 0; michael@0: int32_t j=0; michael@0: UBool foundOption = FALSE; michael@0: const UChar *optionArg = NULL; michael@0: michael@0: ucol_uprv_tok_initData(); michael@0: michael@0: while(start < end && PatternProps::isWhiteSpace(*start)) { /* eat whitespace */ michael@0: start++; michael@0: } michael@0: if(start >= end) { michael@0: return NULL; michael@0: } michael@0: /* skip opening '[' */ michael@0: if(*start == 0x005b) { michael@0: start++; michael@0: } else { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '[' michael@0: return NULL; michael@0: } michael@0: michael@0: while(i < UTOK_OPTION_COUNT) { michael@0: if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { michael@0: foundOption = TRUE; michael@0: if(end - start > rulesOptions[i].optionLen) { michael@0: optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */ michael@0: while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */ michael@0: optionArg++; michael@0: } michael@0: } michael@0: break; michael@0: } michael@0: i++; michael@0: } michael@0: michael@0: if(!foundOption) { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: if(optionArg) { michael@0: for(j = 0; jopts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); michael@0: *attrib = rulesOptions[i].attr; michael@0: *value = rulesOptions[i].subopts[j].attrVal; michael@0: optionArg += rulesOptions[i].subopts[j].subLen; michael@0: while(PatternProps::isWhiteSpace(*optionArg)) { /* eat whitespace */ michael@0: optionArg++; michael@0: } michael@0: if(*optionArg == 0x005d) { michael@0: optionArg++; michael@0: return optionArg; 
michael@0: } else { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: static michael@0: USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) { michael@0: while(*start != 0x005b) { /* advance while we find the first '[' */ michael@0: start++; michael@0: } michael@0: // now we need to get a balanced set of '[]'. The problem is that a set can have michael@0: // many, and *end point to the first closing '[' michael@0: int32_t noOpenBraces = 1; michael@0: int32_t current = 1; // skip the opening brace michael@0: while(start+current < end && noOpenBraces != 0) { michael@0: if(start[current] == 0x005b) { michael@0: noOpenBraces++; michael@0: } else if(start[current] == 0x005D) { // closing brace michael@0: noOpenBraces--; michael@0: } michael@0: current++; michael@0: } michael@0: michael@0: if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: return uset_openPattern(start, current, status); michael@0: } michael@0: michael@0: /** michael@0: * Reads an option and matches the option name with the predefined options. (Case-insensitive.) michael@0: * @param start Pointer to the start UChar. michael@0: * @param end Pointer to the last valid pointer beyond which the option will not extend. michael@0: * @param optionArg Address of the pointer at which the options start (after the option name) michael@0: * @return The index of the option, or -1 if the option is not valid. 
michael@0: */ michael@0: static michael@0: int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) { michael@0: int32_t i = 0; michael@0: ucol_uprv_tok_initData(); michael@0: michael@0: while(PatternProps::isWhiteSpace(*start)) { /* eat whitespace */ michael@0: start++; michael@0: } michael@0: while(i < UTOK_OPTION_COUNT) { michael@0: if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { michael@0: if(end - start > rulesOptions[i].optionLen) { michael@0: *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */ michael@0: while(PatternProps::isWhiteSpace(**optionArg)) { /* eat whitespace */ michael@0: (*optionArg)++; michael@0: } michael@0: } michael@0: break; michael@0: } michael@0: i++; michael@0: } michael@0: if(i == UTOK_OPTION_COUNT) { michael@0: i = -1; // didn't find an option michael@0: } michael@0: return i; michael@0: } michael@0: michael@0: michael@0: static michael@0: void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) { michael@0: int32_t codeCount = 0; michael@0: int32_t codeIndex = 0; michael@0: char conversion[64]; michael@0: int32_t tokenLength = 0; michael@0: const UChar* space; michael@0: michael@0: const UChar* current = src->current; michael@0: const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current); michael@0: michael@0: // eat leading whitespace michael@0: while(current < end && u_isWhitespace(*current)) { michael@0: current++; michael@0: } michael@0: michael@0: while(current < end) { michael@0: space = u_memchr(current, 0x0020, end - current); michael@0: space = space == 0 ? 
end : space; michael@0: tokenLength = space - current; michael@0: if (tokenLength < 4) { michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: return; michael@0: } michael@0: codeCount++; michael@0: current += tokenLength; michael@0: while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ michael@0: ++current; michael@0: } michael@0: } michael@0: michael@0: if (codeCount == 0) { michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: } michael@0: michael@0: src->reorderCodesLength = codeCount; michael@0: src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t)); michael@0: current = src->current; michael@0: michael@0: // eat leading whitespace michael@0: while(current < end && u_isWhitespace(*current)) { michael@0: current++; michael@0: } michael@0: michael@0: while(current < end) { michael@0: space = u_memchr(current, 0x0020, end - current); michael@0: space = space == 0 ? end : space; michael@0: tokenLength = space - current; michael@0: if (tokenLength < 4) { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } else { michael@0: u_UCharsToChars(current, conversion, tokenLength); michael@0: conversion[tokenLength] = '\0'; michael@0: src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion); michael@0: if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { michael@0: src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion); michael@0: } michael@0: if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: } michael@0: } michael@0: codeIndex++; michael@0: current += tokenLength; michael@0: while(current < end && u_isWhitespace(*current)) { /* eat whitespace */ michael@0: ++current; michael@0: } michael@0: } michael@0: } michael@0: michael@0: // reads and conforms to various options in rules michael@0: // end is the position of the first closing ']' michael@0: // However, some of the options take an UnicodeSet 
definition michael@0: // which needs to duplicate the closing ']' michael@0: // for example: '[copy [\uAC00-\uD7FF]]' michael@0: // These options will move end to the second ']' and the michael@0: // caller will set the current to it. michael@0: static michael@0: uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) { michael@0: const UChar* start = src->current; michael@0: int32_t i = 0; michael@0: int32_t j=0; michael@0: const UChar *optionArg = NULL; michael@0: michael@0: uint8_t result = 0; michael@0: michael@0: start++; /*skip opening '['*/ michael@0: i = ucol_uprv_tok_readOption(start, src->end, &optionArg); michael@0: if(optionArg) { michael@0: src->current = optionArg; michael@0: } michael@0: michael@0: if(i < 0) { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: } else { michael@0: int32_t noOpenBraces = 1; michael@0: switch(i) { michael@0: case OPTION_ALTERNATE_HANDLING: michael@0: case OPTION_FRENCH_COLLATION: michael@0: case OPTION_CASE_LEVEL: michael@0: case OPTION_CASE_FIRST: michael@0: case OPTION_NORMALIZATION_MODE: michael@0: case OPTION_HIRAGANA_QUATERNARY: michael@0: case OPTION_STRENGTH: michael@0: case OPTION_NUMERIC_COLLATION: michael@0: if(optionArg) { michael@0: for(j = 0; jopts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); michael@0: result = UCOL_TOK_SUCCESS; michael@0: } michael@0: } michael@0: } michael@0: if(result == 0) { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: } michael@0: break; michael@0: case OPTION_VARIABLE_TOP: michael@0: result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP; michael@0: break; michael@0: case OPTION_REARRANGE: michael@0: result = UCOL_TOK_SUCCESS; michael@0: break; michael@0: case OPTION_BEFORE: michael@0: if(optionArg) { michael@0: for(j = 0; jparsedToken.indirectIndex*/ michael@0: src->parsedToken.indirectIndex = 0; michael@0: result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP; michael@0: break; michael@0: case OPTION_FIRST: michael@0: case OPTION_LAST: /* 
first, last */ michael@0: for(j = 0; jparsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2); michael@0: result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;; michael@0: } michael@0: } michael@0: if(result == 0) { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: } michael@0: break; michael@0: case OPTION_OPTIMIZE: michael@0: case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization michael@0: // we need to move end here michael@0: src->current++; // skip opening brace michael@0: while(src->current < src->end && noOpenBraces != 0) { michael@0: if(*src->current == 0x005b) { michael@0: noOpenBraces++; michael@0: } else if(*src->current == 0x005D) { // closing brace michael@0: noOpenBraces--; michael@0: } michael@0: src->current++; michael@0: } michael@0: result = UCOL_TOK_SUCCESS; michael@0: break; michael@0: case OPTION_SCRIPTREORDER: michael@0: ucol_tok_parseScriptReorder(src, status); michael@0: break; michael@0: default: michael@0: *status = U_UNSUPPORTED_ERROR; michael@0: break; michael@0: } michael@0: } michael@0: src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current)); michael@0: return result; michael@0: }


/*
 * Appends len UChars starting at stuff to the parser's extra area, i.e. at
 * src->extraCurrent, and advances src->extraCurrent past them.
 *
 * When the characters do not fit before src->extraEnd, the rule buffer is
 * reallocated and src->source, src->current, src->sourceCurrent, src->end,
 * src->extraCurrent and src->extraEnd are all rebased onto the new buffer.
 * Callers must therefore keep positions as offsets from src->source, not as
 * raw pointers, across a call to this function.
 *
 * src    - token parser that owns the buffer.
 * stuff  - characters to append; may point into the parser's own buffer.
 * len    - number of UChars to append. Nothing happens if stuff is NULL or len <= 0.
 * status - set to U_MEMORY_ALLOCATION_ERROR when the buffer cannot be grown;
 *          in that case nothing is appended and the parser is left unchanged.
 */
inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
    if (stuff == NULL || len <= 0) {
        return;
    }
    // Read-only alias of the caller's characters; no copy is made here.
    UnicodeString tempStuff(FALSE, stuff, len);
    if(src->extraCurrent+len >= src->extraEnd) {
        /* reallocate */
        if (stuff >= src->source && stuff <= src->extraEnd) {
            // "stuff" points into the buffer that is about to be reallocated
            // (rule text or extra area). Copy the contents into tempStuff's own
            // buffer first: UnicodeString is copy-on-write, so writing a
            // character back to itself is enough to detach it from the alias.
            tempStuff.setCharAt(0, tempStuff[0]);
        }

        // Capture every position as an offset while the old pointers are still valid.
        const int32_t capacity       = (int32_t)(src->extraEnd - src->source);
        const int32_t currentOffset  = (int32_t)(src->current - src->source);
        const int32_t extraOffset    = (int32_t)(src->extraCurrent - src->source);
        const int32_t endOffset      = (int32_t)(src->end - src->source);
        const int32_t srcCurOffset   = (int32_t)(src->sourceCurrent - src->source);

        // Double the capacity until the pending characters fit. A single
        // doubling (what this code used to do) is not guaranteed to be enough.
        int32_t newCapacity = (capacity > 0) ? capacity * 2 : 2;
        while (newCapacity <= extraOffset + len) {
            newCapacity *= 2;
        }

        UChar *newSrc = (UChar *)uprv_realloc(src->source, newCapacity*sizeof(UChar));
        if(newSrc == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        src->current = newSrc + currentOffset;
        src->extraCurrent = newSrc + extraOffset;
        src->end = newSrc + endOffset;
        src->extraEnd = newSrc + newCapacity;
        src->sourceCurrent = newSrc + srcCurOffset;
        src->source = newSrc;
    }
    if(len == 1) {
        *src->extraCurrent++ = tempStuff[0];
    } else {
        u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);
        src->extraCurrent += len;
    }
}

/*
 * Emits the artificial "top"/indirect-boundary token into the extra area and
 * points src->parsedToken.charsOffset / charsLen at it.
 *
 * The token is the marker 0xFFFE followed by the upper and lower 16 bits of
 * startCE of ucolIndirectBoundaries[src->parsedToken.indirectIndex], and, when
 * startContCE of that entry is non-zero, the two 16-bit halves of startContCE
 * as well (3 or 5 UChars in total).
 *
 * Always returns TRUE; allocation failures are reported through status only.
 */
inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
    /*
    top = TRUE;
    */
    UChar buff[5];
    // Stored as an offset so that it stays valid if the buffer is reallocated below.
    src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
    buff[0] = 0xFFFE;
    buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
    buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
    if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
        src->parsedToken.charsLen = 3;
        ucol_tok_addToExtraCurrent(src, buff, 3, status);
    } else {
        buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
        buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
        src->parsedToken.charsLen = 5;
        ucol_tok_addToExtraCurrent(src, buff, 5, status);
    }
    return TRUE;
}

/*
 * Returns TRUE if c is one of the line terminators LF, CR, FF, NEL, LS or PS,
 * FALSE for every other code unit.
 */
static UBool isCharNewLine(UChar c){
    switch(c){
    case 0x000A: /* LF  */
    case 0x000D: /* CR  */
    case 0x000C: /* FF  */
    case 0x0085: /* NEL */
    case 0x2028: /* LS  */
    case 0x2029: /* PS  */
        return TRUE;
    default:
        return FALSE;
    }
}

/*
 * This function is called several times when a range is processed. Each time, the next code point
 * is processed.
 * The following variables must be set before calling this function:
 *   src->currentRangeCp:  The current code point to process.
 *   src->lastRangeCp: The last code point in the range.
 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.
 *
 * The code point is appended to the extra area and src->parsedToken is pointed
 * at it. Returns src->current.
 */
static const UChar*
ucol_tok_processNextCodePointInRange(UColTokenParser *src,
                                     UErrorCode *status)
{
    // Append current code point to source
    UChar buff[U16_MAX_LENGTH];
    uint32_t i = 0;

    uint32_t nChars = U16_LENGTH(src->currentRangeCp);
    src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
    src->parsedToken.charsLen = nChars;

    U16_APPEND_UNSAFE(buff, i, src->currentRangeCp);
    ucol_tok_addToExtraCurrent(src, buff, nChars, status);

    ++src->currentRangeCp;
    if (src->currentRangeCp > src->lastRangeCp) {
        // The range is exhausted.
        src->inRange = FALSE;

        // If the starred list that contained the range is exhausted too,
        // leave starred mode as well.
        if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
            src->isStarred = FALSE;
        }
    } else {
        src->previousCp = src->currentRangeCp;
    }
    return src->current;
}

/*
 * This function is called
several times when a starred list is processed. Each time, the next code point michael@0: * in the list is processed. michael@0: * The following variables must be set before calling this function: michael@0: * src->currentStarredCharIndex: Index (in src->source) of the first char of the current code point. michael@0: * src->lastStarredCharIndex: Index to the last character in the list. michael@0: * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex. michael@0: */ michael@0: static const UChar* michael@0: ucol_tok_processNextTokenInStarredList(UColTokenParser *src) michael@0: { michael@0: // Extract the characters corresponding to the next code point. michael@0: UChar32 cp; michael@0: src->parsedToken.charsOffset = src->currentStarredCharIndex; michael@0: int32_t prev = src->currentStarredCharIndex; michael@0: U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp); michael@0: src->parsedToken.charsLen = src->currentStarredCharIndex - prev; michael@0: michael@0: // When we are done parsing the starred string, turn the flag off so that michael@0: // the normal processing is restored. michael@0: if (src->currentStarredCharIndex > src->lastStarredCharIndex) { michael@0: src->isStarred = FALSE; michael@0: } michael@0: src->previousCp = cp; michael@0: return src->current; michael@0: } michael@0: michael@0: /* michael@0: * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters. michael@0: * michael@0: * This routine parses and separates almost all tokens. The following are the syntax characters recognized. 
michael@0: * # : Comment character michael@0: * & : Reset operator michael@0: * = : Equality michael@0: * < : Primary collation michael@0: * << : Secondary collation michael@0: * <<< : Tertiary collation michael@0: * ; : Secondary collation michael@0: * , : Tertiary collation michael@0: * / : Expansions michael@0: * | : Prefix michael@0: * - : Range michael@0: michael@0: * ! : Java Thai modifier, ignored michael@0: * @ : French only michael@0: michael@0: * [] : Options michael@0: * '' : Quotes michael@0: * michael@0: * Along with operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz michael@0: * is equivalent to &aparsedToken counterparts michael@0: // no need to use them anymore since we have src->parsedToken. michael@0: // Ideally, token parser would be a nice class... Once, when I have michael@0: // more time (around 2020 probably). michael@0: uint32_t newExtensionLen = 0; michael@0: uint32_t extensionOffset = 0; michael@0: uint32_t newStrength = UCOL_TOK_UNSET; michael@0: UChar buff[10]; michael@0: michael@0: src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0; michael@0: src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0; michael@0: src->parsedToken.indirectIndex = 0; michael@0: michael@0: while (src->current < src->end) { michael@0: UChar ch = *(src->current); michael@0: michael@0: if (inQuote) { michael@0: if (ch == 0x0027/*'\''*/) { michael@0: inQuote = FALSE; michael@0: } else { michael@0: if ((src->parsedToken.charsLen == 0) || inChars) { michael@0: if(src->parsedToken.charsLen == 0) { michael@0: src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); michael@0: } michael@0: src->parsedToken.charsLen++; michael@0: } else { michael@0: if(newExtensionLen == 0) { michael@0: extensionOffset = (uint32_t)(src->extraCurrent - src->source); michael@0: } michael@0: newExtensionLen++; michael@0: } michael@0: } michael@0: }else if(isEscaped){ michael@0: isEscaped =FALSE; 
michael@0: if (newStrength == UCOL_TOK_UNSET) { michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); michael@0: DBG_FORMAT_ERROR michael@0: return NULL; michael@0: // enabling rules to start with non-tokens a < b michael@0: // newStrength = UCOL_TOK_RESET; michael@0: } michael@0: if(ch != 0x0000 && src->current != src->end) { michael@0: if (inChars) { michael@0: if(src->parsedToken.charsLen == 0) { michael@0: src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); michael@0: } michael@0: src->parsedToken.charsLen++; michael@0: } else { michael@0: if(newExtensionLen == 0) { michael@0: extensionOffset = (uint32_t)(src->current - src->source); michael@0: } michael@0: newExtensionLen++; michael@0: } michael@0: } michael@0: }else { michael@0: if(!PatternProps::isWhiteSpace(ch)) { michael@0: /* Sets the strength for this entry */ michael@0: switch (ch) { michael@0: case 0x003D/*'='*/ : michael@0: if (newStrength != UCOL_TOK_UNSET) { michael@0: goto EndOfLoop; michael@0: } michael@0: michael@0: /* if we start with strength, we'll reset to top */ michael@0: if(startOfRules == TRUE) { michael@0: src->parsedToken.indirectIndex = 5; michael@0: top = ucol_tok_doSetTop(src, status); michael@0: newStrength = UCOL_TOK_RESET; michael@0: goto EndOfLoop; michael@0: } michael@0: newStrength = UCOL_IDENTICAL; michael@0: if(*(src->current+1) == 0x002A) {/*'*'*/ michael@0: src->current++; michael@0: src->isStarred = TRUE; michael@0: } michael@0: break; michael@0: michael@0: case 0x002C/*','*/: michael@0: if (newStrength != UCOL_TOK_UNSET) { michael@0: goto EndOfLoop; michael@0: } michael@0: michael@0: /* if we start with strength, we'll reset to top */ michael@0: if(startOfRules == TRUE) { michael@0: src->parsedToken.indirectIndex = 5; michael@0: top = ucol_tok_doSetTop(src, status); michael@0: newStrength = UCOL_TOK_RESET; michael@0: goto EndOfLoop; michael@0: } 
michael@0: newStrength = UCOL_TERTIARY; michael@0: break; michael@0: michael@0: case 0x003B/*';'*/: michael@0: if (newStrength != UCOL_TOK_UNSET) { michael@0: goto EndOfLoop; michael@0: } michael@0: michael@0: /* if we start with strength, we'll reset to top */ michael@0: if(startOfRules == TRUE) { michael@0: src->parsedToken.indirectIndex = 5; michael@0: top = ucol_tok_doSetTop(src, status); michael@0: newStrength = UCOL_TOK_RESET; michael@0: goto EndOfLoop; michael@0: } michael@0: newStrength = UCOL_SECONDARY; michael@0: break; michael@0: michael@0: case 0x003C/*'<'*/: michael@0: if (newStrength != UCOL_TOK_UNSET) { michael@0: goto EndOfLoop; michael@0: } michael@0: michael@0: /* if we start with strength, we'll reset to top */ michael@0: if(startOfRules == TRUE) { michael@0: src->parsedToken.indirectIndex = 5; michael@0: top = ucol_tok_doSetTop(src, status); michael@0: newStrength = UCOL_TOK_RESET; michael@0: goto EndOfLoop; michael@0: } michael@0: /* before this, do a scan to verify whether this is */ michael@0: /* another strength */ michael@0: if(*(src->current+1) == 0x003C) { michael@0: src->current++; michael@0: if(*(src->current+1) == 0x003C) { michael@0: src->current++; /* three in a row! 
*/ michael@0: newStrength = UCOL_TERTIARY; michael@0: } else { /* two in a row */ michael@0: newStrength = UCOL_SECONDARY; michael@0: } michael@0: } else { /* just one */ michael@0: newStrength = UCOL_PRIMARY; michael@0: } michael@0: if(*(src->current+1) == 0x002A) {/*'*'*/ michael@0: src->current++; michael@0: src->isStarred = TRUE; michael@0: } michael@0: break; michael@0: michael@0: case 0x0026/*'&'*/: michael@0: if (newStrength != UCOL_TOK_UNSET) { michael@0: /**/ michael@0: goto EndOfLoop; michael@0: } michael@0: michael@0: newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ michael@0: break; michael@0: michael@0: case 0x005b/*'['*/: michael@0: /* options - read an option, analyze it */ michael@0: if(u_strchr(src->current, 0x005d /*']'*/) != NULL) { michael@0: uint8_t result = ucol_uprv_tok_readAndSetOption(src, status); michael@0: if(U_SUCCESS(*status)) { michael@0: if(result & UCOL_TOK_TOP) { michael@0: if(newStrength == UCOL_TOK_RESET) { michael@0: top = ucol_tok_doSetTop(src, status); michael@0: if(before) { // This is a combination of before and indirection like '&[before 2][first regular]parsedToken.charsLen+=2; michael@0: buff[0] = 0x002d; michael@0: buff[1] = before; michael@0: ucol_tok_addToExtraCurrent(src, buff, 2, status); michael@0: } michael@0: michael@0: src->current++; michael@0: goto EndOfLoop; michael@0: } else { michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); michael@0: DBG_FORMAT_ERROR michael@0: } michael@0: } else if(result & UCOL_TOK_VARIABLE_TOP) { michael@0: if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) { michael@0: variableTop = TRUE; michael@0: src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); michael@0: src->parsedToken.charsLen = 1; michael@0: buff[0] = 0xFFFF; michael@0: ucol_tok_addToExtraCurrent(src, buff, 1, status); michael@0: src->current++; michael@0: goto 
EndOfLoop; michael@0: } else { michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); michael@0: DBG_FORMAT_ERROR michael@0: } michael@0: } else if (result & UCOL_TOK_BEFORE){ michael@0: if(newStrength == UCOL_TOK_RESET) { michael@0: before = result & UCOL_TOK_BEFORE; michael@0: } else { michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); michael@0: DBG_FORMAT_ERROR michael@0: } michael@0: } michael@0: } else { michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); michael@0: DBG_FORMAT_ERROR michael@0: return NULL; michael@0: } michael@0: } michael@0: break; michael@0: case 0x0021/*! skip java thai modifier reordering*/: michael@0: break; michael@0: case 0x002F/*'/'*/: michael@0: wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */ michael@0: inChars = FALSE; /* we're now processing expansion */ michael@0: break; michael@0: case 0x005C /* back slash for escaped chars */: michael@0: isEscaped = TRUE; michael@0: break; michael@0: /* found a quote, we're gonna start copying */ michael@0: case 0x0027/*'\''*/: michael@0: if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */ michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); michael@0: DBG_FORMAT_ERROR michael@0: return NULL; michael@0: // enabling rules to start with a non-token character a < b michael@0: // newStrength = UCOL_TOK_RESET; michael@0: } michael@0: michael@0: inQuote = TRUE; michael@0: michael@0: if(inChars) { /* we're doing characters */ michael@0: if(wasInQuote == FALSE) { michael@0: src->parsedToken.charsOffset = 
(uint32_t)(src->extraCurrent - src->source); michael@0: } michael@0: if (src->parsedToken.charsLen != 0) { michael@0: ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); michael@0: } michael@0: src->parsedToken.charsLen++; michael@0: } else { /* we're doing an expansion */ michael@0: if(wasInQuote == FALSE) { michael@0: extensionOffset = (uint32_t)(src->extraCurrent - src->source); michael@0: } michael@0: if (newExtensionLen != 0) { michael@0: ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status); michael@0: } michael@0: newExtensionLen++; michael@0: } michael@0: michael@0: wasInQuote = TRUE; michael@0: michael@0: ch = *(++(src->current)); michael@0: if(ch == 0x0027) { /* copy the double quote */ michael@0: ucol_tok_addToExtraCurrent(src, &ch, 1, status); michael@0: inQuote = FALSE; michael@0: } michael@0: break; michael@0: michael@0: /* '@' is french only if the strength is not currently set */ michael@0: /* if it is, it's just a regular character in collation rules */ michael@0: case 0x0040/*'@'*/: michael@0: if (newStrength == UCOL_TOK_UNSET) { michael@0: src->opts->frenchCollation = UCOL_ON; michael@0: break; michael@0: } michael@0: michael@0: case 0x007C /*|*/: /* this means we have actually been reading prefix part */ michael@0: // we want to store read characters to the prefix part and continue reading michael@0: // the characters (proper way would be to restart reading the chars, but in michael@0: // that case we would have to complicate the token hasher, which I do not michael@0: // intend to play with. Instead, we will do prefixes when prefixes are due michael@0: // (before adding the elements). 
michael@0: src->parsedToken.prefixOffset = src->parsedToken.charsOffset; michael@0: src->parsedToken.prefixLen = src->parsedToken.charsLen; michael@0: michael@0: if(inChars) { /* we're doing characters */ michael@0: if(wasInQuote == FALSE) { michael@0: src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); michael@0: } michael@0: if (src->parsedToken.charsLen != 0) { michael@0: ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); michael@0: } michael@0: src->parsedToken.charsLen++; michael@0: } michael@0: michael@0: wasInQuote = TRUE; michael@0: michael@0: do { michael@0: ch = *(++(src->current)); michael@0: // skip whitespace between '|' and the character michael@0: } while (PatternProps::isWhiteSpace(ch)); michael@0: break; michael@0: michael@0: //charsOffset = 0; michael@0: //newCharsLen = 0; michael@0: //break; // We want to store the whole prefix/character sequence. If we break michael@0: // the '|' is going to get lost. michael@0: michael@0: case 0x002D /*-*/: /* A range. */ michael@0: if (newStrength != UCOL_TOK_UNSET) { michael@0: // While processing the pending token, the isStarred field michael@0: // is reset, so it needs to be saved for the next michael@0: // invocation. michael@0: src->savedIsStarred = src->isStarred; michael@0: goto EndOfLoop; michael@0: } michael@0: src->isStarred = src->savedIsStarred; michael@0: michael@0: // Ranges are valid only in starred tokens. 
michael@0: if (!src->isStarred) { michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); michael@0: DBG_FORMAT_ERROR michael@0: return NULL; michael@0: } michael@0: newStrength = src->parsedToken.strength; michael@0: src->inRange = TRUE; michael@0: break; michael@0: michael@0: case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */ michael@0: do { michael@0: ch = *(++(src->current)); michael@0: } while (!isCharNewLine(ch)); michael@0: michael@0: break; michael@0: default: michael@0: if (newStrength == UCOL_TOK_UNSET) { michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); michael@0: DBG_FORMAT_ERROR michael@0: return NULL; michael@0: } michael@0: michael@0: if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) { michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); michael@0: DBG_FORMAT_ERROR michael@0: return NULL; michael@0: } michael@0: michael@0: if(ch == 0x0000 && src->current+1 == src->end) { michael@0: break; michael@0: } michael@0: michael@0: if (inChars) { michael@0: if(src->parsedToken.charsLen == 0) { michael@0: src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); michael@0: } michael@0: src->parsedToken.charsLen++; michael@0: } else { michael@0: if(newExtensionLen == 0) { michael@0: extensionOffset = (uint32_t)(src->current - src->source); michael@0: } michael@0: newExtensionLen++; michael@0: } michael@0: michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: michael@0: if(wasInQuote) { michael@0: if(ch != 0x27) { michael@0: if(inQuote || !PatternProps::isWhiteSpace(ch)) { michael@0: ucol_tok_addToExtraCurrent(src, &ch, 1, status); michael@0: } michael@0: } michael@0: } michael@0: 
michael@0: src->current++; michael@0: } michael@0: michael@0: EndOfLoop: michael@0: wasInQuote = FALSE; michael@0: if (newStrength == UCOL_TOK_UNSET) { michael@0: return NULL; michael@0: } michael@0: michael@0: if (src->parsedToken.charsLen == 0 && top == FALSE) { michael@0: syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: DBG_FORMAT_ERROR michael@0: return NULL; michael@0: } michael@0: michael@0: src->parsedToken.strength = newStrength; michael@0: src->parsedToken.extensionOffset = extensionOffset; michael@0: src->parsedToken.extensionLen = newExtensionLen; michael@0: src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before; michael@0: michael@0: return src->current; michael@0: } michael@0: michael@0: /* michael@0: * Parses the next token, keeps the indices in src->parsedToken, and updates the counters. michael@0: * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported. michael@0: * michael@0: * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following: michael@0: * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. This function separates michael@0: * it to separate tokens and returns one by one. In order to do that, the necessary states are michael@0: * cached as member variables of the token parser. michael@0: * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the michael@0: * starting character as a single list token (which is separated into individual characters here) michael@0: * and as another list token starting with the last character in the range. Before expanding it michael@0: * as a list of tokens, this function expands the range by filling the intermediate characters and michael@0: * returns them one by one as separate tokens. 
michael@0: * Necessary checks are done for invalid combinations. michael@0: */ michael@0: U_CAPI const UChar* U_EXPORT2 michael@0: ucol_tok_parseNextToken(UColTokenParser *src, michael@0: UBool startOfRules, michael@0: UParseError *parseError, michael@0: UErrorCode *status) michael@0: { michael@0: const UChar *nextToken; michael@0: michael@0: if (src->inRange) { michael@0: // We are not done processing a range. Continue it. michael@0: return ucol_tok_processNextCodePointInRange(src, status); michael@0: } else if (src->isStarred) { michael@0: // We are not done processing a starred token. Continue it. michael@0: return ucol_tok_processNextTokenInStarredList(src); michael@0: } michael@0: michael@0: // Get the next token. michael@0: nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status); michael@0: michael@0: if (nextToken == NULL) { michael@0: return NULL; michael@0: } michael@0: michael@0: if (src->inRange) { michael@0: // A new range has started. michael@0: // Check whether it is a chain of ranges with more than one hyphen. michael@0: if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) { michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: syntaxError(src->source,src->parsedToken.charsOffset-1, michael@0: src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError); michael@0: DBG_FORMAT_ERROR michael@0: return NULL; michael@0: } michael@0: michael@0: // The current token indicates the second code point of the range. michael@0: // Process just that, and then proceed with the star. 
michael@0: src->currentStarredCharIndex = src->parsedToken.charsOffset; michael@0: U16_NEXT(src->source, src->currentStarredCharIndex, michael@0: (uint32_t)(src->end - src->source), src->lastRangeCp); michael@0: if (src->lastRangeCp <= src->previousCp) { michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: syntaxError(src->source,src->parsedToken.charsOffset-1, michael@0: src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); michael@0: DBG_FORMAT_ERROR michael@0: return NULL; michael@0: } michael@0: michael@0: // Set current range code point to process the range loop michael@0: src->currentRangeCp = src->previousCp + 1; michael@0: michael@0: src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1; michael@0: michael@0: return ucol_tok_processNextCodePointInRange(src, status); michael@0: } else if (src->isStarred) { michael@0: // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that michael@0: // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be michael@0: // separated into several tokens and returned. michael@0: src->currentStarredCharIndex = src->parsedToken.charsOffset; michael@0: src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1; michael@0: michael@0: return ucol_tok_processNextTokenInStarredList(src); michael@0: } else { michael@0: // Set previous codepoint michael@0: U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp); michael@0: } michael@0: return nextToken; michael@0: } michael@0: michael@0: michael@0: /* michael@0: Processing Description michael@0: 1 Build a ListList. Each list has a header, which contains two lists (positive michael@0: and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and michael@0: reset may be null. michael@0: 2 As you process, you keep a LAST pointer that points to the last token you michael@0: handled. 
*/

/*
 * Creates the reset token for the token currently described by
 * src->parsedToken, makes it the reset of the list header
 * src->lh[src->resultLen], registers it in src->tailored and increments
 * src->resultLen.
 *
 * src        - token parser; src->lh is grown (doubled) when it is full.
 * expand     - if not NULL and the parsed token is longer than one code unit,
 *              the position inside the token at which it is split: the part
 *              before it becomes the reset, the rest is reported via expandNext.
 * expandNext - out: 0, or (length << 24 | offset) of the part of the reset
 *              that has to be treated as an expansion of the next token.
 * parseError - receives the error context when a prefix is present on a reset.
 * status     - U_MEMORY_ALLOCATION_ERROR on allocation failure,
 *              U_INVALID_FORMAT_ERROR if the parsed token carries a prefix.
 *
 * Returns the new token, or NULL on error.
 */
static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
                                      UParseError *parseError, UErrorCode *status)
{
    if(src->resultLen == src->listCapacity) {
        // Unfortunately, this won't work, as we store addresses of lhs in token
        // NOTE(review): tokens created earlier keep &src->lh[i] in listHeader;
        // if the reallocation moves the array those pointers dangle. Not
        // addressed here because it needs every stored token to be re-pointed.
        //
        // Grow through a temporary so that a failed reallocation neither leaks
        // the existing array nor leaves listCapacity out of sync with it.
        UColTokListHeader *grown =
            (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*2*sizeof(UColTokListHeader));
        if(grown == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        src->lh = grown;
        src->listCapacity *= 2;
    }
    /* do the reset thing */
    UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
    /* test for NULL */
    if (sourceToken == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    sourceToken->rulesToParseHdl = &(src->source);
    // Length lives in the top 8 bits, offset into the rule buffer in the low 24.
    sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
    sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;

    sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
    sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);

    // keep the flags around so that we know about before
    sourceToken->flags = src->parsedToken.flags;

    if(src->parsedToken.prefixOffset != 0) {
        // this is a syntax error
        *status = U_INVALID_FORMAT_ERROR;
        syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
        DBG_FORMAT_ERROR
        uprv_free(sourceToken);
        return NULL;
    } else {
        sourceToken->prefix = 0;
    }

    sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
    sourceToken->strength = UCOL_TOK_RESET;
    sourceToken->next = NULL;
    sourceToken->previous = NULL;
    sourceToken->noOfCEs = 0;
    sourceToken->noOfExpCEs = 0;
    sourceToken->listHeader = &src->lh[src->resultLen];

    src->lh[src->resultLen].first = NULL;
    src->lh[src->resultLen].last = NULL;

    src->lh[src->resultLen].reset = sourceToken;

    /*
    3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
    First convert all expansions into normal form. Examples:
    If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
    d * ... into &x * c/y * d * ...
    Note: reset values can never have expansions, although they can cause the
    very next item to have one. They may be contractions, if they are found
    earlier in the list.
    */
    *expandNext = 0;
    if(expand != NULL) {
        /* check to see if there is an expansion */
        if(src->parsedToken.charsLen > 1) {
            uint32_t resetCharsOffset;
            resetCharsOffset = (uint32_t)(expand - src->source);
            sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
            *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
        }
    }

    src->resultLen++;

    uhash_put(src->tailored, sourceToken, sourceToken, status);

    return sourceToken;
}

/*
 * Builds the anchor (reset) token for a "[before N]" reset whose target has not
 * been tailored yet, by looking up the collation element that precedes the
 * target in the UCA.
 *
 * src         - token parser; src->parsedToken and src->lh[src->resultLen] are updated.
 * sourceToken - if not NULL, the token whose first code unit is looked up;
 *               otherwise the code unit at src->parsedToken.charsOffset is used.
 * strength    - the strength requested by "[before N]".
 * parseError, status - passed on to ucol_tok_initAReset().
 *
 * Returns the anchor token, or NULL if status already indicated a failure or
 * a callee failed.
 */
static
inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return NULL;
    }
    /* this is a virgin before - we need to fish the anchor from the UCA */
    collIterate s;
    uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
    uint32_t CE, SecondCE;
    // uint32_t invPos;
    if(sourceToken != NULL) {
        uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
    } else {
        uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
    }
    if(U_FAILURE(*status)) {
        return NULL;
    }

    baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
    baseContCE = ucol_getNextCE(src->UCA, &s, status);
    if(baseContCE == UCOL_NO_MORE_CES) {
        baseContCE = 0;
    }


    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
    uint32_t ch = 0;
    uint32_t expandNext = 0;
    UColToken key;

    if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
        // The lead byte of the primary is in the implicit range: derive the
        // preceding element from the raw implicit value minus one.
        uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
        uint32_t raw = uprv_uca_getRawFromImplicit(primary);
        ch = uprv_uca_getCodePointFromRaw(raw-1);
        uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
        CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
        SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;

        // NOTE(review): these two writes go straight into the extra area without
        // the capacity check that ucol_tok_addToExtraCurrent() performs, and
        // (UChar)ch keeps only the low 16 bits of the code point - confirm both
        // are intended.
        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
        *src->extraCurrent++ = 0xFFFE;
        *src->extraCurrent++ = (UChar)ch;
        src->parsedToken.charsLen++;

        key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
        key.rulesToParseHdl = &(src->source);

        //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
        sourceToken = (UColToken *)uhash_get(src->tailored, &key);

        if(sourceToken == NULL) {
            src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
            if(isContinuation(SecondCE)) {
                src->lh[src->resultLen].baseContCE = SecondCE;
            } else {
                src->lh[src->resultLen].baseContCE = 0;
            }
            src->lh[src->resultLen].nextCE = 0;
            src->lh[src->resultLen].nextContCE = 0;
            src->lh[src->resultLen].previousCE = 0;
            src->lh[src->resultLen].previousContCE = 0;

            src->lh[src->resultLen].indirect = FALSE;

            sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
        }

    } else {
        /* invPos = */ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);

        // we got the previous CE. Now we need to see if the difference between
        // the two CEs is really of the requested strength.
        // if it's a bigger difference (we asked for secondary and got primary), we
        // need to modify the CE.
        if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
            // adjust the strength
            // now we are in the situation where our baseCE should actually be modified in
            // order to get the CE in the right position.
            if(strength == UCOL_SECONDARY) {
                CE = baseCE - 0x0200;
            } else { // strength == UCOL_TERTIARY
                CE = baseCE - 0x02;
            }
            if(baseContCE) {
                if(strength == UCOL_SECONDARY) {
                    SecondCE = baseContCE - 0x0200;
                } else { // strength == UCOL_TERTIARY
                    SecondCE = baseContCE - 0x02;
                }
            }
        }

#if 0
        // the code below relies on getting a code point from the inverse table, in order to be
        // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
        // 1. There are many code points that have the same CE
        // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
        // Also, in case when there is no equivalent strength before an element, we have to actually
        // construct one. For example, &[before 2]a << x won't result in x << a, because the element
        // before a is a primary difference.

        //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);


        ch = CETable[3*invPos+2];

        if((ch &  UCOL_INV_SIZEMASK) != 0) {
            uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
            uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
            ch = conts[offset];
        }

        *src->extraCurrent++ = (UChar)ch;
        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
        src->parsedToken.charsLen = 1;

        // We got an UCA before. However, this might have been tailored.
        // example:
        // &\u30ca = \u306a
        // &[before 3]\u306a<<<\u306a|\u309d


        // uint32_t key = (*newCharsLen << 24) | *charsOffset;
        key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
        key.rulesToParseHdl = &(src->source);

        //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
        sourceToken = (UColToken *)uhash_get(src->tailored, &key);
#endif

        // here is how it should be. The situation such as &[before 1]a < x, should be
        // resolved exactly as if we wrote &a > x.
        // therefore, I don't really care if the UCA value before a has been changed.
        // However, I do care if the strength between my element and the previous element
        // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
        // have to construct the base CE.



        // if we found a tailored thing, we have to use the UCA value and construct
        // a new reset token with constructed name
        //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
        // character to which we want to anchor is already tailored.
        // We need to construct a new token which will be the anchor
        // point
        //*(src->extraCurrent-1) = 0xFFFE;
        //*src->extraCurrent++ = (UChar)ch;
        // grab before
        // NOTE(review): the token is widened by 10 code units to the left of its
        // current offset; the reason for the constant 10 is not visible here.
        src->parsedToken.charsOffset -= 10;
        src->parsedToken.charsLen += 10;
        src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
        if(isContinuation(SecondCE)) {
            src->lh[src->resultLen].baseContCE = SecondCE;
        } else {
            src->lh[src->resultLen].baseContCE = 0;
        }
        src->lh[src->resultLen].nextCE = 0;
        src->lh[src->resultLen].nextContCE = 0;
        src->lh[src->resultLen].previousCE = 0;
        src->lh[src->resultLen].previousContCE = 0;

        src->lh[src->resultLen].indirect = FALSE;

        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
        //}
    }

    return sourceToken;

}

uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) { michael@0: UColToken *lastToken = NULL; michael@0: const UChar *parseEnd = NULL; michael@0: uint32_t expandNext = 0; michael@0: UBool variableTop = FALSE; michael@0: UBool top = FALSE; michael@0: uint16_t specs = 0; michael@0: UColTokListHeader *ListList = NULL; michael@0: michael@0: src->parsedToken.strength = UCOL_TOK_UNSET; michael@0: michael@0: ListList = src->lh; michael@0: michael@0: if(U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: #ifdef DEBUG_FOR_CODE_POINTS michael@0: char filename[35]; michael@0:
sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid()); michael@0: dfcp_fp = fopen(filename, "a"); michael@0: fprintf(stdout, "Output is in the file %s.\n", filename); michael@0: #endif michael@0: michael@0: #ifdef DEBUG_FOR_COLL_RULES michael@0: std::string s3; michael@0: UnicodeString(src->source).toUTF8String(s3); michael@0: std::cout << "src->source = " << s3 << std::endl; michael@0: #endif michael@0: michael@0: while(src->current < src->end || src->isStarred) { michael@0: src->parsedToken.prefixOffset = 0; michael@0: michael@0: parseEnd = ucol_tok_parseNextToken(src, michael@0: (UBool)(lastToken == NULL), michael@0: parseError, michael@0: status); michael@0: michael@0: specs = src->parsedToken.flags; michael@0: michael@0: michael@0: variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0); michael@0: top = ((specs & UCOL_TOK_TOP) != 0); michael@0: michael@0: if(U_SUCCESS(*status) && parseEnd != NULL) { michael@0: UColToken *sourceToken = NULL; michael@0: //uint32_t key = 0; michael@0: uint32_t lastStrength = UCOL_TOK_UNSET; michael@0: michael@0: if(lastToken != NULL ) { michael@0: lastStrength = lastToken->strength; michael@0: } michael@0: michael@0: #ifdef DEBUG_FOR_CODE_POINTS michael@0: UChar32 cp; michael@0: U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp); michael@0: fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength); michael@0: #endif michael@0: //key = newCharsLen << 24 | charsOffset; michael@0: UColToken key; michael@0: key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; michael@0: key.rulesToParseHdl = &(src->source); michael@0: michael@0: /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */ michael@0: sourceToken = (UColToken *)uhash_get(src->tailored, &key); michael@0: michael@0: if(src->parsedToken.strength != UCOL_TOK_RESET) { michael@0: if(lastToken == NULL) { /* this means that rules haven't started properly */ 
michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); michael@0: DBG_FORMAT_ERROR michael@0: return 0; michael@0: } michael@0: /* 6 Otherwise (when relation != reset) */ michael@0: if(sourceToken == NULL) { michael@0: /* If sourceToken is null, create new one, */ michael@0: sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); michael@0: /* test for NULL */ michael@0: if (sourceToken == NULL) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return 0; michael@0: } michael@0: sourceToken->rulesToParseHdl = &(src->source); michael@0: sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; michael@0: michael@0: sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); michael@0: michael@0: sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset; michael@0: sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset); michael@0: michael@0: sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ michael@0: sourceToken->next = NULL; michael@0: sourceToken->previous = NULL; michael@0: sourceToken->noOfCEs = 0; michael@0: sourceToken->noOfExpCEs = 0; michael@0: // keep the flags around so that we know about before michael@0: sourceToken->flags = src->parsedToken.flags; michael@0: uhash_put(src->tailored, sourceToken, sourceToken, status); michael@0: if(U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: } else { michael@0: /* we could have fished out a reset here */ michael@0: if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) { michael@0: /* otherwise remove sourceToken from where it was. 
*/ michael@0: if(sourceToken->next != NULL) { michael@0: if(sourceToken->next->strength > sourceToken->strength) { michael@0: sourceToken->next->strength = sourceToken->strength; michael@0: } michael@0: sourceToken->next->previous = sourceToken->previous; michael@0: } else { michael@0: sourceToken->listHeader->last = sourceToken->previous; michael@0: } michael@0: michael@0: if(sourceToken->previous != NULL) { michael@0: sourceToken->previous->next = sourceToken->next; michael@0: } else { michael@0: sourceToken->listHeader->first = sourceToken->next; michael@0: } michael@0: sourceToken->next = NULL; michael@0: sourceToken->previous = NULL; michael@0: } michael@0: } michael@0: michael@0: sourceToken->strength = src->parsedToken.strength; michael@0: sourceToken->listHeader = lastToken->listHeader; michael@0: michael@0: /* michael@0: 1. Find the strongest strength in each list, and set strongestP and strongestN michael@0: accordingly in the headers. michael@0: */ michael@0: if(lastStrength == UCOL_TOK_RESET michael@0: || sourceToken->listHeader->first == 0) { michael@0: /* If LAST is a reset michael@0: insert sourceToken in the list. 
*/ michael@0: if(sourceToken->listHeader->first == 0) { michael@0: sourceToken->listHeader->first = sourceToken; michael@0: sourceToken->listHeader->last = sourceToken; michael@0: } else { /* we need to find a place for us */ michael@0: /* and we'll get in front of the same strength */ michael@0: if(sourceToken->listHeader->first->strength <= sourceToken->strength) { michael@0: sourceToken->next = sourceToken->listHeader->first; michael@0: sourceToken->next->previous = sourceToken; michael@0: sourceToken->listHeader->first = sourceToken; michael@0: sourceToken->previous = NULL; michael@0: } else { michael@0: lastToken = sourceToken->listHeader->first; michael@0: while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { michael@0: lastToken = lastToken->next; michael@0: } michael@0: if(lastToken->next != NULL) { michael@0: lastToken->next->previous = sourceToken; michael@0: } else { michael@0: sourceToken->listHeader->last = sourceToken; michael@0: } michael@0: sourceToken->previous = lastToken; michael@0: sourceToken->next = lastToken->next; michael@0: lastToken->next = sourceToken; michael@0: } michael@0: } michael@0: } else { michael@0: /* Otherwise (when LAST is not a reset) michael@0: if polarity (LAST) == polarity(relation), insert sourceToken after LAST, michael@0: otherwise insert before. michael@0: when inserting after or before, search to the next position with the same michael@0: strength in that direction. (This is called postpone insertion). 
*/ michael@0: if(sourceToken != lastToken) { michael@0: if(lastToken->polarity == sourceToken->polarity) { michael@0: while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { michael@0: lastToken = lastToken->next; michael@0: } michael@0: sourceToken->previous = lastToken; michael@0: if(lastToken->next != NULL) { michael@0: lastToken->next->previous = sourceToken; michael@0: } else { michael@0: sourceToken->listHeader->last = sourceToken; michael@0: } michael@0: michael@0: sourceToken->next = lastToken->next; michael@0: lastToken->next = sourceToken; michael@0: } else { michael@0: while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) { michael@0: lastToken = lastToken->previous; michael@0: } michael@0: sourceToken->next = lastToken; michael@0: if(lastToken->previous != NULL) { michael@0: lastToken->previous->next = sourceToken; michael@0: } else { michael@0: sourceToken->listHeader->first = sourceToken; michael@0: } michael@0: sourceToken->previous = lastToken->previous; michael@0: lastToken->previous = sourceToken; michael@0: } michael@0: } else { /* repeated one thing twice in rules, stay with the stronger strength */ michael@0: if(lastStrength < sourceToken->strength) { michael@0: sourceToken->strength = lastStrength; michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* if the token was a variable top, we're gonna put it in */ michael@0: if(variableTop == TRUE && src->varTop == NULL) { michael@0: variableTop = FALSE; michael@0: src->varTop = sourceToken; michael@0: } michael@0: michael@0: // Treat the expansions. michael@0: // There are two types of expansions: explicit (x / y) and reset based propagating expansions michael@0: // (&abc * d * e <=> &ab * d / c * e / c) michael@0: // if both of them are in effect for a token, they are combined. 
michael@0: michael@0: sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; michael@0: michael@0: if(expandNext != 0) { michael@0: if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */ michael@0: expandNext = 0; michael@0: } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */ michael@0: sourceToken->expansion = expandNext; michael@0: } else { /* there is both explicit and implicit expansion. We need to make a combination */ michael@0: uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar)); michael@0: uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar)); michael@0: sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source)); michael@0: src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen; michael@0: } michael@0: } michael@0: michael@0: // This is just for debugging purposes michael@0: if(sourceToken->expansion != 0) { michael@0: sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); michael@0: } else { michael@0: sourceToken->debugExpansion = 0; michael@0: } michael@0: // if the previous token was a reset before, the strength of this michael@0: // token must match the strength of before. Otherwise we have an michael@0: // undefined situation. michael@0: // In other words, we currently have a cludge which we use to michael@0: // represent &a >> x. This is written as &[before 2]a << x. 
michael@0: if((lastToken->flags & UCOL_TOK_BEFORE) != 0) { michael@0: uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1; michael@0: if(beforeStrength != sourceToken->strength) { michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); michael@0: DBG_FORMAT_ERROR michael@0: return 0; michael@0: } michael@0: } michael@0: } else { michael@0: if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) { michael@0: /* if the previous token was also a reset, */ michael@0: /*this means that we have two consecutive resets */ michael@0: /* and we want to remove the previous one if empty*/ michael@0: if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { michael@0: src->resultLen--; michael@0: } michael@0: } michael@0: michael@0: if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */ michael@0: uint32_t searchCharsLen = src->parsedToken.charsLen; michael@0: while(searchCharsLen > 1 && sourceToken == NULL) { michael@0: searchCharsLen--; michael@0: //key = searchCharsLen << 24 | charsOffset; michael@0: UColToken key; michael@0: key.source = searchCharsLen << 24 | src->parsedToken.charsOffset; michael@0: key.rulesToParseHdl = &(src->source); michael@0: sourceToken = (UColToken *)uhash_get(src->tailored, &key); michael@0: } michael@0: if(sourceToken != NULL) { michael@0: expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen); michael@0: } michael@0: } michael@0: michael@0: if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */ michael@0: if(top == FALSE) { /* there is no indirection */ michael@0: uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; michael@0: if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { michael@0: /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */ michael@0: 
while(sourceToken->strength > strength && sourceToken->previous != NULL) { michael@0: sourceToken = sourceToken->previous; michael@0: } michael@0: /* here, either we hit the strength or NULL */ michael@0: if(sourceToken->strength == strength) { michael@0: if(sourceToken->previous != NULL) { michael@0: sourceToken = sourceToken->previous; michael@0: } else { /* start of list */ michael@0: sourceToken = sourceToken->listHeader->reset; michael@0: } michael@0: } else { /* we hit NULL */ michael@0: /* we should be doing the else part */ michael@0: sourceToken = sourceToken->listHeader->reset; michael@0: sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); michael@0: } michael@0: } else { michael@0: sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); michael@0: } michael@0: } else { /* this is both before and indirection */ michael@0: top = FALSE; michael@0: ListList[src->resultLen].previousCE = 0; michael@0: ListList[src->resultLen].previousContCE = 0; michael@0: ListList[src->resultLen].indirect = TRUE; michael@0: /* we need to do slightly more work. we need to get the baseCE using the */ michael@0: /* inverse UCA & getPrevious. 
The next bound is not set, and will be decided */ michael@0: /* in ucol_bld */ michael@0: uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; michael@0: uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; michael@0: uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F; michael@0: uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; michael@0: michael@0: UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); michael@0: if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && michael@0: (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ michael@0: uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16); michael@0: uint32_t raw = uprv_uca_getRawFromImplicit(primary); michael@0: uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); michael@0: CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; michael@0: SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER; michael@0: } else { michael@0: /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/ michael@0: ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); michael@0: } michael@0: michael@0: ListList[src->resultLen].baseCE = CE; michael@0: ListList[src->resultLen].baseContCE = SecondCE; michael@0: ListList[src->resultLen].nextCE = 0; michael@0: ListList[src->resultLen].nextContCE = 0; michael@0: michael@0: sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); michael@0: } michael@0: } michael@0: michael@0: michael@0: /* 5 If the relation is a reset: michael@0: If sourceToken is null michael@0: Create new list, create new sourceToken, make the baseCE from source, put michael@0: the sourceToken in ListHeader of the new list */ michael@0: if(sourceToken == NULL) { michael@0: /* michael@0: 3 Consider each item: relation, source, and 
expansion: e.g. ...< x / y ... michael@0: First convert all expansions into normal form. Examples: michael@0: If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * michael@0: d * ... into &x * c/y * d * ... michael@0: Note: reset values can never have expansions, although they can cause the michael@0: very next item to have one. They may be contractions, if they are found michael@0: earlier in the list. michael@0: */ michael@0: if(top == FALSE) { michael@0: collIterate s; michael@0: uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; michael@0: michael@0: uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status); michael@0: michael@0: CE = ucol_getNextCE(src->UCA, &s, status); michael@0: const UChar *expand = s.pos; michael@0: SecondCE = ucol_getNextCE(src->UCA, &s, status); michael@0: michael@0: ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F; michael@0: if(isContinuation(SecondCE)) { michael@0: ListList[src->resultLen].baseContCE = SecondCE; michael@0: } else { michael@0: ListList[src->resultLen].baseContCE = 0; michael@0: } michael@0: ListList[src->resultLen].nextCE = 0; michael@0: ListList[src->resultLen].nextContCE = 0; michael@0: ListList[src->resultLen].previousCE = 0; michael@0: ListList[src->resultLen].previousContCE = 0; michael@0: ListList[src->resultLen].indirect = FALSE; michael@0: sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status); michael@0: } else { /* top == TRUE */ michael@0: /* just use the supplied values */ michael@0: top = FALSE; michael@0: ListList[src->resultLen].previousCE = 0; michael@0: ListList[src->resultLen].previousContCE = 0; michael@0: ListList[src->resultLen].indirect = TRUE; michael@0: ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; michael@0: ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE; michael@0: 
ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE; michael@0: ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE; michael@0: michael@0: sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); michael@0: michael@0: } michael@0: } else { /* reset to something already in rules */ michael@0: top = FALSE; michael@0: } michael@0: } michael@0: /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */ michael@0: lastToken = sourceToken; michael@0: } else { michael@0: if(U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: } michael@0: } michael@0: #ifdef DEBUG_FOR_CODE_POINTS michael@0: fclose(dfcp_fp); michael@0: #endif michael@0: michael@0: michael@0: if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { michael@0: src->resultLen--; michael@0: } michael@0: return src->resultLen; michael@0: } michael@0: michael@0: const UChar* ucol_tok_getRulesFromBundle( michael@0: void* /*context*/, michael@0: const char* locale, michael@0: const char* type, michael@0: int32_t* pLength, michael@0: UErrorCode* status) michael@0: { michael@0: const UChar* rules = NULL; michael@0: UResourceBundle* bundle; michael@0: UResourceBundle* collations; michael@0: UResourceBundle* collation; michael@0: michael@0: *pLength = 0; michael@0: michael@0: bundle = ures_open(U_ICUDATA_COLL, locale, status); michael@0: if(U_SUCCESS(*status)){ michael@0: collations = ures_getByKey(bundle, "collations", NULL, status); michael@0: if(U_SUCCESS(*status)){ michael@0: collation = ures_getByKey(collations, type, NULL, status); michael@0: if(U_SUCCESS(*status)){ michael@0: rules = ures_getStringByKey(collation, "Sequence", pLength, status); michael@0: if(U_FAILURE(*status)){ michael@0: *pLength = 0; michael@0: rules = NULL; michael@0: } michael@0: ures_close(collation); michael@0: } michael@0: ures_close(collations); michael@0: } michael@0: } 
michael@0: michael@0: ures_close(bundle); michael@0: michael@0: return rules; michael@0: } michael@0: michael@0: void ucol_tok_initTokenList( michael@0: UColTokenParser *src, michael@0: const UChar *rules, michael@0: uint32_t rulesLength, michael@0: const UCollator *UCA, michael@0: GetCollationRulesFunction importFunc, michael@0: void* context, michael@0: UErrorCode *status) { michael@0: U_NAMESPACE_USE michael@0: michael@0: uint32_t nSize = 0; michael@0: uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE); michael@0: michael@0: bool needToDeallocRules = false; michael@0: michael@0: if(U_FAILURE(*status)) { michael@0: return; michael@0: } michael@0: michael@0: // set everything to zero, so that we can clean up gracefully michael@0: uprv_memset(src, 0, sizeof(UColTokenParser)); michael@0: michael@0: // first we need to find options that don't like to be normalized, michael@0: // like copy and remove... michael@0: //const UChar *openBrace = rules; michael@0: int32_t optionNumber = -1; michael@0: const UChar *setStart = NULL; michael@0: uint32_t i = 0; michael@0: while(i < rulesLength) { michael@0: if(rules[i] == 0x005B) { // '[': start of an option michael@0: /* Gets the following: michael@0: optionNumber: The index of the option. michael@0: setStart: The pointer at which the option arguments start. 
michael@0: */ michael@0: optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart); michael@0: michael@0: if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */ michael@0: // [optimize] michael@0: USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); michael@0: if(U_SUCCESS(*status)) { michael@0: if(src->copySet == NULL) { michael@0: src->copySet = newSet; michael@0: } else { michael@0: uset_addAll(src->copySet, newSet); michael@0: uset_close(newSet); michael@0: } michael@0: } else { michael@0: return; michael@0: } michael@0: } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) { michael@0: USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); michael@0: if(U_SUCCESS(*status)) { michael@0: if(src->removeSet == NULL) { michael@0: src->removeSet = newSet; michael@0: } else { michael@0: uset_addAll(src->removeSet, newSet); michael@0: uset_close(newSet); michael@0: } michael@0: } else { michael@0: return; michael@0: } michael@0: } else if(optionNumber == OPTION_IMPORT){ michael@0: // [import ] michael@0: michael@0: // Find the address of the closing ]. michael@0: UChar* import_end = u_strchr(setStart, 0x005D); michael@0: int32_t optionEndOffset = (int32_t)(import_end + 1 - rules); michael@0: // Ignore trailing whitespace. 
michael@0: while(PatternProps::isWhiteSpace(*(import_end-1))) { michael@0: --import_end; michael@0: } michael@0: michael@0: int32_t optionLength = (int32_t)(import_end - setStart); michael@0: char option[50]; michael@0: if(optionLength >= (int32_t)sizeof(option)) { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: u_UCharsToChars(setStart, option, optionLength); michael@0: option[optionLength] = 0; michael@0: michael@0: *status = U_ZERO_ERROR; michael@0: char locale[50]; michael@0: int32_t templ; michael@0: uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status); michael@0: if(U_FAILURE(*status)) { michael@0: *status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: michael@0: char type[50]; michael@0: if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 || michael@0: U_FAILURE(*status) michael@0: ) { michael@0: *status = U_ZERO_ERROR; michael@0: uprv_strcpy(type, "standard"); michael@0: } michael@0: michael@0: // TODO: Use public functions when available, see ticket #8134. michael@0: char *keywords = (char *)locale_getKeywordsStart(locale); michael@0: if(keywords != NULL) { michael@0: *keywords = 0; michael@0: } michael@0: michael@0: int32_t importRulesLength = 0; michael@0: const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status); michael@0: michael@0: #ifdef DEBUG_FOR_COLL_RULES michael@0: std::string s; michael@0: UnicodeString(importRules).toUTF8String(s); michael@0: std::cout << "Import rules = " << s << std::endl; michael@0: #endif michael@0: michael@0: // Add the length of the imported rules to length of the original rules, michael@0: // and subtract the length of the import option. 
michael@0: uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i); michael@0: michael@0: UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar)); michael@0: michael@0: #ifdef DEBUG_FOR_COLL_RULES michael@0: std::string s1; michael@0: UnicodeString(rules).toUTF8String(s1); michael@0: std::cout << "Original rules = " << s1 << std::endl; michael@0: #endif michael@0: michael@0: michael@0: // Copy the section of the original rules leading up to the import michael@0: uprv_memcpy(newRules, rules, i*sizeof(UChar)); michael@0: // Copy the imported rules michael@0: uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar)); michael@0: // Copy the rest of the original rules (minus the import option itself) michael@0: uprv_memcpy(newRules+i+importRulesLength, michael@0: rules+optionEndOffset, michael@0: (rulesLength-optionEndOffset)*sizeof(UChar)); michael@0: michael@0: #ifdef DEBUG_FOR_COLL_RULES michael@0: std::string s2; michael@0: UnicodeString(newRules).toUTF8String(s2); michael@0: std::cout << "Resulting rules = " << s2 << std::endl; michael@0: #endif michael@0: michael@0: if(needToDeallocRules){ michael@0: // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free michael@0: uprv_free((void*)rules); michael@0: } michael@0: needToDeallocRules = true; michael@0: rules = newRules; michael@0: rulesLength = newRulesLength; michael@0: michael@0: estimatedSize += importRulesLength*2; michael@0: michael@0: // First character of the new rules needs to be processed michael@0: i--; michael@0: } michael@0: } michael@0: //openBrace++; michael@0: i++; michael@0: } michael@0: michael@0: src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar)); michael@0: /* test for NULL */ michael@0: if (src->source == NULL) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: uprv_memset(src->source, 0, estimatedSize*sizeof(UChar)); michael@0: nSize = unorm_normalize(rules, 
rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status); michael@0: if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) { michael@0: *status = U_ZERO_ERROR; michael@0: src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar)); michael@0: /* test for NULL */ michael@0: if (src->source == NULL) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status); michael@0: } michael@0: if(needToDeallocRules){ michael@0: // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free michael@0: uprv_free((void*)rules); michael@0: } michael@0: michael@0: michael@0: src->current = src->source; michael@0: src->end = src->source+nSize; michael@0: src->sourceCurrent = src->source; michael@0: src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly michael@0: src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE; michael@0: src->varTop = NULL; michael@0: src->UCA = UCA; michael@0: src->invUCA = ucol_initInverseUCA(status); michael@0: src->parsedToken.charsLen = 0; michael@0: src->parsedToken.charsOffset = 0; michael@0: src->parsedToken.extensionLen = 0; michael@0: src->parsedToken.extensionOffset = 0; michael@0: src->parsedToken.prefixLen = 0; michael@0: src->parsedToken.prefixOffset = 0; michael@0: src->parsedToken.flags = 0; michael@0: src->parsedToken.strength = UCOL_TOK_UNSET; michael@0: src->buildCCTabFlag = FALSE; michael@0: src->isStarred = FALSE; michael@0: src->inRange = FALSE; michael@0: src->lastRangeCp = 0; michael@0: src->previousCp = 0; michael@0: michael@0: if(U_FAILURE(*status)) { michael@0: return; michael@0: } michael@0: src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status); michael@0: if(U_FAILURE(*status)) { 
michael@0: return; michael@0: } michael@0: uhash_setValueDeleter(src->tailored, uprv_free); michael@0: michael@0: src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); michael@0: /* test for NULL */ michael@0: if (src->opts == NULL) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: michael@0: uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet)); michael@0: michael@0: src->lh = 0; michael@0: src->listCapacity = 1024; michael@0: src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader)); michael@0: //Test for NULL michael@0: if (src->lh == NULL) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader)); michael@0: src->resultLen = 0; michael@0: michael@0: UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); michael@0: michael@0: // UCOL_RESET_TOP_VALUE michael@0: setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); michael@0: // UCOL_FIRST_PRIMARY_IGNORABLE michael@0: setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0); michael@0: // UCOL_LAST_PRIMARY_IGNORABLE michael@0: setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0); michael@0: // UCOL_FIRST_SECONDARY_IGNORABLE michael@0: setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0); michael@0: // UCOL_LAST_SECONDARY_IGNORABLE michael@0: setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0); michael@0: // UCOL_FIRST_TERTIARY_IGNORABLE michael@0: setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0); michael@0: // UCOL_LAST_TERTIARY_IGNORABLE michael@0: setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0); michael@0: // UCOL_FIRST_VARIABLE michael@0: setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0); michael@0: // UCOL_LAST_VARIABLE michael@0: setIndirectBoundaries(8, 
consts->UCA_LAST_VARIABLE, 0); michael@0: // UCOL_FIRST_NON_VARIABLE michael@0: setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0); michael@0: // UCOL_LAST_NON_VARIABLE michael@0: setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); michael@0: // UCOL_FIRST_IMPLICIT michael@0: setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0); michael@0: // UCOL_LAST_IMPLICIT michael@0: setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING); michael@0: // UCOL_FIRST_TRAILING michael@0: setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0); michael@0: // UCOL_LAST_TRAILING michael@0: setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0); michael@0: ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24); michael@0: } michael@0: michael@0: michael@0: void ucol_tok_closeTokenList(UColTokenParser *src) { michael@0: if(src->copySet != NULL) { michael@0: uset_close(src->copySet); michael@0: } michael@0: if(src->removeSet != NULL) { michael@0: uset_close(src->removeSet); michael@0: } michael@0: if(src->tailored != NULL) { michael@0: uhash_close(src->tailored); michael@0: } michael@0: if(src->lh != NULL) { michael@0: uprv_free(src->lh); michael@0: } michael@0: if(src->source != NULL) { michael@0: uprv_free(src->source); michael@0: } michael@0: if(src->opts != NULL) { michael@0: uprv_free(src->opts); michael@0: } michael@0: if (src->reorderCodes != NULL) { michael@0: uprv_free(src->reorderCodes); michael@0: } michael@0: } michael@0: michael@0: #endif /* #if !UCONFIG_NO_COLLATION */