michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2001-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: ucol_bld.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created 02/22/2001 michael@0: * created by: Vladimir Weinstein michael@0: * michael@0: * This module builds a collator based on the rule set. michael@0: * michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_COLLATION michael@0: michael@0: #include "unicode/ucoleitr.h" michael@0: #include "unicode/udata.h" michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/uscript.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/utf16.h" michael@0: #include "normalizer2impl.h" michael@0: #include "uassert.h" michael@0: #include "ucol_bld.h" michael@0: #include "ucol_elm.h" michael@0: #include "ucol_cnt.h" michael@0: #include "ucln_in.h" michael@0: #include "umutex.h" michael@0: #include "cmemory.h" michael@0: #include "cstring.h" michael@0: michael@0: #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) michael@0: michael@0: static const InverseUCATableHeader* _staticInvUCA = NULL; michael@0: static UDataMemory* invUCA_DATA_MEM = NULL; michael@0: static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER; michael@0: michael@0: U_CDECL_BEGIN michael@0: static UBool U_CALLCONV michael@0: isAcceptableInvUCA(void * /*context*/, michael@0: const char * /*type*/, const char * /*name*/, michael@0: const UDataInfo *pInfo) michael@0: { michael@0: /* context, type & name are intentionally not used */ michael@0: if( pInfo->size>=20 && michael@0: 
pInfo->isBigEndian==U_IS_BIG_ENDIAN && michael@0: pInfo->charsetFamily==U_CHARSET_FAMILY && michael@0: pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */ michael@0: pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 && michael@0: pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 && michael@0: pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 && michael@0: pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 && michael@0: pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&& michael@0: //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 && michael@0: //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 && michael@0: //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 && michael@0: ) michael@0: { michael@0: // TODO: Check that the invuca data version (pInfo->dataVersion) michael@0: // matches the ucadata version. michael@0: return TRUE; michael@0: } else { michael@0: return FALSE; michael@0: } michael@0: } michael@0: U_CDECL_END michael@0: michael@0: /* michael@0: * Takes two CEs (lead and continuation) and michael@0: * compares them as CEs should be compared: michael@0: * primary vs. primary, secondary vs. secondary michael@0: * tertiary vs. 
tertiary michael@0: */ michael@0: static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) { michael@0: uint32_t s1 = source0, s2, t1 = target0, t2; michael@0: if(isContinuation(source1)) { michael@0: s2 = source1; michael@0: } else { michael@0: s2 = 0; michael@0: } michael@0: if(isContinuation(target1)) { michael@0: t2 = target1; michael@0: } else { michael@0: t2 = 0; michael@0: } michael@0: michael@0: uint32_t s = 0, t = 0; michael@0: if(s1 == t1 && s2 == t2) { michael@0: return 0; michael@0: } michael@0: s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); michael@0: t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); michael@0: if(s < t) { michael@0: return -1; michael@0: } else if(s > t) { michael@0: return 1; michael@0: } else { michael@0: s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; michael@0: t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; michael@0: if(s < t) { michael@0: return -1; michael@0: } else if(s > t) { michael@0: return 1; michael@0: } else { michael@0: s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); michael@0: t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); michael@0: if(s < t) { michael@0: return -1; michael@0: } else { michael@0: return 1; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: static michael@0: int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) { michael@0: uint32_t bottom = 0, top = src->invUCA->tableSize; michael@0: uint32_t i = 0; michael@0: uint32_t first = 0, second = 0; michael@0: uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); michael@0: int32_t res = 0; michael@0: michael@0: while(bottom < top-1) { michael@0: i = (top+bottom)/2; michael@0: first = *(CETable+3*i); michael@0: second = *(CETable+3*i+1); michael@0: res = compareCEs(first, second, CE, SecondCE); michael@0: if(res > 0) { michael@0: top = i; michael@0: } else if(res < 0) { michael@0: bottom = i; michael@0: } else { michael@0: break; michael@0: } 
michael@0: } michael@0: michael@0: /* weiv: */ michael@0: /* in searching for elements, I have removed the failure */ michael@0: /* The reason for this is that the builder does not rely */ michael@0: /* on search mechanism telling it that it didn't find an */ michael@0: /* element. However, indirect positioning relies on being */ michael@0: /* able to find the elements around any CE, even if it is */ michael@0: /* not defined in the UCA. */ michael@0: return i; michael@0: /* michael@0: if((first == CE && second == SecondCE)) { michael@0: return i; michael@0: } else { michael@0: return -1; michael@0: } michael@0: */ michael@0: } michael@0: michael@0: static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { michael@0: 0xFFFF0000, michael@0: 0xFFFFFF00, michael@0: 0xFFFFFFFF michael@0: }; michael@0: michael@0: U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src, michael@0: uint32_t CE, uint32_t contCE, michael@0: uint32_t *nextCE, uint32_t *nextContCE, michael@0: uint32_t strength) michael@0: { michael@0: uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); michael@0: int32_t iCE; michael@0: michael@0: iCE = ucol_inv_findCE(src, CE, contCE); michael@0: michael@0: if(iCE<0) { michael@0: *nextCE = UCOL_NOT_FOUND; michael@0: return -1; michael@0: } michael@0: michael@0: CE &= strengthMask[strength]; michael@0: contCE &= strengthMask[strength]; michael@0: michael@0: *nextCE = CE; michael@0: *nextContCE = contCE; michael@0: michael@0: while((*nextCE & strengthMask[strength]) == CE michael@0: && (*nextContCE & strengthMask[strength]) == contCE) michael@0: { michael@0: *nextCE = (*(CETable+3*(++iCE))); michael@0: *nextContCE = (*(CETable+3*(iCE)+1)); michael@0: } michael@0: michael@0: return iCE; michael@0: } michael@0: michael@0: U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src, michael@0: uint32_t CE, uint32_t contCE, michael@0: uint32_t *prevCE, uint32_t *prevContCE, michael@0: uint32_t strength) 
michael@0: { michael@0: uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); michael@0: int32_t iCE; michael@0: michael@0: iCE = ucol_inv_findCE(src, CE, contCE); michael@0: michael@0: if(iCE<0) { michael@0: *prevCE = UCOL_NOT_FOUND; michael@0: return -1; michael@0: } michael@0: michael@0: CE &= strengthMask[strength]; michael@0: contCE &= strengthMask[strength]; michael@0: michael@0: *prevCE = CE; michael@0: *prevContCE = contCE; michael@0: michael@0: while((*prevCE & strengthMask[strength]) == CE michael@0: && (*prevContCE & strengthMask[strength])== contCE michael@0: && iCE > 0) /* this condition should prevent falling off the edge of the world */ michael@0: { michael@0: /* here, we end up in a singularity - zero */ michael@0: *prevCE = (*(CETable+3*(--iCE))); michael@0: *prevContCE = (*(CETable+3*(iCE)+1)); michael@0: } michael@0: michael@0: return iCE; michael@0: } michael@0: michael@0: U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE, michael@0: uint32_t prevCE, uint32_t prevContCE) michael@0: { michael@0: if(prevCE == CE && prevContCE == contCE) { michael@0: return UCOL_IDENTICAL; michael@0: } michael@0: if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY]) michael@0: || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY])) michael@0: { michael@0: return UCOL_PRIMARY; michael@0: } michael@0: if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY]) michael@0: || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY])) michael@0: { michael@0: return UCOL_SECONDARY; michael@0: } michael@0: return UCOL_TERTIARY; michael@0: } michael@0: michael@0: michael@0: /*static michael@0: inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { michael@0: michael@0: uint32_t CE = lh->baseCE; michael@0: uint32_t SecondCE = lh->baseContCE; michael@0: michael@0: 
uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); michael@0: uint32_t previousCE, previousContCE; michael@0: int32_t iCE; michael@0: michael@0: iCE = ucol_inv_findCE(src, CE, SecondCE); michael@0: michael@0: if(iCE<0) { michael@0: return -1; michael@0: } michael@0: michael@0: CE &= strengthMask[strength]; michael@0: SecondCE &= strengthMask[strength]; michael@0: michael@0: previousCE = CE; michael@0: previousContCE = SecondCE; michael@0: michael@0: while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) { michael@0: previousCE = (*(CETable+3*(--iCE))); michael@0: previousContCE = (*(CETable+3*(iCE)+1)); michael@0: } michael@0: lh->previousCE = previousCE; michael@0: lh->previousContCE = previousContCE; michael@0: michael@0: return iCE; michael@0: }*/ michael@0: michael@0: static michael@0: inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { michael@0: uint32_t CE = lh->baseCE; michael@0: uint32_t SecondCE = lh->baseContCE; michael@0: michael@0: uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); michael@0: uint32_t nextCE, nextContCE; michael@0: int32_t iCE; michael@0: michael@0: iCE = ucol_inv_findCE(src, CE, SecondCE); michael@0: michael@0: if(iCE<0) { michael@0: return -1; michael@0: } michael@0: michael@0: CE &= strengthMask[strength]; michael@0: SecondCE &= strengthMask[strength]; michael@0: michael@0: nextCE = CE; michael@0: nextContCE = SecondCE; michael@0: michael@0: while((nextCE & strengthMask[strength]) == CE michael@0: && (nextContCE & strengthMask[strength]) == SecondCE) michael@0: { michael@0: nextCE = (*(CETable+3*(++iCE))); michael@0: nextContCE = (*(CETable+3*(iCE)+1)); michael@0: } michael@0: michael@0: lh->nextCE = nextCE; michael@0: lh->nextContCE = nextContCE; michael@0: michael@0: return iCE; michael@0: } michael@0: michael@0: static void ucol_inv_getGapPositions(UColTokenParser *src, 
UColTokListHeader *lh, UErrorCode *status) { michael@0: /* reset all the gaps */ michael@0: int32_t i = 0; michael@0: uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); michael@0: uint32_t st = 0; michael@0: uint32_t t1, t2; michael@0: int32_t pos; michael@0: michael@0: UColToken *tok = lh->first; michael@0: uint32_t tokStrength = tok->strength; michael@0: michael@0: for(i = 0; i<3; i++) { michael@0: lh->gapsHi[3*i] = 0; michael@0: lh->gapsHi[3*i+1] = 0; michael@0: lh->gapsHi[3*i+2] = 0; michael@0: lh->gapsLo[3*i] = 0; michael@0: lh->gapsLo[3*i+1] = 0; michael@0: lh->gapsLo[3*i+2] = 0; michael@0: lh->numStr[i] = 0; michael@0: lh->fStrToken[i] = NULL; michael@0: lh->lStrToken[i] = NULL; michael@0: lh->pos[i] = -1; michael@0: } michael@0: michael@0: UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); michael@0: michael@0: if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ michael@0: //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */ michael@0: lh->pos[0] = 0; michael@0: t1 = lh->baseCE; michael@0: t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION; michael@0: lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; michael@0: lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; michael@0: lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; michael@0: uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16); michael@0: primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1); michael@0: michael@0: t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; michael@0: t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER; michael@0: michael@0: lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 
michael@0: lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; michael@0: lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; michael@0: } else if(lh->indirect == TRUE && lh->nextCE != 0) { michael@0: //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) { michael@0: lh->pos[0] = 0; michael@0: t1 = lh->baseCE; michael@0: t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION; michael@0: lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; michael@0: lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; michael@0: lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; michael@0: t1 = lh->nextCE; michael@0: t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION; michael@0: lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; michael@0: lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; michael@0: lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; michael@0: } else { michael@0: for(;;) { michael@0: if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { michael@0: if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) { michael@0: lh->fStrToken[tokStrength] = tok; michael@0: } else { /* The CE must be implicit, since it's not in the table */ michael@0: /* Error */ michael@0: *status = U_INTERNAL_PROGRAM_ERROR; michael@0: } michael@0: } michael@0: michael@0: while(tok != NULL && tok->strength >= tokStrength) { michael@0: if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { michael@0: lh->lStrToken[tokStrength] = tok; michael@0: } michael@0: tok = tok->next; michael@0: } michael@0: if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) { michael@0: /* check if previous interval is the same and merge the intervals if it is so */ michael@0: if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) { michael@0: lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1]; michael@0: 
/*
 * Stores in noOfBytes how many of the masks 0xFFFFFFFF, 0x00FFFFFF,
 * 0x0000FFFF and 0x000000FF still select a non-zero part of value, i.e. the
 * number of bytes, counted from the most significant one, up to and
 * including the lowest non-zero byte (0 for a zero value).
 *
 * value is evaluated once per mask, so it must be free of side effects;
 * noOfBytes must be a modifiable lvalue.
 *
 * The body is wrapped in do { } while(0) so the macro behaves as a single
 * statement (safe before an else), and the scratch variable carries a
 * macro-specific name so it cannot capture a caller argument named "mask".
 */
#define ucol_countBytes(value, noOfBytes)               \
do {                                                    \
    uint32_t ucolCountBytesMask_ = 0xFFFFFFFF;          \
    (noOfBytes) = 0;                                    \
    while(ucolCountBytesMask_ != 0) {                   \
        if(((value) & ucolCountBytesMask_) != 0) {      \
            (noOfBytes)++;                              \
        }                                               \
        ucolCountBytesMask_ >>= 8;                      \
    }                                                   \
} while(0)
/* TODO: rename to enum names */ michael@0: uint32_t high, low, count=1; michael@0: uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF; michael@0: michael@0: if(strength == UCOL_SECONDARY) { michael@0: low = UCOL_COMMON_TOP2<<24; michael@0: high = 0xFFFFFFFF; michael@0: count = 0xFF - UCOL_COMMON_TOP2; michael@0: } else { michael@0: low = UCOL_BYTE_COMMON << 24; //0x05000000; michael@0: high = 0x40000000; michael@0: count = 0x40 - UCOL_BYTE_COMMON; michael@0: } michael@0: michael@0: if(tok->next != NULL && tok->next->strength == strength) { michael@0: count = tok->next->toInsert; michael@0: } michael@0: michael@0: g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); michael@0: g->current = UCOL_BYTE_COMMON<<24; michael@0: michael@0: if(g->noOfRanges == 0) { michael@0: *status = U_INTERNAL_PROGRAM_ERROR; michael@0: } michael@0: return g->current; michael@0: } michael@0: michael@0: static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) { michael@0: uint32_t strength = tok->strength; michael@0: uint32_t low = lows[fStrength*3+strength]; michael@0: uint32_t high = highs[fStrength*3+strength]; michael@0: uint32_t maxByte = 0; michael@0: if(strength == UCOL_TERTIARY) { michael@0: maxByte = 0x3F; michael@0: } else if(strength == UCOL_PRIMARY) { michael@0: maxByte = 0xFE; michael@0: } else { michael@0: maxByte = 0xFF; michael@0: } michael@0: michael@0: uint32_t count = tok->toInsert; michael@0: michael@0: if(low >= high && strength > UCOL_PRIMARY) { michael@0: int32_t s = strength; michael@0: for(;;) { michael@0: s--; michael@0: if(lows[fStrength*3+s] != highs[fStrength*3+s]) { michael@0: if(strength == UCOL_SECONDARY) { michael@0: if (low < UCOL_COMMON_TOP2<<24 ) { michael@0: // Override if low range is less than UCOL_COMMON_TOP2. 
michael@0: low = UCOL_COMMON_TOP2<<24; michael@0: } michael@0: high = 0xFFFFFFFF; michael@0: } else { michael@0: // Override if low range is less than UCOL_COMMON_BOT3. michael@0: if ( low < UCOL_COMMON_BOT3<<24 ) { michael@0: low = UCOL_COMMON_BOT3<<24; michael@0: } michael@0: high = 0x40000000; michael@0: } michael@0: break; michael@0: } michael@0: if(s<0) { michael@0: *status = U_INTERNAL_PROGRAM_ERROR; michael@0: return 0; michael@0: } michael@0: } michael@0: } michael@0: michael@0: if(low < 0x02000000) { michael@0: // We must not use CE weight byte 02, so we set it as the minimum lower bound. michael@0: // See http://site.icu-project.org/design/collation/bytes michael@0: low = 0x02000000; michael@0: } michael@0: michael@0: if(strength == UCOL_SECONDARY) { /* similar as simple */ michael@0: if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) { michael@0: low = UCOL_COMMON_TOP2<<24; michael@0: } michael@0: if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) { michael@0: high = UCOL_COMMON_TOP2<<24; michael@0: } michael@0: if(low < (UCOL_COMMON_BOT2<<24)) { michael@0: g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges); michael@0: g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); michael@0: //g->current = UCOL_COMMON_BOT2<<24; michael@0: return g->current; michael@0: } michael@0: } michael@0: michael@0: g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); michael@0: if(g->noOfRanges == 0) { michael@0: *status = U_INTERNAL_PROGRAM_ERROR; michael@0: } michael@0: g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); michael@0: return g->current; michael@0: } michael@0: michael@0: static michael@0: uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { michael@0: uint32_t i = 0; michael@0: UChar c; michael@0: michael@0: if(U_FAILURE(*status)) { michael@0: return 0; 
michael@0: } michael@0: michael@0: if(sourceLen > resLen) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: for(i = 0; i < sourceLen; i++) { michael@0: c = source[i]; michael@0: if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ michael@0: switch(c - 0x3000) { michael@0: case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E: michael@0: case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE: michael@0: c++; michael@0: break; michael@0: case 0xF5: michael@0: c = 0x30AB; michael@0: break; michael@0: case 0xF6: michael@0: c = 0x30B1; michael@0: break; michael@0: } michael@0: } michael@0: resBuf[i] = c; michael@0: } michael@0: return sourceLen; michael@0: } michael@0: michael@0: static michael@0: uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { michael@0: uint32_t i = 0; michael@0: UChar c; michael@0: michael@0: if(U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: michael@0: if(sourceLen > resLen) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: for(i = 0; i < sourceLen; i++) { michael@0: c = source[i]; michael@0: if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ michael@0: switch(c - 0x3000) { michael@0: case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F: michael@0: case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF: michael@0: c--; michael@0: break; michael@0: case 0xAB: michael@0: c = 0x30F5; michael@0: break; michael@0: case 0xB1: michael@0: c = 0x30F6; michael@0: break; michael@0: } michael@0: } michael@0: resBuf[i] = c; michael@0: } michael@0: return sourceLen; michael@0: } michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: static michael@0: uint8_t 
ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) { michael@0: uint32_t i = 0; michael@0: UChar n[128]; michael@0: uint32_t nLen = 0; michael@0: uint32_t uCount = 0, lCount = 0; michael@0: michael@0: collIterate s; michael@0: uint32_t order = 0; michael@0: michael@0: if(U_FAILURE(*status)) { michael@0: return UCOL_LOWER_CASE; michael@0: } michael@0: michael@0: nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); michael@0: if(U_SUCCESS(*status)) { michael@0: for(i = 0; i < nLen; i++) { michael@0: uprv_init_collIterate(UCA, &n[i], 1, &s, status); michael@0: order = ucol_getNextCE(UCA, &s, status); michael@0: if(isContinuation(order)) { michael@0: *status = U_INTERNAL_PROGRAM_ERROR; michael@0: return UCOL_LOWER_CASE; michael@0: } michael@0: if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) { michael@0: uCount++; michael@0: } else { michael@0: if(u_islower(n[i])) { michael@0: lCount++; michael@0: } else if(U_SUCCESS(*status)) { michael@0: UChar sk[1], lk[1]; michael@0: u_toSmallKana(&n[i], 1, sk, 1, status); michael@0: u_toLargeKana(&n[i], 1, lk, 1, status); michael@0: if(sk[0] == n[i] && lk[0] != n[i]) { michael@0: lCount++; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: if(uCount != 0 && lCount != 0) { michael@0: return UCOL_MIXED_CASE; michael@0: } else if(uCount != 0) { michael@0: return UCOL_UPPER_CASE; michael@0: } else { michael@0: return UCOL_LOWER_CASE; michael@0: } michael@0: } michael@0: michael@0: michael@0: U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) { michael@0: /* this one makes the table and stuff */ michael@0: uint32_t noOfBytes[3]; michael@0: uint32_t i; michael@0: michael@0: for(i = 0; i<3; i++) { michael@0: ucol_countBytes(CEparts[i], noOfBytes[i]); michael@0: } michael@0: michael@0: /* Here we have to pack CEs from parts */ michael@0: michael@0: uint32_t CEi = 0; michael@0: uint32_t value = 0; 
michael@0: michael@0: while(2*CEi 0) { michael@0: value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ michael@0: } else { michael@0: value = 0; michael@0: } michael@0: michael@0: if(2*CEi>(32-16*(CEi+1))) & 0xFFFF) << 16; michael@0: } michael@0: if(CEi>(32-8*(CEi+1))) & 0xFF) << 8; michael@0: } michael@0: if(CEi>(32-8*(CEi+1))) & 0x3F); michael@0: } michael@0: tok->CEs[CEi] = value; michael@0: CEi++; michael@0: } michael@0: if(CEi == 0) { /* totally ignorable */ michael@0: tok->noOfCEs = 1; michael@0: tok->CEs[0] = 0; michael@0: } else { /* there is at least something */ michael@0: tok->noOfCEs = CEi; michael@0: } michael@0: michael@0: michael@0: // we want to set case bits here and now, not later. michael@0: // Case bits handling michael@0: if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables michael@0: tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field michael@0: int32_t cSize = (tok->source & 0xFF000000) >> 24; michael@0: UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source; michael@0: michael@0: if(cSize > 1) { michael@0: // Do it manually michael@0: tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status); michael@0: } else { michael@0: // Copy it from the UCA michael@0: uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status); michael@0: tok->CEs[0] |= (caseCE & 0xC0); michael@0: } michael@0: } michael@0: michael@0: #if UCOL_DEBUG==2 michael@0: fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2])); michael@0: for(i = 0; inoOfCEs; i++) { michael@0: fprintf(stderr, "%08X ", tok->CEs[i]); michael@0: } michael@0: fprintf(stderr, "\n"); michael@0: #endif michael@0: } michael@0: michael@0: U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { michael@0: ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT]; michael@0: uint32_t 
CEparts[UCOL_CE_STRENGTH_LIMIT]; michael@0: michael@0: UColToken *tok = lh->last; michael@0: uint32_t t[UCOL_STRENGTH_LIMIT]; michael@0: michael@0: uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t)); michael@0: michael@0: /* must initialize ranges to avoid memory check warnings */ michael@0: for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) { michael@0: uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges)); michael@0: } michael@0: michael@0: tok->toInsert = 1; michael@0: t[tok->strength] = 1; michael@0: michael@0: while(tok->previous != NULL) { michael@0: if(tok->previous->strength < tok->strength) { /* going up */ michael@0: t[tok->strength] = 0; michael@0: t[tok->previous->strength]++; michael@0: } else if(tok->previous->strength > tok->strength) { /* going down */ michael@0: t[tok->previous->strength] = 1; michael@0: } else { michael@0: t[tok->strength]++; michael@0: } michael@0: tok=tok->previous; michael@0: tok->toInsert = t[tok->strength]; michael@0: } michael@0: michael@0: tok->toInsert = t[tok->strength]; michael@0: ucol_inv_getGapPositions(src, lh, status); michael@0: michael@0: #if UCOL_DEBUG michael@0: fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE); michael@0: int32_t j = 2; michael@0: for(j = 2; j >= 0; j--) { michael@0: fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]); michael@0: fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]); michael@0: } michael@0: tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; michael@0: michael@0: do { michael@0: fprintf(stderr,"%i", tok->strength); michael@0: tok = tok->next; michael@0: } while(tok != NULL); michael@0: fprintf(stderr, "\n"); michael@0: michael@0: tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; michael@0: michael@0: do { michael@0: fprintf(stderr,"%i", tok->toInsert); michael@0: tok = tok->next; michael@0: } while(tok != NULL); michael@0: #endif michael@0: michael@0: tok = 
lh->first; michael@0: uint32_t fStrength = UCOL_IDENTICAL; michael@0: uint32_t initStrength = UCOL_IDENTICAL; michael@0: michael@0: michael@0: CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16; michael@0: CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8; michael@0: CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16; michael@0: michael@0: while (tok != NULL && U_SUCCESS(*status)) { michael@0: fStrength = tok->strength; michael@0: if(fStrength < initStrength) { michael@0: initStrength = fStrength; michael@0: if(lh->pos[fStrength] == -1) { michael@0: while(lh->pos[fStrength] == -1 && fStrength > 0) { michael@0: fStrength--; michael@0: } michael@0: if(lh->pos[fStrength] == -1) { michael@0: *status = U_INTERNAL_PROGRAM_ERROR; michael@0: return; michael@0: } michael@0: } michael@0: if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */ michael@0: CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; michael@0: CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1]; michael@0: /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */ michael@0: CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); michael@0: } else if(initStrength == UCOL_SECONDARY) { /* secondaries */ michael@0: CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; michael@0: /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/ michael@0: CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); michael@0: CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); michael@0: } else { /* primaries */ michael@0: /*CEparts[UCOL_PRIMARY] = 
ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/ michael@0: CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); michael@0: CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); michael@0: CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); michael@0: } michael@0: } else { michael@0: if(tok->strength == UCOL_TERTIARY) { michael@0: CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status); michael@0: } else if(tok->strength == UCOL_SECONDARY) { michael@0: CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status); michael@0: CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); michael@0: } else if(tok->strength == UCOL_PRIMARY) { michael@0: CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status); michael@0: CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); michael@0: CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); michael@0: } michael@0: } michael@0: ucol_doCE(src, CEparts, tok, status); michael@0: tok = tok->next; michael@0: } michael@0: } michael@0: michael@0: U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) { michael@0: UCAElements el; michael@0: UColToken *tok = lh->first; michael@0: UColToken *expt = NULL; michael@0: uint32_t i = 0, j = 0; michael@0: const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); michael@0: michael@0: while(tok != NULL && U_SUCCESS(*status)) { michael@0: /* first, check if there are any expansions */ michael@0: /* if there are expansions, we need to do a little bit more processing */ michael@0: /* since parts of expansion can be tailored, while 
others are not */ michael@0: if(tok->expansion != 0) { michael@0: uint32_t len = tok->expansion >> 24; michael@0: uint32_t currentSequenceLen = len; michael@0: uint32_t expOffset = tok->expansion & 0x00FFFFFF; michael@0: //uint32_t exp = currentSequenceLen | expOffset; michael@0: UColToken exp; michael@0: exp.source = currentSequenceLen | expOffset; michael@0: exp.rulesToParseHdl = &(src->source); michael@0: michael@0: while(len > 0) { michael@0: currentSequenceLen = len; michael@0: while(currentSequenceLen > 0) { michael@0: exp.source = (currentSequenceLen << 24) | expOffset; michael@0: if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */ michael@0: uint32_t noOfCEsToCopy = expt->noOfCEs; michael@0: for(j = 0; jexpCEs[tok->noOfExpCEs + j] = expt->CEs[j]; michael@0: } michael@0: tok->noOfExpCEs += noOfCEsToCopy; michael@0: // Smart people never try to add codepoints and CEs. michael@0: // For some odd reason, it won't work. 
michael@0: expOffset += currentSequenceLen; //noOfCEsToCopy; michael@0: len -= currentSequenceLen; //noOfCEsToCopy; michael@0: break; michael@0: } else { michael@0: currentSequenceLen--; michael@0: } michael@0: } michael@0: if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */ michael@0: /* will have to get one from UCA */ michael@0: /* first, get the UChars from the rules */ michael@0: /* then pick CEs out until there is no more and stuff them into expansion */ michael@0: collIterate s; michael@0: uint32_t order = 0; michael@0: uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status); michael@0: michael@0: for(;;) { michael@0: order = ucol_getNextCE(src->UCA, &s, status); michael@0: if(order == UCOL_NO_MORE_CES) { michael@0: break; michael@0: } michael@0: tok->expCEs[tok->noOfExpCEs++] = order; michael@0: } michael@0: expOffset++; michael@0: len--; michael@0: } michael@0: } michael@0: } else { michael@0: tok->noOfExpCEs = 0; michael@0: } michael@0: michael@0: /* set the ucaelement with obtained values */ michael@0: el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs; michael@0: /* copy CEs */ michael@0: for(i = 0; inoOfCEs; i++) { michael@0: el.CEs[i] = tok->CEs[i]; michael@0: } michael@0: for(i = 0; inoOfExpCEs; i++) { michael@0: el.CEs[i+tok->noOfCEs] = tok->expCEs[i]; michael@0: } michael@0: michael@0: /* copy UChars */ michael@0: // We kept prefix and source kind of together, as it is a kind of a contraction. michael@0: // However, now we have to slice the prefix off the main thing - michael@0: el.prefix = el.prefixChars; michael@0: el.cPoints = el.uchars; michael@0: if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the michael@0: // addPrefix function in ucol_elm. The reason is that we need to add both composed AND michael@0: // decomposed elements to the unsaf table. 
michael@0: el.prefixSize = tok->prefix>>24; michael@0: uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar)); michael@0: michael@0: el.cSize = (tok->source >> 24)-(tok->prefix>>24); michael@0: uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar)); michael@0: } else { michael@0: el.prefixSize = 0; michael@0: *el.prefix = 0; michael@0: michael@0: el.cSize = (tok->source >> 24); michael@0: uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); michael@0: } michael@0: if(src->UCA != NULL) { michael@0: for(i = 0; iimage->jamoSpecial = TRUE; michael@0: } michael@0: } michael@0: if (!src->buildCCTabFlag && el.cSize > 0) { michael@0: // Check the trailing canonical combining class (tccc) of the last character. michael@0: const UChar *s = el.cPoints + el.cSize; michael@0: uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s); michael@0: if ((fcd & 0xff) != 0) { michael@0: src->buildCCTabFlag = TRUE; michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* and then, add it */ michael@0: #if UCOL_DEBUG==2 michael@0: fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]); michael@0: #endif michael@0: uprv_uca_addAnElement(t, &el, status); michael@0: michael@0: #if UCOL_DEBUG_DUPLICATES michael@0: if(*status != U_ZERO_ERROR) { michael@0: fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource); michael@0: *status = U_ZERO_ERROR; michael@0: } michael@0: #endif michael@0: michael@0: tok = tok->next; michael@0: } michael@0: } michael@0: michael@0: U_CDECL_BEGIN michael@0: static UBool U_CALLCONV michael@0: _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) { michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: tempUCATable *t = (tempUCATable *)context; michael@0: if(value == 0) { michael@0: while(start < limit) { michael@0: uint32_t CE = 
utrie_get32(t->mapping, start, NULL); michael@0: if(CE == UCOL_NOT_FOUND) { michael@0: UCAElements el; michael@0: el.isThai = FALSE; michael@0: el.prefixSize = 0; michael@0: el.prefixChars[0] = 0; michael@0: el.prefix = el.prefixChars; michael@0: el.cPoints = el.uchars; michael@0: michael@0: el.cSize = 0; michael@0: U16_APPEND_UNSAFE(el.uchars, el.cSize, start); michael@0: michael@0: el.noOfCEs = 1; michael@0: el.CEs[0] = 0; michael@0: uprv_uca_addAnElement(t, &el, &status); michael@0: michael@0: } michael@0: start++; michael@0: } michael@0: } michael@0: if(U_FAILURE(status)) { michael@0: return FALSE; michael@0: } else { michael@0: return TRUE; michael@0: } michael@0: } michael@0: U_CDECL_END michael@0: michael@0: static void michael@0: ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t, michael@0: UChar32 start, UChar32 end, michael@0: UErrorCode *status) michael@0: { michael@0: //UChar decomp[256]; michael@0: uint32_t CE = UCOL_NOT_FOUND; michael@0: UChar32 u = 0; michael@0: UCAElements el; michael@0: el.isThai = FALSE; michael@0: el.prefixSize = 0; michael@0: el.prefixChars[0] = 0; michael@0: collIterate colIt; michael@0: michael@0: if(U_SUCCESS(*status)) { michael@0: for(u = start; u<=end; u++) { michael@0: if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND michael@0: /* this test is for contractions that are missing the starting element. 
*/ michael@0: || ((isCntTableElement(CE)) && michael@0: (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND)) michael@0: ) michael@0: { michael@0: el.cSize = 0; michael@0: U16_APPEND_UNSAFE(el.uchars, el.cSize, u); michael@0: //decomp[0] = (UChar)u; michael@0: //el.uchars[0] = (UChar)u; michael@0: el.cPoints = el.uchars; michael@0: //el.cSize = 1; michael@0: el.noOfCEs = 0; michael@0: el.prefix = el.prefixChars; michael@0: el.prefixSize = 0; michael@0: //uprv_init_collIterate(src->UCA, decomp, 1, &colIt); michael@0: // We actually want to check whether this element is a special michael@0: // If it is an implicit element (hangul, CJK - we want to copy the michael@0: // special, not the resolved CEs) - for hangul, copying resolved michael@0: // would just make things the same (there is an expansion and it michael@0: // takes approximately the same amount of time to resolve as michael@0: // falling back to the UCA). michael@0: /* michael@0: UTRIE_GET32(src->UCA->mapping, u, CE); michael@0: tag = getCETag(CE); michael@0: if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG michael@0: || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG michael@0: || tag == LEAD_SURROGATE_TAG) { michael@0: el.CEs[el.noOfCEs++] = CE; michael@0: } else { michael@0: */ michael@0: // It turns out that it does not make sense to keep implicits michael@0: // unresolved. The cost of resolving them is big enough so that michael@0: // it doesn't make any difference whether we have to go to the UCA michael@0: // or not. 
michael@0: { michael@0: uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status); michael@0: while(CE != UCOL_NO_MORE_CES) { michael@0: CE = ucol_getNextCE(src->UCA, &colIt, status); michael@0: if(CE != UCOL_NO_MORE_CES) { michael@0: el.CEs[el.noOfCEs++] = CE; michael@0: } michael@0: } michael@0: } michael@0: uprv_uca_addAnElement(t, &el, status); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: U_CFUNC UCATableHeader * michael@0: ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) { michael@0: U_NAMESPACE_USE michael@0: michael@0: uint32_t i = 0; michael@0: if(U_FAILURE(*status)) { michael@0: return NULL; michael@0: } michael@0: /* michael@0: 2. Eliminate the negative lists by doing the following for each non-null negative list: michael@0: o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, michael@0: create new ListHeader X michael@0: o reverse the list, add to the end of X's positive list. Reset the strength of the michael@0: first item you add, based on the stronger strength levels of the two lists. michael@0: */ michael@0: /* michael@0: 3. For each ListHeader with a non-null positive list: michael@0: */ michael@0: /* michael@0: o Find all character strings with CEs between the baseCE and the michael@0: next/previous CE, at the strength of the first token. Add these to the michael@0: tailoring. michael@0: ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the michael@0: tailoring has & x < z... michael@0: ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... 
michael@0: */ michael@0: /* It is possible that this part should be done even while constructing list */ michael@0: /* The problem is that it is unknown what is going to be the strongest weight */ michael@0: /* So we might as well do it here */ michael@0: michael@0: /* michael@0: o Allocate CEs for each token in the list, based on the total number N of the michael@0: largest level difference, and the gap G between baseCE and nextCE at that michael@0: level. The relation * between the last item and nextCE is the same as the michael@0: strongest strength. michael@0: o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) michael@0: ? There are 3 primary items: a, d, e. Fit them into the primary gap. michael@0: Then fit b and c into the secondary gap between a and d, then fit q michael@0: into the tertiary gap between b and c. michael@0: michael@0: o Example: baseCE << b <<< q << c * nextCE(X,2) michael@0: ? There are 2 secondary items: b, c. Fit them into the secondary gap. michael@0: Then fit q into the tertiary gap between b and c. michael@0: o When incrementing primary values, we will not cross high byte michael@0: boundaries except where there is only a single-byte primary. That is to michael@0: ensure that the script reordering will continue to work. 
michael@0: */ michael@0: UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)); michael@0: /* test for NULL */ michael@0: if (image == NULL) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader)); michael@0: michael@0: for(i = 0; iresultLen; i++) { michael@0: /* now we need to generate the CEs */ michael@0: /* We stuff the initial value in the buffers, and increase the appropriate buffer */ michael@0: /* According to strength */ michael@0: if(U_SUCCESS(*status)) { michael@0: if(src->lh[i].first) { // if there are any elements michael@0: // due to the way parser works, subsequent tailorings michael@0: // may remove all the elements from a sequence, therefore michael@0: // leaving an empty tailoring sequence. michael@0: ucol_initBuffers(src, &src->lh[i], status); michael@0: } michael@0: } michael@0: if(U_FAILURE(*status)) { michael@0: uprv_free(image); michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: if(src->varTop != NULL) { /* stuff the variable top value */ michael@0: src->opts->variableTopValue = (*(src->varTop->CEs))>>16; michael@0: /* remove it from the list */ michael@0: if(src->varTop->listHeader->first == src->varTop) { /* first in list */ michael@0: src->varTop->listHeader->first = src->varTop->next; michael@0: } michael@0: if(src->varTop->listHeader->last == src->varTop) { /* first in list */ michael@0: src->varTop->listHeader->last = src->varTop->previous; michael@0: } michael@0: if(src->varTop->next != NULL) { michael@0: src->varTop->next->previous = src->varTop->previous; michael@0: } michael@0: if(src->varTop->previous != NULL) { michael@0: src->varTop->previous->next = src->varTop->next; michael@0: } michael@0: } michael@0: michael@0: michael@0: tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status); michael@0: if(U_FAILURE(*status)) { michael@0: 
uprv_free(image); michael@0: return NULL; michael@0: } michael@0: michael@0: michael@0: /* After this, we have assigned CE values to all regular CEs */ michael@0: /* now we will go through list once more and resolve expansions, */ michael@0: /* make UCAElements structs and add them to table */ michael@0: for(i = 0; iresultLen; i++) { michael@0: /* now we need to generate the CEs */ michael@0: /* We stuff the initial value in the buffers, and increase the appropriate buffer */ michael@0: /* According to strength */ michael@0: if(U_SUCCESS(*status)) { michael@0: ucol_createElements(src, t, &src->lh[i], status); michael@0: } michael@0: } michael@0: michael@0: UCAElements el; michael@0: el.isThai = FALSE; michael@0: el.prefixSize = 0; michael@0: el.prefixChars[0] = 0; michael@0: michael@0: /* add latin-1 stuff */ michael@0: ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status); michael@0: michael@0: /* add stuff for copying */ michael@0: if(src->copySet != NULL) { michael@0: int32_t i = 0; michael@0: UnicodeSet *set = (UnicodeSet *)src->copySet; michael@0: for(i = 0; i < set->getRangeCount(); i++) { michael@0: ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status); michael@0: } michael@0: } michael@0: michael@0: if(U_SUCCESS(*status)) { michael@0: /* copy contractions from the UCA - this is felt mostly for cyrillic*/ michael@0: michael@0: uint32_t tailoredCE = UCOL_NOT_FOUND; michael@0: UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos); michael@0: int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth; michael@0: UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status); michael@0: // Check for null pointer michael@0: if (ucaEl == NULL) { michael@0: *status = U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: while(*conts != 0) { michael@0: // A continuation is NUL-terminated and NUL-padded michael@0: // except if it has the 
maximum length. michael@0: int32_t contractionLength = maxUCAContractionLength; michael@0: while(contractionLength > 0 && conts[contractionLength - 1] == 0) { michael@0: --contractionLength; michael@0: } michael@0: UChar32 first; michael@0: int32_t firstLength = 0; michael@0: U16_NEXT(conts, firstLength, contractionLength, first); michael@0: tailoredCE = utrie_get32(t->mapping, first, NULL); michael@0: if(tailoredCE != UCOL_NOT_FOUND) { michael@0: UBool needToAdd = TRUE; michael@0: if(isCntTableElement(tailoredCE)) { michael@0: if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) { michael@0: needToAdd = FALSE; michael@0: } michael@0: } michael@0: if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) { michael@0: UCAElements elm; michael@0: elm.cPoints = el.uchars; michael@0: elm.noOfCEs = 0; michael@0: elm.uchars[0] = *conts; michael@0: elm.uchars[1] = 0; michael@0: elm.cSize = 1; michael@0: elm.prefixChars[0] = *(conts+2); michael@0: elm.isThai = FALSE; michael@0: elm.prefix = elm.prefixChars; michael@0: elm.prefixSize = 1; michael@0: UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm); michael@0: if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) { michael@0: needToAdd = TRUE; michael@0: } michael@0: } michael@0: if(src->removeSet != NULL && uset_contains(src->removeSet, first)) { michael@0: needToAdd = FALSE; michael@0: } michael@0: michael@0: if(needToAdd == TRUE) { // we need to add if this contraction is not tailored. 
michael@0: if (*(conts+1) != 0) { // contractions michael@0: el.prefix = el.prefixChars; michael@0: el.prefixSize = 0; michael@0: el.cPoints = el.uchars; michael@0: el.noOfCEs = 0; michael@0: u_memcpy(el.uchars, conts, contractionLength); michael@0: el.cSize = contractionLength; michael@0: ucol_setText(ucaEl, el.uchars, el.cSize, status); michael@0: } michael@0: else { // pre-context character michael@0: UChar str[4] = { 0 }; michael@0: int32_t len=0; michael@0: int32_t preKeyLen=0; michael@0: michael@0: el.cPoints = el.uchars; michael@0: el.noOfCEs = 0; michael@0: el.uchars[0] = *conts; michael@0: el.uchars[1] = 0; michael@0: el.cSize = 1; michael@0: el.prefixChars[0] = *(conts+2); michael@0: el.prefix = el.prefixChars; michael@0: el.prefixSize = 1; michael@0: if (el.prefixChars[0]!=0) { michael@0: // get CE of prefix character first michael@0: str[0]=el.prefixChars[0]; michael@0: str[1]=0; michael@0: ucol_setText(ucaEl, str, 1, status); michael@0: while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) michael@0: != UCOL_NULLORDER) { michael@0: preKeyLen++; // count number of keys for prefix character michael@0: } michael@0: str[len++] = el.prefixChars[0]; michael@0: } michael@0: michael@0: str[len++] = el.uchars[0]; michael@0: str[len]=0; michael@0: ucol_setText(ucaEl, str, len, status); michael@0: // Skip the keys for prefix character, then copy the rest to el. 
michael@0: while ((preKeyLen-->0) && michael@0: (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { michael@0: continue; michael@0: } michael@0: michael@0: } michael@0: while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { michael@0: el.noOfCEs++; michael@0: } michael@0: uprv_uca_addAnElement(t, &el, status); michael@0: } michael@0: michael@0: } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) { michael@0: ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status); michael@0: } michael@0: conts+=maxUCAContractionLength; michael@0: } michael@0: ucol_closeElements(ucaEl); michael@0: } michael@0: michael@0: // Add completely ignorable elements michael@0: utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t); michael@0: michael@0: // add tailoring characters related canonical closures michael@0: uprv_uca_canonicalClosure(t, src, NULL, status); michael@0: michael@0: /* still need to produce compatibility closure */ michael@0: michael@0: UCATableHeader *myData = uprv_uca_assembleTable(t, status); michael@0: michael@0: uprv_uca_closeTempTable(t); michael@0: uprv_free(image); michael@0: michael@0: return myData; michael@0: } michael@0: michael@0: U_CDECL_BEGIN michael@0: static UBool U_CALLCONV michael@0: ucol_bld_cleanup(void) michael@0: { michael@0: udata_close(invUCA_DATA_MEM); michael@0: invUCA_DATA_MEM = NULL; michael@0: _staticInvUCA = NULL; michael@0: gStaticInvUCAInitOnce.reset(); michael@0: return TRUE; michael@0: } michael@0: U_CDECL_END michael@0: michael@0: static void U_CALLCONV initInverseUCA(UErrorCode &status) { michael@0: U_ASSERT(invUCA_DATA_MEM == NULL); michael@0: U_ASSERT(_staticInvUCA == NULL); michael@0: ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup); michael@0: InverseUCATableHeader *newInvUCA = NULL; michael@0: UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, &status); 
michael@0: michael@0: if(U_FAILURE(status)) { michael@0: if (result) { michael@0: udata_close(result); michael@0: } michael@0: // This is not needed, as we are talking about michael@0: // memory we got from UData michael@0: //uprv_free(newInvUCA); michael@0: return; michael@0: } michael@0: michael@0: if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ michael@0: newInvUCA = (InverseUCATableHeader *)udata_getMemory(result); michael@0: UCollator *UCA = ucol_initUCA(&status); michael@0: // UCA versions of UCA and inverse UCA should match michael@0: if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) { michael@0: status = U_INVALID_FORMAT_ERROR; michael@0: udata_close(result); michael@0: return; michael@0: } michael@0: michael@0: invUCA_DATA_MEM = result; michael@0: _staticInvUCA = newInvUCA; michael@0: } michael@0: } michael@0: michael@0: michael@0: U_CAPI const InverseUCATableHeader * U_EXPORT2 michael@0: ucol_initInverseUCA(UErrorCode *status) michael@0: { michael@0: umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status); michael@0: return _staticInvUCA; michael@0: } michael@0: michael@0: /* This is the data that is used for non-script reordering codes. These _must_ be kept michael@0: * in order that they are to be applied as defaults and in synch with the UColReorderCode enum. 
michael@0: */ michael@0: static const char * const ReorderingTokenNames[] = { michael@0: "SPACE", michael@0: "PUNCT", michael@0: "SYMBOL", michael@0: "CURRENCY", michael@0: "DIGIT" michael@0: }; michael@0: michael@0: static void toUpper(const char* src, char* dst, uint32_t length) { michael@0: for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) { michael@0: *dst = uprv_toupper(*src); michael@0: } michael@0: *dst = '\0'; michael@0: } michael@0: michael@0: U_INTERNAL int32_t U_EXPORT2 michael@0: ucol_findReorderingEntry(const char* name) { michael@0: char buffer[32]; michael@0: toUpper(name, buffer, 32); michael@0: for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) { michael@0: if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) { michael@0: return entry + UCOL_REORDER_CODE_FIRST; michael@0: } michael@0: } michael@0: return USCRIPT_INVALID_CODE; michael@0: } michael@0: michael@0: #endif /* #if !UCONFIG_NO_COLLATION */