michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2009-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: normalizer2impl.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2009nov22 michael@0: * created by: Markus W. Scherer michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: michael@0: #include "unicode/normalizer2.h" michael@0: #include "unicode/udata.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/utf16.h" michael@0: #include "cmemory.h" michael@0: #include "mutex.h" michael@0: #include "normalizer2impl.h" michael@0: #include "putilimp.h" michael@0: #include "uassert.h" michael@0: #include "uset_imp.h" michael@0: #include "utrie2.h" michael@0: #include "uvector.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: // ReorderingBuffer -------------------------------------------------------- *** michael@0: michael@0: UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { michael@0: int32_t length=str.length(); michael@0: start=str.getBuffer(destCapacity); michael@0: if(start==NULL) { michael@0: // getBuffer() already did str.setToBogus() michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return FALSE; michael@0: } michael@0: limit=start+length; michael@0: remainingCapacity=str.getCapacity()-length; michael@0: reorderStart=start; michael@0: if(start==limit) { michael@0: lastCC=0; michael@0: } else { michael@0: setIterator(); michael@0: lastCC=previousCC(); michael@0: // Set reorderStart after the last code point with cc<=1 if there is one. michael@0: if(lastCC>1) { michael@0: while(previousCC()>1) {} michael@0: } michael@0: reorderStart=codePointLimit; michael@0: } michael@0: return TRUE; michael@0: } michael@0: michael@0: UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const { michael@0: int32_t length=(int32_t)(limit-start); michael@0: return michael@0: length==(int32_t)(otherLimit-otherStart) && michael@0: 0==u_memcmp(start, otherStart, length); michael@0: } michael@0: michael@0: UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { michael@0: if(remainingCapacity<2 && !resize(2, errorCode)) { michael@0: return FALSE; michael@0: } michael@0: if(lastCC<=cc || cc==0) { michael@0: limit[0]=U16_LEAD(c); michael@0: limit[1]=U16_TRAIL(c); michael@0: limit+=2; michael@0: lastCC=cc; michael@0: if(cc<=1) { michael@0: reorderStart=limit; michael@0: } michael@0: } else { michael@0: insert(c, cc); michael@0: } michael@0: remainingCapacity-=2; michael@0: return TRUE; michael@0: } michael@0: michael@0: UBool ReorderingBuffer::append(const UChar *s, int32_t length, michael@0: uint8_t leadCC, uint8_t trailCC, michael@0: UErrorCode &errorCode) { michael@0: if(length==0) { michael@0: return TRUE; michael@0: } michael@0: if(remainingCapacity=codePointStart) { michael@0: return 0; michael@0: } michael@0: UChar32 c=*--codePointStart; michael@0: if(ccc;) {} michael@0: // insert c at codePointLimit, after the character with prevCC<=cc michael@0: UChar *q=limit; michael@0: UChar *r=limit+=U16_LENGTH(c); michael@0: do { michael@0: *--r=*--q; michael@0: } while(codePointLimit!=q); michael@0: writeCodePoint(q, c); michael@0: if(cc<=1) { michael@0: reorderStart=r; michael@0: } michael@0: } michael@0: michael@0: // Normalizer2Impl --------------------------------------------------------- *** michael@0: michael@0: struct CanonIterData : public UMemory { michael@0: CanonIterData(UErrorCode &errorCode); michael@0: ~CanonIterData(); michael@0: void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode); michael@0: UTrie2 *trie; michael@0: UVector canonStartSets; // contains UnicodeSet * michael@0: }; michael@0: michael@0: Normalizer2Impl::~Normalizer2Impl() { michael@0: udata_close(memory); michael@0: utrie2_close(normTrie); michael@0: delete fCanonIterData; michael@0: } michael@0: michael@0: UBool U_CALLCONV michael@0: Normalizer2Impl::isAcceptable(void *context, michael@0: const char * /* type */, const char * /*name*/, michael@0: const UDataInfo *pInfo) { michael@0: if( michael@0: pInfo->size>=20 && michael@0: pInfo->isBigEndian==U_IS_BIG_ENDIAN && michael@0: pInfo->charsetFamily==U_CHARSET_FAMILY && michael@0: pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ michael@0: pInfo->dataFormat[1]==0x72 && michael@0: pInfo->dataFormat[2]==0x6d && michael@0: pInfo->dataFormat[3]==0x32 && michael@0: pInfo->formatVersion[0]==2 michael@0: ) { michael@0: Normalizer2Impl *me=(Normalizer2Impl *)context; michael@0: uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4); michael@0: return TRUE; michael@0: } else { michael@0: return FALSE; michael@0: } michael@0: } michael@0: michael@0: void michael@0: Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) { michael@0: if(U_FAILURE(errorCode)) { michael@0: return; michael@0: } michael@0: memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode); michael@0: if(U_FAILURE(errorCode)) { michael@0: return; michael@0: } michael@0: const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory); michael@0: const int32_t *inIndexes=(const int32_t *)inBytes; michael@0: int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4; michael@0: if(indexesLength<=IX_MIN_MAYBE_YES) { michael@0: errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes. michael@0: return; michael@0: } michael@0: michael@0: minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; michael@0: minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; michael@0: michael@0: minYesNo=inIndexes[IX_MIN_YES_NO]; michael@0: minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; michael@0: minNoNo=inIndexes[IX_MIN_NO_NO]; michael@0: limitNoNo=inIndexes[IX_LIMIT_NO_NO]; michael@0: minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; michael@0: michael@0: int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET]; michael@0: int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; michael@0: normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, michael@0: inBytes+offset, nextOffset-offset, NULL, michael@0: &errorCode); michael@0: if(U_FAILURE(errorCode)) { michael@0: return; michael@0: } michael@0: michael@0: offset=nextOffset; michael@0: nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; michael@0: maybeYesCompositions=(const uint16_t *)(inBytes+offset); michael@0: extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes); michael@0: michael@0: // smallFCD: new in formatVersion 2 michael@0: offset=nextOffset; michael@0: smallFCD=inBytes+offset; michael@0: michael@0: // Build tccc180[]. michael@0: // gennorm2 enforces lccc=0 for c>=1) { michael@0: if((c&0xff)==0) { michael@0: bits=smallFCD[c>>8]; // one byte per 0x100 code points michael@0: } michael@0: if(bits&1) { michael@0: for(int i=0; i<0x20; ++i, ++c) { michael@0: tccc180[c]=(uint8_t)getFCD16FromNormData(c); michael@0: } michael@0: } else { michael@0: uprv_memset(tccc180+c, 0, 0x20); michael@0: c+=0x20; michael@0: } michael@0: } michael@0: } michael@0: michael@0: uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const { michael@0: UChar32 c; michael@0: if(cpStart==(cpLimit-1)) { michael@0: c=*cpStart; michael@0: } else { michael@0: c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]); michael@0: } michael@0: uint16_t prevNorm16=getNorm16(c); michael@0: if(prevNorm16<=minYesNo) { michael@0: return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 michael@0: } else { michael@0: return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo michael@0: } michael@0: } michael@0: michael@0: U_CDECL_BEGIN michael@0: michael@0: static UBool U_CALLCONV michael@0: enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { michael@0: /* add the start code point to the USet */ michael@0: const USetAdder *sa=(const USetAdder *)context; michael@0: sa->add(sa->set, start); michael@0: return TRUE; michael@0: } michael@0: michael@0: static uint32_t U_CALLCONV michael@0: segmentStarterMapper(const void * /*context*/, uint32_t value) { michael@0: return value&CANON_NOT_SEGMENT_STARTER; michael@0: } michael@0: michael@0: U_CDECL_END michael@0: michael@0: void michael@0: Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { michael@0: /* add the start code point of each same-value range of each trie */ michael@0: utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa); michael@0: michael@0: /* add Hangul LV syllables and LV+1 because of skippables */ michael@0: for(UChar c=Hangul::HANGUL_BASE; cadd(sa->set, c); michael@0: sa->add(sa->set, c+1); michael@0: } michael@0: sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ michael@0: } michael@0: michael@0: void michael@0: Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const { michael@0: /* add the start code point of each same-value range of the canonical iterator data trie */ michael@0: if(ensureCanonIterData(errorCode)) { michael@0: // currently only used for the SEGMENT_STARTER property michael@0: utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa); michael@0: } michael@0: } michael@0: michael@0: const UChar * michael@0: Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src, michael@0: UChar32 minNeedDataCP, michael@0: ReorderingBuffer *buffer, michael@0: UErrorCode &errorCode) const { michael@0: // Make some effort to support NUL-terminated strings reasonably. michael@0: // Take the part of the fast quick check loop that does not look up michael@0: // data and check the first part of the string. michael@0: // After this prefix, determine the string length to simplify the rest michael@0: // of the code. michael@0: const UChar *prevSrc=src; michael@0: UChar c; michael@0: while((c=*src++)appendZeroCC(prevSrc, src, errorCode); michael@0: } michael@0: } michael@0: return src; michael@0: } michael@0: michael@0: // Dual functionality: michael@0: // buffer!=NULL: normalize michael@0: // buffer==NULL: isNormalized/spanQuickCheckYes michael@0: const UChar * michael@0: Normalizer2Impl::decompose(const UChar *src, const UChar *limit, michael@0: ReorderingBuffer *buffer, michael@0: UErrorCode &errorCode) const { michael@0: UChar32 minNoCP=minDecompNoCP; michael@0: if(limit==NULL) { michael@0: src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode); michael@0: if(U_FAILURE(errorCode)) { michael@0: return src; michael@0: } michael@0: limit=u_strchr(src, 0); michael@0: } michael@0: michael@0: const UChar *prevSrc; michael@0: UChar32 c=0; michael@0: uint16_t norm16=0; michael@0: michael@0: // only for quick check michael@0: const UChar *prevBoundary=src; michael@0: uint8_t prevCC=0; michael@0: michael@0: for(;;) { michael@0: // count code units below the minimum or with irrelevant data for the quick check michael@0: for(prevSrc=src; src!=limit;) { michael@0: if( (c=*src)appendZeroCC(prevSrc, src, errorCode)) { michael@0: break; michael@0: } michael@0: } else { michael@0: prevCC=0; michael@0: prevBoundary=src; michael@0: } michael@0: } michael@0: if(src==limit) { michael@0: break; michael@0: } michael@0: michael@0: // Check one above-minimum, relevant code point. michael@0: src+=U16_LENGTH(c); michael@0: if(buffer!=NULL) { michael@0: if(!decompose(c, norm16, *buffer, errorCode)) { michael@0: break; michael@0: } michael@0: } else { michael@0: if(isDecompYes(norm16)) { michael@0: uint8_t cc=getCCFromYesOrMaybe(norm16); michael@0: if(prevCC<=cc || cc==0) { michael@0: prevCC=cc; michael@0: if(cc<=1) { michael@0: prevBoundary=src; michael@0: } michael@0: continue; michael@0: } michael@0: } michael@0: return prevBoundary; // "no" or cc out of order michael@0: } michael@0: } michael@0: return src; michael@0: } michael@0: michael@0: // Decompose a short piece of text which is likely to contain characters that michael@0: // fail the quick check loop and/or where the quick check loop's overhead michael@0: // is unlikely to be amortized. michael@0: // Called by the compose() and makeFCD() implementations. michael@0: UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit, michael@0: ReorderingBuffer &buffer, michael@0: UErrorCode &errorCode) const { michael@0: while(src>8); michael@0: if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { michael@0: leadCC=(uint8_t)(*(mapping-1)>>8); michael@0: } else { michael@0: leadCC=0; michael@0: } michael@0: return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode); michael@0: } michael@0: } michael@0: } michael@0: michael@0: const UChar * michael@0: Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const { michael@0: const UChar *decomp=NULL; michael@0: uint16_t norm16; michael@0: for(;;) { michael@0: if(c>7)&1)-1; michael@0: uint16_t rm0=*rawMapping; michael@0: if(rm0<=MAPPING_LENGTH_MASK) { michael@0: length=rm0; michael@0: return (const UChar *)rawMapping-rm0; michael@0: } else { michael@0: // Copy the normal mapping and replace its first two code units with rm0. michael@0: buffer[0]=(UChar)rm0; michael@0: u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2); michael@0: length=mLength-1; michael@0: return buffer; michael@0: } michael@0: } else { michael@0: length=mLength; michael@0: return (const UChar *)mapping+1; michael@0: } michael@0: } michael@0: } michael@0: michael@0: void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit, michael@0: UBool doDecompose, michael@0: UnicodeString &safeMiddle, michael@0: ReorderingBuffer &buffer, michael@0: UErrorCode &errorCode) const { michael@0: buffer.copyReorderableSuffixTo(safeMiddle); michael@0: if(doDecompose) { michael@0: decompose(src, limit, &buffer, errorCode); michael@0: return; michael@0: } michael@0: // Just merge the strings at the boundary. michael@0: ForwardUTrie2StringIterator iter(normTrie, src, limit); michael@0: uint8_t firstCC, prevCC, cc; michael@0: firstCC=prevCC=cc=getCC(iter.next16()); michael@0: while(cc!=0) { michael@0: prevCC=cc; michael@0: cc=getCC(iter.next16()); michael@0: }; michael@0: if(limit==NULL) { // appendZeroCC() needs limit!=NULL michael@0: limit=u_strchr(iter.codePointStart, 0); michael@0: } michael@0: michael@0: if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) { michael@0: buffer.appendZeroCC(iter.codePointStart, limit, errorCode); michael@0: } michael@0: } michael@0: michael@0: // Note: hasDecompBoundary() could be implemented as aliases to michael@0: // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() michael@0: // at the cost of building the FCD trie for a decomposition normalizer. michael@0: UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const { michael@0: for(;;) { michael@0: if(cMIN_NORMAL_MAYBE_YES) { michael@0: return FALSE; // ccc!=0 michael@0: } else if(isDecompNoAlgorithmic(norm16)) { michael@0: c=mapAlgorithmic(c, norm16); michael@0: } else { michael@0: // c decomposes, get everything from the variable-length extra data michael@0: const uint16_t *mapping=getMapping(norm16); michael@0: uint16_t firstUnit=*mapping; michael@0: if((firstUnit&MAPPING_LENGTH_MASK)==0) { michael@0: return FALSE; michael@0: } michael@0: if(!before) { michael@0: // decomp after-boundary: same as hasFCDBoundaryAfter(), michael@0: // fcd16<=1 || trailCC==0 michael@0: if(firstUnit>0x1ff) { michael@0: return FALSE; // trailCC>1 michael@0: } michael@0: if(firstUnit<=0xff) { michael@0: return TRUE; // trailCC==0 michael@0: } michael@0: // if(trailCC==1) test leadCC==0, same as checking for before-boundary michael@0: } michael@0: // TRUE if leadCC==0 (hasFCDBoundaryBefore()) michael@0: return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0; michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * Finds the recomposition result for michael@0: * a forward-combining "lead" character, michael@0: * specified with a pointer to its compositions list, michael@0: * and a backward-combining "trail" character. michael@0: * michael@0: * If the lead and trail characters combine, then this function returns michael@0: * the following "compositeAndFwd" value: michael@0: * Bits 21..1 composite character michael@0: * Bit 0 set if the composite is a forward-combining starter michael@0: * otherwise it returns -1. michael@0: * michael@0: * The compositions list has (trail, compositeAndFwd) pair entries, michael@0: * encoded as either pairs or triples of 16-bit units. michael@0: * The last entry has the high bit of its first unit set. michael@0: * michael@0: * The list is sorted by ascending trail characters (there are no duplicates). michael@0: * A linear search is used. michael@0: * michael@0: * See normalizer2impl.h for a more detailed description michael@0: * of the compositions list format. michael@0: */ michael@0: int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { michael@0: uint16_t key1, firstUnit; michael@0: if(trail(firstUnit=*list)) { michael@0: list+=2+(firstUnit&COMP_1_TRIPLE); michael@0: } michael@0: if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { michael@0: if(firstUnit&COMP_1_TRIPLE) { michael@0: return ((int32_t)list[1]<<16)|list[2]; michael@0: } else { michael@0: return list[1]; michael@0: } michael@0: } michael@0: } else { michael@0: // trail character is 3400..10FFFF michael@0: // result entry has 3 units michael@0: key1=(uint16_t)(COMP_1_TRAIL_LIMIT+ michael@0: (((trail>>COMP_1_TRAIL_SHIFT))& michael@0: ~COMP_1_TRIPLE)); michael@0: uint16_t key2=(uint16_t)(trail<(firstUnit=*list)) { michael@0: list+=2+(firstUnit&COMP_1_TRIPLE); michael@0: } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { michael@0: if(key2>(secondUnit=list[1])) { michael@0: if(firstUnit&COMP_1_LAST_TUPLE) { michael@0: break; michael@0: } else { michael@0: list+=3; michael@0: } michael@0: } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { michael@0: return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2]; michael@0: } else { michael@0: break; michael@0: } michael@0: } else { michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: return -1; michael@0: } michael@0: michael@0: /** michael@0: * @param list some character's compositions list michael@0: * @param set recursively receives the composites from these compositions michael@0: */ michael@0: void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const { michael@0: uint16_t firstUnit; michael@0: int32_t compositeAndFwd; michael@0: do { michael@0: firstUnit=*list; michael@0: if((firstUnit&COMP_1_TRIPLE)==0) { michael@0: compositeAndFwd=list[1]; michael@0: list+=2; michael@0: } else { michael@0: compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2]; michael@0: list+=3; michael@0: } michael@0: UChar32 composite=compositeAndFwd>>1; michael@0: if((compositeAndFwd&1)!=0) { michael@0: addComposites(getCompositionsListForComposite(getNorm16(composite)), set); michael@0: } michael@0: set.add(composite); michael@0: } while((firstUnit&COMP_1_LAST_TUPLE)==0); michael@0: } michael@0: michael@0: /* michael@0: * Recomposes the buffer text starting at recomposeStartIndex michael@0: * (which is in NFD - decomposed and canonically ordered), michael@0: * and truncates the buffer contents. michael@0: * michael@0: * Note that recomposition never lengthens the text: michael@0: * Any character consists of either one or two code units; michael@0: * a composition may contain at most one more code unit than the original starter, michael@0: * while the combining mark that is removed has at least one code unit. michael@0: */ michael@0: void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, michael@0: UBool onlyContiguous) const { michael@0: UChar *p=buffer.getStart()+recomposeStartIndex; michael@0: UChar *limit=buffer.getLimit(); michael@0: if(p==limit) { michael@0: return; michael@0: } michael@0: michael@0: UChar *starter, *pRemove, *q, *r; michael@0: const uint16_t *compositionsList; michael@0: UChar32 c, compositeAndFwd; michael@0: uint16_t norm16; michael@0: uint8_t cc, prevCC; michael@0: UBool starterIsSupplementary; michael@0: michael@0: // Some of the following variables are not used until we have a forward-combining starter michael@0: // and are only initialized now to avoid compiler warnings. michael@0: compositionsList=NULL; // used as indicator for whether we have a forward-combining starter michael@0: starter=NULL; michael@0: starterIsSupplementary=FALSE; michael@0: prevCC=0; michael@0: michael@0: for(;;) { michael@0: UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16); michael@0: cc=getCCFromYesOrMaybe(norm16); michael@0: if( // this character combines backward and michael@0: isMaybe(norm16) && michael@0: // we have seen a starter that combines forward and michael@0: compositionsList!=NULL && michael@0: // the backward-combining character is not blocked michael@0: (prevCC=0) { michael@0: // The starter and the combining mark (c) do combine. michael@0: UChar32 composite=compositeAndFwd>>1; michael@0: michael@0: // Replace the starter with the composite, remove the combining mark. michael@0: pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark michael@0: if(starterIsSupplementary) { michael@0: if(U_IS_SUPPLEMENTARY(composite)) { michael@0: // both are supplementary michael@0: starter[0]=U16_LEAD(composite); michael@0: starter[1]=U16_TRAIL(composite); michael@0: } else { michael@0: *starter=(UChar)composite; michael@0: // The composite is shorter than the starter, michael@0: // move the intermediate characters forward one. michael@0: starterIsSupplementary=FALSE; michael@0: q=starter+1; michael@0: r=q+1; michael@0: while(rminYesNo) { // composite 'a' has both mapping & compositions list michael@0: list+= // mapping pointer michael@0: 1+ // +1 to skip the first unit with the mapping lenth michael@0: (*list&MAPPING_LENGTH_MASK); // + mapping length michael@0: } michael@0: } michael@0: } else if(norm16>1; michael@0: #else michael@0: int32_t compositeAndFwd=combine(list, b); michael@0: return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL; michael@0: #endif michael@0: } michael@0: michael@0: // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. michael@0: // doCompose: normalize michael@0: // !doCompose: isNormalized (buffer must be empty and initialized) michael@0: UBool michael@0: Normalizer2Impl::compose(const UChar *src, const UChar *limit, michael@0: UBool onlyContiguous, michael@0: UBool doCompose, michael@0: ReorderingBuffer &buffer, michael@0: UErrorCode &errorCode) const { michael@0: /* michael@0: * prevBoundary points to the last character before the current one michael@0: * that has a composition boundary before it with ccc==0 and quick check "yes". michael@0: * Keeping track of prevBoundary saves us looking for a composition boundary michael@0: * when we find a "no" or "maybe". michael@0: * michael@0: * When we back out from prevSrc back to prevBoundary, michael@0: * then we also remove those same characters (which had been simply copied michael@0: * or canonically-order-inserted) from the ReorderingBuffer. michael@0: * Therefore, at all times, the [prevBoundary..prevSrc[ source units michael@0: * must correspond 1:1 to destination units at the end of the destination buffer. michael@0: */ michael@0: const UChar *prevBoundary=src; michael@0: UChar32 minNoMaybeCP=minCompNoMaybeCP; michael@0: if(limit==NULL) { michael@0: src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, michael@0: doCompose ? &buffer : NULL, michael@0: errorCode); michael@0: if(U_FAILURE(errorCode)) { michael@0: return FALSE; michael@0: } michael@0: if(prevBoundary=minNoNo. michael@0: * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) michael@0: * or has ccc!=0. michael@0: * Check for Jamo V/T, then for regular characters. michael@0: * c is not a Hangul syllable or Jamo L because those have "yes" properties. michael@0: */ michael@0: if(isJamoVT(norm16) && prevBoundary!=prevSrc) { michael@0: UChar prev=*(prevSrc-1); michael@0: UBool needToDecompose=FALSE; michael@0: if(c=MIN_YES_YES_WITH_CC) { michael@0: uint8_t cc=(uint8_t)norm16; // cc!=0 michael@0: if( onlyContiguous && // FCC michael@0: (doCompose ? buffer.getLastCC() : prevCC)==0 && michael@0: prevBoundarycc michael@0: ) { michael@0: // Fails FCD test, need to decompose and contiguously recompose. michael@0: if(!doCompose) { michael@0: return FALSE; michael@0: } michael@0: } else if(doCompose) { michael@0: if(!buffer.append(c, cc, errorCode)) { michael@0: break; michael@0: } michael@0: continue; michael@0: } else if(prevCC<=cc) { michael@0: prevCC=cc; michael@0: continue; michael@0: } else { michael@0: return FALSE; michael@0: } michael@0: } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { michael@0: return FALSE; michael@0: } michael@0: michael@0: /* michael@0: * Find appropriate boundaries around this character, michael@0: * decompose the source text from between the boundaries, michael@0: * and recompose it. michael@0: * michael@0: * We may need to remove the last few characters from the ReorderingBuffer michael@0: * to account for source text that was copied or appended michael@0: * but needs to take part in the recomposition. michael@0: */ michael@0: michael@0: /* michael@0: * Find the last composition boundary in [prevBoundary..src[. michael@0: * It is either the decomposition of the current character (at prevSrc), michael@0: * or prevBoundary. michael@0: */ michael@0: if(hasCompBoundaryBefore(c, norm16)) { michael@0: prevBoundary=prevSrc; michael@0: } else if(doCompose) { michael@0: buffer.removeSuffix((int32_t)(prevSrc-prevBoundary)); michael@0: } michael@0: michael@0: // Find the next composition boundary in [src..limit[ - michael@0: // modifies src to point to the next starter. michael@0: src=(UChar *)findNextCompBoundary(src, limit); michael@0: michael@0: // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. michael@0: int32_t recomposeStartIndex=buffer.length(); michael@0: if(!decomposeShort(prevBoundary, src, buffer, errorCode)) { michael@0: break; michael@0: } michael@0: recompose(buffer, recomposeStartIndex, onlyContiguous); michael@0: if(!doCompose) { michael@0: if(!buffer.equals(prevBoundary, src)) { michael@0: return FALSE; michael@0: } michael@0: buffer.remove(); michael@0: prevCC=0; michael@0: } michael@0: michael@0: // Move to the next starter. We never need to look back before this point again. michael@0: prevBoundary=src; michael@0: } michael@0: return TRUE; michael@0: } michael@0: michael@0: // Very similar to compose(): Make the same changes in both places if relevant. michael@0: // pQCResult==NULL: spanQuickCheckYes michael@0: // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES) michael@0: const UChar * michael@0: Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit, michael@0: UBool onlyContiguous, michael@0: UNormalizationCheckResult *pQCResult) const { michael@0: /* michael@0: * prevBoundary points to the last character before the current one michael@0: * that has a composition boundary before it with ccc==0 and quick check "yes". michael@0: */ michael@0: const UChar *prevBoundary=src; michael@0: UChar32 minNoMaybeCP=minCompNoMaybeCP; michael@0: if(limit==NULL) { michael@0: UErrorCode errorCode=U_ZERO_ERROR; michael@0: src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode); michael@0: if(prevBoundary=minNoNo. michael@0: * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) michael@0: * or has ccc!=0. michael@0: */ michael@0: if(isMaybeOrNonZeroCC(norm16)) { michael@0: uint8_t cc=getCCFromYesOrMaybe(norm16); michael@0: if( onlyContiguous && // FCC michael@0: cc!=0 && michael@0: prevCC==0 && michael@0: prevBoundarycc michael@0: ) { michael@0: // Fails FCD test. michael@0: } else if(prevCC<=cc || cc==0) { michael@0: prevCC=cc; michael@0: if(norm16= (testInert ? minNoNo : minMaybeYes)) { michael@0: return FALSE; michael@0: } else if(isDecompNoAlgorithmic(norm16)) { michael@0: c=mapAlgorithmic(c, norm16); michael@0: } else { michael@0: // c decomposes, get everything from the variable-length extra data. michael@0: // If testInert, then c must be a yesNo character which has lccc=0, michael@0: // otherwise it could be a noNo. michael@0: const uint16_t *mapping=getMapping(norm16); michael@0: uint16_t firstUnit=*mapping; michael@0: // TRUE if michael@0: // not MAPPING_NO_COMP_BOUNDARY_AFTER michael@0: // (which is set if michael@0: // c is not deleted, and michael@0: // it and its decomposition do not combine forward, and it has a starter) michael@0: // and if FCC then trailCC<=1 michael@0: return michael@0: (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 && michael@0: (!onlyContiguous || firstUnit<=0x1ff); michael@0: } michael@0: } michael@0: } michael@0: michael@0: const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const { michael@0: BackwardUTrie2StringIterator iter(normTrie, start, p); michael@0: uint16_t norm16; michael@0: do { michael@0: norm16=iter.previous16(); michael@0: } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); michael@0: // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, michael@0: // but that's probably not worth the extra cost. michael@0: return iter.codePointStart; michael@0: } michael@0: michael@0: const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const { michael@0: ForwardUTrie2StringIterator iter(normTrie, p, limit); michael@0: uint16_t norm16; michael@0: do { michael@0: norm16=iter.next16(); michael@0: } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); michael@0: return iter.codePointStart; michael@0: } michael@0: michael@0: // Note: normalizer2impl.cpp r30982 (2011-nov-27) michael@0: // still had getFCDTrie() which built and cached an FCD trie. michael@0: // That provided faster access to FCD data than getFCD16FromNormData() michael@0: // but required synchronization and consumed some 10kB of heap memory michael@0: // in any process that uses FCD (e.g., via collation). michael@0: // tccc180[] and smallFCD[] are intended to help with any loss of performance, michael@0: // at least for Latin & CJK. michael@0: michael@0: // Gets the FCD value from the regular normalization data. michael@0: uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const { michael@0: // Only loops for 1:1 algorithmic mappings. michael@0: for(;;) { michael@0: uint16_t norm16=getNorm16(c); michael@0: if(norm16<=minYesNo) { michael@0: // no decomposition or Hangul syllable, all zeros michael@0: return 0; michael@0: } else if(norm16>=MIN_NORMAL_MAYBE_YES) { michael@0: // combining mark michael@0: norm16&=0xff; michael@0: return norm16|(norm16<<8); michael@0: } else if(norm16>=minMaybeYes) { michael@0: return 0; michael@0: } else if(isDecompNoAlgorithmic(norm16)) { michael@0: c=mapAlgorithmic(c, norm16); michael@0: } else { michael@0: // c decomposes, get everything from the variable-length extra data michael@0: const uint16_t *mapping=getMapping(norm16); michael@0: uint16_t firstUnit=*mapping; michael@0: if((firstUnit&MAPPING_LENGTH_MASK)==0) { michael@0: // A character that is deleted (maps to an empty string) must michael@0: // get the worst-case lccc and tccc values because arbitrary michael@0: // characters on both sides will become adjacent. michael@0: return 0x1ff; michael@0: } else { michael@0: norm16=firstUnit>>8; // tccc michael@0: if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { michael@0: norm16|=*(mapping-1)&0xff00; // lccc michael@0: } michael@0: return norm16; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: // Dual functionality: michael@0: // buffer!=NULL: normalize michael@0: // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes michael@0: const UChar * michael@0: Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, michael@0: ReorderingBuffer *buffer, michael@0: UErrorCode &errorCode) const { michael@0: // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. michael@0: // Similar to the prevBoundary in the compose() implementation. michael@0: const UChar *prevBoundary=src; michael@0: int32_t prevFCD16=0; michael@0: if(limit==NULL) { michael@0: src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode); michael@0: if(U_FAILURE(errorCode)) { michael@0: return src; michael@0: } michael@0: if(prevBoundary1) { michael@0: --prevBoundary; michael@0: } michael@0: } michael@0: limit=u_strchr(src, 0); michael@0: } michael@0: michael@0: // Note: In this function we use buffer->appendZeroCC() because we track michael@0: // the lead and trail combining classes here, rather than leaving it to michael@0: // the ReorderingBuffer. michael@0: // The exception is the call to decomposeShort() which uses the buffer michael@0: // in the normal way. michael@0: michael@0: const UChar *prevSrc; michael@0: UChar32 c=0; michael@0: uint16_t fcd16=0; michael@0: michael@0: for(;;) { michael@0: // count code units with lccc==0 michael@0: for(prevSrc=src; src!=limit;) { michael@0: if((c=*src)appendZeroCC(prevSrc, src, errorCode)) { michael@0: break; michael@0: } michael@0: if(src==limit) { michael@0: break; michael@0: } michael@0: prevBoundary=src; michael@0: // We know that the previous character's lccc==0. michael@0: if(prevFCD16<0) { michael@0: // Fetching the fcd16 value was deferred for this below-U+0300 code point. michael@0: UChar32 prev=~prevFCD16; michael@0: prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev); michael@0: if(prevFCD16>1) { michael@0: --prevBoundary; michael@0: } michael@0: } else { michael@0: const UChar *p=src-1; michael@0: if(U16_IS_TRAIL(*p) && prevSrc

1) { michael@0: prevBoundary=p; michael@0: } michael@0: } michael@0: // The start of the current character (c). michael@0: prevSrc=src; michael@0: } else if(src==limit) { michael@0: break; michael@0: } michael@0: michael@0: src+=U16_LENGTH(c); michael@0: // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. michael@0: // Check for proper order, and decompose locally if necessary. michael@0: if((prevFCD16&0xff)<=(fcd16>>8)) { michael@0: // proper order: prev tccc <= current lccc michael@0: if((fcd16&0xff)<=1) { michael@0: prevBoundary=src; michael@0: } michael@0: if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) { michael@0: break; michael@0: } michael@0: prevFCD16=fcd16; michael@0: continue; michael@0: } else if(buffer==NULL) { michael@0: return prevBoundary; // quick check "no" michael@0: } else { michael@0: /* michael@0: * Back out the part of the source that we copied or appended michael@0: * already but is now going to be decomposed. michael@0: * prevSrc is set to after what was copied/appended. michael@0: */ michael@0: buffer->removeSuffix((int32_t)(prevSrc-prevBoundary)); michael@0: /* michael@0: * Find the part of the source that needs to be decomposed, michael@0: * up to the next safe boundary. michael@0: */ michael@0: src=findNextFCDBoundary(src, limit); michael@0: /* michael@0: * The source text does not fulfill the conditions for FCD. michael@0: * Decompose and reorder a limited piece of the text. michael@0: */ michael@0: if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) { michael@0: break; michael@0: } michael@0: prevBoundary=src; michael@0: prevFCD16=0; michael@0: } michael@0: } michael@0: return src; michael@0: } michael@0: michael@0: void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit, michael@0: UBool doMakeFCD, michael@0: UnicodeString &safeMiddle, michael@0: ReorderingBuffer &buffer, michael@0: UErrorCode &errorCode) const { michael@0: if(!buffer.isEmpty()) { michael@0: const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit); michael@0: if(src!=firstBoundaryInSrc) { michael@0: const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(), michael@0: buffer.getLimit()); michael@0: int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest); michael@0: UnicodeString middle(lastBoundaryInDest, destSuffixLength); michael@0: buffer.removeSuffix(destSuffixLength); michael@0: safeMiddle=middle; michael@0: middle.append(src, (int32_t)(firstBoundaryInSrc-src)); michael@0: const UChar *middleStart=middle.getBuffer(); michael@0: makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode); michael@0: if(U_FAILURE(errorCode)) { michael@0: return; michael@0: } michael@0: src=firstBoundaryInSrc; michael@0: } michael@0: } michael@0: if(doMakeFCD) { michael@0: makeFCD(src, limit, &buffer, errorCode); michael@0: } else { michael@0: if(limit==NULL) { // appendZeroCC() needs limit!=NULL michael@0: limit=u_strchr(src, 0); michael@0: } michael@0: buffer.appendZeroCC(src, limit, errorCode); michael@0: } michael@0: } michael@0: michael@0: const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const { michael@0: while(start

0xff) {} michael@0: return p; michael@0: } michael@0: michael@0: const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const { michael@0: while(padd(firstOrigin); michael@0: } michael@0: } else { michael@0: set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)]; michael@0: } michael@0: set->add(origin); michael@0: } michael@0: } michael@0: michael@0: U_CDECL_BEGIN michael@0: michael@0: // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters. michael@0: // context: the Normalizer2Impl michael@0: static UBool U_CALLCONV michael@0: enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { michael@0: UErrorCode errorCode = U_ZERO_ERROR; michael@0: if (value != 0) { michael@0: Normalizer2Impl *impl = (Normalizer2Impl *)context; michael@0: impl->makeCanonIterDataFromNorm16( michael@0: start, end, (uint16_t)value, *impl->fCanonIterData, errorCode); michael@0: } michael@0: return U_SUCCESS(errorCode); michael@0: } michael@0: michael@0: michael@0: michael@0: // UInitOnce instantiation function for CanonIterData michael@0: michael@0: static void U_CALLCONV michael@0: initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) { michael@0: U_ASSERT(impl->fCanonIterData == NULL); michael@0: impl->fCanonIterData = new CanonIterData(errorCode); michael@0: if (impl->fCanonIterData == NULL) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: if (U_SUCCESS(errorCode)) { michael@0: utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl); michael@0: utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode); michael@0: } michael@0: if (U_FAILURE(errorCode)) { michael@0: delete impl->fCanonIterData; michael@0: impl->fCanonIterData = NULL; michael@0: } michael@0: } michael@0: michael@0: U_CDECL_END michael@0: michael@0: void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, michael@0: CanonIterData &newData, michael@0: UErrorCode &errorCode) const { michael@0: if(norm16==0 || (minYesNo<=norm16 && norm16=minMaybeYes) { michael@0: // not a segment starter if it occurs in a decomposition or has cc!=0 michael@0: newValue|=CANON_NOT_SEGMENT_STARTER; michael@0: if(norm16=minNoNo) { michael@0: while(i(this); michael@0: umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode); michael@0: return U_SUCCESS(errorCode); michael@0: } michael@0: michael@0: int32_t Normalizer2Impl::getCanonValue(UChar32 c) const { michael@0: return (int32_t)utrie2_get32(fCanonIterData->trie, c); michael@0: } michael@0: michael@0: const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const { michael@0: return *(const UnicodeSet *)fCanonIterData->canonStartSets[n]; michael@0: } michael@0: michael@0: UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const { michael@0: return getCanonValue(c)>=0; michael@0: } michael@0: michael@0: UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const { michael@0: int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER; michael@0: if(canonValue==0) { michael@0: return FALSE; michael@0: } michael@0: set.clear(); michael@0: int32_t value=canonValue&CANON_VALUE_MASK; michael@0: if((canonValue&CANON_HAS_SET)!=0) { michael@0: set.addAll(getCanonStartSet(value)); michael@0: } else if(value!=0) { michael@0: set.add(value); michael@0: } michael@0: if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { michael@0: uint16_t norm16=getNorm16(c); michael@0: if(norm16==JAMO_L) { michael@0: UChar32 syllable= michael@0: (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT); michael@0: set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1); michael@0: } else { michael@0: addComposites(getCompositionsList(norm16), set); michael@0: } michael@0: } michael@0: return TRUE; michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: // Normalizer2 data swapping ----------------------------------------------- *** michael@0: michael@0: U_NAMESPACE_USE michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: unorm2_swap(const UDataSwapper *ds, michael@0: const void *inData, int32_t length, void *outData, michael@0: UErrorCode *pErrorCode) { michael@0: const UDataInfo *pInfo; michael@0: int32_t headerSize; michael@0: michael@0: const uint8_t *inBytes; michael@0: uint8_t *outBytes; michael@0: michael@0: const int32_t *inIndexes; michael@0: int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1]; michael@0: michael@0: int32_t i, offset, nextOffset, size; michael@0: michael@0: /* udata_swapDataHeader checks the arguments */ michael@0: headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); michael@0: if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: michael@0: /* check data format and format version */ michael@0: pInfo=(const UDataInfo *)((const char *)inData+4); michael@0: if(!( michael@0: pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ michael@0: pInfo->dataFormat[1]==0x72 && michael@0: pInfo->dataFormat[2]==0x6d && michael@0: pInfo->dataFormat[3]==0x32 && michael@0: (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2) michael@0: )) { michael@0: udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", michael@0: pInfo->dataFormat[0], pInfo->dataFormat[1], michael@0: pInfo->dataFormat[2], pInfo->dataFormat[3], michael@0: pInfo->formatVersion[0]); michael@0: *pErrorCode=U_UNSUPPORTED_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: inBytes=(const uint8_t *)inData+headerSize; michael@0: outBytes=(uint8_t *)outData+headerSize; michael@0: michael@0: inIndexes=(const int32_t *)inBytes; michael@0: michael@0: if(length>=0) { michael@0: length-=headerSize; michael@0: if(length<(int32_t)sizeof(indexes)) { michael@0: udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n", michael@0: length); michael@0: *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: /* read the first few indexes */ michael@0: for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) { michael@0: indexes[i]=udata_readInt32(ds, inIndexes[i]); michael@0: } michael@0: michael@0: /* get the total length of the data */ michael@0: size=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; michael@0: michael@0: if(length>=0) { michael@0: if(lengthswapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode); michael@0: offset=nextOffset; michael@0: michael@0: /* swap the UTrie2 */ michael@0: nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]; michael@0: utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); michael@0: offset=nextOffset; michael@0: michael@0: /* swap the uint16_t extraData[] */ michael@0: nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]; michael@0: ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); michael@0: offset=nextOffset; michael@0: michael@0: /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */ michael@0: nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1]; michael@0: offset=nextOffset; michael@0: michael@0: U_ASSERT(offset==size); michael@0: } michael@0: michael@0: return headerSize+size; michael@0: } michael@0: michael@0: #endif // !UCONFIG_NO_NORMALIZATION