michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2009-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: n2builder.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2009nov25 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * Builds Normalizer2 data and writes a binary .nrm file. michael@0: * For the file format see source/common/normalizer2impl.h. michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "n2builder.h" michael@0: michael@0: /* NOTE(review): the next three #include directives, and the one guarded by U_HAVE_STD_STRING, carry no header name in this copy; the angle-bracket names appear to have been lost and must be restored from the upstream file. */ #include michael@0: #include michael@0: #include michael@0: #if U_HAVE_STD_STRING michael@0: #include michael@0: #endif michael@0: #include "unicode/errorcode.h" michael@0: #include "unicode/localpointer.h" michael@0: #include "unicode/putil.h" michael@0: #include "unicode/udata.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/unistr.h" michael@0: #include "unicode/ustring.h" michael@0: #include "hash.h" michael@0: #include "normalizer2impl.h" michael@0: #include "toolutil.h" michael@0: #include "unewdata.h" michael@0: #include "utrie2.h" michael@0: #include "uvectr32.h" michael@0: michael@0: /* Number of elements in a fixed-size array, cast to int32_t. */ #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) michael@0: michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: michael@0: /* UDataInfo cf. 
udata.h */ michael@0: /* Header record passed to udata_create() in writeBinaryFile(); its dataVersion field is overwritten there with unicodeVersion before the file is written. */ static UDataInfo dataInfo={ michael@0: sizeof(UDataInfo), michael@0: 0, michael@0: michael@0: U_IS_BIG_ENDIAN, michael@0: U_CHARSET_FAMILY, michael@0: U_SIZEOF_UCHAR, michael@0: 0, michael@0: michael@0: { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ michael@0: { 2, 0, 0, 0 }, /* formatVersion */ michael@0: { 5, 2, 0, 0 } /* dataVersion (Unicode version) */ michael@0: }; michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: /* Hands out Range records (start, limit, norm16) one at a time through nextRange(), starting at rangeIndex 0. NOTE(review): the body of nextRange(), the rest of this class and the beginning of the Norm struct are missing from this copy; the text resumes at the tail of a Norm member ending in REMOVED. */ class HangulIterator { michael@0: public: michael@0: struct Range { michael@0: UChar32 start, limit; michael@0: uint16_t norm16; michael@0: }; michael@0: michael@0: HangulIterator() : rangeIndex(0) {} michael@0: const Range *nextRange() { michael@0: if(rangeIndexREMOVED; } michael@0: michael@0: // Requires hasMapping() and well-formed mapping. michael@0: /* Sets mappingCP to the first code point of the mapping when the mapping is non-empty and exactly as long as that one code point; otherwise sets it to U_SENTINEL. */ void setMappingCP() { michael@0: UChar32 c; michael@0: if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) { michael@0: mappingCP=c; michael@0: } else { michael@0: mappingCP=U_SENTINEL; michael@0: } michael@0: } michael@0: michael@0: /* Returns the compositions vector buffer viewed as CompositionPair records and stores size()/2 in length; returns NULL with length 0 when there is no compositions list. NOTE(review): the template argument of reinterpret_cast is missing from this copy. */ const CompositionPair *getCompositionPairs(int32_t &length) const { michael@0: if(compositions==NULL) { michael@0: length=0; michael@0: return NULL; michael@0: } else { michael@0: length=compositions->size()/2; michael@0: return reinterpret_cast(compositions->getBuffer()); michael@0: } michael@0: } michael@0: michael@0: UnicodeString *mapping; michael@0: UnicodeString *rawMapping; // non-NULL if the mapping is further decomposed michael@0: UChar32 mappingCP; // >=0 if mapping to 1 code point michael@0: int32_t mappingPhase; michael@0: MappingType mappingType; michael@0: michael@0: UVector32 *compositions; // (trail, composite) pairs michael@0: uint8_t cc; michael@0: UBool combinesBack; michael@0: UBool hasNoCompBoundaryAfter; michael@0: michael@0: /* Kind of extra data an offset refers to; writeNorm16() switches on it (offset masked with OFFSET_MASK) to pick the base value added to the offset. */ enum OffsetType { michael@0: OFFSET_NONE, michael@0: // Composition for back-combining character. Allowed, but not normally used. 
michael@0: OFFSET_MAYBE_YES, michael@0: // Composition for a starter that does not have a decomposition mapping. michael@0: OFFSET_YES_YES, michael@0: // Round-trip mapping & composition for a starter. michael@0: OFFSET_YES_NO_MAPPING_AND_COMPOSITION, michael@0: // Round-trip mapping for a starter that itself does not combine-forward. michael@0: OFFSET_YES_NO_MAPPING_ONLY, michael@0: // One-way mapping. michael@0: OFFSET_NO_NO, michael@0: // Delta for an algorithmic one-way mapping. michael@0: OFFSET_DELTA michael@0: }; michael@0: /* NOTE(review): text is missing after the opening of the OFFSET_MASK expression below; the rest of the Norm struct and the enumerator base class are not visible, and the copy resumes at the tail of the utrie2_enum callback that forwards to rangeHandler(). */ enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<rangeHandler(start, end, value); michael@0: } michael@0: michael@0: U_CDECL_END michael@0: michael@0: /* Initializes phase 0 with OVERRIDE_PREVIOUS and OPTIMIZE_NORMAL, zeroes unicodeVersion, indexes and smallFCD, opens normTrie and the Norm memory pool, and allocates the Norm at index 0, which stays unused. */ Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : michael@0: phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL) { michael@0: memset(unicodeVersion, 0, sizeof(unicodeVersion)); michael@0: normTrie=utrie2_open(0, 0, &errorCode); michael@0: normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm)); michael@0: norms=allocNorm(); // unused Norm struct at index 0 michael@0: memset(indexes, 0, sizeof(indexes)); michael@0: memset(smallFCD, 0, sizeof(smallFCD)); michael@0: } michael@0: michael@0: /* Closes normTrie and then walks the Norm pool starting at index 1. NOTE(review): the loop is cut off after its initializer in this copy; the rest of the destructor and the definitions up to the middle of checkNormForMapping() are missing. The visible tail of checkNormForMapping() exits the tool when the override policy forbids replacing an existing mapping, otherwise deletes the old mapping and stamps the entry with the current phase. */ Normalizer2DataBuilder::~Normalizer2DataBuilder() { michael@0: utrie2_close(normTrie); michael@0: int32_t normsLength=utm_countItems(normMem); michael@0: for(int32_t i=1; imappingType!=Norm::NONE) { michael@0: if( overrideHandling==OVERRIDE_NONE || michael@0: (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase) michael@0: ) { michael@0: fprintf(stderr, michael@0: "error in gennorm2 phase %d: " michael@0: "not permitted to override mapping for U+%04lX from phase %d\n", michael@0: (int)phase, (long)c, (int)p->mappingPhase); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: delete p->mapping; michael@0: p->mapping=NULL; michael@0: } michael@0: p->mappingPhase=phase; michael@0: } michael@0: return p; michael@0: } michael@0: 
michael@0: /* Stores the new override policy and advances to the next phase. */ void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { michael@0: overrideHandling=oh; michael@0: ++phase; michael@0: } michael@0: michael@0: /* Records cc in the Norm entry that createNorm(c) returns. */ void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { michael@0: createNorm(c)->cc=cc; michael@0: } michael@0: michael@0: /* Returns the cc field of the Norm that getNormRef(c) yields. */ uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const { michael@0: return getNormRef(c).cc; michael@0: } michael@0: michael@0: /* Preflights a UTF-8 conversion of s with a NULL destination; reports TRUE when the result is success or only U_BUFFER_OVERFLOW_ERROR. */ static UBool isWellFormed(const UnicodeString &s) { michael@0: UErrorCode errorCode=U_ZERO_ERROR; michael@0: u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode); michael@0: return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR; michael@0: } michael@0: michael@0: /* Stores a copy of m as the ONE_WAY mapping for c and updates mappingCP; prints an error and exits when m is not well-formed. */ void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) { michael@0: if(!isWellFormed(m)) { michael@0: fprintf(stderr, michael@0: "error in gennorm2 phase %d: " michael@0: "illegal one-way mapping from U+%04lX to malformed string\n", michael@0: (int)phase, (long)c); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: Norm *p=checkNormForMapping(createNorm(c), c); michael@0: p->mapping=new UnicodeString(m); michael@0: p->mappingType=Norm::ONE_WAY; michael@0: p->setMappingCP(); michael@0: } michael@0: michael@0: /* Stores a copy of m as the ROUND_TRIP mapping for c with mappingCP set to U_SENTINEL; prints an error and exits when c is a surrogate, when m is not well-formed, or when m does not contain exactly two code points. */ void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) { michael@0: if(U_IS_SURROGATE(c)) { michael@0: fprintf(stderr, michael@0: "error in gennorm2 phase %d: " michael@0: "illegal round-trip mapping from surrogate code point U+%04lX\n", michael@0: (int)phase, (long)c); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: if(!isWellFormed(m)) { michael@0: fprintf(stderr, michael@0: "error in gennorm2 phase %d: " michael@0: "illegal round-trip mapping from U+%04lX to malformed string\n", michael@0: (int)phase, (long)c); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: int32_t numCP=u_countChar32(m.getBuffer(), m.length()); michael@0: if(numCP!=2) { michael@0: 
fprintf(stderr, michael@0: "error in gennorm2 phase %d: " michael@0: "illegal round-trip mapping from U+%04lX to %d!=2 code points\n", michael@0: (int)phase, (long)c, (int)numCP); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: Norm *p=checkNormForMapping(createNorm(c), c); michael@0: p->mapping=new UnicodeString(m); michael@0: p->mappingType=Norm::ROUND_TRIP; michael@0: p->mappingCP=U_SENTINEL; michael@0: } michael@0: michael@0: /* Marks the Norm entry for c as REMOVED; does nothing when checkNormForMapping() hands back NULL. Uses getNorm() rather than createNorm(). */ void Normalizer2DataBuilder::removeMapping(UChar32 c) { michael@0: Norm *p=checkNormForMapping(getNorm(c), c); michael@0: if(p!=NULL) { michael@0: p->mappingType=Norm::REMOVED; michael@0: } michael@0: } michael@0: michael@0: /* Range enumerator that forwards every (start, end, value) range to Normalizer2DataBuilder::addComposition() and always continues the enumeration. */ class CompositionBuilder : public Normalizer2DBEnumerator { michael@0: public: michael@0: CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} michael@0: virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { michael@0: builder.addComposition(start, end, value); michael@0: return TRUE; michael@0: } michael@0: }; michael@0: michael@0: /* Acts only when norms[value] has a round-trip mapping: rejects ranges of more than one code point, a composite with ccc!=0 and a lead character with ccc!=0 (each exits the tool), then flags the trail character as combining back and inserts the (trail, composite) pair into the compositions list of the lead character. */ void michael@0: Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) { michael@0: if(norms[value].mappingType==Norm::ROUND_TRIP) { michael@0: if(start!=end) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: same round-trip mapping for " michael@0: "more than 1 code point U+%04lX..U+%04lX\n", michael@0: (long)start, (long)end); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: if(norms[value].cc!=0) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: " michael@0: "U+%04lX has a round-trip mapping and ccc!=0, " michael@0: "not possible in Unicode normalization\n", michael@0: (long)start); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: // setRoundTripMapping() ensured that there are exactly two code points. 
michael@0: const UnicodeString &m=*norms[value].mapping; michael@0: UChar32 lead=m.char32At(0); michael@0: /* The second code point is read at the last code unit index of the mapping. */ UChar32 trail=m.char32At(m.length()-1); michael@0: if(getCC(lead)!=0) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: " michael@0: "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, " michael@0: "not possible in Unicode normalization\n", michael@0: (long)start, (long)lead); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: // Flag for trailing character. michael@0: createNorm(trail)->combinesBack=TRUE; michael@0: // Insert (trail, composite) pair into compositions list for the lead character. michael@0: IcuToolErrorCode errorCode("gennorm2/addComposition()"); michael@0: Norm *leadNorm=createNorm(lead); michael@0: UVector32 *compositions=leadNorm->compositions; michael@0: int32_t i; michael@0: if(compositions==NULL) { michael@0: compositions=leadNorm->compositions=new UVector32(errorCode); michael@0: i=0; // "insert" the first pair at index 0 michael@0: } else { michael@0: // Insertion sort, and check for duplicate trail characters. 
michael@0: /* NOTE(review): the for-loop below is cut off right after its initializer in this copy, so the search for the insertion index and the duplicate-trail check are not visible; the text resumes at the two insertElementAt() calls that store trail and start at positions 2*i and 2*i+1. */ int32_t length; michael@0: const CompositionPair *pairs=leadNorm->getCompositionPairs(length); michael@0: for(i=0; iinsertElementAt(trail, 2*i, errorCode); michael@0: compositions->insertElementAt(start, 2*i+1, errorCode); michael@0: } michael@0: } michael@0: michael@0: /* Inspects the composition pairs of norm only when highCC-lowCC is at least 2. NOTE(review): the loop body and the return statements are missing from this copy, as are the definitions that follow; the text resumes inside a later decomposition routine, at the point where it compares trailing combining classes of nested mappings. */ UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm, michael@0: uint8_t lowCC, uint8_t highCC) const { michael@0: if((highCC-lowCC)>=2) { michael@0: int32_t length; michael@0: const CompositionPair *pairs=norm.getCompositionPairs(length); michael@0: for(int32_t i=0; ichar32At(cNorm.mapping->length()-1); michael@0: uint8_t cTrailCC=getCC(cTrailChar); michael@0: if(cTrailCC>myTrailCC) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: " michael@0: "U+%04lX's round-trip mapping's starter " michael@0: "U+%04lX decomposes and the " michael@0: "inner/earlier tccc=%hu > outer/following tccc=%hu, " michael@0: "not possible in Unicode normalization\n", michael@0: (long)start, (long)c, michael@0: (short)cTrailCC, (short)myTrailCC); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: } else { michael@0: fprintf(stderr, michael@0: "gennorm2 error: " michael@0: "U+%04lX's round-trip mapping's non-starter " michael@0: "U+%04lX decomposes, " michael@0: "not possible in Unicode normalization\n", michael@0: (long)start, (long)c); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: } michael@0: /* The replacement string is created lazily, seeded with the part of m before index prev. */ if(decomposed==NULL) { michael@0: decomposed=new UnicodeString(m, 0, prev); michael@0: } michael@0: decomposed->append(*cNorm.mapping); michael@0: } else if(Hangul::isHangul(c)) { michael@0: UChar buffer[3]; michael@0: int32_t hangulLength=Hangul::decompose(c, buffer); michael@0: if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: " michael@0: "U+%04lX's round-trip mapping's non-starter " michael@0: "U+%04lX decomposes, " michael@0: "not possible in Unicode normalization\n", michael@0: (long)start, (long)c); 
michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: if(decomposed==NULL) { michael@0: decomposed=new UnicodeString(m, 0, prev); michael@0: } michael@0: decomposed->append(buffer, hangulLength); michael@0: } else if(decomposed!=NULL) { michael@0: decomposed->append(m, prev, i-prev); michael@0: } michael@0: } michael@0: if(decomposed!=NULL) { michael@0: if(norm.rawMapping==NULL) { michael@0: // Remember the original mapping when decomposing recursively. michael@0: norm.rawMapping=norm.mapping; michael@0: } else { michael@0: delete norm.mapping; michael@0: } michael@0: norm.mapping=decomposed; michael@0: // Not norm.setMappingCP(); because the original mapping michael@0: // is most likely to be encodable as a delta. michael@0: return TRUE; michael@0: } michael@0: } michael@0: return FALSE; michael@0: } michael@0: michael@0: /* Buffer of entries that each pack a code point (upper bits, shifted left by 8) with its combining class (low 8 bits). append() places a character with a lower cc before trailing entries with a higher cc, never moving past the last starter, and the buffer remembers the index of the last starter. */ class BuilderReorderingBuffer { michael@0: public: michael@0: BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {} michael@0: void reset() { michael@0: fLength=0; michael@0: fLastStarterIndex=-1; michael@0: fDidReorder=FALSE; michael@0: } michael@0: int32_t length() const { return fLength; } michael@0: UBool isEmpty() const { return fLength==0; } michael@0: int32_t lastStarterIndex() const { return fLastStarterIndex; } michael@0: UChar32 charAt(int32_t i) const { return fArray[i]>>8; } michael@0: uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; } michael@0: UBool didReorder() const { return fDidReorder; } michael@0: void append(UChar32 c, uint8_t cc) { michael@0: if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) { michael@0: if(cc==0) { michael@0: fLastStarterIndex=fLength; michael@0: } michael@0: fArray[fLength++]=(c<<8)|cc; michael@0: return; michael@0: } michael@0: // Let this character bubble back to its canonical order. 
michael@0: int32_t i=fLength-1; michael@0: while(i>fLastStarterIndex && ccAt(i)>cc) { michael@0: --i; michael@0: } michael@0: ++i; // after the last starter or prevCC<=cc michael@0: // Move this and the following characters forward one to make space. michael@0: /* NOTE(review): the loop header below is cut off after its initializer in this copy; the rest of append(), the remaining members of this class and the start of the next function are missing. The text resumes inside a routine that feeds the code points of a mapping into a reordering buffer. */ for(int32_t j=fLength; imapping; michael@0: int32_t length=m.length(); michael@0: if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { michael@0: return; // writeMapping() will complain about it and print the code point. michael@0: } michael@0: const UChar *s=m.getBuffer(); michael@0: int32_t i=0; michael@0: UChar32 c; michael@0: /* NOTE(review): the while-loop below is cut off after its first identifier in this copy; the text resumes inside a check that returns FALSE when the last starter has no compositions list. */ while(icompositions==NULL) { michael@0: return FALSE; // the last starter does not combine forward michael@0: } michael@0: // Compose as far as possible, and see if further compositions are possible. michael@0: uint8_t prevCC=0; michael@0: /* NOTE(review): the loop condition and the lookup that yields the composite are missing from this copy (text lost after combMarkIndex). */ for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex=0 michael@0: ) { michael@0: buffer.setComposite(starter, combMarkIndex); michael@0: starterNorm=&getNormRef(starter); michael@0: if(starterNorm->compositions==NULL) { michael@0: return FALSE; // the composite does not combine further michael@0: } michael@0: } else { michael@0: prevCC=cc; michael@0: ++combMarkIndex; michael@0: } michael@0: } michael@0: // TRUE if the final, forward-combining starter is at the end. michael@0: return prevCC==0; michael@0: } michael@0: michael@0: // Requires p->hasMapping(). michael@0: // Returns the offset of the "first unit" from the beginning of the extraData for c. michael@0: // That is the same as the length of the optional data for the raw mapping and the ccc/lccc word. 
michael@0: /* Appends to dataString, in this order: the optional raw mapping data, the optional ccc/lccc unit, the first unit (length, trailCC and flag bits) and the mapping itself. Also sets the smallFCD bit for c when the lead or trail combining class of the mapping is non-zero. Exits the tool when the mapping or raw mapping is longer than MAPPING_LENGTH_MASK. */ int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) { michael@0: UnicodeString &m=*p->mapping; michael@0: int32_t length=m.length(); michael@0: if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: " michael@0: "mapping for U+%04lX longer than maximum of %d\n", michael@0: (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: int32_t leadCC, trailCC; michael@0: if(length==0) { michael@0: leadCC=trailCC=0; michael@0: } else { michael@0: leadCC=getCC(m.char32At(0)); michael@0: trailCC=getCC(m.char32At(length-1)); michael@0: } michael@0: /* NOTE(review): the condition below has lost text in this copy (unbalanced parentheses); judging by the error message it restricts the check to code points below U+0300. */ if(ccc!=0 || leadCC!=0)) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: " michael@0: "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n", michael@0: (long)c); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: // Write small-FCD data. michael@0: if((leadCC|trailCC)!=0) { michael@0: UChar32 lead= c<=0xffff ? c : U16_LEAD(c); michael@0: smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); michael@0: } michael@0: // Write the mapping & raw mapping extraData. 
michael@0: int32_t firstUnit=length|(trailCC<<8); michael@0: /* Counts the units written ahead of the first unit; this is the value returned to the caller. */ int32_t preMappingLength=0; michael@0: if(p->rawMapping!=NULL) { michael@0: UnicodeString &rm=*p->rawMapping; michael@0: int32_t rmLength=rm.length(); michael@0: if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: " michael@0: "raw mapping for U+%04lX longer than maximum of %d\n", michael@0: (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: UChar rm0=rm.charAt(0); michael@0: if( rmLength==length-1 && michael@0: // 99: overlong substring lengths get pinned to remainder lengths anyway michael@0: 0==rm.compare(1, 99, m, 2, 99) && michael@0: rm0>Normalizer2Impl::MAPPING_LENGTH_MASK michael@0: ) { michael@0: // Compression: michael@0: // rawMapping=rm0+mapping.substring(2) -> store only rm0 michael@0: // michael@0: // The raw mapping is the same as the final mapping after replacing michael@0: // the final mapping's first two code units with the raw mapping's first one. michael@0: // In this case, we store only that first unit, rm0. michael@0: // This helps with a few hundred mappings. michael@0: dataString.append(rm0); michael@0: preMappingLength=1; michael@0: } else { michael@0: // Store the raw mapping with its length. 
michael@0: dataString.append(rm); michael@0: dataString.append((UChar)rmLength); michael@0: preMappingLength=rmLength+1; michael@0: } michael@0: firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING; michael@0: } michael@0: int32_t cccLccc=p->cc|(leadCC<<8); michael@0: if(cccLccc!=0) { michael@0: dataString.append((UChar)cccLccc); michael@0: ++preMappingLength; michael@0: firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD; michael@0: } michael@0: if(p->hasNoCompBoundaryAfter) { michael@0: firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER; michael@0: } michael@0: dataString.append((UChar)firstUnit); michael@0: dataString.append(m); michael@0: return preMappingLength; michael@0: } michael@0: michael@0: // Requires p->compositions!=NULL. michael@0: /* Appends one tuple per composition pair of p to dataString: always a first and second unit, plus a third unit when thirdUnit is not negative; the first unit of the last pair gets COMP_1_LAST_TUPLE. Exits the tool when p has ccc!=0. NOTE(review): the start of the per-pair loop body is missing from this copy (text lost after the loop initializer), as is part of the secondUnit expression in the else branch. */ void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) { michael@0: if(p->cc!=0) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: " michael@0: "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n", michael@0: (long)c); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: int32_t length; michael@0: const CompositionPair *pairs=p->getCompositionPairs(length); michael@0: for(int32_t i=0; i>16; michael@0: thirdUnit=compositeAndFwd; michael@0: } michael@0: } else { michael@0: firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+ michael@0: (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))| michael@0: Normalizer2Impl::COMP_1_TRIPLE; michael@0: secondUnit=(pair.trail<>16); michael@0: thirdUnit=compositeAndFwd; michael@0: } michael@0: // Set the high bit of the first unit if this is the last composition pair. 
michael@0: if(i==(length-1)) { michael@0: firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE; michael@0: } michael@0: dataString.append((UChar)firstUnit).append((UChar)secondUnit); michael@0: if(thirdUnit>=0) { michael@0: dataString.append((UChar)thirdUnit); michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* Range enumerator that holds the five output strings and a table of previously written noNo mappings. For every range with a non-zero value it calls Normalizer2DataBuilder::writeExtraData(); a non-zero value shared by more than one code point is treated as an internal error and exits the tool. */ class ExtraDataWriter : public Normalizer2DBEnumerator { michael@0: public: michael@0: ExtraDataWriter(Normalizer2DataBuilder &b) : michael@0: Normalizer2DBEnumerator(b), michael@0: yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions michael@0: yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {} // 0=Hangul, 1=start of normal data michael@0: virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { michael@0: if(value!=0) { michael@0: if(start!=end) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: unexpected shared data for " michael@0: "multiple code points U+%04lX..U+%04lX\n", michael@0: (long)start, (long)end); michael@0: exit(U_INTERNAL_PROGRAM_ERROR); michael@0: } michael@0: builder.writeExtraData(start, value, *this); michael@0: } michael@0: return TRUE; michael@0: } michael@0: UnicodeString maybeYesCompositions; michael@0: UnicodeString yesYesCompositions; michael@0: UnicodeString yesNoMappingsAndCompositions; michael@0: UnicodeString yesNoMappingsOnly; michael@0: UnicodeString noNoMappings; michael@0: Hashtable previousNoNoMappings; // If constructed in runtime code, pass in UErrorCode. michael@0: }; michael@0: michael@0: /* Writes the extra data for the Norm at norms[value] into one of the writer strings, chosen by whether the entry combines back, has a mapping, has a compositions list and which mapping type it carries, and records the resulting position in p->offset. Entries without a mapping only update smallFCD. NOTE(review): several statements in this function have lost text in this copy; each is flagged where it occurs. */ void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) { michael@0: Norm *p=norms+value; michael@0: if(!p->hasMapping()) { michael@0: // Write small-FCD data. michael@0: // There is similar code in writeMapping() for characters that do have a mapping. 
michael@0: /* NOTE(review): the condition below has lost text in this copy; judging by the error message it applies to code points below U+0300 with ccc!=0. */ if(ccc!=0) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: " michael@0: "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n", michael@0: (long)c); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: if(p->cc!=0) { michael@0: UChar32 lead= c<=0xffff ? c : U16_LEAD(c); michael@0: smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); michael@0: } michael@0: } michael@0: if(p->combinesBack) { michael@0: if(p->hasMapping()) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: " michael@0: "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n", michael@0: (long)c); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: /* NOTE(review): the offset assignments in the next branches are cut off after the length() call in this copy, so the shift, the offset type and the writeCompositions() calls are not visible. */ if(p->compositions!=NULL) { michael@0: p->offset= michael@0: (writer.maybeYesCompositions.length()<hasMapping()) { michael@0: if(p->compositions!=NULL) { michael@0: p->offset= michael@0: (writer.yesYesCompositions.length()<mappingType==Norm::ROUND_TRIP) { michael@0: if(p->compositions!=NULL) { michael@0: int32_t offset=writer.yesNoMappingsAndCompositions.length()+ michael@0: writeMapping(c, p, writer.yesNoMappingsAndCompositions); michael@0: p->offset=(offset<offset=(offset<compositions!=NULL) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: " michael@0: "U+%04lX combines-forward and has a one-way mapping, " michael@0: "not possible in Unicode normalization\n", michael@0: (long)c); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: if(p->cc==0 && optimization!=OPTIMIZE_FAST) { michael@0: // Try a compact, algorithmic encoding. michael@0: // Only for ccc=0, because we can't store additional information michael@0: // and we do not recursively follow an algorithmic encoding for access to the ccc. michael@0: // michael@0: // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding michael@0: // if the mappingCP decomposes further, to ensure that there is a place to store it. 
michael@0: // We want to see that the final mapping does not have exactly 1 code point, michael@0: // or else we would have to recursively ensure that the final mapping is stored michael@0: // in normal extraData. michael@0: if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) { michael@0: int32_t delta=p->mappingCP-c; michael@0: /* NOTE(review): the delta offset assignment below is cut off in this copy; the text resumes at the fallback that writes a regular noNo mapping when no offset was set. */ if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) { michael@0: p->offset=(delta<offset==0) { michael@0: int32_t oldNoNoLength=writer.noNoMappings.length(); michael@0: int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings); michael@0: /* The units just written are looked up in previousNoNoMappings so that an identical earlier mapping can be shared. */ UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength); michael@0: int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping); michael@0: if(previousOffset!=0) { michael@0: // Duplicate, remove the new units and point to the old ones. michael@0: writer.noNoMappings.truncate(oldNoNoLength); michael@0: /* NOTE(review): the two offset assignments below are cut off in this copy, and the end of this function plus the header of the following norm16-writing routine are missing; the text resumes where that routine extracts the offset with OFFSET_SHIFT. */ p->offset=((previousOffset-1)<offset=(offset<offset>>Norm::OFFSET_SHIFT; michael@0: int32_t norm16=0; michael@0: UBool isDecompNo=FALSE; michael@0: UBool isCompNoMaybe=FALSE; michael@0: switch(p->offset&Norm::OFFSET_MASK) { michael@0: case Norm::OFFSET_NONE: michael@0: // No mapping, no compositions list. 
michael@0: if(p->combinesBack) { michael@0: norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc; michael@0: isDecompNo=(UBool)(p->cc!=0); michael@0: isCompNoMaybe=TRUE; michael@0: } else if(p->cc!=0) { michael@0: norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc; michael@0: isDecompNo=isCompNoMaybe=TRUE; michael@0: } michael@0: break; michael@0: case Norm::OFFSET_MAYBE_YES: michael@0: norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset; michael@0: isCompNoMaybe=TRUE; michael@0: break; michael@0: case Norm::OFFSET_YES_YES: michael@0: norm16=offset; michael@0: break; michael@0: case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION: michael@0: norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset; michael@0: isDecompNo=TRUE; michael@0: break; michael@0: case Norm::OFFSET_YES_NO_MAPPING_ONLY: michael@0: norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset; michael@0: isDecompNo=TRUE; michael@0: break; michael@0: case Norm::OFFSET_NO_NO: michael@0: norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset; michael@0: isDecompNo=isCompNoMaybe=TRUE; michael@0: break; michael@0: case Norm::OFFSET_DELTA: michael@0: norm16=getCenterNoNoDelta()+offset; michael@0: isDecompNo=isCompNoMaybe=TRUE; michael@0: break; michael@0: default: // Should not occur. michael@0: exit(U_INTERNAL_PROGRAM_ERROR); michael@0: } michael@0: IcuToolErrorCode errorCode("gennorm2/writeNorm16()"); michael@0: utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode); michael@0: /* NOTE(review): text is missing inside the next statement in this copy (after start), including the end of this routine and the beginning of setHangulData(); the copy resumes inside a loop that exits the tool when a Hangul or Jamo code point already has a non-zero norm16 value. */ if(isDecompNo && startstart; climit; ++c) { michael@0: if(utrie2_get32(norm16Trie, c)!=0) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: " michael@0: "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n", michael@0: (long)c); michael@0: exit(U_INVALID_FORMAT_ERROR); michael@0: } michael@0: } michael@0: } michael@0: // Set data for algorithmic runtime handling. 
michael@0: IcuToolErrorCode errorCode("gennorm2/setHangulData()"); michael@0: hi.reset(); michael@0: while((range=hi.nextRange())!=NULL) { michael@0: uint16_t norm16=range->norm16; michael@0: /* A range norm16 of 0 is replaced with the current IX_MIN_YES_NO index value. NOTE(review): the two conditions that follow have lost text in this copy (after range->start). */ if(norm16==0) { michael@0: norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO]; // Hangul LV/LVT encoded as minYesNo michael@0: if(range->startstart; michael@0: } michael@0: } else { michael@0: if(range->startstart; michael@0: } michael@0: } michael@0: utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode); michael@0: errorCode.assertSuccess(); michael@0: } michael@0: } michael@0: michael@0: U_CDECL_BEGIN michael@0: michael@0: /* utrie2 enumeration callback: context points to a uint32_t that is raised to value whenever value is larger; always returns TRUE. */ static UBool U_CALLCONV michael@0: enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { michael@0: uint32_t *pMaxValue=(uint32_t *)context; michael@0: if(value>*pMaxValue) { michael@0: *pMaxValue=value; michael@0: } michael@0: return TRUE; michael@0: } michael@0: michael@0: U_CDECL_END michael@0: michael@0: /* Turns the collected Norm data into norm16Trie and the index values: opens norm16Trie, enumerates normTrie with CompositionBuilder, repeats the Decomposer pass until nothing more decomposes, writes norm16 values with Norm16Writer, adds the Hangul data, assigns each lead surrogate a value derived from its supplementary code points, and folds supplementary minimum code points down to their lead surrogates. NOTE(review): the middle of this function (from the loop over the Norm pool to the overflow check) is missing from this copy. */ void Normalizer2DataBuilder::processData() { michael@0: IcuToolErrorCode errorCode("gennorm2/processData()"); michael@0: norm16Trie=utrie2_open(0, 0, errorCode); michael@0: errorCode.assertSuccess(); michael@0: michael@0: utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr()); michael@0: michael@0: Decomposer decomposer(*this); michael@0: do { michael@0: decomposer.didDecompose=FALSE; michael@0: utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer); michael@0: } while(decomposer.didDecompose); michael@0: michael@0: BuilderReorderingBuffer buffer; michael@0: int32_t normsLength=utm_countItems(normMem); michael@0: for(int32_t i=1; iminNoNoDelta) { michael@0: fprintf(stderr, michael@0: "gennorm2 error: " michael@0: "data structure overflow, too much mapping composition data\n"); michael@0: exit(U_BUFFER_OVERFLOW_ERROR); michael@0: } michael@0: michael@0: utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr()); michael@0: michael@0: 
setHangulData(); michael@0: michael@0: // Look for the "worst" norm16 value of any supplementary code point michael@0: // corresponding to a lead surrogate, and set it as that surrogate's value. michael@0: // Enables quick check inner loops to look at only code units. michael@0: // michael@0: // We could be more sophisticated: michael@0: // We could collect a bit set for whether there are values in the different michael@0: // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) michael@0: // and select the best value that only breaks the composition and/or decomposition michael@0: // inner loops if necessary. michael@0: // However, that seems like overkill for an optimization for supplementary characters. michael@0: for(UChar lead=0xd800; lead<0xdc00; ++lead) { michael@0: /* Start from the value stored for the lead surrogate itself, then raise it to the maximum over the enumeration for that lead surrogate. */ uint32_t maxValue=utrie2_get32(norm16Trie, lead); michael@0: utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue); michael@0: if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] && michael@0: maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO] michael@0: ) { michael@0: // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0. michael@0: // Otherwise it might end up at something like JAMO_VT which stays in michael@0: // the inner decomposition quick check loop. michael@0: maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1; michael@0: } michael@0: utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode); michael@0: } michael@0: michael@0: // Adjust supplementary minimum code points to break quick check loops at their lead surrogates. michael@0: // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate) michael@0: // which is harmless. michael@0: // As a result, the minimum code points are always BMP code points. 
michael@0: int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; michael@0: if(minCP>=0x10000) { michael@0: indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); michael@0: } michael@0: minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; michael@0: if(minCP>=0x10000) { michael@0: indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); michael@0: } michael@0: } michael@0: michael@0: /* Runs processData(), freezes and serializes norm16Trie, fills in the offset indexes, optionally prints size statistics, and writes indexes, trie bytes, extraData and smallFCD to filename through the udata writer. Exits the tool when serialization, file creation or writing fails, or when the written size differs from the computed total. NOTE(review): the element type of the LocalArray declaration is missing from this copy. */ void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { michael@0: processData(); michael@0: michael@0: IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); michael@0: utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode); michael@0: /* Preflight call with a NULL buffer: U_BUFFER_OVERFLOW_ERROR is the expected outcome and anything else is treated as failure. */ int32_t norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode); michael@0: if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { michael@0: fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n", michael@0: errorCode.errorName()); michael@0: exit(errorCode.reset()); michael@0: } michael@0: errorCode.reset(); michael@0: LocalArray norm16TrieBytes(new uint8_t[norm16TrieLength]); michael@0: utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode); michael@0: errorCode.assertSuccess(); michael@0: michael@0: /* Byte offsets of the sections in file order: indexes, trie, extra data (two bytes per unit), smallFCD. */ int32_t offset=(int32_t)sizeof(indexes); michael@0: indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; michael@0: offset+=norm16TrieLength; michael@0: indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; michael@0: offset+=extraData.length()*2; michael@0: indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset; michael@0: offset+=sizeof(smallFCD); michael@0: int32_t totalSize=offset; michael@0: for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) { michael@0: indexes[i]=totalSize; michael@0: } michael@0: michael@0: if(beVerbose) { michael@0: printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength); michael@0: printf("size of 16-bit extra data: %5ld uint16_t\n", 
(long)extraData.length()); michael@0: printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD)); michael@0: printf("size of binary data file contents: %5ld bytes\n", (long)totalSize); michael@0: printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]); michael@0: printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]); michael@0: printf("minYesNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]); michael@0: printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]); michael@0: printf("minNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]); michael@0: printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]); michael@0: printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]); michael@0: } michael@0: michael@0: /* When unicodeVersion is still all zero, fall back to U_UNICODE_VERSION before copying it into the data header. */ UVersionInfo nullVersion={ 0, 0, 0, 0 }; michael@0: if(0==memcmp(nullVersion, unicodeVersion, 4)) { michael@0: u_versionFromString(unicodeVersion, U_UNICODE_VERSION); michael@0: } michael@0: memcpy(dataInfo.dataVersion, unicodeVersion, 4); michael@0: UNewDataMemory *pData= michael@0: udata_create(NULL, NULL, filename, &dataInfo, michael@0: haveCopyright ? 
U_COPYRIGHT_STRING : NULL, errorCode); michael@0: if(errorCode.isFailure()) { michael@0: fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n", michael@0: filename, errorCode.errorName()); michael@0: exit(errorCode.reset()); michael@0: } michael@0: udata_writeBlock(pData, indexes, sizeof(indexes)); michael@0: udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength); michael@0: udata_writeUString(pData, extraData.getBuffer(), extraData.length()); michael@0: udata_writeBlock(pData, smallFCD, sizeof(smallFCD)); michael@0: int32_t writtenSize=udata_finish(pData, errorCode); michael@0: if(errorCode.isFailure()) { michael@0: fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName()); michael@0: exit(errorCode.reset()); michael@0: } michael@0: /* Cross-check the size returned by udata_finish() against the total computed from the section offsets. */ if(writtenSize!=totalSize) { michael@0: fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n", michael@0: (long)writtenSize, (long)totalSize); michael@0: exit(U_INTERNAL_PROGRAM_ERROR); michael@0: } michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_NORMALIZATION */ michael@0: michael@0: /* michael@0: * Hey, Emacs, please set the following: michael@0: * michael@0: * Local Variables: michael@0: * indent-tabs-mode: nil michael@0: * End: michael@0: */