intl/icu/source/tools/gennorm2/n2builder.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2009-2012, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  n2builder.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2009nov25
    14 *   created by: Markus W. Scherer
    15 *
    16 * Builds Normalizer2 data and writes a binary .nrm file.
    17 * For the file format see source/common/normalizer2impl.h.
    18 */
    20 #include "unicode/utypes.h"
    21 #include "n2builder.h"
    23 #include <stdio.h>
    24 #include <stdlib.h>
    25 #include <string.h>
    26 #if U_HAVE_STD_STRING
    27 #include <vector>
    28 #endif
    29 #include "unicode/errorcode.h"
    30 #include "unicode/localpointer.h"
    31 #include "unicode/putil.h"
    32 #include "unicode/udata.h"
    33 #include "unicode/uniset.h"
    34 #include "unicode/unistr.h"
    35 #include "unicode/ustring.h"
    36 #include "hash.h"
    37 #include "normalizer2impl.h"
    38 #include "toolutil.h"
    39 #include "unewdata.h"
    40 #include "utrie2.h"
    41 #include "uvectr32.h"
    43 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    45 #if !UCONFIG_NO_NORMALIZATION
    47 /* UDataInfo cf. udata.h */
    48 static UDataInfo dataInfo={
    49     sizeof(UDataInfo),
    50     0,
    52     U_IS_BIG_ENDIAN,
    53     U_CHARSET_FAMILY,
    54     U_SIZEOF_UCHAR,
    55     0,
    57     { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
    58     { 2, 0, 0, 0 },             /* formatVersion */
    59     { 5, 2, 0, 0 }              /* dataVersion (Unicode version) */
    60 };
    62 U_NAMESPACE_BEGIN
    64 class HangulIterator {
    65 public:
    66     struct Range {
    67         UChar32 start, limit;
    68         uint16_t norm16;
    69     };
    71     HangulIterator() : rangeIndex(0) {}
    72     const Range *nextRange() {
    73         if(rangeIndex<LENGTHOF(ranges)) {
    74             return ranges+rangeIndex++;
    75         } else {
    76             return NULL;
    77         }
    78     }
    79     void reset() { rangeIndex=0; }
    80 private:
    81     static const Range ranges[4];
    82     int32_t rangeIndex;
    83 };
    85 const HangulIterator::Range HangulIterator::ranges[4]={
    86     { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 },
    87     { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT },
    88     // JAMO_T_BASE+1: not U+11A7
    89     { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT },
    90     { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 },  // will become minYesNo
    91 };
    93 struct CompositionPair {
    94     CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {}
    95     UChar32 trail, composite;
    96 };
    98 struct Norm {
    99     enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY };
   101     UBool hasMapping() const { return mappingType>REMOVED; }
   103     // Requires hasMapping() and well-formed mapping.
   104     void setMappingCP() {
   105         UChar32 c;
   106         if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) {
   107             mappingCP=c;
   108         } else {
   109             mappingCP=U_SENTINEL;
   110         }
   111     }
   113     const CompositionPair *getCompositionPairs(int32_t &length) const {
   114         if(compositions==NULL) {
   115             length=0;
   116             return NULL;
   117         } else {
   118             length=compositions->size()/2;
   119             return reinterpret_cast<const CompositionPair *>(compositions->getBuffer());
   120         }
   121     }
   123     UnicodeString *mapping;
   124     UnicodeString *rawMapping;  // non-NULL if the mapping is further decomposed
   125     UChar32 mappingCP;  // >=0 if mapping to 1 code point
   126     int32_t mappingPhase;
   127     MappingType mappingType;
   129     UVector32 *compositions;  // (trail, composite) pairs
   130     uint8_t cc;
   131     UBool combinesBack;
   132     UBool hasNoCompBoundaryAfter;
   134     enum OffsetType {
   135         OFFSET_NONE,
   136         // Composition for back-combining character. Allowed, but not normally used.
   137         OFFSET_MAYBE_YES,
   138         // Composition for a starter that does not have a decomposition mapping.
   139         OFFSET_YES_YES,
   140         // Round-trip mapping & composition for a starter.
   141         OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
   142         // Round-trip mapping for a starter that itself does not combine-forward.
   143         OFFSET_YES_NO_MAPPING_ONLY,
   144         // One-way mapping.
   145         OFFSET_NO_NO,
   146         // Delta for an algorithmic one-way mapping.
   147         OFFSET_DELTA
   148     };
   149     enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
   150     int32_t offset;
   151 };
   153 class Normalizer2DBEnumerator {
   154 public:
   155     Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {}
   156     virtual ~Normalizer2DBEnumerator() {}
   157     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0;
   158     Normalizer2DBEnumerator *ptr() { return this; }
   159 protected:
   160     Normalizer2DataBuilder &builder;
   161 };
   163 U_CDECL_BEGIN
   165 static UBool U_CALLCONV
   166 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
   167     return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value);
   168 }
   170 U_CDECL_END
   172 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
   173         phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL) {
   174     memset(unicodeVersion, 0, sizeof(unicodeVersion));
   175     normTrie=utrie2_open(0, 0, &errorCode);
   176     normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
   177     norms=allocNorm();  // unused Norm struct at index 0
   178     memset(indexes, 0, sizeof(indexes));
   179     memset(smallFCD, 0, sizeof(smallFCD));
   180 }
   182 Normalizer2DataBuilder::~Normalizer2DataBuilder() {
   183     utrie2_close(normTrie);
   184     int32_t normsLength=utm_countItems(normMem);
   185     for(int32_t i=1; i<normsLength; ++i) {
   186         delete norms[i].mapping;
   187         delete norms[i].rawMapping;
   188         delete norms[i].compositions;
   189     }
   190     utm_close(normMem);
   191     utrie2_close(norm16Trie);
   192 }
   194 void
   195 Normalizer2DataBuilder::setUnicodeVersion(const char *v) {
   196     UVersionInfo nullVersion={ 0, 0, 0, 0 };
   197     UVersionInfo version;
   198     u_versionFromString(version, v);
   199     if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) &&
   200         0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH)
   201     ) {
   202         char buffer[U_MAX_VERSION_STRING_LENGTH];
   203         u_versionToString(unicodeVersion, buffer);
   204         fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
   205                 buffer, v);
   206         exit(U_ILLEGAL_ARGUMENT_ERROR);
   207     }
   208     memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH);
   209 }
   211 Norm *Normalizer2DataBuilder::allocNorm() {
   212     Norm *p=(Norm *)utm_alloc(normMem);
   213     norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
   214     return p;
   215 }
   217 /* get an existing Norm unit */
   218 Norm *Normalizer2DataBuilder::getNorm(UChar32 c) {
   219     uint32_t i=utrie2_get32(normTrie, c);
   220     if(i==0) {
   221         return NULL;
   222     }
   223     return norms+i;
   224 }
   226 const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const {
   227     return norms[utrie2_get32(normTrie, c)];
   228 }
   230 /*
   231  * get or create a Norm unit;
   232  * get or create the intermediate trie entries for it as well
   233  */
   234 Norm *Normalizer2DataBuilder::createNorm(UChar32 c) {
   235     uint32_t i=utrie2_get32(normTrie, c);
   236     if(i!=0) {
   237         return norms+i;
   238     } else {
   239         /* allocate Norm */
   240         Norm *p=allocNorm();
   241         IcuToolErrorCode errorCode("gennorm2/createNorm()");
   242         utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);
   243         return p;
   244     }
   245 }
   247 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) {
   248     if(p!=NULL) {
   249         if(p->mappingType!=Norm::NONE) {
   250             if( overrideHandling==OVERRIDE_NONE ||
   251                 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)
   252             ) {
   253                 fprintf(stderr,
   254                         "error in gennorm2 phase %d: "
   255                         "not permitted to override mapping for U+%04lX from phase %d\n",
   256                         (int)phase, (long)c, (int)p->mappingPhase);
   257                 exit(U_INVALID_FORMAT_ERROR);
   258             }
   259             delete p->mapping;
   260             p->mapping=NULL;
   261         }
   262         p->mappingPhase=phase;
   263     }
   264     return p;
   265 }
   267 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
   268     overrideHandling=oh;
   269     ++phase;
   270 }
   272 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
   273     createNorm(c)->cc=cc;
   274 }
   276 uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const {
   277     return getNormRef(c).cc;
   278 }
   280 static UBool isWellFormed(const UnicodeString &s) {
   281     UErrorCode errorCode=U_ZERO_ERROR;
   282     u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode);
   283     return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR;
   284 }
   286 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) {
   287     if(!isWellFormed(m)) {
   288         fprintf(stderr,
   289                 "error in gennorm2 phase %d: "
   290                 "illegal one-way mapping from U+%04lX to malformed string\n",
   291                 (int)phase, (long)c);
   292         exit(U_INVALID_FORMAT_ERROR);
   293     }
   294     Norm *p=checkNormForMapping(createNorm(c), c);
   295     p->mapping=new UnicodeString(m);
   296     p->mappingType=Norm::ONE_WAY;
   297     p->setMappingCP();
   298 }
   300 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
   301     if(U_IS_SURROGATE(c)) {
   302         fprintf(stderr,
   303                 "error in gennorm2 phase %d: "
   304                 "illegal round-trip mapping from surrogate code point U+%04lX\n",
   305                 (int)phase, (long)c);
   306         exit(U_INVALID_FORMAT_ERROR);
   307     }
   308     if(!isWellFormed(m)) {
   309         fprintf(stderr,
   310                 "error in gennorm2 phase %d: "
   311                 "illegal round-trip mapping from U+%04lX to malformed string\n",
   312                 (int)phase, (long)c);
   313         exit(U_INVALID_FORMAT_ERROR);
   314     }
   315     int32_t numCP=u_countChar32(m.getBuffer(), m.length());
   316     if(numCP!=2) {
   317         fprintf(stderr,
   318                 "error in gennorm2 phase %d: "
   319                 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
   320                 (int)phase, (long)c, (int)numCP);
   321         exit(U_INVALID_FORMAT_ERROR);
   322     }
   323     Norm *p=checkNormForMapping(createNorm(c), c);
   324     p->mapping=new UnicodeString(m);
   325     p->mappingType=Norm::ROUND_TRIP;
   326     p->mappingCP=U_SENTINEL;
   327 }
   329 void Normalizer2DataBuilder::removeMapping(UChar32 c) {
   330     Norm *p=checkNormForMapping(getNorm(c), c);
   331     if(p!=NULL) {
   332         p->mappingType=Norm::REMOVED;
   333     }
   334 }
   336 class CompositionBuilder : public Normalizer2DBEnumerator {
   337 public:
   338     CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
   339     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
   340         builder.addComposition(start, end, value);
   341         return TRUE;
   342     }
   343 };
   345 void
   346 Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) {
   347     if(norms[value].mappingType==Norm::ROUND_TRIP) {
   348         if(start!=end) {
   349             fprintf(stderr,
   350                     "gennorm2 error: same round-trip mapping for "
   351                     "more than 1 code point U+%04lX..U+%04lX\n",
   352                     (long)start, (long)end);
   353             exit(U_INVALID_FORMAT_ERROR);
   354         }
   355         if(norms[value].cc!=0) {
   356             fprintf(stderr,
   357                     "gennorm2 error: "
   358                     "U+%04lX has a round-trip mapping and ccc!=0, "
   359                     "not possible in Unicode normalization\n",
   360                     (long)start);
   361             exit(U_INVALID_FORMAT_ERROR);
   362         }
   363         // setRoundTripMapping() ensured that there are exactly two code points.
   364         const UnicodeString &m=*norms[value].mapping;
   365         UChar32 lead=m.char32At(0);
   366         UChar32 trail=m.char32At(m.length()-1);
   367         if(getCC(lead)!=0) {
   368             fprintf(stderr,
   369                     "gennorm2 error: "
   370                     "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
   371                     "not possible in Unicode normalization\n",
   372                     (long)start, (long)lead);
   373             exit(U_INVALID_FORMAT_ERROR);
   374         }
   375         // Flag for trailing character.
   376         createNorm(trail)->combinesBack=TRUE;
   377         // Insert (trail, composite) pair into compositions list for the lead character.
   378         IcuToolErrorCode errorCode("gennorm2/addComposition()");
   379         Norm *leadNorm=createNorm(lead);
   380         UVector32 *compositions=leadNorm->compositions;
   381         int32_t i;
   382         if(compositions==NULL) {
   383             compositions=leadNorm->compositions=new UVector32(errorCode);
   384             i=0;  // "insert" the first pair at index 0
   385         } else {
   386             // Insertion sort, and check for duplicate trail characters.
   387             int32_t length;
   388             const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
   389             for(i=0; i<length; ++i) {
   390                 if(trail==pairs[i].trail) {
   391                     fprintf(stderr,
   392                             "gennorm2 error: same round-trip mapping for "
   393                             "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
   394                             (long)start, (long)lead, (long)trail);
   395                     exit(U_INVALID_FORMAT_ERROR);
   396                 }
   397                 if(trail<pairs[i].trail) {
   398                     break;
   399                 }
   400             }
   401         }
   402         compositions->insertElementAt(trail, 2*i, errorCode);
   403         compositions->insertElementAt(start, 2*i+1, errorCode);
   404     }
   405 }
   407 UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm,
   408                                                     uint8_t lowCC, uint8_t highCC) const {
   409     if((highCC-lowCC)>=2) {
   410         int32_t length;
   411         const CompositionPair *pairs=norm.getCompositionPairs(length);
   412         for(int32_t i=0; i<length; ++i) {
   413             uint8_t trailCC=getCC(pairs[i].trail);
   414             if(lowCC<trailCC && trailCC<highCC) {
   415                 return TRUE;
   416             }
   417         }
   418     }
   419     return FALSE;
   420 }
   422 UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const {
   423     int32_t length;
   424     const CompositionPair *pairs=norm.getCompositionPairs(length);
   425     for(int32_t i=0; i<length; ++i) {
   426         if(trail==pairs[i].trail) {
   427             return pairs[i].composite;
   428         }
   429         if(trail<pairs[i].trail) {
   430             break;
   431         }
   432     }
   433     return U_SENTINEL;
   434 }
   436 class Decomposer : public Normalizer2DBEnumerator {
   437 public:
   438     Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {}
   439     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
   440         didDecompose|=builder.decompose(start, end, value);
   441         return TRUE;
   442     }
   443     UBool didDecompose;
   444 };
   446 UBool
   447 Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
   448     if(norms[value].hasMapping()) {
   449         Norm &norm=norms[value];
   450         const UnicodeString &m=*norm.mapping;
   451         UnicodeString *decomposed=NULL;
   452         const UChar *s=m.getBuffer();
   453         int32_t length=m.length();
   454         int32_t prev, i=0;
   455         UChar32 c;
   456         while(i<length) {
   457             prev=i;
   458             U16_NEXT(s, i, length, c);
   459             if(start<=c && c<=end) {
   460                 fprintf(stderr,
   461                         "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
   462                         (long)c);
   463                 exit(U_INVALID_FORMAT_ERROR);
   464             }
   465             const Norm &cNorm=getNormRef(c);
   466             if(cNorm.hasMapping()) {
   467                 if(norm.mappingType==Norm::ROUND_TRIP) {
   468                     if(prev==0) {
   469                         if(cNorm.mappingType!=Norm::ROUND_TRIP) {
   470                             fprintf(stderr,
   471                                     "gennorm2 error: "
   472                                     "U+%04lX's round-trip mapping's starter "
   473                                     "U+%04lX one-way-decomposes, "
   474                                     "not possible in Unicode normalization\n",
   475                                     (long)start, (long)c);
   476                             exit(U_INVALID_FORMAT_ERROR);
   477                         }
   478                         uint8_t myTrailCC=getCC(m.char32At(i));
   479                         UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
   480                         uint8_t cTrailCC=getCC(cTrailChar);
   481                         if(cTrailCC>myTrailCC) {
   482                             fprintf(stderr,
   483                                     "gennorm2 error: "
   484                                     "U+%04lX's round-trip mapping's starter "
   485                                     "U+%04lX decomposes and the "
   486                                     "inner/earlier tccc=%hu > outer/following tccc=%hu, "
   487                                     "not possible in Unicode normalization\n",
   488                                     (long)start, (long)c,
   489                                     (short)cTrailCC, (short)myTrailCC);
   490                             exit(U_INVALID_FORMAT_ERROR);
   491                         }
   492                     } else {
   493                         fprintf(stderr,
   494                                 "gennorm2 error: "
   495                                 "U+%04lX's round-trip mapping's non-starter "
   496                                 "U+%04lX decomposes, "
   497                                 "not possible in Unicode normalization\n",
   498                                 (long)start, (long)c);
   499                         exit(U_INVALID_FORMAT_ERROR);
   500                     }
   501                 }
   502                 if(decomposed==NULL) {
   503                     decomposed=new UnicodeString(m, 0, prev);
   504                 }
   505                 decomposed->append(*cNorm.mapping);
   506             } else if(Hangul::isHangul(c)) {
   507                 UChar buffer[3];
   508                 int32_t hangulLength=Hangul::decompose(c, buffer);
   509                 if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
   510                     fprintf(stderr,
   511                             "gennorm2 error: "
   512                             "U+%04lX's round-trip mapping's non-starter "
   513                             "U+%04lX decomposes, "
   514                             "not possible in Unicode normalization\n",
   515                             (long)start, (long)c);
   516                     exit(U_INVALID_FORMAT_ERROR);
   517                 }
   518                 if(decomposed==NULL) {
   519                     decomposed=new UnicodeString(m, 0, prev);
   520                 }
   521                 decomposed->append(buffer, hangulLength);
   522             } else if(decomposed!=NULL) {
   523                 decomposed->append(m, prev, i-prev);
   524             }
   525         }
   526         if(decomposed!=NULL) {
   527             if(norm.rawMapping==NULL) {
   528                 // Remember the original mapping when decomposing recursively.
   529                 norm.rawMapping=norm.mapping;
   530             } else {
   531                 delete norm.mapping;
   532             }
   533             norm.mapping=decomposed;
   534             // Not  norm.setMappingCP();  because the original mapping
   535             // is most likely to be encodable as a delta.
   536             return TRUE;
   537         }
   538     }
   539     return FALSE;
   540 }
   542 class BuilderReorderingBuffer {
   543 public:
   544     BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {}
   545     void reset() {
   546         fLength=0;
   547         fLastStarterIndex=-1;
   548         fDidReorder=FALSE;
   549     }
   550     int32_t length() const { return fLength; }
   551     UBool isEmpty() const { return fLength==0; }
   552     int32_t lastStarterIndex() const { return fLastStarterIndex; }
   553     UChar32 charAt(int32_t i) const { return fArray[i]>>8; }
   554     uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; }
   555     UBool didReorder() const { return fDidReorder; }
   556     void append(UChar32 c, uint8_t cc) {
   557         if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
   558             if(cc==0) {
   559                 fLastStarterIndex=fLength;
   560             }
   561             fArray[fLength++]=(c<<8)|cc;
   562             return;
   563         }
   564         // Let this character bubble back to its canonical order.
   565         int32_t i=fLength-1;
   566         while(i>fLastStarterIndex && ccAt(i)>cc) {
   567             --i;
   568         }
   569         ++i;  // after the last starter or prevCC<=cc
   570         // Move this and the following characters forward one to make space.
   571         for(int32_t j=fLength; i<j; --j) {
   572             fArray[j]=fArray[j-1];
   573         }
   574         fArray[i]=(c<<8)|cc;
   575         ++fLength;
   576         fDidReorder=TRUE;
   577     }
   578     void toString(UnicodeString &dest) {
   579         dest.remove();
   580         for(int32_t i=0; i<fLength; ++i) {
   581             dest.append(charAt(i));
   582         }
   583     }
   584     void setComposite(UChar32 composite, int32_t combMarkIndex) {
   585         fArray[fLastStarterIndex]=composite<<8;
   586         // Remove the combining mark that contributed to the composite.
   587         --fLength;
   588         while(combMarkIndex<fLength) {
   589             fArray[combMarkIndex]=fArray[combMarkIndex+1];
   590             ++combMarkIndex;
   591         }
   592     }
   593 private:
   594     int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
   595     int32_t fLength;
   596     int32_t fLastStarterIndex;
   597     UBool fDidReorder;
   598 };
   600 void
   601 Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) {
   602     UnicodeString &m=*p->mapping;
   603     int32_t length=m.length();
   604     if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
   605         return;  // writeMapping() will complain about it and print the code point.
   606     }
   607     const UChar *s=m.getBuffer();
   608     int32_t i=0;
   609     UChar32 c;
   610     while(i<length) {
   611         U16_NEXT(s, i, length, c);
   612         buffer.append(c, getCC(c));
   613     }
   614     if(buffer.didReorder()) {
   615         buffer.toString(m);
   616     }
   617 }
   619 /*
   620  * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter().
   621  * A starter character with a mapping does not have a composition boundary after it
   622  * if the character itself combines-forward (which is tested by the caller of this function),
   623  * or it is deleted (mapped to the empty string),
   624  * or its mapping contains no starter,
   625  * or the last starter combines-forward.
   626  */
   627 UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {
   628     if(buffer.isEmpty()) {
   629         return TRUE;  // maps-to-empty-string is no boundary of any kind
   630     }
   631     int32_t lastStarterIndex=buffer.lastStarterIndex();
   632     if(lastStarterIndex<0) {
   633         return TRUE;  // no starter
   634     }
   635     UChar32 starter=buffer.charAt(lastStarterIndex);
   636     if( Hangul::isJamoL(starter) ||
   637         (Hangul::isJamoV(starter) &&
   638          0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))
   639     ) {
   640         // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
   641         // otherwise it is blocked.
   642         return lastStarterIndex==buffer.length()-1;
   643     }
   644     // Note: There can be no Hangul syllable in the fully decomposed mapping.
   645     const Norm *starterNorm=&getNormRef(starter);
   646     if(starterNorm->compositions==NULL) {
   647         return FALSE;  // the last starter does not combine forward
   648     }
   649     // Compose as far as possible, and see if further compositions are possible.
   650     uint8_t prevCC=0;
   651     for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) {
   652         uint8_t cc=buffer.ccAt(combMarkIndex);  // !=0 because after last starter
   653         if(combinesWithCCBetween(*starterNorm, prevCC, cc)) {
   654             return TRUE;
   655         }
   656         if( prevCC<cc &&
   657             (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0
   658         ) {
   659             buffer.setComposite(starter, combMarkIndex);
   660             starterNorm=&getNormRef(starter);
   661             if(starterNorm->compositions==NULL) {
   662                 return FALSE;  // the composite does not combine further
   663             }
   664         } else {
   665             prevCC=cc;
   666             ++combMarkIndex;
   667         }
   668     }
   669     // TRUE if the final, forward-combining starter is at the end.
   670     return prevCC==0;
   671 }
   673 // Requires p->hasMapping().
   674 // Returns the offset of the "first unit" from the beginning of the extraData for c.
   675 // That is the same as the length of the optional data for the raw mapping and the ccc/lccc word.
   676 int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) {
   677     UnicodeString &m=*p->mapping;
   678     int32_t length=m.length();
   679     if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
   680         fprintf(stderr,
   681                 "gennorm2 error: "
   682                 "mapping for U+%04lX longer than maximum of %d\n",
   683                 (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
   684         exit(U_INVALID_FORMAT_ERROR);
   685     }
   686     int32_t leadCC, trailCC;
   687     if(length==0) {
   688         leadCC=trailCC=0;
   689     } else {
   690         leadCC=getCC(m.char32At(0));
   691         trailCC=getCC(m.char32At(length-1));
   692     }
   693     if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) {
   694         fprintf(stderr,
   695                 "gennorm2 error: "
   696                 "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
   697                 (long)c);
   698         exit(U_INVALID_FORMAT_ERROR);
   699     }
   700     // Write small-FCD data.
   701     if((leadCC|trailCC)!=0) {
   702         UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
   703         smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
   704     }
   705     // Write the mapping & raw mapping extraData.
   706     int32_t firstUnit=length|(trailCC<<8);
   707     int32_t preMappingLength=0;
   708     if(p->rawMapping!=NULL) {
   709         UnicodeString &rm=*p->rawMapping;
   710         int32_t rmLength=rm.length();
   711         if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
   712             fprintf(stderr,
   713                     "gennorm2 error: "
   714                     "raw mapping for U+%04lX longer than maximum of %d\n",
   715                     (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
   716             exit(U_INVALID_FORMAT_ERROR);
   717         }
   718         UChar rm0=rm.charAt(0);
   719         if( rmLength==length-1 &&
   720             // 99: overlong substring lengths get pinned to remainder lengths anyway
   721             0==rm.compare(1, 99, m, 2, 99) &&
   722             rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
   723         ) {
   724             // Compression:
   725             // rawMapping=rm0+mapping.substring(2) -> store only rm0
   726             //
   727             // The raw mapping is the same as the final mapping after replacing
   728             // the final mapping's first two code units with the raw mapping's first one.
   729             // In this case, we store only that first unit, rm0.
   730             // This helps with a few hundred mappings.
   731             dataString.append(rm0);
   732             preMappingLength=1;
   733         } else {
   734             // Store the raw mapping with its length.
   735             dataString.append(rm);
   736             dataString.append((UChar)rmLength);
   737             preMappingLength=rmLength+1;
   738         }
   739         firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
   740     }
   741     int32_t cccLccc=p->cc|(leadCC<<8);
   742     if(cccLccc!=0) {
   743         dataString.append((UChar)cccLccc);
   744         ++preMappingLength;
   745         firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
   746     }
   747     if(p->hasNoCompBoundaryAfter) {
   748         firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
   749     }
   750     dataString.append((UChar)firstUnit);
   751     dataString.append(m);
   752     return preMappingLength;
   753 }
   755 // Requires p->compositions!=NULL.
   756 void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) {
   757     if(p->cc!=0) {
   758         fprintf(stderr,
   759                 "gennorm2 error: "
   760                 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
   761                 (long)c);
   762         exit(U_INVALID_FORMAT_ERROR);
   763     }
   764     int32_t length;
   765     const CompositionPair *pairs=p->getCompositionPairs(length);
   766     for(int32_t i=0; i<length; ++i) {
   767         const CompositionPair &pair=pairs[i];
   768         // 22 bits for the composite character and whether it combines forward.
   769         UChar32 compositeAndFwd=pair.composite<<1;
   770         if(getNormRef(pair.composite).compositions!=NULL) {
   771             compositeAndFwd|=1;  // The composite character also combines-forward.
   772         }
   773         // Encode most pairs in two units and some in three.
   774         int32_t firstUnit, secondUnit, thirdUnit;
   775         if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
   776             if(compositeAndFwd<=0xffff) {
   777                 firstUnit=pair.trail<<1;
   778                 secondUnit=compositeAndFwd;
   779                 thirdUnit=-1;
   780             } else {
   781                 firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
   782                 secondUnit=compositeAndFwd>>16;
   783                 thirdUnit=compositeAndFwd;
   784             }
   785         } else {
   786             firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
   787                        (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
   788                       Normalizer2Impl::COMP_1_TRIPLE;
   789             secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
   790                        (compositeAndFwd>>16);
   791             thirdUnit=compositeAndFwd;
   792         }
   793         // Set the high bit of the first unit if this is the last composition pair.
   794         if(i==(length-1)) {
   795             firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
   796         }
   797         dataString.append((UChar)firstUnit).append((UChar)secondUnit);
   798         if(thirdUnit>=0) {
   799             dataString.append((UChar)thirdUnit);
   800         }
   801     }
   802 }
   804 class ExtraDataWriter : public Normalizer2DBEnumerator {
   805 public:
   806     ExtraDataWriter(Normalizer2DataBuilder &b) :
   807         Normalizer2DBEnumerator(b),
   808         yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
   809         yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {}  // 0=Hangul, 1=start of normal data
   810     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
   811         if(value!=0) {
   812             if(start!=end) {
   813                 fprintf(stderr,
   814                         "gennorm2 error: unexpected shared data for "
   815                         "multiple code points U+%04lX..U+%04lX\n",
   816                         (long)start, (long)end);
   817                 exit(U_INTERNAL_PROGRAM_ERROR);
   818             }
   819             builder.writeExtraData(start, value, *this);
   820         }
   821         return TRUE;
   822     }
   823     UnicodeString maybeYesCompositions;
   824     UnicodeString yesYesCompositions;
   825     UnicodeString yesNoMappingsAndCompositions;
   826     UnicodeString yesNoMappingsOnly;
   827     UnicodeString noNoMappings;
   828     Hashtable previousNoNoMappings;  // If constructed in runtime code, pass in UErrorCode.
   829 };
   831 void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) {
   832     Norm *p=norms+value;
   833     if(!p->hasMapping()) {
   834         // Write small-FCD data.
   835         // There is similar code in writeMapping() for characters that do have a mapping.
   836         if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) {
   837             fprintf(stderr,
   838                     "gennorm2 error: "
   839                     "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
   840                     (long)c);
   841             exit(U_INVALID_FORMAT_ERROR);
   842         }
   843         if(p->cc!=0) {
   844             UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
   845             smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
   846         }
   847     }
   848     if(p->combinesBack) {
   849         if(p->hasMapping()) {
   850             fprintf(stderr,
   851                     "gennorm2 error: "
   852                     "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
   853                     (long)c);
   854             exit(U_INVALID_FORMAT_ERROR);
   855         }
   856         if(p->compositions!=NULL) {
   857             p->offset=
   858                 (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
   859                 Norm::OFFSET_MAYBE_YES;
   860             writeCompositions(c, p, writer.maybeYesCompositions);
   861         }
   862     } else if(!p->hasMapping()) {
   863         if(p->compositions!=NULL) {
   864             p->offset=
   865                 (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
   866                 Norm::OFFSET_YES_YES;
   867             writeCompositions(c, p, writer.yesYesCompositions);
   868         }
   869     } else if(p->mappingType==Norm::ROUND_TRIP) {
   870         if(p->compositions!=NULL) {
   871             int32_t offset=writer.yesNoMappingsAndCompositions.length()+
   872                            writeMapping(c, p, writer.yesNoMappingsAndCompositions);
   873             p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
   874             writeCompositions(c, p, writer.yesNoMappingsAndCompositions);
   875         } else {
   876             int32_t offset=writer.yesNoMappingsOnly.length()+
   877                            writeMapping(c, p, writer.yesNoMappingsOnly);
   878             p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
   879         }
   880     } else /* one-way */ {
   881         if(p->compositions!=NULL) {
   882             fprintf(stderr,
   883                     "gennorm2 error: "
   884                     "U+%04lX combines-forward and has a one-way mapping, "
   885                     "not possible in Unicode normalization\n",
   886                     (long)c);
   887             exit(U_INVALID_FORMAT_ERROR);
   888         }
   889         if(p->cc==0 && optimization!=OPTIMIZE_FAST) {
   890             // Try a compact, algorithmic encoding.
   891             // Only for ccc=0, because we can't store additional information
   892             // and we do not recursively follow an algorithmic encoding for access to the ccc.
   893             //
   894             // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
   895             // if the mappingCP decomposes further, to ensure that there is a place to store it.
   896             // We want to see that the final mapping does not have exactly 1 code point,
   897             // or else we would have to recursively ensure that the final mapping is stored
   898             // in normal extraData.
   899             if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) {
   900                 int32_t delta=p->mappingCP-c;
   901                 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
   902                     p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
   903                 }
   904             }
   905         }
   906         if(p->offset==0) {
   907             int32_t oldNoNoLength=writer.noNoMappings.length();
   908             int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings);
   909             UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength);
   910             int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping);
   911             if(previousOffset!=0) {
   912                 // Duplicate, remove the new units and point to the old ones.
   913                 writer.noNoMappings.truncate(oldNoNoLength);
   914                 p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
   915             } else {
   916                 // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
   917                 IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
   918                 writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode);
   919                 p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
   920             }
   921         }
   922     }
   923 }
   925 class Norm16Writer : public Normalizer2DBEnumerator {
   926 public:
   927     Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
   928     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
   929         builder.writeNorm16(start, end, value);
   930         return TRUE;
   931     }
   932 };
   934 void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) {
   935     if(value!=0) {
   936         const Norm *p=norms+value;
   937         int32_t offset=p->offset>>Norm::OFFSET_SHIFT;
   938         int32_t norm16=0;
   939         UBool isDecompNo=FALSE;
   940         UBool isCompNoMaybe=FALSE;
   941         switch(p->offset&Norm::OFFSET_MASK) {
   942         case Norm::OFFSET_NONE:
   943             // No mapping, no compositions list.
   944             if(p->combinesBack) {
   945                 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc;
   946                 isDecompNo=(UBool)(p->cc!=0);
   947                 isCompNoMaybe=TRUE;
   948             } else if(p->cc!=0) {
   949                 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc;
   950                 isDecompNo=isCompNoMaybe=TRUE;
   951             }
   952             break;
   953         case Norm::OFFSET_MAYBE_YES:
   954             norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
   955             isCompNoMaybe=TRUE;
   956             break;
   957         case Norm::OFFSET_YES_YES:
   958             norm16=offset;
   959             break;
   960         case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
   961             norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
   962             isDecompNo=TRUE;
   963             break;
   964         case Norm::OFFSET_YES_NO_MAPPING_ONLY:
   965             norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
   966             isDecompNo=TRUE;
   967             break;
   968         case Norm::OFFSET_NO_NO:
   969             norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
   970             isDecompNo=isCompNoMaybe=TRUE;
   971             break;
   972         case Norm::OFFSET_DELTA:
   973             norm16=getCenterNoNoDelta()+offset;
   974             isDecompNo=isCompNoMaybe=TRUE;
   975             break;
   976         default:  // Should not occur.
   977             exit(U_INTERNAL_PROGRAM_ERROR);
   978         }
   979         IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
   980         utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
   981         if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
   982             indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
   983         }
   984         if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
   985             indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
   986         }
   987     }
   988 }
   990 void Normalizer2DataBuilder::setHangulData() {
   991     HangulIterator hi;
   992     const HangulIterator::Range *range;
   993     // Check that none of the Hangul/Jamo code points have data.
   994     while((range=hi.nextRange())!=NULL) {
   995         for(UChar32 c=range->start; c<range->limit; ++c) {
   996             if(utrie2_get32(norm16Trie, c)!=0) {
   997                 fprintf(stderr,
   998                         "gennorm2 error: "
   999                         "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
  1000                         (long)c);
  1001                 exit(U_INVALID_FORMAT_ERROR);
  1005     // Set data for algorithmic runtime handling.
  1006     IcuToolErrorCode errorCode("gennorm2/setHangulData()");
  1007     hi.reset();
  1008     while((range=hi.nextRange())!=NULL) {
  1009         uint16_t norm16=range->norm16;
  1010         if(norm16==0) {
  1011             norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO];  // Hangul LV/LVT encoded as minYesNo
  1012             if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
  1013                 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start;
  1015         } else {
  1016             if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {  // Jamo V/T are maybeYes
  1017                 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start;
  1020         utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode);
  1021         errorCode.assertSuccess();
  1025 U_CDECL_BEGIN
  1027 static UBool U_CALLCONV
  1028 enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
  1029     uint32_t *pMaxValue=(uint32_t *)context;
  1030     if(value>*pMaxValue) {
  1031         *pMaxValue=value;
  1033     return TRUE;
  1036 U_CDECL_END
  1038 void Normalizer2DataBuilder::processData() {
  1039     IcuToolErrorCode errorCode("gennorm2/processData()");
  1040     norm16Trie=utrie2_open(0, 0, errorCode);
  1041     errorCode.assertSuccess();
  1043     utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr());
  1045     Decomposer decomposer(*this);
  1046     do {
  1047         decomposer.didDecompose=FALSE;
  1048         utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer);
  1049     } while(decomposer.didDecompose);
  1051     BuilderReorderingBuffer buffer;
  1052     int32_t normsLength=utm_countItems(normMem);
  1053     for(int32_t i=1; i<normsLength; ++i) {
  1054         // Set the hasNoCompBoundaryAfter flag for use by the last code branch
  1055         // in Normalizer2Impl::hasCompBoundaryAfter().
  1056         // For details see the comments on hasNoCompBoundaryAfter(buffer).
  1057         const Norm &norm=norms[i];
  1058         if(norm.hasMapping()) {
  1059             if(norm.compositions!=NULL) {
  1060                 norms[i].hasNoCompBoundaryAfter=TRUE;
  1061             } else {
  1062                 buffer.reset();
  1063                 reorder(norms+i, buffer);
  1064                 norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
  1069     indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
  1070     indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
  1072     ExtraDataWriter extraDataWriter(*this);
  1073     utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter);
  1075     extraData=extraDataWriter.maybeYesCompositions;
  1076     extraData.append(extraDataWriter.yesYesCompositions).
  1077               append(extraDataWriter.yesNoMappingsAndCompositions).
  1078               append(extraDataWriter.yesNoMappingsOnly).
  1079               append(extraDataWriter.noNoMappings);
  1080     // Pad to even length for 4-byte alignment of following data.
  1081     if(extraData.length()&1) {
  1082         extraData.append((UChar)0);
  1085     indexes[Normalizer2Impl::IX_MIN_YES_NO]=
  1086         extraDataWriter.yesYesCompositions.length();
  1087     indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
  1088         indexes[Normalizer2Impl::IX_MIN_YES_NO]+
  1089         extraDataWriter.yesNoMappingsAndCompositions.length();
  1090     indexes[Normalizer2Impl::IX_MIN_NO_NO]=
  1091         indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
  1092         extraDataWriter.yesNoMappingsOnly.length();
  1093     indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
  1094         indexes[Normalizer2Impl::IX_MIN_NO_NO]+
  1095         extraDataWriter.noNoMappings.length();
  1096     indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
  1097         Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
  1098         extraDataWriter.maybeYesCompositions.length();
  1100     int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
  1101     if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
  1102         fprintf(stderr,
  1103                 "gennorm2 error: "
  1104                 "data structure overflow, too much mapping composition data\n");
  1105         exit(U_BUFFER_OVERFLOW_ERROR);
  1108     utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr());
  1110     setHangulData();
  1112     // Look for the "worst" norm16 value of any supplementary code point
  1113     // corresponding to a lead surrogate, and set it as that surrogate's value.
  1114     // Enables quick check inner loops to look at only code units.
  1115     //
  1116     // We could be more sophisticated:
  1117     // We could collect a bit set for whether there are values in the different
  1118     // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
  1119     // and select the best value that only breaks the composition and/or decomposition
  1120     // inner loops if necessary.
  1121     // However, that seems like overkill for an optimization for supplementary characters.
  1122     for(UChar lead=0xd800; lead<0xdc00; ++lead) {
  1123         uint32_t maxValue=utrie2_get32(norm16Trie, lead);
  1124         utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue);
  1125         if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
  1126             maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]
  1127         ) {
  1128             // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
  1129             // Otherwise it might end up at something like JAMO_VT which stays in
  1130             // the inner decomposition quick check loop.
  1131             maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
  1133         utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode);
  1136     // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
  1137     // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
  1138     // which is harmless.
  1139     // As a result, the minimum code points are always BMP code points.
  1140     int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP];
  1141     if(minCP>=0x10000) {
  1142         indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
  1144     minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP];
  1145     if(minCP>=0x10000) {
  1146         indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
  1150 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
  1151     processData();
  1153     IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
  1154     utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode);
  1155     int32_t norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode);
  1156     if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) {
  1157         fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n",
  1158                 errorCode.errorName());
  1159         exit(errorCode.reset());
  1161     errorCode.reset();
  1162     LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
  1163     utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
  1164     errorCode.assertSuccess();
  1166     int32_t offset=(int32_t)sizeof(indexes);
  1167     indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
  1168     offset+=norm16TrieLength;
  1169     indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
  1170     offset+=extraData.length()*2;
  1171     indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset;
  1172     offset+=sizeof(smallFCD);
  1173     int32_t totalSize=offset;
  1174     for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
  1175         indexes[i]=totalSize;
  1178     if(beVerbose) {
  1179         printf("size of normalization trie:         %5ld bytes\n", (long)norm16TrieLength);
  1180         printf("size of 16-bit extra data:          %5ld uint16_t\n", (long)extraData.length());
  1181         printf("size of small-FCD data:             %5ld bytes\n", (long)sizeof(smallFCD));
  1182         printf("size of binary data file contents:  %5ld bytes\n", (long)totalSize);
  1183         printf("minDecompNoCodePoint:              U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
  1184         printf("minCompNoMaybeCodePoint:           U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
  1185         printf("minYesNo:                          0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
  1186         printf("minYesNoMappingsOnly:              0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
  1187         printf("minNoNo:                           0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
  1188         printf("limitNoNo:                         0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
  1189         printf("minMaybeYes:                       0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
  1192     UVersionInfo nullVersion={ 0, 0, 0, 0 };
  1193     if(0==memcmp(nullVersion, unicodeVersion, 4)) {
  1194         u_versionFromString(unicodeVersion, U_UNICODE_VERSION);
  1196     memcpy(dataInfo.dataVersion, unicodeVersion, 4);
  1197     UNewDataMemory *pData=
  1198         udata_create(NULL, NULL, filename, &dataInfo,
  1199                      haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode);
  1200     if(errorCode.isFailure()) {
  1201         fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
  1202                 filename, errorCode.errorName());
  1203         exit(errorCode.reset());
  1205     udata_writeBlock(pData, indexes, sizeof(indexes));
  1206     udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength);
  1207     udata_writeUString(pData, extraData.getBuffer(), extraData.length());
  1208     udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
  1209     int32_t writtenSize=udata_finish(pData, errorCode);
  1210     if(errorCode.isFailure()) {
  1211         fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
  1212         exit(errorCode.reset());
  1214     if(writtenSize!=totalSize) {
  1215         fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
  1216             (long)writtenSize, (long)totalSize);
  1217         exit(U_INTERNAL_PROGRAM_ERROR);
  1221 U_NAMESPACE_END
  1223 #endif /* #if !UCONFIG_NO_NORMALIZATION */
  1225 /*
  1226  * Hey, Emacs, please set the following:
  1228  * Local Variables:
  1229  * indent-tabs-mode: nil
  1230  * End:
  1231  */

mercurial