intl/icu/source/tools/gennorm2/n2builder.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/tools/gennorm2/n2builder.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1231 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2009-2012, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  n2builder.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2009nov25
    1.17 +*   created by: Markus W. Scherer
    1.18 +*
    1.19 +* Builds Normalizer2 data and writes a binary .nrm file.
    1.20 +* For the file format see source/common/normalizer2impl.h.
    1.21 +*/
    1.22 +
    1.23 +#include "unicode/utypes.h"
    1.24 +#include "n2builder.h"
    1.25 +
    1.26 +#include <stdio.h>
    1.27 +#include <stdlib.h>
    1.28 +#include <string.h>
    1.29 +#if U_HAVE_STD_STRING
    1.30 +#include <vector>
    1.31 +#endif
    1.32 +#include "unicode/errorcode.h"
    1.33 +#include "unicode/localpointer.h"
    1.34 +#include "unicode/putil.h"
    1.35 +#include "unicode/udata.h"
    1.36 +#include "unicode/uniset.h"
    1.37 +#include "unicode/unistr.h"
    1.38 +#include "unicode/ustring.h"
    1.39 +#include "hash.h"
    1.40 +#include "normalizer2impl.h"
    1.41 +#include "toolutil.h"
    1.42 +#include "unewdata.h"
    1.43 +#include "utrie2.h"
    1.44 +#include "uvectr32.h"
    1.45 +
    1.46 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    1.47 +
    1.48 +#if !UCONFIG_NO_NORMALIZATION
    1.49 +
    1.50 +/* UDataInfo cf. udata.h */
    1.51 +static UDataInfo dataInfo={  // File-local, non-const header record, initialized positionally; field meanings come from udata.h (not visible in this file).
    1.52 +    sizeof(UDataInfo),
    1.53 +    0,
    1.54 +
    1.55 +    U_IS_BIG_ENDIAN,
    1.56 +    U_CHARSET_FAMILY,
    1.57 +    U_SIZEOF_UCHAR,
    1.58 +    0,
    1.59 +
    1.60 +    { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
    1.61 +    { 2, 0, 0, 0 },             /* formatVersion */
    1.62 +    { 5, 2, 0, 0 }              /* dataVersion (Unicode version) */
    1.63 +};
    1.64 +
    1.65 +U_NAMESPACE_BEGIN
    1.66 +
    1.67 +class HangulIterator {  // Hands out, one at a time, the entries of a fixed four-element table of Jamo/Hangul code point ranges.
    1.68 +public:
    1.69 +    struct Range {  // One table entry.
    1.70 +        UChar32 start, limit;  // the table below computes limit as base+count, i.e. one past the last code point
    1.71 +        uint16_t norm16;  // value stored with the range; how it is consumed is outside this class
    1.72 +    };
    1.73 +
    1.74 +    HangulIterator() : rangeIndex(0) {}  // starts before the first entry
    1.75 +    const Range *nextRange() {  // Returns the next entry and advances; returns NULL after the last entry.
    1.76 +        if(rangeIndex<LENGTHOF(ranges)) {
    1.77 +            return ranges+rangeIndex++;  // post-increment: return the current entry, then step
    1.78 +        } else {
    1.79 +            return NULL;
    1.80 +        }
    1.81 +    }
    1.82 +    void reset() { rangeIndex=0; }  // restart the iteration from the first entry
    1.83 +private:
    1.84 +    static const Range ranges[4];  // defined right after the class
    1.85 +    int32_t rangeIndex;  // index of the entry that nextRange() returns next
    1.86 +};
    1.87 +
    1.88 +const HangulIterator::Range HangulIterator::ranges[4]={  // { start, limit, norm16 } for Jamo L, Jamo V, Jamo T and Hangul syllables, in that order.
    1.89 +    { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 },
    1.90 +    { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT },
    1.91 +    // JAMO_T_BASE+1: not U+11A7
    1.92 +    { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT },
    1.93 +    { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 },  // will become minYesNo
    1.94 +};
    1.95 +
    1.96 +struct CompositionPair {  // (trail, composite) pair. Norm::getCompositionPairs() reinterprets a UVector32 buffer as an array of these, so the layout must stay two UChar32 fields in this order.
    1.97 +    CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {}
    1.98 +    UChar32 trail, composite;  // trail: second code point of a round-trip mapping; composite: the code point that has that mapping
    1.99 +};
   1.100 +
   1.101 +struct Norm {  // Per-code-point normalization properties collected by the builder; instances live in the builder's norms array.
   1.102 +    enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY };  // order matters: hasMapping() relies on ROUND_TRIP and ONE_WAY sorting after REMOVED
   1.103 +
   1.104 +    UBool hasMapping() const { return mappingType>REMOVED; }  // TRUE only for ROUND_TRIP and ONE_WAY
   1.105 +
   1.106 +    // Requires hasMapping() and well-formed mapping.
   1.107 +    void setMappingCP() {  // Sets mappingCP to the mapping's only code point, or to U_SENTINEL if the mapping is empty or longer.
   1.108 +        UChar32 c;
   1.109 +        if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) {  // c is assigned inside the condition; the length test means "exactly one code point"
   1.110 +            mappingCP=c;
   1.111 +        } else {
   1.112 +            mappingCP=U_SENTINEL;
   1.113 +        }
   1.114 +    }
   1.115 +
   1.116 +    const CompositionPair *getCompositionPairs(int32_t &length) const {  // Returns the compositions list viewed as pairs and sets length to the number of pairs; NULL and 0 if there is no list.
   1.117 +        if(compositions==NULL) {
   1.118 +            length=0;
   1.119 +            return NULL;
   1.120 +        } else {
   1.121 +            length=compositions->size()/2;  // the vector stores two integers per pair
   1.122 +            return reinterpret_cast<const CompositionPair *>(compositions->getBuffer());  // flat buffer reinterpreted as CompositionPair[]
   1.123 +        }
   1.124 +    }
   1.125 +
   1.126 +    UnicodeString *mapping;
   1.127 +    UnicodeString *rawMapping;  // non-NULL if the mapping is further decomposed
   1.128 +    UChar32 mappingCP;  // >=0 if mapping to 1 code point
   1.129 +    int32_t mappingPhase;  // builder phase recorded by checkNormForMapping()
   1.130 +    MappingType mappingType;
   1.131 +
   1.132 +    UVector32 *compositions;  // (trail, composite) pairs
   1.133 +    uint8_t cc;  // combining class, written by setCC()
   1.134 +    UBool combinesBack;  // set to TRUE by addComposition() for the trail character of a round-trip mapping
   1.135 +    UBool hasNoCompBoundaryAfter;
   1.136 +
   1.137 +    enum OffsetType {
   1.138 +        OFFSET_NONE,
   1.139 +        // Composition for back-combining character. Allowed, but not normally used.
   1.140 +        OFFSET_MAYBE_YES,
   1.141 +        // Composition for a starter that does not have a decomposition mapping.
   1.142 +        OFFSET_YES_YES,
   1.143 +        // Round-trip mapping & composition for a starter.
   1.144 +        OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
   1.145 +        // Round-trip mapping for a starter that itself does not combine-forward.
   1.146 +        OFFSET_YES_NO_MAPPING_ONLY,
   1.147 +        // One-way mapping.
   1.148 +        OFFSET_NO_NO,
   1.149 +        // Delta for an algorithmic one-way mapping.
   1.150 +        OFFSET_DELTA
   1.151 +    };
   1.152 +    enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };  // presumably offset keeps an OffsetType in its low OFFSET_SHIFT bits -- confirm against the code that writes offset
   1.153 +    int32_t offset;
   1.154 +};
   1.155 +
   1.156 +class Normalizer2DBEnumerator {  // Abstract base for range visitors that act on a Normalizer2DataBuilder; enumRangeHandler() forwards to rangeHandler().
   1.157 +public:
   1.158 +    Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {}  // keeps a reference; the builder must outlive this object
   1.159 +    virtual ~Normalizer2DBEnumerator() {}
   1.160 +    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0;  // called with a code point range start..end and the value stored for it
   1.161 +    Normalizer2DBEnumerator *ptr() { return this; }  // returns this as a base-class pointer, the type enumRangeHandler() casts its context back to
   1.162 +protected:
   1.163 +    Normalizer2DataBuilder &builder;
   1.164 +};
   1.165 +
   1.166 +U_CDECL_BEGIN
   1.167 +
   1.168 +static UBool U_CALLCONV  // C-linkage trampoline: forwards a range callback to a Normalizer2DBEnumerator passed as context
   1.169 +enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
   1.170 +    return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value);  // the cast drops const; context must point to a Normalizer2DBEnumerator
   1.171 +}
   1.172 +
   1.173 +U_CDECL_END
   1.174 +
   1.175 +Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :  // Creates an empty builder; errorCode receives any failure from utrie2_open().
   1.176 +        phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL) {
   1.177 +    memset(unicodeVersion, 0, sizeof(unicodeVersion));  // all-zero means "no version set yet" (see setUnicodeVersion())
   1.178 +    normTrie=utrie2_open(0, 0, &errorCode); norm16Trie=NULL;  // norm16Trie is built later; start at NULL so the destructor's utrie2_close(norm16Trie) never sees an indeterminate pointer
   1.179 +    normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
   1.180 +    norms=allocNorm();  // unused Norm struct at index 0
   1.181 +    memset(indexes, 0, sizeof(indexes));
   1.182 +    memset(smallFCD, 0, sizeof(smallFCD));
   1.183 +}
   1.184 +
   1.185 +Normalizer2DataBuilder::~Normalizer2DataBuilder() {  // Releases both tries, the heap objects owned by each Norm, and the Norm storage itself.
   1.186 +    utrie2_close(normTrie);
   1.187 +    int32_t normsLength=utm_countItems(normMem);
   1.188 +    for(int32_t i=1; i<normsLength; ++i) {  // starts at 1: index 0 is the unused placeholder allocated in the constructor
   1.189 +        delete norms[i].mapping;
   1.190 +        delete norms[i].rawMapping;
   1.191 +        delete norms[i].compositions;
   1.192 +    }
   1.193 +    utm_close(normMem);  // frees the Norm array only after the per-Norm objects were deleted
   1.194 +    utrie2_close(norm16Trie);  // relies on norm16Trie holding a valid trie, or a value utrie2_close() tolerates, at this point
   1.195 +}
   1.196 +
   1.197 +void  // Parses v and stores it as the Unicode version; exits if a different non-zero version was already stored.
   1.198 +Normalizer2DataBuilder::setUnicodeVersion(const char *v) {
   1.199 +    UVersionInfo nullVersion={ 0, 0, 0, 0 };
   1.200 +    UVersionInfo version;
   1.201 +    u_versionFromString(version, v);
   1.202 +    if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) &&  // new version differs from the stored one ...
   1.203 +        0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH)  // ... and the stored one is not the all-zero initial value
   1.204 +    ) {
   1.205 +        char buffer[U_MAX_VERSION_STRING_LENGTH];
   1.206 +        u_versionToString(unicodeVersion, buffer);  // buffer gets the previously stored version; v is printed as given
   1.207 +        fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
   1.208 +                buffer, v);
   1.209 +        exit(U_ILLEGAL_ARGUMENT_ERROR);
   1.210 +    }
   1.211 +    memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH);
   1.212 +}
   1.213 +
   1.214 +Norm *Normalizer2DataBuilder::allocNorm() {  // Allocates one more Norm from normMem and refreshes the cached norms base pointer.
   1.215 +    Norm *p=(Norm *)utm_alloc(normMem);
   1.216 +    norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
   1.217 +    return p;  // callers holding older Norm pointers or references should re-fetch them after this call
   1.218 +}
   1.219 +
   1.220 +/* get an existing Norm unit */
   1.221 +Norm *Normalizer2DataBuilder::getNorm(UChar32 c) {  // Returns the Norm recorded for c, or NULL if none has been created.
   1.222 +    uint32_t i=utrie2_get32(normTrie, c);  // normTrie maps a code point to its index in norms
   1.223 +    if(i==0) {  // index 0 is the unused placeholder, so 0 means "no Norm"
   1.224 +        return NULL;
   1.225 +    }
   1.226 +    return norms+i;
   1.227 +}
   1.228 +
   1.229 +const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const {  // Like getNorm() but never NULL: code points without a Norm yield the placeholder at index 0.
   1.230 +    return norms[utrie2_get32(normTrie, c)];
   1.231 +}
   1.232 +
   1.233 +/*
   1.234 + * get or create a Norm unit;
   1.235 + * get or create the intermediate trie entries for it as well
   1.236 + */
   1.237 +Norm *Normalizer2DataBuilder::createNorm(UChar32 c) {  // Returns the existing Norm for c, or allocates one and registers its index in normTrie.
   1.238 +    uint32_t i=utrie2_get32(normTrie, c);
   1.239 +    if(i!=0) {
   1.240 +        return norms+i;
   1.241 +    } else {
   1.242 +        /* allocate Norm */
   1.243 +        Norm *p=allocNorm();  // may move the norms array; allocNorm() refreshes norms before p-norms is computed below
   1.244 +        IcuToolErrorCode errorCode("gennorm2/createNorm()");
   1.245 +        utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);  // store the new Norm's index for c
   1.246 +        return p;
   1.247 +    }
   1.248 +}
   1.249 +
   1.250 +Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) {  // Prepares p to receive a new mapping state for c: enforces the override policy, drops the old mapping, stamps the phase. Returns p unchanged (may be NULL).
   1.251 +    if(p!=NULL) {
   1.252 +        if(p->mappingType!=Norm::NONE) {  // a mapping or a removal was already recorded for c
   1.253 +            if( overrideHandling==OVERRIDE_NONE ||
   1.254 +                (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)  // OVERRIDE_PREVIOUS still rejects an override within the same phase
   1.255 +            ) {
   1.256 +                fprintf(stderr,
   1.257 +                        "error in gennorm2 phase %d: "
   1.258 +                        "not permitted to override mapping for U+%04lX from phase %d\n",
   1.259 +                        (int)phase, (long)c, (int)p->mappingPhase);
   1.260 +                exit(U_INVALID_FORMAT_ERROR);
   1.261 +            }
   1.262 +            delete p->mapping;  // only mapping is released; mappingType is left for the caller to overwrite
   1.263 +            p->mapping=NULL;
   1.264 +        }
   1.265 +        p->mappingPhase=phase;
   1.266 +    }
   1.267 +    return p;
   1.268 +}
   1.269 +
   1.270 +void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {  // Sets the override policy and starts a new phase.
   1.271 +    overrideHandling=oh;
   1.272 +    ++phase;  // checkNormForMapping() compares each Norm's mappingPhase against this counter
   1.273 +}
   1.274 +
   1.275 +void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {  // Records combining class cc for c, creating the Norm if needed.
   1.276 +    createNorm(c)->cc=cc;
   1.277 +}
   1.278 +
   1.279 +uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const {  // Returns the combining class recorded for c; without a Norm this reads the placeholder at index 0.
   1.280 +    return getNormRef(c).cc;
   1.281 +}
   1.282 +
   1.283 +static UBool isWellFormed(const UnicodeString &s) {  // Checks s by attempting a UTF-8 conversion with no output buffer; only the resulting error code is inspected.
   1.284 +    UErrorCode errorCode=U_ZERO_ERROR;
   1.285 +    u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode);  // NULL destination with capacity 0: no output is written
   1.286 +    return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR;  // overflow is expected with a zero-capacity buffer; any other failure is treated as malformed input
   1.287 +}
   1.288 +
   1.289 +void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) {  // Records a one-way mapping c->m; exits on a malformed m or a disallowed override.
   1.290 +    if(!isWellFormed(m)) {
   1.291 +        fprintf(stderr,
   1.292 +                "error in gennorm2 phase %d: "
   1.293 +                "illegal one-way mapping from U+%04lX to malformed string\n",
   1.294 +                (int)phase, (long)c);
   1.295 +        exit(U_INVALID_FORMAT_ERROR);
   1.296 +    }
   1.297 +    Norm *p=checkNormForMapping(createNorm(c), c);  // non-NULL because createNorm() always returns a Norm
   1.298 +    p->mapping=new UnicodeString(m);  // the Norm owns its own copy of m
   1.299 +    p->mappingType=Norm::ONE_WAY;
   1.300 +    p->setMappingCP();  // remember the target if the mapping is a single code point
   1.301 +}
   1.302 +
   1.303 +void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {  // Records a round-trip mapping c<->m; m must be well-formed and exactly two code points, and c must not be a surrogate.
   1.304 +    if(U_IS_SURROGATE(c)) {
   1.305 +        fprintf(stderr,
   1.306 +                "error in gennorm2 phase %d: "
   1.307 +                "illegal round-trip mapping from surrogate code point U+%04lX\n",
   1.308 +                (int)phase, (long)c);
   1.309 +        exit(U_INVALID_FORMAT_ERROR);
   1.310 +    }
   1.311 +    if(!isWellFormed(m)) {
   1.312 +        fprintf(stderr,
   1.313 +                "error in gennorm2 phase %d: "
   1.314 +                "illegal round-trip mapping from U+%04lX to malformed string\n",
   1.315 +                (int)phase, (long)c);
   1.316 +        exit(U_INVALID_FORMAT_ERROR);
   1.317 +    }
   1.318 +    int32_t numCP=u_countChar32(m.getBuffer(), m.length());  // code points, not UTF-16 code units
   1.319 +    if(numCP!=2) {
   1.320 +        fprintf(stderr,
   1.321 +                "error in gennorm2 phase %d: "
   1.322 +                "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
   1.323 +                (int)phase, (long)c, (int)numCP);
   1.324 +        exit(U_INVALID_FORMAT_ERROR);
   1.325 +    }
   1.326 +    Norm *p=checkNormForMapping(createNorm(c), c);  // non-NULL because createNorm() always returns a Norm
   1.327 +    p->mapping=new UnicodeString(m);  // the Norm owns its own copy of m
   1.328 +    p->mappingType=Norm::ROUND_TRIP;
   1.329 +    p->mappingCP=U_SENTINEL;  // two code points, so there is never a single-code-point target
   1.330 +}
   1.331 +
   1.332 +void Normalizer2DataBuilder::removeMapping(UChar32 c) {  // Marks c's mapping as REMOVED; does nothing if c has no Norm (getNorm(), unlike createNorm(), does not allocate).
   1.333 +    Norm *p=checkNormForMapping(getNorm(c), c);  // applies the override policy and deletes any existing mapping string
   1.334 +    if(p!=NULL) {
   1.335 +        p->mappingType=Norm::REMOVED;
   1.336 +    }
   1.337 +}
   1.338 +
   1.339 +class CompositionBuilder : public Normalizer2DBEnumerator {  // Range visitor that feeds every range to Normalizer2DataBuilder::addComposition().
   1.340 +public:
   1.341 +    CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
   1.342 +    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
   1.343 +        builder.addComposition(start, end, value);
   1.344 +        return TRUE;  // always TRUE; errors terminate the process inside addComposition()
   1.345 +    }
   1.346 +};
   1.347 +
   1.348 +void  // For a range whose Norm (norms[value]) has a round-trip mapping lead+trail: validates it, flags trail as combining back, and inserts (trail, start) into lead's sorted compositions list. Other ranges are ignored.
   1.349 +Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) {
   1.350 +    if(norms[value].mappingType==Norm::ROUND_TRIP) {  // value is an index into norms
   1.351 +        if(start!=end) {  // a round-trip mapping must belong to exactly one code point
   1.352 +            fprintf(stderr,
   1.353 +                    "gennorm2 error: same round-trip mapping for "
   1.354 +                    "more than 1 code point U+%04lX..U+%04lX\n",
   1.355 +                    (long)start, (long)end);
   1.356 +            exit(U_INVALID_FORMAT_ERROR);
   1.357 +        }
   1.358 +        if(norms[value].cc!=0) {
   1.359 +            fprintf(stderr,
   1.360 +                    "gennorm2 error: "
   1.361 +                    "U+%04lX has a round-trip mapping and ccc!=0, "
   1.362 +                    "not possible in Unicode normalization\n",
   1.363 +                    (long)start);
   1.364 +            exit(U_INVALID_FORMAT_ERROR);
   1.365 +        }
   1.366 +        // setRoundTripMapping() ensured that there are exactly two code points.
   1.367 +        const UnicodeString &m=*norms[value].mapping;  // refers to the heap string, so it stays valid if createNorm() below moves the norms array
   1.368 +        UChar32 lead=m.char32At(0);
   1.369 +        UChar32 trail=m.char32At(m.length()-1);  // char32At() on the last code unit yields the second code point
   1.370 +        if(getCC(lead)!=0) {
   1.371 +            fprintf(stderr,
   1.372 +                    "gennorm2 error: "
   1.373 +                    "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
   1.374 +                    "not possible in Unicode normalization\n",
   1.375 +                    (long)start, (long)lead);
   1.376 +            exit(U_INVALID_FORMAT_ERROR);
   1.377 +        }
   1.378 +        // Flag for trailing character.
   1.379 +        createNorm(trail)->combinesBack=TRUE;
   1.380 +        // Insert (trail, composite) pair into compositions list for the lead character.
   1.381 +        IcuToolErrorCode errorCode("gennorm2/addComposition()");
   1.382 +        Norm *leadNorm=createNorm(lead);  // fetched after createNorm(trail), so the pointer reflects any reallocation of norms
   1.383 +        UVector32 *compositions=leadNorm->compositions;
   1.384 +        int32_t i;  // pair index at which the new pair is inserted
   1.385 +        if(compositions==NULL) {
   1.386 +            compositions=leadNorm->compositions=new UVector32(errorCode);
   1.387 +            i=0;  // "insert" the first pair at index 0
   1.388 +        } else {
   1.389 +            // Insertion sort, and check for duplicate trail characters.
   1.390 +            int32_t length;
   1.391 +            const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
   1.392 +            for(i=0; i<length; ++i) {
   1.393 +                if(trail==pairs[i].trail) {
   1.394 +                    fprintf(stderr,
   1.395 +                            "gennorm2 error: same round-trip mapping for "
   1.396 +                            "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
   1.397 +                            (long)start, (long)lead, (long)trail);
   1.398 +                    exit(U_INVALID_FORMAT_ERROR);
   1.399 +                }
   1.400 +                if(trail<pairs[i].trail) {  // first larger trail found: insert before it to keep ascending order
   1.401 +                    break;
   1.402 +                }
   1.403 +            }
   1.404 +        }
   1.405 +        compositions->insertElementAt(trail, 2*i, errorCode);  // pairs are stored flat: trail at 2*i ...
   1.406 +        compositions->insertElementAt(start, 2*i+1, errorCode);  // ... and the composite (start) at 2*i+1
   1.407 +    }
   1.408 +}
   1.409 +
   1.410 +UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm,  // TRUE if norm composes with some trail character whose combining class lies strictly between lowCC and highCC.
   1.411 +                                                    uint8_t lowCC, uint8_t highCC) const {
   1.412 +    if((highCC-lowCC)>=2) {  // otherwise no value lies strictly between the two bounds
   1.413 +        int32_t length;
   1.414 +        const CompositionPair *pairs=norm.getCompositionPairs(length);  // length becomes 0 when norm has no compositions
   1.415 +        for(int32_t i=0; i<length; ++i) {
   1.416 +            uint8_t trailCC=getCC(pairs[i].trail);
   1.417 +            if(lowCC<trailCC && trailCC<highCC) {
   1.418 +                return TRUE;
   1.419 +            }
   1.420 +        }
   1.421 +    }
   1.422 +    return FALSE;
   1.423 +}
   1.424 +
   1.425 +UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const {  // Returns the composite of norm's character with trail, or U_SENTINEL if they do not combine.
   1.426 +    int32_t length;
   1.427 +    const CompositionPair *pairs=norm.getCompositionPairs(length);
   1.428 +    for(int32_t i=0; i<length; ++i) {
   1.429 +        if(trail==pairs[i].trail) {
   1.430 +            return pairs[i].composite;
   1.431 +        }
   1.432 +        if(trail<pairs[i].trail) {  // addComposition() keeps pairs sorted by trail, so no later entry can match
   1.433 +            break;
   1.434 +        }
   1.435 +    }
   1.436 +    return U_SENTINEL;
   1.437 +}
   1.438 +
   1.439 +class Decomposer : public Normalizer2DBEnumerator {  // Range visitor that calls Normalizer2DataBuilder::decompose() and records whether any mapping changed.
   1.440 +public:
   1.441 +    Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {}
   1.442 +    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
   1.443 +        didDecompose|=builder.decompose(start, end, value);  // accumulates across all ranges visited
   1.444 +        return TRUE;
   1.445 +    }
   1.446 +    UBool didDecompose;  // TRUE once any decompose() call has returned TRUE
   1.447 +};
   1.448 +
   1.449 +UBool  // Performs one decomposition step on the mapping of norms[value]: code points that have their own mapping, and Hangul syllables, are replaced by their decompositions. Returns TRUE if the mapping changed.
   1.450 +Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
   1.451 +    if(norms[value].hasMapping()) {  // value is an index into norms; entries without a mapping are skipped
   1.452 +        Norm &norm=norms[value];
   1.453 +        const UnicodeString &m=*norm.mapping;
   1.454 +        UnicodeString *decomposed=NULL;  // created lazily, only when the first replaceable code point is found
   1.455 +        const UChar *s=m.getBuffer();
   1.456 +        int32_t length=m.length();
   1.457 +        int32_t prev, i=0;  // prev: start of the current code point; i: index just past it
   1.458 +        UChar32 c;
   1.459 +        while(i<length) {
   1.460 +            prev=i;
   1.461 +            U16_NEXT(s, i, length, c);
   1.462 +            if(start<=c && c<=end) {  // the mapping contains a code point from the very range being processed
   1.463 +                fprintf(stderr,
   1.464 +                        "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
   1.465 +                        (long)c);
   1.466 +                exit(U_INVALID_FORMAT_ERROR);
   1.467 +            }
   1.468 +            const Norm &cNorm=getNormRef(c);
   1.469 +            if(cNorm.hasMapping()) {
   1.470 +                if(norm.mappingType==Norm::ROUND_TRIP) {  // round-trip mappings get extra consistency checks
   1.471 +                    if(prev==0) {  // c is the first code point of the mapping
   1.472 +                        if(cNorm.mappingType!=Norm::ROUND_TRIP) {
   1.473 +                            fprintf(stderr,
   1.474 +                                    "gennorm2 error: "
   1.475 +                                    "U+%04lX's round-trip mapping's starter "
   1.476 +                                    "U+%04lX one-way-decomposes, "
   1.477 +                                    "not possible in Unicode normalization\n",
   1.478 +                                    (long)start, (long)c);
   1.479 +                            exit(U_INVALID_FORMAT_ERROR);
   1.480 +                        }
   1.481 +                        uint8_t myTrailCC=getCC(m.char32At(i));  // i is just past the first code point, so this reads the code point that follows it
   1.482 +                        UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);  // last code point of the starter's own mapping
   1.483 +                        uint8_t cTrailCC=getCC(cTrailChar);
   1.484 +                        if(cTrailCC>myTrailCC) {
   1.485 +                            fprintf(stderr,
   1.486 +                                    "gennorm2 error: "
   1.487 +                                    "U+%04lX's round-trip mapping's starter "
   1.488 +                                    "U+%04lX decomposes and the "
   1.489 +                                    "inner/earlier tccc=%hu > outer/following tccc=%hu, "
   1.490 +                                    "not possible in Unicode normalization\n",
   1.491 +                                    (long)start, (long)c,
   1.492 +                                    (short)cTrailCC, (short)myTrailCC);  // NOTE(review): %hu is paired with a (short) cast; both values are uint8_t, so they are never negative
   1.493 +                            exit(U_INVALID_FORMAT_ERROR);
   1.494 +                        }
   1.495 +                    } else {  // c is not the first code point, yet it has a mapping
   1.496 +                        fprintf(stderr,
   1.497 +                                "gennorm2 error: "
   1.498 +                                "U+%04lX's round-trip mapping's non-starter "
   1.499 +                                "U+%04lX decomposes, "
   1.500 +                                "not possible in Unicode normalization\n",
   1.501 +                                (long)start, (long)c);
   1.502 +                        exit(U_INVALID_FORMAT_ERROR);
   1.503 +                    }
   1.504 +                }
   1.505 +                if(decomposed==NULL) {
   1.506 +                    decomposed=new UnicodeString(m, 0, prev);  // start from the unchanged prefix m[0..prev)
   1.507 +                }
   1.508 +                decomposed->append(*cNorm.mapping);  // replace c with its mapping
   1.509 +            } else if(Hangul::isHangul(c)) {
   1.510 +                UChar buffer[3];
   1.511 +                int32_t hangulLength=Hangul::decompose(c, buffer);
   1.512 +                if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
   1.513 +                    fprintf(stderr,
   1.514 +                            "gennorm2 error: "
   1.515 +                            "U+%04lX's round-trip mapping's non-starter "
   1.516 +                            "U+%04lX decomposes, "
   1.517 +                            "not possible in Unicode normalization\n",
   1.518 +                            (long)start, (long)c);
   1.519 +                    exit(U_INVALID_FORMAT_ERROR);
   1.520 +                }
   1.521 +                if(decomposed==NULL) {
   1.522 +                    decomposed=new UnicodeString(m, 0, prev);  // start from the unchanged prefix m[0..prev)
   1.523 +                }
   1.524 +                decomposed->append(buffer, hangulLength);  // replace the syllable with the units Hangul::decompose() wrote
   1.525 +            } else if(decomposed!=NULL) {
   1.526 +                decomposed->append(m, prev, i-prev);  // a copy is already in progress, so unchanged code points must be carried over too
   1.527 +            }
   1.528 +        }
   1.529 +        if(decomposed!=NULL) {
   1.530 +            if(norm.rawMapping==NULL) {
   1.531 +                // Remember the original mapping when decomposing recursively.
   1.532 +                norm.rawMapping=norm.mapping;
   1.533 +            } else {
   1.534 +                delete norm.mapping;  // an intermediate result from an earlier step; the original is already kept in rawMapping
   1.535 +            }
   1.536 +            norm.mapping=decomposed;
   1.537 +            // Not  norm.setMappingCP();  because the original mapping
   1.538 +            // is most likely to be encodable as a delta.
   1.539 +            return TRUE;
   1.540 +        }
   1.541 +    }
   1.542 +    return FALSE;
   1.543 +}
   1.544 +
   1.545 +class BuilderReorderingBuffer {  // Small fixed-capacity buffer of (code point, combining class) entries; append() keeps characters after the last starter ordered by combining class.
   1.546 +public:
   1.547 +    BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {}
   1.548 +    void reset() {  // back to the empty state; array contents are not cleared
   1.549 +        fLength=0;
   1.550 +        fLastStarterIndex=-1;
   1.551 +        fDidReorder=FALSE;
   1.552 +    }
   1.553 +    int32_t length() const { return fLength; }
   1.554 +    UBool isEmpty() const { return fLength==0; }
   1.555 +    int32_t lastStarterIndex() const { return fLastStarterIndex; }  // -1 if no cc==0 character has been appended
   1.556 +    UChar32 charAt(int32_t i) const { return fArray[i]>>8; }  // code point is kept in the bits above the low 8
   1.557 +    uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; }  // combining class is kept in the low 8 bits
   1.558 +    UBool didReorder() const { return fDidReorder; }
   1.559 +    void append(UChar32 c, uint8_t cc) {  // Adds c; no capacity check is done here, so the caller must not exceed the array size.
   1.560 +        if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {  // already in order: plain append
   1.561 +            if(cc==0) {
   1.562 +                fLastStarterIndex=fLength;
   1.563 +            }
   1.564 +            fArray[fLength++]=(c<<8)|cc;
   1.565 +            return;
   1.566 +        }
   1.567 +        // Let this character bubble back to its canonical order.
   1.568 +        int32_t i=fLength-1;
   1.569 +        while(i>fLastStarterIndex && ccAt(i)>cc) {  // never moves before the last starter
   1.570 +            --i;
   1.571 +        }
   1.572 +        ++i;  // after the last starter or prevCC<=cc
   1.573 +        // Move this and the following characters forward one to make space.
   1.574 +        for(int32_t j=fLength; i<j; --j) {
   1.575 +            fArray[j]=fArray[j-1];
   1.576 +        }
   1.577 +        fArray[i]=(c<<8)|cc;
   1.578 +        ++fLength;
   1.579 +        fDidReorder=TRUE;  // the stored order now differs from the order of the append() calls
   1.580 +    }
   1.581 +    void toString(UnicodeString &dest) {  // Replaces dest with the buffered code points in their current order.
   1.582 +        dest.remove();
   1.583 +        for(int32_t i=0; i<fLength; ++i) {
   1.584 +            dest.append(charAt(i));
   1.585 +        }
   1.586 +    }
   1.587 +    void setComposite(UChar32 composite, int32_t combMarkIndex) {  // Overwrites the last starter with composite (combining class 0) and deletes the entry at combMarkIndex. Requires a starter in the buffer (fLastStarterIndex>=0).
   1.588 +        fArray[fLastStarterIndex]=composite<<8;
   1.589 +        // Remove the combining mark that contributed to the composite.
   1.590 +        --fLength;
   1.591 +        while(combMarkIndex<fLength) {  // shift the following entries down by one
   1.592 +            fArray[combMarkIndex]=fArray[combMarkIndex+1];
   1.593 +            ++combMarkIndex;
   1.594 +        }
   1.595 +    }
   1.596 +private:
   1.597 +    int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];  // each entry is (code point<<8)|cc; reorder() refuses mappings longer than this capacity before filling
   1.598 +    int32_t fLength;
   1.599 +    int32_t fLastStarterIndex;
   1.600 +    UBool fDidReorder;
   1.601 +};
   1.602 +
   1.603 +void  // Feeds p's mapping into buffer code point by code point and, if that changed the order, writes the reordered text back into the mapping. Requires p->mapping to be non-NULL.
   1.604 +Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) {
   1.605 +    UnicodeString &m=*p->mapping;
   1.606 +    int32_t length=m.length();
   1.607 +    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {  // also keeps append() within the buffer's fixed capacity
   1.608 +        return;  // writeMapping() will complain about it and print the code point.
   1.609 +    }
   1.610 +    const UChar *s=m.getBuffer();
   1.611 +    int32_t i=0;
   1.612 +    UChar32 c;
   1.613 +    while(i<length) {
   1.614 +        U16_NEXT(s, i, length, c);
   1.615 +        buffer.append(c, getCC(c));  // buffer is not reset here; presumably the caller passes it in empty -- confirm at the call site
   1.616 +    }
   1.617 +    if(buffer.didReorder()) {
   1.618 +        buffer.toString(m);  // overwrite the mapping in place with the reordered text
   1.619 +    }
   1.620 +}
   1.621 +
   1.622 +/*
   1.623 + * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter().
   1.624 + * A starter character with a mapping does not have a composition boundary after it
   1.625 + * if the character itself combines-forward (which is tested by the caller of this function),
   1.626 + * or it is deleted (mapped to the empty string),
   1.627 + * or its mapping contains no starter,
   1.628 + * or the last starter combines-forward.
   1.629 + */
   1.630 +UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {  // buffer holds the mapping's characters with their combining classes; it is modified (composed in place) while testing.
   1.631 +    if(buffer.isEmpty()) {
   1.632 +        return TRUE;  // maps-to-empty-string is no boundary of any kind
   1.633 +    }
   1.634 +    int32_t lastStarterIndex=buffer.lastStarterIndex();
   1.635 +    if(lastStarterIndex<0) {
   1.636 +        return TRUE;  // no starter
   1.637 +    }
   1.638 +    UChar32 starter=buffer.charAt(lastStarterIndex);
   1.639 +    if( Hangul::isJamoL(starter) ||
   1.640 +        (Hangul::isJamoV(starter) &&
   1.641 +         0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))  // a Jamo V directly preceded by a Jamo L, i.e. an LV pair
   1.642 +    ) {
   1.643 +        // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
   1.644 +        // otherwise it is blocked.
   1.645 +        return lastStarterIndex==buffer.length()-1;
   1.646 +    }
   1.647 +    // Note: There can be no Hangul syllable in the fully decomposed mapping.
   1.648 +    const Norm *starterNorm=&getNormRef(starter);
   1.649 +    if(starterNorm->compositions==NULL) {
   1.650 +        return FALSE;  // the last starter does not combine forward
   1.651 +    }
   1.652 +    // Compose as far as possible, and see if further compositions are possible.
   1.653 +    uint8_t prevCC=0;
   1.654 +    for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) {  // no increment here: the index advances only when the mark was not composed away
   1.655 +        uint8_t cc=buffer.ccAt(combMarkIndex);  // !=0 because after last starter
   1.656 +        if(combinesWithCCBetween(*starterNorm, prevCC, cc)) {  // some mark with a class strictly between prevCC and cc would compose with the starter
   1.657 +            return TRUE;
   1.658 +        }
   1.659 +        if( prevCC<cc &&
   1.660 +            (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0  // starter becomes the composite, or U_SENTINEL (<0) if this mark does not combine
   1.661 +        ) {
   1.662 +            buffer.setComposite(starter, combMarkIndex);  // removes the mark, so combMarkIndex now refers to the next one
   1.663 +            starterNorm=&getNormRef(starter);
   1.664 +            if(starterNorm->compositions==NULL) {
   1.665 +                return FALSE;  // the composite does not combine further
   1.666 +            }
   1.667 +        } else {
   1.668 +            prevCC=cc;
   1.669 +            ++combMarkIndex;
   1.670 +        }
   1.671 +    }
   1.672 +    // TRUE if the final, forward-combining starter is at the end.
   1.673 +    return prevCC==0;  // prevCC is still 0 only if no uncomposed mark remains after the starter
   1.674 +}
   1.675 +
   1.676 +// Requires p->hasMapping().
   1.677 +// Returns the offset of the "first unit" from the beginning of the extraData for c.
   1.678 +// That is the same as the length of the optional data for the raw mapping and the ccc/lccc word.
   1.679 +int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) {
   1.680 +    UnicodeString &m=*p->mapping;
   1.681 +    int32_t length=m.length();
   1.682 +    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
   1.683 +        fprintf(stderr,
   1.684 +                "gennorm2 error: "
   1.685 +                "mapping for U+%04lX longer than maximum of %d\n",
   1.686 +                (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
   1.687 +        exit(U_INVALID_FORMAT_ERROR);
   1.688 +    }
   1.689 +    int32_t leadCC, trailCC;
   1.690 +    if(length==0) {
   1.691 +        leadCC=trailCC=0;
   1.692 +    } else {
   1.693 +        leadCC=getCC(m.char32At(0));
   1.694 +        trailCC=getCC(m.char32At(length-1));
   1.695 +    }
   1.696 +    if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) {
   1.697 +        fprintf(stderr,
   1.698 +                "gennorm2 error: "
   1.699 +                "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
   1.700 +                (long)c);
   1.701 +        exit(U_INVALID_FORMAT_ERROR);
   1.702 +    }
   1.703 +    // Write small-FCD data.
   1.704 +    if((leadCC|trailCC)!=0) {
   1.705 +        UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
   1.706 +        smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
   1.707 +    }
   1.708 +    // Write the mapping & raw mapping extraData.
   1.709 +    int32_t firstUnit=length|(trailCC<<8);
   1.710 +    int32_t preMappingLength=0;
   1.711 +    if(p->rawMapping!=NULL) {
   1.712 +        UnicodeString &rm=*p->rawMapping;
   1.713 +        int32_t rmLength=rm.length();
   1.714 +        if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
   1.715 +            fprintf(stderr,
   1.716 +                    "gennorm2 error: "
   1.717 +                    "raw mapping for U+%04lX longer than maximum of %d\n",
   1.718 +                    (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
   1.719 +            exit(U_INVALID_FORMAT_ERROR);
   1.720 +        }
   1.721 +        UChar rm0=rm.charAt(0);
   1.722 +        if( rmLength==length-1 &&
   1.723 +            // 99: overlong substring lengths get pinned to remainder lengths anyway
   1.724 +            0==rm.compare(1, 99, m, 2, 99) &&
   1.725 +            rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
   1.726 +        ) {
   1.727 +            // Compression:
   1.728 +            // rawMapping=rm0+mapping.substring(2) -> store only rm0
   1.729 +            //
   1.730 +            // The raw mapping is the same as the final mapping after replacing
   1.731 +            // the final mapping's first two code units with the raw mapping's first one.
   1.732 +            // In this case, we store only that first unit, rm0.
   1.733 +            // This helps with a few hundred mappings.
   1.734 +            dataString.append(rm0);
   1.735 +            preMappingLength=1;
   1.736 +        } else {
   1.737 +            // Store the raw mapping with its length.
   1.738 +            dataString.append(rm);
   1.739 +            dataString.append((UChar)rmLength);
   1.740 +            preMappingLength=rmLength+1;
   1.741 +        }
   1.742 +        firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
   1.743 +    }
   1.744 +    int32_t cccLccc=p->cc|(leadCC<<8);
   1.745 +    if(cccLccc!=0) {
   1.746 +        dataString.append((UChar)cccLccc);
   1.747 +        ++preMappingLength;
   1.748 +        firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
   1.749 +    }
   1.750 +    if(p->hasNoCompBoundaryAfter) {
   1.751 +        firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
   1.752 +    }
   1.753 +    dataString.append((UChar)firstUnit);
   1.754 +    dataString.append(m);
   1.755 +    return preMappingLength;
   1.756 +}
   1.757 +
   1.758 +// Requires p->compositions!=NULL.
   1.759 +void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) {
   1.760 +    if(p->cc!=0) {
   1.761 +        fprintf(stderr,
   1.762 +                "gennorm2 error: "
   1.763 +                "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
   1.764 +                (long)c);
   1.765 +        exit(U_INVALID_FORMAT_ERROR);
   1.766 +    }
   1.767 +    int32_t length;
   1.768 +    const CompositionPair *pairs=p->getCompositionPairs(length);
   1.769 +    for(int32_t i=0; i<length; ++i) {
   1.770 +        const CompositionPair &pair=pairs[i];
   1.771 +        // 22 bits for the composite character and whether it combines forward.
   1.772 +        UChar32 compositeAndFwd=pair.composite<<1;
   1.773 +        if(getNormRef(pair.composite).compositions!=NULL) {
   1.774 +            compositeAndFwd|=1;  // The composite character also combines-forward.
   1.775 +        }
   1.776 +        // Encode most pairs in two units and some in three.
   1.777 +        int32_t firstUnit, secondUnit, thirdUnit;
   1.778 +        if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
   1.779 +            if(compositeAndFwd<=0xffff) {
   1.780 +                firstUnit=pair.trail<<1;
   1.781 +                secondUnit=compositeAndFwd;
   1.782 +                thirdUnit=-1;
   1.783 +            } else {
   1.784 +                firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
   1.785 +                secondUnit=compositeAndFwd>>16;
   1.786 +                thirdUnit=compositeAndFwd;
   1.787 +            }
   1.788 +        } else {
   1.789 +            firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
   1.790 +                       (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
   1.791 +                      Normalizer2Impl::COMP_1_TRIPLE;
   1.792 +            secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
   1.793 +                       (compositeAndFwd>>16);
   1.794 +            thirdUnit=compositeAndFwd;
   1.795 +        }
   1.796 +        // Set the high bit of the first unit if this is the last composition pair.
   1.797 +        if(i==(length-1)) {
   1.798 +            firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
   1.799 +        }
   1.800 +        dataString.append((UChar)firstUnit).append((UChar)secondUnit);
   1.801 +        if(thirdUnit>=0) {
   1.802 +            dataString.append((UChar)thirdUnit);
   1.803 +        }
   1.804 +    }
   1.805 +}
   1.806 +
   1.807 +class ExtraDataWriter : public Normalizer2DBEnumerator {
   1.808 +public:
   1.809 +    ExtraDataWriter(Normalizer2DataBuilder &b) :
   1.810 +        Normalizer2DBEnumerator(b),
   1.811 +        yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
   1.812 +        yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {}  // 0=Hangul, 1=start of normal data
   1.813 +    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
   1.814 +        if(value!=0) {
   1.815 +            if(start!=end) {
   1.816 +                fprintf(stderr,
   1.817 +                        "gennorm2 error: unexpected shared data for "
   1.818 +                        "multiple code points U+%04lX..U+%04lX\n",
   1.819 +                        (long)start, (long)end);
   1.820 +                exit(U_INTERNAL_PROGRAM_ERROR);
   1.821 +            }
   1.822 +            builder.writeExtraData(start, value, *this);
   1.823 +        }
   1.824 +        return TRUE;
   1.825 +    }
   1.826 +    UnicodeString maybeYesCompositions;
   1.827 +    UnicodeString yesYesCompositions;
   1.828 +    UnicodeString yesNoMappingsAndCompositions;
   1.829 +    UnicodeString yesNoMappingsOnly;
   1.830 +    UnicodeString noNoMappings;
   1.831 +    Hashtable previousNoNoMappings;  // If constructed in runtime code, pass in UErrorCode.
   1.832 +};
   1.833 +
   1.834 +void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) {
   1.835 +    Norm *p=norms+value;
   1.836 +    if(!p->hasMapping()) {
   1.837 +        // Write small-FCD data.
   1.838 +        // There is similar code in writeMapping() for characters that do have a mapping.
   1.839 +        if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) {
   1.840 +            fprintf(stderr,
   1.841 +                    "gennorm2 error: "
   1.842 +                    "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
   1.843 +                    (long)c);
   1.844 +            exit(U_INVALID_FORMAT_ERROR);
   1.845 +        }
   1.846 +        if(p->cc!=0) {
   1.847 +            UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
   1.848 +            smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
   1.849 +        }
   1.850 +    }
   1.851 +    if(p->combinesBack) {
   1.852 +        if(p->hasMapping()) {
   1.853 +            fprintf(stderr,
   1.854 +                    "gennorm2 error: "
   1.855 +                    "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
   1.856 +                    (long)c);
   1.857 +            exit(U_INVALID_FORMAT_ERROR);
   1.858 +        }
   1.859 +        if(p->compositions!=NULL) {
   1.860 +            p->offset=
   1.861 +                (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
   1.862 +                Norm::OFFSET_MAYBE_YES;
   1.863 +            writeCompositions(c, p, writer.maybeYesCompositions);
   1.864 +        }
   1.865 +    } else if(!p->hasMapping()) {
   1.866 +        if(p->compositions!=NULL) {
   1.867 +            p->offset=
   1.868 +                (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
   1.869 +                Norm::OFFSET_YES_YES;
   1.870 +            writeCompositions(c, p, writer.yesYesCompositions);
   1.871 +        }
   1.872 +    } else if(p->mappingType==Norm::ROUND_TRIP) {
   1.873 +        if(p->compositions!=NULL) {
   1.874 +            int32_t offset=writer.yesNoMappingsAndCompositions.length()+
   1.875 +                           writeMapping(c, p, writer.yesNoMappingsAndCompositions);
   1.876 +            p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
   1.877 +            writeCompositions(c, p, writer.yesNoMappingsAndCompositions);
   1.878 +        } else {
   1.879 +            int32_t offset=writer.yesNoMappingsOnly.length()+
   1.880 +                           writeMapping(c, p, writer.yesNoMappingsOnly);
   1.881 +            p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
   1.882 +        }
   1.883 +    } else /* one-way */ {
   1.884 +        if(p->compositions!=NULL) {
   1.885 +            fprintf(stderr,
   1.886 +                    "gennorm2 error: "
   1.887 +                    "U+%04lX combines-forward and has a one-way mapping, "
   1.888 +                    "not possible in Unicode normalization\n",
   1.889 +                    (long)c);
   1.890 +            exit(U_INVALID_FORMAT_ERROR);
   1.891 +        }
   1.892 +        if(p->cc==0 && optimization!=OPTIMIZE_FAST) {
   1.893 +            // Try a compact, algorithmic encoding.
   1.894 +            // Only for ccc=0, because we can't store additional information
   1.895 +            // and we do not recursively follow an algorithmic encoding for access to the ccc.
   1.896 +            //
   1.897 +            // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
   1.898 +            // if the mappingCP decomposes further, to ensure that there is a place to store it.
   1.899 +            // We want to see that the final mapping does not have exactly 1 code point,
   1.900 +            // or else we would have to recursively ensure that the final mapping is stored
   1.901 +            // in normal extraData.
   1.902 +            if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) {
   1.903 +                int32_t delta=p->mappingCP-c;
   1.904 +                if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
   1.905 +                    p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
   1.906 +                }
   1.907 +            }
   1.908 +        }
   1.909 +        if(p->offset==0) {
   1.910 +            int32_t oldNoNoLength=writer.noNoMappings.length();
   1.911 +            int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings);
   1.912 +            UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength);
   1.913 +            int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping);
   1.914 +            if(previousOffset!=0) {
   1.915 +                // Duplicate, remove the new units and point to the old ones.
   1.916 +                writer.noNoMappings.truncate(oldNoNoLength);
   1.917 +                p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
   1.918 +            } else {
   1.919 +                // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
   1.920 +                IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
   1.921 +                writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode);
   1.922 +                p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
   1.923 +            }
   1.924 +        }
   1.925 +    }
   1.926 +}
   1.927 +
   1.928 +class Norm16Writer : public Normalizer2DBEnumerator {
   1.929 +public:
   1.930 +    Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
   1.931 +    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
   1.932 +        builder.writeNorm16(start, end, value);
   1.933 +        return TRUE;
   1.934 +    }
   1.935 +};
   1.936 +
   1.937 +void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) {
   1.938 +    if(value!=0) {
   1.939 +        const Norm *p=norms+value;
   1.940 +        int32_t offset=p->offset>>Norm::OFFSET_SHIFT;
   1.941 +        int32_t norm16=0;
   1.942 +        UBool isDecompNo=FALSE;
   1.943 +        UBool isCompNoMaybe=FALSE;
   1.944 +        switch(p->offset&Norm::OFFSET_MASK) {
   1.945 +        case Norm::OFFSET_NONE:
   1.946 +            // No mapping, no compositions list.
   1.947 +            if(p->combinesBack) {
   1.948 +                norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc;
   1.949 +                isDecompNo=(UBool)(p->cc!=0);
   1.950 +                isCompNoMaybe=TRUE;
   1.951 +            } else if(p->cc!=0) {
   1.952 +                norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc;
   1.953 +                isDecompNo=isCompNoMaybe=TRUE;
   1.954 +            }
   1.955 +            break;
   1.956 +        case Norm::OFFSET_MAYBE_YES:
   1.957 +            norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
   1.958 +            isCompNoMaybe=TRUE;
   1.959 +            break;
   1.960 +        case Norm::OFFSET_YES_YES:
   1.961 +            norm16=offset;
   1.962 +            break;
   1.963 +        case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
   1.964 +            norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
   1.965 +            isDecompNo=TRUE;
   1.966 +            break;
   1.967 +        case Norm::OFFSET_YES_NO_MAPPING_ONLY:
   1.968 +            norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
   1.969 +            isDecompNo=TRUE;
   1.970 +            break;
   1.971 +        case Norm::OFFSET_NO_NO:
   1.972 +            norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
   1.973 +            isDecompNo=isCompNoMaybe=TRUE;
   1.974 +            break;
   1.975 +        case Norm::OFFSET_DELTA:
   1.976 +            norm16=getCenterNoNoDelta()+offset;
   1.977 +            isDecompNo=isCompNoMaybe=TRUE;
   1.978 +            break;
   1.979 +        default:  // Should not occur.
   1.980 +            exit(U_INTERNAL_PROGRAM_ERROR);
   1.981 +        }
   1.982 +        IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
   1.983 +        utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
   1.984 +        if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
   1.985 +            indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
   1.986 +        }
   1.987 +        if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
   1.988 +            indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
   1.989 +        }
   1.990 +    }
   1.991 +}
   1.992 +
   1.993 +void Normalizer2DataBuilder::setHangulData() {
   1.994 +    HangulIterator hi;
   1.995 +    const HangulIterator::Range *range;
   1.996 +    // Check that none of the Hangul/Jamo code points have data.
   1.997 +    while((range=hi.nextRange())!=NULL) {
   1.998 +        for(UChar32 c=range->start; c<range->limit; ++c) {
   1.999 +            if(utrie2_get32(norm16Trie, c)!=0) {
  1.1000 +                fprintf(stderr,
  1.1001 +                        "gennorm2 error: "
  1.1002 +                        "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
  1.1003 +                        (long)c);
  1.1004 +                exit(U_INVALID_FORMAT_ERROR);
  1.1005 +            }
  1.1006 +        }
  1.1007 +    }
  1.1008 +    // Set data for algorithmic runtime handling.
  1.1009 +    IcuToolErrorCode errorCode("gennorm2/setHangulData()");
  1.1010 +    hi.reset();
  1.1011 +    while((range=hi.nextRange())!=NULL) {
  1.1012 +        uint16_t norm16=range->norm16;
  1.1013 +        if(norm16==0) {
  1.1014 +            norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO];  // Hangul LV/LVT encoded as minYesNo
  1.1015 +            if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
  1.1016 +                indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start;
  1.1017 +            }
  1.1018 +        } else {
  1.1019 +            if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {  // Jamo V/T are maybeYes
  1.1020 +                indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start;
  1.1021 +            }
  1.1022 +        }
  1.1023 +        utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode);
  1.1024 +        errorCode.assertSuccess();
  1.1025 +    }
  1.1026 +}
  1.1027 +
  1.1028 +U_CDECL_BEGIN
  1.1029 +
  1.1030 +static UBool U_CALLCONV
  1.1031 +enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
  1.1032 +    uint32_t *pMaxValue=(uint32_t *)context;
  1.1033 +    if(value>*pMaxValue) {
  1.1034 +        *pMaxValue=value;
  1.1035 +    }
  1.1036 +    return TRUE;
  1.1037 +}
  1.1038 +
  1.1039 +U_CDECL_END
  1.1040 +
  1.1041 +void Normalizer2DataBuilder::processData() {
  1.1042 +    IcuToolErrorCode errorCode("gennorm2/processData()");
  1.1043 +    norm16Trie=utrie2_open(0, 0, errorCode);
  1.1044 +    errorCode.assertSuccess();
  1.1045 +
  1.1046 +    utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr());
  1.1047 +
  1.1048 +    Decomposer decomposer(*this);
  1.1049 +    do {
  1.1050 +        decomposer.didDecompose=FALSE;
  1.1051 +        utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer);
  1.1052 +    } while(decomposer.didDecompose);
  1.1053 +
  1.1054 +    BuilderReorderingBuffer buffer;
  1.1055 +    int32_t normsLength=utm_countItems(normMem);
  1.1056 +    for(int32_t i=1; i<normsLength; ++i) {
  1.1057 +        // Set the hasNoCompBoundaryAfter flag for use by the last code branch
  1.1058 +        // in Normalizer2Impl::hasCompBoundaryAfter().
  1.1059 +        // For details see the comments on hasNoCompBoundaryAfter(buffer).
  1.1060 +        const Norm &norm=norms[i];
  1.1061 +        if(norm.hasMapping()) {
  1.1062 +            if(norm.compositions!=NULL) {
  1.1063 +                norms[i].hasNoCompBoundaryAfter=TRUE;
  1.1064 +            } else {
  1.1065 +                buffer.reset();
  1.1066 +                reorder(norms+i, buffer);
  1.1067 +                norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
  1.1068 +            }
  1.1069 +        }
  1.1070 +    }
  1.1071 +
  1.1072 +    indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
  1.1073 +    indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
  1.1074 +
  1.1075 +    ExtraDataWriter extraDataWriter(*this);
  1.1076 +    utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter);
  1.1077 +
  1.1078 +    extraData=extraDataWriter.maybeYesCompositions;
  1.1079 +    extraData.append(extraDataWriter.yesYesCompositions).
  1.1080 +              append(extraDataWriter.yesNoMappingsAndCompositions).
  1.1081 +              append(extraDataWriter.yesNoMappingsOnly).
  1.1082 +              append(extraDataWriter.noNoMappings);
  1.1083 +    // Pad to even length for 4-byte alignment of following data.
  1.1084 +    if(extraData.length()&1) {
  1.1085 +        extraData.append((UChar)0);
  1.1086 +    }
  1.1087 +
  1.1088 +    indexes[Normalizer2Impl::IX_MIN_YES_NO]=
  1.1089 +        extraDataWriter.yesYesCompositions.length();
  1.1090 +    indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
  1.1091 +        indexes[Normalizer2Impl::IX_MIN_YES_NO]+
  1.1092 +        extraDataWriter.yesNoMappingsAndCompositions.length();
  1.1093 +    indexes[Normalizer2Impl::IX_MIN_NO_NO]=
  1.1094 +        indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
  1.1095 +        extraDataWriter.yesNoMappingsOnly.length();
  1.1096 +    indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
  1.1097 +        indexes[Normalizer2Impl::IX_MIN_NO_NO]+
  1.1098 +        extraDataWriter.noNoMappings.length();
  1.1099 +    indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
  1.1100 +        Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
  1.1101 +        extraDataWriter.maybeYesCompositions.length();
  1.1102 +
  1.1103 +    int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
  1.1104 +    if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
  1.1105 +        fprintf(stderr,
  1.1106 +                "gennorm2 error: "
  1.1107 +                "data structure overflow, too much mapping composition data\n");
  1.1108 +        exit(U_BUFFER_OVERFLOW_ERROR);
  1.1109 +    }
  1.1110 +
  1.1111 +    utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr());
  1.1112 +
  1.1113 +    setHangulData();
  1.1114 +
  1.1115 +    // Look for the "worst" norm16 value of any supplementary code point
  1.1116 +    // corresponding to a lead surrogate, and set it as that surrogate's value.
  1.1117 +    // Enables quick check inner loops to look at only code units.
  1.1118 +    //
  1.1119 +    // We could be more sophisticated:
  1.1120 +    // We could collect a bit set for whether there are values in the different
  1.1121 +    // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
  1.1122 +    // and select the best value that only breaks the composition and/or decomposition
  1.1123 +    // inner loops if necessary.
  1.1124 +    // However, that seems like overkill for an optimization for supplementary characters.
  1.1125 +    for(UChar lead=0xd800; lead<0xdc00; ++lead) {
  1.1126 +        uint32_t maxValue=utrie2_get32(norm16Trie, lead);
  1.1127 +        utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue);
  1.1128 +        if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
  1.1129 +            maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]
  1.1130 +        ) {
  1.1131 +            // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
  1.1132 +            // Otherwise it might end up at something like JAMO_VT which stays in
  1.1133 +            // the inner decomposition quick check loop.
  1.1134 +            maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
  1.1135 +        }
  1.1136 +        utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode);
  1.1137 +    }
  1.1138 +
  1.1139 +    // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
  1.1140 +    // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
  1.1141 +    // which is harmless.
  1.1142 +    // As a result, the minimum code points are always BMP code points.
  1.1143 +    int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP];
  1.1144 +    if(minCP>=0x10000) {
  1.1145 +        indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
  1.1146 +    }
  1.1147 +    minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP];
  1.1148 +    if(minCP>=0x10000) {
  1.1149 +        indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
  1.1150 +    }
  1.1151 +}
  1.1152 +
  1.1153 +void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
  1.1154 +    processData();
  1.1155 +
  1.1156 +    IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
  1.1157 +    utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode);
  1.1158 +    int32_t norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode);
  1.1159 +    if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) {
  1.1160 +        fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n",
  1.1161 +                errorCode.errorName());
  1.1162 +        exit(errorCode.reset());
  1.1163 +    }
  1.1164 +    errorCode.reset();
  1.1165 +    LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
  1.1166 +    utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
  1.1167 +    errorCode.assertSuccess();
  1.1168 +
  1.1169 +    int32_t offset=(int32_t)sizeof(indexes);
  1.1170 +    indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
  1.1171 +    offset+=norm16TrieLength;
  1.1172 +    indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
  1.1173 +    offset+=extraData.length()*2;
  1.1174 +    indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset;
  1.1175 +    offset+=sizeof(smallFCD);
  1.1176 +    int32_t totalSize=offset;
  1.1177 +    for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
  1.1178 +        indexes[i]=totalSize;
  1.1179 +    }
  1.1180 +
  1.1181 +    if(beVerbose) {
  1.1182 +        printf("size of normalization trie:         %5ld bytes\n", (long)norm16TrieLength);
  1.1183 +        printf("size of 16-bit extra data:          %5ld uint16_t\n", (long)extraData.length());
  1.1184 +        printf("size of small-FCD data:             %5ld bytes\n", (long)sizeof(smallFCD));
  1.1185 +        printf("size of binary data file contents:  %5ld bytes\n", (long)totalSize);
  1.1186 +        printf("minDecompNoCodePoint:              U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
  1.1187 +        printf("minCompNoMaybeCodePoint:           U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
  1.1188 +        printf("minYesNo:                          0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
  1.1189 +        printf("minYesNoMappingsOnly:              0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
  1.1190 +        printf("minNoNo:                           0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
  1.1191 +        printf("limitNoNo:                         0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
  1.1192 +        printf("minMaybeYes:                       0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
  1.1193 +    }
  1.1194 +
  1.1195 +    UVersionInfo nullVersion={ 0, 0, 0, 0 };
  1.1196 +    if(0==memcmp(nullVersion, unicodeVersion, 4)) {
  1.1197 +        u_versionFromString(unicodeVersion, U_UNICODE_VERSION);
  1.1198 +    }
  1.1199 +    memcpy(dataInfo.dataVersion, unicodeVersion, 4);
  1.1200 +    UNewDataMemory *pData=
  1.1201 +        udata_create(NULL, NULL, filename, &dataInfo,
  1.1202 +                     haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode);
  1.1203 +    if(errorCode.isFailure()) {
  1.1204 +        fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
  1.1205 +                filename, errorCode.errorName());
  1.1206 +        exit(errorCode.reset());
  1.1207 +    }
  1.1208 +    udata_writeBlock(pData, indexes, sizeof(indexes));
  1.1209 +    udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength);
  1.1210 +    udata_writeUString(pData, extraData.getBuffer(), extraData.length());
  1.1211 +    udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
  1.1212 +    int32_t writtenSize=udata_finish(pData, errorCode);
  1.1213 +    if(errorCode.isFailure()) {
  1.1214 +        fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
  1.1215 +        exit(errorCode.reset());
  1.1216 +    }
  1.1217 +    if(writtenSize!=totalSize) {
  1.1218 +        fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
  1.1219 +            (long)writtenSize, (long)totalSize);
  1.1220 +        exit(U_INTERNAL_PROGRAM_ERROR);
  1.1221 +    }
  1.1222 +}
  1.1223 +
  1.1224 +U_NAMESPACE_END
  1.1225 +
  1.1226 +#endif /* #if !UCONFIG_NO_NORMALIZATION */
  1.1227 +
  1.1228 +/*
  1.1229 + * Hey, Emacs, please set the following:
  1.1230 + *
  1.1231 + * Local Variables:
  1.1232 + * indent-tabs-mode: nil
  1.1233 + * End:
  1.1234 + */

mercurial