intl/icu/source/common/normalizer2impl.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/normalizer2impl.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,779 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2009-2013, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  normalizer2impl.h
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2009nov22
    1.17 +*   created by: Markus W. Scherer
    1.18 +*/
    1.19 +
    1.20 +#ifndef __NORMALIZER2IMPL_H__
    1.21 +#define __NORMALIZER2IMPL_H__
    1.22 +
    1.23 +#include "unicode/utypes.h"
    1.24 +
    1.25 +#if !UCONFIG_NO_NORMALIZATION
    1.26 +
    1.27 +#include "unicode/normalizer2.h"
    1.28 +#include "unicode/udata.h"
    1.29 +#include "unicode/unistr.h"
    1.30 +#include "unicode/unorm.h"
    1.31 +#include "unicode/utf16.h"
    1.32 +#include "mutex.h"
    1.33 +#include "uset_imp.h"
    1.34 +#include "utrie2.h"
    1.35 +
    1.36 +U_NAMESPACE_BEGIN
    1.37 +
    1.38 +struct CanonIterData;
    1.39 +
    1.40 +class Hangul {
    1.41 +public:
    1.42 +    /* Korean Hangul and Jamo constants */
    1.43 +    enum {
    1.44 +        JAMO_L_BASE=0x1100,     /* "lead" jamo */
    1.45 +        JAMO_V_BASE=0x1161,     /* "vowel" jamo */
    1.46 +        JAMO_T_BASE=0x11a7,     /* "trail" jamo */
    1.47 +
    1.48 +        HANGUL_BASE=0xac00,
    1.49 +
    1.50 +        JAMO_L_COUNT=19,
    1.51 +        JAMO_V_COUNT=21,
    1.52 +        JAMO_T_COUNT=28,
    1.53 +
    1.54 +        JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT,
    1.55 +
    1.56 +        HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT,
    1.57 +        HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT
    1.58 +    };
    1.59 +
    1.60 +    static inline UBool isHangul(UChar32 c) {
    1.61 +        return HANGUL_BASE<=c && c<HANGUL_LIMIT;
    1.62 +    }
    1.63 +    static inline UBool
    1.64 +    isHangulWithoutJamoT(UChar c) {
    1.65 +        c-=HANGUL_BASE;
    1.66 +        return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
    1.67 +    }
    1.68 +    static inline UBool isJamoL(UChar32 c) {
    1.69 +        return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT;
    1.70 +    }
    1.71 +    static inline UBool isJamoV(UChar32 c) {
    1.72 +        return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT;
    1.73 +    }
    1.74 +
    1.75 +    /**
    1.76 +     * Splits c, which the caller guarantees to be a Hangul syllable, into jamo
    1.77 +     * stored in buffer, and returns how many code units were written (2 or 3).
    1.78 +     */
    1.79 +    static inline int32_t decompose(UChar32 c, UChar buffer[3]) {
    1.80 +        c-=HANGUL_BASE;
    1.81 +        UChar32 trailIndex=c%JAMO_T_COUNT;
    1.82 +        c/=JAMO_T_COUNT;  // combined lead/vowel index
    1.83 +        buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
    1.84 +        buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
    1.85 +        if(trailIndex!=0) {
    1.86 +            buffer[2]=(UChar)(JAMO_T_BASE+trailIndex);
    1.87 +            return 3;
    1.88 +        } else {
    1.89 +            return 2;  // no trail jamo
    1.90 +        }
    1.91 +    }
    1.92 +
    1.93 +    /**
    1.94 +     * Writes the raw (one-level, not recursive) decomposition of c, which must be a Hangul
    1.95 +     * syllable, into buffer: lead+vowel jamo if there is no trail jamo, else LV syllable+trail jamo.
    1.96 +     */
    1.97 +    static inline void getRawDecomposition(UChar32 c, UChar buffer[2]) {
    1.98 +        UChar32 orig=c;
    1.99 +        c-=HANGUL_BASE;
   1.100 +        UChar32 c2=c%JAMO_T_COUNT;
   1.101 +        if(c2==0) {
   1.102 +            c/=JAMO_T_COUNT;
   1.103 +            buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
   1.104 +            buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
   1.105 +        } else {
   1.106 +            buffer[0]=(UChar)(orig-c2);  // LV syllable; explicit narrowing like the other stores
   1.107 +            buffer[1]=(UChar)(JAMO_T_BASE+c2);
   1.108 +        }
   1.109 +    }
   1.110 +private:
   1.111 +    Hangul();  // no instantiation
   1.112 +};
   1.113 +
   1.114 +class Normalizer2Impl;
   1.115 +
   1.116 +class ReorderingBuffer : public UMemory {
   1.117 +public:
   1.118 +    ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) :
   1.119 +        impl(ni), str(dest),
   1.120 +        start(NULL), reorderStart(NULL), limit(NULL),  // NULL start: destructor skips releaseBuffer()
   1.121 +        remainingCapacity(0), lastCC(0), codePointStart(NULL), codePointLimit(NULL) {}
   1.122 +    ~ReorderingBuffer() {
   1.123 +        if(start!=NULL) {
   1.124 +            str.releaseBuffer((int32_t)(limit-start));
   1.125 +        }
   1.126 +    }
   1.127 +    UBool init(int32_t destCapacity, UErrorCode &errorCode);
   1.128 +
   1.129 +    UBool isEmpty() const { return start==limit; }
   1.130 +    int32_t length() const { return (int32_t)(limit-start); }
   1.131 +    UChar *getStart() { return start; }
   1.132 +    UChar *getLimit() { return limit; }
   1.133 +    uint8_t getLastCC() const { return lastCC; }
   1.134 +
   1.135 +    UBool equals(const UChar *start, const UChar *limit) const;
   1.136 +
   1.137 +    // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
   1.138 +    void setLastChar(UChar c) {
   1.139 +        *(limit-1)=c;
   1.140 +    }
   1.141 +
   1.142 +    UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
   1.143 +        return (c<=0xffff) ?
   1.144 +            appendBMP((UChar)c, cc, errorCode) :
   1.145 +            appendSupplementary(c, cc, errorCode);
   1.146 +    }
   1.147 +    // s must be in NFD, otherwise change the implementation.
   1.148 +    UBool append(const UChar *s, int32_t length,
   1.149 +                 uint8_t leadCC, uint8_t trailCC,
   1.150 +                 UErrorCode &errorCode);
   1.151 +    UBool appendBMP(UChar c, uint8_t cc, UErrorCode &errorCode) {
   1.152 +        if(remainingCapacity==0 && !resize(1, errorCode)) {  // no room left and resize() failed
   1.153 +            return FALSE;
   1.154 +        }
   1.155 +        if(lastCC<=cc || cc==0) {  // c does not sort before the last character: append at the end
   1.156 +            *limit++=c;
   1.157 +            lastCC=cc;
   1.158 +            if(cc<=1) {
   1.159 +                reorderStart=limit;  // reorderStart follows the last character with cc<=1
   1.160 +            }
   1.161 +        } else {
   1.162 +            insert(c, cc);  // 0<cc<lastCC: hand off to insert()
   1.163 +        }
   1.164 +        --remainingCapacity;  // one unit of capacity is consumed on either path
   1.165 +        return TRUE;
   1.166 +    }
   1.167 +    UBool appendZeroCC(UChar32 c, UErrorCode &errorCode);
   1.168 +    UBool appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode);
   1.169 +    void remove();
   1.170 +    void removeSuffix(int32_t suffixLength);
   1.171 +    void setReorderingLimit(UChar *newLimit) {
   1.172 +        remainingCapacity+=(int32_t)(limit-newLimit);
   1.173 +        reorderStart=limit=newLimit;
   1.174 +        lastCC=0;
   1.175 +    }
   1.176 +    void copyReorderableSuffixTo(UnicodeString &s) const {
   1.177 +        s.setTo(reorderStart, (int32_t)(limit-reorderStart));
   1.178 +    }
   1.179 +private:
   1.180 +    /*
   1.181 +     * TODO: Revisit whether it makes sense to track reorderStart.
   1.182 +     * It is set to after the last known character with cc<=1,
   1.183 +     * which stops previousCC() before it reads that character and looks up its cc.
   1.184 +     * previousCC() is normally only called from insert().
   1.185 +     * In other words, reorderStart speeds up the insertion of a combining mark
   1.186 +     * into a multi-combining mark sequence where it does not belong at the end.
   1.187 +     * This might not be worth the trouble.
   1.188 +     * On the other hand, it's not a huge amount of trouble.
   1.189 +     *
   1.190 +     * We probably need it for UNORM_SIMPLE_APPEND.
   1.191 +     */
   1.192 +
   1.193 +    UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode);
   1.194 +    void insert(UChar32 c, uint8_t cc);
   1.195 +    static void writeCodePoint(UChar *p, UChar32 c) {
   1.196 +        if(c<=0xffff) {
   1.197 +            *p=(UChar)c;
   1.198 +        } else {
   1.199 +            p[0]=U16_LEAD(c);
   1.200 +            p[1]=U16_TRAIL(c);
   1.201 +        }
   1.202 +    }
   1.203 +    UBool resize(int32_t appendLength, UErrorCode &errorCode);
   1.204 +
   1.205 +    const Normalizer2Impl &impl;
   1.206 +    UnicodeString &str;
   1.207 +    UChar *start, *reorderStart, *limit;
   1.208 +    int32_t remainingCapacity;
   1.209 +    uint8_t lastCC;
   1.210 +
   1.211 +    // private backward iterator
   1.212 +    void setIterator() { codePointStart=limit; }
   1.213 +    void skipPrevious();  // Requires start<codePointStart.
   1.214 +    uint8_t previousCC();  // Returns 0 if there is no previous character.
   1.215 +
   1.216 +    UChar *codePointStart, *codePointLimit;
   1.217 +};
   1.218 +
   1.219 +class U_COMMON_API Normalizer2Impl : public UMemory {
   1.220 +public:
   1.221 +    Normalizer2Impl() : memory(NULL), normTrie(NULL), fCanonIterData(NULL) {
   1.222 +        fCanonIterDataInitOnce.reset();
   1.223 +    }
   1.224 +    ~Normalizer2Impl();
   1.225 +
   1.226 +    void load(const char *packageName, const char *name, UErrorCode &errorCode);
   1.227 +
   1.228 +    void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
   1.229 +    void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
   1.230 +
   1.231 +    // low-level properties ------------------------------------------------ ***
   1.232 +
   1.233 +    const UTrie2 *getNormTrie() const { return normTrie; }
   1.234 +
   1.235 +    UBool ensureCanonIterData(UErrorCode &errorCode) const;
   1.236 +
   1.237 +    uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); }
   1.238 +
   1.239 +    UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const {
   1.240 +        // Maps a norm16 value to the composition quick check result.
   1.241 +        // Only values in [minNoNo, MIN_YES_YES_WITH_CC[ are not a plain "yes".
   1.242 +        if(minNoNo<=norm16 && norm16<MIN_YES_YES_WITH_CC) {
   1.243 +            // Within that range, minMaybeYes separates "no" (below) from "maybe".
   1.244 +            return norm16<minMaybeYes ? UNORM_NO : UNORM_MAYBE;
   1.245 +        }
   1.246 +        return UNORM_YES;
   1.247 +    }
   1.248 +    UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeYes; }
   1.249 +    UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; }
   1.250 +
   1.251 +    uint8_t getCC(uint16_t norm16) const {
   1.252 +        if(norm16>=MIN_NORMAL_MAYBE_YES) {
   1.253 +            return (uint8_t)norm16;
   1.254 +        }
   1.255 +        if(norm16<minNoNo || limitNoNo<=norm16) {
   1.256 +            return 0;
   1.257 +        }
   1.258 +        return getCCFromNoNo(norm16);
   1.259 +    }
   1.260 +    static uint8_t getCCFromYesOrMaybe(uint16_t norm16) {
   1.261 +        return norm16>=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0;
   1.262 +    }
   1.263 +
   1.264 +    /**
   1.265 +     * Returns the FCD data for code point c.
   1.266 +     * @param c A Unicode code point.
   1.267 +     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
   1.268 +     */
   1.269 +    uint16_t getFCD16(UChar32 c) const {
   1.270 +        if(c<0) {
   1.271 +            return 0;
   1.272 +        } else if(c<0x180) {
   1.273 +            return tccc180[c];
   1.274 +        } else if(c<=0xffff) {
   1.275 +            if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
   1.276 +        }
   1.277 +        return getFCD16FromNormData(c);
   1.278 +    }
   1.279 +    /**
   1.280 +     * Returns the FCD data for the next code point (post-increment).
   1.281 +     * Might skip only a lead surrogate rather than the whole surrogate pair if none of
   1.282 +     * the supplementary code points associated with the lead surrogate have non-zero FCD data.
   1.283 +     * @param s A valid pointer into a string. Requires s!=limit.
   1.284 +     * @param limit The end of the string, or NULL.
   1.285 +     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
   1.286 +     */
   1.287 +    uint16_t nextFCD16(const UChar *&s, const UChar *limit) const {
   1.288 +        UChar32 c=*s++;
   1.289 +        if(c<0x180) {
   1.290 +            return tccc180[c];
   1.291 +        } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
   1.292 +            return 0;
   1.293 +        }
   1.294 +        UChar c2;
   1.295 +        if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) {
   1.296 +            c=U16_GET_SUPPLEMENTARY(c, c2);
   1.297 +            ++s;
   1.298 +        }
   1.299 +        return getFCD16FromNormData(c);
   1.300 +    }
   1.301 +    /**
   1.302 +     * Returns the FCD data for the previous code point (pre-decrement).
   1.303 +     * @param start The start of the string.
   1.304 +     * @param s A valid pointer into a string. Requires start<s.
   1.305 +     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
   1.306 +     */
   1.307 +    uint16_t previousFCD16(const UChar *start, const UChar *&s) const {
   1.308 +        UChar32 c=*--s;
   1.309 +        if(c<0x180) {
   1.310 +            return tccc180[c];
   1.311 +        }
   1.312 +        if(!U16_IS_TRAIL(c)) {
   1.313 +            if(!singleLeadMightHaveNonZeroFCD16(c)) {
   1.314 +                return 0;
   1.315 +            }
   1.316 +        } else {
   1.317 +            UChar c2;
   1.318 +            if(start<s && U16_IS_LEAD(c2=*(s-1))) {
   1.319 +                c=U16_GET_SUPPLEMENTARY(c2, c);
   1.320 +                --s;
   1.321 +            }
   1.322 +        }
   1.323 +        return getFCD16FromNormData(c);
   1.324 +    }
   1.325 +
   1.326 +    /** Returns the FCD data for U+0000<=c<U+0180. */
   1.327 +    uint16_t getFCD16FromBelow180(UChar32 c) const { return tccc180[c]; }
   1.328 +    /** Returns TRUE if the single-or-lead code unit might have non-zero FCD data. */
   1.329 +    UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const {
   1.330 +        // Requires 0<=lead<=0xffff.
   1.331 +        // smallFCD has one byte per 256 code units and one bit per 32 of them.
   1.332 +        uint8_t bits=smallFCD[lead>>8];
   1.333 +        return (UBool)(bits!=0 && ((bits>>((lead>>5)&7))&1)!=0);
   1.334 +    }
   1.335 +    /** Returns the FCD value from the regular normalization data. */
   1.336 +    uint16_t getFCD16FromNormData(UChar32 c) const;
   1.337 +
   1.338 +    void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
   1.339 +                                     CanonIterData &newData, UErrorCode &errorCode) const;
   1.340 +
   1.341 +    /**
   1.342 +     * Gets the decomposition for one code point.
   1.343 +     * @param c code point
   1.344 +     * @param buffer out-only buffer for algorithmic decompositions
   1.345 +     * @param length out-only, takes the length of the decomposition, if any
   1.346 +     * @return pointer to the decomposition, or NULL if none
   1.347 +     */
   1.348 +    const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const;
   1.349 +
   1.350 +    /**
   1.351 +     * Gets the raw decomposition for one code point.
   1.352 +     * @param c code point
   1.353 +     * @param buffer out-only buffer for algorithmic decompositions
   1.354 +     * @param length out-only, takes the length of the decomposition, if any
   1.355 +     * @return pointer to the decomposition, or NULL if none
   1.356 +     */
   1.357 +    const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const;
   1.358 +
   1.359 +    UChar32 composePair(UChar32 a, UChar32 b) const;
   1.360 +
   1.361 +    UBool isCanonSegmentStarter(UChar32 c) const;
   1.362 +    UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;
   1.363 +
   1.364 +    enum {
   1.365 +        MIN_CCC_LCCC_CP=0x300
   1.366 +    };
   1.367 +
   1.368 +    enum {
   1.369 +        MIN_YES_YES_WITH_CC=0xff01,
   1.370 +        JAMO_VT=0xff00,
   1.371 +        MIN_NORMAL_MAYBE_YES=0xfe00,
   1.372 +        JAMO_L=1,
   1.373 +        MAX_DELTA=0x40
   1.374 +    };
   1.375 +
   1.376 +    enum {
   1.377 +        // Byte offsets from the start of the data, after the generic header.
   1.378 +        IX_NORM_TRIE_OFFSET,
   1.379 +        IX_EXTRA_DATA_OFFSET,
   1.380 +        IX_SMALL_FCD_OFFSET,
   1.381 +        IX_RESERVED3_OFFSET,
   1.382 +        IX_RESERVED4_OFFSET,
   1.383 +        IX_RESERVED5_OFFSET,
   1.384 +        IX_RESERVED6_OFFSET,
   1.385 +        IX_TOTAL_SIZE,
   1.386 +
   1.387 +        // Code point thresholds for quick check codes.
   1.388 +        IX_MIN_DECOMP_NO_CP,
   1.389 +        IX_MIN_COMP_NO_MAYBE_CP,
   1.390 +
   1.391 +        // Norm16 value thresholds for quick check combinations and types of extra data.
   1.392 +        IX_MIN_YES_NO,  // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
   1.393 +        IX_MIN_NO_NO,
   1.394 +        IX_LIMIT_NO_NO,
   1.395 +        IX_MIN_MAYBE_YES,
   1.396 +
   1.397 +        IX_MIN_YES_NO_MAPPINGS_ONLY,  // Mappings only in [minYesNoMappingsOnly..minNoNo[.
   1.398 +
   1.399 +        IX_RESERVED15,
   1.400 +        IX_COUNT
   1.401 +    };
   1.402 +
   1.403 +    enum {
   1.404 +        MAPPING_HAS_CCC_LCCC_WORD=0x80,
   1.405 +        MAPPING_HAS_RAW_MAPPING=0x40,
   1.406 +        MAPPING_NO_COMP_BOUNDARY_AFTER=0x20,
   1.407 +        MAPPING_LENGTH_MASK=0x1f
   1.408 +    };
   1.409 +
   1.410 +    enum {
   1.411 +        COMP_1_LAST_TUPLE=0x8000,
   1.412 +        COMP_1_TRIPLE=1,
   1.413 +        COMP_1_TRAIL_LIMIT=0x3400,
   1.414 +        COMP_1_TRAIL_MASK=0x7ffe,
   1.415 +        COMP_1_TRAIL_SHIFT=9,  // 10-1 for the "triple" bit
   1.416 +        COMP_2_TRAIL_SHIFT=6,
   1.417 +        COMP_2_TRAIL_MASK=0xffc0
   1.418 +    };
   1.419 +
   1.420 +    // higher-level functionality ------------------------------------------ ***
   1.421 +
   1.422 +    const UChar *decompose(const UChar *src, const UChar *limit,
   1.423 +                           ReorderingBuffer *buffer, UErrorCode &errorCode) const;
   1.424 +    void decomposeAndAppend(const UChar *src, const UChar *limit,
   1.425 +                            UBool doDecompose,
   1.426 +                            UnicodeString &safeMiddle,
   1.427 +                            ReorderingBuffer &buffer,
   1.428 +                            UErrorCode &errorCode) const;
   1.429 +    UBool compose(const UChar *src, const UChar *limit,
   1.430 +                  UBool onlyContiguous,
   1.431 +                  UBool doCompose,
   1.432 +                  ReorderingBuffer &buffer,
   1.433 +                  UErrorCode &errorCode) const;
   1.434 +    const UChar *composeQuickCheck(const UChar *src, const UChar *limit,
   1.435 +                                   UBool onlyContiguous,
   1.436 +                                   UNormalizationCheckResult *pQCResult) const;
   1.437 +    void composeAndAppend(const UChar *src, const UChar *limit,
   1.438 +                          UBool doCompose,
   1.439 +                          UBool onlyContiguous,
   1.440 +                          UnicodeString &safeMiddle,
   1.441 +                          ReorderingBuffer &buffer,
   1.442 +                          UErrorCode &errorCode) const;
   1.443 +    const UChar *makeFCD(const UChar *src, const UChar *limit,
   1.444 +                         ReorderingBuffer *buffer, UErrorCode &errorCode) const;
   1.445 +    void makeFCDAndAppend(const UChar *src, const UChar *limit,
   1.446 +                          UBool doMakeFCD,
   1.447 +                          UnicodeString &safeMiddle,
   1.448 +                          ReorderingBuffer &buffer,
   1.449 +                          UErrorCode &errorCode) const;
   1.450 +
   1.451 +    UBool hasDecompBoundary(UChar32 c, UBool before) const;
   1.452 +    UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); }
   1.453 +
   1.454 +    UBool hasCompBoundaryBefore(UChar32 c) const {
   1.455 +        return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
   1.456 +    }
   1.457 +    UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const;
   1.458 +
   1.459 +    UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
   1.460 +    UBool hasFCDBoundaryAfter(UChar32 c) const {
   1.461 +        uint16_t fcd16=getFCD16(c);
   1.462 +        return fcd16<=1 || (fcd16&0xff)==0;
   1.463 +    }
   1.464 +    UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; }
   1.465 +private:
   1.466 +    static UBool U_CALLCONV
   1.467 +    isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
   1.468 +
   1.469 +    UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
   1.470 +    UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
   1.471 +    static UBool isInert(uint16_t norm16) { return norm16==0; }
   1.472 +    static UBool isJamoL(uint16_t norm16) { return norm16==1; }
   1.473 +    static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
   1.474 +    UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; }
   1.475 +    UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
   1.476 +    // UBool isCompYes(uint16_t norm16) const {
   1.477 +    //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
   1.478 +    // }
   1.479 +    // UBool isCompYesOrMaybe(uint16_t norm16) const {
   1.480 +    //     return norm16<minNoNo || minMaybeYes<=norm16;
   1.481 +    // }
   1.482 +    // UBool hasZeroCCFromDecompYes(uint16_t norm16) const {
   1.483 +    //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
   1.484 +    // }
   1.485 +    UBool isDecompYesAndZeroCC(uint16_t norm16) const {
   1.486 +        return norm16<minYesNo ||
   1.487 +               norm16==JAMO_VT ||
   1.488 +               (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
   1.489 +    }
   1.490 +    /**
   1.491 +     * A little faster and simpler than isDecompYesAndZeroCC() but does not include
   1.492 +     * the MaybeYes which combine-forward and have ccc=0.
   1.493 +     * (Standard Unicode 5.2 normalization does not have such characters.)
   1.494 +     */
   1.495 +    UBool isMostDecompYesAndZeroCC(uint16_t norm16) const {
   1.496 +        return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
   1.497 +    }
   1.498 +    UBool isDecompNoAlgorithmic(uint16_t norm16) const { return norm16>=limitNoNo; }
   1.499 +
   1.500 +    // For use with isCompYes().
   1.501 +    // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
   1.502 +    // static uint8_t getCCFromYes(uint16_t norm16) {
   1.503 +    //     return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
   1.504 +    // }
   1.505 +    uint8_t getCCFromNoNo(uint16_t norm16) const {
   1.506 +        const uint16_t *mapping=getMapping(norm16);
   1.507 +        if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) {
   1.508 +            return (uint8_t)*(mapping-1);
   1.509 +        } else {
   1.510 +            return 0;
   1.511 +        }
   1.512 +    }
   1.513 +    // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
   1.514 +    uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const;
   1.515 +
   1.516 +    // Requires algorithmic-NoNo.
   1.517 +    UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const {
   1.518 +        return c+norm16-(minMaybeYes-MAX_DELTA-1);
   1.519 +    }
   1.520 +
   1.521 +    // Requires minYesNo<norm16<limitNoNo.
   1.522 +    const uint16_t *getMapping(uint16_t norm16) const { return extraData+norm16; }
   1.523 +    const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const {
   1.524 +        if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
   1.525 +            return NULL;
   1.526 +        } else if(norm16<minMaybeYes) {
   1.527 +            return extraData+norm16;  // for yesYes; if Jamo L: harmless empty list
   1.528 +        } else {
   1.529 +            return maybeYesCompositions+norm16-minMaybeYes;
   1.530 +        }
   1.531 +    }
   1.532 +    const uint16_t *getCompositionsListForComposite(uint16_t norm16) const {
   1.533 +        const uint16_t *list=extraData+norm16;  // composite has both mapping & compositions list
   1.534 +        return list+  // mapping pointer
   1.535 +            1+  // +1 to skip the first unit with the mapping length
   1.536 +            (*list&MAPPING_LENGTH_MASK);  // + mapping length
   1.537 +    }
   1.538 +    /**
   1.539 +     * @param norm16 norm16 value of a code point that must have compositions
   1.540 +     * @return compositions list pointer, from the decomp-yes or the composite lookup
   1.541 +     */
   1.542 +    const uint16_t *getCompositionsList(uint16_t norm16) const {
   1.543 +        return isDecompYes(norm16) ?
   1.544 +                getCompositionsListForDecompYes(norm16) :
   1.545 +                getCompositionsListForComposite(norm16);
   1.546 +    }
   1.547 +
   1.548 +    const UChar *copyLowPrefixFromNulTerminated(const UChar *src,
   1.549 +                                                UChar32 minNeedDataCP,
   1.550 +                                                ReorderingBuffer *buffer,
   1.551 +                                                UErrorCode &errorCode) const;
   1.552 +    UBool decomposeShort(const UChar *src, const UChar *limit,
   1.553 +                         ReorderingBuffer &buffer, UErrorCode &errorCode) const;
   1.554 +    UBool decompose(UChar32 c, uint16_t norm16,
   1.555 +                    ReorderingBuffer &buffer, UErrorCode &errorCode) const;
   1.556 +
   1.557 +    static int32_t combine(const uint16_t *list, UChar32 trail);
   1.558 +    void addComposites(const uint16_t *list, UnicodeSet &set) const;
   1.559 +    void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
   1.560 +                   UBool onlyContiguous) const;
   1.561 +
   1.562 +    UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const;
   1.563 +    const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const;
   1.564 +    const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const;
   1.565 +
   1.566 +    const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const;
   1.567 +    const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const;
   1.568 +
   1.569 +    int32_t getCanonValue(UChar32 c) const;
   1.570 +    const UnicodeSet &getCanonStartSet(int32_t n) const;
   1.571 +
   1.572 +    UDataMemory *memory;
   1.573 +    UVersionInfo dataVersion;
   1.574 +
   1.575 +    // Code point thresholds for quick check codes.
   1.576 +    UChar32 minDecompNoCP;
   1.577 +    UChar32 minCompNoMaybeCP;
   1.578 +
   1.579 +    // Norm16 value thresholds for quick check combinations and types of extra data.
   1.580 +    uint16_t minYesNo;
   1.581 +    uint16_t minYesNoMappingsOnly;
   1.582 +    uint16_t minNoNo;
   1.583 +    uint16_t limitNoNo;
   1.584 +    uint16_t minMaybeYes;
   1.585 +
   1.586 +    UTrie2 *normTrie;
   1.587 +    const uint16_t *maybeYesCompositions;
   1.588 +    const uint16_t *extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
   1.589 +    const uint8_t *smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
   1.590 +    uint8_t tccc180[0x180];  // tccc values for U+0000..U+017F
   1.591 +
   1.592 +  public:           // CanonIterData is public to allow access from C callback functions.
   1.593 +    UInitOnce       fCanonIterDataInitOnce;
   1.594 +    CanonIterData  *fCanonIterData;
   1.595 +};
   1.596 +
   1.597 +// bits in canonIterData
   1.598 +#define CANON_NOT_SEGMENT_STARTER 0x80000000
   1.599 +#define CANON_HAS_COMPOSITIONS 0x40000000
   1.600 +#define CANON_HAS_SET 0x200000
   1.601 +#define CANON_VALUE_MASK 0x1fffff
   1.602 +
   1.603 +/**
   1.604 + * ICU-internal shortcut for quick access to standard Unicode normalization.
   1.605 + */
   1.606 +class U_COMMON_API Normalizer2Factory {
   1.607 +public:
   1.608 +    static const Normalizer2 *getNFCInstance(UErrorCode &errorCode);
   1.609 +    static const Normalizer2 *getNFDInstance(UErrorCode &errorCode);
   1.610 +    static const Normalizer2 *getFCDInstance(UErrorCode &errorCode);
   1.611 +    static const Normalizer2 *getFCCInstance(UErrorCode &errorCode);
   1.612 +    static const Normalizer2 *getNFKCInstance(UErrorCode &errorCode);
   1.613 +    static const Normalizer2 *getNFKDInstance(UErrorCode &errorCode);
   1.614 +    static const Normalizer2 *getNFKC_CFInstance(UErrorCode &errorCode);
   1.615 +    static const Normalizer2 *getNoopInstance(UErrorCode &errorCode);
   1.616 +
   1.617 +    static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode);
   1.618 +
   1.619 +    static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode);
   1.620 +    static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode);
   1.621 +    static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode);
   1.622 +
   1.623 +    // Get the Impl instance of the Normalizer2.
   1.624 +    // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance.
   1.625 +    static const Normalizer2Impl *getImpl(const Normalizer2 *norm2);
   1.626 +private:
   1.627 +    Normalizer2Factory();  // No instantiation.
   1.628 +};
   1.629 +
   1.630 +U_NAMESPACE_END
   1.631 +
   1.632 +U_CAPI int32_t U_EXPORT2
   1.633 +unorm2_swap(const UDataSwapper *ds,
   1.634 +            const void *inData, int32_t length, void *outData,
   1.635 +            UErrorCode *pErrorCode);
   1.636 +
   1.637 +/**
   1.638 + * Get the NF*_QC property for a code point, for u_getIntPropertyValue().
   1.639 + * @internal
   1.640 + */
   1.641 +U_CFUNC UNormalizationCheckResult
   1.642 +unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
   1.643 +
   1.644 +/**
   1.645 + * Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue().
   1.646 + * @internal
   1.647 + */
   1.648 +U_CFUNC uint16_t
   1.649 +unorm_getFCD16(UChar32 c);
   1.650 +
   1.651 +/**
   1.652 + * Format of Normalizer2 .nrm data files.
   1.653 + * Format version 2.0.
   1.654 + *
   1.655 + * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
   1.656 + * ICU ships with data files for standard Unicode Normalization Forms
   1.657 + * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm).
   1.658 + * Custom (application-specific) data can be built into additional .nrm files
   1.659 + * with the gennorm2 build tool.
   1.660 + *
   1.661 + * Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been
   1.662 + * cached already. Internally, Normalizer2Impl.load() reads the .nrm file.
   1.663 + *
   1.664 + * A .nrm file begins with a standard ICU data file header
   1.665 + * (DataHeader, see ucmndata.h and unicode/udata.h).
   1.666 + * The UDataInfo.dataVersion field usually contains the Unicode version
   1.667 + * for which the data was generated.
   1.668 + *
   1.669 + * After the header, the file contains the following parts.
   1.670 + * Constants are defined as enum values of the Normalizer2Impl class.
   1.671 + *
   1.672 + * Many details of the data structures are described in the design doc
   1.673 + * which is at http://site.icu-project.org/design/normalization/custom
   1.674 + *
   1.675 + * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4;
   1.676 + *
   1.677 + *      The first eight indexes are byte offsets in ascending order.
   1.678 + *      Each byte offset marks the start of the next part in the data file,
   1.679 + *      and the end of the previous one.
   1.680 + *      When two consecutive byte offsets are the same, then the corresponding part is empty.
   1.681 + *      Byte offsets are offsets from after the header,
   1.682 + *      that is, from the beginning of the indexes[].
   1.683 + *      Each part starts at an offset with proper alignment for its data.
   1.684 + *      If necessary, the previous part may include padding bytes to achieve this alignment.
   1.685 + *
   1.686 + *      minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point
   1.687 + *      with a decomposition mapping, that is, with NF*D_QC=No.
   1.688 + *      minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
   1.689 + *      with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
   1.690 + *
   1.691 + *      The next five indexes are thresholds of 16-bit trie values for ranges of
   1.692 + *      values indicating multiple normalization properties.
   1.693 + *          minYesNo=indexes[IX_MIN_YES_NO];
   1.694 + *          minNoNo=indexes[IX_MIN_NO_NO];
   1.695 + *          limitNoNo=indexes[IX_LIMIT_NO_NO];
   1.696 + *          minMaybeYes=indexes[IX_MIN_MAYBE_YES];
   1.697 + *          minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
   1.698 + *      See the normTrie description below and the design doc for details.
   1.699 + *
   1.700 + * UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h
   1.701 + *
   1.702 + *      The trie holds the main normalization data. Each code point is mapped to a 16-bit value.
   1.703 + *      Rather than using independent bits in the value (which would require more than 16 bits),
   1.704 + *      information is extracted primarily via range checks.
   1.705 + *      For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo
   1.706 + *      means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
   1.707 + *      which means it has a two-way (round-trip) decomposition mapping.
   1.708 + *      Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData
   1.709 + *      pointing to mappings, compositions lists, or both.
   1.710 + *      Value norm16==0 means that the character is normalization-inert, that is,
   1.711 + *      it does not have a mapping, does not participate in composition, has a zero
   1.712 + *      canonical combining class, and forms a boundary where text before it and after it
   1.713 + *      can be normalized independently.
   1.714 + *      For details about how multiple properties are encoded in 16-bit values
   1.715 + *      see the design doc.
   1.716 + *      Note that the encoding cannot express all combinations of the properties involved;
   1.717 + *      it only supports those combinations that are allowed by
   1.718 + *      the Unicode Normalization algorithms. Details are in the design doc as well.
   1.719 + *      The gennorm2 tool only builds .nrm files for data that conforms to the limitations.
   1.720 + *
   1.721 + *      The trie has a value for each lead surrogate code unit representing the "worst case"
   1.722 + *      properties of the 1024 supplementary characters whose UTF-16 form starts with
   1.723 + *      the lead surrogate. If all of the 1024 supplementary characters are normalization-inert,
   1.724 + *      then their lead surrogate code unit has the trie value 0.
   1.725 + *      When the lead surrogate unit's value exceeds the quick check minimum during processing,
   1.726 + *      the properties for the full supplementary code point need to be looked up.
   1.727 + *
   1.728 + * uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes];
   1.729 + * uint16_t extraData[];
   1.730 + *
   1.731 + *      There is only one byte offset for the end of these two arrays.
   1.732 + *      The split between them is given by MIN_NORMAL_MAYBE_YES and minMaybeYes (see the array length above).
   1.733 + *
   1.734 + *      The maybeYesCompositions array contains compositions lists for characters that
   1.735 + *      combine both forward (as starters in composition pairs)
   1.736 + *      and backward (as trailing characters in composition pairs).
   1.737 + *      Such characters do not occur in Unicode 5.2 but are allowed by
   1.738 + *      the Unicode Normalization algorithms.
   1.739 + *      If there are no such characters, then minMaybeYes==MIN_NORMAL_MAYBE_YES
   1.740 + *      and the maybeYesCompositions array is empty.
   1.741 + *      If there are such characters, then minMaybeYes is subtracted from their norm16 values
   1.742 + *      to get the index into this array.
   1.743 + *
   1.744 + *      The extraData array contains compositions lists for "YesYes" characters,
   1.745 + *      followed by mappings and optional compositions lists for "YesNo" characters,
   1.746 + *      followed by only mappings for "NoNo" characters.
   1.747 + *      (Referring to pairs of NFC/NFD quick check values.)
   1.748 + *      The norm16 values of those characters are directly indexes into the extraData array.
   1.749 + *
   1.750 + *      The data structures for compositions lists and mappings are described in the design doc.
   1.751 + *
   1.752 + * uint8_t smallFCD[0x100]; -- new in format version 2
   1.753 + *
   1.754 + *      This is a bit set to help speed up FCD value lookups in the absence of a full
   1.755 + *      UTrie2 or other large data structure with the full FCD value mapping.
   1.756 + *
   1.757 + *      Each smallFCD bit is set if any of the corresponding 32 BMP code points
   1.758 + *      has a non-zero FCD value (lccc!=0 or tccc!=0).
   1.759 + *      Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF.
   1.760 + *      A bit for 32 lead surrogates is set if any of the 32k corresponding
   1.761 + *      _supplementary_ code points has a non-zero FCD value.
   1.762 + *
   1.763 + *      This bit set is most useful for the large blocks of CJK characters with FCD=0.
   1.764 + *
   1.765 + * Changes from format version 1 to format version 2 ---------------------------
   1.766 + *
   1.767 + * - Addition of data for raw (not recursively decomposed) mappings.
   1.768 + *   + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when
   1.769 + *     the mapping is to an empty string or when the character combines-forward.
   1.770 + *     This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which
   1.771 + *     is then repurposed for the MAPPING_HAS_RAW_MAPPING bit.
   1.772 + *   + For details see the design doc.
   1.773 + * - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into
   1.774 + *   distinct ranges (combines-forward vs. not)
   1.775 + *   so that a range check can be used to find out if there is a compositions list.
   1.776 + *   This is fully equivalent to formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag.
   1.777 + *   It is needed for the new (in ICU 49) composePair(), not for other normalization.
   1.778 + * - Addition of the smallFCD[] bit set.
   1.779 + */
   1.780 +
   1.781 +#endif  /* !UCONFIG_NO_NORMALIZATION */
   1.782 +#endif  /* __NORMALIZER2IMPL_H__ */

mercurial