intl/icu/source/common/normalizer2impl.h

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2009-2013, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  normalizer2impl.h
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2009nov22
    14 *   created by: Markus W. Scherer
    15 */
    17 #ifndef __NORMALIZER2IMPL_H__
    18 #define __NORMALIZER2IMPL_H__
    20 #include "unicode/utypes.h"
    22 #if !UCONFIG_NO_NORMALIZATION
    24 #include "unicode/normalizer2.h"
    25 #include "unicode/udata.h"
    26 #include "unicode/unistr.h"
    27 #include "unicode/unorm.h"
    28 #include "unicode/utf16.h"
    29 #include "mutex.h"
    30 #include "uset_imp.h"
    31 #include "utrie2.h"
    33 U_NAMESPACE_BEGIN
    35 struct CanonIterData;
    37 class Hangul {
    38 public:
    39     /* Korean Hangul and Jamo constants */
    40     enum {
    41         JAMO_L_BASE=0x1100,     /* "lead" jamo */
    42         JAMO_V_BASE=0x1161,     /* "vowel" jamo */
    43         JAMO_T_BASE=0x11a7,     /* "trail" jamo */
    45         HANGUL_BASE=0xac00,
    47         JAMO_L_COUNT=19,
    48         JAMO_V_COUNT=21,
    49         JAMO_T_COUNT=28,
    51         JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT,
    53         HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT,
    54         HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT
    55     };
    57     static inline UBool isHangul(UChar32 c) {
    58         return HANGUL_BASE<=c && c<HANGUL_LIMIT;
    59     }
    60     static inline UBool
    61     isHangulWithoutJamoT(UChar c) {
    62         c-=HANGUL_BASE;
    63         return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
    64     }
    65     static inline UBool isJamoL(UChar32 c) {
    66         return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT;
    67     }
    68     static inline UBool isJamoV(UChar32 c) {
    69         return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT;
    70     }
    72     /**
    73      * Decomposes c, which must be a Hangul syllable, into buffer
    74      * and returns the length of the decomposition (2 or 3).
    75      */
    76     static inline int32_t decompose(UChar32 c, UChar buffer[3]) {
    77         c-=HANGUL_BASE;
    78         UChar32 c2=c%JAMO_T_COUNT;
    79         c/=JAMO_T_COUNT;
    80         buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
    81         buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
    82         if(c2==0) {
    83             return 2;
    84         } else {
    85             buffer[2]=(UChar)(JAMO_T_BASE+c2);
    86             return 3;
    87         }
    88     }
    90     /**
    91      * Decomposes c, which must be a Hangul syllable, into buffer.
    92      * This is the raw, not recursive, decomposition. Its length is always 2.
    93      */
    94     static inline void getRawDecomposition(UChar32 c, UChar buffer[2]) {
    95         UChar32 orig=c;
    96         c-=HANGUL_BASE;
    97         UChar32 c2=c%JAMO_T_COUNT;
    98         if(c2==0) {
    99             c/=JAMO_T_COUNT;
   100             buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
   101             buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
   102         } else {
   103             buffer[0]=orig-c2;  // LV syllable
   104             buffer[1]=(UChar)(JAMO_T_BASE+c2);
   105         }
   106     }
   107 private:
   108     Hangul();  // no instantiation
   109 };
   111 class Normalizer2Impl;
   113 class ReorderingBuffer : public UMemory {
   114 public:
   115     ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) :
   116         impl(ni), str(dest),
   117         start(NULL), reorderStart(NULL), limit(NULL),
   118         remainingCapacity(0), lastCC(0) {}
   119     ~ReorderingBuffer() {
   120         if(start!=NULL) {
   121             str.releaseBuffer((int32_t)(limit-start));
   122         }
   123     }
   124     UBool init(int32_t destCapacity, UErrorCode &errorCode);
   126     UBool isEmpty() const { return start==limit; }
   127     int32_t length() const { return (int32_t)(limit-start); }
   128     UChar *getStart() { return start; }
   129     UChar *getLimit() { return limit; }
   130     uint8_t getLastCC() const { return lastCC; }
   132     UBool equals(const UChar *start, const UChar *limit) const;
   134     // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
   135     void setLastChar(UChar c) {
   136         *(limit-1)=c;
   137     }
   139     UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
   140         return (c<=0xffff) ?
   141             appendBMP((UChar)c, cc, errorCode) :
   142             appendSupplementary(c, cc, errorCode);
   143     }
   144     // s must be in NFD, otherwise change the implementation.
   145     UBool append(const UChar *s, int32_t length,
   146                  uint8_t leadCC, uint8_t trailCC,
   147                  UErrorCode &errorCode);
   148     UBool appendBMP(UChar c, uint8_t cc, UErrorCode &errorCode) {
   149         if(remainingCapacity==0 && !resize(1, errorCode)) {
   150             return FALSE;
   151         }
   152         if(lastCC<=cc || cc==0) {
   153             *limit++=c;
   154             lastCC=cc;
   155             if(cc<=1) {
   156                 reorderStart=limit;
   157             }
   158         } else {
   159             insert(c, cc);
   160         }
   161         --remainingCapacity;
   162         return TRUE;
   163     }
   164     UBool appendZeroCC(UChar32 c, UErrorCode &errorCode);
   165     UBool appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode);
   166     void remove();
   167     void removeSuffix(int32_t suffixLength);
   168     void setReorderingLimit(UChar *newLimit) {
   169         remainingCapacity+=(int32_t)(limit-newLimit);
   170         reorderStart=limit=newLimit;
   171         lastCC=0;
   172     }
   173     void copyReorderableSuffixTo(UnicodeString &s) const {
   174         s.setTo(reorderStart, (int32_t)(limit-reorderStart));
   175     }
   176 private:
   177     /*
   178      * TODO: Revisit whether it makes sense to track reorderStart.
   179      * It is set to after the last known character with cc<=1,
   180      * which stops previousCC() before it reads that character and looks up its cc.
   181      * previousCC() is normally only called from insert().
   182      * In other words, reorderStart speeds up the insertion of a combining mark
   183      * into a multi-combining mark sequence where it does not belong at the end.
   184      * This might not be worth the trouble.
   185      * On the other hand, it's not a huge amount of trouble.
   186      *
   187      * We probably need it for UNORM_SIMPLE_APPEND.
   188      */
   190     UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode);
   191     void insert(UChar32 c, uint8_t cc);
   192     static void writeCodePoint(UChar *p, UChar32 c) {
   193         if(c<=0xffff) {
   194             *p=(UChar)c;
   195         } else {
   196             p[0]=U16_LEAD(c);
   197             p[1]=U16_TRAIL(c);
   198         }
   199     }
   200     UBool resize(int32_t appendLength, UErrorCode &errorCode);
   202     const Normalizer2Impl &impl;
   203     UnicodeString &str;
   204     UChar *start, *reorderStart, *limit;
   205     int32_t remainingCapacity;
   206     uint8_t lastCC;
   208     // private backward iterator
   209     void setIterator() { codePointStart=limit; }
   210     void skipPrevious();  // Requires start<codePointStart.
   211     uint8_t previousCC();  // Returns 0 if there is no previous character.
   213     UChar *codePointStart, *codePointLimit;
   214 };
   216 class U_COMMON_API Normalizer2Impl : public UMemory {
   217 public:
   218     Normalizer2Impl() : memory(NULL), normTrie(NULL), fCanonIterData(NULL) {
   219         fCanonIterDataInitOnce.reset();
   220     }
   221     ~Normalizer2Impl();
   223     void load(const char *packageName, const char *name, UErrorCode &errorCode);
   225     void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
   226     void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
   228     // low-level properties ------------------------------------------------ ***
   230     const UTrie2 *getNormTrie() const { return normTrie; }
   232     UBool ensureCanonIterData(UErrorCode &errorCode) const;
   234     uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); }
   236     UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const {
   237         if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
   238             return UNORM_YES;
   239         } else if(minMaybeYes<=norm16) {
   240             return UNORM_MAYBE;
   241         } else {
   242             return UNORM_NO;
   243         }
   244     }
   245     UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeYes; }
   246     UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; }
   248     uint8_t getCC(uint16_t norm16) const {
   249         if(norm16>=MIN_NORMAL_MAYBE_YES) {
   250             return (uint8_t)norm16;
   251         }
   252         if(norm16<minNoNo || limitNoNo<=norm16) {
   253             return 0;
   254         }
   255         return getCCFromNoNo(norm16);
   256     }
   257     static uint8_t getCCFromYesOrMaybe(uint16_t norm16) {
   258         return norm16>=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0;
   259     }
   261     /**
   262      * Returns the FCD data for code point c.
   263      * @param c A Unicode code point.
   264      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
   265      */
   266     uint16_t getFCD16(UChar32 c) const {
   267         if(c<0) {
   268             return 0;
   269         } else if(c<0x180) {
   270             return tccc180[c];
   271         } else if(c<=0xffff) {
   272             if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
   273         }
   274         return getFCD16FromNormData(c);
   275     }
   276     /**
   277      * Returns the FCD data for the next code point (post-increment).
   278      * Might skip only a lead surrogate rather than the whole surrogate pair if none of
   279      * the supplementary code points associated with the lead surrogate have non-zero FCD data.
   280      * @param s A valid pointer into a string. Requires s!=limit.
   281      * @param limit The end of the string, or NULL.
   282      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
   283      */
   284     uint16_t nextFCD16(const UChar *&s, const UChar *limit) const {
   285         UChar32 c=*s++;
   286         if(c<0x180) {
   287             return tccc180[c];
   288         } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
   289             return 0;
   290         }
   291         UChar c2;
   292         if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) {
   293             c=U16_GET_SUPPLEMENTARY(c, c2);
   294             ++s;
   295         }
   296         return getFCD16FromNormData(c);
   297     }
   298     /**
   299      * Returns the FCD data for the previous code point (pre-decrement).
   300      * @param start The start of the string.
   301      * @param s A valid pointer into a string. Requires start<s.
   302      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
   303      */
   304     uint16_t previousFCD16(const UChar *start, const UChar *&s) const {
   305         UChar32 c=*--s;
   306         if(c<0x180) {
   307             return tccc180[c];
   308         }
   309         if(!U16_IS_TRAIL(c)) {
   310             if(!singleLeadMightHaveNonZeroFCD16(c)) {
   311                 return 0;
   312             }
   313         } else {
   314             UChar c2;
   315             if(start<s && U16_IS_LEAD(c2=*(s-1))) {
   316                 c=U16_GET_SUPPLEMENTARY(c2, c);
   317                 --s;
   318             }
   319         }
   320         return getFCD16FromNormData(c);
   321     }
   323     /** Returns the FCD data for U+0000<=c<U+0180. */
   324     uint16_t getFCD16FromBelow180(UChar32 c) const { return tccc180[c]; }
   325     /** Returns TRUE if the single-or-lead code unit c might have non-zero FCD data. */
   326     UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const {
   327         // 0<=lead<=0xffff
   328         uint8_t bits=smallFCD[lead>>8];
   329         if(bits==0) { return false; }
   330         return (UBool)((bits>>((lead>>5)&7))&1);
   331     }
   332     /** Returns the FCD value from the regular normalization data. */
   333     uint16_t getFCD16FromNormData(UChar32 c) const;
   335     void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
   336                                      CanonIterData &newData, UErrorCode &errorCode) const;
   338     /**
   339      * Gets the decomposition for one code point.
   340      * @param c code point
   341      * @param buffer out-only buffer for algorithmic decompositions
   342      * @param length out-only, takes the length of the decomposition, if any
   343      * @return pointer to the decomposition, or NULL if none
   344      */
   345     const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const;
   347     /**
   348      * Gets the raw decomposition for one code point.
   349      * @param c code point
   350      * @param buffer out-only buffer for algorithmic decompositions
   351      * @param length out-only, takes the length of the decomposition, if any
   352      * @return pointer to the decomposition, or NULL if none
   353      */
   354     const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const;
   356     UChar32 composePair(UChar32 a, UChar32 b) const;
   358     UBool isCanonSegmentStarter(UChar32 c) const;
   359     UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;
   361     enum {
   362         MIN_CCC_LCCC_CP=0x300
   363     };
   365     enum {
   366         MIN_YES_YES_WITH_CC=0xff01,
   367         JAMO_VT=0xff00,
   368         MIN_NORMAL_MAYBE_YES=0xfe00,
   369         JAMO_L=1,
   370         MAX_DELTA=0x40
   371     };
   373     enum {
   374         // Byte offsets from the start of the data, after the generic header.
   375         IX_NORM_TRIE_OFFSET,
   376         IX_EXTRA_DATA_OFFSET,
   377         IX_SMALL_FCD_OFFSET,
   378         IX_RESERVED3_OFFSET,
   379         IX_RESERVED4_OFFSET,
   380         IX_RESERVED5_OFFSET,
   381         IX_RESERVED6_OFFSET,
   382         IX_TOTAL_SIZE,
   384         // Code point thresholds for quick check codes.
   385         IX_MIN_DECOMP_NO_CP,
   386         IX_MIN_COMP_NO_MAYBE_CP,
   388         // Norm16 value thresholds for quick check combinations and types of extra data.
   389         IX_MIN_YES_NO,  // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
   390         IX_MIN_NO_NO,
   391         IX_LIMIT_NO_NO,
   392         IX_MIN_MAYBE_YES,
   394         IX_MIN_YES_NO_MAPPINGS_ONLY,  // Mappings only in [minYesNoMappingsOnly..minNoNo[.
   396         IX_RESERVED15,
   397         IX_COUNT
   398     };
   400     enum {
   401         MAPPING_HAS_CCC_LCCC_WORD=0x80,
   402         MAPPING_HAS_RAW_MAPPING=0x40,
   403         MAPPING_NO_COMP_BOUNDARY_AFTER=0x20,
   404         MAPPING_LENGTH_MASK=0x1f
   405     };
   407     enum {
   408         COMP_1_LAST_TUPLE=0x8000,
   409         COMP_1_TRIPLE=1,
   410         COMP_1_TRAIL_LIMIT=0x3400,
   411         COMP_1_TRAIL_MASK=0x7ffe,
   412         COMP_1_TRAIL_SHIFT=9,  // 10-1 for the "triple" bit
   413         COMP_2_TRAIL_SHIFT=6,
   414         COMP_2_TRAIL_MASK=0xffc0
   415     };
   417     // higher-level functionality ------------------------------------------ ***
   419     const UChar *decompose(const UChar *src, const UChar *limit,
   420                            ReorderingBuffer *buffer, UErrorCode &errorCode) const;
   421     void decomposeAndAppend(const UChar *src, const UChar *limit,
   422                             UBool doDecompose,
   423                             UnicodeString &safeMiddle,
   424                             ReorderingBuffer &buffer,
   425                             UErrorCode &errorCode) const;
   426     UBool compose(const UChar *src, const UChar *limit,
   427                   UBool onlyContiguous,
   428                   UBool doCompose,
   429                   ReorderingBuffer &buffer,
   430                   UErrorCode &errorCode) const;
   431     const UChar *composeQuickCheck(const UChar *src, const UChar *limit,
   432                                    UBool onlyContiguous,
   433                                    UNormalizationCheckResult *pQCResult) const;
   434     void composeAndAppend(const UChar *src, const UChar *limit,
   435                           UBool doCompose,
   436                           UBool onlyContiguous,
   437                           UnicodeString &safeMiddle,
   438                           ReorderingBuffer &buffer,
   439                           UErrorCode &errorCode) const;
   440     const UChar *makeFCD(const UChar *src, const UChar *limit,
   441                          ReorderingBuffer *buffer, UErrorCode &errorCode) const;
   442     void makeFCDAndAppend(const UChar *src, const UChar *limit,
   443                           UBool doMakeFCD,
   444                           UnicodeString &safeMiddle,
   445                           ReorderingBuffer &buffer,
   446                           UErrorCode &errorCode) const;
   448     UBool hasDecompBoundary(UChar32 c, UBool before) const;
   449     UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); }
   451     UBool hasCompBoundaryBefore(UChar32 c) const {
   452         return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
   453     }
   454     UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const;
   456     UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
   457     UBool hasFCDBoundaryAfter(UChar32 c) const {
   458         uint16_t fcd16=getFCD16(c);
   459         return fcd16<=1 || (fcd16&0xff)==0;
   460     }
   461     UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; }
   462 private:
   463     static UBool U_CALLCONV
   464     isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
   466     UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
   467     UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
   468     static UBool isInert(uint16_t norm16) { return norm16==0; }
   469     static UBool isJamoL(uint16_t norm16) { return norm16==1; }
   470     static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
   471     UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; }
   472     UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
   473     // UBool isCompYes(uint16_t norm16) const {
   474     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
   475     // }
   476     // UBool isCompYesOrMaybe(uint16_t norm16) const {
   477     //     return norm16<minNoNo || minMaybeYes<=norm16;
   478     // }
   479     // UBool hasZeroCCFromDecompYes(uint16_t norm16) const {
   480     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
   481     // }
   482     UBool isDecompYesAndZeroCC(uint16_t norm16) const {
   483         return norm16<minYesNo ||
   484                norm16==JAMO_VT ||
   485                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
   486     }
   487     /**
   488      * A little faster and simpler than isDecompYesAndZeroCC() but does not include
   489      * the MaybeYes which combine-forward and have ccc=0.
   490      * (Standard Unicode 5.2 normalization does not have such characters.)
   491      */
   492     UBool isMostDecompYesAndZeroCC(uint16_t norm16) const {
   493         return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
   494     }
   495     UBool isDecompNoAlgorithmic(uint16_t norm16) const { return norm16>=limitNoNo; }
   497     // For use with isCompYes().
   498     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
   499     // static uint8_t getCCFromYes(uint16_t norm16) {
   500     //     return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
   501     // }
   502     uint8_t getCCFromNoNo(uint16_t norm16) const {
   503         const uint16_t *mapping=getMapping(norm16);
   504         if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) {
   505             return (uint8_t)*(mapping-1);
   506         } else {
   507             return 0;
   508         }
   509     }
   510     // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
   511     uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const;
   513     // Requires algorithmic-NoNo.
   514     UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const {
   515         return c+norm16-(minMaybeYes-MAX_DELTA-1);
   516     }
   518     // Requires minYesNo<norm16<limitNoNo.
   519     const uint16_t *getMapping(uint16_t norm16) const { return extraData+norm16; }
   520     const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const {
   521         if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
   522             return NULL;
   523         } else if(norm16<minMaybeYes) {
   524             return extraData+norm16;  // for yesYes; if Jamo L: harmless empty list
   525         } else {
   526             return maybeYesCompositions+norm16-minMaybeYes;
   527         }
   528     }
   529     const uint16_t *getCompositionsListForComposite(uint16_t norm16) const {
   530         const uint16_t *list=extraData+norm16;  // composite has both mapping & compositions list
   531         return list+  // mapping pointer
   532             1+  // +1 to skip the first unit with the mapping lenth
   533             (*list&MAPPING_LENGTH_MASK);  // + mapping length
   534     }
   535     /**
   536      * @param c code point must have compositions
   537      * @return compositions list pointer
   538      */
   539     const uint16_t *getCompositionsList(uint16_t norm16) const {
   540         return isDecompYes(norm16) ?
   541                 getCompositionsListForDecompYes(norm16) :
   542                 getCompositionsListForComposite(norm16);
   543     }
   545     const UChar *copyLowPrefixFromNulTerminated(const UChar *src,
   546                                                 UChar32 minNeedDataCP,
   547                                                 ReorderingBuffer *buffer,
   548                                                 UErrorCode &errorCode) const;
   549     UBool decomposeShort(const UChar *src, const UChar *limit,
   550                          ReorderingBuffer &buffer, UErrorCode &errorCode) const;
   551     UBool decompose(UChar32 c, uint16_t norm16,
   552                     ReorderingBuffer &buffer, UErrorCode &errorCode) const;
   554     static int32_t combine(const uint16_t *list, UChar32 trail);
   555     void addComposites(const uint16_t *list, UnicodeSet &set) const;
   556     void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
   557                    UBool onlyContiguous) const;
   559     UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const;
   560     const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const;
   561     const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const;
   563     const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const;
   564     const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const;
   566     int32_t getCanonValue(UChar32 c) const;
   567     const UnicodeSet &getCanonStartSet(int32_t n) const;
   569     UDataMemory *memory;
   570     UVersionInfo dataVersion;
   572     // Code point thresholds for quick check codes.
   573     UChar32 minDecompNoCP;
   574     UChar32 minCompNoMaybeCP;
   576     // Norm16 value thresholds for quick check combinations and types of extra data.
   577     uint16_t minYesNo;
   578     uint16_t minYesNoMappingsOnly;
   579     uint16_t minNoNo;
   580     uint16_t limitNoNo;
   581     uint16_t minMaybeYes;
   583     UTrie2 *normTrie;
   584     const uint16_t *maybeYesCompositions;
   585     const uint16_t *extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
   586     const uint8_t *smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
   587     uint8_t tccc180[0x180];  // tccc values for U+0000..U+017F
   589   public:           // CanonIterData is public to allow access from C callback functions.
   590     UInitOnce       fCanonIterDataInitOnce;
   591     CanonIterData  *fCanonIterData;
   592 };
   594 // bits in canonIterData
   595 #define CANON_NOT_SEGMENT_STARTER 0x80000000
   596 #define CANON_HAS_COMPOSITIONS 0x40000000
   597 #define CANON_HAS_SET 0x200000
   598 #define CANON_VALUE_MASK 0x1fffff
   600 /**
   601  * ICU-internal shortcut for quick access to standard Unicode normalization.
   602  */
   603 class U_COMMON_API Normalizer2Factory {
   604 public:
   605     static const Normalizer2 *getNFCInstance(UErrorCode &errorCode);
   606     static const Normalizer2 *getNFDInstance(UErrorCode &errorCode);
   607     static const Normalizer2 *getFCDInstance(UErrorCode &errorCode);
   608     static const Normalizer2 *getFCCInstance(UErrorCode &errorCode);
   609     static const Normalizer2 *getNFKCInstance(UErrorCode &errorCode);
   610     static const Normalizer2 *getNFKDInstance(UErrorCode &errorCode);
   611     static const Normalizer2 *getNFKC_CFInstance(UErrorCode &errorCode);
   612     static const Normalizer2 *getNoopInstance(UErrorCode &errorCode);
   614     static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode);
   616     static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode);
   617     static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode);
   618     static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode);
   620     // Get the Impl instance of the Normalizer2.
   621     // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance.
   622     static const Normalizer2Impl *getImpl(const Normalizer2 *norm2);
   623 private:
   624     Normalizer2Factory();  // No instantiation.
   625 };
   627 U_NAMESPACE_END
   629 U_CAPI int32_t U_EXPORT2
   630 unorm2_swap(const UDataSwapper *ds,
   631             const void *inData, int32_t length, void *outData,
   632             UErrorCode *pErrorCode);
   634 /**
   635  * Get the NF*_QC property for a code point, for u_getIntPropertyValue().
   636  * @internal
   637  */
   638 U_CFUNC UNormalizationCheckResult
   639 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
   641 /**
   642  * Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue().
   643  * @internal
   644  */
   645 U_CFUNC uint16_t
   646 unorm_getFCD16(UChar32 c);
   648 /**
   649  * Format of Normalizer2 .nrm data files.
   650  * Format version 2.0.
   651  *
   652  * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
   653  * ICU ships with data files for standard Unicode Normalization Forms
   654  * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm).
   655  * Custom (application-specific) data can be built into additional .nrm files
   656  * with the gennorm2 build tool.
   657  *
   658  * Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been
   659  * cached already. Internally, Normalizer2Impl.load() reads the .nrm file.
   660  *
   661  * A .nrm file begins with a standard ICU data file header
   662  * (DataHeader, see ucmndata.h and unicode/udata.h).
   663  * The UDataInfo.dataVersion field usually contains the Unicode version
   664  * for which the data was generated.
   665  *
   666  * After the header, the file contains the following parts.
   667  * Constants are defined as enum values of the Normalizer2Impl class.
   668  *
   669  * Many details of the data structures are described in the design doc
   670  * which is at http://site.icu-project.org/design/normalization/custom
   671  *
   672  * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4;
   673  *
   674  *      The first eight indexes are byte offsets in ascending order.
   675  *      Each byte offset marks the start of the next part in the data file,
   676  *      and the end of the previous one.
   677  *      When two consecutive byte offsets are the same, then the corresponding part is empty.
   678  *      Byte offsets are offsets from after the header,
   679  *      that is, from the beginning of the indexes[].
   680  *      Each part starts at an offset with proper alignment for its data.
   681  *      If necessary, the previous part may include padding bytes to achieve this alignment.
   682  *
   683  *      minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point
   684  *      with a decomposition mapping, that is, with NF*D_QC=No.
   685  *      minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
   686  *      with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
   687  *
   688  *      The next five indexes are thresholds of 16-bit trie values for ranges of
   689  *      values indicating multiple normalization properties.
   690  *          minYesNo=indexes[IX_MIN_YES_NO];
   691  *          minNoNo=indexes[IX_MIN_NO_NO];
   692  *          limitNoNo=indexes[IX_LIMIT_NO_NO];
   693  *          minMaybeYes=indexes[IX_MIN_MAYBE_YES];
   694  *          minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
   695  *      See the normTrie description below and the design doc for details.
   696  *
   697  * UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h
   698  *
   699  *      The trie holds the main normalization data. Each code point is mapped to a 16-bit value.
   700  *      Rather than using independent bits in the value (which would require more than 16 bits),
   701  *      information is extracted primarily via range checks.
   702  *      For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo
   703  *      means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
   704  *      which means it has a two-way (round-trip) decomposition mapping.
   705  *      Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData
   706  *      pointing to mappings, compositions lists, or both.
   707  *      Value norm16==0 means that the character is normalization-inert, that is,
   708  *      it does not have a mapping, does not participate in composition, has a zero
   709  *      canonical combining class, and forms a boundary where text before it and after it
   710  *      can be normalized independently.
   711  *      For details about how multiple properties are encoded in 16-bit values
   712  *      see the design doc.
   713  *      Note that the encoding cannot express all combinations of the properties involved;
   714  *      it only supports those combinations that are allowed by
   715  *      the Unicode Normalization algorithms. Details are in the design doc as well.
   716  *      The gennorm2 tool only builds .nrm files for data that conforms to the limitations.
   717  *
   718  *      The trie has a value for each lead surrogate code unit representing the "worst case"
   719  *      properties of the 1024 supplementary characters whose UTF-16 form starts with
   720  *      the lead surrogate. If all of the 1024 supplementary characters are normalization-inert,
   721  *      then their lead surrogate code unit has the trie value 0.
   722  *      When the lead surrogate unit's value exceeds the quick check minimum during processing,
   723  *      the properties for the full supplementary code point need to be looked up.
   724  *
   725  * uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes];
   726  * uint16_t extraData[];
   727  *
   728  *      There is only one byte offset for the end of these two arrays.
   729  *      The split between them is given by MIN_NORMAL_MAYBE_YES and minMaybeYes (see above).
   730  *
   731  *      The maybeYesCompositions array contains compositions lists for characters that
   732  *      combine both forward (as starters in composition pairs)
   733  *      and backward (as trailing characters in composition pairs).
   734  *      Such characters do not occur in Unicode 5.2 but are allowed by
   735  *      the Unicode Normalization algorithms.
   736  *      If there are no such characters, then minMaybeYes==MIN_NORMAL_MAYBE_YES
   737  *      and the maybeYesCompositions array is empty.
   738  *      If there are such characters, then minMaybeYes is subtracted from their norm16 values
   739  *      to get the index into this array.
   740  *
   741  *      The extraData array contains compositions lists for "YesYes" characters,
   742  *      followed by mappings and optional compositions lists for "YesNo" characters,
   743  *      followed by only mappings for "NoNo" characters.
   744  *      (Referring to pairs of NFC/NFD quick check values.)
   745  *      The norm16 values of those characters are direct indexes into the extraData array.
   746  *
   747  *      The data structures for compositions lists and mappings are described in the design doc.
   748  *
   749  * uint8_t smallFCD[0x100]; -- new in format version 2
   750  *
   751  *      This is a bit set to help speed up FCD value lookups in the absence of a full
   752  *      UTrie2 or other large data structure with the full FCD value mapping.
   753  *
   754  *      Each smallFCD bit is set if any of the corresponding 32 BMP code points
   755  *      has a non-zero FCD value (lccc!=0 or tccc!=0).
   756  *      Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF.
   757  *      A bit for 32 lead surrogates is set if any of the 32k corresponding
   758  *      _supplementary_ code points has a non-zero FCD value.
   759  *
   760  *      This bit set is most useful for the large blocks of CJK characters with FCD=0.
   761  *
   762  * Changes from format version 1 to format version 2 ---------------------------
   763  *
   764  * - Addition of data for raw (not recursively decomposed) mappings.
   765  *   + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when
   766  *     the mapping is to an empty string or when the character combines-forward.
   767  *     This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which
   768  *     is then repurposed for the MAPPING_HAS_RAW_MAPPING bit.
   769  *   + For details see the design doc.
   770  * - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into
   771  *   distinct ranges (combines-forward vs. not)
   772  *   so that a range check can be used to find out if there is a compositions list.
   773  *   This is fully equivalent to formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag.
   774  *   It is needed for the new (in ICU 49) composePair(), not for other normalization.
   775  * - Addition of the smallFCD[] bit set.
   776  */
   778 #endif  /* !UCONFIG_NO_NORMALIZATION */
   779 #endif  /* __NORMALIZER2IMPL_H__ */

mercurial