michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2009-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: normalizer2impl.h michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2009nov22 michael@0: * created by: Markus W. Scherer michael@0: */ michael@0: michael@0: #ifndef __NORMALIZER2IMPL_H__ michael@0: #define __NORMALIZER2IMPL_H__ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: michael@0: #include "unicode/normalizer2.h" michael@0: #include "unicode/udata.h" michael@0: #include "unicode/unistr.h" michael@0: #include "unicode/unorm.h" michael@0: #include "unicode/utf16.h" michael@0: #include "mutex.h" michael@0: #include "uset_imp.h" michael@0: #include "utrie2.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: struct CanonIterData; michael@0: michael@0: class Hangul { michael@0: public: michael@0: /* Korean Hangul and Jamo constants */ michael@0: enum { michael@0: JAMO_L_BASE=0x1100, /* "lead" jamo */ michael@0: JAMO_V_BASE=0x1161, /* "vowel" jamo */ michael@0: JAMO_T_BASE=0x11a7, /* "trail" jamo */ michael@0: michael@0: HANGUL_BASE=0xac00, michael@0: michael@0: JAMO_L_COUNT=19, michael@0: JAMO_V_COUNT=21, michael@0: JAMO_T_COUNT=28, michael@0: michael@0: JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT, michael@0: michael@0: HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT, michael@0: HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT michael@0: }; michael@0: michael@0: static inline UBool isHangul(UChar32 c) { michael@0: return HANGUL_BASE<=c && c=MIN_NORMAL_MAYBE_YES) { michael@0: return (uint8_t)norm16; michael@0: } michael@0: if(norm16=MIN_NORMAL_MAYBE_YES 
? (uint8_t)norm16 : 0; michael@0: } michael@0: michael@0: /** michael@0: * Returns the FCD data for code point c. Fast paths: a negative c yields 0, c below 0x180 is answered from tccc180[] alone, and a c up to 0xffff for which singleLeadMightHaveNonZeroFCD16(c) is false yields 0; every other c is looked up via getFCD16FromNormData(). michael@0: * @param c A Unicode code point. michael@0: * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. michael@0: */ michael@0: uint16_t getFCD16(UChar32 c) const { michael@0: if(c<0) { michael@0: return 0; michael@0: } else if(c<0x180) { michael@0: /* NOTE(review): only the tccc180[] entry is returned here; presumably lccc is 0 for every c below 0x180 - confirm. */ return tccc180[c]; michael@0: } else if(c<=0xffff) { michael@0: if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } michael@0: } michael@0: return getFCD16FromNormData(c); michael@0: } michael@0: /** michael@0: * Returns the FCD data for the next code point (post-increment). michael@0: * Might skip only a lead surrogate rather than the whole surrogate pair if none of michael@0: * the supplementary code points associated with the lead surrogate have non-zero FCD data. On return s has been advanced past the code unit that was read, and additionally past a following trail surrogate when the two units were combined into one supplementary code point. michael@0: * @param s A valid pointer into a string. Requires s!=limit. michael@0: * @param limit The end of the string, or NULL. michael@0: * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. michael@0: */ michael@0: uint16_t nextFCD16(const UChar *&s, const UChar *limit) const { michael@0: UChar32 c=*s++; michael@0: if(c<0x180) { michael@0: return tccc180[c]; michael@0: } else if(!singleLeadMightHaveNonZeroFCD16(c)) { michael@0: return 0; michael@0: } michael@0: /* Reached only when c>=0x180 and the single/lead test passed: pair a lead surrogate with the trail surrogate that follows it, if there is one before limit. */ UChar c2; michael@0: if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) { michael@0: c=U16_GET_SUPPLEMENTARY(c, c2); michael@0: ++s; michael@0: } michael@0: return getFCD16FromNormData(c); michael@0: } michael@0: /* NOTE(review): this copy appears to be cut off inside the doc comment that follows (right after "Requires start"); the declaration it documents is not visible, and the code after the gap seems to belong to a different definition whose beginning is also missing. Restore from the upstream file before relying on it. */ /** michael@0: * Returns the FCD data for the previous code point (pre-decrement). michael@0: * @param start The start of the string. michael@0: * @param s A valid pointer into a string. Requires start>8]; michael@0: if(bits==0) { return false; } michael@0: return (UBool)((bits>>((lead>>5)&7))&1); michael@0: } michael@0: /** Returns the FCD value from the regular normalization data. 
*/ michael@0: uint16_t getFCD16FromNormData(UChar32 c) const; michael@0: michael@0: /* NOTE(review): declaration only; judging by the name and parameters it fills newData for the code point range start..end that shares norm16 - confirm against the implementation. */ void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, michael@0: CanonIterData &newData, UErrorCode &errorCode) const; michael@0: michael@0: /** michael@0: * Gets the decomposition for one code point. michael@0: * @param c code point michael@0: * @param buffer out-only buffer for algorithmic decompositions (declared as UChar[4]) michael@0: * @param length out-only, takes the length of the decomposition, if any michael@0: * @return pointer to the decomposition, or NULL if none michael@0: */ michael@0: const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const; michael@0: michael@0: /** michael@0: * Gets the raw decomposition for one code point. michael@0: * @param c code point michael@0: * @param buffer out-only buffer for algorithmic decompositions (declared as UChar[30], larger than the UChar[4] buffer of getDecomposition()) michael@0: * @param length out-only, takes the length of the decomposition, if any michael@0: * @return pointer to the decomposition, or NULL if none michael@0: */ michael@0: const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const; michael@0: michael@0: /* NOTE(review): declaration only; presumably returns the composite of a followed by b. The result for a pair that does not compose is not visible here - confirm. */ UChar32 composePair(UChar32 a, UChar32 b) const; michael@0: michael@0: UBool isCanonSegmentStarter(UChar32 c) const; michael@0: UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const; michael@0: michael@0: enum { michael@0: MIN_CCC_LCCC_CP=0x300 michael@0: }; michael@0: michael@0: /* Fixed norm16 values and limits. Within this file isJamoVT() compares against JAMO_VT, isJamoL() compares against the literal 1 (the value of JAMO_L), and mapAlgorithmic() uses MAX_DELTA. */ enum { michael@0: MIN_YES_YES_WITH_CC=0xff01, michael@0: JAMO_VT=0xff00, michael@0: MIN_NORMAL_MAYBE_YES=0xfe00, michael@0: JAMO_L=1, michael@0: MAX_DELTA=0x40 michael@0: }; michael@0: michael@0: enum { michael@0: // Byte offsets from the start of the data, after the generic header. 
michael@0: IX_NORM_TRIE_OFFSET, michael@0: IX_EXTRA_DATA_OFFSET, michael@0: IX_SMALL_FCD_OFFSET, michael@0: IX_RESERVED3_OFFSET, michael@0: IX_RESERVED4_OFFSET, michael@0: IX_RESERVED5_OFFSET, michael@0: IX_RESERVED6_OFFSET, michael@0: IX_TOTAL_SIZE, michael@0: michael@0: // Code point thresholds for quick check codes. michael@0: IX_MIN_DECOMP_NO_CP, michael@0: IX_MIN_COMP_NO_MAYBE_CP, michael@0: michael@0: // Norm16 value thresholds for quick check combinations and types of extra data. michael@0: IX_MIN_YES_NO, // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. michael@0: IX_MIN_NO_NO, michael@0: IX_LIMIT_NO_NO, michael@0: IX_MIN_MAYBE_YES, michael@0: michael@0: IX_MIN_YES_NO_MAPPINGS_ONLY, // Mappings only in [minYesNoMappingsOnly..minNoNo[. michael@0: michael@0: IX_RESERVED15, michael@0: IX_COUNT /* last enumerator; with implicit numbering starting at 0 for IX_NORM_TRIE_OFFSET this is 16, the number of enumerators before it */ michael@0: }; michael@0: michael@0: /* Bit flags (0x80, 0x40, 0x20) and a length mask (0x1f) that do not overlap. getCCFromNoNo() in this file tests MAPPING_HAS_CCC_LCCC_WORD on the first unit of a mapping. */ enum { michael@0: MAPPING_HAS_CCC_LCCC_WORD=0x80, michael@0: MAPPING_HAS_RAW_MAPPING=0x40, michael@0: MAPPING_NO_COMP_BOUNDARY_AFTER=0x20, michael@0: MAPPING_LENGTH_MASK=0x1f michael@0: }; michael@0: michael@0: enum { michael@0: COMP_1_LAST_TUPLE=0x8000, michael@0: COMP_1_TRIPLE=1, michael@0: COMP_1_TRAIL_LIMIT=0x3400, michael@0: COMP_1_TRAIL_MASK=0x7ffe, michael@0: COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit michael@0: COMP_2_TRAIL_SHIFT=6, michael@0: COMP_2_TRAIL_MASK=0xffc0 michael@0: }; michael@0: michael@0: // higher-level functionality ------------------------------------------ *** michael@0: michael@0: /* NOTE(review): buffer is passed by pointer here but by reference in decomposeAndAppend(); presumably a NULL buffer is accepted - confirm against the implementation. */ const UChar *decompose(const UChar *src, const UChar *limit, michael@0: ReorderingBuffer *buffer, UErrorCode &errorCode) const; michael@0: void decomposeAndAppend(const UChar *src, const UChar *limit, michael@0: UBool doDecompose, michael@0: UnicodeString &safeMiddle, michael@0: ReorderingBuffer &buffer, michael@0: UErrorCode &errorCode) const; michael@0: UBool compose(const UChar *src, const UChar *limit, michael@0: UBool onlyContiguous, michael@0: UBool doCompose, michael@0: ReorderingBuffer &buffer, 
michael@0: UErrorCode &errorCode) const; michael@0: const UChar *composeQuickCheck(const UChar *src, const UChar *limit, michael@0: UBool onlyContiguous, michael@0: UNormalizationCheckResult *pQCResult) const; michael@0: void composeAndAppend(const UChar *src, const UChar *limit, michael@0: UBool doCompose, michael@0: UBool onlyContiguous, michael@0: UnicodeString &safeMiddle, michael@0: ReorderingBuffer &buffer, michael@0: UErrorCode &errorCode) const; michael@0: /* NOTE(review): buffer is passed by pointer here but by reference in makeFCDAndAppend(); presumably a NULL buffer is accepted - confirm against the implementation. */ const UChar *makeFCD(const UChar *src, const UChar *limit, michael@0: ReorderingBuffer *buffer, UErrorCode &errorCode) const; michael@0: void makeFCDAndAppend(const UChar *src, const UChar *limit, michael@0: UBool doMakeFCD, michael@0: UnicodeString &safeMiddle, michael@0: ReorderingBuffer &buffer, michael@0: UErrorCode &errorCode) const; michael@0: michael@0: UBool hasDecompBoundary(UChar32 c, UBool before) const; michael@0: /* A code point is reported as decomposition-inert exactly when its norm16 value, fetched with getNorm16(c), passes isDecompYesAndZeroCC(). */ UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); } michael@0: michael@0: /* NOTE(review): the body below looks truncated in this copy (text seems to be missing between "return c" and "=minMaybeYes"); restore it from the upstream file before compiling. */ UBool hasCompBoundaryBefore(UChar32 c) const { michael@0: return c=minMaybeYes; } michael@0: /* Tests on raw norm16 values: isInert() checks for 0, isJamoL() for 1 (the value of JAMO_L), isJamoVT() for JAMO_VT, and isHangul() for minYesNo. */ static UBool isInert(uint16_t norm16) { return norm16==0; } michael@0: static UBool isJamoL(uint16_t norm16) { return norm16==1; } michael@0: static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } michael@0: UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; } michael@0: /* NOTE(review): the expression below also looks truncated in this copy (comparison operators and intermediate text appear to be missing); restore it from the upstream file. */ UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16=MIN_YES_YES_WITH_CC || norm16=limitNoNo; } michael@0: michael@0: // For use with isCompYes(). michael@0: // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. michael@0: // static uint8_t getCCFromYes(uint16_t norm16) { michael@0: // return norm16>=MIN_YES_YES_WITH_CC ? 
(uint8_t)norm16 : 0; michael@0: // } michael@0: /** Looks up the mapping for norm16 with getMapping(). If the first unit of that mapping has MAPPING_HAS_CCC_LCCC_WORD set, returns the low byte of the 16-bit unit stored immediately before the mapping; otherwise returns 0. */ uint8_t getCCFromNoNo(uint16_t norm16) const { michael@0: const uint16_t *mapping=getMapping(norm16); michael@0: if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) { michael@0: /* the extra word sits at mapping[-1]; the cast keeps only its low 8 bits */ return (uint8_t)*(mapping-1); michael@0: } else { michael@0: return 0; michael@0: } michael@0: } michael@0: // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() michael@0: uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const; michael@0: michael@0: // Requires algorithmic-NoNo. michael@0: UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const { michael@0: /* The result is c plus a delta encoded in norm16: norm16==minMaybeYes-MAX_DELTA-1 corresponds to a delta of 0. */ return c+norm16-(minMaybeYes-MAX_DELTA-1); michael@0: } michael@0: michael@0: // Requires minYesNo