michael@0: /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- michael@0: * This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "nsUnicodeProperties.h" michael@0: #include "nsUnicodePropertyData.cpp" michael@0: michael@0: #include "mozilla/ArrayUtils.h" michael@0: #include "nsCharTraits.h" michael@0: michael@0: #define UNICODE_BMP_LIMIT 0x10000 michael@0: #define UNICODE_LIMIT 0x110000 michael@0: michael@0: michael@0: const nsCharProps1& michael@0: GetCharProps1(uint32_t aCh) michael@0: { michael@0: if (aCh < UNICODE_BMP_LIMIT) { michael@0: return sCharProp1Values[sCharProp1Pages[0][aCh >> kCharProp1CharBits]] michael@0: [aCh & ((1 << kCharProp1CharBits) - 1)]; michael@0: } michael@0: if (aCh < (kCharProp1MaxPlane + 1) * 0x10000) { michael@0: return sCharProp1Values[sCharProp1Pages[sCharProp1Planes[(aCh >> 16) - 1]] michael@0: [(aCh & 0xffff) >> kCharProp1CharBits]] michael@0: [aCh & ((1 << kCharProp1CharBits) - 1)]; michael@0: } michael@0: michael@0: // Default values for unassigned michael@0: static const nsCharProps1 undefined = { michael@0: 0, // Index to mirrored char offsets michael@0: 0, // Hangul Syllable type michael@0: 0 // Combining class michael@0: }; michael@0: return undefined; michael@0: } michael@0: michael@0: const nsCharProps2& michael@0: GetCharProps2(uint32_t aCh) michael@0: { michael@0: if (aCh < UNICODE_BMP_LIMIT) { michael@0: return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]] michael@0: [aCh & ((1 << kCharProp2CharBits) - 1)]; michael@0: } michael@0: if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) { michael@0: return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]] michael@0: [(aCh & 0xffff) >> kCharProp2CharBits]] michael@0: [aCh & ((1 << kCharProp2CharBits) - 1)]; michael@0: } michael@0: michael@0: NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range"); michael@0: // Default values for unassigned michael@0: static const nsCharProps2 undefined = { michael@0: MOZ_SCRIPT_UNKNOWN, // Script code michael@0: 0, // East Asian Width michael@0: HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // General Category michael@0: eCharType_LeftToRight, // Bidi Category michael@0: mozilla::unicode::XIDMOD_NOT_CHARS, // Xidmod michael@0: -1, // Numeric Value michael@0: mozilla::unicode::HVT_NotHan // Han variant michael@0: }; michael@0: return undefined; michael@0: } michael@0: michael@0: namespace mozilla { michael@0: michael@0: namespace unicode { michael@0: michael@0: /* michael@0: To store properties for a million Unicode codepoints compactly, we use michael@0: a three-level array structure, with the Unicode values considered as michael@0: three elements: Plane, Page, and Char. michael@0: michael@0: Space optimization happens because multiple Planes can refer to the same michael@0: Page array, and multiple Pages can refer to the same Char array holding michael@0: the actual values. In practice, most of the higher planes are empty and michael@0: thus share the same data; and within the BMP, there are also many pages michael@0: that repeat the same data for any given property. michael@0: michael@0: Plane is usually zero, so we skip a lookup in this case, and require michael@0: that the Plane 0 pages are always the first set of entries in the Page michael@0: array. michael@0: michael@0: The division of the remaining 16 bits into Page and Char fields is michael@0: adjusted for each property (by experiment using the generation tool) michael@0: to provide the most compact storage, depending on the distribution michael@0: of values. michael@0: */ michael@0: michael@0: nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = { michael@0: /* michael@0: * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants michael@0: * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-common.h. michael@0: */ michael@0: /* CONTROL */ nsIUGenCategory::kOther, michael@0: /* FORMAT */ nsIUGenCategory::kOther, michael@0: /* UNASSIGNED */ nsIUGenCategory::kOther, michael@0: /* PRIVATE_USE */ nsIUGenCategory::kOther, michael@0: /* SURROGATE */ nsIUGenCategory::kOther, michael@0: /* LOWERCASE_LETTER */ nsIUGenCategory::kLetter, michael@0: /* MODIFIER_LETTER */ nsIUGenCategory::kLetter, michael@0: /* OTHER_LETTER */ nsIUGenCategory::kLetter, michael@0: /* TITLECASE_LETTER */ nsIUGenCategory::kLetter, michael@0: /* UPPERCASE_LETTER */ nsIUGenCategory::kLetter, michael@0: /* COMBINING_MARK */ nsIUGenCategory::kMark, michael@0: /* ENCLOSING_MARK */ nsIUGenCategory::kMark, michael@0: /* NON_SPACING_MARK */ nsIUGenCategory::kMark, michael@0: /* DECIMAL_NUMBER */ nsIUGenCategory::kNumber, michael@0: /* LETTER_NUMBER */ nsIUGenCategory::kNumber, michael@0: /* OTHER_NUMBER */ nsIUGenCategory::kNumber, michael@0: /* CONNECT_PUNCTUATION */ nsIUGenCategory::kPunctuation, michael@0: /* DASH_PUNCTUATION */ nsIUGenCategory::kPunctuation, michael@0: /* CLOSE_PUNCTUATION */ nsIUGenCategory::kPunctuation, michael@0: /* FINAL_PUNCTUATION */ nsIUGenCategory::kPunctuation, michael@0: /* INITIAL_PUNCTUATION */ nsIUGenCategory::kPunctuation, michael@0: /* OTHER_PUNCTUATION */ nsIUGenCategory::kPunctuation, michael@0: /* OPEN_PUNCTUATION */ nsIUGenCategory::kPunctuation, michael@0: /* CURRENCY_SYMBOL */ nsIUGenCategory::kSymbol, michael@0: /* MODIFIER_SYMBOL */ nsIUGenCategory::kSymbol, michael@0: /* MATH_SYMBOL */ nsIUGenCategory::kSymbol, michael@0: /* OTHER_SYMBOL */ nsIUGenCategory::kSymbol, michael@0: /* LINE_SEPARATOR */ nsIUGenCategory::kSeparator, michael@0: /* PARAGRAPH_SEPARATOR */ nsIUGenCategory::kSeparator, michael@0: /* SPACE_SEPARATOR */ nsIUGenCategory::kSeparator michael@0: }; michael@0: michael@0: uint32_t michael@0: GetMirroredChar(uint32_t aCh) michael@0: { michael@0: return aCh + sMirrorOffsets[GetCharProps1(aCh).mMirrorOffsetIndex]; michael@0: } michael@0: michael@0: uint32_t michael@0: GetScriptTagForCode(int32_t aScriptCode) michael@0: { michael@0: // this will safely return 0 for negative script codes, too :) michael@0: if (uint32_t(aScriptCode) > ArrayLength(sScriptCodeToTag)) { michael@0: return 0; michael@0: } michael@0: return sScriptCodeToTag[aScriptCode]; michael@0: } michael@0: michael@0: static inline uint32_t michael@0: GetCaseMapValue(uint32_t aCh) michael@0: { michael@0: if (aCh < UNICODE_BMP_LIMIT) { michael@0: return sCaseMapValues[sCaseMapPages[0][aCh >> kCaseMapCharBits]] michael@0: [aCh & ((1 << kCaseMapCharBits) - 1)]; michael@0: } michael@0: if (aCh < (kCaseMapMaxPlane + 1) * 0x10000) { michael@0: return sCaseMapValues[sCaseMapPages[sCaseMapPlanes[(aCh >> 16) - 1]] michael@0: [(aCh & 0xffff) >> kCaseMapCharBits]] michael@0: [aCh & ((1 << kCaseMapCharBits) - 1)]; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: uint32_t michael@0: GetUppercase(uint32_t aCh) michael@0: { michael@0: uint32_t mapValue = GetCaseMapValue(aCh); michael@0: if (mapValue & (kLowerToUpper | kTitleToUpper)) { michael@0: return aCh ^ (mapValue & kCaseMapCharMask); michael@0: } michael@0: if (mapValue & kLowerToTitle) { michael@0: return GetUppercase(aCh ^ (mapValue & kCaseMapCharMask)); michael@0: } michael@0: return aCh; michael@0: } michael@0: michael@0: uint32_t michael@0: GetLowercase(uint32_t aCh) michael@0: { michael@0: uint32_t mapValue = GetCaseMapValue(aCh); michael@0: if (mapValue & kUpperToLower) { michael@0: return aCh ^ (mapValue & kCaseMapCharMask); michael@0: } michael@0: if (mapValue & kTitleToUpper) { michael@0: return GetLowercase(aCh ^ (mapValue & kCaseMapCharMask)); michael@0: } michael@0: return aCh; michael@0: } michael@0: michael@0: uint32_t michael@0: GetTitlecaseForLower(uint32_t aCh) michael@0: { michael@0: uint32_t mapValue = GetCaseMapValue(aCh); michael@0: if (mapValue & (kLowerToTitle | kLowerToUpper)) { michael@0: return aCh ^ (mapValue & kCaseMapCharMask); michael@0: } michael@0: return aCh; michael@0: } michael@0: michael@0: uint32_t michael@0: GetTitlecaseForAll(uint32_t aCh) michael@0: { michael@0: uint32_t mapValue = GetCaseMapValue(aCh); michael@0: if (mapValue & (kLowerToTitle | kLowerToUpper)) { michael@0: return aCh ^ (mapValue & kCaseMapCharMask); michael@0: } michael@0: if (mapValue & kUpperToLower) { michael@0: return GetTitlecaseForLower(aCh ^ (mapValue & kCaseMapCharMask)); michael@0: } michael@0: return aCh; michael@0: } michael@0: michael@0: HanVariantType michael@0: GetHanVariant(uint32_t aCh) michael@0: { michael@0: // In the sHanVariantValues array, data for 4 successive characters michael@0: // (2 bits each) is packed in to each uint8_t entry, with the value michael@0: // for the lowest character stored in the least significant bits. michael@0: uint8_t v = 0; michael@0: if (aCh < UNICODE_BMP_LIMIT) { michael@0: v = sHanVariantValues[sHanVariantPages[0][aCh >> kHanVariantCharBits]] michael@0: [(aCh & ((1 << kHanVariantCharBits) - 1)) >> 2]; michael@0: } else if (aCh < (kHanVariantMaxPlane + 1) * 0x10000) { michael@0: v = sHanVariantValues[sHanVariantPages[sHanVariantPlanes[(aCh >> 16) - 1]] michael@0: [(aCh & 0xffff) >> kHanVariantCharBits]] michael@0: [(aCh & ((1 << kHanVariantCharBits) - 1)) >> 2]; michael@0: } michael@0: // extract the appropriate 2-bit field from the value michael@0: return HanVariantType((v >> ((aCh & 3) * 2)) & 3); michael@0: } michael@0: michael@0: uint32_t michael@0: GetFullWidth(uint32_t aCh) michael@0: { michael@0: // full-width mappings only exist for BMP characters; all others are michael@0: // returned unchanged michael@0: if (aCh < UNICODE_BMP_LIMIT) { michael@0: uint32_t v = michael@0: sFullWidthValues[sFullWidthPages[aCh >> kFullWidthCharBits]] michael@0: [aCh & ((1 << kFullWidthCharBits) - 1)]; michael@0: if (v) { michael@0: // return the mapped value if non-zero; else return original char michael@0: return v; michael@0: } michael@0: } michael@0: return aCh; michael@0: } michael@0: michael@0: bool michael@0: IsClusterExtender(uint32_t aCh, uint8_t aCategory) michael@0: { michael@0: return ((aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK && michael@0: aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) || michael@0: (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ michael@0: (aCh >= 0xff9e && aCh <= 0xff9f)); // katakana sound marks michael@0: } michael@0: michael@0: // TODO: replace this with a properties file or similar; michael@0: // expect this to evolve as harfbuzz shaping support matures. michael@0: // michael@0: // The "shaping type" of each script run, as returned by this michael@0: // function, is compared to the bits set in the michael@0: // gfx.font_rendering.harfbuzz.scripts michael@0: // preference to decide whether to use the harfbuzz shaper. michael@0: // michael@0: int32_t michael@0: ScriptShapingType(int32_t aScriptCode) michael@0: { michael@0: switch (aScriptCode) { michael@0: default: michael@0: return SHAPING_DEFAULT; // scripts not explicitly listed here are michael@0: // assumed to just use default shaping michael@0: michael@0: case MOZ_SCRIPT_ARABIC: michael@0: case MOZ_SCRIPT_SYRIAC: michael@0: case MOZ_SCRIPT_NKO: michael@0: case MOZ_SCRIPT_MANDAIC: michael@0: return SHAPING_ARABIC; // bidi scripts with Arabic-style shaping michael@0: michael@0: case MOZ_SCRIPT_HEBREW: michael@0: return SHAPING_HEBREW; michael@0: michael@0: case MOZ_SCRIPT_HANGUL: michael@0: return SHAPING_HANGUL; michael@0: michael@0: case MOZ_SCRIPT_MONGOLIAN: // to be supported by the Arabic shaper? michael@0: return SHAPING_MONGOLIAN; michael@0: michael@0: case MOZ_SCRIPT_THAI: // no complex OT features, but MS engines like to do michael@0: // sequence checking michael@0: return SHAPING_THAI; michael@0: michael@0: case MOZ_SCRIPT_BENGALI: michael@0: case MOZ_SCRIPT_DEVANAGARI: michael@0: case MOZ_SCRIPT_GUJARATI: michael@0: case MOZ_SCRIPT_GURMUKHI: michael@0: case MOZ_SCRIPT_KANNADA: michael@0: case MOZ_SCRIPT_MALAYALAM: michael@0: case MOZ_SCRIPT_ORIYA: michael@0: case MOZ_SCRIPT_SINHALA: michael@0: case MOZ_SCRIPT_TAMIL: michael@0: case MOZ_SCRIPT_TELUGU: michael@0: case MOZ_SCRIPT_KHMER: michael@0: case MOZ_SCRIPT_LAO: michael@0: case MOZ_SCRIPT_TIBETAN: michael@0: case MOZ_SCRIPT_NEW_TAI_LUE: michael@0: case MOZ_SCRIPT_TAI_LE: michael@0: case MOZ_SCRIPT_MYANMAR: michael@0: case MOZ_SCRIPT_PHAGS_PA: michael@0: case MOZ_SCRIPT_BATAK: michael@0: case MOZ_SCRIPT_BRAHMI: michael@0: return SHAPING_INDIC; // scripts that require Indic or other "special" shaping michael@0: } michael@0: } michael@0: michael@0: void michael@0: ClusterIterator::Next() michael@0: { michael@0: if (AtEnd()) { michael@0: NS_WARNING("ClusterIterator has already reached the end"); michael@0: return; michael@0: } michael@0: michael@0: uint32_t ch = *mPos++; michael@0: michael@0: if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit && michael@0: NS_IS_LOW_SURROGATE(*mPos)) { michael@0: ch = SURROGATE_TO_UCS4(ch, *mPos++); michael@0: } else if ((ch & ~0xff) == 0x1100 || michael@0: (ch >= 0xa960 && ch <= 0xa97f) || michael@0: (ch >= 0xac00 && ch <= 0xd7ff)) { michael@0: // Handle conjoining Jamo that make Hangul syllables michael@0: HSType hangulState = GetHangulSyllableType(ch); michael@0: while (mPos < mLimit) { michael@0: ch = *mPos; michael@0: HSType hangulType = GetHangulSyllableType(ch); michael@0: switch (hangulType) { michael@0: case HST_L: michael@0: case HST_LV: michael@0: case HST_LVT: michael@0: if (hangulState == HST_L) { michael@0: hangulState = hangulType; michael@0: mPos++; michael@0: continue; michael@0: } michael@0: break; michael@0: case HST_V: michael@0: if ((hangulState != HST_NONE) && !(hangulState & HST_T)) { michael@0: hangulState = hangulType; michael@0: mPos++; michael@0: continue; michael@0: } michael@0: break; michael@0: case HST_T: michael@0: if (hangulState & (HST_V | HST_T)) { michael@0: hangulState = hangulType; michael@0: mPos++; michael@0: continue; michael@0: } michael@0: break; michael@0: default: michael@0: break; michael@0: } michael@0: break; michael@0: } michael@0: } michael@0: michael@0: while (mPos < mLimit) { michael@0: ch = *mPos; michael@0: michael@0: // Check for surrogate pairs; note that isolated surrogates will just michael@0: // be treated as generic (non-cluster-extending) characters here, michael@0: // which is fine for cluster-iterating purposes michael@0: if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 && michael@0: NS_IS_LOW_SURROGATE(*(mPos + 1))) { michael@0: ch = SURROGATE_TO_UCS4(ch, *(mPos + 1)); michael@0: } michael@0: michael@0: if (!IsClusterExtender(ch)) { michael@0: break; michael@0: } michael@0: michael@0: mPos++; michael@0: if (!IS_IN_BMP(ch)) { michael@0: mPos++; michael@0: } michael@0: } michael@0: michael@0: NS_ASSERTION(mText < mPos && mPos <= mLimit, michael@0: "ClusterIterator::Next has overshot the string!"); michael@0: } michael@0: michael@0: } // end namespace unicode michael@0: michael@0: } // end namespace mozilla