michael@0: /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
michael@0:  * This Source Code Form is subject to the terms of the Mozilla Public
michael@0:  * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0:  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0: 
michael@0: #include "nsUnicodeProperties.h"
michael@0: #include "nsUnicodePropertyData.cpp"
michael@0: 
michael@0: #include "mozilla/ArrayUtils.h"
michael@0: #include "nsCharTraits.h"
michael@0: 
michael@0: #define UNICODE_BMP_LIMIT 0x10000
michael@0: #define UNICODE_LIMIT     0x110000
michael@0: 
michael@0: 
michael@0: /**
michael@0:  * Look up the nsCharProps1 record for a codepoint.
michael@0:  *
michael@0:  * The tables are walked plane -> page -> char: BMP codepoints always use
michael@0:  * row 0 of sCharProp1Pages, while codepoints in the higher planes up to
michael@0:  * kCharProp1MaxPlane first consult sCharProp1Planes to pick their row.
michael@0:  *
michael@0:  * @param aCh the codepoint to look up
michael@0:  * @return a reference to the table entry for aCh, or to an all-zero record
michael@0:  *         when aCh lies beyond the planes covered by the tables
michael@0:  */
michael@0: const nsCharProps1&
michael@0: GetCharProps1(uint32_t aCh)
michael@0: {
michael@0:     // Fallback record returned for codepoints the tables do not cover
michael@0:     static const nsCharProps1 sUnassigned = {
michael@0:         0,       // Index to mirrored char offsets
michael@0:         0,       // Hangul Syllable type
michael@0:         0        // Combining class
michael@0:     };
michael@0: 
michael@0:     // Split the low 16 bits of the codepoint into its Page and Char fields
michael@0:     const uint32_t pageIndex = (aCh & 0xffff) >> kCharProp1CharBits;
michael@0:     const uint32_t charIndex = aCh & ((1 << kCharProp1CharBits) - 1);
michael@0: 
michael@0:     if (aCh < UNICODE_BMP_LIMIT) {
michael@0:         // Plane 0 needs no plane lookup: its pages are row 0
michael@0:         return sCharProp1Values[sCharProp1Pages[0][pageIndex]][charIndex];
michael@0:     }
michael@0: 
michael@0:     if (aCh < (kCharProp1MaxPlane + 1) * 0x10000) {
michael@0:         // sCharProp1Planes has no entry for plane 0, hence the "- 1"
michael@0:         const uint32_t pageSet = sCharProp1Planes[(aCh >> 16) - 1];
michael@0:         return sCharProp1Values[sCharProp1Pages[pageSet][pageIndex]][charIndex];
michael@0:     }
michael@0: 
michael@0:     return sUnassigned;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Look up the nsCharProps2 record for a codepoint.
michael@0:  *
michael@0:  * BMP codepoints use row 0 of sCharProp2Pages; codepoints in the higher
michael@0:  * planes up to kCharProp2MaxPlane pick their row via sCharProp2Planes.
michael@0:  *
michael@0:  * @param aCh the codepoint to look up
michael@0:  * @return a reference to the table entry for aCh; for a codepoint beyond
michael@0:  *         the covered planes, NS_NOTREACHED is raised and a reference to
michael@0:  *         a default "unassigned" record is returned
michael@0:  */
michael@0: const nsCharProps2&
michael@0: GetCharProps2(uint32_t aCh)
michael@0: {
michael@0:     // Row of sCharProp2Pages to use; row 0 holds the BMP pages
michael@0:     uint32_t pageSet = 0;
michael@0: 
michael@0:     if (aCh >= UNICODE_BMP_LIMIT) {
michael@0:         if (aCh >= (kCharProp2MaxPlane + 1) * 0x10000) {
michael@0:             NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range");
michael@0:             // Default values for unassigned
michael@0:             static const nsCharProps2 sUnassigned = {
michael@0:                 MOZ_SCRIPT_UNKNOWN,                      // Script code
michael@0:                 0,                                       // East Asian Width
michael@0:                 HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED,  // General Category
michael@0:                 eCharType_LeftToRight,                   // Bidi Category
michael@0:                 mozilla::unicode::XIDMOD_NOT_CHARS,      // Xidmod
michael@0:                 -1,                                      // Numeric Value
michael@0:                 mozilla::unicode::HVT_NotHan             // Han variant
michael@0:             };
michael@0:             return sUnassigned;
michael@0:         }
michael@0:         // sCharProp2Planes has no entry for plane 0, hence the "- 1"
michael@0:         pageSet = sCharProp2Planes[(aCh >> 16) - 1];
michael@0:     }
michael@0: 
michael@0:     // Split the low 16 bits of the codepoint into its Page and Char fields
michael@0:     const uint32_t pageIndex = (aCh & 0xffff) >> kCharProp2CharBits;
michael@0:     const uint32_t charIndex = aCh & ((1 << kCharProp2CharBits) - 1);
michael@0:     return sCharProp2Values[sCharProp2Pages[pageSet][pageIndex]][charIndex];
michael@0: }
michael@0: 
michael@0: namespace mozilla {
michael@0: 
michael@0: namespace unicode {
michael@0: 
michael@0: /*
michael@0: To store properties for a million Unicode codepoints compactly, we use
michael@0: a three-level array structure, with the Unicode values considered as
michael@0: three elements: Plane, Page, and Char.
michael@0: 
michael@0: Space optimization happens because multiple Planes can refer to the same
michael@0: Page array, and multiple Pages can refer to the same Char array holding
michael@0: the actual values. In practice, most of the higher planes are empty and
michael@0: thus share the same data; and within the BMP, there are also many pages
michael@0: that repeat the same data for any given property.
michael@0: 
michael@0: Plane is usually zero, so we skip a lookup in this case, and require
michael@0: that the Plane 0 pages are always the first set of entries in the Page
michael@0: array.
michael@0: 
michael@0: The division of the remaining 16 bits into Page and Char fields is
michael@0: adjusted for each property (by experiment using the generation tool)
michael@0: to provide the most compact storage, depending on the distribution
michael@0: of values.
michael@0: */
michael@0: 
michael@0: // Table giving the coarse nsIUGenCategory class for each detailed general
michael@0: // category. The order of the entries corresponds to the
michael@0: // HB_UNICODE_GENERAL_CATEGORY_* constants of the
michael@0: // hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-common.h;
michael@0: // the trailing comment on each entry names the constant it stands for.
michael@0: nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = {
michael@0:   // Other
michael@0:   nsIUGenCategory::kOther,        // CONTROL
michael@0:   nsIUGenCategory::kOther,        // FORMAT
michael@0:   nsIUGenCategory::kOther,        // UNASSIGNED
michael@0:   nsIUGenCategory::kOther,        // PRIVATE_USE
michael@0:   nsIUGenCategory::kOther,        // SURROGATE
michael@0: 
michael@0:   // Letters
michael@0:   nsIUGenCategory::kLetter,       // LOWERCASE_LETTER
michael@0:   nsIUGenCategory::kLetter,       // MODIFIER_LETTER
michael@0:   nsIUGenCategory::kLetter,       // OTHER_LETTER
michael@0:   nsIUGenCategory::kLetter,       // TITLECASE_LETTER
michael@0:   nsIUGenCategory::kLetter,       // UPPERCASE_LETTER
michael@0: 
michael@0:   // Marks
michael@0:   nsIUGenCategory::kMark,         // SPACING_MARK
michael@0:   nsIUGenCategory::kMark,         // ENCLOSING_MARK
michael@0:   nsIUGenCategory::kMark,         // NON_SPACING_MARK
michael@0: 
michael@0:   // Numbers
michael@0:   nsIUGenCategory::kNumber,       // DECIMAL_NUMBER
michael@0:   nsIUGenCategory::kNumber,       // LETTER_NUMBER
michael@0:   nsIUGenCategory::kNumber,       // OTHER_NUMBER
michael@0: 
michael@0:   // Punctuation
michael@0:   nsIUGenCategory::kPunctuation,  // CONNECT_PUNCTUATION
michael@0:   nsIUGenCategory::kPunctuation,  // DASH_PUNCTUATION
michael@0:   nsIUGenCategory::kPunctuation,  // CLOSE_PUNCTUATION
michael@0:   nsIUGenCategory::kPunctuation,  // FINAL_PUNCTUATION
michael@0:   nsIUGenCategory::kPunctuation,  // INITIAL_PUNCTUATION
michael@0:   nsIUGenCategory::kPunctuation,  // OTHER_PUNCTUATION
michael@0:   nsIUGenCategory::kPunctuation,  // OPEN_PUNCTUATION
michael@0: 
michael@0:   // Symbols
michael@0:   nsIUGenCategory::kSymbol,       // CURRENCY_SYMBOL
michael@0:   nsIUGenCategory::kSymbol,       // MODIFIER_SYMBOL
michael@0:   nsIUGenCategory::kSymbol,       // MATH_SYMBOL
michael@0:   nsIUGenCategory::kSymbol,       // OTHER_SYMBOL
michael@0: 
michael@0:   // Separators
michael@0:   nsIUGenCategory::kSeparator,    // LINE_SEPARATOR
michael@0:   nsIUGenCategory::kSeparator,    // PARAGRAPH_SEPARATOR
michael@0:   nsIUGenCategory::kSeparator     // SPACE_SEPARATOR
michael@0: };
michael@0: 
michael@0: /**
michael@0:  * Return the mirrored counterpart of a codepoint.
michael@0:  *
michael@0:  * The character's nsCharProps1 record stores an index into sMirrorOffsets;
michael@0:  * the offset found there is added to the codepoint itself.
michael@0:  *
michael@0:  * @param aCh the codepoint to mirror
michael@0:  * @return aCh plus the offset selected by its mMirrorOffsetIndex
michael@0:  */
michael@0: uint32_t
michael@0: GetMirroredChar(uint32_t aCh)
michael@0: {
michael@0:     const nsCharProps1& props = GetCharProps1(aCh);
michael@0:     return aCh + sMirrorOffsets[props.mMirrorOffsetIndex];
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Return the sScriptCodeToTag entry for a script code.
michael@0:  *
michael@0:  * @param aScriptCode index into sScriptCodeToTag; may be negative or
michael@0:  *                    larger than the table
michael@0:  * @return sScriptCodeToTag[aScriptCode], or 0 when aScriptCode does not
michael@0:  *         name an element of the table
michael@0:  */
michael@0: uint32_t
michael@0: GetScriptTagForCode(int32_t aScriptCode)
michael@0: {
michael@0:     // Converting to unsigned turns negative codes into very large values,
michael@0:     // so this single comparison rejects them too. The test has to be >=
michael@0:     // because the last valid index is ArrayLength() - 1; using > would
michael@0:     // read one element past the end of the table.
michael@0:     if (uint32_t(aScriptCode) >= ArrayLength(sScriptCodeToTag)) {
michael@0:         return 0;
michael@0:     }
michael@0:     return sScriptCodeToTag[aScriptCode];
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Fetch the raw case-mapping table entry for a codepoint.
michael@0:  *
michael@0:  * BMP codepoints use row 0 of sCaseMapPages; codepoints in the higher
michael@0:  * planes up to kCaseMapMaxPlane pick their row via sCaseMapPlanes.
michael@0:  *
michael@0:  * @param aCh the codepoint to look up
michael@0:  * @return the sCaseMapValues entry for aCh, or 0 when aCh lies beyond the
michael@0:  *         planes covered by the tables
michael@0:  */
michael@0: static inline uint32_t
michael@0: GetCaseMapValue(uint32_t aCh)
michael@0: {
michael@0:     // Row of sCaseMapPages to use; row 0 holds the BMP pages
michael@0:     uint32_t pageSet = 0;
michael@0: 
michael@0:     if (aCh >= UNICODE_BMP_LIMIT) {
michael@0:         if (aCh >= (kCaseMapMaxPlane + 1) * 0x10000) {
michael@0:             return 0;
michael@0:         }
michael@0:         // sCaseMapPlanes has no entry for plane 0, hence the "- 1"
michael@0:         pageSet = sCaseMapPlanes[(aCh >> 16) - 1];
michael@0:     }
michael@0: 
michael@0:     // Split the low 16 bits of the codepoint into its Page and Char fields
michael@0:     const uint32_t pageIndex = (aCh & 0xffff) >> kCaseMapCharBits;
michael@0:     const uint32_t charIndex = aCh & ((1 << kCaseMapCharBits) - 1);
michael@0:     return sCaseMapValues[sCaseMapPages[pageSet][pageIndex]][charIndex];
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Map a codepoint to its uppercase form.
michael@0:  *
michael@0:  * The case-map entry holds flag bits plus, under kCaseMapCharMask, a value
michael@0:  * that is XORed with the codepoint to reach its counterpart.
michael@0:  *
michael@0:  * @param aCh the codepoint to convert
michael@0:  * @return the XOR-mapped codepoint if the entry is flagged kLowerToUpper or
michael@0:  *         kTitleToUpper; the uppercase of the XOR-mapped codepoint if it is
michael@0:  *         flagged kLowerToTitle; otherwise aCh unchanged
michael@0:  */
michael@0: uint32_t
michael@0: GetUppercase(uint32_t aCh)
michael@0: {
michael@0:     const uint32_t entry = GetCaseMapValue(aCh);
michael@0:     const uint32_t counterpart = aCh ^ (entry & kCaseMapCharMask);
michael@0: 
michael@0:     if (entry & (kLowerToUpper | kTitleToUpper)) {
michael@0:         return counterpart;
michael@0:     }
michael@0:     // The counterpart here is not yet uppercase (presumably it is the
michael@0:     // titlecase form), so map it once more.
michael@0:     return (entry & kLowerToTitle) ? GetUppercase(counterpart) : aCh;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Map a codepoint to its lowercase form.
michael@0:  *
michael@0:  * The case-map entry holds flag bits plus, under kCaseMapCharMask, a value
michael@0:  * that is XORed with the codepoint to reach its counterpart.
michael@0:  *
michael@0:  * @param aCh the codepoint to convert
michael@0:  * @return the XOR-mapped codepoint if the entry is flagged kUpperToLower;
michael@0:  *         the lowercase of the XOR-mapped codepoint if it is flagged
michael@0:  *         kTitleToUpper; otherwise aCh unchanged
michael@0:  */
michael@0: uint32_t
michael@0: GetLowercase(uint32_t aCh)
michael@0: {
michael@0:     const uint32_t entry = GetCaseMapValue(aCh);
michael@0:     const uint32_t counterpart = aCh ^ (entry & kCaseMapCharMask);
michael@0: 
michael@0:     if (entry & kUpperToLower) {
michael@0:         return counterpart;
michael@0:     }
michael@0:     // A kTitleToUpper entry leads to a counterpart that is not lowercase
michael@0:     // yet (presumably the uppercase form), so map that one in turn.
michael@0:     return (entry & kTitleToUpper) ? GetLowercase(counterpart) : aCh;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Map a codepoint to its titlecase form, considering only the
michael@0:  * kLowerToTitle and kLowerToUpper mappings.
michael@0:  *
michael@0:  * @param aCh the codepoint to convert
michael@0:  * @return aCh XORed with the value under kCaseMapCharMask when its case-map
michael@0:  *         entry is flagged kLowerToTitle or kLowerToUpper; otherwise aCh
michael@0:  *         unchanged
michael@0:  */
michael@0: uint32_t
michael@0: GetTitlecaseForLower(uint32_t aCh)
michael@0: {
michael@0:     const uint32_t entry = GetCaseMapValue(aCh);
michael@0:     const bool hasMapping = (entry & (kLowerToTitle | kLowerToUpper)) != 0;
michael@0:     return hasMapping ? (aCh ^ (entry & kCaseMapCharMask)) : aCh;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Map a codepoint to its titlecase form, also handling entries flagged
michael@0:  * kUpperToLower.
michael@0:  *
michael@0:  * @param aCh the codepoint to convert
michael@0:  * @return the XOR-mapped codepoint if the case-map entry is flagged
michael@0:  *         kLowerToTitle or kLowerToUpper; GetTitlecaseForLower() of the
michael@0:  *         XOR-mapped codepoint if it is flagged kUpperToLower; otherwise
michael@0:  *         aCh unchanged
michael@0:  */
michael@0: uint32_t
michael@0: GetTitlecaseForAll(uint32_t aCh)
michael@0: {
michael@0:     const uint32_t entry = GetCaseMapValue(aCh);
michael@0:     const uint32_t counterpart = aCh ^ (entry & kCaseMapCharMask);
michael@0: 
michael@0:     if (entry & (kLowerToTitle | kLowerToUpper)) {
michael@0:         return counterpart;
michael@0:     }
michael@0:     if (entry & kUpperToLower) {
michael@0:         // Go through the kUpperToLower counterpart and take its titlecase
michael@0:         return GetTitlecaseForLower(counterpart);
michael@0:     }
michael@0:     return aCh;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Return the Han variant type recorded for a codepoint.
michael@0:  *
michael@0:  * sHanVariantValues packs the data for 4 successive characters (2 bits
michael@0:  * each) into every uint8_t entry, with the value for the lowest character
michael@0:  * in the least significant bits.
michael@0:  *
michael@0:  * @param aCh the codepoint to look up
michael@0:  * @return the 2-bit field for aCh converted to HanVariantType; codepoints
michael@0:  *         beyond the covered planes yield HanVariantType(0)
michael@0:  */
michael@0: HanVariantType
michael@0: GetHanVariant(uint32_t aCh)
michael@0: {
michael@0:     // Page field of the low 16 bits, and the byte within that page that
michael@0:     // holds this character's group of four
michael@0:     const uint32_t pageIndex = (aCh & 0xffff) >> kHanVariantCharBits;
michael@0:     const uint32_t byteIndex = (aCh & ((1 << kHanVariantCharBits) - 1)) >> 2;
michael@0: 
michael@0:     uint8_t packed = 0;
michael@0:     if (aCh < UNICODE_BMP_LIMIT) {
michael@0:         packed = sHanVariantValues[sHanVariantPages[0][pageIndex]][byteIndex];
michael@0:     } else if (aCh < (kHanVariantMaxPlane + 1) * 0x10000) {
michael@0:         // sHanVariantPlanes has no entry for plane 0, hence the "- 1"
michael@0:         packed = sHanVariantValues[sHanVariantPages[sHanVariantPlanes[(aCh >> 16) - 1]]
michael@0:                                                    [pageIndex]]
michael@0:                                   [byteIndex];
michael@0:     }
michael@0: 
michael@0:     // The low two bits of the codepoint select the 2-bit field in the byte
michael@0:     const uint32_t shift = (aCh & 3) * 2;
michael@0:     return HanVariantType((packed >> shift) & 3);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Return the full-width mapping of a codepoint.
michael@0:  *
michael@0:  * @param aCh the codepoint to convert
michael@0:  * @return the table value for aCh when aCh is in the BMP and that value is
michael@0:  *         non-zero; otherwise aCh unchanged
michael@0:  */
michael@0: uint32_t
michael@0: GetFullWidth(uint32_t aCh)
michael@0: {
michael@0:     // Full-width mappings only exist for BMP characters
michael@0:     if (aCh >= UNICODE_BMP_LIMIT) {
michael@0:         return aCh;
michael@0:     }
michael@0: 
michael@0:     const uint32_t pageIndex = aCh >> kFullWidthCharBits;
michael@0:     const uint32_t charIndex = aCh & ((1 << kFullWidthCharBits) - 1);
michael@0:     const uint32_t mapped = sFullWidthValues[sFullWidthPages[pageIndex]][charIndex];
michael@0: 
michael@0:     // A zero entry means "no mapping": hand back the original character
michael@0:     return mapped ? mapped : aCh;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Tell whether a character extends a cluster.
michael@0:  *
michael@0:  * @param aCh       the codepoint being tested
michael@0:  * @param aCategory the HB_UNICODE_GENERAL_CATEGORY_* value of aCh
michael@0:  * @return true if aCategory lies in the range SPACING_MARK..NON_SPACING_MARK,
michael@0:  *         or aCh is U+200C/U+200D, or aCh is U+FF9E/U+FF9F
michael@0:  */
michael@0: bool
michael@0: IsClusterExtender(uint32_t aCh, uint8_t aCategory)
michael@0: {
michael@0:     const bool isMark =
michael@0:         aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
michael@0:         aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK;
michael@0:     const bool isJoiner = aCh >= 0x200c && aCh <= 0x200d;  // ZWNJ, ZWJ
michael@0:     const bool isKanaSoundMark =
michael@0:         aCh >= 0xff9e && aCh <= 0xff9f;  // katakana sound marks
michael@0: 
michael@0:     return isMark || isJoiner || isKanaSoundMark;
michael@0: }
michael@0: 
michael@0: // Return the "shaping type" (one of the SHAPING_* values) for a script code.
michael@0: //
michael@0: // The shaping type of each script run is compared to the bits set in the
michael@0: // gfx.font_rendering.harfbuzz.scripts
michael@0: // preference to decide whether to use the harfbuzz shaper.
michael@0: //
michael@0: // TODO: replace this with a properties file or similar;
michael@0: // expect this to evolve as harfbuzz shaping support matures.
michael@0: //
michael@0: int32_t
michael@0: ScriptShapingType(int32_t aScriptCode)
michael@0: {
michael@0:     switch (aScriptCode) {
michael@0:     // bidi scripts with Arabic-style shaping
michael@0:     case MOZ_SCRIPT_ARABIC:
michael@0:     case MOZ_SCRIPT_SYRIAC:
michael@0:     case MOZ_SCRIPT_NKO:
michael@0:     case MOZ_SCRIPT_MANDAIC:
michael@0:         return SHAPING_ARABIC;
michael@0: 
michael@0:     case MOZ_SCRIPT_HEBREW:
michael@0:         return SHAPING_HEBREW;
michael@0: 
michael@0:     case MOZ_SCRIPT_HANGUL:
michael@0:         return SHAPING_HANGUL;
michael@0: 
michael@0:     // to be supported by the Arabic shaper?
michael@0:     case MOZ_SCRIPT_MONGOLIAN:
michael@0:         return SHAPING_MONGOLIAN;
michael@0: 
michael@0:     // no complex OT features, but MS engines like to do sequence checking
michael@0:     case MOZ_SCRIPT_THAI:
michael@0:         return SHAPING_THAI;
michael@0: 
michael@0:     // scripts that require Indic or other "special" shaping
michael@0:     case MOZ_SCRIPT_BENGALI:
michael@0:     case MOZ_SCRIPT_DEVANAGARI:
michael@0:     case MOZ_SCRIPT_GUJARATI:
michael@0:     case MOZ_SCRIPT_GURMUKHI:
michael@0:     case MOZ_SCRIPT_KANNADA:
michael@0:     case MOZ_SCRIPT_MALAYALAM:
michael@0:     case MOZ_SCRIPT_ORIYA:
michael@0:     case MOZ_SCRIPT_SINHALA:
michael@0:     case MOZ_SCRIPT_TAMIL:
michael@0:     case MOZ_SCRIPT_TELUGU:
michael@0:     case MOZ_SCRIPT_KHMER:
michael@0:     case MOZ_SCRIPT_LAO:
michael@0:     case MOZ_SCRIPT_TIBETAN:
michael@0:     case MOZ_SCRIPT_NEW_TAI_LUE:
michael@0:     case MOZ_SCRIPT_TAI_LE:
michael@0:     case MOZ_SCRIPT_MYANMAR:
michael@0:     case MOZ_SCRIPT_PHAGS_PA:
michael@0:     case MOZ_SCRIPT_BATAK:
michael@0:     case MOZ_SCRIPT_BRAHMI:
michael@0:         return SHAPING_INDIC;
michael@0: 
michael@0:     // scripts not explicitly listed above are assumed to just use
michael@0:     // default shaping
michael@0:     default:
michael@0:         return SHAPING_DEFAULT;
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Advance mPos past the cluster that begins at the current position.
michael@0:  *
michael@0:  * The first code unit is always consumed, together with its low surrogate
michael@0:  * if it starts a surrogate pair. If it is instead a Hangul character, the
michael@0:  * following Jamo/syllables that combine with it are consumed as well.
michael@0:  * Finally every following character accepted by IsClusterExtender() is
michael@0:  * consumed. When the iterator is already at the end, a warning is issued
michael@0:  * and nothing changes.
michael@0:  */
michael@0: void
michael@0: ClusterIterator::Next()
michael@0: {
michael@0:     if (AtEnd()) {
michael@0:         NS_WARNING("ClusterIterator has already reached the end");
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     uint32_t ch = *mPos++;
michael@0: 
michael@0:     const bool startsSurrogatePair =
michael@0:         NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit &&
michael@0:         NS_IS_LOW_SURROGATE(*mPos);
michael@0: 
michael@0:     if (startsSurrogatePair) {
michael@0:         ch = SURROGATE_TO_UCS4(ch, *mPos++);
michael@0:     } else {
michael@0:         const bool isHangul =
michael@0:             (ch & ~0xff) == 0x1100 ||
michael@0:             (ch >= 0xa960 && ch <= 0xa97f) ||
michael@0:             (ch >= 0xac00 && ch <= 0xd7ff);
michael@0: 
michael@0:         if (isHangul) {
michael@0:             // Handle conjoining Jamo that make Hangul syllables: keep
michael@0:             // consuming while the next character's syllable type may
michael@0:             // follow the type seen so far
michael@0:             HSType state = GetHangulSyllableType(ch);
michael@0:             while (mPos < mLimit) {
michael@0:                 const uint32_t next = *mPos;
michael@0:                 const HSType nextType = GetHangulSyllableType(next);
michael@0: 
michael@0:                 bool joins = false;
michael@0:                 switch (nextType) {
michael@0:                 case HST_L:
michael@0:                 case HST_LV:
michael@0:                 case HST_LVT:
michael@0:                     // these may only follow a leading consonant
michael@0:                     if (state == HST_L) {
michael@0:                         joins = true;
michael@0:                     }
michael@0:                     break;
michael@0:                 case HST_V:
michael@0:                     // a vowel may follow anything without the T bit set
michael@0:                     if ((state != HST_NONE) && !(state & HST_T)) {
michael@0:                         joins = true;
michael@0:                     }
michael@0:                     break;
michael@0:                 case HST_T:
michael@0:                     // a trailing consonant needs the V or T bit before it
michael@0:                     if (state & (HST_V | HST_T)) {
michael@0:                         joins = true;
michael@0:                     }
michael@0:                     break;
michael@0:                 default:
michael@0:                     break;
michael@0:                 }
michael@0: 
michael@0:                 if (!joins) {
michael@0:                     break;
michael@0:                 }
michael@0:                 state = nextType;
michael@0:                 mPos++;
michael@0:             }
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     // Absorb any cluster-extending characters that follow
michael@0:     while (mPos < mLimit) {
michael@0:         ch = *mPos;
michael@0: 
michael@0:         // Combine surrogate pairs before testing. Isolated surrogates are
michael@0:         // simply treated as generic (non-cluster-extending) characters,
michael@0:         // which is fine for cluster-iterating purposes.
michael@0:         if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 &&
michael@0:             NS_IS_LOW_SURROGATE(*(mPos + 1))) {
michael@0:             ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
michael@0:         }
michael@0: 
michael@0:         if (!IsClusterExtender(ch)) {
michael@0:             break;
michael@0:         }
michael@0: 
michael@0:         // A non-BMP extender occupies two code units, a BMP one just one
michael@0:         mPos += IS_IN_BMP(ch) ? 1 : 2;
michael@0:     }
michael@0: 
michael@0:     NS_ASSERTION(mText < mPos && mPos <= mLimit,
michael@0:                  "ClusterIterator::Next has overshot the string!");
michael@0: }
michael@0: 
michael@0: } // end namespace unicode
michael@0: 
michael@0: } // end namespace mozilla