diff -r 000000000000 -r 6474c204b198 intl/unicharutil/util/nsUnicodeProperties.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/intl/unicharutil/util/nsUnicodeProperties.cpp	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,389 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsUnicodeProperties.h"
+#include "nsUnicodePropertyData.cpp"
+
+#include "mozilla/ArrayUtils.h"
+#include "nsCharTraits.h"
+
+#define UNICODE_BMP_LIMIT 0x10000
+#define UNICODE_LIMIT     0x110000
+
+
+// Look up the nsCharProps1 record for aCh in the sCharProp1Planes /
+// sCharProp1Pages / sCharProp1Values tables. Codepoints at or above
+// (kCharProp1MaxPlane + 1) * 0x10000 get a shared all-zero record.
+const nsCharProps1&
+GetCharProps1(uint32_t aCh)
+{
+    // Position of aCh inside its page: the low kCharProp1CharBits bits.
+    const uint32_t charIndex = aCh & ((1 << kCharProp1CharBits) - 1);
+
+    if (aCh < UNICODE_BMP_LIMIT) {
+        // BMP: row 0 of the page table is used directly, with no plane lookup.
+        const uint32_t pageIndex = aCh >> kCharProp1CharBits;
+        return sCharProp1Values[sCharProp1Pages[0][pageIndex]][charIndex];
+    }
+
+    if (aCh < (kCharProp1MaxPlane + 1) * 0x10000) {
+        // Higher planes: sCharProp1Planes (indexed by plane - 1) picks the
+        // page-table row, then the lookup proceeds as for the BMP.
+        const uint32_t planeIndex = (aCh >> 16) - 1;
+        const uint32_t pageIndex = (aCh & 0xffff) >> kCharProp1CharBits;
+        const uint32_t pageRow = sCharProp1Planes[planeIndex];
+        return sCharProp1Values[sCharProp1Pages[pageRow][pageIndex]][charIndex];
+    }
+
+    // Default values for codepoints not covered by the tables
+    static const nsCharProps1 sUndefined = {
+        0,       // Index to mirrored char offsets
+        0,       // Hangul Syllable type
+        0        // Combining class
+    };
+    return sUndefined;
+}
+
+// Look up the nsCharProps2 record for aCh in the sCharProp2Planes /
+// sCharProp2Pages / sCharProp2Values tables. Reaching the fallback at the
+// bottom is reported through NS_NOTREACHED before a default record is
+// returned.
+const nsCharProps2&
+GetCharProps2(uint32_t aCh)
+{
+    // Position of aCh inside its page: the low kCharProp2CharBits bits.
+    const uint32_t charIndex = aCh & ((1 << kCharProp2CharBits) - 1);
+
+    if (aCh < UNICODE_BMP_LIMIT) {
+        // BMP: row 0 of the page table is used directly, with no plane lookup.
+        const uint32_t pageIndex = aCh >> kCharProp2CharBits;
+        return sCharProp2Values[sCharProp2Pages[0][pageIndex]][charIndex];
+    }
+
+    if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) {
+        // Higher planes: sCharProp2Planes (indexed by plane - 1) picks the
+        // page-table row, then the lookup proceeds as for the BMP.
+        const uint32_t planeIndex = (aCh >> 16) - 1;
+        const uint32_t pageIndex = (aCh & 0xffff) >> kCharProp2CharBits;
+        const uint32_t pageRow = sCharProp2Planes[planeIndex];
+        return sCharProp2Values[sCharProp2Pages[pageRow][pageIndex]][charIndex];
+    }
+
+    NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range");
+
+    // Default values for codepoints not covered by the tables
+    static const nsCharProps2 sUndefined = {
+        MOZ_SCRIPT_UNKNOWN,                      // Script code
+        0,                                       // East Asian Width
+        HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED,  // General Category
+        eCharType_LeftToRight,                   // Bidi Category
+        mozilla::unicode::XIDMOD_NOT_CHARS,      // Xidmod
+        -1,                                      // Numeric Value
+        mozilla::unicode::HVT_NotHan             // Han variant
+    };
+    return sUndefined;
+}
+
+namespace mozilla {
+
+namespace unicode {
+
+/*
+To store properties for a million Unicode codepoints compactly, we use
+a three-level array structure, with the Unicode values considered as
+three elements: Plane, Page, and Char.
+
+Space optimization happens because multiple Planes can refer to the same
+Page array, and multiple Pages can refer to the same Char array holding
+the actual values. In practice, most of the higher planes are empty and
+thus share the same data; and within the BMP, there are also many pages
+that repeat the same data for any given property.
+
+Plane is usually zero, so we skip a lookup in this case, and require
+that the Plane 0 pages are always the first set of entries in the Page
+array.
+
+The division of the remaining 16 bits into Page and Char fields is
+adjusted for each property (by experiment using the generation tool)
+to provide the most compact storage, depending on the distribution
+of values.
+*/
+
+// Table giving, for each detailed general category (used as the array
+// index), the coarser nsIUGenCategory value it belongs to.
+nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = {
+  /*
+   * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
+   * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-common.h.
+   * Each entry's label is the suffix of the matching constant name.
+   */
+  /* CONTROL */             nsIUGenCategory::kOther,
+  /* FORMAT */              nsIUGenCategory::kOther,
+  /* UNASSIGNED */          nsIUGenCategory::kOther,
+  /* PRIVATE_USE */         nsIUGenCategory::kOther,
+  /* SURROGATE */           nsIUGenCategory::kOther,
+  /* LOWERCASE_LETTER */    nsIUGenCategory::kLetter,
+  /* MODIFIER_LETTER */     nsIUGenCategory::kLetter,
+  /* OTHER_LETTER */        nsIUGenCategory::kLetter,
+  /* TITLECASE_LETTER */    nsIUGenCategory::kLetter,
+  /* UPPERCASE_LETTER */    nsIUGenCategory::kLetter,
+  /* SPACING_MARK */        nsIUGenCategory::kMark,
+  /* ENCLOSING_MARK */      nsIUGenCategory::kMark,
+  /* NON_SPACING_MARK */    nsIUGenCategory::kMark,
+  /* DECIMAL_NUMBER */      nsIUGenCategory::kNumber,
+  /* LETTER_NUMBER */       nsIUGenCategory::kNumber,
+  /* OTHER_NUMBER */        nsIUGenCategory::kNumber,
+  /* CONNECT_PUNCTUATION */ nsIUGenCategory::kPunctuation,
+  /* DASH_PUNCTUATION */    nsIUGenCategory::kPunctuation,
+  /* CLOSE_PUNCTUATION */   nsIUGenCategory::kPunctuation,
+  /* FINAL_PUNCTUATION */   nsIUGenCategory::kPunctuation,
+  /* INITIAL_PUNCTUATION */ nsIUGenCategory::kPunctuation,
+  /* OTHER_PUNCTUATION */   nsIUGenCategory::kPunctuation,
+  /* OPEN_PUNCTUATION */    nsIUGenCategory::kPunctuation,
+  /* CURRENCY_SYMBOL */     nsIUGenCategory::kSymbol,
+  /* MODIFIER_SYMBOL */     nsIUGenCategory::kSymbol,
+  /* MATH_SYMBOL */         nsIUGenCategory::kSymbol,
+  /* OTHER_SYMBOL */        nsIUGenCategory::kSymbol,
+  /* LINE_SEPARATOR */      nsIUGenCategory::kSeparator,
+  /* PARAGRAPH_SEPARATOR */ nsIUGenCategory::kSeparator,
+  /* SPACE_SEPARATOR */     nsIUGenCategory::kSeparator
+};
+
+// Return aCh plus the entry of sMirrorOffsets selected by the
+// mMirrorOffsetIndex field of aCh's nsCharProps1 record.
+uint32_t
+GetMirroredChar(uint32_t aCh)
+{
+    const nsCharProps1& props = GetCharProps1(aCh);
+    return aCh + sMirrorOffsets[props.mMirrorOffsetIndex];
+}
+
+// Return the sScriptCodeToTag entry for aScriptCode, or 0 when aScriptCode
+// is not a valid index into that table.
+uint32_t
+GetScriptTagForCode(int32_t aScriptCode)
+{
+    // Casting to unsigned turns negative script codes into very large
+    // values, so a single comparison rejects them as well. The comparison
+    // must be >= (not >): an index equal to the array length is already one
+    // element past the end of the table.
+    if (uint32_t(aScriptCode) >= ArrayLength(sScriptCodeToTag)) {
+        return 0;
+    }
+    return sScriptCodeToTag[aScriptCode];
+}
+
+// Fetch the packed case-mapping value for aCh from the sCaseMapPlanes /
+// sCaseMapPages / sCaseMapValues tables. Codepoints at or above
+// (kCaseMapMaxPlane + 1) * 0x10000 yield 0.
+static inline uint32_t
+GetCaseMapValue(uint32_t aCh)
+{
+    // Position of aCh inside its page: the low kCaseMapCharBits bits.
+    const uint32_t charIndex = aCh & ((1 << kCaseMapCharBits) - 1);
+
+    if (aCh < UNICODE_BMP_LIMIT) {
+        // BMP: row 0 of the page table is used directly.
+        const uint32_t pageIndex = aCh >> kCaseMapCharBits;
+        return sCaseMapValues[sCaseMapPages[0][pageIndex]][charIndex];
+    }
+
+    if (aCh < (kCaseMapMaxPlane + 1) * 0x10000) {
+        // Higher planes: sCaseMapPlanes (indexed by plane - 1) picks the row.
+        const uint32_t planeIndex = (aCh >> 16) - 1;
+        const uint32_t pageIndex = (aCh & 0xffff) >> kCaseMapCharBits;
+        const uint32_t pageRow = sCaseMapPlanes[planeIndex];
+        return sCaseMapValues[sCaseMapPages[pageRow][pageIndex]][charIndex];
+    }
+
+    return 0;
+}
+
+// Map aCh to uppercase using its case-map value. The mapped character is
+// obtained by XOR-ing aCh with the kCaseMapCharMask bits of that value;
+// characters with none of the relevant flags set are returned unchanged.
+uint32_t
+GetUppercase(uint32_t aCh)
+{
+    const uint32_t mapValue = GetCaseMapValue(aCh);
+    const uint32_t mapped = aCh ^ (mapValue & kCaseMapCharMask);
+
+    if ((mapValue & (kLowerToUpper | kTitleToUpper)) != 0) {
+        // The stored mapping already leads to the uppercase form.
+        return mapped;
+    }
+    if ((mapValue & kLowerToTitle) != 0) {
+        // The stored mapping leads to the titlecase form; uppercase that.
+        return GetUppercase(mapped);
+    }
+    return aCh;
+}
+
+// Map aCh to lowercase using its case-map value. The mapped character is
+// obtained by XOR-ing aCh with the kCaseMapCharMask bits of that value;
+// characters with none of the relevant flags set are returned unchanged.
+uint32_t
+GetLowercase(uint32_t aCh)
+{
+    const uint32_t mapValue = GetCaseMapValue(aCh);
+    const uint32_t mapped = aCh ^ (mapValue & kCaseMapCharMask);
+
+    if ((mapValue & kUpperToLower) != 0) {
+        // The stored mapping already leads to the lowercase form.
+        return mapped;
+    }
+    if ((mapValue & kTitleToUpper) != 0) {
+        // The stored mapping leads to the uppercase form; lowercase that.
+        return GetLowercase(mapped);
+    }
+    return aCh;
+}
+
+// Return the mapped form of aCh when its case-map value carries
+// kLowerToTitle or kLowerToUpper; every other character is returned as is.
+uint32_t
+GetTitlecaseForLower(uint32_t aCh)
+{
+    const uint32_t mapValue = GetCaseMapValue(aCh);
+    const bool hasLowerMapping =
+        (mapValue & (kLowerToTitle | kLowerToUpper)) != 0;
+    return hasLowerMapping ? aCh ^ (mapValue & kCaseMapCharMask) : aCh;
+}
+
+// Like GetTitlecaseForLower, but characters flagged kUpperToLower are also
+// handled: they are mapped through their stored mapping first and the
+// result is then passed to GetTitlecaseForLower.
+uint32_t
+GetTitlecaseForAll(uint32_t aCh)
+{
+    const uint32_t mapValue = GetCaseMapValue(aCh);
+    const uint32_t mapped = aCh ^ (mapValue & kCaseMapCharMask);
+
+    if ((mapValue & (kLowerToTitle | kLowerToUpper)) != 0) {
+        return mapped;
+    }
+    if ((mapValue & kUpperToLower) != 0) {
+        return GetTitlecaseForLower(mapped);
+    }
+    return aCh;
+}
+
+// Return the 2-bit Han variant value stored for aCh. Codepoints at or above
+// (kHanVariantMaxPlane + 1) * 0x10000 are not looked up and yield
+// HanVariantType(0).
+HanVariantType
+GetHanVariant(uint32_t aCh)
+{
+    // Each uint8_t entry of sHanVariantValues packs the values for four
+    // successive characters, 2 bits each, with the lowest character in the
+    // least significant bits. Hence the in-page offset is divided by 4.
+    const uint32_t byteIndex = (aCh & ((1 << kHanVariantCharBits) - 1)) >> 2;
+
+    uint8_t packed = 0;
+    if (aCh < UNICODE_BMP_LIMIT) {
+        const uint32_t pageIndex = aCh >> kHanVariantCharBits;
+        packed = sHanVariantValues[sHanVariantPages[0][pageIndex]][byteIndex];
+    } else if (aCh < (kHanVariantMaxPlane + 1) * 0x10000) {
+        const uint32_t planeIndex = (aCh >> 16) - 1;
+        const uint32_t pageIndex = (aCh & 0xffff) >> kHanVariantCharBits;
+        const uint32_t pageRow = sHanVariantPlanes[planeIndex];
+        packed = sHanVariantValues[sHanVariantPages[pageRow][pageIndex]][byteIndex];
+    }
+
+    // The low two bits of aCh select which 2-bit field of the byte to use.
+    const uint32_t shift = (aCh & 3) * 2;
+    return HanVariantType((packed >> shift) & 3);
+}
+
+// Return the full-width mapping of aCh, or aCh itself when there is none.
+uint32_t
+GetFullWidth(uint32_t aCh)
+{
+    // Only BMP characters are looked up; everything else passes through.
+    if (aCh >= UNICODE_BMP_LIMIT) {
+        return aCh;
+    }
+
+    const uint32_t pageIndex = aCh >> kFullWidthCharBits;
+    const uint32_t charIndex = aCh & ((1 << kFullWidthCharBits) - 1);
+    const uint32_t mapped =
+        sFullWidthValues[sFullWidthPages[pageIndex]][charIndex];
+
+    // A zero table entry means "no mapping": keep the original character.
+    return mapped ? mapped : aCh;
+}
+
+// Return true when aCategory lies in the range
+// HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK ..
+// HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK, or when aCh is one of four
+// specific codepoints that are accepted regardless of category.
+bool
+IsClusterExtender(uint32_t aCh, uint8_t aCategory)
+{
+    if (aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
+        aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) {
+        return true;
+    }
+
+    switch (aCh) {
+    case 0x200c: // ZWNJ
+    case 0x200d: // ZWJ
+    case 0xff9e: // katakana sound marks
+    case 0xff9f:
+        return true;
+    default:
+        return false;
+    }
+}
+
+// TODO: replace this with a properties file or similar;
+// expect this to evolve as harfbuzz shaping support matures.
+//
+// The "shaping type" of each script run, as returned by this
+// function, is compared to the bits set in the
+// gfx.font_rendering.harfbuzz.scripts
+// preference to decide whether to use the harfbuzz shaper.
+//
+int32_t
+ScriptShapingType(int32_t aScriptCode)
+{
+    // Map the script code onto one of the SHAPING_* groups; any script that
+    // is not listed explicitly falls through to SHAPING_DEFAULT.
+    switch (aScriptCode) {
+    // bidi scripts with Arabic-style shaping
+    case MOZ_SCRIPT_ARABIC:
+    case MOZ_SCRIPT_SYRIAC:
+    case MOZ_SCRIPT_NKO:
+    case MOZ_SCRIPT_MANDAIC:
+        return SHAPING_ARABIC;
+
+    case MOZ_SCRIPT_HEBREW:
+        return SHAPING_HEBREW;
+
+    case MOZ_SCRIPT_HANGUL:
+        return SHAPING_HANGUL;
+
+    // to be supported by the Arabic shaper?
+    case MOZ_SCRIPT_MONGOLIAN:
+        return SHAPING_MONGOLIAN;
+
+    // no complex OT features, but MS engines like to do sequence checking
+    case MOZ_SCRIPT_THAI:
+        return SHAPING_THAI;
+
+    // scripts that require Indic or other "special" shaping
+    case MOZ_SCRIPT_BENGALI:
+    case MOZ_SCRIPT_DEVANAGARI:
+    case MOZ_SCRIPT_GUJARATI:
+    case MOZ_SCRIPT_GURMUKHI:
+    case MOZ_SCRIPT_KANNADA:
+    case MOZ_SCRIPT_MALAYALAM:
+    case MOZ_SCRIPT_ORIYA:
+    case MOZ_SCRIPT_SINHALA:
+    case MOZ_SCRIPT_TAMIL:
+    case MOZ_SCRIPT_TELUGU:
+    case MOZ_SCRIPT_KHMER:
+    case MOZ_SCRIPT_LAO:
+    case MOZ_SCRIPT_TIBETAN:
+    case MOZ_SCRIPT_NEW_TAI_LUE:
+    case MOZ_SCRIPT_TAI_LE:
+    case MOZ_SCRIPT_MYANMAR:
+    case MOZ_SCRIPT_PHAGS_PA:
+    case MOZ_SCRIPT_BATAK:
+    case MOZ_SCRIPT_BRAHMI:
+        return SHAPING_INDIC;
+
+    default:
+        break;
+    }
+
+    // scripts not explicitly listed above are assumed to just use default
+    // shaping
+    return SHAPING_DEFAULT;
+}
+
+// Advance mPos past one cluster: a leading character (a surrogate pair, or a
+// run of conjoining Hangul Jamo, is consumed as a unit) followed by every
+// subsequent character for which IsClusterExtender() is true. Calling this
+// when AtEnd() is already true only emits a warning and leaves mPos alone.
+void
+ClusterIterator::Next()
+{
+    if (AtEnd()) {
+        NS_WARNING("ClusterIterator has already reached the end");
+        return;
+    }
+
+    // Consume the first code unit of the cluster.
+    uint32_t ch = *mPos++;
+
+    if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit &&
+        NS_IS_LOW_SURROGATE(*mPos)) {
+        // Valid surrogate pair: consume the low half too and combine.
+        // NOTE(review): relies on SURROGATE_TO_UCS4 evaluating its second
+        // argument exactly once, since it has a side effect -- confirm.
+        ch = SURROGATE_TO_UCS4(ch, *mPos++);
+    } else if ((ch & ~0xff) == 0x1100 ||
+        (ch >= 0xa960 && ch <= 0xa97f) ||
+        (ch >= 0xac00 && ch <= 0xd7ff)) {
+        // ch is in U+1100..U+11FF, U+A960..U+A97F or U+AC00..U+D7FF.
+        // Handle conjoining Jamo that make Hangul syllables
+        // hangulState tracks the syllable type of the last unit accepted so
+        // far; each following unit is accepted only if its type may follow.
+        HSType hangulState = GetHangulSyllableType(ch);
+        while (mPos < mLimit) {
+            // Peek at the next unit; it is consumed only if accepted below.
+            ch = *mPos;
+            HSType hangulType = GetHangulSyllableType(ch);
+            switch (hangulType) {
+            case HST_L:
+            case HST_LV:
+            case HST_LVT:
+                // These types are accepted only directly after an L.
+                if (hangulState == HST_L) {
+                    hangulState = hangulType;
+                    mPos++;
+                    continue;
+                }
+                break;
+            case HST_V:
+                // A V is accepted after any Hangul state without the T bit.
+                // NOTE(review): the bit tests here and below presumably rely
+                // on the HST_* values being bit flags (LV = L|V, LVT =
+                // L|V|T) -- confirm against the HSType declaration.
+                if ((hangulState != HST_NONE) && !(hangulState & HST_T)) {
+                    hangulState = hangulType;
+                    mPos++;
+                    continue;
+                }
+                break;
+            case HST_T:
+                // A T is accepted after a state with the V or T bit set.
+                if (hangulState & (HST_V | HST_T)) {
+                    hangulState = hangulType;
+                    mPos++;
+                    continue;
+                }
+                break;
+            default:
+                break;
+            }
+            // Reached only when the unit was not accepted (every accepting
+            // path uses 'continue'): the Hangul sequence ends here.
+            break;
+        }
+    }
+
+    // Absorb any cluster-extending characters that follow.
+    while (mPos < mLimit) {
+        ch = *mPos;
+
+        // Check for surrogate pairs; note that isolated surrogates will just
+        // be treated as generic (non-cluster-extending) characters here,
+        // which is fine for cluster-iterating purposes
+        if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 &&
+            NS_IS_LOW_SURROGATE(*(mPos + 1))) {
+            // Decode without advancing; mPos moves only once the character
+            // is known to extend the cluster.
+            ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
+        }
+
+        if (!IsClusterExtender(ch)) {
+            break;
+        }
+
+        // Consume the extender: one unit, plus a second one when the
+        // decoded character is outside the BMP (i.e. was a surrogate pair).
+        mPos++;
+        if (!IS_IN_BMP(ch)) {
+            mPos++;
+        }
+    }
+
+    NS_ASSERTION(mText < mPos && mPos <= mLimit,
+                 "ClusterIterator::Next has overshot the string!");
+}
+
+} // end namespace unicode
+
+} // end namespace mozilla