intl/unicharutil/util/nsUnicodeProperties.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/unicharutil/util/nsUnicodeProperties.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,389 @@
     1.4 +/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
     1.5 + * This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +#include "nsUnicodeProperties.h"
    1.10 +#include "nsUnicodePropertyData.cpp"
    1.11 +
    1.12 +#include "mozilla/ArrayUtils.h"
    1.13 +#include "nsCharTraits.h"
    1.14 +
    1.15 +#define UNICODE_BMP_LIMIT 0x10000
    1.16 +#define UNICODE_LIMIT     0x110000
    1.17 +
    1.18 +
    1.19 +const nsCharProps1&
    1.20 +GetCharProps1(uint32_t aCh)
    1.21 +{
    1.22 +    if (aCh < UNICODE_BMP_LIMIT) {
    1.23 +        return sCharProp1Values[sCharProp1Pages[0][aCh >> kCharProp1CharBits]]
    1.24 +                               [aCh & ((1 << kCharProp1CharBits) - 1)];
    1.25 +    }
    1.26 +    if (aCh < (kCharProp1MaxPlane + 1) * 0x10000) {
    1.27 +        return sCharProp1Values[sCharProp1Pages[sCharProp1Planes[(aCh >> 16) - 1]]
    1.28 +                                               [(aCh & 0xffff) >> kCharProp1CharBits]]
    1.29 +                               [aCh & ((1 << kCharProp1CharBits) - 1)];
    1.30 +    }
    1.31 +
    1.32 +    // Default values for unassigned
    1.33 +    static const nsCharProps1 undefined = {
    1.34 +        0,       // Index to mirrored char offsets
    1.35 +        0,       // Hangul Syllable type
    1.36 +        0        // Combining class
    1.37 +    };
    1.38 +    return undefined;
    1.39 +}
    1.40 +
    1.41 +const nsCharProps2&
    1.42 +GetCharProps2(uint32_t aCh)
    1.43 +{
    1.44 +    if (aCh < UNICODE_BMP_LIMIT) {
    1.45 +        return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]]
    1.46 +                              [aCh & ((1 << kCharProp2CharBits) - 1)];
    1.47 +    }
    1.48 +    if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) {
    1.49 +        return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]]
    1.50 +                                               [(aCh & 0xffff) >> kCharProp2CharBits]]
    1.51 +                               [aCh & ((1 << kCharProp2CharBits) - 1)];
    1.52 +    }
    1.53 +
    1.54 +    NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range");
    1.55 +    // Default values for unassigned
    1.56 +    static const nsCharProps2 undefined = {
    1.57 +        MOZ_SCRIPT_UNKNOWN,                      // Script code
    1.58 +        0,                                       // East Asian Width
    1.59 +        HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED,  // General Category
    1.60 +        eCharType_LeftToRight,                   // Bidi Category
    1.61 +        mozilla::unicode::XIDMOD_NOT_CHARS,      // Xidmod
    1.62 +        -1,                                      // Numeric Value
    1.63 +        mozilla::unicode::HVT_NotHan             // Han variant
    1.64 +    };
    1.65 +    return undefined;
    1.66 +}
    1.67 +
    1.68 +namespace mozilla {
    1.69 +
    1.70 +namespace unicode {
    1.71 +
    1.72 +/*
    1.73 +To store properties for a million Unicode codepoints compactly, we use
    1.74 +a three-level array structure, with the Unicode values considered as
    1.75 +three elements: Plane, Page, and Char.
    1.76 +
    1.77 +Space optimization happens because multiple Planes can refer to the same
    1.78 +Page array, and multiple Pages can refer to the same Char array holding
    1.79 +the actual values. In practice, most of the higher planes are empty and
    1.80 +thus share the same data; and within the BMP, there are also many pages
    1.81 +that repeat the same data for any given property.
    1.82 +
    1.83 +Plane is usually zero, so we skip a lookup in this case, and require
    1.84 +that the Plane 0 pages are always the first set of entries in the Page
    1.85 +array.
    1.86 +
    1.87 +The division of the remaining 16 bits into Page and Char fields is
    1.88 +adjusted for each property (by experiment using the generation tool)
    1.89 +to provide the most compact storage, depending on the distribution
    1.90 +of values.
    1.91 +*/
    1.92 +
    1.93 +nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = {
    1.94 +  /*
    1.95 +   * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
    1.96 +   * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-common.h.
    1.97 +   */
    1.98 +  /* CONTROL */             nsIUGenCategory::kOther,
    1.99 +  /* FORMAT */              nsIUGenCategory::kOther,
   1.100 +  /* UNASSIGNED */          nsIUGenCategory::kOther,
   1.101 +  /* PRIVATE_USE */         nsIUGenCategory::kOther,
   1.102 +  /* SURROGATE */           nsIUGenCategory::kOther,
   1.103 +  /* LOWERCASE_LETTER */    nsIUGenCategory::kLetter,
   1.104 +  /* MODIFIER_LETTER */     nsIUGenCategory::kLetter,
   1.105 +  /* OTHER_LETTER */        nsIUGenCategory::kLetter,
   1.106 +  /* TITLECASE_LETTER */    nsIUGenCategory::kLetter,
   1.107 +  /* UPPERCASE_LETTER */    nsIUGenCategory::kLetter,
   1.108 +  /* COMBINING_MARK */      nsIUGenCategory::kMark,
   1.109 +  /* ENCLOSING_MARK */      nsIUGenCategory::kMark,
   1.110 +  /* NON_SPACING_MARK */    nsIUGenCategory::kMark,
   1.111 +  /* DECIMAL_NUMBER */      nsIUGenCategory::kNumber,
   1.112 +  /* LETTER_NUMBER */       nsIUGenCategory::kNumber,
   1.113 +  /* OTHER_NUMBER */        nsIUGenCategory::kNumber,
   1.114 +  /* CONNECT_PUNCTUATION */ nsIUGenCategory::kPunctuation,
   1.115 +  /* DASH_PUNCTUATION */    nsIUGenCategory::kPunctuation,
   1.116 +  /* CLOSE_PUNCTUATION */   nsIUGenCategory::kPunctuation,
   1.117 +  /* FINAL_PUNCTUATION */   nsIUGenCategory::kPunctuation,
   1.118 +  /* INITIAL_PUNCTUATION */ nsIUGenCategory::kPunctuation,
   1.119 +  /* OTHER_PUNCTUATION */   nsIUGenCategory::kPunctuation,
   1.120 +  /* OPEN_PUNCTUATION */    nsIUGenCategory::kPunctuation,
   1.121 +  /* CURRENCY_SYMBOL */     nsIUGenCategory::kSymbol,
   1.122 +  /* MODIFIER_SYMBOL */     nsIUGenCategory::kSymbol,
   1.123 +  /* MATH_SYMBOL */         nsIUGenCategory::kSymbol,
   1.124 +  /* OTHER_SYMBOL */        nsIUGenCategory::kSymbol,
   1.125 +  /* LINE_SEPARATOR */      nsIUGenCategory::kSeparator,
   1.126 +  /* PARAGRAPH_SEPARATOR */ nsIUGenCategory::kSeparator,
   1.127 +  /* SPACE_SEPARATOR */     nsIUGenCategory::kSeparator
   1.128 +};
   1.129 +
   1.130 +uint32_t
   1.131 +GetMirroredChar(uint32_t aCh)
   1.132 +{
   1.133 +    return aCh + sMirrorOffsets[GetCharProps1(aCh).mMirrorOffsetIndex];
   1.134 +}
   1.135 +
   1.136 +uint32_t
   1.137 +GetScriptTagForCode(int32_t aScriptCode)
   1.138 +{
   1.139 +    // this will safely return 0 for negative script codes, too :)
   1.140 +    if (uint32_t(aScriptCode) > ArrayLength(sScriptCodeToTag)) {
   1.141 +        return 0;
   1.142 +    }
   1.143 +    return sScriptCodeToTag[aScriptCode];
   1.144 +}
   1.145 +
   1.146 +static inline uint32_t
   1.147 +GetCaseMapValue(uint32_t aCh)
   1.148 +{
   1.149 +    if (aCh < UNICODE_BMP_LIMIT) {
   1.150 +        return sCaseMapValues[sCaseMapPages[0][aCh >> kCaseMapCharBits]]
   1.151 +                             [aCh & ((1 << kCaseMapCharBits) - 1)];
   1.152 +    }
   1.153 +    if (aCh < (kCaseMapMaxPlane + 1) * 0x10000) {
   1.154 +        return sCaseMapValues[sCaseMapPages[sCaseMapPlanes[(aCh >> 16) - 1]]
   1.155 +                                           [(aCh & 0xffff) >> kCaseMapCharBits]]
   1.156 +                             [aCh & ((1 << kCaseMapCharBits) - 1)];
   1.157 +    }
   1.158 +    return 0;
   1.159 +}
   1.160 +
   1.161 +uint32_t
   1.162 +GetUppercase(uint32_t aCh)
   1.163 +{
   1.164 +    uint32_t mapValue = GetCaseMapValue(aCh);
   1.165 +    if (mapValue & (kLowerToUpper | kTitleToUpper)) {
   1.166 +        return aCh ^ (mapValue & kCaseMapCharMask);
   1.167 +    }
   1.168 +    if (mapValue & kLowerToTitle) {
   1.169 +        return GetUppercase(aCh ^ (mapValue & kCaseMapCharMask));
   1.170 +    }
   1.171 +    return aCh;
   1.172 +}
   1.173 +
   1.174 +uint32_t
   1.175 +GetLowercase(uint32_t aCh)
   1.176 +{
   1.177 +    uint32_t mapValue = GetCaseMapValue(aCh);
   1.178 +    if (mapValue & kUpperToLower) {
   1.179 +        return aCh ^ (mapValue & kCaseMapCharMask);
   1.180 +    }
   1.181 +    if (mapValue & kTitleToUpper) {
   1.182 +        return GetLowercase(aCh ^ (mapValue & kCaseMapCharMask));
   1.183 +    }
   1.184 +    return aCh;
   1.185 +}
   1.186 +
   1.187 +uint32_t
   1.188 +GetTitlecaseForLower(uint32_t aCh)
   1.189 +{
   1.190 +    uint32_t mapValue = GetCaseMapValue(aCh);
   1.191 +    if (mapValue & (kLowerToTitle | kLowerToUpper)) {
   1.192 +        return aCh ^ (mapValue & kCaseMapCharMask);
   1.193 +    }
   1.194 +    return aCh;
   1.195 +}
   1.196 +
   1.197 +uint32_t
   1.198 +GetTitlecaseForAll(uint32_t aCh)
   1.199 +{
   1.200 +    uint32_t mapValue = GetCaseMapValue(aCh);
   1.201 +    if (mapValue & (kLowerToTitle | kLowerToUpper)) {
   1.202 +        return aCh ^ (mapValue & kCaseMapCharMask);
   1.203 +    }
   1.204 +    if (mapValue & kUpperToLower) {
   1.205 +        return GetTitlecaseForLower(aCh ^ (mapValue & kCaseMapCharMask));
   1.206 +    }
   1.207 +    return aCh;
   1.208 +}
   1.209 +
   1.210 +HanVariantType
   1.211 +GetHanVariant(uint32_t aCh)
   1.212 +{
   1.213 +    // In the sHanVariantValues array, data for 4 successive characters
   1.214 +    // (2 bits each) is packed in to each uint8_t entry, with the value
   1.215 +    // for the lowest character stored in the least significant bits.
   1.216 +    uint8_t v = 0;
   1.217 +    if (aCh < UNICODE_BMP_LIMIT) {
   1.218 +        v = sHanVariantValues[sHanVariantPages[0][aCh >> kHanVariantCharBits]]
   1.219 +                             [(aCh & ((1 << kHanVariantCharBits) - 1)) >> 2];
   1.220 +    } else if (aCh < (kHanVariantMaxPlane + 1) * 0x10000) {
   1.221 +        v = sHanVariantValues[sHanVariantPages[sHanVariantPlanes[(aCh >> 16) - 1]]
   1.222 +                                              [(aCh & 0xffff) >> kHanVariantCharBits]]
   1.223 +                             [(aCh & ((1 << kHanVariantCharBits) - 1)) >> 2];
   1.224 +    }
   1.225 +    // extract the appropriate 2-bit field from the value
   1.226 +    return HanVariantType((v >> ((aCh & 3) * 2)) & 3);
   1.227 +}
   1.228 +
   1.229 +uint32_t
   1.230 +GetFullWidth(uint32_t aCh)
   1.231 +{
   1.232 +    // full-width mappings only exist for BMP characters; all others are
   1.233 +    // returned unchanged
   1.234 +    if (aCh < UNICODE_BMP_LIMIT) {
   1.235 +        uint32_t v =
   1.236 +            sFullWidthValues[sFullWidthPages[aCh >> kFullWidthCharBits]]
   1.237 +                            [aCh & ((1 << kFullWidthCharBits) - 1)];
   1.238 +        if (v) {
   1.239 +            // return the mapped value if non-zero; else return original char
   1.240 +            return v;
   1.241 +        }
   1.242 +    }
   1.243 +    return aCh;
   1.244 +}
   1.245 +
   1.246 +bool
   1.247 +IsClusterExtender(uint32_t aCh, uint8_t aCategory)
   1.248 +{
   1.249 +    return ((aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
   1.250 +             aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
   1.251 +            (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ
   1.252 +            (aCh >= 0xff9e && aCh <= 0xff9f));  // katakana sound marks
   1.253 +}
   1.254 +
   1.255 +// TODO: replace this with a properties file or similar;
   1.256 +// expect this to evolve as harfbuzz shaping support matures.
   1.257 +//
   1.258 +// The "shaping type" of each script run, as returned by this
   1.259 +// function, is compared to the bits set in the
   1.260 +// gfx.font_rendering.harfbuzz.scripts
   1.261 +// preference to decide whether to use the harfbuzz shaper.
   1.262 +//
   1.263 +int32_t
   1.264 +ScriptShapingType(int32_t aScriptCode)
   1.265 +{
   1.266 +    switch (aScriptCode) {
   1.267 +    default:
   1.268 +        return SHAPING_DEFAULT; // scripts not explicitly listed here are
   1.269 +                                // assumed to just use default shaping
   1.270 +
   1.271 +    case MOZ_SCRIPT_ARABIC:
   1.272 +    case MOZ_SCRIPT_SYRIAC:
   1.273 +    case MOZ_SCRIPT_NKO:
   1.274 +    case MOZ_SCRIPT_MANDAIC:
   1.275 +        return SHAPING_ARABIC; // bidi scripts with Arabic-style shaping
   1.276 +
   1.277 +    case MOZ_SCRIPT_HEBREW:
   1.278 +        return SHAPING_HEBREW;
   1.279 +
   1.280 +    case MOZ_SCRIPT_HANGUL:
   1.281 +        return SHAPING_HANGUL;
   1.282 +
   1.283 +    case MOZ_SCRIPT_MONGOLIAN: // to be supported by the Arabic shaper?
   1.284 +        return SHAPING_MONGOLIAN;
   1.285 +
   1.286 +    case MOZ_SCRIPT_THAI: // no complex OT features, but MS engines like to do
   1.287 +                          // sequence checking
   1.288 +        return SHAPING_THAI;
   1.289 +
   1.290 +    case MOZ_SCRIPT_BENGALI:
   1.291 +    case MOZ_SCRIPT_DEVANAGARI:
   1.292 +    case MOZ_SCRIPT_GUJARATI:
   1.293 +    case MOZ_SCRIPT_GURMUKHI:
   1.294 +    case MOZ_SCRIPT_KANNADA:
   1.295 +    case MOZ_SCRIPT_MALAYALAM:
   1.296 +    case MOZ_SCRIPT_ORIYA:
   1.297 +    case MOZ_SCRIPT_SINHALA:
   1.298 +    case MOZ_SCRIPT_TAMIL:
   1.299 +    case MOZ_SCRIPT_TELUGU:
   1.300 +    case MOZ_SCRIPT_KHMER:
   1.301 +    case MOZ_SCRIPT_LAO:
   1.302 +    case MOZ_SCRIPT_TIBETAN:
   1.303 +    case MOZ_SCRIPT_NEW_TAI_LUE:
   1.304 +    case MOZ_SCRIPT_TAI_LE:
   1.305 +    case MOZ_SCRIPT_MYANMAR:
   1.306 +    case MOZ_SCRIPT_PHAGS_PA:
   1.307 +    case MOZ_SCRIPT_BATAK:
   1.308 +    case MOZ_SCRIPT_BRAHMI:
   1.309 +        return SHAPING_INDIC; // scripts that require Indic or other "special" shaping
   1.310 +    }
   1.311 +}
   1.312 +
   1.313 +void
   1.314 +ClusterIterator::Next()
   1.315 +{
   1.316 +    if (AtEnd()) {
   1.317 +        NS_WARNING("ClusterIterator has already reached the end");
   1.318 +        return;
   1.319 +    }
   1.320 +
   1.321 +    uint32_t ch = *mPos++;
   1.322 +
   1.323 +    if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit &&
   1.324 +        NS_IS_LOW_SURROGATE(*mPos)) {
   1.325 +        ch = SURROGATE_TO_UCS4(ch, *mPos++);
   1.326 +    } else if ((ch & ~0xff) == 0x1100 ||
   1.327 +        (ch >= 0xa960 && ch <= 0xa97f) ||
   1.328 +        (ch >= 0xac00 && ch <= 0xd7ff)) {
   1.329 +        // Handle conjoining Jamo that make Hangul syllables
   1.330 +        HSType hangulState = GetHangulSyllableType(ch);
   1.331 +        while (mPos < mLimit) {
   1.332 +            ch = *mPos;
   1.333 +            HSType hangulType = GetHangulSyllableType(ch);
   1.334 +            switch (hangulType) {
   1.335 +            case HST_L:
   1.336 +            case HST_LV:
   1.337 +            case HST_LVT:
   1.338 +                if (hangulState == HST_L) {
   1.339 +                    hangulState = hangulType;
   1.340 +                    mPos++;
   1.341 +                    continue;
   1.342 +                }
   1.343 +                break;
   1.344 +            case HST_V:
   1.345 +                if ((hangulState != HST_NONE) && !(hangulState & HST_T)) {
   1.346 +                    hangulState = hangulType;
   1.347 +                    mPos++;
   1.348 +                    continue;
   1.349 +                }
   1.350 +                break;
   1.351 +            case HST_T:
   1.352 +                if (hangulState & (HST_V | HST_T)) {
   1.353 +                    hangulState = hangulType;
   1.354 +                    mPos++;
   1.355 +                    continue;
   1.356 +                }
   1.357 +                break;
   1.358 +            default:
   1.359 +                break;
   1.360 +            }
   1.361 +            break;
   1.362 +        }
   1.363 +    }
   1.364 +
   1.365 +    while (mPos < mLimit) {
   1.366 +        ch = *mPos;
   1.367 +
   1.368 +        // Check for surrogate pairs; note that isolated surrogates will just
   1.369 +        // be treated as generic (non-cluster-extending) characters here,
   1.370 +        // which is fine for cluster-iterating purposes
   1.371 +        if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 &&
   1.372 +            NS_IS_LOW_SURROGATE(*(mPos + 1))) {
   1.373 +            ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
   1.374 +        }
   1.375 +
   1.376 +        if (!IsClusterExtender(ch)) {
   1.377 +            break;
   1.378 +        }
   1.379 +
   1.380 +        mPos++;
   1.381 +        if (!IS_IN_BMP(ch)) {
   1.382 +            mPos++;
   1.383 +        }
   1.384 +    }
   1.385 +
   1.386 +    NS_ASSERTION(mText < mPos && mPos <= mLimit,
   1.387 +                 "ClusterIterator::Next has overshot the string!");
   1.388 +}
   1.389 +
   1.390 +} // end namespace unicode
   1.391 +
   1.392 +} // end namespace mozilla

mercurial