intl/unicharutil/util/nsUnicodeProperties.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
michael@0 2 * This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5
michael@0 6 #include "nsUnicodeProperties.h"
michael@0 7 #include "nsUnicodePropertyData.cpp"
michael@0 8
michael@0 9 #include "mozilla/ArrayUtils.h"
michael@0 10 #include "nsCharTraits.h"
michael@0 11
michael@0 12 #define UNICODE_BMP_LIMIT 0x10000
michael@0 13 #define UNICODE_LIMIT 0x110000
michael@0 14
michael@0 15
michael@0 16 const nsCharProps1&
michael@0 17 GetCharProps1(uint32_t aCh)
michael@0 18 {
michael@0 19 if (aCh < UNICODE_BMP_LIMIT) {
michael@0 20 return sCharProp1Values[sCharProp1Pages[0][aCh >> kCharProp1CharBits]]
michael@0 21 [aCh & ((1 << kCharProp1CharBits) - 1)];
michael@0 22 }
michael@0 23 if (aCh < (kCharProp1MaxPlane + 1) * 0x10000) {
michael@0 24 return sCharProp1Values[sCharProp1Pages[sCharProp1Planes[(aCh >> 16) - 1]]
michael@0 25 [(aCh & 0xffff) >> kCharProp1CharBits]]
michael@0 26 [aCh & ((1 << kCharProp1CharBits) - 1)];
michael@0 27 }
michael@0 28
michael@0 29 // Default values for unassigned
michael@0 30 static const nsCharProps1 undefined = {
michael@0 31 0, // Index to mirrored char offsets
michael@0 32 0, // Hangul Syllable type
michael@0 33 0 // Combining class
michael@0 34 };
michael@0 35 return undefined;
michael@0 36 }
michael@0 37
michael@0 38 const nsCharProps2&
michael@0 39 GetCharProps2(uint32_t aCh)
michael@0 40 {
michael@0 41 if (aCh < UNICODE_BMP_LIMIT) {
michael@0 42 return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]]
michael@0 43 [aCh & ((1 << kCharProp2CharBits) - 1)];
michael@0 44 }
michael@0 45 if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) {
michael@0 46 return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]]
michael@0 47 [(aCh & 0xffff) >> kCharProp2CharBits]]
michael@0 48 [aCh & ((1 << kCharProp2CharBits) - 1)];
michael@0 49 }
michael@0 50
michael@0 51 NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range");
michael@0 52 // Default values for unassigned
michael@0 53 static const nsCharProps2 undefined = {
michael@0 54 MOZ_SCRIPT_UNKNOWN, // Script code
michael@0 55 0, // East Asian Width
michael@0 56 HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // General Category
michael@0 57 eCharType_LeftToRight, // Bidi Category
michael@0 58 mozilla::unicode::XIDMOD_NOT_CHARS, // Xidmod
michael@0 59 -1, // Numeric Value
michael@0 60 mozilla::unicode::HVT_NotHan // Han variant
michael@0 61 };
michael@0 62 return undefined;
michael@0 63 }
michael@0 64
michael@0 65 namespace mozilla {
michael@0 66
michael@0 67 namespace unicode {
michael@0 68
michael@0 69 /*
michael@0 70 To store properties for a million Unicode codepoints compactly, we use
michael@0 71 a three-level array structure, with the Unicode values considered as
michael@0 72 three elements: Plane, Page, and Char.
michael@0 73
michael@0 74 Space optimization happens because multiple Planes can refer to the same
michael@0 75 Page array, and multiple Pages can refer to the same Char array holding
michael@0 76 the actual values. In practice, most of the higher planes are empty and
michael@0 77 thus share the same data; and within the BMP, there are also many pages
michael@0 78 that repeat the same data for any given property.
michael@0 79
michael@0 80 Plane is usually zero, so we skip a lookup in this case, and require
michael@0 81 that the Plane 0 pages are always the first set of entries in the Page
michael@0 82 array.
michael@0 83
michael@0 84 The division of the remaining 16 bits into Page and Char fields is
michael@0 85 adjusted for each property (by experiment using the generation tool)
michael@0 86 to provide the most compact storage, depending on the distribution
michael@0 87 of values.
michael@0 88 */
michael@0 89
michael@0 90 nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = {
michael@0 91 /*
michael@0 92 * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
michael@0 93 * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-common.h.
michael@0 94 */
michael@0 95 /* CONTROL */ nsIUGenCategory::kOther,
michael@0 96 /* FORMAT */ nsIUGenCategory::kOther,
michael@0 97 /* UNASSIGNED */ nsIUGenCategory::kOther,
michael@0 98 /* PRIVATE_USE */ nsIUGenCategory::kOther,
michael@0 99 /* SURROGATE */ nsIUGenCategory::kOther,
michael@0 100 /* LOWERCASE_LETTER */ nsIUGenCategory::kLetter,
michael@0 101 /* MODIFIER_LETTER */ nsIUGenCategory::kLetter,
michael@0 102 /* OTHER_LETTER */ nsIUGenCategory::kLetter,
michael@0 103 /* TITLECASE_LETTER */ nsIUGenCategory::kLetter,
michael@0 104 /* UPPERCASE_LETTER */ nsIUGenCategory::kLetter,
michael@0 105 /* COMBINING_MARK */ nsIUGenCategory::kMark,
michael@0 106 /* ENCLOSING_MARK */ nsIUGenCategory::kMark,
michael@0 107 /* NON_SPACING_MARK */ nsIUGenCategory::kMark,
michael@0 108 /* DECIMAL_NUMBER */ nsIUGenCategory::kNumber,
michael@0 109 /* LETTER_NUMBER */ nsIUGenCategory::kNumber,
michael@0 110 /* OTHER_NUMBER */ nsIUGenCategory::kNumber,
michael@0 111 /* CONNECT_PUNCTUATION */ nsIUGenCategory::kPunctuation,
michael@0 112 /* DASH_PUNCTUATION */ nsIUGenCategory::kPunctuation,
michael@0 113 /* CLOSE_PUNCTUATION */ nsIUGenCategory::kPunctuation,
michael@0 114 /* FINAL_PUNCTUATION */ nsIUGenCategory::kPunctuation,
michael@0 115 /* INITIAL_PUNCTUATION */ nsIUGenCategory::kPunctuation,
michael@0 116 /* OTHER_PUNCTUATION */ nsIUGenCategory::kPunctuation,
michael@0 117 /* OPEN_PUNCTUATION */ nsIUGenCategory::kPunctuation,
michael@0 118 /* CURRENCY_SYMBOL */ nsIUGenCategory::kSymbol,
michael@0 119 /* MODIFIER_SYMBOL */ nsIUGenCategory::kSymbol,
michael@0 120 /* MATH_SYMBOL */ nsIUGenCategory::kSymbol,
michael@0 121 /* OTHER_SYMBOL */ nsIUGenCategory::kSymbol,
michael@0 122 /* LINE_SEPARATOR */ nsIUGenCategory::kSeparator,
michael@0 123 /* PARAGRAPH_SEPARATOR */ nsIUGenCategory::kSeparator,
michael@0 124 /* SPACE_SEPARATOR */ nsIUGenCategory::kSeparator
michael@0 125 };
michael@0 126
michael@0 127 uint32_t
michael@0 128 GetMirroredChar(uint32_t aCh)
michael@0 129 {
michael@0 130 return aCh + sMirrorOffsets[GetCharProps1(aCh).mMirrorOffsetIndex];
michael@0 131 }
michael@0 132
michael@0 133 uint32_t
michael@0 134 GetScriptTagForCode(int32_t aScriptCode)
michael@0 135 {
michael@0 136 // this will safely return 0 for negative script codes, too :)
michael@0 137 if (uint32_t(aScriptCode) > ArrayLength(sScriptCodeToTag)) {
michael@0 138 return 0;
michael@0 139 }
michael@0 140 return sScriptCodeToTag[aScriptCode];
michael@0 141 }
michael@0 142
michael@0 143 static inline uint32_t
michael@0 144 GetCaseMapValue(uint32_t aCh)
michael@0 145 {
michael@0 146 if (aCh < UNICODE_BMP_LIMIT) {
michael@0 147 return sCaseMapValues[sCaseMapPages[0][aCh >> kCaseMapCharBits]]
michael@0 148 [aCh & ((1 << kCaseMapCharBits) - 1)];
michael@0 149 }
michael@0 150 if (aCh < (kCaseMapMaxPlane + 1) * 0x10000) {
michael@0 151 return sCaseMapValues[sCaseMapPages[sCaseMapPlanes[(aCh >> 16) - 1]]
michael@0 152 [(aCh & 0xffff) >> kCaseMapCharBits]]
michael@0 153 [aCh & ((1 << kCaseMapCharBits) - 1)];
michael@0 154 }
michael@0 155 return 0;
michael@0 156 }
michael@0 157
michael@0 158 uint32_t
michael@0 159 GetUppercase(uint32_t aCh)
michael@0 160 {
michael@0 161 uint32_t mapValue = GetCaseMapValue(aCh);
michael@0 162 if (mapValue & (kLowerToUpper | kTitleToUpper)) {
michael@0 163 return aCh ^ (mapValue & kCaseMapCharMask);
michael@0 164 }
michael@0 165 if (mapValue & kLowerToTitle) {
michael@0 166 return GetUppercase(aCh ^ (mapValue & kCaseMapCharMask));
michael@0 167 }
michael@0 168 return aCh;
michael@0 169 }
michael@0 170
michael@0 171 uint32_t
michael@0 172 GetLowercase(uint32_t aCh)
michael@0 173 {
michael@0 174 uint32_t mapValue = GetCaseMapValue(aCh);
michael@0 175 if (mapValue & kUpperToLower) {
michael@0 176 return aCh ^ (mapValue & kCaseMapCharMask);
michael@0 177 }
michael@0 178 if (mapValue & kTitleToUpper) {
michael@0 179 return GetLowercase(aCh ^ (mapValue & kCaseMapCharMask));
michael@0 180 }
michael@0 181 return aCh;
michael@0 182 }
michael@0 183
michael@0 184 uint32_t
michael@0 185 GetTitlecaseForLower(uint32_t aCh)
michael@0 186 {
michael@0 187 uint32_t mapValue = GetCaseMapValue(aCh);
michael@0 188 if (mapValue & (kLowerToTitle | kLowerToUpper)) {
michael@0 189 return aCh ^ (mapValue & kCaseMapCharMask);
michael@0 190 }
michael@0 191 return aCh;
michael@0 192 }
michael@0 193
michael@0 194 uint32_t
michael@0 195 GetTitlecaseForAll(uint32_t aCh)
michael@0 196 {
michael@0 197 uint32_t mapValue = GetCaseMapValue(aCh);
michael@0 198 if (mapValue & (kLowerToTitle | kLowerToUpper)) {
michael@0 199 return aCh ^ (mapValue & kCaseMapCharMask);
michael@0 200 }
michael@0 201 if (mapValue & kUpperToLower) {
michael@0 202 return GetTitlecaseForLower(aCh ^ (mapValue & kCaseMapCharMask));
michael@0 203 }
michael@0 204 return aCh;
michael@0 205 }
michael@0 206
michael@0 207 HanVariantType
michael@0 208 GetHanVariant(uint32_t aCh)
michael@0 209 {
michael@0 210 // In the sHanVariantValues array, data for 4 successive characters
michael@0 211 // (2 bits each) is packed in to each uint8_t entry, with the value
michael@0 212 // for the lowest character stored in the least significant bits.
michael@0 213 uint8_t v = 0;
michael@0 214 if (aCh < UNICODE_BMP_LIMIT) {
michael@0 215 v = sHanVariantValues[sHanVariantPages[0][aCh >> kHanVariantCharBits]]
michael@0 216 [(aCh & ((1 << kHanVariantCharBits) - 1)) >> 2];
michael@0 217 } else if (aCh < (kHanVariantMaxPlane + 1) * 0x10000) {
michael@0 218 v = sHanVariantValues[sHanVariantPages[sHanVariantPlanes[(aCh >> 16) - 1]]
michael@0 219 [(aCh & 0xffff) >> kHanVariantCharBits]]
michael@0 220 [(aCh & ((1 << kHanVariantCharBits) - 1)) >> 2];
michael@0 221 }
michael@0 222 // extract the appropriate 2-bit field from the value
michael@0 223 return HanVariantType((v >> ((aCh & 3) * 2)) & 3);
michael@0 224 }
michael@0 225
michael@0 226 uint32_t
michael@0 227 GetFullWidth(uint32_t aCh)
michael@0 228 {
michael@0 229 // full-width mappings only exist for BMP characters; all others are
michael@0 230 // returned unchanged
michael@0 231 if (aCh < UNICODE_BMP_LIMIT) {
michael@0 232 uint32_t v =
michael@0 233 sFullWidthValues[sFullWidthPages[aCh >> kFullWidthCharBits]]
michael@0 234 [aCh & ((1 << kFullWidthCharBits) - 1)];
michael@0 235 if (v) {
michael@0 236 // return the mapped value if non-zero; else return original char
michael@0 237 return v;
michael@0 238 }
michael@0 239 }
michael@0 240 return aCh;
michael@0 241 }
michael@0 242
michael@0 243 bool
michael@0 244 IsClusterExtender(uint32_t aCh, uint8_t aCategory)
michael@0 245 {
michael@0 246 return ((aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
michael@0 247 aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
michael@0 248 (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ
michael@0 249 (aCh >= 0xff9e && aCh <= 0xff9f)); // katakana sound marks
michael@0 250 }
michael@0 251
michael@0 252 // TODO: replace this with a properties file or similar;
michael@0 253 // expect this to evolve as harfbuzz shaping support matures.
michael@0 254 //
michael@0 255 // The "shaping type" of each script run, as returned by this
michael@0 256 // function, is compared to the bits set in the
michael@0 257 // gfx.font_rendering.harfbuzz.scripts
michael@0 258 // preference to decide whether to use the harfbuzz shaper.
michael@0 259 //
michael@0 260 int32_t
michael@0 261 ScriptShapingType(int32_t aScriptCode)
michael@0 262 {
michael@0 263 switch (aScriptCode) {
michael@0 264 default:
michael@0 265 return SHAPING_DEFAULT; // scripts not explicitly listed here are
michael@0 266 // assumed to just use default shaping
michael@0 267
michael@0 268 case MOZ_SCRIPT_ARABIC:
michael@0 269 case MOZ_SCRIPT_SYRIAC:
michael@0 270 case MOZ_SCRIPT_NKO:
michael@0 271 case MOZ_SCRIPT_MANDAIC:
michael@0 272 return SHAPING_ARABIC; // bidi scripts with Arabic-style shaping
michael@0 273
michael@0 274 case MOZ_SCRIPT_HEBREW:
michael@0 275 return SHAPING_HEBREW;
michael@0 276
michael@0 277 case MOZ_SCRIPT_HANGUL:
michael@0 278 return SHAPING_HANGUL;
michael@0 279
michael@0 280 case MOZ_SCRIPT_MONGOLIAN: // to be supported by the Arabic shaper?
michael@0 281 return SHAPING_MONGOLIAN;
michael@0 282
michael@0 283 case MOZ_SCRIPT_THAI: // no complex OT features, but MS engines like to do
michael@0 284 // sequence checking
michael@0 285 return SHAPING_THAI;
michael@0 286
michael@0 287 case MOZ_SCRIPT_BENGALI:
michael@0 288 case MOZ_SCRIPT_DEVANAGARI:
michael@0 289 case MOZ_SCRIPT_GUJARATI:
michael@0 290 case MOZ_SCRIPT_GURMUKHI:
michael@0 291 case MOZ_SCRIPT_KANNADA:
michael@0 292 case MOZ_SCRIPT_MALAYALAM:
michael@0 293 case MOZ_SCRIPT_ORIYA:
michael@0 294 case MOZ_SCRIPT_SINHALA:
michael@0 295 case MOZ_SCRIPT_TAMIL:
michael@0 296 case MOZ_SCRIPT_TELUGU:
michael@0 297 case MOZ_SCRIPT_KHMER:
michael@0 298 case MOZ_SCRIPT_LAO:
michael@0 299 case MOZ_SCRIPT_TIBETAN:
michael@0 300 case MOZ_SCRIPT_NEW_TAI_LUE:
michael@0 301 case MOZ_SCRIPT_TAI_LE:
michael@0 302 case MOZ_SCRIPT_MYANMAR:
michael@0 303 case MOZ_SCRIPT_PHAGS_PA:
michael@0 304 case MOZ_SCRIPT_BATAK:
michael@0 305 case MOZ_SCRIPT_BRAHMI:
michael@0 306 return SHAPING_INDIC; // scripts that require Indic or other "special" shaping
michael@0 307 }
michael@0 308 }
michael@0 309
michael@0 310 void
michael@0 311 ClusterIterator::Next()
michael@0 312 {
michael@0 313 if (AtEnd()) {
michael@0 314 NS_WARNING("ClusterIterator has already reached the end");
michael@0 315 return;
michael@0 316 }
michael@0 317
michael@0 318 uint32_t ch = *mPos++;
michael@0 319
michael@0 320 if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit &&
michael@0 321 NS_IS_LOW_SURROGATE(*mPos)) {
michael@0 322 ch = SURROGATE_TO_UCS4(ch, *mPos++);
michael@0 323 } else if ((ch & ~0xff) == 0x1100 ||
michael@0 324 (ch >= 0xa960 && ch <= 0xa97f) ||
michael@0 325 (ch >= 0xac00 && ch <= 0xd7ff)) {
michael@0 326 // Handle conjoining Jamo that make Hangul syllables
michael@0 327 HSType hangulState = GetHangulSyllableType(ch);
michael@0 328 while (mPos < mLimit) {
michael@0 329 ch = *mPos;
michael@0 330 HSType hangulType = GetHangulSyllableType(ch);
michael@0 331 switch (hangulType) {
michael@0 332 case HST_L:
michael@0 333 case HST_LV:
michael@0 334 case HST_LVT:
michael@0 335 if (hangulState == HST_L) {
michael@0 336 hangulState = hangulType;
michael@0 337 mPos++;
michael@0 338 continue;
michael@0 339 }
michael@0 340 break;
michael@0 341 case HST_V:
michael@0 342 if ((hangulState != HST_NONE) && !(hangulState & HST_T)) {
michael@0 343 hangulState = hangulType;
michael@0 344 mPos++;
michael@0 345 continue;
michael@0 346 }
michael@0 347 break;
michael@0 348 case HST_T:
michael@0 349 if (hangulState & (HST_V | HST_T)) {
michael@0 350 hangulState = hangulType;
michael@0 351 mPos++;
michael@0 352 continue;
michael@0 353 }
michael@0 354 break;
michael@0 355 default:
michael@0 356 break;
michael@0 357 }
michael@0 358 break;
michael@0 359 }
michael@0 360 }
michael@0 361
michael@0 362 while (mPos < mLimit) {
michael@0 363 ch = *mPos;
michael@0 364
michael@0 365 // Check for surrogate pairs; note that isolated surrogates will just
michael@0 366 // be treated as generic (non-cluster-extending) characters here,
michael@0 367 // which is fine for cluster-iterating purposes
michael@0 368 if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 &&
michael@0 369 NS_IS_LOW_SURROGATE(*(mPos + 1))) {
michael@0 370 ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
michael@0 371 }
michael@0 372
michael@0 373 if (!IsClusterExtender(ch)) {
michael@0 374 break;
michael@0 375 }
michael@0 376
michael@0 377 mPos++;
michael@0 378 if (!IS_IN_BMP(ch)) {
michael@0 379 mPos++;
michael@0 380 }
michael@0 381 }
michael@0 382
michael@0 383 NS_ASSERTION(mText < mPos && mPos <= mLimit,
michael@0 384 "ClusterIterator::Next has overshot the string!");
michael@0 385 }
michael@0 386
michael@0 387 } // end namespace unicode
michael@0 388
michael@0 389 } // end namespace mozilla

mercurial