1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/unicharutil/util/nsUnicodeProperties.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,389 @@ 1.4 +/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- 1.5 + * This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include "nsUnicodeProperties.h" 1.10 +#include "nsUnicodePropertyData.cpp" 1.11 + 1.12 +#include "mozilla/ArrayUtils.h" 1.13 +#include "nsCharTraits.h" 1.14 + 1.15 +#define UNICODE_BMP_LIMIT 0x10000 1.16 +#define UNICODE_LIMIT 0x110000 1.17 + 1.18 + 1.19 +const nsCharProps1& 1.20 +GetCharProps1(uint32_t aCh) 1.21 +{ 1.22 + if (aCh < UNICODE_BMP_LIMIT) { 1.23 + return sCharProp1Values[sCharProp1Pages[0][aCh >> kCharProp1CharBits]] 1.24 + [aCh & ((1 << kCharProp1CharBits) - 1)]; 1.25 + } 1.26 + if (aCh < (kCharProp1MaxPlane + 1) * 0x10000) { 1.27 + return sCharProp1Values[sCharProp1Pages[sCharProp1Planes[(aCh >> 16) - 1]] 1.28 + [(aCh & 0xffff) >> kCharProp1CharBits]] 1.29 + [aCh & ((1 << kCharProp1CharBits) - 1)]; 1.30 + } 1.31 + 1.32 + // Default values for unassigned 1.33 + static const nsCharProps1 undefined = { 1.34 + 0, // Index to mirrored char offsets 1.35 + 0, // Hangul Syllable type 1.36 + 0 // Combining class 1.37 + }; 1.38 + return undefined; 1.39 +} 1.40 + 1.41 +const nsCharProps2& 1.42 +GetCharProps2(uint32_t aCh) 1.43 +{ 1.44 + if (aCh < UNICODE_BMP_LIMIT) { 1.45 + return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]] 1.46 + [aCh & ((1 << kCharProp2CharBits) - 1)]; 1.47 + } 1.48 + if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) { 1.49 + return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]] 1.50 + [(aCh & 0xffff) >> kCharProp2CharBits]] 1.51 + [aCh & ((1 << kCharProp2CharBits) - 1)]; 1.52 + } 1.53 + 1.54 + NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range"); 1.55 + // Default values for unassigned 1.56 + static const nsCharProps2 undefined = { 1.57 + MOZ_SCRIPT_UNKNOWN, // Script code 1.58 + 0, // East Asian Width 1.59 + HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // General Category 1.60 + eCharType_LeftToRight, // Bidi Category 1.61 + mozilla::unicode::XIDMOD_NOT_CHARS, // Xidmod 1.62 + -1, // Numeric Value 1.63 + mozilla::unicode::HVT_NotHan // Han variant 1.64 + }; 1.65 + return undefined; 1.66 +} 1.67 + 1.68 +namespace mozilla { 1.69 + 1.70 +namespace unicode { 1.71 + 1.72 +/* 1.73 +To store properties for a million Unicode codepoints compactly, we use 1.74 +a three-level array structure, with the Unicode values considered as 1.75 +three elements: Plane, Page, and Char. 1.76 + 1.77 +Space optimization happens because multiple Planes can refer to the same 1.78 +Page array, and multiple Pages can refer to the same Char array holding 1.79 +the actual values. In practice, most of the higher planes are empty and 1.80 +thus share the same data; and within the BMP, there are also many pages 1.81 +that repeat the same data for any given property. 1.82 + 1.83 +Plane is usually zero, so we skip a lookup in this case, and require 1.84 +that the Plane 0 pages are always the first set of entries in the Page 1.85 +array. 1.86 + 1.87 +The division of the remaining 16 bits into Page and Char fields is 1.88 +adjusted for each property (by experiment using the generation tool) 1.89 +to provide the most compact storage, depending on the distribution 1.90 +of values. 1.91 +*/ 1.92 + 1.93 +nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = { 1.94 + /* 1.95 + * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants 1.96 + * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-common.h. 1.97 + */ 1.98 + /* CONTROL */ nsIUGenCategory::kOther, 1.99 + /* FORMAT */ nsIUGenCategory::kOther, 1.100 + /* UNASSIGNED */ nsIUGenCategory::kOther, 1.101 + /* PRIVATE_USE */ nsIUGenCategory::kOther, 1.102 + /* SURROGATE */ nsIUGenCategory::kOther, 1.103 + /* LOWERCASE_LETTER */ nsIUGenCategory::kLetter, 1.104 + /* MODIFIER_LETTER */ nsIUGenCategory::kLetter, 1.105 + /* OTHER_LETTER */ nsIUGenCategory::kLetter, 1.106 + /* TITLECASE_LETTER */ nsIUGenCategory::kLetter, 1.107 + /* UPPERCASE_LETTER */ nsIUGenCategory::kLetter, 1.108 + /* COMBINING_MARK */ nsIUGenCategory::kMark, 1.109 + /* ENCLOSING_MARK */ nsIUGenCategory::kMark, 1.110 + /* NON_SPACING_MARK */ nsIUGenCategory::kMark, 1.111 + /* DECIMAL_NUMBER */ nsIUGenCategory::kNumber, 1.112 + /* LETTER_NUMBER */ nsIUGenCategory::kNumber, 1.113 + /* OTHER_NUMBER */ nsIUGenCategory::kNumber, 1.114 + /* CONNECT_PUNCTUATION */ nsIUGenCategory::kPunctuation, 1.115 + /* DASH_PUNCTUATION */ nsIUGenCategory::kPunctuation, 1.116 + /* CLOSE_PUNCTUATION */ nsIUGenCategory::kPunctuation, 1.117 + /* FINAL_PUNCTUATION */ nsIUGenCategory::kPunctuation, 1.118 + /* INITIAL_PUNCTUATION */ nsIUGenCategory::kPunctuation, 1.119 + /* OTHER_PUNCTUATION */ nsIUGenCategory::kPunctuation, 1.120 + /* OPEN_PUNCTUATION */ nsIUGenCategory::kPunctuation, 1.121 + /* CURRENCY_SYMBOL */ nsIUGenCategory::kSymbol, 1.122 + /* MODIFIER_SYMBOL */ nsIUGenCategory::kSymbol, 1.123 + /* MATH_SYMBOL */ nsIUGenCategory::kSymbol, 1.124 + /* OTHER_SYMBOL */ nsIUGenCategory::kSymbol, 1.125 + /* LINE_SEPARATOR */ nsIUGenCategory::kSeparator, 1.126 + /* PARAGRAPH_SEPARATOR */ nsIUGenCategory::kSeparator, 1.127 + /* SPACE_SEPARATOR */ nsIUGenCategory::kSeparator 1.128 +}; 1.129 + 1.130 +uint32_t 1.131 +GetMirroredChar(uint32_t aCh) 1.132 +{ 1.133 + return aCh + sMirrorOffsets[GetCharProps1(aCh).mMirrorOffsetIndex]; 1.134 +} 1.135 + 1.136 +uint32_t 1.137 +GetScriptTagForCode(int32_t aScriptCode) 1.138 +{ 1.139 + // this will safely return 0 for negative script codes, too :) 1.140 + if (uint32_t(aScriptCode) > ArrayLength(sScriptCodeToTag)) { 1.141 + return 0; 1.142 + } 1.143 + return sScriptCodeToTag[aScriptCode]; 1.144 +} 1.145 + 1.146 +static inline uint32_t 1.147 +GetCaseMapValue(uint32_t aCh) 1.148 +{ 1.149 + if (aCh < UNICODE_BMP_LIMIT) { 1.150 + return sCaseMapValues[sCaseMapPages[0][aCh >> kCaseMapCharBits]] 1.151 + [aCh & ((1 << kCaseMapCharBits) - 1)]; 1.152 + } 1.153 + if (aCh < (kCaseMapMaxPlane + 1) * 0x10000) { 1.154 + return sCaseMapValues[sCaseMapPages[sCaseMapPlanes[(aCh >> 16) - 1]] 1.155 + [(aCh & 0xffff) >> kCaseMapCharBits]] 1.156 + [aCh & ((1 << kCaseMapCharBits) - 1)]; 1.157 + } 1.158 + return 0; 1.159 +} 1.160 + 1.161 +uint32_t 1.162 +GetUppercase(uint32_t aCh) 1.163 +{ 1.164 + uint32_t mapValue = GetCaseMapValue(aCh); 1.165 + if (mapValue & (kLowerToUpper | kTitleToUpper)) { 1.166 + return aCh ^ (mapValue & kCaseMapCharMask); 1.167 + } 1.168 + if (mapValue & kLowerToTitle) { 1.169 + return GetUppercase(aCh ^ (mapValue & kCaseMapCharMask)); 1.170 + } 1.171 + return aCh; 1.172 +} 1.173 + 1.174 +uint32_t 1.175 +GetLowercase(uint32_t aCh) 1.176 +{ 1.177 + uint32_t mapValue = GetCaseMapValue(aCh); 1.178 + if (mapValue & kUpperToLower) { 1.179 + return aCh ^ (mapValue & kCaseMapCharMask); 1.180 + } 1.181 + if (mapValue & kTitleToUpper) { 1.182 + return GetLowercase(aCh ^ (mapValue & kCaseMapCharMask)); 1.183 + } 1.184 + return aCh; 1.185 +} 1.186 + 1.187 +uint32_t 1.188 +GetTitlecaseForLower(uint32_t aCh) 1.189 +{ 1.190 + uint32_t mapValue = GetCaseMapValue(aCh); 1.191 + if (mapValue & (kLowerToTitle | kLowerToUpper)) { 1.192 + return aCh ^ (mapValue & kCaseMapCharMask); 1.193 + } 1.194 + return aCh; 1.195 +} 1.196 + 1.197 +uint32_t 1.198 +GetTitlecaseForAll(uint32_t aCh) 1.199 +{ 1.200 + uint32_t mapValue = GetCaseMapValue(aCh); 1.201 + if (mapValue & (kLowerToTitle | kLowerToUpper)) { 1.202 + return aCh ^ (mapValue & kCaseMapCharMask); 1.203 + } 1.204 + if (mapValue & kUpperToLower) { 1.205 + return GetTitlecaseForLower(aCh ^ (mapValue & kCaseMapCharMask)); 1.206 + } 1.207 + return aCh; 1.208 +} 1.209 + 1.210 +HanVariantType 1.211 +GetHanVariant(uint32_t aCh) 1.212 +{ 1.213 + // In the sHanVariantValues array, data for 4 successive characters 1.214 + // (2 bits each) is packed in to each uint8_t entry, with the value 1.215 + // for the lowest character stored in the least significant bits. 1.216 + uint8_t v = 0; 1.217 + if (aCh < UNICODE_BMP_LIMIT) { 1.218 + v = sHanVariantValues[sHanVariantPages[0][aCh >> kHanVariantCharBits]] 1.219 + [(aCh & ((1 << kHanVariantCharBits) - 1)) >> 2]; 1.220 + } else if (aCh < (kHanVariantMaxPlane + 1) * 0x10000) { 1.221 + v = sHanVariantValues[sHanVariantPages[sHanVariantPlanes[(aCh >> 16) - 1]] 1.222 + [(aCh & 0xffff) >> kHanVariantCharBits]] 1.223 + [(aCh & ((1 << kHanVariantCharBits) - 1)) >> 2]; 1.224 + } 1.225 + // extract the appropriate 2-bit field from the value 1.226 + return HanVariantType((v >> ((aCh & 3) * 2)) & 3); 1.227 +} 1.228 + 1.229 +uint32_t 1.230 +GetFullWidth(uint32_t aCh) 1.231 +{ 1.232 + // full-width mappings only exist for BMP characters; all others are 1.233 + // returned unchanged 1.234 + if (aCh < UNICODE_BMP_LIMIT) { 1.235 + uint32_t v = 1.236 + sFullWidthValues[sFullWidthPages[aCh >> kFullWidthCharBits]] 1.237 + [aCh & ((1 << kFullWidthCharBits) - 1)]; 1.238 + if (v) { 1.239 + // return the mapped value if non-zero; else return original char 1.240 + return v; 1.241 + } 1.242 + } 1.243 + return aCh; 1.244 +} 1.245 + 1.246 +bool 1.247 +IsClusterExtender(uint32_t aCh, uint8_t aCategory) 1.248 +{ 1.249 + return ((aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK && 1.250 + aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) || 1.251 + (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ 1.252 + (aCh >= 0xff9e && aCh <= 0xff9f)); // katakana sound marks 1.253 +} 1.254 + 1.255 +// TODO: replace this with a properties file or similar; 1.256 +// expect this to evolve as harfbuzz shaping support matures. 1.257 +// 1.258 +// The "shaping type" of each script run, as returned by this 1.259 +// function, is compared to the bits set in the 1.260 +// gfx.font_rendering.harfbuzz.scripts 1.261 +// preference to decide whether to use the harfbuzz shaper. 1.262 +// 1.263 +int32_t 1.264 +ScriptShapingType(int32_t aScriptCode) 1.265 +{ 1.266 + switch (aScriptCode) { 1.267 + default: 1.268 + return SHAPING_DEFAULT; // scripts not explicitly listed here are 1.269 + // assumed to just use default shaping 1.270 + 1.271 + case MOZ_SCRIPT_ARABIC: 1.272 + case MOZ_SCRIPT_SYRIAC: 1.273 + case MOZ_SCRIPT_NKO: 1.274 + case MOZ_SCRIPT_MANDAIC: 1.275 + return SHAPING_ARABIC; // bidi scripts with Arabic-style shaping 1.276 + 1.277 + case MOZ_SCRIPT_HEBREW: 1.278 + return SHAPING_HEBREW; 1.279 + 1.280 + case MOZ_SCRIPT_HANGUL: 1.281 + return SHAPING_HANGUL; 1.282 + 1.283 + case MOZ_SCRIPT_MONGOLIAN: // to be supported by the Arabic shaper? 1.284 + return SHAPING_MONGOLIAN; 1.285 + 1.286 + case MOZ_SCRIPT_THAI: // no complex OT features, but MS engines like to do 1.287 + // sequence checking 1.288 + return SHAPING_THAI; 1.289 + 1.290 + case MOZ_SCRIPT_BENGALI: 1.291 + case MOZ_SCRIPT_DEVANAGARI: 1.292 + case MOZ_SCRIPT_GUJARATI: 1.293 + case MOZ_SCRIPT_GURMUKHI: 1.294 + case MOZ_SCRIPT_KANNADA: 1.295 + case MOZ_SCRIPT_MALAYALAM: 1.296 + case MOZ_SCRIPT_ORIYA: 1.297 + case MOZ_SCRIPT_SINHALA: 1.298 + case MOZ_SCRIPT_TAMIL: 1.299 + case MOZ_SCRIPT_TELUGU: 1.300 + case MOZ_SCRIPT_KHMER: 1.301 + case MOZ_SCRIPT_LAO: 1.302 + case MOZ_SCRIPT_TIBETAN: 1.303 + case MOZ_SCRIPT_NEW_TAI_LUE: 1.304 + case MOZ_SCRIPT_TAI_LE: 1.305 + case MOZ_SCRIPT_MYANMAR: 1.306 + case MOZ_SCRIPT_PHAGS_PA: 1.307 + case MOZ_SCRIPT_BATAK: 1.308 + case MOZ_SCRIPT_BRAHMI: 1.309 + return SHAPING_INDIC; // scripts that require Indic or other "special" shaping 1.310 + } 1.311 +} 1.312 + 1.313 +void 1.314 +ClusterIterator::Next() 1.315 +{ 1.316 + if (AtEnd()) { 1.317 + NS_WARNING("ClusterIterator has already reached the end"); 1.318 + return; 1.319 + } 1.320 + 1.321 + uint32_t ch = *mPos++; 1.322 + 1.323 + if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit && 1.324 + NS_IS_LOW_SURROGATE(*mPos)) { 1.325 + ch = SURROGATE_TO_UCS4(ch, *mPos++); 1.326 + } else if ((ch & ~0xff) == 0x1100 || 1.327 + (ch >= 0xa960 && ch <= 0xa97f) || 1.328 + (ch >= 0xac00 && ch <= 0xd7ff)) { 1.329 + // Handle conjoining Jamo that make Hangul syllables 1.330 + HSType hangulState = GetHangulSyllableType(ch); 1.331 + while (mPos < mLimit) { 1.332 + ch = *mPos; 1.333 + HSType hangulType = GetHangulSyllableType(ch); 1.334 + switch (hangulType) { 1.335 + case HST_L: 1.336 + case HST_LV: 1.337 + case HST_LVT: 1.338 + if (hangulState == HST_L) { 1.339 + hangulState = hangulType; 1.340 + mPos++; 1.341 + continue; 1.342 + } 1.343 + break; 1.344 + case HST_V: 1.345 + if ((hangulState != HST_NONE) && !(hangulState & HST_T)) { 1.346 + hangulState = hangulType; 1.347 + mPos++; 1.348 + continue; 1.349 + } 1.350 + break; 1.351 + case HST_T: 1.352 + if (hangulState & (HST_V | HST_T)) { 1.353 + hangulState = hangulType; 1.354 + mPos++; 1.355 + continue; 1.356 + } 1.357 + break; 1.358 + default: 1.359 + break; 1.360 + } 1.361 + break; 1.362 + } 1.363 + } 1.364 + 1.365 + while (mPos < mLimit) { 1.366 + ch = *mPos; 1.367 + 1.368 + // Check for surrogate pairs; note that isolated surrogates will just 1.369 + // be treated as generic (non-cluster-extending) characters here, 1.370 + // which is fine for cluster-iterating purposes 1.371 + if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 && 1.372 + NS_IS_LOW_SURROGATE(*(mPos + 1))) { 1.373 + ch = SURROGATE_TO_UCS4(ch, *(mPos + 1)); 1.374 + } 1.375 + 1.376 + if (!IsClusterExtender(ch)) { 1.377 + break; 1.378 + } 1.379 + 1.380 + mPos++; 1.381 + if (!IS_IN_BMP(ch)) { 1.382 + mPos++; 1.383 + } 1.384 + } 1.385 + 1.386 + NS_ASSERTION(mText < mPos && mPos <= mLimit, 1.387 + "ClusterIterator::Next has overshot the string!"); 1.388 +} 1.389 + 1.390 +} // end namespace unicode 1.391 + 1.392 +} // end namespace mozilla