Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "nsUnicodeProperties.h"
7 #include "nsUnicodePropertyData.cpp"
9 #include "mozilla/ArrayUtils.h"
10 #include "nsCharTraits.h"
12 #define UNICODE_BMP_LIMIT 0x10000
13 #define UNICODE_LIMIT 0x110000
16 const nsCharProps1&
17 GetCharProps1(uint32_t aCh)
18 {
19 if (aCh < UNICODE_BMP_LIMIT) {
20 return sCharProp1Values[sCharProp1Pages[0][aCh >> kCharProp1CharBits]]
21 [aCh & ((1 << kCharProp1CharBits) - 1)];
22 }
23 if (aCh < (kCharProp1MaxPlane + 1) * 0x10000) {
24 return sCharProp1Values[sCharProp1Pages[sCharProp1Planes[(aCh >> 16) - 1]]
25 [(aCh & 0xffff) >> kCharProp1CharBits]]
26 [aCh & ((1 << kCharProp1CharBits) - 1)];
27 }
29 // Default values for unassigned
30 static const nsCharProps1 undefined = {
31 0, // Index to mirrored char offsets
32 0, // Hangul Syllable type
33 0 // Combining class
34 };
35 return undefined;
36 }
38 const nsCharProps2&
39 GetCharProps2(uint32_t aCh)
40 {
41 if (aCh < UNICODE_BMP_LIMIT) {
42 return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]]
43 [aCh & ((1 << kCharProp2CharBits) - 1)];
44 }
45 if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) {
46 return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]]
47 [(aCh & 0xffff) >> kCharProp2CharBits]]
48 [aCh & ((1 << kCharProp2CharBits) - 1)];
49 }
51 NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range");
52 // Default values for unassigned
53 static const nsCharProps2 undefined = {
54 MOZ_SCRIPT_UNKNOWN, // Script code
55 0, // East Asian Width
56 HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // General Category
57 eCharType_LeftToRight, // Bidi Category
58 mozilla::unicode::XIDMOD_NOT_CHARS, // Xidmod
59 -1, // Numeric Value
60 mozilla::unicode::HVT_NotHan // Han variant
61 };
62 return undefined;
63 }
65 namespace mozilla {
67 namespace unicode {
69 /*
70 To store properties for a million Unicode codepoints compactly, we use
71 a three-level array structure, with the Unicode values considered as
72 three elements: Plane, Page, and Char.
74 Space optimization happens because multiple Planes can refer to the same
75 Page array, and multiple Pages can refer to the same Char array holding
76 the actual values. In practice, most of the higher planes are empty and
77 thus share the same data; and within the BMP, there are also many pages
78 that repeat the same data for any given property.
80 Plane is usually zero, so we skip a lookup in this case, and require
81 that the Plane 0 pages are always the first set of entries in the Page
82 array.
84 The division of the remaining 16 bits into Page and Char fields is
85 adjusted for each property (by experiment using the generation tool)
86 to provide the most compact storage, depending on the distribution
87 of values.
88 */
90 nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = {
91 /*
92 * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
93 * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-common.h.
94 */
95 /* CONTROL */ nsIUGenCategory::kOther,
96 /* FORMAT */ nsIUGenCategory::kOther,
97 /* UNASSIGNED */ nsIUGenCategory::kOther,
98 /* PRIVATE_USE */ nsIUGenCategory::kOther,
99 /* SURROGATE */ nsIUGenCategory::kOther,
100 /* LOWERCASE_LETTER */ nsIUGenCategory::kLetter,
101 /* MODIFIER_LETTER */ nsIUGenCategory::kLetter,
102 /* OTHER_LETTER */ nsIUGenCategory::kLetter,
103 /* TITLECASE_LETTER */ nsIUGenCategory::kLetter,
104 /* UPPERCASE_LETTER */ nsIUGenCategory::kLetter,
105 /* COMBINING_MARK */ nsIUGenCategory::kMark,
106 /* ENCLOSING_MARK */ nsIUGenCategory::kMark,
107 /* NON_SPACING_MARK */ nsIUGenCategory::kMark,
108 /* DECIMAL_NUMBER */ nsIUGenCategory::kNumber,
109 /* LETTER_NUMBER */ nsIUGenCategory::kNumber,
110 /* OTHER_NUMBER */ nsIUGenCategory::kNumber,
111 /* CONNECT_PUNCTUATION */ nsIUGenCategory::kPunctuation,
112 /* DASH_PUNCTUATION */ nsIUGenCategory::kPunctuation,
113 /* CLOSE_PUNCTUATION */ nsIUGenCategory::kPunctuation,
114 /* FINAL_PUNCTUATION */ nsIUGenCategory::kPunctuation,
115 /* INITIAL_PUNCTUATION */ nsIUGenCategory::kPunctuation,
116 /* OTHER_PUNCTUATION */ nsIUGenCategory::kPunctuation,
117 /* OPEN_PUNCTUATION */ nsIUGenCategory::kPunctuation,
118 /* CURRENCY_SYMBOL */ nsIUGenCategory::kSymbol,
119 /* MODIFIER_SYMBOL */ nsIUGenCategory::kSymbol,
120 /* MATH_SYMBOL */ nsIUGenCategory::kSymbol,
121 /* OTHER_SYMBOL */ nsIUGenCategory::kSymbol,
122 /* LINE_SEPARATOR */ nsIUGenCategory::kSeparator,
123 /* PARAGRAPH_SEPARATOR */ nsIUGenCategory::kSeparator,
124 /* SPACE_SEPARATOR */ nsIUGenCategory::kSeparator
125 };
127 uint32_t
128 GetMirroredChar(uint32_t aCh)
129 {
130 return aCh + sMirrorOffsets[GetCharProps1(aCh).mMirrorOffsetIndex];
131 }
133 uint32_t
134 GetScriptTagForCode(int32_t aScriptCode)
135 {
136 // this will safely return 0 for negative script codes, too :)
137 if (uint32_t(aScriptCode) > ArrayLength(sScriptCodeToTag)) {
138 return 0;
139 }
140 return sScriptCodeToTag[aScriptCode];
141 }
143 static inline uint32_t
144 GetCaseMapValue(uint32_t aCh)
145 {
146 if (aCh < UNICODE_BMP_LIMIT) {
147 return sCaseMapValues[sCaseMapPages[0][aCh >> kCaseMapCharBits]]
148 [aCh & ((1 << kCaseMapCharBits) - 1)];
149 }
150 if (aCh < (kCaseMapMaxPlane + 1) * 0x10000) {
151 return sCaseMapValues[sCaseMapPages[sCaseMapPlanes[(aCh >> 16) - 1]]
152 [(aCh & 0xffff) >> kCaseMapCharBits]]
153 [aCh & ((1 << kCaseMapCharBits) - 1)];
154 }
155 return 0;
156 }
158 uint32_t
159 GetUppercase(uint32_t aCh)
160 {
161 uint32_t mapValue = GetCaseMapValue(aCh);
162 if (mapValue & (kLowerToUpper | kTitleToUpper)) {
163 return aCh ^ (mapValue & kCaseMapCharMask);
164 }
165 if (mapValue & kLowerToTitle) {
166 return GetUppercase(aCh ^ (mapValue & kCaseMapCharMask));
167 }
168 return aCh;
169 }
171 uint32_t
172 GetLowercase(uint32_t aCh)
173 {
174 uint32_t mapValue = GetCaseMapValue(aCh);
175 if (mapValue & kUpperToLower) {
176 return aCh ^ (mapValue & kCaseMapCharMask);
177 }
178 if (mapValue & kTitleToUpper) {
179 return GetLowercase(aCh ^ (mapValue & kCaseMapCharMask));
180 }
181 return aCh;
182 }
184 uint32_t
185 GetTitlecaseForLower(uint32_t aCh)
186 {
187 uint32_t mapValue = GetCaseMapValue(aCh);
188 if (mapValue & (kLowerToTitle | kLowerToUpper)) {
189 return aCh ^ (mapValue & kCaseMapCharMask);
190 }
191 return aCh;
192 }
194 uint32_t
195 GetTitlecaseForAll(uint32_t aCh)
196 {
197 uint32_t mapValue = GetCaseMapValue(aCh);
198 if (mapValue & (kLowerToTitle | kLowerToUpper)) {
199 return aCh ^ (mapValue & kCaseMapCharMask);
200 }
201 if (mapValue & kUpperToLower) {
202 return GetTitlecaseForLower(aCh ^ (mapValue & kCaseMapCharMask));
203 }
204 return aCh;
205 }
207 HanVariantType
208 GetHanVariant(uint32_t aCh)
209 {
210 // In the sHanVariantValues array, data for 4 successive characters
211 // (2 bits each) is packed in to each uint8_t entry, with the value
212 // for the lowest character stored in the least significant bits.
213 uint8_t v = 0;
214 if (aCh < UNICODE_BMP_LIMIT) {
215 v = sHanVariantValues[sHanVariantPages[0][aCh >> kHanVariantCharBits]]
216 [(aCh & ((1 << kHanVariantCharBits) - 1)) >> 2];
217 } else if (aCh < (kHanVariantMaxPlane + 1) * 0x10000) {
218 v = sHanVariantValues[sHanVariantPages[sHanVariantPlanes[(aCh >> 16) - 1]]
219 [(aCh & 0xffff) >> kHanVariantCharBits]]
220 [(aCh & ((1 << kHanVariantCharBits) - 1)) >> 2];
221 }
222 // extract the appropriate 2-bit field from the value
223 return HanVariantType((v >> ((aCh & 3) * 2)) & 3);
224 }
226 uint32_t
227 GetFullWidth(uint32_t aCh)
228 {
229 // full-width mappings only exist for BMP characters; all others are
230 // returned unchanged
231 if (aCh < UNICODE_BMP_LIMIT) {
232 uint32_t v =
233 sFullWidthValues[sFullWidthPages[aCh >> kFullWidthCharBits]]
234 [aCh & ((1 << kFullWidthCharBits) - 1)];
235 if (v) {
236 // return the mapped value if non-zero; else return original char
237 return v;
238 }
239 }
240 return aCh;
241 }
243 bool
244 IsClusterExtender(uint32_t aCh, uint8_t aCategory)
245 {
246 return ((aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
247 aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
248 (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ
249 (aCh >= 0xff9e && aCh <= 0xff9f)); // katakana sound marks
250 }
252 // TODO: replace this with a properties file or similar;
253 // expect this to evolve as harfbuzz shaping support matures.
254 //
255 // The "shaping type" of each script run, as returned by this
256 // function, is compared to the bits set in the
257 // gfx.font_rendering.harfbuzz.scripts
258 // preference to decide whether to use the harfbuzz shaper.
259 //
260 int32_t
261 ScriptShapingType(int32_t aScriptCode)
262 {
263 switch (aScriptCode) {
264 default:
265 return SHAPING_DEFAULT; // scripts not explicitly listed here are
266 // assumed to just use default shaping
268 case MOZ_SCRIPT_ARABIC:
269 case MOZ_SCRIPT_SYRIAC:
270 case MOZ_SCRIPT_NKO:
271 case MOZ_SCRIPT_MANDAIC:
272 return SHAPING_ARABIC; // bidi scripts with Arabic-style shaping
274 case MOZ_SCRIPT_HEBREW:
275 return SHAPING_HEBREW;
277 case MOZ_SCRIPT_HANGUL:
278 return SHAPING_HANGUL;
280 case MOZ_SCRIPT_MONGOLIAN: // to be supported by the Arabic shaper?
281 return SHAPING_MONGOLIAN;
283 case MOZ_SCRIPT_THAI: // no complex OT features, but MS engines like to do
284 // sequence checking
285 return SHAPING_THAI;
287 case MOZ_SCRIPT_BENGALI:
288 case MOZ_SCRIPT_DEVANAGARI:
289 case MOZ_SCRIPT_GUJARATI:
290 case MOZ_SCRIPT_GURMUKHI:
291 case MOZ_SCRIPT_KANNADA:
292 case MOZ_SCRIPT_MALAYALAM:
293 case MOZ_SCRIPT_ORIYA:
294 case MOZ_SCRIPT_SINHALA:
295 case MOZ_SCRIPT_TAMIL:
296 case MOZ_SCRIPT_TELUGU:
297 case MOZ_SCRIPT_KHMER:
298 case MOZ_SCRIPT_LAO:
299 case MOZ_SCRIPT_TIBETAN:
300 case MOZ_SCRIPT_NEW_TAI_LUE:
301 case MOZ_SCRIPT_TAI_LE:
302 case MOZ_SCRIPT_MYANMAR:
303 case MOZ_SCRIPT_PHAGS_PA:
304 case MOZ_SCRIPT_BATAK:
305 case MOZ_SCRIPT_BRAHMI:
306 return SHAPING_INDIC; // scripts that require Indic or other "special" shaping
307 }
308 }
310 void
311 ClusterIterator::Next()
312 {
313 if (AtEnd()) {
314 NS_WARNING("ClusterIterator has already reached the end");
315 return;
316 }
318 uint32_t ch = *mPos++;
320 if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit &&
321 NS_IS_LOW_SURROGATE(*mPos)) {
322 ch = SURROGATE_TO_UCS4(ch, *mPos++);
323 } else if ((ch & ~0xff) == 0x1100 ||
324 (ch >= 0xa960 && ch <= 0xa97f) ||
325 (ch >= 0xac00 && ch <= 0xd7ff)) {
326 // Handle conjoining Jamo that make Hangul syllables
327 HSType hangulState = GetHangulSyllableType(ch);
328 while (mPos < mLimit) {
329 ch = *mPos;
330 HSType hangulType = GetHangulSyllableType(ch);
331 switch (hangulType) {
332 case HST_L:
333 case HST_LV:
334 case HST_LVT:
335 if (hangulState == HST_L) {
336 hangulState = hangulType;
337 mPos++;
338 continue;
339 }
340 break;
341 case HST_V:
342 if ((hangulState != HST_NONE) && !(hangulState & HST_T)) {
343 hangulState = hangulType;
344 mPos++;
345 continue;
346 }
347 break;
348 case HST_T:
349 if (hangulState & (HST_V | HST_T)) {
350 hangulState = hangulType;
351 mPos++;
352 continue;
353 }
354 break;
355 default:
356 break;
357 }
358 break;
359 }
360 }
362 while (mPos < mLimit) {
363 ch = *mPos;
365 // Check for surrogate pairs; note that isolated surrogates will just
366 // be treated as generic (non-cluster-extending) characters here,
367 // which is fine for cluster-iterating purposes
368 if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 &&
369 NS_IS_LOW_SURROGATE(*(mPos + 1))) {
370 ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
371 }
373 if (!IsClusterExtender(ch)) {
374 break;
375 }
377 mPos++;
378 if (!IS_IN_BMP(ch)) {
379 mPos++;
380 }
381 }
383 NS_ASSERTION(mText < mPos && mPos <= mLimit,
384 "ClusterIterator::Next has overshot the string!");
385 }
387 } // end namespace unicode
389 } // end namespace mozilla