| |
1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
| |
2 * This Source Code Form is subject to the terms of the Mozilla Public |
| |
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
| |
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
| |
5 |
| |
6 #include "nsUnicodeProperties.h" |
| |
7 #include "nsUnicodePropertyData.cpp" |
| |
8 |
| |
9 #include "mozilla/ArrayUtils.h" |
| |
10 #include "nsCharTraits.h" |
| |
11 |
| |
12 #define UNICODE_BMP_LIMIT 0x10000 |
| |
13 #define UNICODE_LIMIT 0x110000 |
| |
14 |
| |
15 |
| |
16 const nsCharProps1& |
| |
17 GetCharProps1(uint32_t aCh) |
| |
18 { |
| |
19 if (aCh < UNICODE_BMP_LIMIT) { |
| |
20 return sCharProp1Values[sCharProp1Pages[0][aCh >> kCharProp1CharBits]] |
| |
21 [aCh & ((1 << kCharProp1CharBits) - 1)]; |
| |
22 } |
| |
23 if (aCh < (kCharProp1MaxPlane + 1) * 0x10000) { |
| |
24 return sCharProp1Values[sCharProp1Pages[sCharProp1Planes[(aCh >> 16) - 1]] |
| |
25 [(aCh & 0xffff) >> kCharProp1CharBits]] |
| |
26 [aCh & ((1 << kCharProp1CharBits) - 1)]; |
| |
27 } |
| |
28 |
| |
29 // Default values for unassigned |
| |
30 static const nsCharProps1 undefined = { |
| |
31 0, // Index to mirrored char offsets |
| |
32 0, // Hangul Syllable type |
| |
33 0 // Combining class |
| |
34 }; |
| |
35 return undefined; |
| |
36 } |
| |
37 |
| |
38 const nsCharProps2& |
| |
39 GetCharProps2(uint32_t aCh) |
| |
40 { |
| |
41 if (aCh < UNICODE_BMP_LIMIT) { |
| |
42 return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]] |
| |
43 [aCh & ((1 << kCharProp2CharBits) - 1)]; |
| |
44 } |
| |
45 if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) { |
| |
46 return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]] |
| |
47 [(aCh & 0xffff) >> kCharProp2CharBits]] |
| |
48 [aCh & ((1 << kCharProp2CharBits) - 1)]; |
| |
49 } |
| |
50 |
| |
51 NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range"); |
| |
52 // Default values for unassigned |
| |
53 static const nsCharProps2 undefined = { |
| |
54 MOZ_SCRIPT_UNKNOWN, // Script code |
| |
55 0, // East Asian Width |
| |
56 HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // General Category |
| |
57 eCharType_LeftToRight, // Bidi Category |
| |
58 mozilla::unicode::XIDMOD_NOT_CHARS, // Xidmod |
| |
59 -1, // Numeric Value |
| |
60 mozilla::unicode::HVT_NotHan // Han variant |
| |
61 }; |
| |
62 return undefined; |
| |
63 } |
| |
64 |
| |
65 namespace mozilla { |
| |
66 |
| |
67 namespace unicode { |
| |
68 |
| |
69 /* |
| |
70 To store properties for a million Unicode codepoints compactly, we use |
| |
71 a three-level array structure, with the Unicode values considered as |
| |
72 three elements: Plane, Page, and Char. |
| |
73 |
| |
74 Space optimization happens because multiple Planes can refer to the same |
| |
75 Page array, and multiple Pages can refer to the same Char array holding |
| |
76 the actual values. In practice, most of the higher planes are empty and |
| |
77 thus share the same data; and within the BMP, there are also many pages |
| |
78 that repeat the same data for any given property. |
| |
79 |
| |
80 Plane is usually zero, so we skip a lookup in this case, and require |
| |
81 that the Plane 0 pages are always the first set of entries in the Page |
| |
82 array. |
| |
83 |
| |
84 The division of the remaining 16 bits into Page and Char fields is |
| |
85 adjusted for each property (by experiment using the generation tool) |
| |
86 to provide the most compact storage, depending on the distribution |
| |
87 of values. |
| |
88 */ |
| |
89 |
| |
90 nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = { |
| |
91 /* |
| |
92 * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants |
| |
93 * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-common.h. |
| |
94 */ |
| |
95 /* CONTROL */ nsIUGenCategory::kOther, |
| |
96 /* FORMAT */ nsIUGenCategory::kOther, |
| |
97 /* UNASSIGNED */ nsIUGenCategory::kOther, |
| |
98 /* PRIVATE_USE */ nsIUGenCategory::kOther, |
| |
99 /* SURROGATE */ nsIUGenCategory::kOther, |
| |
100 /* LOWERCASE_LETTER */ nsIUGenCategory::kLetter, |
| |
101 /* MODIFIER_LETTER */ nsIUGenCategory::kLetter, |
| |
102 /* OTHER_LETTER */ nsIUGenCategory::kLetter, |
| |
103 /* TITLECASE_LETTER */ nsIUGenCategory::kLetter, |
| |
104 /* UPPERCASE_LETTER */ nsIUGenCategory::kLetter, |
| |
105 /* COMBINING_MARK */ nsIUGenCategory::kMark, |
| |
106 /* ENCLOSING_MARK */ nsIUGenCategory::kMark, |
| |
107 /* NON_SPACING_MARK */ nsIUGenCategory::kMark, |
| |
108 /* DECIMAL_NUMBER */ nsIUGenCategory::kNumber, |
| |
109 /* LETTER_NUMBER */ nsIUGenCategory::kNumber, |
| |
110 /* OTHER_NUMBER */ nsIUGenCategory::kNumber, |
| |
111 /* CONNECT_PUNCTUATION */ nsIUGenCategory::kPunctuation, |
| |
112 /* DASH_PUNCTUATION */ nsIUGenCategory::kPunctuation, |
| |
113 /* CLOSE_PUNCTUATION */ nsIUGenCategory::kPunctuation, |
| |
114 /* FINAL_PUNCTUATION */ nsIUGenCategory::kPunctuation, |
| |
115 /* INITIAL_PUNCTUATION */ nsIUGenCategory::kPunctuation, |
| |
116 /* OTHER_PUNCTUATION */ nsIUGenCategory::kPunctuation, |
| |
117 /* OPEN_PUNCTUATION */ nsIUGenCategory::kPunctuation, |
| |
118 /* CURRENCY_SYMBOL */ nsIUGenCategory::kSymbol, |
| |
119 /* MODIFIER_SYMBOL */ nsIUGenCategory::kSymbol, |
| |
120 /* MATH_SYMBOL */ nsIUGenCategory::kSymbol, |
| |
121 /* OTHER_SYMBOL */ nsIUGenCategory::kSymbol, |
| |
122 /* LINE_SEPARATOR */ nsIUGenCategory::kSeparator, |
| |
123 /* PARAGRAPH_SEPARATOR */ nsIUGenCategory::kSeparator, |
| |
124 /* SPACE_SEPARATOR */ nsIUGenCategory::kSeparator |
| |
125 }; |
| |
126 |
| |
127 uint32_t |
| |
128 GetMirroredChar(uint32_t aCh) |
| |
129 { |
| |
130 return aCh + sMirrorOffsets[GetCharProps1(aCh).mMirrorOffsetIndex]; |
| |
131 } |
| |
132 |
| |
133 uint32_t |
| |
134 GetScriptTagForCode(int32_t aScriptCode) |
| |
135 { |
| |
136 // this will safely return 0 for negative script codes, too :) |
| |
137 if (uint32_t(aScriptCode) > ArrayLength(sScriptCodeToTag)) { |
| |
138 return 0; |
| |
139 } |
| |
140 return sScriptCodeToTag[aScriptCode]; |
| |
141 } |
| |
142 |
| |
143 static inline uint32_t |
| |
144 GetCaseMapValue(uint32_t aCh) |
| |
145 { |
| |
146 if (aCh < UNICODE_BMP_LIMIT) { |
| |
147 return sCaseMapValues[sCaseMapPages[0][aCh >> kCaseMapCharBits]] |
| |
148 [aCh & ((1 << kCaseMapCharBits) - 1)]; |
| |
149 } |
| |
150 if (aCh < (kCaseMapMaxPlane + 1) * 0x10000) { |
| |
151 return sCaseMapValues[sCaseMapPages[sCaseMapPlanes[(aCh >> 16) - 1]] |
| |
152 [(aCh & 0xffff) >> kCaseMapCharBits]] |
| |
153 [aCh & ((1 << kCaseMapCharBits) - 1)]; |
| |
154 } |
| |
155 return 0; |
| |
156 } |
| |
157 |
| |
158 uint32_t |
| |
159 GetUppercase(uint32_t aCh) |
| |
160 { |
| |
161 uint32_t mapValue = GetCaseMapValue(aCh); |
| |
162 if (mapValue & (kLowerToUpper | kTitleToUpper)) { |
| |
163 return aCh ^ (mapValue & kCaseMapCharMask); |
| |
164 } |
| |
165 if (mapValue & kLowerToTitle) { |
| |
166 return GetUppercase(aCh ^ (mapValue & kCaseMapCharMask)); |
| |
167 } |
| |
168 return aCh; |
| |
169 } |
| |
170 |
| |
171 uint32_t |
| |
172 GetLowercase(uint32_t aCh) |
| |
173 { |
| |
174 uint32_t mapValue = GetCaseMapValue(aCh); |
| |
175 if (mapValue & kUpperToLower) { |
| |
176 return aCh ^ (mapValue & kCaseMapCharMask); |
| |
177 } |
| |
178 if (mapValue & kTitleToUpper) { |
| |
179 return GetLowercase(aCh ^ (mapValue & kCaseMapCharMask)); |
| |
180 } |
| |
181 return aCh; |
| |
182 } |
| |
183 |
| |
184 uint32_t |
| |
185 GetTitlecaseForLower(uint32_t aCh) |
| |
186 { |
| |
187 uint32_t mapValue = GetCaseMapValue(aCh); |
| |
188 if (mapValue & (kLowerToTitle | kLowerToUpper)) { |
| |
189 return aCh ^ (mapValue & kCaseMapCharMask); |
| |
190 } |
| |
191 return aCh; |
| |
192 } |
| |
193 |
| |
194 uint32_t |
| |
195 GetTitlecaseForAll(uint32_t aCh) |
| |
196 { |
| |
197 uint32_t mapValue = GetCaseMapValue(aCh); |
| |
198 if (mapValue & (kLowerToTitle | kLowerToUpper)) { |
| |
199 return aCh ^ (mapValue & kCaseMapCharMask); |
| |
200 } |
| |
201 if (mapValue & kUpperToLower) { |
| |
202 return GetTitlecaseForLower(aCh ^ (mapValue & kCaseMapCharMask)); |
| |
203 } |
| |
204 return aCh; |
| |
205 } |
| |
206 |
| |
207 HanVariantType |
| |
208 GetHanVariant(uint32_t aCh) |
| |
209 { |
| |
210 // In the sHanVariantValues array, data for 4 successive characters |
| |
211 // (2 bits each) is packed in to each uint8_t entry, with the value |
| |
212 // for the lowest character stored in the least significant bits. |
| |
213 uint8_t v = 0; |
| |
214 if (aCh < UNICODE_BMP_LIMIT) { |
| |
215 v = sHanVariantValues[sHanVariantPages[0][aCh >> kHanVariantCharBits]] |
| |
216 [(aCh & ((1 << kHanVariantCharBits) - 1)) >> 2]; |
| |
217 } else if (aCh < (kHanVariantMaxPlane + 1) * 0x10000) { |
| |
218 v = sHanVariantValues[sHanVariantPages[sHanVariantPlanes[(aCh >> 16) - 1]] |
| |
219 [(aCh & 0xffff) >> kHanVariantCharBits]] |
| |
220 [(aCh & ((1 << kHanVariantCharBits) - 1)) >> 2]; |
| |
221 } |
| |
222 // extract the appropriate 2-bit field from the value |
| |
223 return HanVariantType((v >> ((aCh & 3) * 2)) & 3); |
| |
224 } |
| |
225 |
| |
226 uint32_t |
| |
227 GetFullWidth(uint32_t aCh) |
| |
228 { |
| |
229 // full-width mappings only exist for BMP characters; all others are |
| |
230 // returned unchanged |
| |
231 if (aCh < UNICODE_BMP_LIMIT) { |
| |
232 uint32_t v = |
| |
233 sFullWidthValues[sFullWidthPages[aCh >> kFullWidthCharBits]] |
| |
234 [aCh & ((1 << kFullWidthCharBits) - 1)]; |
| |
235 if (v) { |
| |
236 // return the mapped value if non-zero; else return original char |
| |
237 return v; |
| |
238 } |
| |
239 } |
| |
240 return aCh; |
| |
241 } |
| |
242 |
| |
243 bool |
| |
244 IsClusterExtender(uint32_t aCh, uint8_t aCategory) |
| |
245 { |
| |
246 return ((aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK && |
| |
247 aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) || |
| |
248 (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ |
| |
249 (aCh >= 0xff9e && aCh <= 0xff9f)); // katakana sound marks |
| |
250 } |
| |
251 |
| |
252 // TODO: replace this with a properties file or similar; |
| |
253 // expect this to evolve as harfbuzz shaping support matures. |
| |
254 // |
| |
255 // The "shaping type" of each script run, as returned by this |
| |
256 // function, is compared to the bits set in the |
| |
257 // gfx.font_rendering.harfbuzz.scripts |
| |
258 // preference to decide whether to use the harfbuzz shaper. |
| |
259 // |
| |
260 int32_t |
| |
261 ScriptShapingType(int32_t aScriptCode) |
| |
262 { |
| |
263 switch (aScriptCode) { |
| |
264 default: |
| |
265 return SHAPING_DEFAULT; // scripts not explicitly listed here are |
| |
266 // assumed to just use default shaping |
| |
267 |
| |
268 case MOZ_SCRIPT_ARABIC: |
| |
269 case MOZ_SCRIPT_SYRIAC: |
| |
270 case MOZ_SCRIPT_NKO: |
| |
271 case MOZ_SCRIPT_MANDAIC: |
| |
272 return SHAPING_ARABIC; // bidi scripts with Arabic-style shaping |
| |
273 |
| |
274 case MOZ_SCRIPT_HEBREW: |
| |
275 return SHAPING_HEBREW; |
| |
276 |
| |
277 case MOZ_SCRIPT_HANGUL: |
| |
278 return SHAPING_HANGUL; |
| |
279 |
| |
280 case MOZ_SCRIPT_MONGOLIAN: // to be supported by the Arabic shaper? |
| |
281 return SHAPING_MONGOLIAN; |
| |
282 |
| |
283 case MOZ_SCRIPT_THAI: // no complex OT features, but MS engines like to do |
| |
284 // sequence checking |
| |
285 return SHAPING_THAI; |
| |
286 |
| |
287 case MOZ_SCRIPT_BENGALI: |
| |
288 case MOZ_SCRIPT_DEVANAGARI: |
| |
289 case MOZ_SCRIPT_GUJARATI: |
| |
290 case MOZ_SCRIPT_GURMUKHI: |
| |
291 case MOZ_SCRIPT_KANNADA: |
| |
292 case MOZ_SCRIPT_MALAYALAM: |
| |
293 case MOZ_SCRIPT_ORIYA: |
| |
294 case MOZ_SCRIPT_SINHALA: |
| |
295 case MOZ_SCRIPT_TAMIL: |
| |
296 case MOZ_SCRIPT_TELUGU: |
| |
297 case MOZ_SCRIPT_KHMER: |
| |
298 case MOZ_SCRIPT_LAO: |
| |
299 case MOZ_SCRIPT_TIBETAN: |
| |
300 case MOZ_SCRIPT_NEW_TAI_LUE: |
| |
301 case MOZ_SCRIPT_TAI_LE: |
| |
302 case MOZ_SCRIPT_MYANMAR: |
| |
303 case MOZ_SCRIPT_PHAGS_PA: |
| |
304 case MOZ_SCRIPT_BATAK: |
| |
305 case MOZ_SCRIPT_BRAHMI: |
| |
306 return SHAPING_INDIC; // scripts that require Indic or other "special" shaping |
| |
307 } |
| |
308 } |
| |
309 |
| |
310 void |
| |
311 ClusterIterator::Next() |
| |
312 { |
| |
313 if (AtEnd()) { |
| |
314 NS_WARNING("ClusterIterator has already reached the end"); |
| |
315 return; |
| |
316 } |
| |
317 |
| |
318 uint32_t ch = *mPos++; |
| |
319 |
| |
320 if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit && |
| |
321 NS_IS_LOW_SURROGATE(*mPos)) { |
| |
322 ch = SURROGATE_TO_UCS4(ch, *mPos++); |
| |
323 } else if ((ch & ~0xff) == 0x1100 || |
| |
324 (ch >= 0xa960 && ch <= 0xa97f) || |
| |
325 (ch >= 0xac00 && ch <= 0xd7ff)) { |
| |
326 // Handle conjoining Jamo that make Hangul syllables |
| |
327 HSType hangulState = GetHangulSyllableType(ch); |
| |
328 while (mPos < mLimit) { |
| |
329 ch = *mPos; |
| |
330 HSType hangulType = GetHangulSyllableType(ch); |
| |
331 switch (hangulType) { |
| |
332 case HST_L: |
| |
333 case HST_LV: |
| |
334 case HST_LVT: |
| |
335 if (hangulState == HST_L) { |
| |
336 hangulState = hangulType; |
| |
337 mPos++; |
| |
338 continue; |
| |
339 } |
| |
340 break; |
| |
341 case HST_V: |
| |
342 if ((hangulState != HST_NONE) && !(hangulState & HST_T)) { |
| |
343 hangulState = hangulType; |
| |
344 mPos++; |
| |
345 continue; |
| |
346 } |
| |
347 break; |
| |
348 case HST_T: |
| |
349 if (hangulState & (HST_V | HST_T)) { |
| |
350 hangulState = hangulType; |
| |
351 mPos++; |
| |
352 continue; |
| |
353 } |
| |
354 break; |
| |
355 default: |
| |
356 break; |
| |
357 } |
| |
358 break; |
| |
359 } |
| |
360 } |
| |
361 |
| |
362 while (mPos < mLimit) { |
| |
363 ch = *mPos; |
| |
364 |
| |
365 // Check for surrogate pairs; note that isolated surrogates will just |
| |
366 // be treated as generic (non-cluster-extending) characters here, |
| |
367 // which is fine for cluster-iterating purposes |
| |
368 if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 && |
| |
369 NS_IS_LOW_SURROGATE(*(mPos + 1))) { |
| |
370 ch = SURROGATE_TO_UCS4(ch, *(mPos + 1)); |
| |
371 } |
| |
372 |
| |
373 if (!IsClusterExtender(ch)) { |
| |
374 break; |
| |
375 } |
| |
376 |
| |
377 mPos++; |
| |
378 if (!IS_IN_BMP(ch)) { |
| |
379 mPos++; |
| |
380 } |
| |
381 } |
| |
382 |
| |
383 NS_ASSERTION(mText < mPos && mPos <= mLimit, |
| |
384 "ClusterIterator::Next has overshot the string!"); |
| |
385 } |
| |
386 |
| |
387 } // end namespace unicode |
| |
388 |
| |
389 } // end namespace mozilla |