|
/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
5 |
|
6 #include "nsUnicodeProperties.h" |
|
7 #include "nsUnicodePropertyData.cpp" |
|
8 |
|
9 #include "mozilla/ArrayUtils.h" |
|
10 #include "nsCharTraits.h" |
|
11 |
|
12 #define UNICODE_BMP_LIMIT 0x10000 |
|
13 #define UNICODE_LIMIT 0x110000 |
|
14 |
|
15 |
|
16 const nsCharProps1& |
|
17 GetCharProps1(uint32_t aCh) |
|
18 { |
|
19 if (aCh < UNICODE_BMP_LIMIT) { |
|
20 return sCharProp1Values[sCharProp1Pages[0][aCh >> kCharProp1CharBits]] |
|
21 [aCh & ((1 << kCharProp1CharBits) - 1)]; |
|
22 } |
|
23 if (aCh < (kCharProp1MaxPlane + 1) * 0x10000) { |
|
24 return sCharProp1Values[sCharProp1Pages[sCharProp1Planes[(aCh >> 16) - 1]] |
|
25 [(aCh & 0xffff) >> kCharProp1CharBits]] |
|
26 [aCh & ((1 << kCharProp1CharBits) - 1)]; |
|
27 } |
|
28 |
|
29 // Default values for unassigned |
|
30 static const nsCharProps1 undefined = { |
|
31 0, // Index to mirrored char offsets |
|
32 0, // Hangul Syllable type |
|
33 0 // Combining class |
|
34 }; |
|
35 return undefined; |
|
36 } |
|
37 |
|
38 const nsCharProps2& |
|
39 GetCharProps2(uint32_t aCh) |
|
40 { |
|
41 if (aCh < UNICODE_BMP_LIMIT) { |
|
42 return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]] |
|
43 [aCh & ((1 << kCharProp2CharBits) - 1)]; |
|
44 } |
|
45 if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) { |
|
46 return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]] |
|
47 [(aCh & 0xffff) >> kCharProp2CharBits]] |
|
48 [aCh & ((1 << kCharProp2CharBits) - 1)]; |
|
49 } |
|
50 |
|
51 NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range"); |
|
52 // Default values for unassigned |
|
53 static const nsCharProps2 undefined = { |
|
54 MOZ_SCRIPT_UNKNOWN, // Script code |
|
55 0, // East Asian Width |
|
56 HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // General Category |
|
57 eCharType_LeftToRight, // Bidi Category |
|
58 mozilla::unicode::XIDMOD_NOT_CHARS, // Xidmod |
|
59 -1, // Numeric Value |
|
60 mozilla::unicode::HVT_NotHan // Han variant |
|
61 }; |
|
62 return undefined; |
|
63 } |
|
64 |
|
65 namespace mozilla { |
|
66 |
|
67 namespace unicode { |
|
68 |
|
/*
To store properties for a million Unicode codepoints compactly, we use
a three-level array structure, with the Unicode values considered as
three elements: Plane, Page, and Char.

Space optimization happens because multiple Planes can refer to the same
Page array, and multiple Pages can refer to the same Char array holding
the actual values. In practice, most of the higher planes are empty and
thus share the same data; and within the BMP, there are also many pages
that repeat the same data for any given property.

Plane is usually zero, so we skip a lookup in this case, and require
that the Plane 0 pages are always the first set of entries in the Page
array.

The division of the remaining 16 bits into Page and Char fields is
adjusted for each property (by experiment using the generation tool)
to provide the most compact storage, depending on the distribution
of values.
*/
|
89 |
|
90 nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = { |
|
91 /* |
|
92 * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants |
|
93 * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-common.h. |
|
94 */ |
|
95 /* CONTROL */ nsIUGenCategory::kOther, |
|
96 /* FORMAT */ nsIUGenCategory::kOther, |
|
97 /* UNASSIGNED */ nsIUGenCategory::kOther, |
|
98 /* PRIVATE_USE */ nsIUGenCategory::kOther, |
|
99 /* SURROGATE */ nsIUGenCategory::kOther, |
|
100 /* LOWERCASE_LETTER */ nsIUGenCategory::kLetter, |
|
101 /* MODIFIER_LETTER */ nsIUGenCategory::kLetter, |
|
102 /* OTHER_LETTER */ nsIUGenCategory::kLetter, |
|
103 /* TITLECASE_LETTER */ nsIUGenCategory::kLetter, |
|
104 /* UPPERCASE_LETTER */ nsIUGenCategory::kLetter, |
|
105 /* COMBINING_MARK */ nsIUGenCategory::kMark, |
|
106 /* ENCLOSING_MARK */ nsIUGenCategory::kMark, |
|
107 /* NON_SPACING_MARK */ nsIUGenCategory::kMark, |
|
108 /* DECIMAL_NUMBER */ nsIUGenCategory::kNumber, |
|
109 /* LETTER_NUMBER */ nsIUGenCategory::kNumber, |
|
110 /* OTHER_NUMBER */ nsIUGenCategory::kNumber, |
|
111 /* CONNECT_PUNCTUATION */ nsIUGenCategory::kPunctuation, |
|
112 /* DASH_PUNCTUATION */ nsIUGenCategory::kPunctuation, |
|
113 /* CLOSE_PUNCTUATION */ nsIUGenCategory::kPunctuation, |
|
114 /* FINAL_PUNCTUATION */ nsIUGenCategory::kPunctuation, |
|
115 /* INITIAL_PUNCTUATION */ nsIUGenCategory::kPunctuation, |
|
116 /* OTHER_PUNCTUATION */ nsIUGenCategory::kPunctuation, |
|
117 /* OPEN_PUNCTUATION */ nsIUGenCategory::kPunctuation, |
|
118 /* CURRENCY_SYMBOL */ nsIUGenCategory::kSymbol, |
|
119 /* MODIFIER_SYMBOL */ nsIUGenCategory::kSymbol, |
|
120 /* MATH_SYMBOL */ nsIUGenCategory::kSymbol, |
|
121 /* OTHER_SYMBOL */ nsIUGenCategory::kSymbol, |
|
122 /* LINE_SEPARATOR */ nsIUGenCategory::kSeparator, |
|
123 /* PARAGRAPH_SEPARATOR */ nsIUGenCategory::kSeparator, |
|
124 /* SPACE_SEPARATOR */ nsIUGenCategory::kSeparator |
|
125 }; |
|
126 |
|
127 uint32_t |
|
128 GetMirroredChar(uint32_t aCh) |
|
129 { |
|
130 return aCh + sMirrorOffsets[GetCharProps1(aCh).mMirrorOffsetIndex]; |
|
131 } |
|
132 |
|
133 uint32_t |
|
134 GetScriptTagForCode(int32_t aScriptCode) |
|
135 { |
|
136 // this will safely return 0 for negative script codes, too :) |
|
137 if (uint32_t(aScriptCode) > ArrayLength(sScriptCodeToTag)) { |
|
138 return 0; |
|
139 } |
|
140 return sScriptCodeToTag[aScriptCode]; |
|
141 } |
|
142 |
|
143 static inline uint32_t |
|
144 GetCaseMapValue(uint32_t aCh) |
|
145 { |
|
146 if (aCh < UNICODE_BMP_LIMIT) { |
|
147 return sCaseMapValues[sCaseMapPages[0][aCh >> kCaseMapCharBits]] |
|
148 [aCh & ((1 << kCaseMapCharBits) - 1)]; |
|
149 } |
|
150 if (aCh < (kCaseMapMaxPlane + 1) * 0x10000) { |
|
151 return sCaseMapValues[sCaseMapPages[sCaseMapPlanes[(aCh >> 16) - 1]] |
|
152 [(aCh & 0xffff) >> kCaseMapCharBits]] |
|
153 [aCh & ((1 << kCaseMapCharBits) - 1)]; |
|
154 } |
|
155 return 0; |
|
156 } |
|
157 |
|
158 uint32_t |
|
159 GetUppercase(uint32_t aCh) |
|
160 { |
|
161 uint32_t mapValue = GetCaseMapValue(aCh); |
|
162 if (mapValue & (kLowerToUpper | kTitleToUpper)) { |
|
163 return aCh ^ (mapValue & kCaseMapCharMask); |
|
164 } |
|
165 if (mapValue & kLowerToTitle) { |
|
166 return GetUppercase(aCh ^ (mapValue & kCaseMapCharMask)); |
|
167 } |
|
168 return aCh; |
|
169 } |
|
170 |
|
171 uint32_t |
|
172 GetLowercase(uint32_t aCh) |
|
173 { |
|
174 uint32_t mapValue = GetCaseMapValue(aCh); |
|
175 if (mapValue & kUpperToLower) { |
|
176 return aCh ^ (mapValue & kCaseMapCharMask); |
|
177 } |
|
178 if (mapValue & kTitleToUpper) { |
|
179 return GetLowercase(aCh ^ (mapValue & kCaseMapCharMask)); |
|
180 } |
|
181 return aCh; |
|
182 } |
|
183 |
|
184 uint32_t |
|
185 GetTitlecaseForLower(uint32_t aCh) |
|
186 { |
|
187 uint32_t mapValue = GetCaseMapValue(aCh); |
|
188 if (mapValue & (kLowerToTitle | kLowerToUpper)) { |
|
189 return aCh ^ (mapValue & kCaseMapCharMask); |
|
190 } |
|
191 return aCh; |
|
192 } |
|
193 |
|
194 uint32_t |
|
195 GetTitlecaseForAll(uint32_t aCh) |
|
196 { |
|
197 uint32_t mapValue = GetCaseMapValue(aCh); |
|
198 if (mapValue & (kLowerToTitle | kLowerToUpper)) { |
|
199 return aCh ^ (mapValue & kCaseMapCharMask); |
|
200 } |
|
201 if (mapValue & kUpperToLower) { |
|
202 return GetTitlecaseForLower(aCh ^ (mapValue & kCaseMapCharMask)); |
|
203 } |
|
204 return aCh; |
|
205 } |
|
206 |
|
207 HanVariantType |
|
208 GetHanVariant(uint32_t aCh) |
|
209 { |
|
210 // In the sHanVariantValues array, data for 4 successive characters |
|
211 // (2 bits each) is packed in to each uint8_t entry, with the value |
|
212 // for the lowest character stored in the least significant bits. |
|
213 uint8_t v = 0; |
|
214 if (aCh < UNICODE_BMP_LIMIT) { |
|
215 v = sHanVariantValues[sHanVariantPages[0][aCh >> kHanVariantCharBits]] |
|
216 [(aCh & ((1 << kHanVariantCharBits) - 1)) >> 2]; |
|
217 } else if (aCh < (kHanVariantMaxPlane + 1) * 0x10000) { |
|
218 v = sHanVariantValues[sHanVariantPages[sHanVariantPlanes[(aCh >> 16) - 1]] |
|
219 [(aCh & 0xffff) >> kHanVariantCharBits]] |
|
220 [(aCh & ((1 << kHanVariantCharBits) - 1)) >> 2]; |
|
221 } |
|
222 // extract the appropriate 2-bit field from the value |
|
223 return HanVariantType((v >> ((aCh & 3) * 2)) & 3); |
|
224 } |
|
225 |
|
226 uint32_t |
|
227 GetFullWidth(uint32_t aCh) |
|
228 { |
|
229 // full-width mappings only exist for BMP characters; all others are |
|
230 // returned unchanged |
|
231 if (aCh < UNICODE_BMP_LIMIT) { |
|
232 uint32_t v = |
|
233 sFullWidthValues[sFullWidthPages[aCh >> kFullWidthCharBits]] |
|
234 [aCh & ((1 << kFullWidthCharBits) - 1)]; |
|
235 if (v) { |
|
236 // return the mapped value if non-zero; else return original char |
|
237 return v; |
|
238 } |
|
239 } |
|
240 return aCh; |
|
241 } |
|
242 |
|
243 bool |
|
244 IsClusterExtender(uint32_t aCh, uint8_t aCategory) |
|
245 { |
|
246 return ((aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK && |
|
247 aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) || |
|
248 (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ |
|
249 (aCh >= 0xff9e && aCh <= 0xff9f)); // katakana sound marks |
|
250 } |
|
251 |
|
252 // TODO: replace this with a properties file or similar; |
|
253 // expect this to evolve as harfbuzz shaping support matures. |
|
254 // |
|
255 // The "shaping type" of each script run, as returned by this |
|
256 // function, is compared to the bits set in the |
|
257 // gfx.font_rendering.harfbuzz.scripts |
|
258 // preference to decide whether to use the harfbuzz shaper. |
|
259 // |
|
260 int32_t |
|
261 ScriptShapingType(int32_t aScriptCode) |
|
262 { |
|
263 switch (aScriptCode) { |
|
264 default: |
|
265 return SHAPING_DEFAULT; // scripts not explicitly listed here are |
|
266 // assumed to just use default shaping |
|
267 |
|
268 case MOZ_SCRIPT_ARABIC: |
|
269 case MOZ_SCRIPT_SYRIAC: |
|
270 case MOZ_SCRIPT_NKO: |
|
271 case MOZ_SCRIPT_MANDAIC: |
|
272 return SHAPING_ARABIC; // bidi scripts with Arabic-style shaping |
|
273 |
|
274 case MOZ_SCRIPT_HEBREW: |
|
275 return SHAPING_HEBREW; |
|
276 |
|
277 case MOZ_SCRIPT_HANGUL: |
|
278 return SHAPING_HANGUL; |
|
279 |
|
280 case MOZ_SCRIPT_MONGOLIAN: // to be supported by the Arabic shaper? |
|
281 return SHAPING_MONGOLIAN; |
|
282 |
|
283 case MOZ_SCRIPT_THAI: // no complex OT features, but MS engines like to do |
|
284 // sequence checking |
|
285 return SHAPING_THAI; |
|
286 |
|
287 case MOZ_SCRIPT_BENGALI: |
|
288 case MOZ_SCRIPT_DEVANAGARI: |
|
289 case MOZ_SCRIPT_GUJARATI: |
|
290 case MOZ_SCRIPT_GURMUKHI: |
|
291 case MOZ_SCRIPT_KANNADA: |
|
292 case MOZ_SCRIPT_MALAYALAM: |
|
293 case MOZ_SCRIPT_ORIYA: |
|
294 case MOZ_SCRIPT_SINHALA: |
|
295 case MOZ_SCRIPT_TAMIL: |
|
296 case MOZ_SCRIPT_TELUGU: |
|
297 case MOZ_SCRIPT_KHMER: |
|
298 case MOZ_SCRIPT_LAO: |
|
299 case MOZ_SCRIPT_TIBETAN: |
|
300 case MOZ_SCRIPT_NEW_TAI_LUE: |
|
301 case MOZ_SCRIPT_TAI_LE: |
|
302 case MOZ_SCRIPT_MYANMAR: |
|
303 case MOZ_SCRIPT_PHAGS_PA: |
|
304 case MOZ_SCRIPT_BATAK: |
|
305 case MOZ_SCRIPT_BRAHMI: |
|
306 return SHAPING_INDIC; // scripts that require Indic or other "special" shaping |
|
307 } |
|
308 } |
|
309 |
|
310 void |
|
311 ClusterIterator::Next() |
|
312 { |
|
313 if (AtEnd()) { |
|
314 NS_WARNING("ClusterIterator has already reached the end"); |
|
315 return; |
|
316 } |
|
317 |
|
318 uint32_t ch = *mPos++; |
|
319 |
|
320 if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit && |
|
321 NS_IS_LOW_SURROGATE(*mPos)) { |
|
322 ch = SURROGATE_TO_UCS4(ch, *mPos++); |
|
323 } else if ((ch & ~0xff) == 0x1100 || |
|
324 (ch >= 0xa960 && ch <= 0xa97f) || |
|
325 (ch >= 0xac00 && ch <= 0xd7ff)) { |
|
326 // Handle conjoining Jamo that make Hangul syllables |
|
327 HSType hangulState = GetHangulSyllableType(ch); |
|
328 while (mPos < mLimit) { |
|
329 ch = *mPos; |
|
330 HSType hangulType = GetHangulSyllableType(ch); |
|
331 switch (hangulType) { |
|
332 case HST_L: |
|
333 case HST_LV: |
|
334 case HST_LVT: |
|
335 if (hangulState == HST_L) { |
|
336 hangulState = hangulType; |
|
337 mPos++; |
|
338 continue; |
|
339 } |
|
340 break; |
|
341 case HST_V: |
|
342 if ((hangulState != HST_NONE) && !(hangulState & HST_T)) { |
|
343 hangulState = hangulType; |
|
344 mPos++; |
|
345 continue; |
|
346 } |
|
347 break; |
|
348 case HST_T: |
|
349 if (hangulState & (HST_V | HST_T)) { |
|
350 hangulState = hangulType; |
|
351 mPos++; |
|
352 continue; |
|
353 } |
|
354 break; |
|
355 default: |
|
356 break; |
|
357 } |
|
358 break; |
|
359 } |
|
360 } |
|
361 |
|
362 while (mPos < mLimit) { |
|
363 ch = *mPos; |
|
364 |
|
365 // Check for surrogate pairs; note that isolated surrogates will just |
|
366 // be treated as generic (non-cluster-extending) characters here, |
|
367 // which is fine for cluster-iterating purposes |
|
368 if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 && |
|
369 NS_IS_LOW_SURROGATE(*(mPos + 1))) { |
|
370 ch = SURROGATE_TO_UCS4(ch, *(mPos + 1)); |
|
371 } |
|
372 |
|
373 if (!IsClusterExtender(ch)) { |
|
374 break; |
|
375 } |
|
376 |
|
377 mPos++; |
|
378 if (!IS_IN_BMP(ch)) { |
|
379 mPos++; |
|
380 } |
|
381 } |
|
382 |
|
383 NS_ASSERTION(mText < mPos && mPos <= mLimit, |
|
384 "ClusterIterator::Next has overshot the string!"); |
|
385 } |
|
386 |
|
387 } // end namespace unicode |
|
388 |
|
389 } // end namespace mozilla |