|
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 |
|
6 #include "nsUnicodeRange.h" |
|
7 #include "nsGkAtoms.h" |
|
8 #include "mozilla/NullPtr.h" |
|
9 |
|
10 // This table depends on unicode range definitions. |
|
11 // Each item's index must correspond unicode range value |
|
12 // eg. x-cyrillic = LangGroupTable[kRangeCyrillic] |
|
13 static nsIAtom **gUnicodeRangeToLangGroupAtomTable[] = |
|
14 { |
|
15 &nsGkAtoms::x_cyrillic, |
|
16 &nsGkAtoms::el_, |
|
17 &nsGkAtoms::tr, |
|
18 &nsGkAtoms::he, |
|
19 &nsGkAtoms::ar, |
|
20 &nsGkAtoms::x_baltic, |
|
21 &nsGkAtoms::th, |
|
22 &nsGkAtoms::ko, |
|
23 &nsGkAtoms::Japanese, |
|
24 &nsGkAtoms::zh_cn, |
|
25 &nsGkAtoms::zh_tw, |
|
26 &nsGkAtoms::x_devanagari, |
|
27 &nsGkAtoms::x_tamil, |
|
28 &nsGkAtoms::x_armn, |
|
29 &nsGkAtoms::x_beng, |
|
30 &nsGkAtoms::x_cans, |
|
31 &nsGkAtoms::x_ethi, |
|
32 &nsGkAtoms::x_geor, |
|
33 &nsGkAtoms::x_gujr, |
|
34 &nsGkAtoms::x_guru, |
|
35 &nsGkAtoms::x_khmr, |
|
36 &nsGkAtoms::x_mlym, |
|
37 &nsGkAtoms::x_orya, |
|
38 &nsGkAtoms::x_telu, |
|
39 &nsGkAtoms::x_knda, |
|
40 &nsGkAtoms::x_sinh, |
|
41 &nsGkAtoms::x_tibt |
|
42 }; |
|
43 |
|
44 /********************************************************************** |
|
45 * Unicode subranges as defined in unicode 3.0 |
|
46 * x-western, x-central-euro, tr, x-baltic -> latin |
|
47 * 0000 - 036f |
|
48 * 1e00 - 1eff |
|
49 * 2000 - 206f (general punctuation) |
|
50 * 20a0 - 20cf (currency symbols) |
|
51 * 2100 - 214f (letterlike symbols) |
|
52 * 2150 - 218f (Number Forms) |
|
53 * el -> greek |
|
54 * 0370 - 03ff |
|
55 * 1f00 - 1fff |
|
56 * x-cyrillic -> cyrillic |
|
57 * 0400 - 04ff |
|
58 * he -> hebrew |
|
59 * 0590 - 05ff |
|
60 * ar -> arabic |
|
61 * 0600 - 06ff |
|
62 * fb50 - fdff (arabic presentation forms) |
|
63 * fe70 - feff (arabic presentation forms b) |
|
64 * th - thai |
|
65 * 0e00 - 0e7f |
|
66 * ko -> korean |
|
67 * ac00 - d7af (hangul Syllables) |
|
68 * 1100 - 11ff (jamo) |
|
69 * 3130 - 318f (hangul compatibility jamo) |
|
70 * ja |
|
71 * 3040 - 309f (hiragana) |
|
72 * 30a0 - 30ff (katakana) |
|
73 * zh-CN |
|
74 * zh-TW |
|
75 * |
|
76 * CJK |
|
77 * 3100 - 312f (bopomofo) |
|
78 * 31a0 - 31bf (bopomofo extended) |
|
79 * 3000 - 303f (CJK Symbols and Punctuation) |
|
80 * 2e80 - 2eff (CJK radicals supplement) |
|
81 * 2f00 - 2fdf (Kangxi Radicals) |
|
82 * 2ff0 - 2fff (Ideographic Description Characters) |
|
83 * 3190 - 319f (kanbun) |
|
84 * 3200 - 32ff (Enclosed CJK letters and Months) |
|
85 * 3300 - 33ff (CJK compatibility) |
|
86 * 3400 - 4dbf (CJK Unified Ideographs Extension A) |
|
87 * 4e00 - 9faf (CJK Unified Ideographs) |
|
88 * f900 - fa5f (CJK Compatibility Ideographs) |
|
89 * fe30 - fe4f (CJK compatibility Forms) |
|
90 * ff00 - ffef (halfwidth and fullwidth forms) |
|
91 * |
|
92 * Armenian |
|
93 * 0530 - 058f |
|
 *  Syriac
|
95 * 0700 - 074f |
|
96 * Thaana |
|
97 * 0780 - 07bf |
|
98 * Devanagari |
|
99 * 0900 - 097f |
|
100 * Bengali |
|
101 * 0980 - 09ff |
|
102 * Gurmukhi |
|
103 * 0a00 - 0a7f |
|
104 * Gujarati |
|
105 * 0a80 - 0aff |
|
106 * Oriya |
|
107 * 0b00 - 0b7f |
|
108 * Tamil |
|
109 * 0b80 - 0bff |
|
110 * Telugu |
|
111 * 0c00 - 0c7f |
|
112 * Kannada |
|
113 * 0c80 - 0cff |
|
114 * Malayalam |
|
115 * 0d00 - 0d7f |
|
116 * Sinhala |
|
117 * 0d80 - 0def |
|
118 * Lao |
|
119 * 0e80 - 0eff |
|
120 * Tibetan |
|
121 * 0f00 - 0fbf |
|
122 * Myanmar |
|
123 * 1000 - 109f |
|
124 * Georgian |
|
125 * 10a0 - 10ff |
|
126 * Ethiopic |
|
127 * 1200 - 137f |
|
128 * Cherokee |
|
129 * 13a0 - 13ff |
|
130 * Canadian Aboriginal Syllabics |
|
131 * 1400 - 167f |
|
132 * Ogham |
|
133 * 1680 - 169f |
|
134 * Runic |
|
135 * 16a0 - 16ff |
|
136 * Khmer |
|
137 * 1780 - 17ff |
|
138 * Mongolian |
|
139 * 1800 - 18af |
|
140 * Misc - superscripts and subscripts |
|
141 * 2070 - 209f |
|
142 * Misc - Combining Diacritical Marks for Symbols |
|
143 * 20d0 - 20ff |
|
144 * Misc - Arrows |
|
145 * 2190 - 21ff |
|
146 * Misc - Mathematical Operators |
|
147 * 2200 - 22ff |
|
148 * Misc - Miscellaneous Technical |
|
149 * 2300 - 23ff |
|
150 * Misc - Control picture |
|
151 * 2400 - 243f |
|
152 * Misc - Optical character recognition |
|
 *    2440 - 245f
|
 *  Misc - Enclosed Alphanumerics
|
155 * 2460 - 24ff |
|
156 * Misc - Box Drawing |
|
157 * 2500 - 257f |
|
158 * Misc - Block Elements |
|
159 * 2580 - 259f |
|
160 * Misc - Geometric Shapes |
|
161 * 25a0 - 25ff |
|
162 * Misc - Miscellaneous Symbols |
|
163 * 2600 - 267f |
|
164 * Misc - Dingbats |
|
165 * 2700 - 27bf |
|
166 * Misc - Braille Patterns |
|
167 * 2800 - 28ff |
|
168 * Yi Syllables |
|
169 * a000 - a48f |
|
170 * Yi radicals |
|
171 * a490 - a4cf |
|
172 * Alphabetic Presentation Forms |
|
173 * fb00 - fb4f |
|
174 * Misc - Combining half Marks |
|
175 * fe20 - fe2f |
|
176 * Misc - small form variants |
|
177 * fe50 - fe6f |
|
178 * Misc - Specials |
|
179 * fff0 - ffff |
|
180 *********************************************************************/ |
|
181 |
|
182 |
|
183 |
|
184 #define NUM_OF_SUBTABLES 10 |
|
185 #define SUBTABLE_SIZE 16 |
|
186 |
|
187 static const uint8_t gUnicodeSubrangeTable[NUM_OF_SUBTABLES][SUBTABLE_SIZE] = |
|
188 { |
|
189 { // table for X--- |
|
190 kRangeTableBase+1, //u0xxx |
|
191 kRangeTableBase+2, //u1xxx |
|
192 kRangeTableBase+3, //u2xxx |
|
193 kRangeSetCJK, //u3xxx |
|
194 kRangeSetCJK, //u4xxx |
|
195 kRangeSetCJK, //u5xxx |
|
196 kRangeSetCJK, //u6xxx |
|
197 kRangeSetCJK, //u7xxx |
|
198 kRangeSetCJK, //u8xxx |
|
199 kRangeSetCJK, //u9xxx |
|
200 kRangeTableBase+4, //uaxxx |
|
201 kRangeKorean, //ubxxx |
|
202 kRangeKorean, //ucxxx |
|
203 kRangeTableBase+5, //udxxx |
|
204 kRangePrivate, //uexxx |
|
205 kRangeTableBase+6 //ufxxx |
|
206 }, |
|
207 { //table for 0X-- |
|
208 kRangeSetLatin, //u00xx |
|
209 kRangeSetLatin, //u01xx |
|
210 kRangeSetLatin, //u02xx |
|
211 kRangeGreek, //u03xx XXX 0300-036f is in fact kRangeCombiningDiacriticalMarks |
|
212 kRangeCyrillic, //u04xx |
|
213 kRangeTableBase+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian |
|
214 kRangeArabic, //u06xx |
|
215 kRangeTertiaryTable, //u07xx |
|
216 kRangeUnassigned, //u08xx |
|
217 kRangeTertiaryTable, //u09xx |
|
218 kRangeTertiaryTable, //u0axx |
|
219 kRangeTertiaryTable, //u0bxx |
|
220 kRangeTertiaryTable, //u0cxx |
|
221 kRangeTertiaryTable, //u0dxx |
|
222 kRangeTertiaryTable, //u0exx |
|
223 kRangeTibetan //u0fxx |
|
224 }, |
|
225 { //table for 1x-- |
|
226 kRangeTertiaryTable, //u10xx |
|
227 kRangeKorean, //u11xx |
|
228 kRangeEthiopic, //u12xx |
|
229 kRangeTertiaryTable, //u13xx |
|
230 kRangeCanadian, //u14xx |
|
231 kRangeCanadian, //u15xx |
|
232 kRangeTertiaryTable, //u16xx |
|
233 kRangeKhmer, //u17xx |
|
234 kRangeMongolian, //u18xx |
|
235 kRangeUnassigned, //u19xx |
|
236 kRangeUnassigned, //u1axx |
|
237 kRangeUnassigned, //u1bxx |
|
238 kRangeUnassigned, //u1cxx |
|
239 kRangeUnassigned, //u1dxx |
|
240 kRangeSetLatin, //u1exx |
|
241 kRangeGreek //u1fxx |
|
242 }, |
|
243 { //table for 2x-- |
|
244 kRangeSetLatin, //u20xx |
|
245 kRangeSetLatin, //u21xx |
|
246 kRangeMathOperators, //u22xx |
|
247 kRangeMiscTechnical, //u23xx |
|
248 kRangeControlOpticalEnclose, //u24xx |
|
249 kRangeBoxBlockGeometrics, //u25xx |
|
250 kRangeMiscSymbols, //u26xx |
|
251 kRangeDingbats, //u27xx |
|
252 kRangeBraillePattern, //u28xx |
|
253 kRangeUnassigned, //u29xx |
|
254 kRangeUnassigned, //u2axx |
|
255 kRangeUnassigned, //u2bxx |
|
256 kRangeUnassigned, //u2cxx |
|
257 kRangeUnassigned, //u2dxx |
|
258 kRangeSetCJK, //u2exx |
|
259 kRangeSetCJK //u2fxx |
|
260 }, |
|
261 { //table for ax-- |
|
262 kRangeYi, //ua0xx |
|
263 kRangeYi, //ua1xx |
|
264 kRangeYi, //ua2xx |
|
265 kRangeYi, //ua3xx |
|
266 kRangeYi, //ua4xx |
|
267 kRangeUnassigned, //ua5xx |
|
268 kRangeUnassigned, //ua6xx |
|
269 kRangeUnassigned, //ua7xx |
|
270 kRangeUnassigned, //ua8xx |
|
271 kRangeUnassigned, //ua9xx |
|
272 kRangeUnassigned, //uaaxx |
|
273 kRangeUnassigned, //uabxx |
|
274 kRangeKorean, //uacxx |
|
275 kRangeKorean, //uadxx |
|
276 kRangeKorean, //uaexx |
|
277 kRangeKorean //uafxx |
|
278 }, |
|
279 { //table for dx-- |
|
280 kRangeKorean, //ud0xx |
|
281 kRangeKorean, //ud1xx |
|
282 kRangeKorean, //ud2xx |
|
283 kRangeKorean, //ud3xx |
|
284 kRangeKorean, //ud4xx |
|
285 kRangeKorean, //ud5xx |
|
286 kRangeKorean, //ud6xx |
|
287 kRangeKorean, //ud7xx |
|
288 kRangeSurrogate, //ud8xx |
|
289 kRangeSurrogate, //ud9xx |
|
290 kRangeSurrogate, //udaxx |
|
291 kRangeSurrogate, //udbxx |
|
292 kRangeSurrogate, //udcxx |
|
293 kRangeSurrogate, //uddxx |
|
294 kRangeSurrogate, //udexx |
|
295 kRangeSurrogate //udfxx |
|
296 }, |
|
297 { // table for fx-- |
|
298 kRangePrivate, //uf0xx |
|
299 kRangePrivate, //uf1xx |
|
300 kRangePrivate, //uf2xx |
|
301 kRangePrivate, //uf3xx |
|
302 kRangePrivate, //uf4xx |
|
303 kRangePrivate, //uf5xx |
|
304 kRangePrivate, //uf6xx |
|
305 kRangePrivate, //uf7xx |
|
306 kRangePrivate, //uf8xx |
|
307 kRangeSetCJK, //uf9xx |
|
308 kRangeSetCJK, //ufaxx |
|
309 kRangeArabic, //ufbxx, includes alphabic presentation form |
|
310 kRangeArabic, //ufcxx |
|
311 kRangeArabic, //ufdxx |
|
312 kRangeTableBase+8, //ufexx |
|
313 kRangeTableBase+9 //uffxx, halfwidth and fullwidth forms, includes Specials |
|
314 }, |
|
315 { //table for 0x0500 - 0x05ff |
|
316 kRangeCyrillic, //u050x |
|
317 kRangeCyrillic, //u051x |
|
318 kRangeCyrillic, //u052x |
|
319 kRangeArmenian, //u053x |
|
320 kRangeArmenian, //u054x |
|
321 kRangeArmenian, //u055x |
|
322 kRangeArmenian, //u056x |
|
323 kRangeArmenian, //u057x |
|
324 kRangeArmenian, //u058x |
|
325 kRangeHebrew, //u059x |
|
326 kRangeHebrew, //u05ax |
|
327 kRangeHebrew, //u05bx |
|
328 kRangeHebrew, //u05cx |
|
329 kRangeHebrew, //u05dx |
|
330 kRangeHebrew, //u05ex |
|
331 kRangeHebrew //u05fx |
|
332 }, |
|
333 { //table for 0xfe00 - 0xfeff |
|
334 kRangeSetCJK, //ufe0x |
|
335 kRangeSetCJK, //ufe1x |
|
336 kRangeSetCJK, //ufe2x |
|
337 kRangeSetCJK, //ufe3x |
|
338 kRangeSetCJK, //ufe4x |
|
339 kRangeSetCJK, //ufe5x |
|
340 kRangeSetCJK, //ufe6x |
|
341 kRangeArabic, //ufe7x |
|
342 kRangeArabic, //ufe8x |
|
343 kRangeArabic, //ufe9x |
|
344 kRangeArabic, //ufeax |
|
345 kRangeArabic, //ufebx |
|
346 kRangeArabic, //ufecx |
|
347 kRangeArabic, //ufedx |
|
348 kRangeArabic, //ufeex |
|
349 kRangeArabic //ufefx |
|
350 }, |
|
351 { //table for 0xff00 - 0xffff |
|
352 kRangeSetCJK, //uff0x, fullwidth latin |
|
353 kRangeSetCJK, //uff1x, fullwidth latin |
|
354 kRangeSetCJK, //uff2x, fullwidth latin |
|
355 kRangeSetCJK, //uff3x, fullwidth latin |
|
356 kRangeSetCJK, //uff4x, fullwidth latin |
|
357 kRangeSetCJK, //uff5x, fullwidth latin |
|
358 kRangeSetCJK, //uff6x, halfwidth katakana |
|
359 kRangeSetCJK, //uff7x, halfwidth katakana |
|
360 kRangeSetCJK, //uff8x, halfwidth katakana |
|
361 kRangeSetCJK, //uff9x, halfwidth katakana |
|
362 kRangeSetCJK, //uffax, halfwidth hangul jamo |
|
363 kRangeSetCJK, //uffbx, halfwidth hangul jamo |
|
364 kRangeSetCJK, //uffcx, halfwidth hangul jamo |
|
365 kRangeSetCJK, //uffdx, halfwidth hangul jamo |
|
366 kRangeSetCJK, //uffex, fullwidth symbols |
|
367 kRangeSpecials, //ufffx, Specials |
|
368 }, |
|
369 }; |
|
370 |
|
371 // Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80) |
|
372 // code points so that the number of entries in the tertiary range |
|
373 // table for that range is obtained by dividing (0x1700 - 0x0700) by 128. |
|
374 // Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal |
|
375 // syllabaries take multiple chunks and Ogham and Runic share a single chunk. |
|
376 #define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80) |
|
377 |
|
378 static const uint8_t gUnicodeTertiaryRangeTable[TERTIARY_TABLE_SIZE] = |
|
379 { //table for 0x0700 - 0x1600 |
|
380 kRangeSyriac, //u070x |
|
381 kRangeThaana, //u078x |
|
382 kRangeUnassigned, //u080x place holder(resolved in the 2ndary tab.) |
|
383 kRangeUnassigned, //u088x place holder(resolved in the 2ndary tab.) |
|
384 kRangeDevanagari, //u090x |
|
385 kRangeBengali, //u098x |
|
386 kRangeGurmukhi, //u0a0x |
|
387 kRangeGujarati, //u0a8x |
|
388 kRangeOriya, //u0b0x |
|
389 kRangeTamil, //u0b8x |
|
390 kRangeTelugu, //u0c0x |
|
391 kRangeKannada, //u0c8x |
|
392 kRangeMalayalam, //u0d0x |
|
393 kRangeSinhala, //u0d8x |
|
394 kRangeThai, //u0e0x |
|
395 kRangeLao, //u0e8x |
|
396 kRangeTibetan, //u0f0x place holder(resolved in the 2ndary tab.) |
|
397 kRangeTibetan, //u0f8x place holder(resolved in the 2ndary tab.) |
|
398 kRangeMyanmar, //u100x |
|
399 kRangeGeorgian, //u108x |
|
400 kRangeKorean, //u110x place holder(resolved in the 2ndary tab.) |
|
401 kRangeKorean, //u118x place holder(resolved in the 2ndary tab.) |
|
402 kRangeEthiopic, //u120x place holder(resolved in the 2ndary tab.) |
|
403 kRangeEthiopic, //u128x place holder(resolved in the 2ndary tab.) |
|
404 kRangeEthiopic, //u130x |
|
405 kRangeCherokee, //u138x |
|
406 kRangeCanadian, //u140x place holder(resolved in the 2ndary tab.) |
|
407 kRangeCanadian, //u148x place holder(resolved in the 2ndary tab.) |
|
408 kRangeCanadian, //u150x place holder(resolved in the 2ndary tab.) |
|
409 kRangeCanadian, //u158x place holder(resolved in the 2ndary tab.) |
|
410 kRangeCanadian, //u160x |
|
411 kRangeOghamRunic //u168x this contains two scripts, Ogham & Runic |
|
412 }; |
|
413 |
|
414 // A two level index is almost enough for locating a range, with the |
|
415 // exception of u03xx and u05xx. Since we don't really care about range for |
|
416 // combining diacritical marks in our font application, they are |
|
417 // not discriminated further. But future adoption of this module for other use |
|
418 // should be aware of this limitation. The implementation can be extended if |
|
419 // there is such a need. |
|
420 // For Indic, Southeast Asian scripts and some other scripts between |
|
421 // U+0700 and U+16FF, it's extended to the third level. |
|
422 uint32_t FindCharUnicodeRange(uint32_t ch) |
|
423 { |
|
424 uint32_t range; |
|
425 |
|
426 // aggregate ranges for non-BMP codepoints |
|
427 if (ch > 0xFFFF) { |
|
428 uint32_t p = (ch >> 16); |
|
429 if (p == 1) { |
|
430 return kRangeSMP; |
|
431 } else if (p == 2) { |
|
432 return kRangeSetCJK; |
|
433 } |
|
434 return kRangeHigherPlanes; |
|
435 } |
|
436 |
|
437 // lookup explicit range for BMP codepoints |
|
438 // first general range |
|
439 range = gUnicodeSubrangeTable[0][ch >> 12]; |
|
440 |
|
441 // if general range is good enough, return that |
|
442 if (range < kRangeTableBase) |
|
443 // we try to get a specific range |
|
444 return range; |
|
445 |
|
446 // otherwise, use subrange tables |
|
447 range = gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x0f00) >> 8]; |
|
448 if (range < kRangeTableBase) |
|
449 return range; |
|
450 if (range < kRangeTertiaryTable) |
|
451 return gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x00f0) >> 4]; |
|
452 |
|
453 // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks |
|
454 return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7]; |
|
455 } |
|
456 |
|
457 nsIAtom *LangGroupFromUnicodeRange(uint8_t unicodeRange) |
|
458 { |
|
459 if (kRangeSpecificItemNum > unicodeRange) { |
|
460 nsIAtom **atom = gUnicodeRangeToLangGroupAtomTable[unicodeRange]; |
|
461 return *atom; |
|
462 } |
|
463 return nullptr; |
|
464 } |