gfx/thebes/nsUnicodeRange.cpp

changeset 0
6474c204b198
equal deleted inserted replaced
-1:000000000000 0:b90b10348ab7
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6 #include "nsUnicodeRange.h"
7 #include "nsGkAtoms.h"
8 #include "mozilla/NullPtr.h"
9
10 // This table depends on unicode range definitions.
11 // Each item's index must correspond to its unicode range value
12 // eg. x-cyrillic = LangGroupTable[kRangeCyrillic]
// Lookup table consumed by LangGroupFromUnicodeRange(): the entry at index i
// is the address of the nsGkAtoms atom pointer that is dereferenced and
// returned for unicode range value i.
// The table is indexed directly by the range value, so the order of the
// entries is significant; the bracketed number on each line is its index.
13 static nsIAtom **gUnicodeRangeToLangGroupAtomTable[] =
14 {
15 &nsGkAtoms::x_cyrillic, // [0]
16 &nsGkAtoms::el_, // [1]
17 &nsGkAtoms::tr, // [2]
18 &nsGkAtoms::he, // [3]
19 &nsGkAtoms::ar, // [4]
20 &nsGkAtoms::x_baltic, // [5]
21 &nsGkAtoms::th, // [6]
22 &nsGkAtoms::ko, // [7]
23 &nsGkAtoms::Japanese, // [8]
24 &nsGkAtoms::zh_cn, // [9]
25 &nsGkAtoms::zh_tw, // [10]
26 &nsGkAtoms::x_devanagari, // [11]
27 &nsGkAtoms::x_tamil, // [12]
28 &nsGkAtoms::x_armn, // [13]
29 &nsGkAtoms::x_beng, // [14]
30 &nsGkAtoms::x_cans, // [15]
31 &nsGkAtoms::x_ethi, // [16]
32 &nsGkAtoms::x_geor, // [17]
33 &nsGkAtoms::x_gujr, // [18]
34 &nsGkAtoms::x_guru, // [19]
35 &nsGkAtoms::x_khmr, // [20]
36 &nsGkAtoms::x_mlym, // [21]
37 &nsGkAtoms::x_orya, // [22]
38 &nsGkAtoms::x_telu, // [23]
39 &nsGkAtoms::x_knda, // [24]
40 &nsGkAtoms::x_sinh, // [25]
41 &nsGkAtoms::x_tibt // [26] last entry: the table holds 27 atoms
42 };
43
44 /**********************************************************************
45 * Unicode subranges as defined in unicode 3.0
46 * x-western, x-central-euro, tr, x-baltic -> latin
47 * 0000 - 036f
48 * 1e00 - 1eff
49 * 2000 - 206f (general punctuation)
50 * 20a0 - 20cf (currency symbols)
51 * 2100 - 214f (letterlike symbols)
52 * 2150 - 218f (Number Forms)
53 * el -> greek
54 * 0370 - 03ff
55 * 1f00 - 1fff
56 * x-cyrillic -> cyrillic
57 * 0400 - 04ff
58 * he -> hebrew
59 * 0590 - 05ff
60 * ar -> arabic
61 * 0600 - 06ff
62 * fb50 - fdff (arabic presentation forms)
63 * fe70 - feff (arabic presentation forms b)
64 * th -> thai
65 * 0e00 - 0e7f
66 * ko -> korean
67 * ac00 - d7af (hangul Syllables)
68 * 1100 - 11ff (jamo)
69 * 3130 - 318f (hangul compatibility jamo)
70 * ja
71 * 3040 - 309f (hiragana)
72 * 30a0 - 30ff (katakana)
73 * zh-CN
74 * zh-TW
75 *
76 * CJK
77 * 3100 - 312f (bopomofo)
78 * 31a0 - 31bf (bopomofo extended)
79 * 3000 - 303f (CJK Symbols and Punctuation)
80 * 2e80 - 2eff (CJK radicals supplement)
81 * 2f00 - 2fdf (Kangxi Radicals)
82 * 2ff0 - 2fff (Ideographic Description Characters)
83 * 3190 - 319f (kanbun)
84 * 3200 - 32ff (Enclosed CJK letters and Months)
85 * 3300 - 33ff (CJK compatibility)
86 * 3400 - 4dbf (CJK Unified Ideographs Extension A)
87 * 4e00 - 9faf (CJK Unified Ideographs)
88 * f900 - fa5f (CJK Compatibility Ideographs)
89 * fe30 - fe4f (CJK compatibility Forms)
90 * ff00 - ffef (halfwidth and fullwidth forms)
91 *
92 * Armenian
93 * 0530 - 058f
94 * Syriac
95 * 0700 - 074f
96 * Thaana
97 * 0780 - 07bf
98 * Devanagari
99 * 0900 - 097f
100 * Bengali
101 * 0980 - 09ff
102 * Gurmukhi
103 * 0a00 - 0a7f
104 * Gujarati
105 * 0a80 - 0aff
106 * Oriya
107 * 0b00 - 0b7f
108 * Tamil
109 * 0b80 - 0bff
110 * Telugu
111 * 0c00 - 0c7f
112 * Kannada
113 * 0c80 - 0cff
114 * Malayalam
115 * 0d00 - 0d7f
116 * Sinhala
117 * 0d80 - 0def
118 * Lao
119 * 0e80 - 0eff
120 * Tibetan
121 * 0f00 - 0fbf
122 * Myanmar
123 * 1000 - 109f
124 * Georgian
125 * 10a0 - 10ff
126 * Ethiopic
127 * 1200 - 137f
128 * Cherokee
129 * 13a0 - 13ff
130 * Canadian Aboriginal Syllabics
131 * 1400 - 167f
132 * Ogham
133 * 1680 - 169f
134 * Runic
135 * 16a0 - 16ff
136 * Khmer
137 * 1780 - 17ff
138 * Mongolian
139 * 1800 - 18af
140 * Misc - superscripts and subscripts
141 * 2070 - 209f
142 * Misc - Combining Diacritical Marks for Symbols
143 * 20d0 - 20ff
144 * Misc - Arrows
145 * 2190 - 21ff
146 * Misc - Mathematical Operators
147 * 2200 - 22ff
148 * Misc - Miscellaneous Technical
149 * 2300 - 23ff
150 * Misc - Control picture
151 * 2400 - 243f
152 * Misc - Optical character recognition
153 * 2440 - 245f
154 * Misc - Enclosed Alphanumerics
155 * 2460 - 24ff
156 * Misc - Box Drawing
157 * 2500 - 257f
158 * Misc - Block Elements
159 * 2580 - 259f
160 * Misc - Geometric Shapes
161 * 25a0 - 25ff
162 * Misc - Miscellaneous Symbols
163 * 2600 - 267f
164 * Misc - Dingbats
165 * 2700 - 27bf
166 * Misc - Braille Patterns
167 * 2800 - 28ff
168 * Yi Syllables
169 * a000 - a48f
170 * Yi radicals
171 * a490 - a4cf
172 * Alphabetic Presentation Forms
173 * fb00 - fb4f
174 * Misc - Combining half Marks
175 * fe20 - fe2f
176 * Misc - small form variants
177 * fe50 - fe6f
178 * Misc - Specials
179 * fff0 - ffff
180 *********************************************************************/
181
182
183
// Multi-level lookup table used by FindCharUnicodeRange() for BMP code
// points. Every row has one entry per hex digit value (0x0 - 0xf).
//
//  * Row 0 is indexed by the most significant hex digit (ch >> 12).
//  * An entry below kRangeTableBase is a final range value.
//  * An entry of the form kRangeTableBase+k found in row 0 redirects to row k,
//    which is indexed by the second hex digit ((ch & 0x0f00) >> 8).
//  * In such a second-level row, an entry of the form kRangeTableBase+k
//    (below kRangeTertiaryTable) redirects to row k, which is indexed by the
//    third hex digit ((ch & 0x00f0) >> 4); an entry of kRangeTertiaryTable
//    sends the lookup to gUnicodeTertiaryRangeTable instead.
//
// Row order therefore matters: row k must be the table that the
// kRangeTableBase+k entries refer to.

// Number of rows in gUnicodeSubrangeTable.
184 #define NUM_OF_SUBTABLES 10
// Entries per row: one for each value of a hex digit.
185 #define SUBTABLE_SIZE 16
186
187 static const uint8_t gUnicodeSubrangeTable[NUM_OF_SUBTABLES][SUBTABLE_SIZE] =
188 {
189 { // table for X--- (row 0: indexed by the first hex digit)
190 kRangeTableBase+1, //u0xxx
191 kRangeTableBase+2, //u1xxx
192 kRangeTableBase+3, //u2xxx
193 kRangeSetCJK, //u3xxx
194 kRangeSetCJK, //u4xxx
195 kRangeSetCJK, //u5xxx
196 kRangeSetCJK, //u6xxx
197 kRangeSetCJK, //u7xxx
198 kRangeSetCJK, //u8xxx
199 kRangeSetCJK, //u9xxx
200 kRangeTableBase+4, //uaxxx
201 kRangeKorean, //ubxxx
202 kRangeKorean, //ucxxx
203 kRangeTableBase+5, //udxxx
204 kRangePrivate, //uexxx
205 kRangeTableBase+6 //ufxxx
206 },
207 { //table for 0X-- (row 1)
208 kRangeSetLatin, //u00xx
209 kRangeSetLatin, //u01xx
210 kRangeSetLatin, //u02xx
211 kRangeGreek, //u03xx XXX 0300-036f is in fact kRangeCombiningDiacriticalMarks
212 kRangeCyrillic, //u04xx
213 kRangeTableBase+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
214 kRangeArabic, //u06xx
215 kRangeTertiaryTable, //u07xx
216 kRangeUnassigned, //u08xx
217 kRangeTertiaryTable, //u09xx
218 kRangeTertiaryTable, //u0axx
219 kRangeTertiaryTable, //u0bxx
220 kRangeTertiaryTable, //u0cxx
221 kRangeTertiaryTable, //u0dxx
222 kRangeTertiaryTable, //u0exx
223 kRangeTibetan //u0fxx
224 },
225 { //table for 1x-- (row 2)
226 kRangeTertiaryTable, //u10xx
227 kRangeKorean, //u11xx
228 kRangeEthiopic, //u12xx
229 kRangeTertiaryTable, //u13xx
230 kRangeCanadian, //u14xx
231 kRangeCanadian, //u15xx
232 kRangeTertiaryTable, //u16xx
233 kRangeKhmer, //u17xx
234 kRangeMongolian, //u18xx
235 kRangeUnassigned, //u19xx
236 kRangeUnassigned, //u1axx
237 kRangeUnassigned, //u1bxx
238 kRangeUnassigned, //u1cxx
239 kRangeUnassigned, //u1dxx
240 kRangeSetLatin, //u1exx
241 kRangeGreek //u1fxx
242 },
243 { //table for 2x-- (row 3)
244 kRangeSetLatin, //u20xx
245 kRangeSetLatin, //u21xx
246 kRangeMathOperators, //u22xx
247 kRangeMiscTechnical, //u23xx
248 kRangeControlOpticalEnclose, //u24xx
249 kRangeBoxBlockGeometrics, //u25xx
250 kRangeMiscSymbols, //u26xx
251 kRangeDingbats, //u27xx
252 kRangeBraillePattern, //u28xx
253 kRangeUnassigned, //u29xx
254 kRangeUnassigned, //u2axx
255 kRangeUnassigned, //u2bxx
256 kRangeUnassigned, //u2cxx
257 kRangeUnassigned, //u2dxx
258 kRangeSetCJK, //u2exx
259 kRangeSetCJK //u2fxx
260 },
261 { //table for ax-- (row 4)
262 kRangeYi, //ua0xx
263 kRangeYi, //ua1xx
264 kRangeYi, //ua2xx
265 kRangeYi, //ua3xx
266 kRangeYi, //ua4xx
267 kRangeUnassigned, //ua5xx
268 kRangeUnassigned, //ua6xx
269 kRangeUnassigned, //ua7xx
270 kRangeUnassigned, //ua8xx
271 kRangeUnassigned, //ua9xx
272 kRangeUnassigned, //uaaxx
273 kRangeUnassigned, //uabxx
274 kRangeKorean, //uacxx
275 kRangeKorean, //uadxx
276 kRangeKorean, //uaexx
277 kRangeKorean //uafxx
278 },
279 { //table for dx-- (row 5)
280 kRangeKorean, //ud0xx
281 kRangeKorean, //ud1xx
282 kRangeKorean, //ud2xx
283 kRangeKorean, //ud3xx
284 kRangeKorean, //ud4xx
285 kRangeKorean, //ud5xx
286 kRangeKorean, //ud6xx
287 kRangeKorean, //ud7xx
288 kRangeSurrogate, //ud8xx
289 kRangeSurrogate, //ud9xx
290 kRangeSurrogate, //udaxx
291 kRangeSurrogate, //udbxx
292 kRangeSurrogate, //udcxx
293 kRangeSurrogate, //uddxx
294 kRangeSurrogate, //udexx
295 kRangeSurrogate //udfxx
296 },
297 { // table for fx-- (row 6)
298 kRangePrivate, //uf0xx
299 kRangePrivate, //uf1xx
300 kRangePrivate, //uf2xx
301 kRangePrivate, //uf3xx
302 kRangePrivate, //uf4xx
303 kRangePrivate, //uf5xx
304 kRangePrivate, //uf6xx
305 kRangePrivate, //uf7xx
306 kRangePrivate, //uf8xx
307 kRangeSetCJK, //uf9xx
308 kRangeSetCJK, //ufaxx
309 kRangeArabic, //ufbxx, includes alphabetic presentation forms
310 kRangeArabic, //ufcxx
311 kRangeArabic, //ufdxx
312 kRangeTableBase+8, //ufexx
313 kRangeTableBase+9 //uffxx, halfwidth and fullwidth forms, includes Specials
314 },
315 { //table for 0x0500 - 0x05ff (row 7: indexed by the third hex digit)
316 kRangeCyrillic, //u050x
317 kRangeCyrillic, //u051x
318 kRangeCyrillic, //u052x
319 kRangeArmenian, //u053x
320 kRangeArmenian, //u054x
321 kRangeArmenian, //u055x
322 kRangeArmenian, //u056x
323 kRangeArmenian, //u057x
324 kRangeArmenian, //u058x
325 kRangeHebrew, //u059x
326 kRangeHebrew, //u05ax
327 kRangeHebrew, //u05bx
328 kRangeHebrew, //u05cx
329 kRangeHebrew, //u05dx
330 kRangeHebrew, //u05ex
331 kRangeHebrew //u05fx
332 },
333 { //table for 0xfe00 - 0xfeff (row 8: indexed by the third hex digit)
334 kRangeSetCJK, //ufe0x
335 kRangeSetCJK, //ufe1x
336 kRangeSetCJK, //ufe2x
337 kRangeSetCJK, //ufe3x
338 kRangeSetCJK, //ufe4x
339 kRangeSetCJK, //ufe5x
340 kRangeSetCJK, //ufe6x
341 kRangeArabic, //ufe7x
342 kRangeArabic, //ufe8x
343 kRangeArabic, //ufe9x
344 kRangeArabic, //ufeax
345 kRangeArabic, //ufebx
346 kRangeArabic, //ufecx
347 kRangeArabic, //ufedx
348 kRangeArabic, //ufeex
349 kRangeArabic //ufefx
350 },
351 { //table for 0xff00 - 0xffff (row 9: indexed by the third hex digit)
352 kRangeSetCJK, //uff0x, fullwidth latin
353 kRangeSetCJK, //uff1x, fullwidth latin
354 kRangeSetCJK, //uff2x, fullwidth latin
355 kRangeSetCJK, //uff3x, fullwidth latin
356 kRangeSetCJK, //uff4x, fullwidth latin
357 kRangeSetCJK, //uff5x, fullwidth latin
358 kRangeSetCJK, //uff6x, halfwidth katakana
359 kRangeSetCJK, //uff7x, halfwidth katakana
360 kRangeSetCJK, //uff8x, halfwidth katakana
361 kRangeSetCJK, //uff9x, halfwidth katakana
362 kRangeSetCJK, //uffax, halfwidth hangul jamo
363 kRangeSetCJK, //uffbx, halfwidth hangul jamo
364 kRangeSetCJK, //uffcx, halfwidth hangul jamo
365 kRangeSetCJK, //uffdx, halfwidth hangul jamo
366 kRangeSetCJK, //uffex, fullwidth symbols
367 kRangeSpecials, //ufffx, Specials
368 },
369 };
370
371 // Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80)
372 // code points so that the number of entries in the tertiary range
373 // table for that range is obtained by dividing (0x1700 - 0x0700) by 128.
374 // Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal
375 // syllabaries take multiple chunks and Ogham and Runic share a single chunk.
// Number of 128-code-point (0x80) chunks between U+0700 and U+16FF, i.e. 32.
376 #define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80)
377
// Third-level table used by FindCharUnicodeRange() when the second-level
// lookup in gUnicodeSubrangeTable does not yield a value below
// kRangeTertiaryTable. It is indexed by (ch - 0x0700) >> 7, so each entry
// covers one 128-code-point chunk; the trailing comment on each entry gives
// the first code point of its chunk.
// Entries marked "place holder" belong to chunks whose row in
// gUnicodeSubrangeTable names a range directly instead of
// kRangeTertiaryTable; they keep the remaining entries at the right index.
378 static const uint8_t gUnicodeTertiaryRangeTable[TERTIARY_TABLE_SIZE] =
379 { //table for 0x0700 - 0x16ff
380 kRangeSyriac, //u070x
381 kRangeThaana, //u078x
382 kRangeUnassigned, //u080x place holder(resolved in the 2ndary tab.)
383 kRangeUnassigned, //u088x place holder(resolved in the 2ndary tab.)
384 kRangeDevanagari, //u090x
385 kRangeBengali, //u098x
386 kRangeGurmukhi, //u0a0x
387 kRangeGujarati, //u0a8x
388 kRangeOriya, //u0b0x
389 kRangeTamil, //u0b8x
390 kRangeTelugu, //u0c0x
391 kRangeKannada, //u0c8x
392 kRangeMalayalam, //u0d0x
393 kRangeSinhala, //u0d8x
394 kRangeThai, //u0e0x
395 kRangeLao, //u0e8x
396 kRangeTibetan, //u0f0x place holder(resolved in the 2ndary tab.)
397 kRangeTibetan, //u0f8x place holder(resolved in the 2ndary tab.)
398 kRangeMyanmar, //u100x
399 kRangeGeorgian, //u108x
400 kRangeKorean, //u110x place holder(resolved in the 2ndary tab.)
401 kRangeKorean, //u118x place holder(resolved in the 2ndary tab.)
402 kRangeEthiopic, //u120x place holder(resolved in the 2ndary tab.)
403 kRangeEthiopic, //u128x place holder(resolved in the 2ndary tab.)
404 kRangeEthiopic, //u130x
405 kRangeCherokee, //u138x
406 kRangeCanadian, //u140x place holder(resolved in the 2ndary tab.)
407 kRangeCanadian, //u148x place holder(resolved in the 2ndary tab.)
408 kRangeCanadian, //u150x place holder(resolved in the 2ndary tab.)
409 kRangeCanadian, //u158x place holder(resolved in the 2ndary tab.)
410 kRangeCanadian, //u160x
411 kRangeOghamRunic //u168x this contains two scripts, Ogham & Runic
412 };
413
414 // A two level index is almost enough for locating a range, with the
415 // exception of u03xx and u05xx. Since we don't really care about range for
416 // combining diacritical marks in our font application, they are
417 // not discriminated further. But future adoption of this module for other use
418 // should be aware of this limitation. The implementation can be extended if
419 // there is such a need.
420 // For Indic, Southeast Asian scripts and some other scripts between
421 // U+0700 and U+16FF, it's extended to the third level.
422 uint32_t FindCharUnicodeRange(uint32_t ch)
423 {
424 uint32_t range;
425
426 // aggregate ranges for non-BMP codepoints
427 if (ch > 0xFFFF) {
428 uint32_t p = (ch >> 16);
429 if (p == 1) {
430 return kRangeSMP;
431 } else if (p == 2) {
432 return kRangeSetCJK;
433 }
434 return kRangeHigherPlanes;
435 }
436
437 // lookup explicit range for BMP codepoints
438 // first general range
439 range = gUnicodeSubrangeTable[0][ch >> 12];
440
441 // if general range is good enough, return that
442 if (range < kRangeTableBase)
443 // we try to get a specific range
444 return range;
445
446 // otherwise, use subrange tables
447 range = gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x0f00) >> 8];
448 if (range < kRangeTableBase)
449 return range;
450 if (range < kRangeTertiaryTable)
451 return gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x00f0) >> 4];
452
453 // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks
454 return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7];
455 }
456
457 nsIAtom *LangGroupFromUnicodeRange(uint8_t unicodeRange)
458 {
459 if (kRangeSpecificItemNum > unicodeRange) {
460 nsIAtom **atom = gUnicodeRangeToLangGroupAtomTable[unicodeRange];
461 return *atom;
462 }
463 return nullptr;
464 }

mercurial