1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/thebes/nsUnicodeRange.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,464 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include "nsUnicodeRange.h" 1.10 +#include "nsGkAtoms.h" 1.11 +#include "mozilla/NullPtr.h" 1.12 + 1.13 +// This table depends on unicode range definitions. 1.14 +// Each item's index must correspond unicode range value 1.15 +// eg. x-cyrillic = LangGroupTable[kRangeCyrillic] 1.16 +static nsIAtom **gUnicodeRangeToLangGroupAtomTable[] = 1.17 +{ 1.18 + &nsGkAtoms::x_cyrillic, 1.19 + &nsGkAtoms::el_, 1.20 + &nsGkAtoms::tr, 1.21 + &nsGkAtoms::he, 1.22 + &nsGkAtoms::ar, 1.23 + &nsGkAtoms::x_baltic, 1.24 + &nsGkAtoms::th, 1.25 + &nsGkAtoms::ko, 1.26 + &nsGkAtoms::Japanese, 1.27 + &nsGkAtoms::zh_cn, 1.28 + &nsGkAtoms::zh_tw, 1.29 + &nsGkAtoms::x_devanagari, 1.30 + &nsGkAtoms::x_tamil, 1.31 + &nsGkAtoms::x_armn, 1.32 + &nsGkAtoms::x_beng, 1.33 + &nsGkAtoms::x_cans, 1.34 + &nsGkAtoms::x_ethi, 1.35 + &nsGkAtoms::x_geor, 1.36 + &nsGkAtoms::x_gujr, 1.37 + &nsGkAtoms::x_guru, 1.38 + &nsGkAtoms::x_khmr, 1.39 + &nsGkAtoms::x_mlym, 1.40 + &nsGkAtoms::x_orya, 1.41 + &nsGkAtoms::x_telu, 1.42 + &nsGkAtoms::x_knda, 1.43 + &nsGkAtoms::x_sinh, 1.44 + &nsGkAtoms::x_tibt 1.45 +}; 1.46 + 1.47 +/********************************************************************** 1.48 + * Unicode subranges as defined in unicode 3.0 1.49 + * x-western, x-central-euro, tr, x-baltic -> latin 1.50 + * 0000 - 036f 1.51 + * 1e00 - 1eff 1.52 + * 2000 - 206f (general punctuation) 1.53 + * 20a0 - 20cf (currency symbols) 1.54 + * 2100 - 214f (letterlike symbols) 1.55 + * 2150 - 218f (Number Forms) 1.56 + * el -> greek 1.57 + * 0370 - 03ff 1.58 + * 1f00 - 1fff 1.59 + * x-cyrillic -> cyrillic 1.60 + * 0400 - 04ff 1.61 + * he -> hebrew 1.62 + * 0590 - 05ff 1.63 + * ar -> arabic 1.64 + * 0600 - 06ff 1.65 + * fb50 - fdff (arabic presentation forms) 1.66 + * fe70 - feff (arabic presentation forms b) 1.67 + * th - thai 1.68 + * 0e00 - 0e7f 1.69 + * ko -> korean 1.70 + * ac00 - d7af (hangul Syllables) 1.71 + * 1100 - 11ff (jamo) 1.72 + * 3130 - 318f (hangul compatibility jamo) 1.73 + * ja 1.74 + * 3040 - 309f (hiragana) 1.75 + * 30a0 - 30ff (katakana) 1.76 + * zh-CN 1.77 + * zh-TW 1.78 + * 1.79 + * CJK 1.80 + * 3100 - 312f (bopomofo) 1.81 + * 31a0 - 31bf (bopomofo extended) 1.82 + * 3000 - 303f (CJK Symbols and Punctuation) 1.83 + * 2e80 - 2eff (CJK radicals supplement) 1.84 + * 2f00 - 2fdf (Kangxi Radicals) 1.85 + * 2ff0 - 2fff (Ideographic Description Characters) 1.86 + * 3190 - 319f (kanbun) 1.87 + * 3200 - 32ff (Enclosed CJK letters and Months) 1.88 + * 3300 - 33ff (CJK compatibility) 1.89 + * 3400 - 4dbf (CJK Unified Ideographs Extension A) 1.90 + * 4e00 - 9faf (CJK Unified Ideographs) 1.91 + * f900 - fa5f (CJK Compatibility Ideographs) 1.92 + * fe30 - fe4f (CJK compatibility Forms) 1.93 + * ff00 - ffef (halfwidth and fullwidth forms) 1.94 + * 1.95 + * Armenian 1.96 + * 0530 - 058f 1.97 + * Sriac 1.98 + * 0700 - 074f 1.99 + * Thaana 1.100 + * 0780 - 07bf 1.101 + * Devanagari 1.102 + * 0900 - 097f 1.103 + * Bengali 1.104 + * 0980 - 09ff 1.105 + * Gurmukhi 1.106 + * 0a00 - 0a7f 1.107 + * Gujarati 1.108 + * 0a80 - 0aff 1.109 + * Oriya 1.110 + * 0b00 - 0b7f 1.111 + * Tamil 1.112 + * 0b80 - 0bff 1.113 + * Telugu 1.114 + * 0c00 - 0c7f 1.115 + * Kannada 1.116 + * 0c80 - 0cff 1.117 + * Malayalam 1.118 + * 0d00 - 0d7f 1.119 + * Sinhala 1.120 + * 0d80 - 0def 1.121 + * Lao 1.122 + * 0e80 - 0eff 1.123 + * Tibetan 1.124 + * 0f00 - 0fbf 1.125 + * Myanmar 1.126 + * 1000 - 109f 1.127 + * Georgian 1.128 + * 10a0 - 10ff 1.129 + * Ethiopic 1.130 + * 1200 - 137f 1.131 + * Cherokee 1.132 + * 13a0 - 13ff 1.133 + * Canadian Aboriginal Syllabics 1.134 + * 1400 - 167f 1.135 + * Ogham 1.136 + * 1680 - 169f 1.137 + * Runic 1.138 + * 16a0 - 16ff 1.139 + * Khmer 1.140 + * 1780 - 17ff 1.141 + * Mongolian 1.142 + * 1800 - 18af 1.143 + * Misc - superscripts and subscripts 1.144 + * 2070 - 209f 1.145 + * Misc - Combining Diacritical Marks for Symbols 1.146 + * 20d0 - 20ff 1.147 + * Misc - Arrows 1.148 + * 2190 - 21ff 1.149 + * Misc - Mathematical Operators 1.150 + * 2200 - 22ff 1.151 + * Misc - Miscellaneous Technical 1.152 + * 2300 - 23ff 1.153 + * Misc - Control picture 1.154 + * 2400 - 243f 1.155 + * Misc - Optical character recognition 1.156 + * 2440 - 2450 1.157 + * Misc - Enclose Alphanumerics 1.158 + * 2460 - 24ff 1.159 + * Misc - Box Drawing 1.160 + * 2500 - 257f 1.161 + * Misc - Block Elements 1.162 + * 2580 - 259f 1.163 + * Misc - Geometric Shapes 1.164 + * 25a0 - 25ff 1.165 + * Misc - Miscellaneous Symbols 1.166 + * 2600 - 267f 1.167 + * Misc - Dingbats 1.168 + * 2700 - 27bf 1.169 + * Misc - Braille Patterns 1.170 + * 2800 - 28ff 1.171 + * Yi Syllables 1.172 + * a000 - a48f 1.173 + * Yi radicals 1.174 + * a490 - a4cf 1.175 + * Alphabetic Presentation Forms 1.176 + * fb00 - fb4f 1.177 + * Misc - Combining half Marks 1.178 + * fe20 - fe2f 1.179 + * Misc - small form variants 1.180 + * fe50 - fe6f 1.181 + * Misc - Specials 1.182 + * fff0 - ffff 1.183 + *********************************************************************/ 1.184 + 1.185 + 1.186 + 1.187 +#define NUM_OF_SUBTABLES 10 1.188 +#define SUBTABLE_SIZE 16 1.189 + 1.190 +static const uint8_t gUnicodeSubrangeTable[NUM_OF_SUBTABLES][SUBTABLE_SIZE] = 1.191 +{ 1.192 + { // table for X--- 1.193 + kRangeTableBase+1, //u0xxx 1.194 + kRangeTableBase+2, //u1xxx 1.195 + kRangeTableBase+3, //u2xxx 1.196 + kRangeSetCJK, //u3xxx 1.197 + kRangeSetCJK, //u4xxx 1.198 + kRangeSetCJK, //u5xxx 1.199 + kRangeSetCJK, //u6xxx 1.200 + kRangeSetCJK, //u7xxx 1.201 + kRangeSetCJK, //u8xxx 1.202 + kRangeSetCJK, //u9xxx 1.203 + kRangeTableBase+4, //uaxxx 1.204 + kRangeKorean, //ubxxx 1.205 + kRangeKorean, //ucxxx 1.206 + kRangeTableBase+5, //udxxx 1.207 + kRangePrivate, //uexxx 1.208 + kRangeTableBase+6 //ufxxx 1.209 + }, 1.210 + { //table for 0X-- 1.211 + kRangeSetLatin, //u00xx 1.212 + kRangeSetLatin, //u01xx 1.213 + kRangeSetLatin, //u02xx 1.214 + kRangeGreek, //u03xx XXX 0300-036f is in fact kRangeCombiningDiacriticalMarks 1.215 + kRangeCyrillic, //u04xx 1.216 + kRangeTableBase+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian 1.217 + kRangeArabic, //u06xx 1.218 + kRangeTertiaryTable, //u07xx 1.219 + kRangeUnassigned, //u08xx 1.220 + kRangeTertiaryTable, //u09xx 1.221 + kRangeTertiaryTable, //u0axx 1.222 + kRangeTertiaryTable, //u0bxx 1.223 + kRangeTertiaryTable, //u0cxx 1.224 + kRangeTertiaryTable, //u0dxx 1.225 + kRangeTertiaryTable, //u0exx 1.226 + kRangeTibetan //u0fxx 1.227 + }, 1.228 + { //table for 1x-- 1.229 + kRangeTertiaryTable, //u10xx 1.230 + kRangeKorean, //u11xx 1.231 + kRangeEthiopic, //u12xx 1.232 + kRangeTertiaryTable, //u13xx 1.233 + kRangeCanadian, //u14xx 1.234 + kRangeCanadian, //u15xx 1.235 + kRangeTertiaryTable, //u16xx 1.236 + kRangeKhmer, //u17xx 1.237 + kRangeMongolian, //u18xx 1.238 + kRangeUnassigned, //u19xx 1.239 + kRangeUnassigned, //u1axx 1.240 + kRangeUnassigned, //u1bxx 1.241 + kRangeUnassigned, //u1cxx 1.242 + kRangeUnassigned, //u1dxx 1.243 + kRangeSetLatin, //u1exx 1.244 + kRangeGreek //u1fxx 1.245 + }, 1.246 + { //table for 2x-- 1.247 + kRangeSetLatin, //u20xx 1.248 + kRangeSetLatin, //u21xx 1.249 + kRangeMathOperators, //u22xx 1.250 + kRangeMiscTechnical, //u23xx 1.251 + kRangeControlOpticalEnclose, //u24xx 1.252 + kRangeBoxBlockGeometrics, //u25xx 1.253 + kRangeMiscSymbols, //u26xx 1.254 + kRangeDingbats, //u27xx 1.255 + kRangeBraillePattern, //u28xx 1.256 + kRangeUnassigned, //u29xx 1.257 + kRangeUnassigned, //u2axx 1.258 + kRangeUnassigned, //u2bxx 1.259 + kRangeUnassigned, //u2cxx 1.260 + kRangeUnassigned, //u2dxx 1.261 + kRangeSetCJK, //u2exx 1.262 + kRangeSetCJK //u2fxx 1.263 + }, 1.264 + { //table for ax-- 1.265 + kRangeYi, //ua0xx 1.266 + kRangeYi, //ua1xx 1.267 + kRangeYi, //ua2xx 1.268 + kRangeYi, //ua3xx 1.269 + kRangeYi, //ua4xx 1.270 + kRangeUnassigned, //ua5xx 1.271 + kRangeUnassigned, //ua6xx 1.272 + kRangeUnassigned, //ua7xx 1.273 + kRangeUnassigned, //ua8xx 1.274 + kRangeUnassigned, //ua9xx 1.275 + kRangeUnassigned, //uaaxx 1.276 + kRangeUnassigned, //uabxx 1.277 + kRangeKorean, //uacxx 1.278 + kRangeKorean, //uadxx 1.279 + kRangeKorean, //uaexx 1.280 + kRangeKorean //uafxx 1.281 + }, 1.282 + { //table for dx-- 1.283 + kRangeKorean, //ud0xx 1.284 + kRangeKorean, //ud1xx 1.285 + kRangeKorean, //ud2xx 1.286 + kRangeKorean, //ud3xx 1.287 + kRangeKorean, //ud4xx 1.288 + kRangeKorean, //ud5xx 1.289 + kRangeKorean, //ud6xx 1.290 + kRangeKorean, //ud7xx 1.291 + kRangeSurrogate, //ud8xx 1.292 + kRangeSurrogate, //ud9xx 1.293 + kRangeSurrogate, //udaxx 1.294 + kRangeSurrogate, //udbxx 1.295 + kRangeSurrogate, //udcxx 1.296 + kRangeSurrogate, //uddxx 1.297 + kRangeSurrogate, //udexx 1.298 + kRangeSurrogate //udfxx 1.299 + }, 1.300 + { // table for fx-- 1.301 + kRangePrivate, //uf0xx 1.302 + kRangePrivate, //uf1xx 1.303 + kRangePrivate, //uf2xx 1.304 + kRangePrivate, //uf3xx 1.305 + kRangePrivate, //uf4xx 1.306 + kRangePrivate, //uf5xx 1.307 + kRangePrivate, //uf6xx 1.308 + kRangePrivate, //uf7xx 1.309 + kRangePrivate, //uf8xx 1.310 + kRangeSetCJK, //uf9xx 1.311 + kRangeSetCJK, //ufaxx 1.312 + kRangeArabic, //ufbxx, includes alphabic presentation form 1.313 + kRangeArabic, //ufcxx 1.314 + kRangeArabic, //ufdxx 1.315 + kRangeTableBase+8, //ufexx 1.316 + kRangeTableBase+9 //uffxx, halfwidth and fullwidth forms, includes Specials 1.317 + }, 1.318 + { //table for 0x0500 - 0x05ff 1.319 + kRangeCyrillic, //u050x 1.320 + kRangeCyrillic, //u051x 1.321 + kRangeCyrillic, //u052x 1.322 + kRangeArmenian, //u053x 1.323 + kRangeArmenian, //u054x 1.324 + kRangeArmenian, //u055x 1.325 + kRangeArmenian, //u056x 1.326 + kRangeArmenian, //u057x 1.327 + kRangeArmenian, //u058x 1.328 + kRangeHebrew, //u059x 1.329 + kRangeHebrew, //u05ax 1.330 + kRangeHebrew, //u05bx 1.331 + kRangeHebrew, //u05cx 1.332 + kRangeHebrew, //u05dx 1.333 + kRangeHebrew, //u05ex 1.334 + kRangeHebrew //u05fx 1.335 + }, 1.336 + { //table for 0xfe00 - 0xfeff 1.337 + kRangeSetCJK, //ufe0x 1.338 + kRangeSetCJK, //ufe1x 1.339 + kRangeSetCJK, //ufe2x 1.340 + kRangeSetCJK, //ufe3x 1.341 + kRangeSetCJK, //ufe4x 1.342 + kRangeSetCJK, //ufe5x 1.343 + kRangeSetCJK, //ufe6x 1.344 + kRangeArabic, //ufe7x 1.345 + kRangeArabic, //ufe8x 1.346 + kRangeArabic, //ufe9x 1.347 + kRangeArabic, //ufeax 1.348 + kRangeArabic, //ufebx 1.349 + kRangeArabic, //ufecx 1.350 + kRangeArabic, //ufedx 1.351 + kRangeArabic, //ufeex 1.352 + kRangeArabic //ufefx 1.353 + }, 1.354 + { //table for 0xff00 - 0xffff 1.355 + kRangeSetCJK, //uff0x, fullwidth latin 1.356 + kRangeSetCJK, //uff1x, fullwidth latin 1.357 + kRangeSetCJK, //uff2x, fullwidth latin 1.358 + kRangeSetCJK, //uff3x, fullwidth latin 1.359 + kRangeSetCJK, //uff4x, fullwidth latin 1.360 + kRangeSetCJK, //uff5x, fullwidth latin 1.361 + kRangeSetCJK, //uff6x, halfwidth katakana 1.362 + kRangeSetCJK, //uff7x, halfwidth katakana 1.363 + kRangeSetCJK, //uff8x, halfwidth katakana 1.364 + kRangeSetCJK, //uff9x, halfwidth katakana 1.365 + kRangeSetCJK, //uffax, halfwidth hangul jamo 1.366 + kRangeSetCJK, //uffbx, halfwidth hangul jamo 1.367 + kRangeSetCJK, //uffcx, halfwidth hangul jamo 1.368 + kRangeSetCJK, //uffdx, halfwidth hangul jamo 1.369 + kRangeSetCJK, //uffex, fullwidth symbols 1.370 + kRangeSpecials, //ufffx, Specials 1.371 + }, 1.372 +}; 1.373 + 1.374 +// Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80) 1.375 +// code points so that the number of entries in the tertiary range 1.376 +// table for that range is obtained by dividing (0x1700 - 0x0700) by 128. 1.377 +// Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal 1.378 +// syllabaries take multiple chunks and Ogham and Runic share a single chunk. 1.379 +#define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80) 1.380 + 1.381 +static const uint8_t gUnicodeTertiaryRangeTable[TERTIARY_TABLE_SIZE] = 1.382 +{ //table for 0x0700 - 0x1600 1.383 + kRangeSyriac, //u070x 1.384 + kRangeThaana, //u078x 1.385 + kRangeUnassigned, //u080x place holder(resolved in the 2ndary tab.) 1.386 + kRangeUnassigned, //u088x place holder(resolved in the 2ndary tab.) 1.387 + kRangeDevanagari, //u090x 1.388 + kRangeBengali, //u098x 1.389 + kRangeGurmukhi, //u0a0x 1.390 + kRangeGujarati, //u0a8x 1.391 + kRangeOriya, //u0b0x 1.392 + kRangeTamil, //u0b8x 1.393 + kRangeTelugu, //u0c0x 1.394 + kRangeKannada, //u0c8x 1.395 + kRangeMalayalam, //u0d0x 1.396 + kRangeSinhala, //u0d8x 1.397 + kRangeThai, //u0e0x 1.398 + kRangeLao, //u0e8x 1.399 + kRangeTibetan, //u0f0x place holder(resolved in the 2ndary tab.) 1.400 + kRangeTibetan, //u0f8x place holder(resolved in the 2ndary tab.) 1.401 + kRangeMyanmar, //u100x 1.402 + kRangeGeorgian, //u108x 1.403 + kRangeKorean, //u110x place holder(resolved in the 2ndary tab.) 1.404 + kRangeKorean, //u118x place holder(resolved in the 2ndary tab.) 1.405 + kRangeEthiopic, //u120x place holder(resolved in the 2ndary tab.) 1.406 + kRangeEthiopic, //u128x place holder(resolved in the 2ndary tab.) 1.407 + kRangeEthiopic, //u130x 1.408 + kRangeCherokee, //u138x 1.409 + kRangeCanadian, //u140x place holder(resolved in the 2ndary tab.) 1.410 + kRangeCanadian, //u148x place holder(resolved in the 2ndary tab.) 1.411 + kRangeCanadian, //u150x place holder(resolved in the 2ndary tab.) 1.412 + kRangeCanadian, //u158x place holder(resolved in the 2ndary tab.) 1.413 + kRangeCanadian, //u160x 1.414 + kRangeOghamRunic //u168x this contains two scripts, Ogham & Runic 1.415 +}; 1.416 + 1.417 +// A two level index is almost enough for locating a range, with the 1.418 +// exception of u03xx and u05xx. Since we don't really care about range for 1.419 +// combining diacritical marks in our font application, they are 1.420 +// not discriminated further. But future adoption of this module for other use 1.421 +// should be aware of this limitation. The implementation can be extended if 1.422 +// there is such a need. 1.423 +// For Indic, Southeast Asian scripts and some other scripts between 1.424 +// U+0700 and U+16FF, it's extended to the third level. 1.425 +uint32_t FindCharUnicodeRange(uint32_t ch) 1.426 +{ 1.427 + uint32_t range; 1.428 + 1.429 + // aggregate ranges for non-BMP codepoints 1.430 + if (ch > 0xFFFF) { 1.431 + uint32_t p = (ch >> 16); 1.432 + if (p == 1) { 1.433 + return kRangeSMP; 1.434 + } else if (p == 2) { 1.435 + return kRangeSetCJK; 1.436 + } 1.437 + return kRangeHigherPlanes; 1.438 + } 1.439 + 1.440 + // lookup explicit range for BMP codepoints 1.441 + // first general range 1.442 + range = gUnicodeSubrangeTable[0][ch >> 12]; 1.443 + 1.444 + // if general range is good enough, return that 1.445 + if (range < kRangeTableBase) 1.446 + // we try to get a specific range 1.447 + return range; 1.448 + 1.449 + // otherwise, use subrange tables 1.450 + range = gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x0f00) >> 8]; 1.451 + if (range < kRangeTableBase) 1.452 + return range; 1.453 + if (range < kRangeTertiaryTable) 1.454 + return gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x00f0) >> 4]; 1.455 + 1.456 + // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks 1.457 + return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7]; 1.458 +} 1.459 + 1.460 +nsIAtom *LangGroupFromUnicodeRange(uint8_t unicodeRange) 1.461 +{ 1.462 + if (kRangeSpecificItemNum > unicodeRange) { 1.463 + nsIAtom **atom = gUnicodeRangeToLangGroupAtomTable[unicodeRange]; 1.464 + return *atom; 1.465 + } 1.466 + return nullptr; 1.467 +}