michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "nsUnicodeRange.h" michael@0: #include "nsGkAtoms.h" michael@0: #include "mozilla/NullPtr.h" michael@0: michael@0: // This table depends on unicode range definitions. michael@0: // Each item's index must correspond unicode range value michael@0: // eg. x-cyrillic = LangGroupTable[kRangeCyrillic] michael@0: static nsIAtom **gUnicodeRangeToLangGroupAtomTable[] = michael@0: { michael@0: &nsGkAtoms::x_cyrillic, michael@0: &nsGkAtoms::el_, michael@0: &nsGkAtoms::tr, michael@0: &nsGkAtoms::he, michael@0: &nsGkAtoms::ar, michael@0: &nsGkAtoms::x_baltic, michael@0: &nsGkAtoms::th, michael@0: &nsGkAtoms::ko, michael@0: &nsGkAtoms::Japanese, michael@0: &nsGkAtoms::zh_cn, michael@0: &nsGkAtoms::zh_tw, michael@0: &nsGkAtoms::x_devanagari, michael@0: &nsGkAtoms::x_tamil, michael@0: &nsGkAtoms::x_armn, michael@0: &nsGkAtoms::x_beng, michael@0: &nsGkAtoms::x_cans, michael@0: &nsGkAtoms::x_ethi, michael@0: &nsGkAtoms::x_geor, michael@0: &nsGkAtoms::x_gujr, michael@0: &nsGkAtoms::x_guru, michael@0: &nsGkAtoms::x_khmr, michael@0: &nsGkAtoms::x_mlym, michael@0: &nsGkAtoms::x_orya, michael@0: &nsGkAtoms::x_telu, michael@0: &nsGkAtoms::x_knda, michael@0: &nsGkAtoms::x_sinh, michael@0: &nsGkAtoms::x_tibt michael@0: }; michael@0: michael@0: /********************************************************************** michael@0: * Unicode subranges as defined in unicode 3.0 michael@0: * x-western, x-central-euro, tr, x-baltic -> latin michael@0: * 0000 - 036f michael@0: * 1e00 - 1eff michael@0: * 2000 - 206f (general punctuation) michael@0: * 20a0 - 20cf (currency symbols) michael@0: * 2100 - 214f (letterlike symbols) michael@0: * 2150 - 218f (Number Forms) michael@0: * el -> greek michael@0: * 0370 - 03ff michael@0: * 1f00 - 1fff michael@0: * x-cyrillic -> cyrillic michael@0: * 0400 - 04ff michael@0: * he -> hebrew michael@0: * 0590 - 05ff michael@0: * ar -> arabic michael@0: * 0600 - 06ff michael@0: * fb50 - fdff (arabic presentation forms) michael@0: * fe70 - feff (arabic presentation forms b) michael@0: * th - thai michael@0: * 0e00 - 0e7f michael@0: * ko -> korean michael@0: * ac00 - d7af (hangul Syllables) michael@0: * 1100 - 11ff (jamo) michael@0: * 3130 - 318f (hangul compatibility jamo) michael@0: * ja michael@0: * 3040 - 309f (hiragana) michael@0: * 30a0 - 30ff (katakana) michael@0: * zh-CN michael@0: * zh-TW michael@0: * michael@0: * CJK michael@0: * 3100 - 312f (bopomofo) michael@0: * 31a0 - 31bf (bopomofo extended) michael@0: * 3000 - 303f (CJK Symbols and Punctuation) michael@0: * 2e80 - 2eff (CJK radicals supplement) michael@0: * 2f00 - 2fdf (Kangxi Radicals) michael@0: * 2ff0 - 2fff (Ideographic Description Characters) michael@0: * 3190 - 319f (kanbun) michael@0: * 3200 - 32ff (Enclosed CJK letters and Months) michael@0: * 3300 - 33ff (CJK compatibility) michael@0: * 3400 - 4dbf (CJK Unified Ideographs Extension A) michael@0: * 4e00 - 9faf (CJK Unified Ideographs) michael@0: * f900 - fa5f (CJK Compatibility Ideographs) michael@0: * fe30 - fe4f (CJK compatibility Forms) michael@0: * ff00 - ffef (halfwidth and fullwidth forms) michael@0: * michael@0: * Armenian michael@0: * 0530 - 058f michael@0: * Sriac michael@0: * 0700 - 074f michael@0: * Thaana michael@0: * 0780 - 07bf michael@0: * Devanagari michael@0: * 0900 - 097f michael@0: * Bengali michael@0: * 0980 - 09ff michael@0: * Gurmukhi michael@0: * 0a00 - 0a7f michael@0: * Gujarati michael@0: * 0a80 - 0aff michael@0: * Oriya michael@0: * 0b00 - 0b7f michael@0: * Tamil michael@0: * 0b80 - 0bff michael@0: * Telugu michael@0: * 0c00 - 0c7f michael@0: * Kannada michael@0: * 0c80 - 0cff michael@0: * Malayalam michael@0: * 0d00 - 0d7f michael@0: * Sinhala michael@0: * 0d80 - 0def michael@0: * Lao michael@0: * 0e80 - 0eff michael@0: * Tibetan michael@0: * 0f00 - 0fbf michael@0: * Myanmar michael@0: * 1000 - 109f michael@0: * Georgian michael@0: * 10a0 - 10ff michael@0: * Ethiopic michael@0: * 1200 - 137f michael@0: * Cherokee michael@0: * 13a0 - 13ff michael@0: * Canadian Aboriginal Syllabics michael@0: * 1400 - 167f michael@0: * Ogham michael@0: * 1680 - 169f michael@0: * Runic michael@0: * 16a0 - 16ff michael@0: * Khmer michael@0: * 1780 - 17ff michael@0: * Mongolian michael@0: * 1800 - 18af michael@0: * Misc - superscripts and subscripts michael@0: * 2070 - 209f michael@0: * Misc - Combining Diacritical Marks for Symbols michael@0: * 20d0 - 20ff michael@0: * Misc - Arrows michael@0: * 2190 - 21ff michael@0: * Misc - Mathematical Operators michael@0: * 2200 - 22ff michael@0: * Misc - Miscellaneous Technical michael@0: * 2300 - 23ff michael@0: * Misc - Control picture michael@0: * 2400 - 243f michael@0: * Misc - Optical character recognition michael@0: * 2440 - 2450 michael@0: * Misc - Enclose Alphanumerics michael@0: * 2460 - 24ff michael@0: * Misc - Box Drawing michael@0: * 2500 - 257f michael@0: * Misc - Block Elements michael@0: * 2580 - 259f michael@0: * Misc - Geometric Shapes michael@0: * 25a0 - 25ff michael@0: * Misc - Miscellaneous Symbols michael@0: * 2600 - 267f michael@0: * Misc - Dingbats michael@0: * 2700 - 27bf michael@0: * Misc - Braille Patterns michael@0: * 2800 - 28ff michael@0: * Yi Syllables michael@0: * a000 - a48f michael@0: * Yi radicals michael@0: * a490 - a4cf michael@0: * Alphabetic Presentation Forms michael@0: * fb00 - fb4f michael@0: * Misc - Combining half Marks michael@0: * fe20 - fe2f michael@0: * Misc - small form variants michael@0: * fe50 - fe6f michael@0: * Misc - Specials michael@0: * fff0 - ffff michael@0: *********************************************************************/ michael@0: michael@0: michael@0: michael@0: #define NUM_OF_SUBTABLES 10 michael@0: #define SUBTABLE_SIZE 16 michael@0: michael@0: static const uint8_t gUnicodeSubrangeTable[NUM_OF_SUBTABLES][SUBTABLE_SIZE] = michael@0: { michael@0: { // table for X--- michael@0: kRangeTableBase+1, //u0xxx michael@0: kRangeTableBase+2, //u1xxx michael@0: kRangeTableBase+3, //u2xxx michael@0: kRangeSetCJK, //u3xxx michael@0: kRangeSetCJK, //u4xxx michael@0: kRangeSetCJK, //u5xxx michael@0: kRangeSetCJK, //u6xxx michael@0: kRangeSetCJK, //u7xxx michael@0: kRangeSetCJK, //u8xxx michael@0: kRangeSetCJK, //u9xxx michael@0: kRangeTableBase+4, //uaxxx michael@0: kRangeKorean, //ubxxx michael@0: kRangeKorean, //ucxxx michael@0: kRangeTableBase+5, //udxxx michael@0: kRangePrivate, //uexxx michael@0: kRangeTableBase+6 //ufxxx michael@0: }, michael@0: { //table for 0X-- michael@0: kRangeSetLatin, //u00xx michael@0: kRangeSetLatin, //u01xx michael@0: kRangeSetLatin, //u02xx michael@0: kRangeGreek, //u03xx XXX 0300-036f is in fact kRangeCombiningDiacriticalMarks michael@0: kRangeCyrillic, //u04xx michael@0: kRangeTableBase+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian michael@0: kRangeArabic, //u06xx michael@0: kRangeTertiaryTable, //u07xx michael@0: kRangeUnassigned, //u08xx michael@0: kRangeTertiaryTable, //u09xx michael@0: kRangeTertiaryTable, //u0axx michael@0: kRangeTertiaryTable, //u0bxx michael@0: kRangeTertiaryTable, //u0cxx michael@0: kRangeTertiaryTable, //u0dxx michael@0: kRangeTertiaryTable, //u0exx michael@0: kRangeTibetan //u0fxx michael@0: }, michael@0: { //table for 1x-- michael@0: kRangeTertiaryTable, //u10xx michael@0: kRangeKorean, //u11xx michael@0: kRangeEthiopic, //u12xx michael@0: kRangeTertiaryTable, //u13xx michael@0: kRangeCanadian, //u14xx michael@0: kRangeCanadian, //u15xx michael@0: kRangeTertiaryTable, //u16xx michael@0: kRangeKhmer, //u17xx michael@0: kRangeMongolian, //u18xx michael@0: kRangeUnassigned, //u19xx michael@0: kRangeUnassigned, //u1axx michael@0: kRangeUnassigned, //u1bxx michael@0: kRangeUnassigned, //u1cxx michael@0: kRangeUnassigned, //u1dxx michael@0: kRangeSetLatin, //u1exx michael@0: kRangeGreek //u1fxx michael@0: }, michael@0: { //table for 2x-- michael@0: kRangeSetLatin, //u20xx michael@0: kRangeSetLatin, //u21xx michael@0: kRangeMathOperators, //u22xx michael@0: kRangeMiscTechnical, //u23xx michael@0: kRangeControlOpticalEnclose, //u24xx michael@0: kRangeBoxBlockGeometrics, //u25xx michael@0: kRangeMiscSymbols, //u26xx michael@0: kRangeDingbats, //u27xx michael@0: kRangeBraillePattern, //u28xx michael@0: kRangeUnassigned, //u29xx michael@0: kRangeUnassigned, //u2axx michael@0: kRangeUnassigned, //u2bxx michael@0: kRangeUnassigned, //u2cxx michael@0: kRangeUnassigned, //u2dxx michael@0: kRangeSetCJK, //u2exx michael@0: kRangeSetCJK //u2fxx michael@0: }, michael@0: { //table for ax-- michael@0: kRangeYi, //ua0xx michael@0: kRangeYi, //ua1xx michael@0: kRangeYi, //ua2xx michael@0: kRangeYi, //ua3xx michael@0: kRangeYi, //ua4xx michael@0: kRangeUnassigned, //ua5xx michael@0: kRangeUnassigned, //ua6xx michael@0: kRangeUnassigned, //ua7xx michael@0: kRangeUnassigned, //ua8xx michael@0: kRangeUnassigned, //ua9xx michael@0: kRangeUnassigned, //uaaxx michael@0: kRangeUnassigned, //uabxx michael@0: kRangeKorean, //uacxx michael@0: kRangeKorean, //uadxx michael@0: kRangeKorean, //uaexx michael@0: kRangeKorean //uafxx michael@0: }, michael@0: { //table for dx-- michael@0: kRangeKorean, //ud0xx michael@0: kRangeKorean, //ud1xx michael@0: kRangeKorean, //ud2xx michael@0: kRangeKorean, //ud3xx michael@0: kRangeKorean, //ud4xx michael@0: kRangeKorean, //ud5xx michael@0: kRangeKorean, //ud6xx michael@0: kRangeKorean, //ud7xx michael@0: kRangeSurrogate, //ud8xx michael@0: kRangeSurrogate, //ud9xx michael@0: kRangeSurrogate, //udaxx michael@0: kRangeSurrogate, //udbxx michael@0: kRangeSurrogate, //udcxx michael@0: kRangeSurrogate, //uddxx michael@0: kRangeSurrogate, //udexx michael@0: kRangeSurrogate //udfxx michael@0: }, michael@0: { // table for fx-- michael@0: kRangePrivate, //uf0xx michael@0: kRangePrivate, //uf1xx michael@0: kRangePrivate, //uf2xx michael@0: kRangePrivate, //uf3xx michael@0: kRangePrivate, //uf4xx michael@0: kRangePrivate, //uf5xx michael@0: kRangePrivate, //uf6xx michael@0: kRangePrivate, //uf7xx michael@0: kRangePrivate, //uf8xx michael@0: kRangeSetCJK, //uf9xx michael@0: kRangeSetCJK, //ufaxx michael@0: kRangeArabic, //ufbxx, includes alphabic presentation form michael@0: kRangeArabic, //ufcxx michael@0: kRangeArabic, //ufdxx michael@0: kRangeTableBase+8, //ufexx michael@0: kRangeTableBase+9 //uffxx, halfwidth and fullwidth forms, includes Specials michael@0: }, michael@0: { //table for 0x0500 - 0x05ff michael@0: kRangeCyrillic, //u050x michael@0: kRangeCyrillic, //u051x michael@0: kRangeCyrillic, //u052x michael@0: kRangeArmenian, //u053x michael@0: kRangeArmenian, //u054x michael@0: kRangeArmenian, //u055x michael@0: kRangeArmenian, //u056x michael@0: kRangeArmenian, //u057x michael@0: kRangeArmenian, //u058x michael@0: kRangeHebrew, //u059x michael@0: kRangeHebrew, //u05ax michael@0: kRangeHebrew, //u05bx michael@0: kRangeHebrew, //u05cx michael@0: kRangeHebrew, //u05dx michael@0: kRangeHebrew, //u05ex michael@0: kRangeHebrew //u05fx michael@0: }, michael@0: { //table for 0xfe00 - 0xfeff michael@0: kRangeSetCJK, //ufe0x michael@0: kRangeSetCJK, //ufe1x michael@0: kRangeSetCJK, //ufe2x michael@0: kRangeSetCJK, //ufe3x michael@0: kRangeSetCJK, //ufe4x michael@0: kRangeSetCJK, //ufe5x michael@0: kRangeSetCJK, //ufe6x michael@0: kRangeArabic, //ufe7x michael@0: kRangeArabic, //ufe8x michael@0: kRangeArabic, //ufe9x michael@0: kRangeArabic, //ufeax michael@0: kRangeArabic, //ufebx michael@0: kRangeArabic, //ufecx michael@0: kRangeArabic, //ufedx michael@0: kRangeArabic, //ufeex michael@0: kRangeArabic //ufefx michael@0: }, michael@0: { //table for 0xff00 - 0xffff michael@0: kRangeSetCJK, //uff0x, fullwidth latin michael@0: kRangeSetCJK, //uff1x, fullwidth latin michael@0: kRangeSetCJK, //uff2x, fullwidth latin michael@0: kRangeSetCJK, //uff3x, fullwidth latin michael@0: kRangeSetCJK, //uff4x, fullwidth latin michael@0: kRangeSetCJK, //uff5x, fullwidth latin michael@0: kRangeSetCJK, //uff6x, halfwidth katakana michael@0: kRangeSetCJK, //uff7x, halfwidth katakana michael@0: kRangeSetCJK, //uff8x, halfwidth katakana michael@0: kRangeSetCJK, //uff9x, halfwidth katakana michael@0: kRangeSetCJK, //uffax, halfwidth hangul jamo michael@0: kRangeSetCJK, //uffbx, halfwidth hangul jamo michael@0: kRangeSetCJK, //uffcx, halfwidth hangul jamo michael@0: kRangeSetCJK, //uffdx, halfwidth hangul jamo michael@0: kRangeSetCJK, //uffex, fullwidth symbols michael@0: kRangeSpecials, //ufffx, Specials michael@0: }, michael@0: }; michael@0: michael@0: // Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80) michael@0: // code points so that the number of entries in the tertiary range michael@0: // table for that range is obtained by dividing (0x1700 - 0x0700) by 128. michael@0: // Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal michael@0: // syllabaries take multiple chunks and Ogham and Runic share a single chunk. michael@0: #define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80) michael@0: michael@0: static const uint8_t gUnicodeTertiaryRangeTable[TERTIARY_TABLE_SIZE] = michael@0: { //table for 0x0700 - 0x1600 michael@0: kRangeSyriac, //u070x michael@0: kRangeThaana, //u078x michael@0: kRangeUnassigned, //u080x place holder(resolved in the 2ndary tab.) michael@0: kRangeUnassigned, //u088x place holder(resolved in the 2ndary tab.) michael@0: kRangeDevanagari, //u090x michael@0: kRangeBengali, //u098x michael@0: kRangeGurmukhi, //u0a0x michael@0: kRangeGujarati, //u0a8x michael@0: kRangeOriya, //u0b0x michael@0: kRangeTamil, //u0b8x michael@0: kRangeTelugu, //u0c0x michael@0: kRangeKannada, //u0c8x michael@0: kRangeMalayalam, //u0d0x michael@0: kRangeSinhala, //u0d8x michael@0: kRangeThai, //u0e0x michael@0: kRangeLao, //u0e8x michael@0: kRangeTibetan, //u0f0x place holder(resolved in the 2ndary tab.) michael@0: kRangeTibetan, //u0f8x place holder(resolved in the 2ndary tab.) michael@0: kRangeMyanmar, //u100x michael@0: kRangeGeorgian, //u108x michael@0: kRangeKorean, //u110x place holder(resolved in the 2ndary tab.) michael@0: kRangeKorean, //u118x place holder(resolved in the 2ndary tab.) michael@0: kRangeEthiopic, //u120x place holder(resolved in the 2ndary tab.) michael@0: kRangeEthiopic, //u128x place holder(resolved in the 2ndary tab.) michael@0: kRangeEthiopic, //u130x michael@0: kRangeCherokee, //u138x michael@0: kRangeCanadian, //u140x place holder(resolved in the 2ndary tab.) michael@0: kRangeCanadian, //u148x place holder(resolved in the 2ndary tab.) michael@0: kRangeCanadian, //u150x place holder(resolved in the 2ndary tab.) michael@0: kRangeCanadian, //u158x place holder(resolved in the 2ndary tab.) michael@0: kRangeCanadian, //u160x michael@0: kRangeOghamRunic //u168x this contains two scripts, Ogham & Runic michael@0: }; michael@0: michael@0: // A two level index is almost enough for locating a range, with the michael@0: // exception of u03xx and u05xx. Since we don't really care about range for michael@0: // combining diacritical marks in our font application, they are michael@0: // not discriminated further. But future adoption of this module for other use michael@0: // should be aware of this limitation. The implementation can be extended if michael@0: // there is such a need. michael@0: // For Indic, Southeast Asian scripts and some other scripts between michael@0: // U+0700 and U+16FF, it's extended to the third level. michael@0: uint32_t FindCharUnicodeRange(uint32_t ch) michael@0: { michael@0: uint32_t range; michael@0: michael@0: // aggregate ranges for non-BMP codepoints michael@0: if (ch > 0xFFFF) { michael@0: uint32_t p = (ch >> 16); michael@0: if (p == 1) { michael@0: return kRangeSMP; michael@0: } else if (p == 2) { michael@0: return kRangeSetCJK; michael@0: } michael@0: return kRangeHigherPlanes; michael@0: } michael@0: michael@0: // lookup explicit range for BMP codepoints michael@0: // first general range michael@0: range = gUnicodeSubrangeTable[0][ch >> 12]; michael@0: michael@0: // if general range is good enough, return that michael@0: if (range < kRangeTableBase) michael@0: // we try to get a specific range michael@0: return range; michael@0: michael@0: // otherwise, use subrange tables michael@0: range = gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x0f00) >> 8]; michael@0: if (range < kRangeTableBase) michael@0: return range; michael@0: if (range < kRangeTertiaryTable) michael@0: return gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x00f0) >> 4]; michael@0: michael@0: // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks michael@0: return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7]; michael@0: } michael@0: michael@0: nsIAtom *LangGroupFromUnicodeRange(uint8_t unicodeRange) michael@0: { michael@0: if (kRangeSpecificItemNum > unicodeRange) { michael@0: nsIAtom **atom = gUnicodeRangeToLangGroupAtomTable[unicodeRange]; michael@0: return *atom; michael@0: } michael@0: return nullptr; michael@0: }