gfx/thebes/nsUnicodeRange.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     6 #include "nsUnicodeRange.h"
     7 #include "nsGkAtoms.h"
     8 #include "mozilla/NullPtr.h"
    10 // This table depends on unicode range definitions. 
    11 // Each item's index must correspond unicode range value
    12 // eg. x-cyrillic = LangGroupTable[kRangeCyrillic]
    13 static nsIAtom **gUnicodeRangeToLangGroupAtomTable[] =
    14 {
    15   &nsGkAtoms::x_cyrillic,
    16   &nsGkAtoms::el_,
    17   &nsGkAtoms::tr,
    18   &nsGkAtoms::he,
    19   &nsGkAtoms::ar,
    20   &nsGkAtoms::x_baltic,
    21   &nsGkAtoms::th,
    22   &nsGkAtoms::ko,
    23   &nsGkAtoms::Japanese,
    24   &nsGkAtoms::zh_cn,
    25   &nsGkAtoms::zh_tw,
    26   &nsGkAtoms::x_devanagari,
    27   &nsGkAtoms::x_tamil,
    28   &nsGkAtoms::x_armn,
    29   &nsGkAtoms::x_beng,
    30   &nsGkAtoms::x_cans,
    31   &nsGkAtoms::x_ethi,
    32   &nsGkAtoms::x_geor,
    33   &nsGkAtoms::x_gujr,
    34   &nsGkAtoms::x_guru,
    35   &nsGkAtoms::x_khmr,
    36   &nsGkAtoms::x_mlym,
    37   &nsGkAtoms::x_orya,
    38   &nsGkAtoms::x_telu,
    39   &nsGkAtoms::x_knda,
    40   &nsGkAtoms::x_sinh,
    41   &nsGkAtoms::x_tibt
    42 };
    44 /**********************************************************************
    45  * Unicode subranges as defined in unicode 3.0
    46  * x-western, x-central-euro, tr, x-baltic  -> latin 
    47  *  0000 - 036f 
    48  *  1e00 - 1eff
    49  *  2000 - 206f  (general punctuation)
    50  *  20a0 - 20cf  (currency symbols)
    51  *  2100 - 214f  (letterlike symbols)
    52  *  2150 - 218f  (Number Forms)
    53  * el         -> greek
    54  *  0370 - 03ff
    55  *  1f00 - 1fff
    56  * x-cyrillic -> cyrillic
    57  *  0400 - 04ff
    58  * he         -> hebrew
    59  *  0590 - 05ff
    60  * ar         -> arabic
    61  *  0600 - 06ff
    62  *  fb50 - fdff (arabic presentation forms)
    63  *  fe70 - feff (arabic presentation forms b)
    64  * th - thai
    65  *  0e00 - 0e7f
    66  * ko        -> korean
    67  *  ac00 - d7af  (hangul Syllables)
    68  *  1100 - 11ff    (jamo)
    69  *  3130 - 318f (hangul compatibility jamo)
    70  * ja
    71  *  3040 - 309f (hiragana)
    72  *  30a0 - 30ff (katakana)
    73  * zh-CN
    74  * zh-TW
    75  *
    76  * CJK
    77  *  3100 - 312f (bopomofo)
    78  *  31a0 - 31bf (bopomofo extended)
    79  *  3000 - 303f (CJK Symbols and Punctuation) 
    80  *  2e80 - 2eff (CJK radicals supplement)
    81  *  2f00 - 2fdf (Kangxi Radicals)
    82  *  2ff0 - 2fff (Ideographic Description Characters)
    83  *  3190 - 319f (kanbun)
    84  *  3200 - 32ff (Enclosed CJK letters and Months)
    85  *  3300 - 33ff (CJK compatibility)
    86  *  3400 - 4dbf (CJK Unified Ideographs Extension A)
    87  *  4e00 - 9faf (CJK Unified Ideographs)
    88  *  f900 - fa5f (CJK Compatibility Ideographs)
    89  *  fe30 - fe4f (CJK compatibility Forms)
    90  *  ff00 - ffef (halfwidth and fullwidth forms)
    91  *
    92  * Armenian
    93  *  0530 - 058f 
    94  * Syriac
    95  *  0700 - 074f
    96  * Thaana
    97  *  0780 - 07bf
    98  * Devanagari
    99  *  0900 - 097f
   100  * Bengali
   101  *  0980 - 09ff
   102  * Gurmukhi
   103  *  0a00 - 0a7f
   104  * Gujarati
   105  *  0a80 - 0aff
   106  * Oriya
   107  *  0b00 - 0b7f
   108  * Tamil
   109  *  0b80 - 0bff
   110  * Telugu
   111  *  0c00 - 0c7f
   112  * Kannada
   113  *  0c80 - 0cff
   114  * Malayalam
   115  *  0d00 - 0d7f
   116  * Sinhala
   117  *  0d80 - 0def
   118  * Lao
   119  *  0e80 - 0eff
   120  * Tibetan
   121  *  0f00 - 0fbf
   122  * Myanmar
   123  *  1000 - 109f
   124  * Georgian
   125  *  10a0 - 10ff
   126  * Ethiopic
   127  *  1200 - 137f
   128  * Cherokee
   129  *  13a0 - 13ff
   130  * Canadian Aboriginal Syllabics
   131  *  1400 - 167f
   132  * Ogham
   133  *  1680 - 169f
   134  * Runic 
   135  *  16a0 - 16ff
   136  * Khmer
   137  *  1780 - 17ff
   138  * Mongolian
   139  *  1800 - 18af
   140  * Misc - superscripts and subscripts
   141  *  2070 - 209f
   142  * Misc - Combining Diacritical Marks for Symbols
   143  *  20d0 - 20ff
   144  * Misc - Arrows
   145  *  2190 - 21ff
   146  * Misc - Mathematical Operators
   147  *  2200 - 22ff
   148  * Misc - Miscellaneous Technical
   149  *  2300 - 23ff
   150  * Misc - Control Pictures
   151  *  2400 - 243f
   152  * Misc - Optical Character Recognition
   153  *  2440 - 245f
   154  * Misc - Enclosed Alphanumerics
   155  *  2460 - 24ff
   156  * Misc - Box Drawing 
   157  *  2500 - 257f
   158  * Misc - Block Elements
   159  *  2580 - 259f
   160  * Misc - Geometric Shapes
   161  *  25a0 - 25ff
   162  * Misc - Miscellaneous Symbols
   163  *  2600 - 267f
   164  * Misc - Dingbats
   165  *  2700 - 27bf
   166  * Misc - Braille Patterns
   167  *  2800 - 28ff
   168  * Yi Syllables
   169  *  a000 - a48f
   170  * Yi radicals
   171  *  a490 - a4cf
   172  * Alphabetic Presentation Forms
   173  *  fb00 - fb4f
   174  * Misc - Combining half Marks
   175  *  fe20 - fe2f
   176  * Misc - small form variants
   177  *  fe50 - fe6f
   178  * Misc - Specials
   179  *  fff0 - ffff
   180  *********************************************************************/
   184 #define NUM_OF_SUBTABLES      10
   185 #define SUBTABLE_SIZE         16
   187 static const uint8_t gUnicodeSubrangeTable[NUM_OF_SUBTABLES][SUBTABLE_SIZE] = 
   188 { 
   189   { // table for X---
   190     kRangeTableBase+1,  //u0xxx
   191     kRangeTableBase+2,  //u1xxx
   192     kRangeTableBase+3,  //u2xxx
   193     kRangeSetCJK,       //u3xxx
   194     kRangeSetCJK,       //u4xxx
   195     kRangeSetCJK,       //u5xxx
   196     kRangeSetCJK,       //u6xxx
   197     kRangeSetCJK,       //u7xxx
   198     kRangeSetCJK,       //u8xxx
   199     kRangeSetCJK,       //u9xxx
   200     kRangeTableBase+4,  //uaxxx
   201     kRangeKorean,       //ubxxx
   202     kRangeKorean,       //ucxxx
   203     kRangeTableBase+5,  //udxxx
   204     kRangePrivate,      //uexxx
   205     kRangeTableBase+6   //ufxxx
   206   },
   207   { //table for 0X--
   208     kRangeSetLatin,          //u00xx
   209     kRangeSetLatin,          //u01xx
   210     kRangeSetLatin,          //u02xx
   211     kRangeGreek,             //u03xx     XXX 0300-036f is in fact kRangeCombiningDiacriticalMarks
   212     kRangeCyrillic,          //u04xx
   213     kRangeTableBase+7,       //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
   214     kRangeArabic,            //u06xx
   215     kRangeTertiaryTable,     //u07xx
   216     kRangeUnassigned,        //u08xx
   217     kRangeTertiaryTable,     //u09xx
   218     kRangeTertiaryTable,     //u0axx
   219     kRangeTertiaryTable,     //u0bxx
   220     kRangeTertiaryTable,     //u0cxx
   221     kRangeTertiaryTable,     //u0dxx
   222     kRangeTertiaryTable,     //u0exx
   223     kRangeTibetan            //u0fxx
   224   },
   225   { //table for 1x--
   226     kRangeTertiaryTable,     //u10xx
   227     kRangeKorean,            //u11xx
   228     kRangeEthiopic,          //u12xx
   229     kRangeTertiaryTable,     //u13xx
   230     kRangeCanadian,          //u14xx
   231     kRangeCanadian,          //u15xx
   232     kRangeTertiaryTable,     //u16xx
   233     kRangeKhmer,             //u17xx
   234     kRangeMongolian,         //u18xx
   235     kRangeUnassigned,        //u19xx
   236     kRangeUnassigned,        //u1axx
   237     kRangeUnassigned,        //u1bxx
   238     kRangeUnassigned,        //u1cxx
   239     kRangeUnassigned,        //u1dxx
   240     kRangeSetLatin,          //u1exx
   241     kRangeGreek              //u1fxx
   242   },
   243   { //table for 2x--
   244     kRangeSetLatin,          //u20xx
   245     kRangeSetLatin,          //u21xx
   246     kRangeMathOperators,     //u22xx
   247     kRangeMiscTechnical,     //u23xx
   248     kRangeControlOpticalEnclose, //u24xx
   249     kRangeBoxBlockGeometrics, //u25xx
   250     kRangeMiscSymbols,       //u26xx
   251     kRangeDingbats,          //u27xx
   252     kRangeBraillePattern,    //u28xx
   253     kRangeUnassigned,        //u29xx
   254     kRangeUnassigned,        //u2axx
   255     kRangeUnassigned,        //u2bxx
   256     kRangeUnassigned,        //u2cxx
   257     kRangeUnassigned,        //u2dxx
   258     kRangeSetCJK,            //u2exx
   259     kRangeSetCJK             //u2fxx
   260   },
   261   {  //table for ax--
   262     kRangeYi,                //ua0xx
   263     kRangeYi,                //ua1xx
   264     kRangeYi,                //ua2xx
   265     kRangeYi,                //ua3xx
   266     kRangeYi,                //ua4xx
   267     kRangeUnassigned,        //ua5xx
   268     kRangeUnassigned,        //ua6xx
   269     kRangeUnassigned,        //ua7xx
   270     kRangeUnassigned,        //ua8xx
   271     kRangeUnassigned,        //ua9xx
   272     kRangeUnassigned,        //uaaxx
   273     kRangeUnassigned,        //uabxx
   274     kRangeKorean,            //uacxx
   275     kRangeKorean,            //uadxx
   276     kRangeKorean,            //uaexx
   277     kRangeKorean             //uafxx
   278   },
   279   {  //table for dx--
   280     kRangeKorean,            //ud0xx
   281     kRangeKorean,            //ud1xx
   282     kRangeKorean,            //ud2xx
   283     kRangeKorean,            //ud3xx
   284     kRangeKorean,            //ud4xx
   285     kRangeKorean,            //ud5xx
   286     kRangeKorean,            //ud6xx
   287     kRangeKorean,            //ud7xx
   288     kRangeSurrogate,         //ud8xx
   289     kRangeSurrogate,         //ud9xx
   290     kRangeSurrogate,         //udaxx
   291     kRangeSurrogate,         //udbxx
   292     kRangeSurrogate,         //udcxx
   293     kRangeSurrogate,         //uddxx
   294     kRangeSurrogate,         //udexx
   295     kRangeSurrogate          //udfxx
   296   },
   297   { // table for fx--
   298     kRangePrivate,           //uf0xx 
   299     kRangePrivate,           //uf1xx 
   300     kRangePrivate,           //uf2xx 
   301     kRangePrivate,           //uf3xx 
   302     kRangePrivate,           //uf4xx 
   303     kRangePrivate,           //uf5xx 
   304     kRangePrivate,           //uf6xx 
   305     kRangePrivate,           //uf7xx 
   306     kRangePrivate,           //uf8xx 
   307     kRangeSetCJK,            //uf9xx 
   308     kRangeSetCJK,            //ufaxx 
   309     kRangeArabic,            //ufbxx, includes alphabic presentation form
   310     kRangeArabic,            //ufcxx
   311     kRangeArabic,            //ufdxx
   312     kRangeTableBase+8,       //ufexx
   313     kRangeTableBase+9        //uffxx, halfwidth and fullwidth forms, includes Specials
   314   },
   315   { //table for 0x0500 - 0x05ff
   316     kRangeCyrillic,          //u050x
   317     kRangeCyrillic,          //u051x
   318     kRangeCyrillic,          //u052x
   319     kRangeArmenian,          //u053x
   320     kRangeArmenian,          //u054x
   321     kRangeArmenian,          //u055x
   322     kRangeArmenian,          //u056x
   323     kRangeArmenian,          //u057x
   324     kRangeArmenian,          //u058x
   325     kRangeHebrew,            //u059x
   326     kRangeHebrew,            //u05ax
   327     kRangeHebrew,            //u05bx
   328     kRangeHebrew,            //u05cx
   329     kRangeHebrew,            //u05dx
   330     kRangeHebrew,            //u05ex
   331     kRangeHebrew             //u05fx
   332   },
   333   { //table for 0xfe00 - 0xfeff
   334     kRangeSetCJK,            //ufe0x
   335     kRangeSetCJK,            //ufe1x
   336     kRangeSetCJK,            //ufe2x
   337     kRangeSetCJK,            //ufe3x
   338     kRangeSetCJK,            //ufe4x
   339     kRangeSetCJK,            //ufe5x
   340     kRangeSetCJK,            //ufe6x
   341     kRangeArabic,            //ufe7x
   342     kRangeArabic,            //ufe8x
   343     kRangeArabic,            //ufe9x
   344     kRangeArabic,            //ufeax
   345     kRangeArabic,            //ufebx
   346     kRangeArabic,            //ufecx
   347     kRangeArabic,            //ufedx
   348     kRangeArabic,            //ufeex
   349     kRangeArabic             //ufefx
   350   },
   351   { //table for 0xff00 - 0xffff
   352     kRangeSetCJK,            //uff0x, fullwidth latin
   353     kRangeSetCJK,            //uff1x, fullwidth latin
   354     kRangeSetCJK,            //uff2x, fullwidth latin
   355     kRangeSetCJK,            //uff3x, fullwidth latin
   356     kRangeSetCJK,            //uff4x, fullwidth latin
   357     kRangeSetCJK,            //uff5x, fullwidth latin
   358     kRangeSetCJK,            //uff6x, halfwidth katakana
   359     kRangeSetCJK,            //uff7x, halfwidth katakana
   360     kRangeSetCJK,            //uff8x, halfwidth katakana
   361     kRangeSetCJK,            //uff9x, halfwidth katakana
   362     kRangeSetCJK,            //uffax, halfwidth hangul jamo
   363     kRangeSetCJK,            //uffbx, halfwidth hangul jamo
   364     kRangeSetCJK,            //uffcx, halfwidth hangul jamo
   365     kRangeSetCJK,            //uffdx, halfwidth hangul jamo
   366     kRangeSetCJK,            //uffex, fullwidth symbols
   367     kRangeSpecials,          //ufffx, Specials
   368   },
   369 };
   371 // Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80) 
   372 // code points  so that the number of entries in the tertiary range
   373 // table for that range is obtained by dividing (0x1700 - 0x0700) by 128.
   374 // Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal 
   375 // syllabaries take multiple chunks and Ogham and Runic share  a single chunk.
   376 #define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80)
   378 static const uint8_t gUnicodeTertiaryRangeTable[TERTIARY_TABLE_SIZE] =
   379 { //table for 0x0700 - 0x1600 
   380     kRangeSyriac,            //u070x
   381     kRangeThaana,            //u078x
   382     kRangeUnassigned,        //u080x  place holder(resolved in the 2ndary tab.)
   383     kRangeUnassigned,        //u088x  place holder(resolved in the 2ndary tab.)
   384     kRangeDevanagari,        //u090x
   385     kRangeBengali,           //u098x
   386     kRangeGurmukhi,          //u0a0x
   387     kRangeGujarati,          //u0a8x
   388     kRangeOriya,             //u0b0x
   389     kRangeTamil,             //u0b8x
   390     kRangeTelugu,            //u0c0x
   391     kRangeKannada,           //u0c8x
   392     kRangeMalayalam,         //u0d0x
   393     kRangeSinhala,           //u0d8x
   394     kRangeThai,              //u0e0x  
   395     kRangeLao,               //u0e8x
   396     kRangeTibetan,           //u0f0x  place holder(resolved in the 2ndary tab.)
   397     kRangeTibetan,           //u0f8x  place holder(resolved in the 2ndary tab.)
   398     kRangeMyanmar,           //u100x
   399     kRangeGeorgian,          //u108x
   400     kRangeKorean,            //u110x  place holder(resolved in the 2ndary tab.)
   401     kRangeKorean,            //u118x  place holder(resolved in the 2ndary tab.)
   402     kRangeEthiopic,          //u120x  place holder(resolved in the 2ndary tab.)
   403     kRangeEthiopic,          //u128x  place holder(resolved in the 2ndary tab.)
   404     kRangeEthiopic,          //u130x  
   405     kRangeCherokee,          //u138x
   406     kRangeCanadian,          //u140x  place holder(resolved in the 2ndary tab.)
   407     kRangeCanadian,          //u148x  place holder(resolved in the 2ndary tab.)
   408     kRangeCanadian,          //u150x  place holder(resolved in the 2ndary tab.)
   409     kRangeCanadian,          //u158x  place holder(resolved in the 2ndary tab.)
   410     kRangeCanadian,          //u160x  
   411     kRangeOghamRunic         //u168x  this contains two scripts, Ogham & Runic
   412 };
   414 // A two level index is almost enough for locating a range, with the 
   415 // exception of u03xx and u05xx. Since we don't really care about range for
   416 // combining diacritical marks in our font application, they are 
   417 // not discriminated further. But future adoption of this module for other use 
   418 // should be aware of this limitation. The implementation can be extended if 
   419 // there is such a need.
   420 // For Indic, Southeast Asian scripts and some other scripts between
   421 // U+0700 and U+16FF, it's extended to the third level.
   422 uint32_t FindCharUnicodeRange(uint32_t ch)
   423 {
   424   uint32_t range;
   426   // aggregate ranges for non-BMP codepoints
   427   if (ch > 0xFFFF) {
   428     uint32_t p = (ch >> 16);
   429     if (p == 1) {
   430         return kRangeSMP;
   431     } else if (p == 2) {
   432         return kRangeSetCJK;
   433     }
   434     return kRangeHigherPlanes;
   435   }
   437   // lookup explicit range for BMP codepoints
   438   // first general range
   439   range = gUnicodeSubrangeTable[0][ch >> 12];
   441   // if general range is good enough, return that
   442   if (range < kRangeTableBase)
   443     // we try to get a specific range 
   444     return range;
   446   // otherwise, use subrange tables
   447   range = gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x0f00) >> 8];
   448   if (range < kRangeTableBase)
   449     return range;
   450   if (range < kRangeTertiaryTable)
   451     return gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x00f0) >> 4];
   453   // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks
   454   return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7];
   455 }
   457 nsIAtom *LangGroupFromUnicodeRange(uint8_t unicodeRange)
   458 {
   459   if (kRangeSpecificItemNum > unicodeRange) {
   460     nsIAtom **atom = gUnicodeRangeToLangGroupAtomTable[unicodeRange];
   461     return *atom;
   462   }
   463   return nullptr;
   464 }

mercurial