The Tor Browser: gfx/harfbuzz/src/hb-icu.cc@925c144e1f1f

     1 /*

     2  * Copyright © 2009  Red Hat, Inc.

     3  * Copyright © 2009  Keith Stribley

     4  * Copyright © 2011  Google, Inc.

     5  *

     6  *  This is part of HarfBuzz, a text shaping library.

     7  *

     8  * Permission is hereby granted, without written agreement and without

     9  * license or royalty fees, to use, copy, modify, and distribute this

    10  * software and its documentation for any purpose, provided that the

    11  * above copyright notice and the following two paragraphs appear in

    12  * all copies of this software.

    13  *

    14  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR

    15  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES

    16  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN

    17  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH

    18  * DAMAGE.

    19  *

    20  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,

    21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

    22  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS

    23  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO

    24  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.

    25  *

    26  * Red Hat Author(s): Behdad Esfahbod

    27  * Google Author(s): Behdad Esfahbod

    28  */

    30 #include "hb-private.hh"

    32 #include "hb-icu.h"

    34 #include "hb-unicode-private.hh"

    36 #include <unicode/uchar.h>

    37 #include <unicode/unorm.h>

    38 #include <unicode/ustring.h>

    39 #include <unicode/uversion.h>

    42 hb_script_t

    43 hb_icu_script_to_script (UScriptCode script)

    44 {

    45   if (unlikely (script == USCRIPT_INVALID_CODE))

    46     return HB_SCRIPT_INVALID;

    48   return hb_script_from_string (uscript_getShortName (script), -1);

    49 }

    51 UScriptCode

    52 hb_icu_script_from_script (hb_script_t script)

    53 {

    54   if (unlikely (script == HB_SCRIPT_INVALID))

    55     return USCRIPT_INVALID_CODE;

    57   for (unsigned int i = 0; i < USCRIPT_CODE_LIMIT; i++)

    58     if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script))

    59       return (UScriptCode) i;

    61   return USCRIPT_UNKNOWN;

    62 }

    65 static hb_unicode_combining_class_t

    66 hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED,

    67 				hb_codepoint_t      unicode,

    68 				void               *user_data HB_UNUSED)

    70 {

    71   return (hb_unicode_combining_class_t) u_getCombiningClass (unicode);

    72 }

    74 static unsigned int

    75 hb_icu_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs HB_UNUSED,

    76 				hb_codepoint_t      unicode,

    77 				void               *user_data HB_UNUSED)

    78 {

    79   switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH))

    80   {

    81   case U_EA_WIDE:

    82   case U_EA_FULLWIDTH:

    83     return 2;

    84   case U_EA_NEUTRAL:

    85   case U_EA_AMBIGUOUS:

    86   case U_EA_HALFWIDTH:

    87   case U_EA_NARROW:

    88     return 1;

    89   }

    90   return 1;

    91 }

    93 static hb_unicode_general_category_t

    94 hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED,

    95 				 hb_codepoint_t      unicode,

    96 				 void               *user_data HB_UNUSED)

    97 {

    98   switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY))

    99   {

   100   case U_UNASSIGNED:			return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;

   102   case U_UPPERCASE_LETTER:		return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER;

   103   case U_LOWERCASE_LETTER:		return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER;

   104   case U_TITLECASE_LETTER:		return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER;

   105   case U_MODIFIER_LETTER:		return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER;

   106   case U_OTHER_LETTER:			return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER;

   108   case U_NON_SPACING_MARK:		return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK;

   109   case U_ENCLOSING_MARK:		return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK;

   110   case U_COMBINING_SPACING_MARK:	return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK;

   112   case U_DECIMAL_DIGIT_NUMBER:		return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER;

   113   case U_LETTER_NUMBER:			return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER;

   114   case U_OTHER_NUMBER:			return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER;

   116   case U_SPACE_SEPARATOR:		return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR;

   117   case U_LINE_SEPARATOR:		return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR;

   118   case U_PARAGRAPH_SEPARATOR:		return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR;

   120   case U_CONTROL_CHAR:			return HB_UNICODE_GENERAL_CATEGORY_CONTROL;

   121   case U_FORMAT_CHAR:			return HB_UNICODE_GENERAL_CATEGORY_FORMAT;

   122   case U_PRIVATE_USE_CHAR:		return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE;

   123   case U_SURROGATE:			return HB_UNICODE_GENERAL_CATEGORY_SURROGATE;

   126   case U_DASH_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION;

   127   case U_START_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION;

   128   case U_END_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION;

   129   case U_CONNECTOR_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION;

   130   case U_OTHER_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION;

   132   case U_MATH_SYMBOL:			return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL;

   133   case U_CURRENCY_SYMBOL:		return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL;

   134   case U_MODIFIER_SYMBOL:		return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL;

   135   case U_OTHER_SYMBOL:			return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL;

   137   case U_INITIAL_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION;

   138   case U_FINAL_PUNCTUATION:		return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION;

   139   }

   141   return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;

   142 }

   144 static hb_codepoint_t

   145 hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED,

   146 			  hb_codepoint_t      unicode,

   147 			  void               *user_data HB_UNUSED)

   148 {

   149   return u_charMirror(unicode);

   150 }

   152 static hb_script_t

   153 hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED,

   154 		       hb_codepoint_t      unicode,

   155 		       void               *user_data HB_UNUSED)

   156 {

   157   UErrorCode status = U_ZERO_ERROR;

   158   UScriptCode scriptCode = uscript_getScript(unicode, &status);

   160   if (unlikely (U_FAILURE (status)))

   161     return HB_SCRIPT_UNKNOWN;

   163   return hb_icu_script_to_script (scriptCode);

   164 }

   166 #if U_ICU_VERSION_MAJOR_NUM >= 49

   167 static const UNormalizer2 *normalizer;

   168 #endif

   170 static hb_bool_t

   171 hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,

   172 			hb_codepoint_t      a,

   173 			hb_codepoint_t      b,

   174 			hb_codepoint_t     *ab,

   175 			void               *user_data HB_UNUSED)

   176 {

   177 #if U_ICU_VERSION_MAJOR_NUM >= 49

   178   {

   179     UChar32 ret = unorm2_composePair (normalizer, a, b);

   180     if (ret < 0) return false;

   181     *ab = ret;

   182     return true;

   183   }

   184 #endif

   186   /* We don't ifdef-out the fallback code such that compiler always

   187    * sees it and makes sure it's compilable. */

   189   UChar utf16[4], normalized[5];

   190   unsigned int len;

   191   hb_bool_t ret, err;

   192   UErrorCode icu_err;

   194   len = 0;

   195   err = false;

   196   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err);

   197   if (err) return false;

   198   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err);

   199   if (err) return false;

   201   icu_err = U_ZERO_ERROR;

   202   len = unorm_normalize (utf16, len, UNORM_NFC, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);

   203   if (U_FAILURE (icu_err))

   204     return false;

   205   if (u_countChar32 (normalized, len) == 1) {

   206     U16_GET_UNSAFE (normalized, 0, *ab);

   207     ret = true;

   208   } else {

   209     ret = false;

   210   }

   212   return ret;

   213 }

   215 static hb_bool_t

   216 hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,

   217 			  hb_codepoint_t      ab,

   218 			  hb_codepoint_t     *a,

   219 			  hb_codepoint_t     *b,

   220 			  void               *user_data HB_UNUSED)

   221 {

   222 #if U_ICU_VERSION_MAJOR_NUM >= 49

   223   {

   224     UChar decomposed[4];

   225     int len;

   226     UErrorCode icu_err = U_ZERO_ERROR;

   227     len = unorm2_getRawDecomposition (normalizer, ab, decomposed,

   228 				      ARRAY_LENGTH (decomposed), &icu_err);

   229     if (U_FAILURE (icu_err) || len < 0) return false;

   231     len = u_countChar32 (decomposed, len);

   232     if (len == 1) {

   233       U16_GET_UNSAFE (decomposed, 0, *a);

   234       *b = 0;

   235       return *a != ab;

   236     } else if (len == 2) {

   237       len =0;

   238       U16_NEXT_UNSAFE (decomposed, len, *a);

   239       U16_NEXT_UNSAFE (decomposed, len, *b);

   240     }

   241     return true;

   242   }

   243 #endif

   245   /* We don't ifdef-out the fallback code such that compiler always

   246    * sees it and makes sure it's compilable. */

   248   UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];

   249   unsigned int len;

   250   hb_bool_t ret, err;

   251   UErrorCode icu_err;

   253   /* This function is a monster! Maybe it wasn't a good idea adding a

   254    * pairwise decompose API... */

   255   /* Watchout for the dragons.  Err, watchout for macros changing len. */

   257   len = 0;

   258   err = false;

   259   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err);

   260   if (err) return false;

   262   icu_err = U_ZERO_ERROR;

   263   len = unorm_normalize (utf16, len, UNORM_NFD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);

   264   if (U_FAILURE (icu_err))

   265     return false;

   267   len = u_countChar32 (normalized, len);

   269   if (len == 1) {

   270     U16_GET_UNSAFE (normalized, 0, *a);

   271     *b = 0;

   272     ret = *a != ab;

   273   } else if (len == 2) {

   274     len =0;

   275     U16_NEXT_UNSAFE (normalized, len, *a);

   276     U16_NEXT_UNSAFE (normalized, len, *b);

   278     /* Here's the ugly part: if ab decomposes to a single character and

   279      * that character decomposes again, we have to detect that and undo

   280      * the second part :-(. */

   281     UChar recomposed[20];

   282     icu_err = U_ZERO_ERROR;

   283     unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);

   284     if (U_FAILURE (icu_err))

   285       return false;

   286     hb_codepoint_t c;

   287     U16_GET_UNSAFE (recomposed, 0, c);

   288     if (c != *a && c != ab) {

   289       *a = c;

   290       *b = 0;

   291     }

   292     ret = true;

   293   } else {

   294     /* If decomposed to more than two characters, take the last one,

   295      * and recompose the rest to get the first component. */

   296     U16_PREV_UNSAFE (normalized, len, *b); /* Changes len in-place. */

   297     UChar recomposed[18 * 2];

   298     icu_err = U_ZERO_ERROR;

   299     len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);

   300     if (U_FAILURE (icu_err))

   301       return false;

   302     /* We expect that recomposed has exactly one character now. */

   303     if (unlikely (u_countChar32 (recomposed, len) != 1))

   304       return false;

   305     U16_GET_UNSAFE (recomposed, 0, *a);

   306     ret = true;

   307   }

   309   return ret;

   310 }

   312 static unsigned int

   313 hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED,

   314 					hb_codepoint_t      u,

   315 					hb_codepoint_t     *decomposed,

   316 					void               *user_data HB_UNUSED)

   317 {

   318   UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];

   319   unsigned int len;

   320   int32_t utf32_len;

   321   hb_bool_t err;

   322   UErrorCode icu_err;

   324   /* Copy @u into a UTF-16 array to be passed to ICU. */

   325   len = 0;

   326   err = FALSE;

   327   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), u, err);

   328   if (err)

   329     return 0;

   331   /* Normalise the codepoint using NFKD mode. */

   332   icu_err = U_ZERO_ERROR;

   333   len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);

   334   if (icu_err)

   335     return 0;

   337   /* Convert the decomposed form from UTF-16 to UTF-32. */

   338   icu_err = U_ZERO_ERROR;

   339   u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err);

   340   if (icu_err)

   341     return 0;

   343   return utf32_len;

   344 }

   347 hb_unicode_funcs_t *

   348 hb_icu_get_unicode_funcs (void)

   349 {

   350   static const hb_unicode_funcs_t _hb_icu_unicode_funcs = {

   351     HB_OBJECT_HEADER_STATIC,

   353     NULL, /* parent */

   354     true, /* immutable */

   355     {

   356 #define HB_UNICODE_FUNC_IMPLEMENT(name) hb_icu_unicode_##name,

   357       HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS

   358 #undef HB_UNICODE_FUNC_IMPLEMENT

   359     }

   360   };

   362 #if U_ICU_VERSION_MAJOR_NUM >= 49

   363   if (!hb_atomic_ptr_get (&normalizer)) {

   364     UErrorCode icu_err = U_ZERO_ERROR;

   365     /* We ignore failure in getNFCInstace(). */

   366     hb_atomic_ptr_cmpexch (&normalizer, NULL, unorm2_getNFCInstance (&icu_err));

   367   }

   368 #endif

   369   return const_cast<hb_unicode_funcs_t *> (&_hb_icu_unicode_funcs);

   370 }

The Tor Browser / file revision

gfx/harfbuzz/src/hb-icu.cc@925c144e1f1f

gfx/harfbuzz/src/hb-icu.cc