michael@0: /* michael@0: * Copyright © 2009 Red Hat, Inc. michael@0: * Copyright © 2009 Keith Stribley michael@0: * Copyright © 2011 Google, Inc. michael@0: * michael@0: * This is part of HarfBuzz, a text shaping library. michael@0: * michael@0: * Permission is hereby granted, without written agreement and without michael@0: * license or royalty fees, to use, copy, modify, and distribute this michael@0: * software and its documentation for any purpose, provided that the michael@0: * above copyright notice and the following two paragraphs appear in michael@0: * all copies of this software. michael@0: * michael@0: * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR michael@0: * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES michael@0: * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN michael@0: * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH michael@0: * DAMAGE. michael@0: * michael@0: * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, michael@0: * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND michael@0: * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS michael@0: * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO michael@0: * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. michael@0: * michael@0: * Red Hat Author(s): Behdad Esfahbod michael@0: * Google Author(s): Behdad Esfahbod michael@0: */ michael@0: michael@0: #include "hb-private.hh" michael@0: michael@0: #include "hb-icu.h" michael@0: michael@0: #include "hb-unicode-private.hh" michael@0: michael@0: #include michael@0: #include michael@0: #include michael@0: #include michael@0: michael@0: michael@0: hb_script_t michael@0: hb_icu_script_to_script (UScriptCode script) michael@0: { michael@0: if (unlikely (script == USCRIPT_INVALID_CODE)) michael@0: return HB_SCRIPT_INVALID; michael@0: michael@0: return hb_script_from_string (uscript_getShortName (script), -1); michael@0: } michael@0: michael@0: UScriptCode michael@0: hb_icu_script_from_script (hb_script_t script) michael@0: { michael@0: if (unlikely (script == HB_SCRIPT_INVALID)) michael@0: return USCRIPT_INVALID_CODE; michael@0: michael@0: for (unsigned int i = 0; i < USCRIPT_CODE_LIMIT; i++) michael@0: if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script)) michael@0: return (UScriptCode) i; michael@0: michael@0: return USCRIPT_UNKNOWN; michael@0: } michael@0: michael@0: michael@0: static hb_unicode_combining_class_t michael@0: hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED, michael@0: hb_codepoint_t unicode, michael@0: void *user_data HB_UNUSED) michael@0: michael@0: { michael@0: return (hb_unicode_combining_class_t) u_getCombiningClass (unicode); michael@0: } michael@0: michael@0: static unsigned int michael@0: hb_icu_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs HB_UNUSED, michael@0: hb_codepoint_t unicode, michael@0: void *user_data HB_UNUSED) michael@0: { michael@0: switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH)) michael@0: { michael@0: case U_EA_WIDE: michael@0: case U_EA_FULLWIDTH: michael@0: return 2; michael@0: case U_EA_NEUTRAL: michael@0: case U_EA_AMBIGUOUS: michael@0: case U_EA_HALFWIDTH: michael@0: case U_EA_NARROW: michael@0: return 1; michael@0: } michael@0: return 1; michael@0: } michael@0: michael@0: static hb_unicode_general_category_t michael@0: hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED, michael@0: hb_codepoint_t unicode, michael@0: void *user_data HB_UNUSED) michael@0: { michael@0: switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY)) michael@0: { michael@0: case U_UNASSIGNED: return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; michael@0: michael@0: case U_UPPERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER; michael@0: case U_LOWERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER; michael@0: case U_TITLECASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER; michael@0: case U_MODIFIER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER; michael@0: case U_OTHER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER; michael@0: michael@0: case U_NON_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK; michael@0: case U_ENCLOSING_MARK: return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK; michael@0: case U_COMBINING_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK; michael@0: michael@0: case U_DECIMAL_DIGIT_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER; michael@0: case U_LETTER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER; michael@0: case U_OTHER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER; michael@0: michael@0: case U_SPACE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR; michael@0: case U_LINE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR; michael@0: case U_PARAGRAPH_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR; michael@0: michael@0: case U_CONTROL_CHAR: return HB_UNICODE_GENERAL_CATEGORY_CONTROL; michael@0: case U_FORMAT_CHAR: return HB_UNICODE_GENERAL_CATEGORY_FORMAT; michael@0: case U_PRIVATE_USE_CHAR: return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE; michael@0: case U_SURROGATE: return HB_UNICODE_GENERAL_CATEGORY_SURROGATE; michael@0: michael@0: michael@0: case U_DASH_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION; michael@0: case U_START_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION; michael@0: case U_END_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION; michael@0: case U_CONNECTOR_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION; michael@0: case U_OTHER_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION; michael@0: michael@0: case U_MATH_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL; michael@0: case U_CURRENCY_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL; michael@0: case U_MODIFIER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL; michael@0: case U_OTHER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL; michael@0: michael@0: case U_INITIAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION; michael@0: case U_FINAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION; michael@0: } michael@0: michael@0: return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; michael@0: } michael@0: michael@0: static hb_codepoint_t michael@0: hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED, michael@0: hb_codepoint_t unicode, michael@0: void *user_data HB_UNUSED) michael@0: { michael@0: return u_charMirror(unicode); michael@0: } michael@0: michael@0: static hb_script_t michael@0: hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED, michael@0: hb_codepoint_t unicode, michael@0: void *user_data HB_UNUSED) michael@0: { michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: UScriptCode scriptCode = uscript_getScript(unicode, &status); michael@0: michael@0: if (unlikely (U_FAILURE (status))) michael@0: return HB_SCRIPT_UNKNOWN; michael@0: michael@0: return hb_icu_script_to_script (scriptCode); michael@0: } michael@0: michael@0: #if U_ICU_VERSION_MAJOR_NUM >= 49 michael@0: static const UNormalizer2 *normalizer; michael@0: #endif michael@0: michael@0: static hb_bool_t michael@0: hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED, michael@0: hb_codepoint_t a, michael@0: hb_codepoint_t b, michael@0: hb_codepoint_t *ab, michael@0: void *user_data HB_UNUSED) michael@0: { michael@0: #if U_ICU_VERSION_MAJOR_NUM >= 49 michael@0: { michael@0: UChar32 ret = unorm2_composePair (normalizer, a, b); michael@0: if (ret < 0) return false; michael@0: *ab = ret; michael@0: return true; michael@0: } michael@0: #endif michael@0: michael@0: /* We don't ifdef-out the fallback code such that compiler always michael@0: * sees it and makes sure it's compilable. */ michael@0: michael@0: UChar utf16[4], normalized[5]; michael@0: unsigned int len; michael@0: hb_bool_t ret, err; michael@0: UErrorCode icu_err; michael@0: michael@0: len = 0; michael@0: err = false; michael@0: U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err); michael@0: if (err) return false; michael@0: U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err); michael@0: if (err) return false; michael@0: michael@0: icu_err = U_ZERO_ERROR; michael@0: len = unorm_normalize (utf16, len, UNORM_NFC, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); michael@0: if (U_FAILURE (icu_err)) michael@0: return false; michael@0: if (u_countChar32 (normalized, len) == 1) { michael@0: U16_GET_UNSAFE (normalized, 0, *ab); michael@0: ret = true; michael@0: } else { michael@0: ret = false; michael@0: } michael@0: michael@0: return ret; michael@0: } michael@0: michael@0: static hb_bool_t michael@0: hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, michael@0: hb_codepoint_t ab, michael@0: hb_codepoint_t *a, michael@0: hb_codepoint_t *b, michael@0: void *user_data HB_UNUSED) michael@0: { michael@0: #if U_ICU_VERSION_MAJOR_NUM >= 49 michael@0: { michael@0: UChar decomposed[4]; michael@0: int len; michael@0: UErrorCode icu_err = U_ZERO_ERROR; michael@0: len = unorm2_getRawDecomposition (normalizer, ab, decomposed, michael@0: ARRAY_LENGTH (decomposed), &icu_err); michael@0: if (U_FAILURE (icu_err) || len < 0) return false; michael@0: michael@0: len = u_countChar32 (decomposed, len); michael@0: if (len == 1) { michael@0: U16_GET_UNSAFE (decomposed, 0, *a); michael@0: *b = 0; michael@0: return *a != ab; michael@0: } else if (len == 2) { michael@0: len =0; michael@0: U16_NEXT_UNSAFE (decomposed, len, *a); michael@0: U16_NEXT_UNSAFE (decomposed, len, *b); michael@0: } michael@0: return true; michael@0: } michael@0: #endif michael@0: michael@0: /* We don't ifdef-out the fallback code such that compiler always michael@0: * sees it and makes sure it's compilable. */ michael@0: michael@0: UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1]; michael@0: unsigned int len; michael@0: hb_bool_t ret, err; michael@0: UErrorCode icu_err; michael@0: michael@0: /* This function is a monster! Maybe it wasn't a good idea adding a michael@0: * pairwise decompose API... */ michael@0: /* Watchout for the dragons. Err, watchout for macros changing len. */ michael@0: michael@0: len = 0; michael@0: err = false; michael@0: U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err); michael@0: if (err) return false; michael@0: michael@0: icu_err = U_ZERO_ERROR; michael@0: len = unorm_normalize (utf16, len, UNORM_NFD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); michael@0: if (U_FAILURE (icu_err)) michael@0: return false; michael@0: michael@0: len = u_countChar32 (normalized, len); michael@0: michael@0: if (len == 1) { michael@0: U16_GET_UNSAFE (normalized, 0, *a); michael@0: *b = 0; michael@0: ret = *a != ab; michael@0: } else if (len == 2) { michael@0: len =0; michael@0: U16_NEXT_UNSAFE (normalized, len, *a); michael@0: U16_NEXT_UNSAFE (normalized, len, *b); michael@0: michael@0: /* Here's the ugly part: if ab decomposes to a single character and michael@0: * that character decomposes again, we have to detect that and undo michael@0: * the second part :-(. */ michael@0: UChar recomposed[20]; michael@0: icu_err = U_ZERO_ERROR; michael@0: unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err); michael@0: if (U_FAILURE (icu_err)) michael@0: return false; michael@0: hb_codepoint_t c; michael@0: U16_GET_UNSAFE (recomposed, 0, c); michael@0: if (c != *a && c != ab) { michael@0: *a = c; michael@0: *b = 0; michael@0: } michael@0: ret = true; michael@0: } else { michael@0: /* If decomposed to more than two characters, take the last one, michael@0: * and recompose the rest to get the first component. */ michael@0: U16_PREV_UNSAFE (normalized, len, *b); /* Changes len in-place. */ michael@0: UChar recomposed[18 * 2]; michael@0: icu_err = U_ZERO_ERROR; michael@0: len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err); michael@0: if (U_FAILURE (icu_err)) michael@0: return false; michael@0: /* We expect that recomposed has exactly one character now. */ michael@0: if (unlikely (u_countChar32 (recomposed, len) != 1)) michael@0: return false; michael@0: U16_GET_UNSAFE (recomposed, 0, *a); michael@0: ret = true; michael@0: } michael@0: michael@0: return ret; michael@0: } michael@0: michael@0: static unsigned int michael@0: hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED, michael@0: hb_codepoint_t u, michael@0: hb_codepoint_t *decomposed, michael@0: void *user_data HB_UNUSED) michael@0: { michael@0: UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1]; michael@0: unsigned int len; michael@0: int32_t utf32_len; michael@0: hb_bool_t err; michael@0: UErrorCode icu_err; michael@0: michael@0: /* Copy @u into a UTF-16 array to be passed to ICU. */ michael@0: len = 0; michael@0: err = FALSE; michael@0: U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), u, err); michael@0: if (err) michael@0: return 0; michael@0: michael@0: /* Normalise the codepoint using NFKD mode. */ michael@0: icu_err = U_ZERO_ERROR; michael@0: len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); michael@0: if (icu_err) michael@0: return 0; michael@0: michael@0: /* Convert the decomposed form from UTF-16 to UTF-32. */ michael@0: icu_err = U_ZERO_ERROR; michael@0: u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err); michael@0: if (icu_err) michael@0: return 0; michael@0: michael@0: return utf32_len; michael@0: } michael@0: michael@0: michael@0: hb_unicode_funcs_t * michael@0: hb_icu_get_unicode_funcs (void) michael@0: { michael@0: static const hb_unicode_funcs_t _hb_icu_unicode_funcs = { michael@0: HB_OBJECT_HEADER_STATIC, michael@0: michael@0: NULL, /* parent */ michael@0: true, /* immutable */ michael@0: { michael@0: #define HB_UNICODE_FUNC_IMPLEMENT(name) hb_icu_unicode_##name, michael@0: HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS michael@0: #undef HB_UNICODE_FUNC_IMPLEMENT michael@0: } michael@0: }; michael@0: michael@0: #if U_ICU_VERSION_MAJOR_NUM >= 49 michael@0: if (!hb_atomic_ptr_get (&normalizer)) { michael@0: UErrorCode icu_err = U_ZERO_ERROR; michael@0: /* We ignore failure in getNFCInstace(). */ michael@0: hb_atomic_ptr_cmpexch (&normalizer, NULL, unorm2_getNFCInstance (&icu_err)); michael@0: } michael@0: #endif michael@0: return const_cast (&_hb_icu_unicode_funcs); michael@0: } michael@0: michael@0: