|
1 /* |
|
2 * Copyright © 2009 Red Hat, Inc. |
|
3 * Copyright © 2009 Keith Stribley |
|
4 * Copyright © 2011 Google, Inc. |
|
5 * |
|
6 * This is part of HarfBuzz, a text shaping library. |
|
7 * |
|
8 * Permission is hereby granted, without written agreement and without |
|
9 * license or royalty fees, to use, copy, modify, and distribute this |
|
10 * software and its documentation for any purpose, provided that the |
|
11 * above copyright notice and the following two paragraphs appear in |
|
12 * all copies of this software. |
|
13 * |
|
14 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR |
|
15 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
|
16 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN |
|
17 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH |
|
18 * DAMAGE. |
|
19 * |
|
20 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, |
|
21 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
|
22 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS |
|
23 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO |
|
24 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. |
|
25 * |
|
26 * Red Hat Author(s): Behdad Esfahbod |
|
27 * Google Author(s): Behdad Esfahbod |
|
28 */ |
|
29 |
|
30 #include "hb-private.hh" |
|
31 |
|
32 #include "hb-icu.h" |
|
33 |
|
34 #include "hb-unicode-private.hh" |
|
35 |
|
36 #include <unicode/uchar.h> |
|
37 #include <unicode/unorm.h> |
|
38 #include <unicode/ustring.h> |
|
39 #include <unicode/uversion.h> |
|
40 |
|
41 |
|
42 hb_script_t |
|
43 hb_icu_script_to_script (UScriptCode script) |
|
44 { |
|
45 if (unlikely (script == USCRIPT_INVALID_CODE)) |
|
46 return HB_SCRIPT_INVALID; |
|
47 |
|
48 return hb_script_from_string (uscript_getShortName (script), -1); |
|
49 } |
|
50 |
|
51 UScriptCode |
|
52 hb_icu_script_from_script (hb_script_t script) |
|
53 { |
|
54 if (unlikely (script == HB_SCRIPT_INVALID)) |
|
55 return USCRIPT_INVALID_CODE; |
|
56 |
|
57 for (unsigned int i = 0; i < USCRIPT_CODE_LIMIT; i++) |
|
58 if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script)) |
|
59 return (UScriptCode) i; |
|
60 |
|
61 return USCRIPT_UNKNOWN; |
|
62 } |
|
63 |
|
64 |
|
65 static hb_unicode_combining_class_t |
|
66 hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
|
67 hb_codepoint_t unicode, |
|
68 void *user_data HB_UNUSED) |
|
69 |
|
70 { |
|
71 return (hb_unicode_combining_class_t) u_getCombiningClass (unicode); |
|
72 } |
|
73 |
|
74 static unsigned int |
|
75 hb_icu_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
|
76 hb_codepoint_t unicode, |
|
77 void *user_data HB_UNUSED) |
|
78 { |
|
79 switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH)) |
|
80 { |
|
81 case U_EA_WIDE: |
|
82 case U_EA_FULLWIDTH: |
|
83 return 2; |
|
84 case U_EA_NEUTRAL: |
|
85 case U_EA_AMBIGUOUS: |
|
86 case U_EA_HALFWIDTH: |
|
87 case U_EA_NARROW: |
|
88 return 1; |
|
89 } |
|
90 return 1; |
|
91 } |
|
92 |
|
93 static hb_unicode_general_category_t |
|
94 hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
|
95 hb_codepoint_t unicode, |
|
96 void *user_data HB_UNUSED) |
|
97 { |
|
98 switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY)) |
|
99 { |
|
100 case U_UNASSIGNED: return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; |
|
101 |
|
102 case U_UPPERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER; |
|
103 case U_LOWERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER; |
|
104 case U_TITLECASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER; |
|
105 case U_MODIFIER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER; |
|
106 case U_OTHER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER; |
|
107 |
|
108 case U_NON_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK; |
|
109 case U_ENCLOSING_MARK: return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK; |
|
110 case U_COMBINING_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK; |
|
111 |
|
112 case U_DECIMAL_DIGIT_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER; |
|
113 case U_LETTER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER; |
|
114 case U_OTHER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER; |
|
115 |
|
116 case U_SPACE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR; |
|
117 case U_LINE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR; |
|
118 case U_PARAGRAPH_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR; |
|
119 |
|
120 case U_CONTROL_CHAR: return HB_UNICODE_GENERAL_CATEGORY_CONTROL; |
|
121 case U_FORMAT_CHAR: return HB_UNICODE_GENERAL_CATEGORY_FORMAT; |
|
122 case U_PRIVATE_USE_CHAR: return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE; |
|
123 case U_SURROGATE: return HB_UNICODE_GENERAL_CATEGORY_SURROGATE; |
|
124 |
|
125 |
|
126 case U_DASH_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION; |
|
127 case U_START_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION; |
|
128 case U_END_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION; |
|
129 case U_CONNECTOR_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION; |
|
130 case U_OTHER_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION; |
|
131 |
|
132 case U_MATH_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL; |
|
133 case U_CURRENCY_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL; |
|
134 case U_MODIFIER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL; |
|
135 case U_OTHER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL; |
|
136 |
|
137 case U_INITIAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION; |
|
138 case U_FINAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION; |
|
139 } |
|
140 |
|
141 return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; |
|
142 } |
|
143 |
|
144 static hb_codepoint_t |
|
145 hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
|
146 hb_codepoint_t unicode, |
|
147 void *user_data HB_UNUSED) |
|
148 { |
|
149 return u_charMirror(unicode); |
|
150 } |
|
151 |
|
152 static hb_script_t |
|
153 hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
|
154 hb_codepoint_t unicode, |
|
155 void *user_data HB_UNUSED) |
|
156 { |
|
157 UErrorCode status = U_ZERO_ERROR; |
|
158 UScriptCode scriptCode = uscript_getScript(unicode, &status); |
|
159 |
|
160 if (unlikely (U_FAILURE (status))) |
|
161 return HB_SCRIPT_UNKNOWN; |
|
162 |
|
163 return hb_icu_script_to_script (scriptCode); |
|
164 } |
|
165 |
|
166 #if U_ICU_VERSION_MAJOR_NUM >= 49 |
|
167 static const UNormalizer2 *normalizer; |
|
168 #endif |
|
169 |
|
170 static hb_bool_t |
|
171 hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
|
172 hb_codepoint_t a, |
|
173 hb_codepoint_t b, |
|
174 hb_codepoint_t *ab, |
|
175 void *user_data HB_UNUSED) |
|
176 { |
|
177 #if U_ICU_VERSION_MAJOR_NUM >= 49 |
|
178 { |
|
179 UChar32 ret = unorm2_composePair (normalizer, a, b); |
|
180 if (ret < 0) return false; |
|
181 *ab = ret; |
|
182 return true; |
|
183 } |
|
184 #endif |
|
185 |
|
186 /* We don't ifdef-out the fallback code such that compiler always |
|
187 * sees it and makes sure it's compilable. */ |
|
188 |
|
189 UChar utf16[4], normalized[5]; |
|
190 unsigned int len; |
|
191 hb_bool_t ret, err; |
|
192 UErrorCode icu_err; |
|
193 |
|
194 len = 0; |
|
195 err = false; |
|
196 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err); |
|
197 if (err) return false; |
|
198 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err); |
|
199 if (err) return false; |
|
200 |
|
201 icu_err = U_ZERO_ERROR; |
|
202 len = unorm_normalize (utf16, len, UNORM_NFC, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); |
|
203 if (U_FAILURE (icu_err)) |
|
204 return false; |
|
205 if (u_countChar32 (normalized, len) == 1) { |
|
206 U16_GET_UNSAFE (normalized, 0, *ab); |
|
207 ret = true; |
|
208 } else { |
|
209 ret = false; |
|
210 } |
|
211 |
|
212 return ret; |
|
213 } |
|
214 |
|
215 static hb_bool_t |
|
216 hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
|
217 hb_codepoint_t ab, |
|
218 hb_codepoint_t *a, |
|
219 hb_codepoint_t *b, |
|
220 void *user_data HB_UNUSED) |
|
221 { |
|
222 #if U_ICU_VERSION_MAJOR_NUM >= 49 |
|
223 { |
|
224 UChar decomposed[4]; |
|
225 int len; |
|
226 UErrorCode icu_err = U_ZERO_ERROR; |
|
227 len = unorm2_getRawDecomposition (normalizer, ab, decomposed, |
|
228 ARRAY_LENGTH (decomposed), &icu_err); |
|
229 if (U_FAILURE (icu_err) || len < 0) return false; |
|
230 |
|
231 len = u_countChar32 (decomposed, len); |
|
232 if (len == 1) { |
|
233 U16_GET_UNSAFE (decomposed, 0, *a); |
|
234 *b = 0; |
|
235 return *a != ab; |
|
236 } else if (len == 2) { |
|
237 len =0; |
|
238 U16_NEXT_UNSAFE (decomposed, len, *a); |
|
239 U16_NEXT_UNSAFE (decomposed, len, *b); |
|
240 } |
|
241 return true; |
|
242 } |
|
243 #endif |
|
244 |
|
245 /* We don't ifdef-out the fallback code such that compiler always |
|
246 * sees it and makes sure it's compilable. */ |
|
247 |
|
248 UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1]; |
|
249 unsigned int len; |
|
250 hb_bool_t ret, err; |
|
251 UErrorCode icu_err; |
|
252 |
|
253 /* This function is a monster! Maybe it wasn't a good idea adding a |
|
254 * pairwise decompose API... */ |
|
255 /* Watchout for the dragons. Err, watchout for macros changing len. */ |
|
256 |
|
257 len = 0; |
|
258 err = false; |
|
259 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err); |
|
260 if (err) return false; |
|
261 |
|
262 icu_err = U_ZERO_ERROR; |
|
263 len = unorm_normalize (utf16, len, UNORM_NFD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); |
|
264 if (U_FAILURE (icu_err)) |
|
265 return false; |
|
266 |
|
267 len = u_countChar32 (normalized, len); |
|
268 |
|
269 if (len == 1) { |
|
270 U16_GET_UNSAFE (normalized, 0, *a); |
|
271 *b = 0; |
|
272 ret = *a != ab; |
|
273 } else if (len == 2) { |
|
274 len =0; |
|
275 U16_NEXT_UNSAFE (normalized, len, *a); |
|
276 U16_NEXT_UNSAFE (normalized, len, *b); |
|
277 |
|
278 /* Here's the ugly part: if ab decomposes to a single character and |
|
279 * that character decomposes again, we have to detect that and undo |
|
280 * the second part :-(. */ |
|
281 UChar recomposed[20]; |
|
282 icu_err = U_ZERO_ERROR; |
|
283 unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err); |
|
284 if (U_FAILURE (icu_err)) |
|
285 return false; |
|
286 hb_codepoint_t c; |
|
287 U16_GET_UNSAFE (recomposed, 0, c); |
|
288 if (c != *a && c != ab) { |
|
289 *a = c; |
|
290 *b = 0; |
|
291 } |
|
292 ret = true; |
|
293 } else { |
|
294 /* If decomposed to more than two characters, take the last one, |
|
295 * and recompose the rest to get the first component. */ |
|
296 U16_PREV_UNSAFE (normalized, len, *b); /* Changes len in-place. */ |
|
297 UChar recomposed[18 * 2]; |
|
298 icu_err = U_ZERO_ERROR; |
|
299 len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err); |
|
300 if (U_FAILURE (icu_err)) |
|
301 return false; |
|
302 /* We expect that recomposed has exactly one character now. */ |
|
303 if (unlikely (u_countChar32 (recomposed, len) != 1)) |
|
304 return false; |
|
305 U16_GET_UNSAFE (recomposed, 0, *a); |
|
306 ret = true; |
|
307 } |
|
308 |
|
309 return ret; |
|
310 } |
|
311 |
|
312 static unsigned int |
|
313 hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED, |
|
314 hb_codepoint_t u, |
|
315 hb_codepoint_t *decomposed, |
|
316 void *user_data HB_UNUSED) |
|
317 { |
|
318 UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1]; |
|
319 unsigned int len; |
|
320 int32_t utf32_len; |
|
321 hb_bool_t err; |
|
322 UErrorCode icu_err; |
|
323 |
|
324 /* Copy @u into a UTF-16 array to be passed to ICU. */ |
|
325 len = 0; |
|
326 err = FALSE; |
|
327 U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), u, err); |
|
328 if (err) |
|
329 return 0; |
|
330 |
|
331 /* Normalise the codepoint using NFKD mode. */ |
|
332 icu_err = U_ZERO_ERROR; |
|
333 len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); |
|
334 if (icu_err) |
|
335 return 0; |
|
336 |
|
337 /* Convert the decomposed form from UTF-16 to UTF-32. */ |
|
338 icu_err = U_ZERO_ERROR; |
|
339 u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err); |
|
340 if (icu_err) |
|
341 return 0; |
|
342 |
|
343 return utf32_len; |
|
344 } |
|
345 |
|
346 |
|
347 hb_unicode_funcs_t * |
|
348 hb_icu_get_unicode_funcs (void) |
|
349 { |
|
350 static const hb_unicode_funcs_t _hb_icu_unicode_funcs = { |
|
351 HB_OBJECT_HEADER_STATIC, |
|
352 |
|
353 NULL, /* parent */ |
|
354 true, /* immutable */ |
|
355 { |
|
356 #define HB_UNICODE_FUNC_IMPLEMENT(name) hb_icu_unicode_##name, |
|
357 HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS |
|
358 #undef HB_UNICODE_FUNC_IMPLEMENT |
|
359 } |
|
360 }; |
|
361 |
|
362 #if U_ICU_VERSION_MAJOR_NUM >= 49 |
|
363 if (!hb_atomic_ptr_get (&normalizer)) { |
|
364 UErrorCode icu_err = U_ZERO_ERROR; |
|
365 /* We ignore failure in getNFCInstace(). */ |
|
366 hb_atomic_ptr_cmpexch (&normalizer, NULL, unorm2_getNFCInstance (&icu_err)); |
|
367 } |
|
368 #endif |
|
369 return const_cast<hb_unicode_funcs_t *> (&_hb_icu_unicode_funcs); |
|
370 } |
|
371 |
|
372 |