Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
michael@0 | 2 | |
michael@0 | 3 | /* This file is modified from JPNIC's mDNKit, it is under both MPL and |
michael@0 | 4 | * JPNIC's license. |
michael@0 | 5 | */ |
michael@0 | 6 | |
michael@0 | 7 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 8 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 9 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 10 | |
michael@0 | 11 | /* |
michael@0 | 12 | * Copyright (c) 2000,2002 Japan Network Information Center. |
michael@0 | 13 | * All rights reserved. |
michael@0 | 14 | * |
michael@0 | 15 | * By using this file, you agree to the terms and conditions set forth bellow. |
michael@0 | 16 | * |
michael@0 | 17 | * LICENSE TERMS AND CONDITIONS |
michael@0 | 18 | * |
michael@0 | 19 | * The following License Terms and Conditions apply, unless a different |
michael@0 | 20 | * license is obtained from Japan Network Information Center ("JPNIC"), |
michael@0 | 21 | * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, |
michael@0 | 22 | * Chiyoda-ku, Tokyo 101-0047, Japan. |
michael@0 | 23 | * |
michael@0 | 24 | * 1. Use, Modification and Redistribution (including distribution of any |
michael@0 | 25 | * modified or derived work) in source and/or binary forms is permitted |
michael@0 | 26 | * under this License Terms and Conditions. |
michael@0 | 27 | * |
michael@0 | 28 | * 2. Redistribution of source code must retain the copyright notices as they |
michael@0 | 29 | * appear in each source code file, this License Terms and Conditions. |
michael@0 | 30 | * |
michael@0 | 31 | * 3. Redistribution in binary form must reproduce the Copyright Notice, |
michael@0 | 32 | * this License Terms and Conditions, in the documentation and/or other |
michael@0 | 33 | * materials provided with the distribution. For the purposes of binary |
michael@0 | 34 | * distribution the "Copyright Notice" refers to the following language: |
michael@0 | 35 | * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." |
michael@0 | 36 | * |
michael@0 | 37 | * 4. The name of JPNIC may not be used to endorse or promote products |
michael@0 | 38 | * derived from this Software without specific prior written approval of |
michael@0 | 39 | * JPNIC. |
michael@0 | 40 | * |
michael@0 | 41 | * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC |
michael@0 | 42 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
michael@0 | 43 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
michael@0 | 44 | * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE |
michael@0 | 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
michael@0 | 46 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
michael@0 | 47 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
michael@0 | 48 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
michael@0 | 49 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR |
michael@0 | 50 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF |
michael@0 | 51 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. |
michael@0 | 52 | */ |
michael@0 | 53 | |
michael@0 | 54 | #include <string.h> |
michael@0 | 55 | |
michael@0 | 56 | #include "nsMemory.h" |
michael@0 | 57 | #include "nsUnicodeNormalizer.h" |
michael@0 | 58 | #include "nsString.h" |
michael@0 | 59 | |
michael@0 | 60 | NS_IMPL_ISUPPORTS(nsUnicodeNormalizer, nsIUnicodeNormalizer) |
michael@0 | 61 | |
michael@0 | 62 | |
michael@0 | 63 | nsUnicodeNormalizer::nsUnicodeNormalizer() |
michael@0 | 64 | { |
michael@0 | 65 | } |
michael@0 | 66 | |
michael@0 | 67 | nsUnicodeNormalizer::~nsUnicodeNormalizer() |
michael@0 | 68 | { |
michael@0 | 69 | } |
michael@0 | 70 | |
michael@0 | 71 | |
michael@0 | 72 | |
/* Bit set on the last element of a decomposition sequence. */
#define END_BIT     0x80000000


/*
 * Constants for arithmetic Hangul decomposition/composition
 * (values taken from the Unicode book).
 *
 *   xBase  - base code point used for the S (syllable), L, V and T ranges
 *   xCount - number of index values in the L, V and T ranges
 *   SLast  - one past the last precomposed syllable
 */
#define SBase       0xac00
#define LBase       0x1100
#define VBase       0x1161
#define TBase       0x11a7

#define LCount      19
#define VCount      21
#define TCount      28

#define SLast       (SBase + LCount * VCount * TCount)
michael@0 | 88 | |
/*
 * One entry of a composition list: for some implied first character,
 * the pair (first, c2) composes to 'comp'.
 */
struct composition {
    uint32_t c2;    /* second character of the pair */
    uint32_t comp;  /* resulting composed character */
};
michael@0 | 93 | |
michael@0 | 94 | |
michael@0 | 95 | #include "normalization_data.h" |
michael@0 | 96 | |
/*
 * Helpers for the multi-level index tables.
 *
 * Names are built by token pasting:
 *   vprefix -> <vprefix>_imap, <vprefix>_table, <vprefix>_seq
 *   mprefix -> <mprefix>_BITS_1, <mprefix>_BITS_2
 */
#define IMAP(vprefix)   vprefix ## _imap
#define DMAP(vprefix)   vprefix ## _table
#define SEQ(vprefix)    vprefix ## _seq

#define BITS1(mprefix)  mprefix ## _BITS_1
#define BITS2(mprefix)  mprefix ## _BITS_2

/* Split a value into its top, middle (bits1 wide) and low (bits2 wide) parts. */
#define IDX_0(v, bits1, bits2)  ((v) >> ((bits1) + (bits2)))
#define IDX_1(v, bits1, bits2)  (((v) >> (bits2)) & ((1 << (bits1)) - 1))
#define IDX_2(v, bits1, bits2)  ((v) & ((1 << (bits2)) - 1))

#define IDX0(mprefix, v)  IDX_0(v, BITS1(mprefix), BITS2(mprefix))
#define IDX1(mprefix, v)  IDX_1(v, BITS1(mprefix), BITS2(mprefix))
#define IDX2(mprefix, v)  IDX_2(v, BITS1(mprefix), BITS2(mprefix))

/*
 * Three-step lookup: the top part indexes the imap, that entry plus the
 * middle part indexes the imap again, and the result selects a row of the
 * data table whose 'tbl' member is indexed by the low part.
 */
#define LOOKUPTBL(vprefix, mprefix, v) \
    DMAP(vprefix)[\
        IMAP(vprefix)[\
            IMAP(vprefix)[IDX0(mprefix, v)] + IDX1(mprefix, v)\
        ]\
    ].tbl[IDX2(mprefix, v)]
michael@0 | 121 | |
michael@0 | 122 | static int32_t |
michael@0 | 123 | canonclass(uint32_t c) { |
michael@0 | 124 | /* Look up canonicalclass table. */ |
michael@0 | 125 | return (LOOKUPTBL(canon_class, CANON_CLASS, c)); |
michael@0 | 126 | } |
michael@0 | 127 | |
michael@0 | 128 | static int32_t |
michael@0 | 129 | decompose_char(uint32_t c, const uint32_t **seqp) |
michael@0 | 130 | { |
michael@0 | 131 | /* Look up decomposition table. */ |
michael@0 | 132 | int32_t seqidx = LOOKUPTBL(decompose, DECOMP, c); |
michael@0 | 133 | *seqp = SEQ(decompose) + (seqidx & ~DECOMP_COMPAT); |
michael@0 | 134 | return (seqidx); |
michael@0 | 135 | } |
michael@0 | 136 | |
michael@0 | 137 | static int32_t |
michael@0 | 138 | compose_char(uint32_t c, |
michael@0 | 139 | const struct composition **compp) |
michael@0 | 140 | { |
michael@0 | 141 | /* Look up composition table. */ |
michael@0 | 142 | int32_t seqidx = LOOKUPTBL(compose, CANON_COMPOSE, c); |
michael@0 | 143 | *compp = SEQ(compose) + (seqidx & 0xffff); |
michael@0 | 144 | return (seqidx >> 16); |
michael@0 | 145 | } |
michael@0 | 146 | |
michael@0 | 147 | static nsresult |
michael@0 | 148 | mdn__unicode_decompose(int32_t compat, uint32_t *v, size_t vlen, |
michael@0 | 149 | uint32_t c, int32_t *decomp_lenp) |
michael@0 | 150 | { |
michael@0 | 151 | uint32_t *vorg = v; |
michael@0 | 152 | int32_t seqidx; |
michael@0 | 153 | const uint32_t *seq; |
michael@0 | 154 | |
michael@0 | 155 | //assert(v != nullptr && vlen >= 0 && decomp_lenp != nullptr); |
michael@0 | 156 | |
michael@0 | 157 | /* |
michael@0 | 158 | * First, check for Hangul. |
michael@0 | 159 | */ |
michael@0 | 160 | if (SBase <= c && c < SLast) { |
michael@0 | 161 | int32_t idx, t_offset, v_offset, l_offset; |
michael@0 | 162 | |
michael@0 | 163 | idx = c - SBase; |
michael@0 | 164 | t_offset = idx % TCount; |
michael@0 | 165 | idx /= TCount; |
michael@0 | 166 | v_offset = idx % VCount; |
michael@0 | 167 | l_offset = idx / VCount; |
michael@0 | 168 | if ((t_offset == 0 && vlen < 2) || (t_offset > 0 && vlen < 3)) |
michael@0 | 169 | return (NS_ERROR_UNORM_MOREOUTPUT); |
michael@0 | 170 | *v++ = LBase + l_offset; |
michael@0 | 171 | *v++ = VBase + v_offset; |
michael@0 | 172 | if (t_offset > 0) |
michael@0 | 173 | *v++ = TBase + t_offset; |
michael@0 | 174 | *decomp_lenp = v - vorg; |
michael@0 | 175 | return (NS_OK); |
michael@0 | 176 | } |
michael@0 | 177 | |
michael@0 | 178 | /* |
michael@0 | 179 | * Look up decomposition table. If no decomposition is defined |
michael@0 | 180 | * or if it is a compatibility decomosition when canonical |
michael@0 | 181 | * decomposition requested, return 'NS_SUCCESS_UNORM_NOTFOUND'. |
michael@0 | 182 | */ |
michael@0 | 183 | seqidx = decompose_char(c, &seq); |
michael@0 | 184 | if (seqidx == 0 || (compat == 0 && (seqidx & DECOMP_COMPAT) != 0)) |
michael@0 | 185 | return (NS_SUCCESS_UNORM_NOTFOUND); |
michael@0 | 186 | |
michael@0 | 187 | /* |
michael@0 | 188 | * Copy the decomposed sequence. The end of the sequence are |
michael@0 | 189 | * marked with END_BIT. |
michael@0 | 190 | */ |
michael@0 | 191 | do { |
michael@0 | 192 | uint32_t c; |
michael@0 | 193 | int32_t dlen; |
michael@0 | 194 | nsresult r; |
michael@0 | 195 | |
michael@0 | 196 | c = *seq & ~END_BIT; |
michael@0 | 197 | |
michael@0 | 198 | /* Decompose recursively. */ |
michael@0 | 199 | r = mdn__unicode_decompose(compat, v, vlen, c, &dlen); |
michael@0 | 200 | if (r == NS_OK) { |
michael@0 | 201 | v += dlen; |
michael@0 | 202 | vlen -= dlen; |
michael@0 | 203 | } else if (r == NS_SUCCESS_UNORM_NOTFOUND) { |
michael@0 | 204 | if (vlen < 1) |
michael@0 | 205 | return (NS_ERROR_UNORM_MOREOUTPUT); |
michael@0 | 206 | *v++ = c; |
michael@0 | 207 | vlen--; |
michael@0 | 208 | } else { |
michael@0 | 209 | return (r); |
michael@0 | 210 | } |
michael@0 | 211 | |
michael@0 | 212 | } while ((*seq++ & END_BIT) == 0); |
michael@0 | 213 | |
michael@0 | 214 | *decomp_lenp = v - vorg; |
michael@0 | 215 | |
michael@0 | 216 | return (NS_OK); |
michael@0 | 217 | } |
michael@0 | 218 | |
michael@0 | 219 | static int32_t |
michael@0 | 220 | mdn__unicode_iscompositecandidate(uint32_t c) |
michael@0 | 221 | { |
michael@0 | 222 | const struct composition *dummy; |
michael@0 | 223 | |
michael@0 | 224 | /* Check for Hangul */ |
michael@0 | 225 | if ((LBase <= c && c < LBase + LCount) || (SBase <= c && c < SLast)) |
michael@0 | 226 | return (1); |
michael@0 | 227 | |
michael@0 | 228 | /* |
michael@0 | 229 | * Look up composition table. If there are no composition |
michael@0 | 230 | * that begins with the given character, it is not a |
michael@0 | 231 | * composition candidate. |
michael@0 | 232 | */ |
michael@0 | 233 | if (compose_char(c, &dummy) == 0) |
michael@0 | 234 | return (0); |
michael@0 | 235 | else |
michael@0 | 236 | return (1); |
michael@0 | 237 | } |
michael@0 | 238 | |
michael@0 | 239 | static nsresult |
michael@0 | 240 | mdn__unicode_compose(uint32_t c1, uint32_t c2, uint32_t *compp) |
michael@0 | 241 | { |
michael@0 | 242 | int32_t n; |
michael@0 | 243 | int32_t lo, hi; |
michael@0 | 244 | const struct composition *cseq; |
michael@0 | 245 | |
michael@0 | 246 | //assert(compp != nullptr); |
michael@0 | 247 | |
michael@0 | 248 | /* |
michael@0 | 249 | * Check for Hangul. |
michael@0 | 250 | */ |
michael@0 | 251 | if (LBase <= c1 && c1 < LBase + LCount && |
michael@0 | 252 | VBase <= c2 && c2 < VBase + VCount) { |
michael@0 | 253 | /* |
michael@0 | 254 | * Hangul L and V. |
michael@0 | 255 | */ |
michael@0 | 256 | *compp = SBase + |
michael@0 | 257 | ((c1 - LBase) * VCount + (c2 - VBase)) * TCount; |
michael@0 | 258 | return (NS_OK); |
michael@0 | 259 | } else if (SBase <= c1 && c1 < SLast && |
michael@0 | 260 | TBase <= c2 && c2 < TBase + TCount && |
michael@0 | 261 | (c1 - SBase) % TCount == 0) { |
michael@0 | 262 | /* |
michael@0 | 263 | * Hangul LV and T. |
michael@0 | 264 | */ |
michael@0 | 265 | *compp = c1 + (c2 - TBase); |
michael@0 | 266 | return (NS_OK); |
michael@0 | 267 | } |
michael@0 | 268 | |
michael@0 | 269 | /* |
michael@0 | 270 | * Look up composition table. If the result is 0, no composition |
michael@0 | 271 | * is defined. Otherwise, upper 16bits of the result contains |
michael@0 | 272 | * the number of composition that begins with 'c1', and the lower |
michael@0 | 273 | * 16bits is the offset in 'compose_seq'. |
michael@0 | 274 | */ |
michael@0 | 275 | if ((n = compose_char(c1, &cseq)) == 0) |
michael@0 | 276 | return (NS_SUCCESS_UNORM_NOTFOUND); |
michael@0 | 277 | |
michael@0 | 278 | /* |
michael@0 | 279 | * The composite sequences are sorted by the 2nd character 'c2'. |
michael@0 | 280 | * So we can use binary search. |
michael@0 | 281 | */ |
michael@0 | 282 | lo = 0; |
michael@0 | 283 | hi = n - 1; |
michael@0 | 284 | while (lo <= hi) { |
michael@0 | 285 | int32_t mid = (lo + hi) / 2; |
michael@0 | 286 | |
michael@0 | 287 | if (cseq[mid].c2 < c2) { |
michael@0 | 288 | lo = mid + 1; |
michael@0 | 289 | } else if (cseq[mid].c2 > c2) { |
michael@0 | 290 | hi = mid - 1; |
michael@0 | 291 | } else { |
michael@0 | 292 | *compp = cseq[mid].comp; |
michael@0 | 293 | return (NS_OK); |
michael@0 | 294 | } |
michael@0 | 295 | } |
michael@0 | 296 | return (NS_SUCCESS_UNORM_NOTFOUND); |
michael@0 | 297 | } |
michael@0 | 298 | |
michael@0 | 299 | |
/* Capacity of the inline work buffer, in characters. */
#define WORKBUF_SIZE        128
/* Upper limit checked by decompose() after it has grown the buffer. */
#define WORKBUF_SIZE_MAX    10000

/*
 * Working buffer for normalization: two parallel arrays holding the
 * pending characters and their canonical classes.  'ucs' and 'cclass'
 * initially point at the inline arrays below.
 */
struct workbuf_t {
    int32_t cur;                        /* index of the character being processed */
    int32_t last;                       /* index just past the last character */
    int32_t size;                       /* capacity of the ucs and cclass arrays */
    uint32_t *ucs;                      /* UCS-4 characters */
    int32_t *cclass;                    /* their canonical classes */
    uint32_t ucs_buf[WORKBUF_SIZE];     /* inline storage for ucs */
    int32_t class_buf[WORKBUF_SIZE];    /* inline storage for cclass */
};
michael@0 | 312 | |
michael@0 | 313 | static nsresult decompose(workbuf_t *wb, uint32_t c, int32_t compat); |
michael@0 | 314 | static void get_class(workbuf_t *wb); |
michael@0 | 315 | static void reorder(workbuf_t *wb); |
michael@0 | 316 | static void compose(workbuf_t *wb); |
michael@0 | 317 | static nsresult flush_before_cur(workbuf_t *wb, nsAString& aToStr); |
michael@0 | 318 | static void workbuf_init(workbuf_t *wb); |
michael@0 | 319 | static void workbuf_free(workbuf_t *wb); |
michael@0 | 320 | static nsresult workbuf_extend(workbuf_t *wb); |
michael@0 | 321 | static nsresult workbuf_append(workbuf_t *wb, uint32_t c); |
michael@0 | 322 | static void workbuf_shift(workbuf_t *wb, int32_t shift); |
michael@0 | 323 | static void workbuf_removevoid(workbuf_t *wb); |
michael@0 | 324 | |
michael@0 | 325 | |
michael@0 | 326 | static nsresult |
michael@0 | 327 | mdn_normalize(bool do_composition, bool compat, |
michael@0 | 328 | const nsAString& aSrcStr, nsAString& aToStr) |
michael@0 | 329 | { |
michael@0 | 330 | workbuf_t wb; |
michael@0 | 331 | nsresult r = NS_OK; |
michael@0 | 332 | /* |
michael@0 | 333 | * Initialize working buffer. |
michael@0 | 334 | */ |
michael@0 | 335 | workbuf_init(&wb); |
michael@0 | 336 | |
michael@0 | 337 | nsAString::const_iterator start, end; |
michael@0 | 338 | aSrcStr.BeginReading(start); |
michael@0 | 339 | aSrcStr.EndReading(end); |
michael@0 | 340 | |
michael@0 | 341 | while (start != end) { |
michael@0 | 342 | uint32_t c; |
michael@0 | 343 | char16_t curChar; |
michael@0 | 344 | |
michael@0 | 345 | //assert(wb.cur == wb.last); |
michael@0 | 346 | |
michael@0 | 347 | /* |
michael@0 | 348 | * Get one character from 'from'. |
michael@0 | 349 | */ |
michael@0 | 350 | curChar= *start++; |
michael@0 | 351 | |
michael@0 | 352 | if (NS_IS_HIGH_SURROGATE(curChar) && start != end && NS_IS_LOW_SURROGATE(*(start)) ) { |
michael@0 | 353 | c = SURROGATE_TO_UCS4(curChar, *start); |
michael@0 | 354 | ++start; |
michael@0 | 355 | } else { |
michael@0 | 356 | c = curChar; |
michael@0 | 357 | } |
michael@0 | 358 | |
michael@0 | 359 | /* |
michael@0 | 360 | * Decompose it. |
michael@0 | 361 | */ |
michael@0 | 362 | if ((r = decompose(&wb, c, compat)) != NS_OK) |
michael@0 | 363 | break; |
michael@0 | 364 | |
michael@0 | 365 | /* |
michael@0 | 366 | * Get canonical class. |
michael@0 | 367 | */ |
michael@0 | 368 | get_class(&wb); |
michael@0 | 369 | |
michael@0 | 370 | /* |
michael@0 | 371 | * Reorder & compose. |
michael@0 | 372 | */ |
michael@0 | 373 | for (; wb.cur < wb.last; wb.cur++) { |
michael@0 | 374 | if (wb.cur == 0) { |
michael@0 | 375 | continue; |
michael@0 | 376 | } else if (wb.cclass[wb.cur] > 0) { |
michael@0 | 377 | /* |
michael@0 | 378 | * This is not a starter. Try reordering. |
michael@0 | 379 | * Note that characters up to it are |
michael@0 | 380 | * already in canonical order. |
michael@0 | 381 | */ |
michael@0 | 382 | reorder(&wb); |
michael@0 | 383 | continue; |
michael@0 | 384 | } |
michael@0 | 385 | |
michael@0 | 386 | /* |
michael@0 | 387 | * This is a starter character, and there are |
michael@0 | 388 | * some characters before it. Those characters |
michael@0 | 389 | * have been reordered properly, and |
michael@0 | 390 | * ready for composition. |
michael@0 | 391 | */ |
michael@0 | 392 | if (do_composition && wb.cclass[0] == 0) |
michael@0 | 393 | compose(&wb); |
michael@0 | 394 | |
michael@0 | 395 | /* |
michael@0 | 396 | * If CUR points to a starter character, |
michael@0 | 397 | * then process of characters before CUR are |
michael@0 | 398 | * already finished, because any further |
michael@0 | 399 | * reordering/composition for them are blocked |
michael@0 | 400 | * by the starter CUR points. |
michael@0 | 401 | */ |
michael@0 | 402 | if (wb.cur > 0 && wb.cclass[wb.cur] == 0) { |
michael@0 | 403 | /* Flush everything before CUR. */ |
michael@0 | 404 | r = flush_before_cur(&wb, aToStr); |
michael@0 | 405 | if (r != NS_OK) |
michael@0 | 406 | break; |
michael@0 | 407 | } |
michael@0 | 408 | } |
michael@0 | 409 | } |
michael@0 | 410 | |
michael@0 | 411 | if (r == NS_OK) { |
michael@0 | 412 | if (do_composition && wb.cur > 0 && wb.cclass[0] == 0) { |
michael@0 | 413 | /* |
michael@0 | 414 | * There is some characters left in WB. |
michael@0 | 415 | * They are ordered, but not composed yet. |
michael@0 | 416 | * Now CUR points just after the last character in WB, |
michael@0 | 417 | * and since compose() tries to compose characters |
michael@0 | 418 | * between top and CUR inclusive, we must make CUR |
michael@0 | 419 | * one character back during compose(). |
michael@0 | 420 | */ |
michael@0 | 421 | wb.cur--; |
michael@0 | 422 | compose(&wb); |
michael@0 | 423 | wb.cur++; |
michael@0 | 424 | } |
michael@0 | 425 | /* |
michael@0 | 426 | * Call this even when WB.CUR == 0, to make TO |
michael@0 | 427 | * NUL-terminated. |
michael@0 | 428 | */ |
michael@0 | 429 | r = flush_before_cur(&wb, aToStr); |
michael@0 | 430 | } |
michael@0 | 431 | |
michael@0 | 432 | workbuf_free(&wb); |
michael@0 | 433 | |
michael@0 | 434 | return (r); |
michael@0 | 435 | } |
michael@0 | 436 | |
michael@0 | 437 | static nsresult |
michael@0 | 438 | decompose(workbuf_t *wb, uint32_t c, int32_t compat) { |
michael@0 | 439 | nsresult r; |
michael@0 | 440 | int32_t dec_len; |
michael@0 | 441 | |
michael@0 | 442 | again: |
michael@0 | 443 | r = mdn__unicode_decompose(compat, wb->ucs + wb->last, |
michael@0 | 444 | wb->size - wb->last, c, &dec_len); |
michael@0 | 445 | switch (r) { |
michael@0 | 446 | case NS_OK: |
michael@0 | 447 | wb->last += dec_len; |
michael@0 | 448 | return (NS_OK); |
michael@0 | 449 | case NS_SUCCESS_UNORM_NOTFOUND: |
michael@0 | 450 | return (workbuf_append(wb, c)); |
michael@0 | 451 | case NS_ERROR_UNORM_MOREOUTPUT: |
michael@0 | 452 | if ((r = workbuf_extend(wb)) != NS_OK) |
michael@0 | 453 | return (r); |
michael@0 | 454 | if (wb->size > WORKBUF_SIZE_MAX) { |
michael@0 | 455 | // "mdn__unormalize_form*: " "working buffer too large\n" |
michael@0 | 456 | return (NS_ERROR_FAILURE); |
michael@0 | 457 | } |
michael@0 | 458 | goto again; |
michael@0 | 459 | default: |
michael@0 | 460 | return (r); |
michael@0 | 461 | } |
michael@0 | 462 | /* NOTREACHED */ |
michael@0 | 463 | } |
michael@0 | 464 | |
michael@0 | 465 | static void |
michael@0 | 466 | get_class(workbuf_t *wb) { |
michael@0 | 467 | int32_t i; |
michael@0 | 468 | |
michael@0 | 469 | for (i = wb->cur; i < wb->last; i++) |
michael@0 | 470 | wb->cclass[i] = canonclass(wb->ucs[i]); |
michael@0 | 471 | } |
michael@0 | 472 | |
michael@0 | 473 | static void |
michael@0 | 474 | reorder(workbuf_t *wb) { |
michael@0 | 475 | uint32_t c; |
michael@0 | 476 | int32_t i; |
michael@0 | 477 | int32_t cclass; |
michael@0 | 478 | |
michael@0 | 479 | //assert(wb != nullptr); |
michael@0 | 480 | |
michael@0 | 481 | i = wb->cur; |
michael@0 | 482 | c = wb->ucs[i]; |
michael@0 | 483 | cclass = wb->cclass[i]; |
michael@0 | 484 | |
michael@0 | 485 | while (i > 0 && wb->cclass[i - 1] > cclass) { |
michael@0 | 486 | wb->ucs[i] = wb->ucs[i - 1]; |
michael@0 | 487 | wb->cclass[i] =wb->cclass[i - 1]; |
michael@0 | 488 | i--; |
michael@0 | 489 | wb->ucs[i] = c; |
michael@0 | 490 | wb->cclass[i] = cclass; |
michael@0 | 491 | } |
michael@0 | 492 | } |
michael@0 | 493 | |
michael@0 | 494 | static void |
michael@0 | 495 | compose(workbuf_t *wb) { |
michael@0 | 496 | int32_t cur; |
michael@0 | 497 | uint32_t *ucs; |
michael@0 | 498 | int32_t *cclass; |
michael@0 | 499 | int32_t last_class; |
michael@0 | 500 | int32_t nvoids; |
michael@0 | 501 | int32_t i; |
michael@0 | 502 | |
michael@0 | 503 | //assert(wb != nullptr && wb->cclass[0] == 0); |
michael@0 | 504 | |
michael@0 | 505 | cur = wb->cur; |
michael@0 | 506 | ucs = wb->ucs; |
michael@0 | 507 | cclass = wb->cclass; |
michael@0 | 508 | |
michael@0 | 509 | /* |
michael@0 | 510 | * If there are no decomposition sequence that begins with |
michael@0 | 511 | * the top character, composition is impossible. |
michael@0 | 512 | */ |
michael@0 | 513 | if (!mdn__unicode_iscompositecandidate(ucs[0])) |
michael@0 | 514 | return; |
michael@0 | 515 | |
michael@0 | 516 | last_class = 0; |
michael@0 | 517 | nvoids = 0; |
michael@0 | 518 | for (i = 1; i <= cur; i++) { |
michael@0 | 519 | uint32_t c; |
michael@0 | 520 | int32_t cl = cclass[i]; |
michael@0 | 521 | |
michael@0 | 522 | if ((last_class < cl || cl == 0) && |
michael@0 | 523 | mdn__unicode_compose(ucs[0], ucs[i], |
michael@0 | 524 | &c) == NS_OK) { |
michael@0 | 525 | /* |
michael@0 | 526 | * Replace the top character with the composed one. |
michael@0 | 527 | */ |
michael@0 | 528 | ucs[0] = c; |
michael@0 | 529 | cclass[0] = canonclass(c); |
michael@0 | 530 | |
michael@0 | 531 | cclass[i] = -1; /* void this character */ |
michael@0 | 532 | nvoids++; |
michael@0 | 533 | } else { |
michael@0 | 534 | last_class = cl; |
michael@0 | 535 | } |
michael@0 | 536 | } |
michael@0 | 537 | |
michael@0 | 538 | /* Purge void characters, if any. */ |
michael@0 | 539 | if (nvoids > 0) |
michael@0 | 540 | workbuf_removevoid(wb); |
michael@0 | 541 | } |
michael@0 | 542 | |
michael@0 | 543 | static nsresult |
michael@0 | 544 | flush_before_cur(workbuf_t *wb, nsAString& aToStr) |
michael@0 | 545 | { |
michael@0 | 546 | int32_t i; |
michael@0 | 547 | |
michael@0 | 548 | for (i = 0; i < wb->cur; i++) { |
michael@0 | 549 | if (!IS_IN_BMP(wb->ucs[i])) { |
michael@0 | 550 | aToStr.Append((char16_t)H_SURROGATE(wb->ucs[i])); |
michael@0 | 551 | aToStr.Append((char16_t)L_SURROGATE(wb->ucs[i])); |
michael@0 | 552 | } else { |
michael@0 | 553 | aToStr.Append((char16_t)(wb->ucs[i])); |
michael@0 | 554 | } |
michael@0 | 555 | } |
michael@0 | 556 | |
michael@0 | 557 | workbuf_shift(wb, wb->cur); |
michael@0 | 558 | |
michael@0 | 559 | return (NS_OK); |
michael@0 | 560 | } |
michael@0 | 561 | |
michael@0 | 562 | static void |
michael@0 | 563 | workbuf_init(workbuf_t *wb) { |
michael@0 | 564 | wb->cur = 0; |
michael@0 | 565 | wb->last = 0; |
michael@0 | 566 | wb->size = WORKBUF_SIZE; |
michael@0 | 567 | wb->ucs = wb->ucs_buf; |
michael@0 | 568 | wb->cclass = wb->class_buf; |
michael@0 | 569 | } |
michael@0 | 570 | |
michael@0 | 571 | static void |
michael@0 | 572 | workbuf_free(workbuf_t *wb) { |
michael@0 | 573 | if (wb->ucs != wb->ucs_buf) { |
michael@0 | 574 | nsMemory::Free(wb->ucs); |
michael@0 | 575 | nsMemory::Free(wb->cclass); |
michael@0 | 576 | } |
michael@0 | 577 | } |
michael@0 | 578 | |
michael@0 | 579 | static nsresult |
michael@0 | 580 | workbuf_extend(workbuf_t *wb) { |
michael@0 | 581 | int32_t newsize = wb->size * 3; |
michael@0 | 582 | |
michael@0 | 583 | if (wb->ucs == wb->ucs_buf) { |
michael@0 | 584 | wb->ucs = (uint32_t*)nsMemory::Alloc(sizeof(wb->ucs[0]) * newsize); |
michael@0 | 585 | if (!wb->ucs) |
michael@0 | 586 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 587 | wb->cclass = (int32_t*)nsMemory::Alloc(sizeof(wb->cclass[0]) * newsize); |
michael@0 | 588 | if (!wb->cclass) { |
michael@0 | 589 | nsMemory::Free(wb->ucs); |
michael@0 | 590 | wb->ucs = nullptr; |
michael@0 | 591 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 592 | } |
michael@0 | 593 | } else { |
michael@0 | 594 | void* buf = nsMemory::Realloc(wb->ucs, sizeof(wb->ucs[0]) * newsize); |
michael@0 | 595 | if (!buf) |
michael@0 | 596 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 597 | wb->ucs = (uint32_t*)buf; |
michael@0 | 598 | buf = nsMemory::Realloc(wb->cclass, sizeof(wb->cclass[0]) * newsize); |
michael@0 | 599 | if (!buf) |
michael@0 | 600 | return NS_ERROR_OUT_OF_MEMORY; |
michael@0 | 601 | wb->cclass = (int32_t*)buf; |
michael@0 | 602 | } |
michael@0 | 603 | return (NS_OK); |
michael@0 | 604 | } |
michael@0 | 605 | |
michael@0 | 606 | static nsresult |
michael@0 | 607 | workbuf_append(workbuf_t *wb, uint32_t c) { |
michael@0 | 608 | nsresult r; |
michael@0 | 609 | |
michael@0 | 610 | if (wb->last >= wb->size && (r = workbuf_extend(wb)) != NS_OK) |
michael@0 | 611 | return (r); |
michael@0 | 612 | wb->ucs[wb->last++] = c; |
michael@0 | 613 | return (NS_OK); |
michael@0 | 614 | } |
michael@0 | 615 | |
michael@0 | 616 | static void |
michael@0 | 617 | workbuf_shift(workbuf_t *wb, int32_t shift) { |
michael@0 | 618 | int32_t nmove; |
michael@0 | 619 | |
michael@0 | 620 | //assert(wb != nullptr && wb->cur >= shift); |
michael@0 | 621 | |
michael@0 | 622 | nmove = wb->last - shift; |
michael@0 | 623 | memmove(&wb->ucs[0], &wb->ucs[shift], |
michael@0 | 624 | nmove * sizeof(wb->ucs[0])); |
michael@0 | 625 | memmove(&wb->cclass[0], &wb->cclass[shift], |
michael@0 | 626 | nmove * sizeof(wb->cclass[0])); |
michael@0 | 627 | wb->cur -= shift; |
michael@0 | 628 | wb->last -= shift; |
michael@0 | 629 | } |
michael@0 | 630 | |
michael@0 | 631 | static void |
michael@0 | 632 | workbuf_removevoid(workbuf_t *wb) { |
michael@0 | 633 | int32_t i, j; |
michael@0 | 634 | int32_t last = wb->last; |
michael@0 | 635 | |
michael@0 | 636 | for (i = j = 0; i < last; i++) { |
michael@0 | 637 | if (wb->cclass[i] >= 0) { |
michael@0 | 638 | if (j < i) { |
michael@0 | 639 | wb->ucs[j] = wb->ucs[i]; |
michael@0 | 640 | wb->cclass[j] = wb->cclass[i]; |
michael@0 | 641 | } |
michael@0 | 642 | j++; |
michael@0 | 643 | } |
michael@0 | 644 | } |
michael@0 | 645 | wb->cur -= last - j; |
michael@0 | 646 | wb->last = j; |
michael@0 | 647 | } |
michael@0 | 648 | |
michael@0 | 649 | nsresult |
michael@0 | 650 | nsUnicodeNormalizer::NormalizeUnicodeNFD( const nsAString& aSrc, nsAString& aDest) |
michael@0 | 651 | { |
michael@0 | 652 | return mdn_normalize(false, false, aSrc, aDest); |
michael@0 | 653 | } |
michael@0 | 654 | |
michael@0 | 655 | nsresult |
michael@0 | 656 | nsUnicodeNormalizer::NormalizeUnicodeNFC( const nsAString& aSrc, nsAString& aDest) |
michael@0 | 657 | { |
michael@0 | 658 | return mdn_normalize(true, false, aSrc, aDest); |
michael@0 | 659 | } |
michael@0 | 660 | |
michael@0 | 661 | nsresult |
michael@0 | 662 | nsUnicodeNormalizer::NormalizeUnicodeNFKD( const nsAString& aSrc, nsAString& aDest) |
michael@0 | 663 | { |
michael@0 | 664 | return mdn_normalize(false, true, aSrc, aDest); |
michael@0 | 665 | } |
michael@0 | 666 | |
michael@0 | 667 | nsresult |
michael@0 | 668 | nsUnicodeNormalizer::NormalizeUnicodeNFKC( const nsAString& aSrc, nsAString& aDest) |
michael@0 | 669 | { |
michael@0 | 670 | return mdn_normalize(true, true, aSrc, aDest); |
michael@0 | 671 | } |
michael@0 | 672 | |
michael@0 | 673 | bool |
michael@0 | 674 | nsUnicodeNormalizer::Compose(uint32_t a, uint32_t b, uint32_t *ab) |
michael@0 | 675 | { |
michael@0 | 676 | return mdn__unicode_compose(a, b, ab) == NS_OK; |
michael@0 | 677 | } |
michael@0 | 678 | |
michael@0 | 679 | bool |
michael@0 | 680 | nsUnicodeNormalizer::DecomposeNonRecursively(uint32_t c, uint32_t *c1, uint32_t *c2) |
michael@0 | 681 | { |
michael@0 | 682 | // We can't use mdn__unicode_decompose here, because that does a recursive |
michael@0 | 683 | // decomposition that may yield more than two characters, but the harfbuzz |
michael@0 | 684 | // callback wants just a single-step decomp that is guaranteed to produce |
michael@0 | 685 | // no more than two characters. So we do a low-level lookup in the table |
michael@0 | 686 | // of decomp sequences. |
michael@0 | 687 | const uint32_t *seq; |
michael@0 | 688 | uint32_t seqidx = decompose_char(c, &seq); |
michael@0 | 689 | if (seqidx == 0 || ((seqidx & DECOMP_COMPAT) != 0)) { |
michael@0 | 690 | return false; |
michael@0 | 691 | } |
michael@0 | 692 | *c1 = *seq & ~END_BIT; |
michael@0 | 693 | if (*seq & END_BIT) { |
michael@0 | 694 | *c2 = 0; |
michael@0 | 695 | } else { |
michael@0 | 696 | *c2 = *++seq & ~END_BIT; |
michael@0 | 697 | } |
michael@0 | 698 | return true; |
michael@0 | 699 | } |