intl/unicharutil/src/nsUnicodeNormalizer.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
michael@0 2
michael@0 3 /* This file is modified from JPNIC's mDNKit, it is under both MPL and
michael@0 4 * JPNIC's license.
michael@0 5 */
michael@0 6
michael@0 7 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 8 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 9 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 10
michael@0 11 /*
michael@0 12 * Copyright (c) 2000,2002 Japan Network Information Center.
michael@0 13 * All rights reserved.
michael@0 14 *
michael@0 15 * By using this file, you agree to the terms and conditions set forth bellow.
michael@0 16 *
michael@0 17 * LICENSE TERMS AND CONDITIONS
michael@0 18 *
michael@0 19 * The following License Terms and Conditions apply, unless a different
michael@0 20 * license is obtained from Japan Network Information Center ("JPNIC"),
michael@0 21 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
michael@0 22 * Chiyoda-ku, Tokyo 101-0047, Japan.
michael@0 23 *
michael@0 24 * 1. Use, Modification and Redistribution (including distribution of any
michael@0 25 * modified or derived work) in source and/or binary forms is permitted
michael@0 26 * under this License Terms and Conditions.
michael@0 27 *
michael@0 28 * 2. Redistribution of source code must retain the copyright notices as they
michael@0 29 * appear in each source code file, this License Terms and Conditions.
michael@0 30 *
michael@0 31 * 3. Redistribution in binary form must reproduce the Copyright Notice,
michael@0 32 * this License Terms and Conditions, in the documentation and/or other
michael@0 33 * materials provided with the distribution. For the purposes of binary
michael@0 34 * distribution the "Copyright Notice" refers to the following language:
michael@0 35 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
michael@0 36 *
michael@0 37 * 4. The name of JPNIC may not be used to endorse or promote products
michael@0 38 * derived from this Software without specific prior written approval of
michael@0 39 * JPNIC.
michael@0 40 *
michael@0 41 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
michael@0 42 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
michael@0 43 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
michael@0 44 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
michael@0 45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
michael@0 46 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
michael@0 47 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
michael@0 48 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
michael@0 49 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
michael@0 50 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
michael@0 51 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
michael@0 52 */
michael@0 53
michael@0 54 #include <string.h>
michael@0 55
michael@0 56 #include "nsMemory.h"
michael@0 57 #include "nsUnicodeNormalizer.h"
michael@0 58 #include "nsString.h"
michael@0 59
michael@0 60 NS_IMPL_ISUPPORTS(nsUnicodeNormalizer, nsIUnicodeNormalizer)
michael@0 61
michael@0 62
michael@0 63 nsUnicodeNormalizer::nsUnicodeNormalizer()
michael@0 64 {
michael@0 65 }
michael@0 66
michael@0 67 nsUnicodeNormalizer::~nsUnicodeNormalizer()
michael@0 68 {
michael@0 69 }
michael@0 70
michael@0 71
michael@0 72
/*
 * Flag carried by the last element of a sequence in the decomposition
 * sequence table; the remaining bits hold the code point.
 */
#define END_BIT 0x80000000

/*
 * Constants for algorithmic Hangul decomposition/composition
 * (taken from the Unicode book).
 *
 *   SBase         first precomposed syllable
 *   LBase/LCount  first leading consonant and how many there are
 *   VBase/VCount  first vowel and how many there are
 *   TBase/TCount  base for trailing consonants and the number of
 *                 trailing-consonant slots per LV pair (slot 0 means
 *                 "no trailing consonant")
 *   SLast         one past the last precomposed syllable
 */
#define SBase 0xac00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11a7
#define LCount 19
#define VCount 21
#define TCount 28
#define SLast (SBase + LCount * VCount * TCount)
michael@0 88
/*
 * One entry of the composition sequence table: for some first character,
 * the second character `c2` combines with it into `comp`.
 * The member order matters to the aggregate initializers in the data
 * tables, so it must not be changed.
 */
struct composition {
    uint32_t c2;    /* 2nd character of the pair */
    uint32_t comp;  /* resulting composed character */
};
michael@0 93
michael@0 94
michael@0 95 #include "normalization_data.h"
michael@0 96
/*
 * Macros for the multi-level index tables.
 *
 * A table set is named by two prefixes:
 *   vprefix  - prefix of the variables (<vprefix>_imap, <vprefix>_table,
 *              <vprefix>_seq)
 *   mprefix  - prefix of the bit-width macros (<mprefix>_BITS_1,
 *              <mprefix>_BITS_2)
 */

/* Name builders. */
#define BITS1(mprefix)  mprefix ## _BITS_1
#define BITS2(mprefix)  mprefix ## _BITS_2

#define IMAP(vprefix)   vprefix ## _imap
#define DMAP(vprefix)   vprefix ## _table
#define SEQ(vprefix)    vprefix ## _seq

/*
 * Split a value into three index fields: the low `bits2` bits, the next
 * `bits1` bits, and whatever remains above them.
 */
#define IDX_0(v, bits1, bits2)  ((v) >> ((bits1) + (bits2)))
#define IDX_1(v, bits1, bits2)  (((v) >> (bits2)) & ((1 << (bits1)) - 1))
#define IDX_2(v, bits1, bits2)  ((v) & ((1 << (bits2)) - 1))

/* The same three fields, with the widths taken from the mprefix macros. */
#define IDX0(mprefix, v)  IDX_0(v, BITS1(mprefix), BITS2(mprefix))
#define IDX1(mprefix, v)  IDX_1(v, BITS1(mprefix), BITS2(mprefix))
#define IDX2(mprefix, v)  IDX_2(v, BITS1(mprefix), BITS2(mprefix))

/*
 * Three-step lookup: the index map is consulted twice (top field, then
 * middle field) to select a block of the data table, and the low field
 * selects the entry inside that block.
 */
#define LOOKUPTBL(vprefix, mprefix, v) \
    DMAP(vprefix)[\
        IMAP(vprefix)[\
            IMAP(vprefix)[IDX0(mprefix, v)] + IDX1(mprefix, v)\
        ]\
    ].tbl[IDX2(mprefix, v)]
michael@0 121
michael@0 122 static int32_t
michael@0 123 canonclass(uint32_t c) {
michael@0 124 /* Look up canonicalclass table. */
michael@0 125 return (LOOKUPTBL(canon_class, CANON_CLASS, c));
michael@0 126 }
michael@0 127
michael@0 128 static int32_t
michael@0 129 decompose_char(uint32_t c, const uint32_t **seqp)
michael@0 130 {
michael@0 131 /* Look up decomposition table. */
michael@0 132 int32_t seqidx = LOOKUPTBL(decompose, DECOMP, c);
michael@0 133 *seqp = SEQ(decompose) + (seqidx & ~DECOMP_COMPAT);
michael@0 134 return (seqidx);
michael@0 135 }
michael@0 136
michael@0 137 static int32_t
michael@0 138 compose_char(uint32_t c,
michael@0 139 const struct composition **compp)
michael@0 140 {
michael@0 141 /* Look up composition table. */
michael@0 142 int32_t seqidx = LOOKUPTBL(compose, CANON_COMPOSE, c);
michael@0 143 *compp = SEQ(compose) + (seqidx & 0xffff);
michael@0 144 return (seqidx >> 16);
michael@0 145 }
michael@0 146
michael@0 147 static nsresult
michael@0 148 mdn__unicode_decompose(int32_t compat, uint32_t *v, size_t vlen,
michael@0 149 uint32_t c, int32_t *decomp_lenp)
michael@0 150 {
michael@0 151 uint32_t *vorg = v;
michael@0 152 int32_t seqidx;
michael@0 153 const uint32_t *seq;
michael@0 154
michael@0 155 //assert(v != nullptr && vlen >= 0 && decomp_lenp != nullptr);
michael@0 156
michael@0 157 /*
michael@0 158 * First, check for Hangul.
michael@0 159 */
michael@0 160 if (SBase <= c && c < SLast) {
michael@0 161 int32_t idx, t_offset, v_offset, l_offset;
michael@0 162
michael@0 163 idx = c - SBase;
michael@0 164 t_offset = idx % TCount;
michael@0 165 idx /= TCount;
michael@0 166 v_offset = idx % VCount;
michael@0 167 l_offset = idx / VCount;
michael@0 168 if ((t_offset == 0 && vlen < 2) || (t_offset > 0 && vlen < 3))
michael@0 169 return (NS_ERROR_UNORM_MOREOUTPUT);
michael@0 170 *v++ = LBase + l_offset;
michael@0 171 *v++ = VBase + v_offset;
michael@0 172 if (t_offset > 0)
michael@0 173 *v++ = TBase + t_offset;
michael@0 174 *decomp_lenp = v - vorg;
michael@0 175 return (NS_OK);
michael@0 176 }
michael@0 177
michael@0 178 /*
michael@0 179 * Look up decomposition table. If no decomposition is defined
michael@0 180 * or if it is a compatibility decomosition when canonical
michael@0 181 * decomposition requested, return 'NS_SUCCESS_UNORM_NOTFOUND'.
michael@0 182 */
michael@0 183 seqidx = decompose_char(c, &seq);
michael@0 184 if (seqidx == 0 || (compat == 0 && (seqidx & DECOMP_COMPAT) != 0))
michael@0 185 return (NS_SUCCESS_UNORM_NOTFOUND);
michael@0 186
michael@0 187 /*
michael@0 188 * Copy the decomposed sequence. The end of the sequence are
michael@0 189 * marked with END_BIT.
michael@0 190 */
michael@0 191 do {
michael@0 192 uint32_t c;
michael@0 193 int32_t dlen;
michael@0 194 nsresult r;
michael@0 195
michael@0 196 c = *seq & ~END_BIT;
michael@0 197
michael@0 198 /* Decompose recursively. */
michael@0 199 r = mdn__unicode_decompose(compat, v, vlen, c, &dlen);
michael@0 200 if (r == NS_OK) {
michael@0 201 v += dlen;
michael@0 202 vlen -= dlen;
michael@0 203 } else if (r == NS_SUCCESS_UNORM_NOTFOUND) {
michael@0 204 if (vlen < 1)
michael@0 205 return (NS_ERROR_UNORM_MOREOUTPUT);
michael@0 206 *v++ = c;
michael@0 207 vlen--;
michael@0 208 } else {
michael@0 209 return (r);
michael@0 210 }
michael@0 211
michael@0 212 } while ((*seq++ & END_BIT) == 0);
michael@0 213
michael@0 214 *decomp_lenp = v - vorg;
michael@0 215
michael@0 216 return (NS_OK);
michael@0 217 }
michael@0 218
michael@0 219 static int32_t
michael@0 220 mdn__unicode_iscompositecandidate(uint32_t c)
michael@0 221 {
michael@0 222 const struct composition *dummy;
michael@0 223
michael@0 224 /* Check for Hangul */
michael@0 225 if ((LBase <= c && c < LBase + LCount) || (SBase <= c && c < SLast))
michael@0 226 return (1);
michael@0 227
michael@0 228 /*
michael@0 229 * Look up composition table. If there are no composition
michael@0 230 * that begins with the given character, it is not a
michael@0 231 * composition candidate.
michael@0 232 */
michael@0 233 if (compose_char(c, &dummy) == 0)
michael@0 234 return (0);
michael@0 235 else
michael@0 236 return (1);
michael@0 237 }
michael@0 238
michael@0 239 static nsresult
michael@0 240 mdn__unicode_compose(uint32_t c1, uint32_t c2, uint32_t *compp)
michael@0 241 {
michael@0 242 int32_t n;
michael@0 243 int32_t lo, hi;
michael@0 244 const struct composition *cseq;
michael@0 245
michael@0 246 //assert(compp != nullptr);
michael@0 247
michael@0 248 /*
michael@0 249 * Check for Hangul.
michael@0 250 */
michael@0 251 if (LBase <= c1 && c1 < LBase + LCount &&
michael@0 252 VBase <= c2 && c2 < VBase + VCount) {
michael@0 253 /*
michael@0 254 * Hangul L and V.
michael@0 255 */
michael@0 256 *compp = SBase +
michael@0 257 ((c1 - LBase) * VCount + (c2 - VBase)) * TCount;
michael@0 258 return (NS_OK);
michael@0 259 } else if (SBase <= c1 && c1 < SLast &&
michael@0 260 TBase <= c2 && c2 < TBase + TCount &&
michael@0 261 (c1 - SBase) % TCount == 0) {
michael@0 262 /*
michael@0 263 * Hangul LV and T.
michael@0 264 */
michael@0 265 *compp = c1 + (c2 - TBase);
michael@0 266 return (NS_OK);
michael@0 267 }
michael@0 268
michael@0 269 /*
michael@0 270 * Look up composition table. If the result is 0, no composition
michael@0 271 * is defined. Otherwise, upper 16bits of the result contains
michael@0 272 * the number of composition that begins with 'c1', and the lower
michael@0 273 * 16bits is the offset in 'compose_seq'.
michael@0 274 */
michael@0 275 if ((n = compose_char(c1, &cseq)) == 0)
michael@0 276 return (NS_SUCCESS_UNORM_NOTFOUND);
michael@0 277
michael@0 278 /*
michael@0 279 * The composite sequences are sorted by the 2nd character 'c2'.
michael@0 280 * So we can use binary search.
michael@0 281 */
michael@0 282 lo = 0;
michael@0 283 hi = n - 1;
michael@0 284 while (lo <= hi) {
michael@0 285 int32_t mid = (lo + hi) / 2;
michael@0 286
michael@0 287 if (cseq[mid].c2 < c2) {
michael@0 288 lo = mid + 1;
michael@0 289 } else if (cseq[mid].c2 > c2) {
michael@0 290 hi = mid - 1;
michael@0 291 } else {
michael@0 292 *compp = cseq[mid].comp;
michael@0 293 return (NS_OK);
michael@0 294 }
michael@0 295 }
michael@0 296 return (NS_SUCCESS_UNORM_NOTFOUND);
michael@0 297 }
michael@0 298
michael@0 299
/* Capacity of the inline arrays embedded in every work buffer. */
#define WORKBUF_SIZE 128
/* decompose() gives up once the work buffer has grown beyond this. */
#define WORKBUF_SIZE_MAX 10000

/*
 * Working buffer of the normalizer: two parallel arrays holding UCS-4
 * characters and their canonical classes. `ucs`/`cclass` start out
 * pointing at the inline arrays and are switched to heap storage by
 * workbuf_extend().
 */
struct workbuf_t {
    int32_t cur;                      /* index of the character being processed */
    int32_t last;                     /* index just after the last character */
    int32_t size;                     /* capacity of the ucs and cclass arrays */
    uint32_t *ucs;                    /* UCS-4 characters */
    int32_t *cclass;                  /* their canonical classes */
    uint32_t ucs_buf[WORKBUF_SIZE];   /* inline storage for ucs */
    int32_t class_buf[WORKBUF_SIZE];  /* inline storage for cclass */
};
michael@0 312
michael@0 313 static nsresult decompose(workbuf_t *wb, uint32_t c, int32_t compat);
michael@0 314 static void get_class(workbuf_t *wb);
michael@0 315 static void reorder(workbuf_t *wb);
michael@0 316 static void compose(workbuf_t *wb);
michael@0 317 static nsresult flush_before_cur(workbuf_t *wb, nsAString& aToStr);
michael@0 318 static void workbuf_init(workbuf_t *wb);
michael@0 319 static void workbuf_free(workbuf_t *wb);
michael@0 320 static nsresult workbuf_extend(workbuf_t *wb);
michael@0 321 static nsresult workbuf_append(workbuf_t *wb, uint32_t c);
michael@0 322 static void workbuf_shift(workbuf_t *wb, int32_t shift);
michael@0 323 static void workbuf_removevoid(workbuf_t *wb);
michael@0 324
michael@0 325
michael@0 326 static nsresult
michael@0 327 mdn_normalize(bool do_composition, bool compat,
michael@0 328 const nsAString& aSrcStr, nsAString& aToStr)
michael@0 329 {
michael@0 330 workbuf_t wb;
michael@0 331 nsresult r = NS_OK;
michael@0 332 /*
michael@0 333 * Initialize working buffer.
michael@0 334 */
michael@0 335 workbuf_init(&wb);
michael@0 336
michael@0 337 nsAString::const_iterator start, end;
michael@0 338 aSrcStr.BeginReading(start);
michael@0 339 aSrcStr.EndReading(end);
michael@0 340
michael@0 341 while (start != end) {
michael@0 342 uint32_t c;
michael@0 343 char16_t curChar;
michael@0 344
michael@0 345 //assert(wb.cur == wb.last);
michael@0 346
michael@0 347 /*
michael@0 348 * Get one character from 'from'.
michael@0 349 */
michael@0 350 curChar= *start++;
michael@0 351
michael@0 352 if (NS_IS_HIGH_SURROGATE(curChar) && start != end && NS_IS_LOW_SURROGATE(*(start)) ) {
michael@0 353 c = SURROGATE_TO_UCS4(curChar, *start);
michael@0 354 ++start;
michael@0 355 } else {
michael@0 356 c = curChar;
michael@0 357 }
michael@0 358
michael@0 359 /*
michael@0 360 * Decompose it.
michael@0 361 */
michael@0 362 if ((r = decompose(&wb, c, compat)) != NS_OK)
michael@0 363 break;
michael@0 364
michael@0 365 /*
michael@0 366 * Get canonical class.
michael@0 367 */
michael@0 368 get_class(&wb);
michael@0 369
michael@0 370 /*
michael@0 371 * Reorder & compose.
michael@0 372 */
michael@0 373 for (; wb.cur < wb.last; wb.cur++) {
michael@0 374 if (wb.cur == 0) {
michael@0 375 continue;
michael@0 376 } else if (wb.cclass[wb.cur] > 0) {
michael@0 377 /*
michael@0 378 * This is not a starter. Try reordering.
michael@0 379 * Note that characters up to it are
michael@0 380 * already in canonical order.
michael@0 381 */
michael@0 382 reorder(&wb);
michael@0 383 continue;
michael@0 384 }
michael@0 385
michael@0 386 /*
michael@0 387 * This is a starter character, and there are
michael@0 388 * some characters before it. Those characters
michael@0 389 * have been reordered properly, and
michael@0 390 * ready for composition.
michael@0 391 */
michael@0 392 if (do_composition && wb.cclass[0] == 0)
michael@0 393 compose(&wb);
michael@0 394
michael@0 395 /*
michael@0 396 * If CUR points to a starter character,
michael@0 397 * then process of characters before CUR are
michael@0 398 * already finished, because any further
michael@0 399 * reordering/composition for them are blocked
michael@0 400 * by the starter CUR points.
michael@0 401 */
michael@0 402 if (wb.cur > 0 && wb.cclass[wb.cur] == 0) {
michael@0 403 /* Flush everything before CUR. */
michael@0 404 r = flush_before_cur(&wb, aToStr);
michael@0 405 if (r != NS_OK)
michael@0 406 break;
michael@0 407 }
michael@0 408 }
michael@0 409 }
michael@0 410
michael@0 411 if (r == NS_OK) {
michael@0 412 if (do_composition && wb.cur > 0 && wb.cclass[0] == 0) {
michael@0 413 /*
michael@0 414 * There is some characters left in WB.
michael@0 415 * They are ordered, but not composed yet.
michael@0 416 * Now CUR points just after the last character in WB,
michael@0 417 * and since compose() tries to compose characters
michael@0 418 * between top and CUR inclusive, we must make CUR
michael@0 419 * one character back during compose().
michael@0 420 */
michael@0 421 wb.cur--;
michael@0 422 compose(&wb);
michael@0 423 wb.cur++;
michael@0 424 }
michael@0 425 /*
michael@0 426 * Call this even when WB.CUR == 0, to make TO
michael@0 427 * NUL-terminated.
michael@0 428 */
michael@0 429 r = flush_before_cur(&wb, aToStr);
michael@0 430 }
michael@0 431
michael@0 432 workbuf_free(&wb);
michael@0 433
michael@0 434 return (r);
michael@0 435 }
michael@0 436
michael@0 437 static nsresult
michael@0 438 decompose(workbuf_t *wb, uint32_t c, int32_t compat) {
michael@0 439 nsresult r;
michael@0 440 int32_t dec_len;
michael@0 441
michael@0 442 again:
michael@0 443 r = mdn__unicode_decompose(compat, wb->ucs + wb->last,
michael@0 444 wb->size - wb->last, c, &dec_len);
michael@0 445 switch (r) {
michael@0 446 case NS_OK:
michael@0 447 wb->last += dec_len;
michael@0 448 return (NS_OK);
michael@0 449 case NS_SUCCESS_UNORM_NOTFOUND:
michael@0 450 return (workbuf_append(wb, c));
michael@0 451 case NS_ERROR_UNORM_MOREOUTPUT:
michael@0 452 if ((r = workbuf_extend(wb)) != NS_OK)
michael@0 453 return (r);
michael@0 454 if (wb->size > WORKBUF_SIZE_MAX) {
michael@0 455 // "mdn__unormalize_form*: " "working buffer too large\n"
michael@0 456 return (NS_ERROR_FAILURE);
michael@0 457 }
michael@0 458 goto again;
michael@0 459 default:
michael@0 460 return (r);
michael@0 461 }
michael@0 462 /* NOTREACHED */
michael@0 463 }
michael@0 464
michael@0 465 static void
michael@0 466 get_class(workbuf_t *wb) {
michael@0 467 int32_t i;
michael@0 468
michael@0 469 for (i = wb->cur; i < wb->last; i++)
michael@0 470 wb->cclass[i] = canonclass(wb->ucs[i]);
michael@0 471 }
michael@0 472
michael@0 473 static void
michael@0 474 reorder(workbuf_t *wb) {
michael@0 475 uint32_t c;
michael@0 476 int32_t i;
michael@0 477 int32_t cclass;
michael@0 478
michael@0 479 //assert(wb != nullptr);
michael@0 480
michael@0 481 i = wb->cur;
michael@0 482 c = wb->ucs[i];
michael@0 483 cclass = wb->cclass[i];
michael@0 484
michael@0 485 while (i > 0 && wb->cclass[i - 1] > cclass) {
michael@0 486 wb->ucs[i] = wb->ucs[i - 1];
michael@0 487 wb->cclass[i] =wb->cclass[i - 1];
michael@0 488 i--;
michael@0 489 wb->ucs[i] = c;
michael@0 490 wb->cclass[i] = cclass;
michael@0 491 }
michael@0 492 }
michael@0 493
michael@0 494 static void
michael@0 495 compose(workbuf_t *wb) {
michael@0 496 int32_t cur;
michael@0 497 uint32_t *ucs;
michael@0 498 int32_t *cclass;
michael@0 499 int32_t last_class;
michael@0 500 int32_t nvoids;
michael@0 501 int32_t i;
michael@0 502
michael@0 503 //assert(wb != nullptr && wb->cclass[0] == 0);
michael@0 504
michael@0 505 cur = wb->cur;
michael@0 506 ucs = wb->ucs;
michael@0 507 cclass = wb->cclass;
michael@0 508
michael@0 509 /*
michael@0 510 * If there are no decomposition sequence that begins with
michael@0 511 * the top character, composition is impossible.
michael@0 512 */
michael@0 513 if (!mdn__unicode_iscompositecandidate(ucs[0]))
michael@0 514 return;
michael@0 515
michael@0 516 last_class = 0;
michael@0 517 nvoids = 0;
michael@0 518 for (i = 1; i <= cur; i++) {
michael@0 519 uint32_t c;
michael@0 520 int32_t cl = cclass[i];
michael@0 521
michael@0 522 if ((last_class < cl || cl == 0) &&
michael@0 523 mdn__unicode_compose(ucs[0], ucs[i],
michael@0 524 &c) == NS_OK) {
michael@0 525 /*
michael@0 526 * Replace the top character with the composed one.
michael@0 527 */
michael@0 528 ucs[0] = c;
michael@0 529 cclass[0] = canonclass(c);
michael@0 530
michael@0 531 cclass[i] = -1; /* void this character */
michael@0 532 nvoids++;
michael@0 533 } else {
michael@0 534 last_class = cl;
michael@0 535 }
michael@0 536 }
michael@0 537
michael@0 538 /* Purge void characters, if any. */
michael@0 539 if (nvoids > 0)
michael@0 540 workbuf_removevoid(wb);
michael@0 541 }
michael@0 542
michael@0 543 static nsresult
michael@0 544 flush_before_cur(workbuf_t *wb, nsAString& aToStr)
michael@0 545 {
michael@0 546 int32_t i;
michael@0 547
michael@0 548 for (i = 0; i < wb->cur; i++) {
michael@0 549 if (!IS_IN_BMP(wb->ucs[i])) {
michael@0 550 aToStr.Append((char16_t)H_SURROGATE(wb->ucs[i]));
michael@0 551 aToStr.Append((char16_t)L_SURROGATE(wb->ucs[i]));
michael@0 552 } else {
michael@0 553 aToStr.Append((char16_t)(wb->ucs[i]));
michael@0 554 }
michael@0 555 }
michael@0 556
michael@0 557 workbuf_shift(wb, wb->cur);
michael@0 558
michael@0 559 return (NS_OK);
michael@0 560 }
michael@0 561
michael@0 562 static void
michael@0 563 workbuf_init(workbuf_t *wb) {
michael@0 564 wb->cur = 0;
michael@0 565 wb->last = 0;
michael@0 566 wb->size = WORKBUF_SIZE;
michael@0 567 wb->ucs = wb->ucs_buf;
michael@0 568 wb->cclass = wb->class_buf;
michael@0 569 }
michael@0 570
michael@0 571 static void
michael@0 572 workbuf_free(workbuf_t *wb) {
michael@0 573 if (wb->ucs != wb->ucs_buf) {
michael@0 574 nsMemory::Free(wb->ucs);
michael@0 575 nsMemory::Free(wb->cclass);
michael@0 576 }
michael@0 577 }
michael@0 578
michael@0 579 static nsresult
michael@0 580 workbuf_extend(workbuf_t *wb) {
michael@0 581 int32_t newsize = wb->size * 3;
michael@0 582
michael@0 583 if (wb->ucs == wb->ucs_buf) {
michael@0 584 wb->ucs = (uint32_t*)nsMemory::Alloc(sizeof(wb->ucs[0]) * newsize);
michael@0 585 if (!wb->ucs)
michael@0 586 return NS_ERROR_OUT_OF_MEMORY;
michael@0 587 wb->cclass = (int32_t*)nsMemory::Alloc(sizeof(wb->cclass[0]) * newsize);
michael@0 588 if (!wb->cclass) {
michael@0 589 nsMemory::Free(wb->ucs);
michael@0 590 wb->ucs = nullptr;
michael@0 591 return NS_ERROR_OUT_OF_MEMORY;
michael@0 592 }
michael@0 593 } else {
michael@0 594 void* buf = nsMemory::Realloc(wb->ucs, sizeof(wb->ucs[0]) * newsize);
michael@0 595 if (!buf)
michael@0 596 return NS_ERROR_OUT_OF_MEMORY;
michael@0 597 wb->ucs = (uint32_t*)buf;
michael@0 598 buf = nsMemory::Realloc(wb->cclass, sizeof(wb->cclass[0]) * newsize);
michael@0 599 if (!buf)
michael@0 600 return NS_ERROR_OUT_OF_MEMORY;
michael@0 601 wb->cclass = (int32_t*)buf;
michael@0 602 }
michael@0 603 return (NS_OK);
michael@0 604 }
michael@0 605
michael@0 606 static nsresult
michael@0 607 workbuf_append(workbuf_t *wb, uint32_t c) {
michael@0 608 nsresult r;
michael@0 609
michael@0 610 if (wb->last >= wb->size && (r = workbuf_extend(wb)) != NS_OK)
michael@0 611 return (r);
michael@0 612 wb->ucs[wb->last++] = c;
michael@0 613 return (NS_OK);
michael@0 614 }
michael@0 615
michael@0 616 static void
michael@0 617 workbuf_shift(workbuf_t *wb, int32_t shift) {
michael@0 618 int32_t nmove;
michael@0 619
michael@0 620 //assert(wb != nullptr && wb->cur >= shift);
michael@0 621
michael@0 622 nmove = wb->last - shift;
michael@0 623 memmove(&wb->ucs[0], &wb->ucs[shift],
michael@0 624 nmove * sizeof(wb->ucs[0]));
michael@0 625 memmove(&wb->cclass[0], &wb->cclass[shift],
michael@0 626 nmove * sizeof(wb->cclass[0]));
michael@0 627 wb->cur -= shift;
michael@0 628 wb->last -= shift;
michael@0 629 }
michael@0 630
michael@0 631 static void
michael@0 632 workbuf_removevoid(workbuf_t *wb) {
michael@0 633 int32_t i, j;
michael@0 634 int32_t last = wb->last;
michael@0 635
michael@0 636 for (i = j = 0; i < last; i++) {
michael@0 637 if (wb->cclass[i] >= 0) {
michael@0 638 if (j < i) {
michael@0 639 wb->ucs[j] = wb->ucs[i];
michael@0 640 wb->cclass[j] = wb->cclass[i];
michael@0 641 }
michael@0 642 j++;
michael@0 643 }
michael@0 644 }
michael@0 645 wb->cur -= last - j;
michael@0 646 wb->last = j;
michael@0 647 }
michael@0 648
michael@0 649 nsresult
michael@0 650 nsUnicodeNormalizer::NormalizeUnicodeNFD( const nsAString& aSrc, nsAString& aDest)
michael@0 651 {
michael@0 652 return mdn_normalize(false, false, aSrc, aDest);
michael@0 653 }
michael@0 654
michael@0 655 nsresult
michael@0 656 nsUnicodeNormalizer::NormalizeUnicodeNFC( const nsAString& aSrc, nsAString& aDest)
michael@0 657 {
michael@0 658 return mdn_normalize(true, false, aSrc, aDest);
michael@0 659 }
michael@0 660
michael@0 661 nsresult
michael@0 662 nsUnicodeNormalizer::NormalizeUnicodeNFKD( const nsAString& aSrc, nsAString& aDest)
michael@0 663 {
michael@0 664 return mdn_normalize(false, true, aSrc, aDest);
michael@0 665 }
michael@0 666
michael@0 667 nsresult
michael@0 668 nsUnicodeNormalizer::NormalizeUnicodeNFKC( const nsAString& aSrc, nsAString& aDest)
michael@0 669 {
michael@0 670 return mdn_normalize(true, true, aSrc, aDest);
michael@0 671 }
michael@0 672
michael@0 673 bool
michael@0 674 nsUnicodeNormalizer::Compose(uint32_t a, uint32_t b, uint32_t *ab)
michael@0 675 {
michael@0 676 return mdn__unicode_compose(a, b, ab) == NS_OK;
michael@0 677 }
michael@0 678
michael@0 679 bool
michael@0 680 nsUnicodeNormalizer::DecomposeNonRecursively(uint32_t c, uint32_t *c1, uint32_t *c2)
michael@0 681 {
michael@0 682 // We can't use mdn__unicode_decompose here, because that does a recursive
michael@0 683 // decomposition that may yield more than two characters, but the harfbuzz
michael@0 684 // callback wants just a single-step decomp that is guaranteed to produce
michael@0 685 // no more than two characters. So we do a low-level lookup in the table
michael@0 686 // of decomp sequences.
michael@0 687 const uint32_t *seq;
michael@0 688 uint32_t seqidx = decompose_char(c, &seq);
michael@0 689 if (seqidx == 0 || ((seqidx & DECOMP_COMPAT) != 0)) {
michael@0 690 return false;
michael@0 691 }
michael@0 692 *c1 = *seq & ~END_BIT;
michael@0 693 if (*seq & END_BIT) {
michael@0 694 *c2 = 0;
michael@0 695 } else {
michael@0 696 *c2 = *++seq & ~END_BIT;
michael@0 697 }
michael@0 698 return true;
michael@0 699 }

mercurial