The Tor Browser: diff intl/unicharutil/src/nsUnicodeNormalizer.cpp

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/unicharutil/src/nsUnicodeNormalizer.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,699 @@
     1.4 +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
     1.5 +
     1.6 +/* This file is modified from JPNIC's mDNKit, it is under both MPL and 
     1.7 + * JPNIC's license.
     1.8 + */
     1.9 +
    1.10 +/* This Source Code Form is subject to the terms of the Mozilla Public
    1.11 + * License, v. 2.0. If a copy of the MPL was not distributed with this
    1.12 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
    1.13 +
    1.14 +/*
    1.15 + * Copyright (c) 2000,2002 Japan Network Information Center.
    1.16 + * All rights reserved.
    1.17 + *  
    1.18 + * By using this file, you agree to the terms and conditions set forth bellow.
    1.19 + * 
    1.20 + * 			LICENSE TERMS AND CONDITIONS 
    1.21 + * 
    1.22 + * The following License Terms and Conditions apply, unless a different
    1.23 + * license is obtained from Japan Network Information Center ("JPNIC"),
    1.24 + * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
    1.25 + * Chiyoda-ku, Tokyo 101-0047, Japan.
    1.26 + * 
    1.27 + * 1. Use, Modification and Redistribution (including distribution of any
    1.28 + *    modified or derived work) in source and/or binary forms is permitted
    1.29 + *    under this License Terms and Conditions.
    1.30 + * 
    1.31 + * 2. Redistribution of source code must retain the copyright notices as they
    1.32 + *    appear in each source code file, this License Terms and Conditions.
    1.33 + * 
    1.34 + * 3. Redistribution in binary form must reproduce the Copyright Notice,
    1.35 + *    this License Terms and Conditions, in the documentation and/or other
    1.36 + *    materials provided with the distribution.  For the purposes of binary
    1.37 + *    distribution the "Copyright Notice" refers to the following language:
    1.38 + *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
    1.39 + * 
    1.40 + * 4. The name of JPNIC may not be used to endorse or promote products
    1.41 + *    derived from this Software without specific prior written approval of
    1.42 + *    JPNIC.
    1.43 + * 
    1.44 + * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
    1.45 + *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    1.46 + *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
    1.47 + *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
    1.48 + *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    1.49 + *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    1.50 + *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
    1.51 + *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
    1.52 + *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
    1.53 + *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
    1.54 + *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
    1.55 + */
    1.56 +
    1.57 +#include <string.h>
    1.58 +
    1.59 +#include "nsMemory.h"
    1.60 +#include "nsUnicodeNormalizer.h"
    1.61 +#include "nsString.h"
    1.62 +
    1.63 +NS_IMPL_ISUPPORTS(nsUnicodeNormalizer, nsIUnicodeNormalizer)
    1.64 +
    1.65 +
    1.66 +nsUnicodeNormalizer::nsUnicodeNormalizer()
    1.67 +{
    1.68 +}
    1.69 +
    1.70 +nsUnicodeNormalizer::~nsUnicodeNormalizer()
    1.71 +{
    1.72 +}
    1.73 +
    1.74 +
    1.75 +
    1.76 +#define END_BIT		0x80000000
    1.77 +
    1.78 +
    1.79 +/*
    1.80 + * Some constants for Hangul decomposition/composition.
    1.81 + * These things were taken from unicode book. 
    1.82 + */
    1.83 +#define SBase		0xac00
    1.84 +#define LBase		0x1100
    1.85 +#define VBase		0x1161
    1.86 +#define TBase		0x11a7
    1.87 +#define LCount		19
    1.88 +#define VCount		21
    1.89 +#define TCount		28
    1.90 +#define SLast		(SBase + LCount * VCount * TCount)
    1.91 +
/*
 * One entry of the composition sequence table.  The entries belonging to
 * one first character are searched by 'c2' (see mdn__unicode_compose()).
 */
struct composition {
	uint32_t c2;	/* second character of the pair */
	uint32_t comp;	/* character the pair composes to */
};
    1.96 +
    1.97 +
    1.98 +#include "normalization_data.h"
    1.99 +
   1.100 +/*
   1.101 + * Macro for multi-level index table.
   1.102 + */
   1.103 +#define LOOKUPTBL(vprefix, mprefix, v) \
   1.104 +	DMAP(vprefix)[\
   1.105 +		IMAP(vprefix)[\
   1.106 +			IMAP(vprefix)[IDX0(mprefix, v)] + IDX1(mprefix, v)\
   1.107 +		]\
   1.108 +	].tbl[IDX2(mprefix, v)]
   1.109 +
   1.110 +#define IDX0(mprefix, v) IDX_0(v, BITS1(mprefix), BITS2(mprefix))
   1.111 +#define IDX1(mprefix, v) IDX_1(v, BITS1(mprefix), BITS2(mprefix))
   1.112 +#define IDX2(mprefix, v) IDX_2(v, BITS1(mprefix), BITS2(mprefix))
   1.113 +
   1.114 +#define IDX_0(v, bits1, bits2)	((v) >> ((bits1) + (bits2)))
   1.115 +#define IDX_1(v, bits1, bits2)	(((v) >> (bits2)) & ((1 << (bits1)) - 1))
   1.116 +#define IDX_2(v, bits1, bits2)	((v) & ((1 << (bits2)) - 1))
   1.117 +
   1.118 +#define BITS1(mprefix)	mprefix ## _BITS_1
   1.119 +#define BITS2(mprefix)	mprefix ## _BITS_2
   1.120 +
   1.121 +#define IMAP(vprefix)	vprefix ## _imap
   1.122 +#define DMAP(vprefix)	vprefix ## _table
   1.123 +#define SEQ(vprefix)	vprefix ## _seq
   1.124 +
   1.125 +static int32_t
   1.126 +canonclass(uint32_t c) {
   1.127 +	/* Look up canonicalclass table. */
   1.128 +	return (LOOKUPTBL(canon_class, CANON_CLASS, c));
   1.129 +}
   1.130 +
   1.131 +static int32_t
   1.132 +decompose_char(uint32_t c, const uint32_t **seqp)
   1.133 +{
   1.134 +	/* Look up decomposition table. */
   1.135 +	int32_t seqidx = LOOKUPTBL(decompose, DECOMP, c);
   1.136 +	*seqp = SEQ(decompose) + (seqidx & ~DECOMP_COMPAT);
   1.137 +	return (seqidx);
   1.138 +}
   1.139 +
   1.140 +static int32_t
   1.141 +compose_char(uint32_t c,
   1.142 +				const struct composition **compp)
   1.143 +{
   1.144 +	/* Look up composition table. */
   1.145 +	int32_t seqidx = LOOKUPTBL(compose, CANON_COMPOSE, c);
   1.146 +	*compp = SEQ(compose) + (seqidx & 0xffff);
   1.147 +	return (seqidx >> 16);
   1.148 +}
   1.149 +
   1.150 +static nsresult
   1.151 +mdn__unicode_decompose(int32_t compat, uint32_t *v, size_t vlen,
   1.152 +		       uint32_t c, int32_t *decomp_lenp)
   1.153 +{
   1.154 +	uint32_t *vorg = v;
   1.155 +	int32_t seqidx;
   1.156 +	const uint32_t *seq;
   1.157 +
   1.158 +	//assert(v != nullptr && vlen >= 0 && decomp_lenp != nullptr);
   1.159 +
   1.160 +	/*
   1.161 +	 * First, check for Hangul.
   1.162 +	 */
   1.163 +	if (SBase <= c && c < SLast) {
   1.164 +		int32_t idx, t_offset, v_offset, l_offset;
   1.165 +
   1.166 +		idx = c - SBase;
   1.167 +		t_offset = idx % TCount;
   1.168 +		idx /= TCount;
   1.169 +		v_offset = idx % VCount;
   1.170 +		l_offset = idx / VCount;
   1.171 +		if ((t_offset == 0 && vlen < 2) || (t_offset > 0 && vlen < 3))
   1.172 +			return (NS_ERROR_UNORM_MOREOUTPUT);
   1.173 +		*v++ = LBase + l_offset;
   1.174 +		*v++ = VBase + v_offset;
   1.175 +		if (t_offset > 0)
   1.176 +			*v++ = TBase + t_offset;
   1.177 +		*decomp_lenp = v - vorg;
   1.178 +		return (NS_OK);
   1.179 +	}
   1.180 +
   1.181 +	/*
   1.182 +	 * Look up decomposition table.  If no decomposition is defined
   1.183 +	 * or if it is a compatibility decomosition when canonical
   1.184 +	 * decomposition requested, return 'NS_SUCCESS_UNORM_NOTFOUND'.
   1.185 +	 */
   1.186 +	seqidx = decompose_char(c, &seq);
   1.187 +	if (seqidx == 0 || (compat == 0 && (seqidx & DECOMP_COMPAT) != 0))
   1.188 +		return (NS_SUCCESS_UNORM_NOTFOUND);
   1.189 +	
   1.190 +	/*
   1.191 +	 * Copy the decomposed sequence.  The end of the sequence are
   1.192 +	 * marked with END_BIT.
   1.193 +	 */
   1.194 +	do {
   1.195 +		uint32_t c;
   1.196 +		int32_t dlen;
   1.197 +		nsresult r;
   1.198 +
   1.199 +		c = *seq & ~END_BIT;
   1.200 +
   1.201 +		/* Decompose recursively. */
   1.202 +		r = mdn__unicode_decompose(compat, v, vlen, c, &dlen);
   1.203 +		if (r == NS_OK) {
   1.204 +			v += dlen;
   1.205 +			vlen -= dlen;
   1.206 +		} else if (r == NS_SUCCESS_UNORM_NOTFOUND) {
   1.207 +			if (vlen < 1)
   1.208 +				return (NS_ERROR_UNORM_MOREOUTPUT);
   1.209 +			*v++ = c;
   1.210 +			vlen--;
   1.211 +		} else {
   1.212 +			return (r);
   1.213 +		}
   1.214 +
   1.215 +	} while ((*seq++ & END_BIT) == 0);
   1.216 +	
   1.217 +	*decomp_lenp = v - vorg;
   1.218 +
   1.219 +	return (NS_OK);
   1.220 +}
   1.221 +
   1.222 +static int32_t
   1.223 +mdn__unicode_iscompositecandidate(uint32_t c)
   1.224 +{
   1.225 +	const struct composition *dummy;
   1.226 +
   1.227 +	/* Check for Hangul */
   1.228 +	if ((LBase <= c && c < LBase + LCount) || (SBase <= c && c < SLast))
   1.229 +		return (1);
   1.230 +
   1.231 +	/*
   1.232 +	 * Look up composition table.  If there are no composition
   1.233 +	 * that begins with the given character, it is not a
   1.234 +	 * composition candidate.
   1.235 +	 */
   1.236 +	if (compose_char(c, &dummy) == 0)
   1.237 +		return (0);
   1.238 +	else
   1.239 +		return (1);
   1.240 +}
   1.241 +
   1.242 +static nsresult
   1.243 +mdn__unicode_compose(uint32_t c1, uint32_t c2, uint32_t *compp)
   1.244 +{
   1.245 +	int32_t n;
   1.246 +	int32_t lo, hi;
   1.247 +	const struct composition *cseq;
   1.248 +
   1.249 +	//assert(compp != nullptr);
   1.250 +
   1.251 +	/*
   1.252 +	 * Check for Hangul.
   1.253 +	 */
   1.254 +	if (LBase <= c1 && c1 < LBase + LCount &&
   1.255 +	    VBase <= c2 && c2 < VBase + VCount) {
   1.256 +		/*
   1.257 +		 * Hangul L and V.
   1.258 +		 */
   1.259 +		*compp = SBase +
   1.260 +			((c1 - LBase) * VCount + (c2 - VBase)) * TCount;
   1.261 +		return (NS_OK);
   1.262 +	} else if (SBase <= c1 && c1 < SLast &&
   1.263 +		   TBase <= c2 && c2 < TBase + TCount &&
   1.264 +		   (c1 - SBase) % TCount == 0) {
   1.265 +		/*
   1.266 +		 * Hangul LV and T.
   1.267 +		 */
   1.268 +		*compp = c1 + (c2 - TBase);
   1.269 +		return (NS_OK);
   1.270 +	}
   1.271 +
   1.272 +	/*
   1.273 +	 * Look up composition table.  If the result is 0, no composition
   1.274 +	 * is defined.  Otherwise, upper 16bits of the result contains
   1.275 +	 * the number of composition that begins with 'c1', and the lower
   1.276 +	 * 16bits is the offset in 'compose_seq'.
   1.277 +	 */
   1.278 +	if ((n = compose_char(c1, &cseq)) == 0)
   1.279 +		return (NS_SUCCESS_UNORM_NOTFOUND);
   1.280 +
   1.281 +	/*
   1.282 +	 * The composite sequences are sorted by the 2nd character 'c2'.
   1.283 +	 * So we can use binary search.
   1.284 +	 */
   1.285 +	lo = 0;
   1.286 +	hi = n - 1;
   1.287 +	while (lo <= hi) {
   1.288 +		int32_t mid = (lo + hi) / 2;
   1.289 +
   1.290 +		if (cseq[mid].c2 < c2) {
   1.291 +			lo = mid + 1;
   1.292 +		} else if (cseq[mid].c2 > c2) {
   1.293 +			hi = mid - 1;
   1.294 +		} else {
   1.295 +			*compp = cseq[mid].comp;
   1.296 +			return (NS_OK);
   1.297 +		}
   1.298 +	}
   1.299 +	return (NS_SUCCESS_UNORM_NOTFOUND);
   1.300 +}
   1.301 +
   1.302 +
   1.303 +#define WORKBUF_SIZE		128
   1.304 +#define WORKBUF_SIZE_MAX	10000
   1.305 +
   1.306 +typedef struct {
   1.307 +	int32_t cur;		/* pointing now processing character */
   1.308 +	int32_t last;		/* pointing just after the last character */
   1.309 +	int32_t size;		/* size of UCS and CLASS array */
   1.310 +	uint32_t *ucs;	/* UCS-4 characters */
   1.311 +	int32_t *cclass;		/* and their canonical classes */
   1.312 +	uint32_t ucs_buf[WORKBUF_SIZE];	/* local buffer */
   1.313 +	int32_t class_buf[WORKBUF_SIZE];		/* ditto */
   1.314 +} workbuf_t;
   1.315 +
   1.316 +static nsresult	decompose(workbuf_t *wb, uint32_t c, int32_t compat);
   1.317 +static void		get_class(workbuf_t *wb);
   1.318 +static void		reorder(workbuf_t *wb);
   1.319 +static void		compose(workbuf_t *wb);
   1.320 +static nsresult flush_before_cur(workbuf_t *wb, nsAString& aToStr);
   1.321 +static void		workbuf_init(workbuf_t *wb);
   1.322 +static void		workbuf_free(workbuf_t *wb);
   1.323 +static nsresult	workbuf_extend(workbuf_t *wb);
   1.324 +static nsresult	workbuf_append(workbuf_t *wb, uint32_t c);
   1.325 +static void		workbuf_shift(workbuf_t *wb, int32_t shift);
   1.326 +static void		workbuf_removevoid(workbuf_t *wb);
   1.327 +
   1.328 +
   1.329 +static nsresult
   1.330 +mdn_normalize(bool do_composition, bool compat,
   1.331 +	  const nsAString& aSrcStr, nsAString& aToStr)
   1.332 +{
   1.333 +	workbuf_t wb;
   1.334 +	nsresult r = NS_OK;
   1.335 +	/*
   1.336 +	 * Initialize working buffer.
   1.337 +	 */
   1.338 +	workbuf_init(&wb);
   1.339 +
   1.340 +	nsAString::const_iterator start, end;
   1.341 +	aSrcStr.BeginReading(start); 
   1.342 +	aSrcStr.EndReading(end); 
   1.343 +
   1.344 +	while (start != end) {
   1.345 +		uint32_t c;
   1.346 +		char16_t curChar;
   1.347 +
   1.348 +		//assert(wb.cur == wb.last);
   1.349 +
   1.350 +		/*
   1.351 +		 * Get one character from 'from'.
   1.352 +		 */
   1.353 +		curChar= *start++;
   1.354 +
   1.355 +		if (NS_IS_HIGH_SURROGATE(curChar) && start != end && NS_IS_LOW_SURROGATE(*(start)) ) {
   1.356 +			c = SURROGATE_TO_UCS4(curChar, *start);
   1.357 +			++start;
   1.358 +		} else {
   1.359 +			c = curChar;
   1.360 +		}
   1.361 +
   1.362 +		/*
   1.363 +		 * Decompose it.
   1.364 +		 */
   1.365 +		if ((r = decompose(&wb, c, compat)) != NS_OK)
   1.366 +			break;
   1.367 +
   1.368 +		/*
   1.369 +		 * Get canonical class.
   1.370 +		 */
   1.371 +		get_class(&wb);
   1.372 +
   1.373 +		/*
   1.374 +		 * Reorder & compose.
   1.375 +		 */
   1.376 +		for (; wb.cur < wb.last; wb.cur++) {
   1.377 +			if (wb.cur == 0) {
   1.378 +				continue;
   1.379 +			} else if (wb.cclass[wb.cur] > 0) {
   1.380 +				/*
   1.381 +				 * This is not a starter. Try reordering.
   1.382 +				 * Note that characters up to it are
   1.383 +				 * already in canonical order.
   1.384 +				 */
   1.385 +				reorder(&wb);
   1.386 +				continue;
   1.387 +			}
   1.388 +
   1.389 +			/*
   1.390 +			 * This is a starter character, and there are
   1.391 +			 * some characters before it.  Those characters
   1.392 +			 * have been reordered properly, and
   1.393 +			 * ready for composition.
   1.394 +			 */
   1.395 +			if (do_composition && wb.cclass[0] == 0)
   1.396 +				compose(&wb);
   1.397 +
   1.398 +			/*
   1.399 +			 * If CUR points to a starter character,
   1.400 +			 * then process of characters before CUR are
   1.401 +			 * already finished, because any further
   1.402 +			 * reordering/composition for them are blocked
   1.403 +			 * by the starter CUR points.
   1.404 +			 */
   1.405 +			if (wb.cur > 0 && wb.cclass[wb.cur] == 0) {
   1.406 +				/* Flush everything before CUR. */
   1.407 +				r = flush_before_cur(&wb, aToStr);
   1.408 +				if (r != NS_OK)
   1.409 +					break;
   1.410 +			}
   1.411 +		}
   1.412 +	}
   1.413 +
   1.414 +	if (r == NS_OK) {
   1.415 +		if (do_composition && wb.cur > 0 && wb.cclass[0] == 0) {
   1.416 +			/*
   1.417 +			 * There is some characters left in WB.
   1.418 +			 * They are ordered, but not composed yet.
   1.419 +			 * Now CUR points just after the last character in WB,
   1.420 +			 * and since compose() tries to compose characters
   1.421 +			 * between top and CUR inclusive, we must make CUR
   1.422 +			 * one character back during compose().
   1.423 +			 */
   1.424 +			wb.cur--;
   1.425 +			compose(&wb);
   1.426 +			wb.cur++;
   1.427 +		}
   1.428 +		/*
   1.429 +		 * Call this even when WB.CUR == 0, to make TO
   1.430 +		 * NUL-terminated.
   1.431 +		 */
   1.432 +		r = flush_before_cur(&wb, aToStr);
   1.433 +	}
   1.434 +
   1.435 +	workbuf_free(&wb);
   1.436 +
   1.437 +	return (r);
   1.438 +}
   1.439 +
   1.440 +static nsresult
   1.441 +decompose(workbuf_t *wb, uint32_t c, int32_t compat) {
   1.442 +	nsresult r;
   1.443 +	int32_t dec_len;
   1.444 +
   1.445 +again:
   1.446 +	r = mdn__unicode_decompose(compat, wb->ucs + wb->last,
   1.447 +				   wb->size - wb->last, c, &dec_len);
   1.448 +	switch (r) {
   1.449 +	case NS_OK:
   1.450 +		wb->last += dec_len;
   1.451 +		return (NS_OK);
   1.452 +	case NS_SUCCESS_UNORM_NOTFOUND:
   1.453 +		return (workbuf_append(wb, c));
   1.454 +	case NS_ERROR_UNORM_MOREOUTPUT:
   1.455 +		if ((r = workbuf_extend(wb)) != NS_OK)
   1.456 +			return (r);
   1.457 +		if (wb->size > WORKBUF_SIZE_MAX) {
   1.458 +			// "mdn__unormalize_form*: " "working buffer too large\n"
   1.459 +			return (NS_ERROR_FAILURE);
   1.460 +		}
   1.461 +		goto again;
   1.462 +	default:
   1.463 +		return (r);
   1.464 +	}
   1.465 +	/* NOTREACHED */
   1.466 +}
   1.467 +
   1.468 +static void		
   1.469 +get_class(workbuf_t *wb) {
   1.470 +	int32_t i;
   1.471 +
   1.472 +	for (i = wb->cur; i < wb->last; i++)
   1.473 +		wb->cclass[i] = canonclass(wb->ucs[i]);
   1.474 +}
   1.475 +
   1.476 +static void
   1.477 +reorder(workbuf_t *wb) {
   1.478 +	uint32_t c;
   1.479 +	int32_t i;
   1.480 +	int32_t cclass;
   1.481 +
   1.482 +	//assert(wb != nullptr);
   1.483 +
   1.484 +	i = wb->cur;
   1.485 +	c = wb->ucs[i];
   1.486 +	cclass = wb->cclass[i];
   1.487 +
   1.488 +	while (i > 0 && wb->cclass[i - 1] > cclass) {
   1.489 +		wb->ucs[i] = wb->ucs[i - 1];
   1.490 +		wb->cclass[i] =wb->cclass[i - 1];
   1.491 +		i--;
   1.492 +		wb->ucs[i] = c;
   1.493 +		wb->cclass[i] = cclass;
   1.494 +	}
   1.495 +}
   1.496 +
   1.497 +static void
   1.498 +compose(workbuf_t *wb) {
   1.499 +	int32_t cur;
   1.500 +	uint32_t *ucs;
   1.501 +	int32_t *cclass;
   1.502 +	int32_t last_class;
   1.503 +	int32_t nvoids;
   1.504 +	int32_t i;
   1.505 +
   1.506 +	//assert(wb != nullptr && wb->cclass[0] == 0);
   1.507 +
   1.508 +	cur = wb->cur;
   1.509 +	ucs = wb->ucs;
   1.510 +	cclass = wb->cclass;
   1.511 +
   1.512 +	/*
   1.513 +	 * If there are no decomposition sequence that begins with
   1.514 +	 * the top character, composition is impossible.
   1.515 +	 */
   1.516 +	if (!mdn__unicode_iscompositecandidate(ucs[0]))
   1.517 +		return;
   1.518 +
   1.519 +	last_class = 0;
   1.520 +	nvoids = 0;
   1.521 +	for (i = 1; i <= cur; i++) {
   1.522 +		uint32_t c;
   1.523 +		int32_t cl = cclass[i];
   1.524 +
   1.525 +		if ((last_class < cl || cl == 0) &&
   1.526 +		    mdn__unicode_compose(ucs[0], ucs[i],
   1.527 +					 &c) == NS_OK) {
   1.528 +			/*
   1.529 +			 * Replace the top character with the composed one.
   1.530 +			 */
   1.531 +			ucs[0] = c;
   1.532 +			cclass[0] = canonclass(c);
   1.533 +
   1.534 +			cclass[i] = -1;	/* void this character */
   1.535 +			nvoids++;
   1.536 +		} else {
   1.537 +			last_class = cl;
   1.538 +		}
   1.539 +	}
   1.540 +
   1.541 +	/* Purge void characters, if any. */
   1.542 +	if (nvoids > 0)
   1.543 +		workbuf_removevoid(wb);
   1.544 +}
   1.545 +
   1.546 +static nsresult
   1.547 +flush_before_cur(workbuf_t *wb, nsAString& aToStr) 
   1.548 +{
   1.549 +	int32_t i;
   1.550 +
   1.551 +	for (i = 0; i < wb->cur; i++) {
   1.552 +		if (!IS_IN_BMP(wb->ucs[i])) {
   1.553 +			aToStr.Append((char16_t)H_SURROGATE(wb->ucs[i]));
   1.554 +			aToStr.Append((char16_t)L_SURROGATE(wb->ucs[i]));
   1.555 +		} else {
   1.556 +			aToStr.Append((char16_t)(wb->ucs[i]));
   1.557 +		}
   1.558 +	}
   1.559 +
   1.560 +	workbuf_shift(wb, wb->cur);
   1.561 +
   1.562 +	return (NS_OK);
   1.563 +}
   1.564 +
   1.565 +static void
   1.566 +workbuf_init(workbuf_t *wb) {
   1.567 +	wb->cur = 0;
   1.568 +	wb->last = 0;
   1.569 +	wb->size = WORKBUF_SIZE;
   1.570 +	wb->ucs = wb->ucs_buf;
   1.571 +	wb->cclass = wb->class_buf;
   1.572 +}
   1.573 +
   1.574 +static void
   1.575 +workbuf_free(workbuf_t *wb) {
   1.576 +	if (wb->ucs != wb->ucs_buf) {
   1.577 +		nsMemory::Free(wb->ucs);
   1.578 +		nsMemory::Free(wb->cclass);
   1.579 +	}
   1.580 +}
   1.581 +
   1.582 +static nsresult
   1.583 +workbuf_extend(workbuf_t *wb) {
   1.584 +	int32_t newsize = wb->size * 3;
   1.585 +
   1.586 +	if (wb->ucs == wb->ucs_buf) {
   1.587 +		wb->ucs = (uint32_t*)nsMemory::Alloc(sizeof(wb->ucs[0]) * newsize);
   1.588 +		if (!wb->ucs)
   1.589 +			return NS_ERROR_OUT_OF_MEMORY;
   1.590 +		wb->cclass = (int32_t*)nsMemory::Alloc(sizeof(wb->cclass[0]) * newsize);
   1.591 +		if (!wb->cclass) {
   1.592 +			nsMemory::Free(wb->ucs);
   1.593 +			wb->ucs = nullptr;
   1.594 +			return NS_ERROR_OUT_OF_MEMORY;
   1.595 +		}
   1.596 +	} else {
   1.597 +		void* buf = nsMemory::Realloc(wb->ucs, sizeof(wb->ucs[0]) * newsize);
   1.598 +		if (!buf)
   1.599 +			return NS_ERROR_OUT_OF_MEMORY;
   1.600 +		wb->ucs = (uint32_t*)buf;
   1.601 +		buf = nsMemory::Realloc(wb->cclass, sizeof(wb->cclass[0]) * newsize);
   1.602 +		if (!buf)
   1.603 +			return NS_ERROR_OUT_OF_MEMORY;
   1.604 +		wb->cclass = (int32_t*)buf;
   1.605 +	}
   1.606 +	return (NS_OK);
   1.607 +}
   1.608 +
   1.609 +static nsresult
   1.610 +workbuf_append(workbuf_t *wb, uint32_t c) {
   1.611 +	nsresult r;
   1.612 +
   1.613 +	if (wb->last >= wb->size && (r = workbuf_extend(wb)) != NS_OK)
   1.614 +		return (r);
   1.615 +	wb->ucs[wb->last++] = c;
   1.616 +	return (NS_OK);
   1.617 +}
   1.618 +
   1.619 +static void
   1.620 +workbuf_shift(workbuf_t *wb, int32_t shift) {
   1.621 +	int32_t nmove;
   1.622 +
   1.623 +	//assert(wb != nullptr && wb->cur >= shift);
   1.624 +
   1.625 +	nmove = wb->last - shift;
   1.626 +	memmove(&wb->ucs[0], &wb->ucs[shift],
   1.627 +		      nmove * sizeof(wb->ucs[0]));
   1.628 +	memmove(&wb->cclass[0], &wb->cclass[shift],
   1.629 +		      nmove * sizeof(wb->cclass[0]));
   1.630 +	wb->cur -= shift;
   1.631 +	wb->last -= shift;
   1.632 +}
   1.633 +
   1.634 +static void
   1.635 +workbuf_removevoid(workbuf_t *wb) {
   1.636 +	int32_t i, j;
   1.637 +	int32_t last = wb->last;
   1.638 +
   1.639 +	for (i = j = 0; i < last; i++) {
   1.640 +		if (wb->cclass[i] >= 0) {
   1.641 +			if (j < i) {
   1.642 +				wb->ucs[j] = wb->ucs[i];
   1.643 +				wb->cclass[j] = wb->cclass[i];
   1.644 +			}
   1.645 +			j++;
   1.646 +		}
   1.647 +	}
   1.648 +	wb->cur -= last - j;
   1.649 +	wb->last = j;
   1.650 +}
   1.651 +
   1.652 +nsresult  
   1.653 +nsUnicodeNormalizer::NormalizeUnicodeNFD( const nsAString& aSrc, nsAString& aDest)
   1.654 +{
   1.655 +  return mdn_normalize(false, false, aSrc, aDest);
   1.656 +}
   1.657 +
   1.658 +nsresult  
   1.659 +nsUnicodeNormalizer::NormalizeUnicodeNFC( const nsAString& aSrc, nsAString& aDest)
   1.660 +{
   1.661 +  return mdn_normalize(true, false, aSrc, aDest);
   1.662 +}
   1.663 +
   1.664 +nsresult  
   1.665 +nsUnicodeNormalizer::NormalizeUnicodeNFKD( const nsAString& aSrc, nsAString& aDest)
   1.666 +{
   1.667 +  return mdn_normalize(false, true, aSrc, aDest);
   1.668 +}
   1.669 +
   1.670 +nsresult  
   1.671 +nsUnicodeNormalizer::NormalizeUnicodeNFKC( const nsAString& aSrc, nsAString& aDest)
   1.672 +{
   1.673 +  return mdn_normalize(true, true, aSrc, aDest);
   1.674 +}
   1.675 +
   1.676 +bool
   1.677 +nsUnicodeNormalizer::Compose(uint32_t a, uint32_t b, uint32_t *ab)
   1.678 +{
   1.679 +  return mdn__unicode_compose(a, b, ab) == NS_OK;
   1.680 +}
   1.681 +
   1.682 +bool
   1.683 +nsUnicodeNormalizer::DecomposeNonRecursively(uint32_t c, uint32_t *c1, uint32_t *c2)
   1.684 +{
   1.685 +  // We can't use mdn__unicode_decompose here, because that does a recursive
   1.686 +  // decomposition that may yield more than two characters, but the harfbuzz
   1.687 +  // callback wants just a single-step decomp that is guaranteed to produce
   1.688 +  // no more than two characters. So we do a low-level lookup in the table
   1.689 +  // of decomp sequences.
   1.690 +  const uint32_t *seq;
   1.691 +  uint32_t seqidx = decompose_char(c, &seq);
   1.692 +  if (seqidx == 0 || ((seqidx & DECOMP_COMPAT) != 0)) {
   1.693 +    return false;
   1.694 +  }
   1.695 +  *c1 = *seq & ~END_BIT;
   1.696 +  if (*seq & END_BIT) {
   1.697 +    *c2 = 0;
   1.698 +  } else {
   1.699 +    *c2 = *++seq & ~END_BIT;
   1.700 +  }
   1.701 +  return true;
   1.702 +}
The Tor Browser / file diff

diff: intl/unicharutil/src/nsUnicodeNormalizer.cpp

intl/unicharutil/src/nsUnicodeNormalizer.cpp