michael@0: /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
michael@0: 
michael@0: /* This file is modified from JPNIC's mDNKit, it is under both MPL and 
michael@0:  * JPNIC's license.
michael@0:  */
michael@0: 
michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0:  * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0:  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0: 
michael@0: /*
michael@0:  * Copyright (c) 2000,2002 Japan Network Information Center.
michael@0:  * All rights reserved.
michael@0:  *  
michael@0:  * By using this file, you agree to the terms and conditions set forth bellow.
michael@0:  * 
michael@0:  * 			LICENSE TERMS AND CONDITIONS 
michael@0:  * 
michael@0:  * The following License Terms and Conditions apply, unless a different
michael@0:  * license is obtained from Japan Network Information Center ("JPNIC"),
michael@0:  * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
michael@0:  * Chiyoda-ku, Tokyo 101-0047, Japan.
michael@0:  * 
michael@0:  * 1. Use, Modification and Redistribution (including distribution of any
michael@0:  *    modified or derived work) in source and/or binary forms is permitted
michael@0:  *    under this License Terms and Conditions.
michael@0:  * 
michael@0:  * 2. Redistribution of source code must retain the copyright notices as they
michael@0:  *    appear in each source code file, this License Terms and Conditions.
michael@0:  * 
michael@0:  * 3. Redistribution in binary form must reproduce the Copyright Notice,
michael@0:  *    this License Terms and Conditions, in the documentation and/or other
michael@0:  *    materials provided with the distribution.  For the purposes of binary
michael@0:  *    distribution the "Copyright Notice" refers to the following language:
michael@0:  *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
michael@0:  * 
michael@0:  * 4. The name of JPNIC may not be used to endorse or promote products
michael@0:  *    derived from this Software without specific prior written approval of
michael@0:  *    JPNIC.
michael@0:  * 
michael@0:  * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
michael@0:  *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
michael@0:  *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
michael@0:  *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
michael@0:  *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
michael@0:  *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
michael@0:  *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
michael@0:  *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
michael@0:  *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
michael@0:  *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
michael@0:  *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
michael@0:  */
michael@0: 
michael@0: #include <string.h>
michael@0: 
michael@0: #include "nsMemory.h"
michael@0: #include "nsUnicodeNormalizer.h"
michael@0: #include "nsString.h"
michael@0: 
michael@0: /*
michael@0:  * XPCOM boilerplate for nsUnicodeNormalizer; presumably expands to the
michael@0:  * nsISupports methods with nsIUnicodeNormalizer as the one exposed
michael@0:  * interface (the macro is defined outside this file).
michael@0:  */
michael@0: NS_IMPL_ISUPPORTS(nsUnicodeNormalizer, nsIUnicodeNormalizer)
michael@0: 
michael@0: 
michael@0: /* The constructor body is empty; no setup is performed here. */
michael@0: nsUnicodeNormalizer::nsUnicodeNormalizer()
michael@0: {
michael@0: }
michael@0: 
michael@0: /* The destructor body is empty; nothing is released here. */
michael@0: nsUnicodeNormalizer::~nsUnicodeNormalizer()
michael@0: {
michael@0: }
michael@0: 
michael@0: 
michael@0: 
michael@0: /*
michael@0:  * Flag bit carried by the final element of a decomposition sequence.
michael@0:  * Readers mask it off to obtain the code point and test it to find the
michael@0:  * end of the sequence.
michael@0:  */
michael@0: #define END_BIT		0x80000000
michael@0: 
michael@0: 
michael@0: /*
michael@0:  * Constants for arithmetic Hangul decomposition/composition, following
michael@0:  * the Hangul syllable algorithm described in the Unicode Standard.
michael@0:  *
michael@0:  *   SBase   first precomposed Hangul syllable
michael@0:  *   LBase   first leading consonant
michael@0:  *   VBase   first vowel
michael@0:  *   TBase   base for trailing consonants; a trailing offset of 0 means
michael@0:  *           "no trailing consonant", so the first real one is TBase + 1
michael@0:  *   LCount, VCount, TCount
michael@0:  *           number of leading / vowel / trailing values (TCount counts
michael@0:  *           the "no trailing consonant" slot as well)
michael@0:  *   SLast   one past the last precomposed syllable
michael@0:  */
michael@0: #define SBase		0xac00
michael@0: #define LBase		0x1100
michael@0: #define VBase		0x1161
michael@0: #define TBase		0x11a7
michael@0: #define LCount		19
michael@0: #define VCount		21
michael@0: #define TCount		28
michael@0: #define SLast		(SBase + LCount * VCount * TCount)
michael@0: 
michael@0: /*
michael@0:  * One entry of the composition table.  The entries belonging to a given
michael@0:  * first character are searched by 'c2'; a match yields 'comp'.
michael@0:  */
michael@0: struct composition {
michael@0: 	uint32_t c2;	/* 2nd character of the pair */
michael@0: 	uint32_t comp;	/* character the pair composes to */
michael@0: };
michael@0: 
michael@0: 
michael@0: #include "normalization_data.h"
michael@0: 
michael@0: /*
michael@0:  * Macros for the multi-level index tables.
michael@0:  *
michael@0:  * LOOKUPTBL(vprefix, mprefix, v) splits the value 'v' into three bit
michael@0:  * fields.  The top field indexes <vprefix>_imap; the value found there
michael@0:  * plus the middle field indexes <vprefix>_imap a second time; that result
michael@0:  * selects an element of <vprefix>_table, whose 'tbl' member is finally
m‍ichael@0:  * indexed by the low field.  The field widths are <mprefix>_BITS_1 and
michael@0:  * <mprefix>_BITS_2 (not defined in this file; presumably supplied by
michael@0:  * normalization_data.h).
michael@0:  */
michael@0: #define LOOKUPTBL(vprefix, mprefix, v) \
michael@0: 	DMAP(vprefix)[\
michael@0: 		IMAP(vprefix)[\
michael@0: 			IMAP(vprefix)[IDX0(mprefix, v)] + IDX1(mprefix, v)\
michael@0: 		]\
michael@0: 	].tbl[IDX2(mprefix, v)]
michael@0: 
michael@0: /* Top, middle and low bit fields of 'v' for the table family 'mprefix'. */
michael@0: #define IDX0(mprefix, v) IDX_0(v, BITS1(mprefix), BITS2(mprefix))
michael@0: #define IDX1(mprefix, v) IDX_1(v, BITS1(mprefix), BITS2(mprefix))
michael@0: #define IDX2(mprefix, v) IDX_2(v, BITS1(mprefix), BITS2(mprefix))
michael@0: 
michael@0: /*
michael@0:  * Field extraction: IDX_2 is the low 'bits2' bits, IDX_1 the next 'bits1'
michael@0:  * bits, and IDX_0 everything above those.
michael@0:  */
michael@0: #define IDX_0(v, bits1, bits2)	((v) >> ((bits1) + (bits2)))
michael@0: #define IDX_1(v, bits1, bits2)	(((v) >> (bits2)) & ((1 << (bits1)) - 1))
michael@0: #define IDX_2(v, bits1, bits2)	((v) & ((1 << (bits2)) - 1))
michael@0: 
michael@0: /* Names of the per-table field-width constants. */
michael@0: #define BITS1(mprefix)	mprefix ## _BITS_1
michael@0: #define BITS2(mprefix)	mprefix ## _BITS_2
michael@0: 
michael@0: /* Names of the index map, data table and sequence array of a table family. */
michael@0: #define IMAP(vprefix)	vprefix ## _imap
michael@0: #define DMAP(vprefix)	vprefix ## _table
michael@0: #define SEQ(vprefix)	vprefix ## _seq
michael@0: 
michael@0: /*
michael@0:  * Return the canonical class recorded for 'c' in the canon_class table.
michael@0:  */
michael@0: static int32_t
michael@0: canonclass(uint32_t c) {
michael@0: 	const int32_t combining_class = LOOKUPTBL(canon_class, CANON_CLASS, c);
michael@0: 
michael@0: 	return combining_class;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Fetch the raw decomposition table entry for 'c'.
michael@0:  *
michael@0:  * The entry is returned unchanged (0 is treated by callers as "no
michael@0:  * decomposition"; the DECOMP_COMPAT bit marks a compatibility mapping).
michael@0:  * '*seqp' is pointed at the decomposition sequence, located at the entry
michael@0:  * value with the DECOMP_COMPAT bit cleared.
michael@0:  */
michael@0: static int32_t
michael@0: decompose_char(uint32_t c, const uint32_t **seqp)
michael@0: {
michael@0: 	const int32_t entry = LOOKUPTBL(decompose, DECOMP, c);
michael@0: 
michael@0: 	*seqp = &SEQ(decompose)[entry & ~DECOMP_COMPAT];
michael@0: 	return entry;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Fetch the composition table entry for the first character 'c'.
michael@0:  *
michael@0:  * The low 16 bits of the entry are the offset of this character's
michael@0:  * compositions inside the composition sequence array; '*compp' is set to
michael@0:  * that position.  The upper bits, which are returned, give the number of
michael@0:  * compositions starting with 'c' (0 when there are none).
michael@0:  */
michael@0: static int32_t
michael@0: compose_char(uint32_t c,
michael@0: 				const struct composition **compp)
michael@0: {
michael@0: 	const int32_t entry = LOOKUPTBL(compose, CANON_COMPOSE, c);
michael@0: 
michael@0: 	*compp = &SEQ(compose)[entry & 0xffff];
michael@0: 	return entry >> 16;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Fully (recursively) decompose the single character 'c' into 'v'.
michael@0:  *
michael@0:  *   compat       non-zero to apply compatibility mappings as well as
michael@0:  *                canonical ones
michael@0:  *   v, vlen      output buffer and its capacity in elements
michael@0:  *   decomp_lenp  receives the number of elements written on NS_OK
michael@0:  *
michael@0:  * Returns NS_OK when a decomposition was written,
michael@0:  * NS_SUCCESS_UNORM_NOTFOUND when 'c' has no applicable decomposition
michael@0:  * (nothing is written), and NS_ERROR_UNORM_MOREOUTPUT when 'v' is too
michael@0:  * small (part of the output may already have been written).
michael@0:  */
michael@0: static nsresult
michael@0: mdn__unicode_decompose(int32_t compat, uint32_t *v, size_t vlen,
michael@0: 		       uint32_t c, int32_t *decomp_lenp)
michael@0: {
michael@0: 	uint32_t *const out_begin = v;
michael@0: 
michael@0: 	/* Precomposed Hangul syllables are taken apart arithmetically. */
michael@0: 	if (SBase <= c && c < SLast) {
michael@0: 		const int32_t syllable = c - SBase;
michael@0: 		const int32_t trail = syllable % TCount;
michael@0: 		const int32_t vowel = (syllable / TCount) % VCount;
michael@0: 		const int32_t lead = (syllable / TCount) / VCount;
michael@0: 		/* Two jamo, plus a third when a trailing consonant exists. */
michael@0: 		const int32_t needed = (trail > 0) ? 3 : 2;
michael@0: 
michael@0: 		if (vlen < (size_t)needed)
michael@0: 			return (NS_ERROR_UNORM_MOREOUTPUT);
michael@0: 		v[0] = LBase + lead;
michael@0: 		v[1] = VBase + vowel;
michael@0: 		if (trail > 0)
michael@0: 			v[2] = TBase + trail;
michael@0: 		*decomp_lenp = needed;
michael@0: 		return (NS_OK);
michael@0: 	}
michael@0: 
michael@0: 	/*
michael@0: 	 * Everything else goes through the table.  A zero entry means no
michael@0: 	 * decomposition; a compatibility mapping is ignored when only
michael@0: 	 * canonical decomposition was requested.
michael@0: 	 */
michael@0: 	const uint32_t *seq;
michael@0: 	const int32_t seqidx = decompose_char(c, &seq);
michael@0: 
michael@0: 	if (seqidx == 0)
michael@0: 		return (NS_SUCCESS_UNORM_NOTFOUND);
michael@0: 	if (compat == 0 && (seqidx & DECOMP_COMPAT) != 0)
michael@0: 		return (NS_SUCCESS_UNORM_NOTFOUND);
michael@0: 
michael@0: 	/*
michael@0: 	 * Walk the sequence; its last element carries END_BIT.  Each
michael@0: 	 * element is itself decomposed recursively, and copied verbatim
michael@0: 	 * when it has no further decomposition.
michael@0: 	 */
michael@0: 	for (;;) {
michael@0: 		const uint32_t element = *seq++;
michael@0: 		const uint32_t piece = element & ~END_BIT;
michael@0: 		int32_t piece_len;
michael@0: 		const nsresult r =
michael@0: 			mdn__unicode_decompose(compat, v, vlen, piece, &piece_len);
michael@0: 
michael@0: 		if (r == NS_OK) {
michael@0: 			v += piece_len;
michael@0: 			vlen -= piece_len;
michael@0: 		} else if (r == NS_SUCCESS_UNORM_NOTFOUND) {
michael@0: 			if (vlen < 1)
michael@0: 				return (NS_ERROR_UNORM_MOREOUTPUT);
michael@0: 			*v++ = piece;
michael@0: 			vlen--;
michael@0: 		} else {
michael@0: 			return (r);
michael@0: 		}
michael@0: 
michael@0: 		if ((element & END_BIT) != 0)
michael@0: 			break;
michael@0: 	}
michael@0: 
michael@0: 	*decomp_lenp = v - out_begin;
michael@0: 
michael@0: 	return (NS_OK);
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Tell whether 'c' can be the first character of a composition.
michael@0:  * Returns 1 if so and 0 otherwise.
michael@0:  */
michael@0: static int32_t
michael@0: mdn__unicode_iscompositecandidate(uint32_t c)
michael@0: {
michael@0: 	const struct composition *unused;
michael@0: 	const bool hangul_lead = (LBase <= c && c < LBase + LCount);
michael@0: 	const bool hangul_syllable = (SBase <= c && c < SLast);
michael@0: 
michael@0: 	/* Hangul is composed arithmetically and never consults the table. */
michael@0: 	if (hangul_lead || hangul_syllable)
michael@0: 		return (1);
michael@0: 
michael@0: 	/*
michael@0: 	 * Otherwise 'c' is a candidate exactly when the composition table
michael@0: 	 * lists at least one composition beginning with it.
michael@0: 	 */
michael@0: 	return (compose_char(c, &unused) != 0) ? 1 : 0;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Try to compose the pair (c1, c2) into a single character.
michael@0:  *
michael@0:  * On success the composed character is stored in '*compp' and NS_OK is
michael@0:  * returned.  If the pair has no composition, '*compp' is left untouched
michael@0:  * and NS_SUCCESS_UNORM_NOTFOUND is returned.
michael@0:  */
michael@0: static nsresult
michael@0: mdn__unicode_compose(uint32_t c1, uint32_t c2, uint32_t *compp)
michael@0: {
michael@0: 	int32_t n;
michael@0: 	int32_t lo, hi;
michael@0: 	const struct composition *cseq;
michael@0: 
michael@0: 	//assert(compp != nullptr);
michael@0: 
michael@0: 	/*
michael@0: 	 * Check for Hangul.
michael@0: 	 */
michael@0: 	if (LBase <= c1 && c1 < LBase + LCount &&
michael@0: 	    VBase <= c2 && c2 < VBase + VCount) {
michael@0: 		/*
michael@0: 		 * Hangul L and V.
michael@0: 		 */
michael@0: 		*compp = SBase +
michael@0: 			((c1 - LBase) * VCount + (c2 - VBase)) * TCount;
michael@0: 		return (NS_OK);
michael@0: 	} else if (SBase <= c1 && c1 < SLast &&
michael@0: 		   TBase < c2 && c2 < TBase + TCount &&
michael@0: 		   (c1 - SBase) % TCount == 0) {
michael@0: 		/*
michael@0: 		 * Hangul LV and T.
michael@0: 		 *
michael@0: 		 * The lower bound on c2 is strict: TBase itself stands for
michael@0: 		 * "no trailing consonant" (trailing index 0) and is not a
michael@0: 		 * trailing jamo.  Accepting it would turn LV + TBase into
michael@0: 		 * plain LV, silently dropping the second character.
michael@0: 		 */
michael@0: 		*compp = c1 + (c2 - TBase);
michael@0: 		return (NS_OK);
michael@0: 	}
michael@0: 
michael@0: 	/*
michael@0: 	 * Look up composition table.  If the result is 0, no composition
michael@0: 	 * is defined.  Otherwise, upper 16bits of the result contains
michael@0: 	 * the number of composition that begins with 'c1', and the lower
michael@0: 	 * 16bits is the offset in 'compose_seq'.
michael@0: 	 */
michael@0: 	if ((n = compose_char(c1, &cseq)) == 0)
michael@0: 		return (NS_SUCCESS_UNORM_NOTFOUND);
michael@0: 
michael@0: 	/*
michael@0: 	 * The composite sequences are sorted by the 2nd character 'c2'.
michael@0: 	 * So we can use binary search.
michael@0: 	 */
michael@0: 	lo = 0;
michael@0: 	hi = n - 1;
michael@0: 	while (lo <= hi) {
michael@0: 		int32_t mid = (lo + hi) / 2;
michael@0: 
michael@0: 		if (cseq[mid].c2 < c2) {
michael@0: 			lo = mid + 1;
michael@0: 		} else if (cseq[mid].c2 > c2) {
michael@0: 			hi = mid - 1;
michael@0: 		} else {
michael@0: 			*compp = cseq[mid].comp;
michael@0: 			return (NS_OK);
michael@0: 		}
michael@0: 	}
michael@0: 	return (NS_SUCCESS_UNORM_NOTFOUND);
michael@0: }
michael@0: 
michael@0: 
michael@0: /*
michael@0:  * WORKBUF_SIZE is the element count of the buffers embedded in
michael@0:  * workbuf_t.  WORKBUF_SIZE_MAX is the limit that decompose() compares
michael@0:  * the working buffer size against after growing it.
michael@0:  */
michael@0: #define WORKBUF_SIZE		128
michael@0: #define WORKBUF_SIZE_MAX	10000
michael@0: 
michael@0: /*
michael@0:  * Working buffer used while normalizing.  'ucs' and 'cclass' are parallel
michael@0:  * arrays: they start out pointing at the embedded 'ucs_buf'/'class_buf'
michael@0:  * and are replaced by heap allocations when workbuf_extend() is called.
michael@0:  */
michael@0: typedef struct {
michael@0: 	int32_t cur;		/* index of the character being processed */
michael@0: 	int32_t last;		/* index just after the last character */
michael@0: 	int32_t size;		/* size of UCS and CLASS array */
michael@0: 	uint32_t *ucs;	/* UCS-4 characters */
michael@0: 	int32_t *cclass;		/* and their canonical classes */
michael@0: 	uint32_t ucs_buf[WORKBUF_SIZE];	/* local buffer */
michael@0: 	int32_t class_buf[WORKBUF_SIZE];		/* ditto */
michael@0: } workbuf_t;
michael@0: 
michael@0: /*
michael@0:  * Forward declarations of the file-local helpers defined after
michael@0:  * mdn_normalize(): the normalization steps first, then the working
michael@0:  * buffer primitives.
michael@0:  */
michael@0: static nsresult	decompose(workbuf_t *wb, uint32_t c, int32_t compat);
michael@0: static void		get_class(workbuf_t *wb);
michael@0: static void		reorder(workbuf_t *wb);
michael@0: static void		compose(workbuf_t *wb);
michael@0: static nsresult flush_before_cur(workbuf_t *wb, nsAString& aToStr);
michael@0: static void		workbuf_init(workbuf_t *wb);
michael@0: static void		workbuf_free(workbuf_t *wb);
michael@0: static nsresult	workbuf_extend(workbuf_t *wb);
michael@0: static nsresult	workbuf_append(workbuf_t *wb, uint32_t c);
michael@0: static void		workbuf_shift(workbuf_t *wb, int32_t shift);
michael@0: static void		workbuf_removevoid(workbuf_t *wb);
michael@0: 
michael@0: 
michael@0: /*
michael@0:  * Normalize the UTF-16 string 'aSrcStr' and append the result to
michael@0:  * 'aToStr'.
michael@0:  *
michael@0:  *   do_composition  true to recompose after decomposing and reordering
michael@0:  *   compat          true to apply compatibility decompositions as well
michael@0:  *
michael@0:  * Returns NS_OK on success, otherwise the first error reported by
michael@0:  * decompose() or flush_before_cur().  Output appended before the error
michael@0:  * is left in 'aToStr'.
michael@0:  */
michael@0: static nsresult
michael@0: mdn_normalize(bool do_composition, bool compat,
michael@0: 	  const nsAString& aSrcStr, nsAString& aToStr)
michael@0: {
michael@0: 	workbuf_t wb;
michael@0: 	nsresult r = NS_OK;
michael@0: 	/*
michael@0: 	 * Initialize working buffer.
michael@0: 	 */
michael@0: 	workbuf_init(&wb);
michael@0: 
michael@0: 	nsAString::const_iterator start, end;
michael@0: 	aSrcStr.BeginReading(start);
michael@0: 	aSrcStr.EndReading(end);
michael@0: 
michael@0: 	while (start != end) {
michael@0: 		uint32_t c;
michael@0: 		char16_t curChar;
michael@0: 
michael@0: 		//assert(wb.cur == wb.last);
michael@0: 
michael@0: 		/*
michael@0: 		 * Get one code point from the source string.  A high
michael@0: 		 * surrogate followed by a low surrogate is combined; any
michael@0: 		 * other code unit (including an unpaired surrogate) is
michael@0: 		 * taken as is.
michael@0: 		 */
michael@0: 		curChar= *start++;
michael@0: 
michael@0: 		if (NS_IS_HIGH_SURROGATE(curChar) && start != end && NS_IS_LOW_SURROGATE(*(start)) ) {
michael@0: 			c = SURROGATE_TO_UCS4(curChar, *start);
michael@0: 			++start;
michael@0: 		} else {
michael@0: 			c = curChar;
michael@0: 		}
michael@0: 
michael@0: 		/*
michael@0: 		 * Decompose it.
michael@0: 		 */
michael@0: 		if ((r = decompose(&wb, c, compat)) != NS_OK)
michael@0: 			break;
michael@0: 
michael@0: 		/*
michael@0: 		 * Get canonical class.
michael@0: 		 */
michael@0: 		get_class(&wb);
michael@0: 
michael@0: 		/*
michael@0: 		 * Reorder & compose.
michael@0: 		 */
michael@0: 		for (; wb.cur < wb.last; wb.cur++) {
michael@0: 			if (wb.cur == 0) {
michael@0: 				continue;
michael@0: 			} else if (wb.cclass[wb.cur] > 0) {
michael@0: 				/*
michael@0: 				 * This is not a starter. Try reordering.
michael@0: 				 * Note that characters up to it are
michael@0: 				 * already in canonical order.
michael@0: 				 */
michael@0: 				reorder(&wb);
michael@0: 				continue;
michael@0: 			}
michael@0: 
michael@0: 			/*
michael@0: 			 * This is a starter character, and there are
michael@0: 			 * some characters before it.  Those characters
michael@0: 			 * have been reordered properly, and
michael@0: 			 * ready for composition.
michael@0: 			 */
michael@0: 			if (do_composition && wb.cclass[0] == 0)
michael@0: 				compose(&wb);
michael@0: 
michael@0: 			/*
michael@0: 			 * If CUR points to a starter character,
michael@0: 			 * then process of characters before CUR are
michael@0: 			 * already finished, because any further
michael@0: 			 * reordering/composition for them are blocked
michael@0: 			 * by the starter CUR points.
michael@0: 			 */
michael@0: 			if (wb.cur > 0 && wb.cclass[wb.cur] == 0) {
michael@0: 				/* Flush everything before CUR. */
michael@0: 				r = flush_before_cur(&wb, aToStr);
michael@0: 				if (r != NS_OK)
michael@0: 					break;
michael@0: 			}
michael@0: 		}
michael@0: 
michael@0: 		/*
michael@0: 		 * The 'break' above only leaves the for loop.  Stop reading
michael@0: 		 * input as well, so that a flush failure is reported instead
michael@0: 		 * of being overwritten by the next decompose() call.
michael@0: 		 */
michael@0: 		if (r != NS_OK)
michael@0: 			break;
michael@0: 	}
michael@0: 
michael@0: 	if (r == NS_OK) {
michael@0: 		if (do_composition && wb.cur > 0 && wb.cclass[0] == 0) {
michael@0: 			/*
michael@0: 			 * There is some characters left in WB.
michael@0: 			 * They are ordered, but not composed yet.
michael@0: 			 * Now CUR points just after the last character in WB,
michael@0: 			 * and since compose() tries to compose characters
michael@0: 			 * between top and CUR inclusive, we must make CUR
michael@0: 			 * one character back during compose().
michael@0: 			 */
michael@0: 			wb.cur--;
michael@0: 			compose(&wb);
michael@0: 			wb.cur++;
michael@0: 		}
michael@0: 		/*
michael@0: 		 * Append whatever is still held in WB to the output.
michael@0: 		 */
michael@0: 		r = flush_before_cur(&wb, aToStr);
michael@0: 	}
michael@0: 
michael@0: 	workbuf_free(&wb);
michael@0: 
michael@0: 	return (r);
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Decompose 'c' and append the result to the end of the working buffer.
michael@0:  *
michael@0:  * A character without a decomposition is appended unchanged.  When the
michael@0:  * free space after 'last' is too small the buffer is extended and the
michael@0:  * decomposition retried; NS_ERROR_FAILURE is returned once the buffer
michael@0:  * size exceeds WORKBUF_SIZE_MAX.  Any other failure is passed through.
michael@0:  */
michael@0: static nsresult
michael@0: decompose(workbuf_t *wb, uint32_t c, int32_t compat) {
michael@0: 	for (;;) {
michael@0: 		int32_t produced;
michael@0: 		nsresult rv = mdn__unicode_decompose(compat, wb->ucs + wb->last,
michael@0: 						     wb->size - wb->last, c,
michael@0: 						     &produced);
michael@0: 
michael@0: 		if (rv == NS_OK) {
michael@0: 			wb->last += produced;
michael@0: 			return (NS_OK);
michael@0: 		}
michael@0: 		if (rv == NS_SUCCESS_UNORM_NOTFOUND)
michael@0: 			return (workbuf_append(wb, c));
michael@0: 		if (rv != NS_ERROR_UNORM_MOREOUTPUT)
michael@0: 			return (rv);
michael@0: 
michael@0: 		/* Not enough room: grow the buffer, then try again. */
michael@0: 		rv = workbuf_extend(wb);
michael@0: 		if (rv != NS_OK)
michael@0: 			return (rv);
michael@0: 		if (wb->size > WORKBUF_SIZE_MAX) {
michael@0: 			// "mdn__unormalize_form*: " "working buffer too large\n"
michael@0: 			return (NS_ERROR_FAILURE);
michael@0: 		}
michael@0: 	}
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Fill in the canonical class of every buffered character from 'cur' up
michael@0:  * to, but not including, 'last'.
michael@0:  */
michael@0: static void
michael@0: get_class(workbuf_t *wb) {
michael@0: 	int32_t idx = wb->cur;
michael@0: 
michael@0: 	while (idx < wb->last) {
michael@0: 		wb->cclass[idx] = canonclass(wb->ucs[idx]);
michael@0: 		idx++;
michael@0: 	}
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Move the character at 'cur' backwards past every preceding character
michael@0:  * whose canonical class is strictly greater than its own (one insertion
michael@0:  * step).  Characters of equal class keep their relative order.
michael@0:  */
michael@0: static void
michael@0: reorder(workbuf_t *wb) {
michael@0: 	//assert(wb != nullptr);
michael@0: 
michael@0: 	int32_t pos = wb->cur;
michael@0: 	const uint32_t moving_char = wb->ucs[pos];
michael@0: 	const int32_t moving_class = wb->cclass[pos];
michael@0: 
michael@0: 	/* Slide the higher-class predecessors one slot to the right. */
michael@0: 	for (; pos > 0 && wb->cclass[pos - 1] > moving_class; pos--) {
michael@0: 		wb->ucs[pos] = wb->ucs[pos - 1];
michael@0: 		wb->cclass[pos] = wb->cclass[pos - 1];
michael@0: 	}
michael@0: 
michael@0: 	/* Drop the character into the gap, if it moved at all. */
michael@0: 	if (pos != wb->cur) {
michael@0: 		wb->ucs[pos] = moving_char;
michael@0: 		wb->cclass[pos] = moving_class;
michael@0: 	}
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Compose the characters at indices 1..cur (inclusive) into the first
michael@0:  * character of the working buffer wherever canonical composition allows.
michael@0:  *
michael@0:  * The caller must ensure that the first character is a starter and that
michael@0:  * the characters up to 'cur' are already in canonical order.  Characters
michael@0:  * that get absorbed are removed from the buffer, and 'cur'/'last' are
michael@0:  * adjusted accordingly.
michael@0:  */
michael@0: static void
michael@0: compose(workbuf_t *wb) {
michael@0: 	int32_t cur;
michael@0: 	uint32_t *ucs;
michael@0: 	int32_t *cclass;
michael@0: 	int32_t last_class;
michael@0: 	bool adjacent;
michael@0: 	int32_t nvoids;
michael@0: 	int32_t i;
michael@0: 
michael@0: 	//assert(wb != nullptr && wb->cclass[0] == 0);
michael@0: 
michael@0: 	cur = wb->cur;
michael@0: 	ucs = wb->ucs;
michael@0: 	cclass = wb->cclass;
michael@0: 
michael@0: 	/*
michael@0: 	 * If there are no decomposition sequence that begins with
michael@0: 	 * the top character, composition is impossible.
michael@0: 	 */
michael@0: 	if (!mdn__unicode_iscompositecandidate(ucs[0]))
michael@0: 		return;
michael@0: 
michael@0: 	last_class = 0;
michael@0: 	adjacent = true;	/* nothing left between top and ucs[i] yet */
michael@0: 	nvoids = 0;
michael@0: 	for (i = 1; i <= cur; i++) {
michael@0: 		uint32_t c;
michael@0: 		int32_t cl = cclass[i];
michael@0: 
michael@0: 		/*
michael@0: 		 * ucs[i] may combine with the top character only if it is
michael@0: 		 * not blocked: either every character in between has been
michael@0: 		 * absorbed already, or the last character left in between
michael@0: 		 * has a strictly lower canonical class.  In particular a
michael@0: 		 * starter (class 0) is blocked by any character remaining
michael@0: 		 * in between (Unicode Corrigendum #5).
michael@0: 		 */
michael@0: 		if ((adjacent || last_class < cl) &&
michael@0: 		    mdn__unicode_compose(ucs[0], ucs[i],
michael@0: 					 &c) == NS_OK) {
michael@0: 			/*
michael@0: 			 * Replace the top character with the composed one.
michael@0: 			 */
michael@0: 			ucs[0] = c;
michael@0: 			cclass[0] = canonclass(c);
michael@0: 
michael@0: 			cclass[i] = -1;	/* void this character */
michael@0: 			nvoids++;
michael@0: 		} else {
michael@0: 			last_class = cl;
michael@0: 			adjacent = false;
michael@0: 		}
michael@0: 	}
michael@0: 
michael@0: 	/* Purge void characters, if any. */
michael@0: 	if (nvoids > 0)
michael@0: 		workbuf_removevoid(wb);
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Append the buffered characters that precede 'cur' to 'aToStr' as
michael@0:  * UTF-16 (characters outside the BMP become a surrogate pair), then
michael@0:  * discard them from the working buffer.  Always returns NS_OK.
michael@0:  */
michael@0: static nsresult
michael@0: flush_before_cur(workbuf_t *wb, nsAString& aToStr)
michael@0: {
michael@0: 	for (int32_t idx = 0; idx < wb->cur; ++idx) {
michael@0: 		const uint32_t ch = wb->ucs[idx];
michael@0: 
michael@0: 		if (IS_IN_BMP(ch)) {
michael@0: 			aToStr.Append((char16_t)(ch));
michael@0: 		} else {
michael@0: 			aToStr.Append((char16_t)H_SURROGATE(ch));
michael@0: 			aToStr.Append((char16_t)L_SURROGATE(ch));
michael@0: 		}
michael@0: 	}
michael@0: 
michael@0: 	/* Everything before CUR has been emitted; drop it. */
michael@0: 	workbuf_shift(wb, wb->cur);
michael@0: 
michael@0: 	return (NS_OK);
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Put the working buffer into its initial, empty state, backed by the
michael@0:  * arrays embedded in the structure.
michael@0:  */
michael@0: static void
michael@0: workbuf_init(workbuf_t *wb) {
michael@0: 	/* Start on the embedded storage... */
michael@0: 	wb->ucs = wb->ucs_buf;
michael@0: 	wb->cclass = wb->class_buf;
michael@0: 	wb->size = WORKBUF_SIZE;
michael@0: 
michael@0: 	/* ...holding no characters. */
michael@0: 	wb->last = 0;
michael@0: 	wb->cur = 0;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Release any heap storage owned by the working buffer.
michael@0:  *
michael@0:  * Each array is checked on its own: a failed extension can leave one
michael@0:  * pointer changed while the other still refers to the embedded array,
michael@0:  * and the embedded arrays must never be passed to the allocator.
michael@0:  */
michael@0: static void
michael@0: workbuf_free(workbuf_t *wb) {
michael@0: 	if (wb->ucs != wb->ucs_buf) {
michael@0: 		nsMemory::Free(wb->ucs);
michael@0: 	}
michael@0: 	if (wb->cclass != wb->class_buf) {
michael@0: 		nsMemory::Free(wb->cclass);
michael@0: 	}
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Triple the capacity of the working buffer.
michael@0:  *
michael@0:  * On the first call the contents of the embedded arrays are moved to
michael@0:  * newly allocated heap arrays; later calls reallocate those.  'size' is
michael@0:  * updated only when both arrays have been enlarged.
michael@0:  *
michael@0:  * Returns NS_OK on success or NS_ERROR_OUT_OF_MEMORY on allocation
michael@0:  * failure, in which case the buffer keeps its previous contents and
michael@0:  * 'size', and remains valid to pass to workbuf_free().
michael@0:  */
michael@0: static nsresult
michael@0: workbuf_extend(workbuf_t *wb) {
michael@0: 	const int32_t newsize = wb->size * 3;
michael@0: 	const size_t ucs_bytes = sizeof(wb->ucs[0]) * newsize;
michael@0: 	const size_t class_bytes = sizeof(wb->cclass[0]) * newsize;
michael@0: 
michael@0: 	if (wb->ucs == wb->ucs_buf) {
michael@0: 		/*
michael@0: 		 * Allocate both arrays before touching WB, so that a
michael@0: 		 * failure leaves it pointing at the embedded arrays.
michael@0: 		 */
michael@0: 		uint32_t *new_ucs = (uint32_t*)nsMemory::Alloc(ucs_bytes);
michael@0: 		if (!new_ucs)
michael@0: 			return NS_ERROR_OUT_OF_MEMORY;
michael@0: 		int32_t *new_cclass = (int32_t*)nsMemory::Alloc(class_bytes);
michael@0: 		if (!new_cclass) {
michael@0: 			nsMemory::Free(new_ucs);
michael@0: 			return NS_ERROR_OUT_OF_MEMORY;
michael@0: 		}
michael@0: 
michael@0: 		/* Carry over the characters buffered so far. */
michael@0: 		memcpy(new_ucs, wb->ucs, sizeof(wb->ucs[0]) * wb->last);
michael@0: 		memcpy(new_cclass, wb->cclass,
michael@0: 		       sizeof(wb->cclass[0]) * wb->last);
michael@0: 
michael@0: 		wb->ucs = new_ucs;
michael@0: 		wb->cclass = new_cclass;
michael@0: 	} else {
michael@0: 		void* buf = nsMemory::Realloc(wb->ucs, ucs_bytes);
michael@0: 		if (!buf)
michael@0: 			return NS_ERROR_OUT_OF_MEMORY;
michael@0: 		wb->ucs = (uint32_t*)buf;
michael@0: 		buf = nsMemory::Realloc(wb->cclass, class_bytes);
michael@0: 		if (!buf)
michael@0: 			return NS_ERROR_OUT_OF_MEMORY;
michael@0: 		wb->cclass = (int32_t*)buf;
michael@0: 	}
michael@0: 
michael@0: 	/* Both arrays now hold NEWSIZE elements. */
michael@0: 	wb->size = newsize;
michael@0: 	return (NS_OK);
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Append the single character 'c' to the working buffer, extending the
michael@0:  * buffer first when 'last' has reached 'size'.  Returns NS_OK, or the
michael@0:  * error reported by workbuf_extend().
michael@0:  */
michael@0: static nsresult
michael@0: workbuf_append(workbuf_t *wb, uint32_t c) {
michael@0: 	if (wb->last >= wb->size) {
michael@0: 		const nsresult rv = workbuf_extend(wb);
michael@0: 
michael@0: 		if (rv != NS_OK)
michael@0: 			return (rv);
michael@0: 	}
michael@0: 
michael@0: 	wb->ucs[wb->last] = c;
michael@0: 	wb->last++;
michael@0: 	return (NS_OK);
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Discard the first 'shift' characters of the working buffer, moving
michael@0:  * the remaining characters and their classes to the front and adjusting
michael@0:  * 'cur' and 'last' to match.
michael@0:  */
michael@0: static void
michael@0: workbuf_shift(workbuf_t *wb, int32_t shift) {
michael@0: 	//assert(wb != nullptr && wb->cur >= shift);
michael@0: 
michael@0: 	const int32_t remaining = wb->last - shift;
michael@0: 
michael@0: 	/* Source and destination may overlap, hence memmove. */
michael@0: 	memmove(wb->ucs, wb->ucs + shift,
michael@0: 		remaining * sizeof(wb->ucs[0]));
michael@0: 	memmove(wb->cclass, wb->cclass + shift,
michael@0: 		remaining * sizeof(wb->cclass[0]));
michael@0: 
michael@0: 	wb->last = remaining;
michael@0: 	wb->cur -= shift;
michael@0: }
michael@0: 
michael@0: /*
michael@0:  * Squeeze out the characters that compose() marked void (negative
michael@0:  * class), keeping the others in order.  'cur' and 'last' are both
michael@0:  * reduced by the number of characters removed.
michael@0:  */
michael@0: static void
michael@0: workbuf_removevoid(workbuf_t *wb) {
michael@0: 	const int32_t old_last = wb->last;
michael@0: 	int32_t kept = 0;
michael@0: 
michael@0: 	for (int32_t src = 0; src < old_last; src++) {
michael@0: 		if (wb->cclass[src] < 0)
michael@0: 			continue;	/* void: skip it */
michael@0: 
michael@0: 		if (kept != src) {
michael@0: 			wb->ucs[kept] = wb->ucs[src];
michael@0: 			wb->cclass[kept] = wb->cclass[src];
michael@0: 		}
michael@0: 		kept++;
michael@0: 	}
michael@0: 
michael@0: 	wb->cur -= old_last - kept;
michael@0: 	wb->last = kept;
michael@0: }
michael@0: 
michael@0: // Appends the NFD form of aSrc (canonical decomposition, no
michael@0: // recomposition) to aDest.
michael@0: nsresult
michael@0: nsUnicodeNormalizer::NormalizeUnicodeNFD( const nsAString& aSrc, nsAString& aDest)
michael@0: {
michael@0:   const bool recompose = false;
michael@0:   const bool compatibility = false;
michael@0:   return mdn_normalize(recompose, compatibility, aSrc, aDest);
michael@0: }
michael@0: 
michael@0: // Appends the NFC form of aSrc (canonical decomposition followed by
michael@0: // composition) to aDest.
michael@0: nsresult
michael@0: nsUnicodeNormalizer::NormalizeUnicodeNFC( const nsAString& aSrc, nsAString& aDest)
michael@0: {
michael@0:   const bool recompose = true;
michael@0:   const bool compatibility = false;
michael@0:   return mdn_normalize(recompose, compatibility, aSrc, aDest);
michael@0: }
michael@0: 
michael@0: // Appends the NFKD form of aSrc (compatibility decomposition, no
michael@0: // recomposition) to aDest.
michael@0: nsresult
michael@0: nsUnicodeNormalizer::NormalizeUnicodeNFKD( const nsAString& aSrc, nsAString& aDest)
michael@0: {
michael@0:   const bool recompose = false;
michael@0:   const bool compatibility = true;
michael@0:   return mdn_normalize(recompose, compatibility, aSrc, aDest);
michael@0: }
michael@0: 
michael@0: // Appends the NFKC form of aSrc (compatibility decomposition followed by
michael@0: // composition) to aDest.
michael@0: nsresult
michael@0: nsUnicodeNormalizer::NormalizeUnicodeNFKC( const nsAString& aSrc, nsAString& aDest)
michael@0: {
michael@0:   const bool recompose = true;
michael@0:   const bool compatibility = true;
michael@0:   return mdn_normalize(recompose, compatibility, aSrc, aDest);
michael@0: }
michael@0: 
michael@0: // Composes the pair (a, b).  Returns true and stores the composed
michael@0: // character in *ab when the pair has a composition; returns false
michael@0: // otherwise.
michael@0: bool
michael@0: nsUnicodeNormalizer::Compose(uint32_t a, uint32_t b, uint32_t *ab)
michael@0: {
michael@0:   const nsresult rv = mdn__unicode_compose(a, b, ab);
michael@0:   return rv == NS_OK;
michael@0: }
michael@0: 
michael@0: // Performs a single, non-recursive canonical decomposition step on c.
michael@0: // Returns false when c has no table decomposition or only a compatibility
michael@0: // one.  Otherwise stores the first character of the decomposition in *c1
michael@0: // and the second in *c2 (0 when the decomposition has one character) and
michael@0: // returns true.
michael@0: bool
michael@0: nsUnicodeNormalizer::DecomposeNonRecursively(uint32_t c, uint32_t *c1, uint32_t *c2)
michael@0: {
michael@0:   // mdn__unicode_decompose is not usable here: it decomposes recursively
michael@0:   // and may produce more than two characters, whereas the harfbuzz callback
michael@0:   // wants one step yielding at most two.  So the table of decomposition
michael@0:   // sequences is read directly.
michael@0:   const uint32_t *seq;
michael@0:   const uint32_t seqidx = decompose_char(c, &seq);
michael@0:   const bool hasCanonicalDecomp =
michael@0:     seqidx != 0 && (seqidx & DECOMP_COMPAT) == 0;
michael@0:   if (!hasCanonicalDecomp) {
michael@0:     return false;
michael@0:   }
michael@0: 
michael@0:   const uint32_t first = seq[0];
michael@0:   *c1 = first & ~END_BIT;
michael@0:   // END_BIT on the first element means the sequence has a single character.
michael@0:   *c2 = (first & END_BIT) ? 0 : (seq[1] & ~END_BIT);
michael@0:   return true;
michael@0: }