michael@0: /* michael@0: ****************************************************************************** michael@0: * Copyright (c) 1996-2011, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ****************************************************************************** michael@0: * File unorm.cpp michael@0: * michael@0: * Created by: Vladimir Weinstein 12052000 michael@0: * michael@0: * Modification history : michael@0: * michael@0: * Date Name Description michael@0: * 02/01/01 synwee Added normalization quickcheck enum and method. michael@0: * 02/12/01 synwee Commented out quickcheck util api has been approved michael@0: * Added private method for doing FCD checks michael@0: * 02/23/01 synwee Modified quickcheck and checkFCE to run through michael@0: * string for codepoints < 0x300 for the normalization michael@0: * mode NFC. michael@0: * 05/25/01+ Markus Scherer total rewrite, implement all normalization here michael@0: * instead of just wrappers around normlzr.cpp, michael@0: * load unorm.dat, support Unicode 3.1 with michael@0: * supplementary code points, etc. michael@0: * 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: michael@0: #include "unicode/udata.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/uiter.h" michael@0: #include "unicode/unorm.h" michael@0: #include "unicode/unorm2.h" michael@0: #include "normalizer2impl.h" michael@0: #include "unormimp.h" michael@0: #include "uprops.h" michael@0: #include "ustr_imp.h" michael@0: michael@0: #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) michael@0: michael@0: U_NAMESPACE_USE michael@0: michael@0: /* quick check functions ---------------------------------------------------- */ michael@0: michael@0: U_CAPI UNormalizationCheckResult U_EXPORT2 michael@0: unorm_quickCheck(const UChar *src, michael@0: int32_t srcLength, michael@0: UNormalizationMode mode, michael@0: UErrorCode *pErrorCode) { michael@0: const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); michael@0: return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); michael@0: } michael@0: michael@0: U_CAPI UNormalizationCheckResult U_EXPORT2 michael@0: unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength, michael@0: UNormalizationMode mode, int32_t options, michael@0: UErrorCode *pErrorCode) { michael@0: const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); michael@0: if(options&UNORM_UNICODE_3_2) { michael@0: FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); michael@0: return unorm2_quickCheck( michael@0: reinterpret_cast(static_cast(&fn2)), michael@0: src, srcLength, pErrorCode); michael@0: } else { michael@0: return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); michael@0: } michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: unorm_isNormalized(const UChar *src, int32_t srcLength, michael@0: UNormalizationMode mode, michael@0: UErrorCode *pErrorCode) { michael@0: const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); michael@0: return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength, michael@0: UNormalizationMode mode, int32_t options, michael@0: UErrorCode *pErrorCode) { michael@0: const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); michael@0: if(options&UNORM_UNICODE_3_2) { michael@0: FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); michael@0: return unorm2_isNormalized( michael@0: reinterpret_cast(static_cast(&fn2)), michael@0: src, srcLength, pErrorCode); michael@0: } else { michael@0: return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); michael@0: } michael@0: } michael@0: michael@0: /* normalize() API ---------------------------------------------------------- */ michael@0: michael@0: /** Public API for normalizing. */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: unorm_normalize(const UChar *src, int32_t srcLength, michael@0: UNormalizationMode mode, int32_t options, michael@0: UChar *dest, int32_t destCapacity, michael@0: UErrorCode *pErrorCode) { michael@0: const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); michael@0: if(options&UNORM_UNICODE_3_2) { michael@0: FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); michael@0: return unorm2_normalize( michael@0: reinterpret_cast(static_cast(&fn2)), michael@0: src, srcLength, dest, destCapacity, pErrorCode); michael@0: } else { michael@0: return unorm2_normalize((const UNormalizer2 *)n2, michael@0: src, srcLength, dest, destCapacity, pErrorCode); michael@0: } michael@0: } michael@0: michael@0: michael@0: /* iteration functions ------------------------------------------------------ */ michael@0: michael@0: static int32_t michael@0: _iterate(UCharIterator *src, UBool forward, michael@0: UChar *dest, int32_t destCapacity, michael@0: const Normalizer2 *n2, michael@0: UBool doNormalize, UBool *pNeededToNormalize, michael@0: UErrorCode *pErrorCode) { michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: if(pNeededToNormalize!=NULL) { michael@0: *pNeededToNormalize=FALSE; michael@0: } michael@0: if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) { michael@0: return u_terminateUChars(dest, destCapacity, 0, pErrorCode); michael@0: } michael@0: michael@0: UnicodeString buffer; michael@0: UChar32 c; michael@0: if(forward) { michael@0: /* get one character and ignore its properties */ michael@0: buffer.append(uiter_next32(src)); michael@0: /* get all following characters until we see a boundary */ michael@0: while((c=uiter_next32(src))>=0) { michael@0: if(n2->hasBoundaryBefore(c)) { michael@0: /* back out the latest movement to stop at the boundary */ michael@0: src->move(src, -U16_LENGTH(c), UITER_CURRENT); michael@0: break; michael@0: } else { michael@0: buffer.append(c); michael@0: } michael@0: } michael@0: } else { michael@0: while((c=uiter_previous32(src))>=0) { michael@0: /* always write this character to the front of the buffer */ michael@0: buffer.insert(0, c); michael@0: /* stop if this just-copied character is a boundary */ michael@0: if(n2->hasBoundaryBefore(c)) { michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: michael@0: UnicodeString destString(dest, 0, destCapacity); michael@0: if(buffer.length()>0 && doNormalize) { michael@0: n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode); michael@0: if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) { michael@0: *pNeededToNormalize= destString!=buffer; michael@0: } michael@0: return destString.length(); michael@0: } else { michael@0: /* just copy the source characters */ michael@0: return buffer.extract(dest, destCapacity, *pErrorCode); michael@0: } michael@0: } michael@0: michael@0: static int32_t michael@0: unorm_iterate(UCharIterator *src, UBool forward, michael@0: UChar *dest, int32_t destCapacity, michael@0: UNormalizationMode mode, int32_t options, michael@0: UBool doNormalize, UBool *pNeededToNormalize, michael@0: UErrorCode *pErrorCode) { michael@0: const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); michael@0: if(options&UNORM_UNICODE_3_2) { michael@0: const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: FilteredNormalizer2 fn2(*n2, *uni32); michael@0: return _iterate(src, forward, dest, destCapacity, michael@0: &fn2, doNormalize, pNeededToNormalize, pErrorCode); michael@0: } michael@0: return _iterate(src, forward, dest, destCapacity, michael@0: n2, doNormalize, pNeededToNormalize, pErrorCode); michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: unorm_previous(UCharIterator *src, michael@0: UChar *dest, int32_t destCapacity, michael@0: UNormalizationMode mode, int32_t options, michael@0: UBool doNormalize, UBool *pNeededToNormalize, michael@0: UErrorCode *pErrorCode) { michael@0: return unorm_iterate(src, FALSE, michael@0: dest, destCapacity, michael@0: mode, options, michael@0: doNormalize, pNeededToNormalize, michael@0: pErrorCode); michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: unorm_next(UCharIterator *src, michael@0: UChar *dest, int32_t destCapacity, michael@0: UNormalizationMode mode, int32_t options, michael@0: UBool doNormalize, UBool *pNeededToNormalize, michael@0: UErrorCode *pErrorCode) { michael@0: return unorm_iterate(src, TRUE, michael@0: dest, destCapacity, michael@0: mode, options, michael@0: doNormalize, pNeededToNormalize, michael@0: pErrorCode); michael@0: } michael@0: michael@0: /* Concatenation of normalized strings -------------------------------------- */ michael@0: michael@0: static int32_t michael@0: _concatenate(const UChar *left, int32_t leftLength, michael@0: const UChar *right, int32_t rightLength, michael@0: UChar *dest, int32_t destCapacity, michael@0: const Normalizer2 *n2, michael@0: UErrorCode *pErrorCode) { michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: if(destCapacity<0 || (dest==NULL && destCapacity>0) || michael@0: left==NULL || leftLength<-1 || right==NULL || rightLength<-1) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: /* check for overlapping right and destination */ michael@0: if( dest!=NULL && michael@0: ((right>=dest && right<(dest+destCapacity)) || michael@0: (rightLength>0 && dest>=right && dest<(right+rightLength))) michael@0: ) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: /* allow left==dest */ michael@0: UnicodeString destString; michael@0: if(left==dest) { michael@0: destString.setTo(dest, leftLength, destCapacity); michael@0: } else { michael@0: destString.setTo(dest, 0, destCapacity); michael@0: destString.append(left, leftLength); michael@0: } michael@0: return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode). michael@0: extract(dest, destCapacity, *pErrorCode); michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: unorm_concatenate(const UChar *left, int32_t leftLength, michael@0: const UChar *right, int32_t rightLength, michael@0: UChar *dest, int32_t destCapacity, michael@0: UNormalizationMode mode, int32_t options, michael@0: UErrorCode *pErrorCode) { michael@0: const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); michael@0: if(options&UNORM_UNICODE_3_2) { michael@0: const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: FilteredNormalizer2 fn2(*n2, *uni32); michael@0: return _concatenate(left, leftLength, right, rightLength, michael@0: dest, destCapacity, &fn2, pErrorCode); michael@0: } michael@0: return _concatenate(left, leftLength, right, rightLength, michael@0: dest, destCapacity, n2, pErrorCode); michael@0: } michael@0: michael@0: #endif /* #if !UCONFIG_NO_NORMALIZATION */