1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/unorm.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,280 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* Copyright (c) 1996-2011, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +****************************************************************************** 1.9 +* File unorm.cpp 1.10 +* 1.11 +* Created by: Vladimir Weinstein 12052000 1.12 +* 1.13 +* Modification history : 1.14 +* 1.15 +* Date Name Description 1.16 +* 02/01/01 synwee Added normalization quickcheck enum and method. 1.17 +* 02/12/01 synwee Commented out quickcheck util api has been approved 1.18 +* Added private method for doing FCD checks 1.19 +* 02/23/01 synwee Modified quickcheck and checkFCE to run through 1.20 +* string for codepoints < 0x300 for the normalization 1.21 +* mode NFC. 1.22 +* 05/25/01+ Markus Scherer total rewrite, implement all normalization here 1.23 +* instead of just wrappers around normlzr.cpp, 1.24 +* load unorm.dat, support Unicode 3.1 with 1.25 +* supplementary code points, etc. 1.26 +* 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code 1.27 +*/ 1.28 + 1.29 +#include "unicode/utypes.h" 1.30 + 1.31 +#if !UCONFIG_NO_NORMALIZATION 1.32 + 1.33 +#include "unicode/udata.h" 1.34 +#include "unicode/ustring.h" 1.35 +#include "unicode/uiter.h" 1.36 +#include "unicode/unorm.h" 1.37 +#include "unicode/unorm2.h" 1.38 +#include "normalizer2impl.h" 1.39 +#include "unormimp.h" 1.40 +#include "uprops.h" 1.41 +#include "ustr_imp.h" 1.42 + 1.43 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.44 + 1.45 +U_NAMESPACE_USE 1.46 + 1.47 +/* quick check functions ---------------------------------------------------- */ 1.48 + 1.49 +U_CAPI UNormalizationCheckResult U_EXPORT2 1.50 +unorm_quickCheck(const UChar *src, 1.51 + int32_t srcLength, 1.52 + UNormalizationMode mode, 1.53 + UErrorCode *pErrorCode) { 1.54 + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 1.55 + return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 1.56 +} 1.57 + 1.58 +U_CAPI UNormalizationCheckResult U_EXPORT2 1.59 +unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength, 1.60 + UNormalizationMode mode, int32_t options, 1.61 + UErrorCode *pErrorCode) { 1.62 + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 1.63 + if(options&UNORM_UNICODE_3_2) { 1.64 + FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); 1.65 + return unorm2_quickCheck( 1.66 + reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), 1.67 + src, srcLength, pErrorCode); 1.68 + } else { 1.69 + return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 1.70 + } 1.71 +} 1.72 + 1.73 +U_CAPI UBool U_EXPORT2 1.74 +unorm_isNormalized(const UChar *src, int32_t srcLength, 1.75 + UNormalizationMode mode, 1.76 + UErrorCode *pErrorCode) { 1.77 + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 1.78 + return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 1.79 +} 1.80 + 1.81 +U_CAPI UBool U_EXPORT2 1.82 +unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength, 1.83 + UNormalizationMode mode, int32_t options, 1.84 + UErrorCode *pErrorCode) { 1.85 + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 1.86 + if(options&UNORM_UNICODE_3_2) { 1.87 + FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); 1.88 + return unorm2_isNormalized( 1.89 + reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), 1.90 + src, srcLength, pErrorCode); 1.91 + } else { 1.92 + return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); 1.93 + } 1.94 +} 1.95 + 1.96 +/* normalize() API ---------------------------------------------------------- */ 1.97 + 1.98 +/** Public API for normalizing. */ 1.99 +U_CAPI int32_t U_EXPORT2 1.100 +unorm_normalize(const UChar *src, int32_t srcLength, 1.101 + UNormalizationMode mode, int32_t options, 1.102 + UChar *dest, int32_t destCapacity, 1.103 + UErrorCode *pErrorCode) { 1.104 + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 1.105 + if(options&UNORM_UNICODE_3_2) { 1.106 + FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); 1.107 + return unorm2_normalize( 1.108 + reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), 1.109 + src, srcLength, dest, destCapacity, pErrorCode); 1.110 + } else { 1.111 + return unorm2_normalize((const UNormalizer2 *)n2, 1.112 + src, srcLength, dest, destCapacity, pErrorCode); 1.113 + } 1.114 +} 1.115 + 1.116 + 1.117 +/* iteration functions ------------------------------------------------------ */ 1.118 + 1.119 +static int32_t 1.120 +_iterate(UCharIterator *src, UBool forward, 1.121 + UChar *dest, int32_t destCapacity, 1.122 + const Normalizer2 *n2, 1.123 + UBool doNormalize, UBool *pNeededToNormalize, 1.124 + UErrorCode *pErrorCode) { 1.125 + if(U_FAILURE(*pErrorCode)) { 1.126 + return 0; 1.127 + } 1.128 + if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) { 1.129 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.130 + return 0; 1.131 + } 1.132 + 1.133 + if(pNeededToNormalize!=NULL) { 1.134 + *pNeededToNormalize=FALSE; 1.135 + } 1.136 + if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) { 1.137 + return u_terminateUChars(dest, destCapacity, 0, pErrorCode); 1.138 + } 1.139 + 1.140 + UnicodeString buffer; 1.141 + UChar32 c; 1.142 + if(forward) { 1.143 + /* get one character and ignore its properties */ 1.144 + buffer.append(uiter_next32(src)); 1.145 + /* get all following characters until we see a boundary */ 1.146 + while((c=uiter_next32(src))>=0) { 1.147 + if(n2->hasBoundaryBefore(c)) { 1.148 + /* back out the latest movement to stop at the boundary */ 1.149 + src->move(src, -U16_LENGTH(c), UITER_CURRENT); 1.150 + break; 1.151 + } else { 1.152 + buffer.append(c); 1.153 + } 1.154 + } 1.155 + } else { 1.156 + while((c=uiter_previous32(src))>=0) { 1.157 + /* always write this character to the front of the buffer */ 1.158 + buffer.insert(0, c); 1.159 + /* stop if this just-copied character is a boundary */ 1.160 + if(n2->hasBoundaryBefore(c)) { 1.161 + break; 1.162 + } 1.163 + } 1.164 + } 1.165 + 1.166 + UnicodeString destString(dest, 0, destCapacity); 1.167 + if(buffer.length()>0 && doNormalize) { 1.168 + n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode); 1.169 + if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) { 1.170 + *pNeededToNormalize= destString!=buffer; 1.171 + } 1.172 + return destString.length(); 1.173 + } else { 1.174 + /* just copy the source characters */ 1.175 + return buffer.extract(dest, destCapacity, *pErrorCode); 1.176 + } 1.177 +} 1.178 + 1.179 +static int32_t 1.180 +unorm_iterate(UCharIterator *src, UBool forward, 1.181 + UChar *dest, int32_t destCapacity, 1.182 + UNormalizationMode mode, int32_t options, 1.183 + UBool doNormalize, UBool *pNeededToNormalize, 1.184 + UErrorCode *pErrorCode) { 1.185 + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 1.186 + if(options&UNORM_UNICODE_3_2) { 1.187 + const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); 1.188 + if(U_FAILURE(*pErrorCode)) { 1.189 + return 0; 1.190 + } 1.191 + FilteredNormalizer2 fn2(*n2, *uni32); 1.192 + return _iterate(src, forward, dest, destCapacity, 1.193 + &fn2, doNormalize, pNeededToNormalize, pErrorCode); 1.194 + } 1.195 + return _iterate(src, forward, dest, destCapacity, 1.196 + n2, doNormalize, pNeededToNormalize, pErrorCode); 1.197 +} 1.198 + 1.199 +U_CAPI int32_t U_EXPORT2 1.200 +unorm_previous(UCharIterator *src, 1.201 + UChar *dest, int32_t destCapacity, 1.202 + UNormalizationMode mode, int32_t options, 1.203 + UBool doNormalize, UBool *pNeededToNormalize, 1.204 + UErrorCode *pErrorCode) { 1.205 + return unorm_iterate(src, FALSE, 1.206 + dest, destCapacity, 1.207 + mode, options, 1.208 + doNormalize, pNeededToNormalize, 1.209 + pErrorCode); 1.210 +} 1.211 + 1.212 +U_CAPI int32_t U_EXPORT2 1.213 +unorm_next(UCharIterator *src, 1.214 + UChar *dest, int32_t destCapacity, 1.215 + UNormalizationMode mode, int32_t options, 1.216 + UBool doNormalize, UBool *pNeededToNormalize, 1.217 + UErrorCode *pErrorCode) { 1.218 + return unorm_iterate(src, TRUE, 1.219 + dest, destCapacity, 1.220 + mode, options, 1.221 + doNormalize, pNeededToNormalize, 1.222 + pErrorCode); 1.223 +} 1.224 + 1.225 +/* Concatenation of normalized strings -------------------------------------- */ 1.226 + 1.227 +static int32_t 1.228 +_concatenate(const UChar *left, int32_t leftLength, 1.229 + const UChar *right, int32_t rightLength, 1.230 + UChar *dest, int32_t destCapacity, 1.231 + const Normalizer2 *n2, 1.232 + UErrorCode *pErrorCode) { 1.233 + if(U_FAILURE(*pErrorCode)) { 1.234 + return 0; 1.235 + } 1.236 + if(destCapacity<0 || (dest==NULL && destCapacity>0) || 1.237 + left==NULL || leftLength<-1 || right==NULL || rightLength<-1) { 1.238 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.239 + return 0; 1.240 + } 1.241 + 1.242 + /* check for overlapping right and destination */ 1.243 + if( dest!=NULL && 1.244 + ((right>=dest && right<(dest+destCapacity)) || 1.245 + (rightLength>0 && dest>=right && dest<(right+rightLength))) 1.246 + ) { 1.247 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.248 + return 0; 1.249 + } 1.250 + 1.251 + /* allow left==dest */ 1.252 + UnicodeString destString; 1.253 + if(left==dest) { 1.254 + destString.setTo(dest, leftLength, destCapacity); 1.255 + } else { 1.256 + destString.setTo(dest, 0, destCapacity); 1.257 + destString.append(left, leftLength); 1.258 + } 1.259 + return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode). 1.260 + extract(dest, destCapacity, *pErrorCode); 1.261 +} 1.262 + 1.263 +U_CAPI int32_t U_EXPORT2 1.264 +unorm_concatenate(const UChar *left, int32_t leftLength, 1.265 + const UChar *right, int32_t rightLength, 1.266 + UChar *dest, int32_t destCapacity, 1.267 + UNormalizationMode mode, int32_t options, 1.268 + UErrorCode *pErrorCode) { 1.269 + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); 1.270 + if(options&UNORM_UNICODE_3_2) { 1.271 + const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); 1.272 + if(U_FAILURE(*pErrorCode)) { 1.273 + return 0; 1.274 + } 1.275 + FilteredNormalizer2 fn2(*n2, *uni32); 1.276 + return _concatenate(left, leftLength, right, rightLength, 1.277 + dest, destCapacity, &fn2, pErrorCode); 1.278 + } 1.279 + return _concatenate(left, leftLength, right, rightLength, 1.280 + dest, destCapacity, n2, pErrorCode); 1.281 +} 1.282 + 1.283 +#endif /* #if !UCONFIG_NO_NORMALIZATION */