1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/uspoof.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,736 @@ 1.4 +/* 1.5 +*************************************************************************** 1.6 +* Copyright (C) 2008-2013, International Business Machines Corporation 1.7 +* and others. All Rights Reserved. 1.8 +*************************************************************************** 1.9 +* file name: uspoof.cpp 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* created on: 2008Feb13 1.15 +* created by: Andy Heninger 1.16 +* 1.17 +* Unicode Spoof Detection 1.18 +*/ 1.19 +#include "unicode/utypes.h" 1.20 +#include "unicode/normalizer2.h" 1.21 +#include "unicode/uspoof.h" 1.22 +#include "unicode/ustring.h" 1.23 +#include "unicode/utf16.h" 1.24 +#include "cmemory.h" 1.25 +#include "cstring.h" 1.26 +#include "identifier_info.h" 1.27 +#include "mutex.h" 1.28 +#include "scriptset.h" 1.29 +#include "uassert.h" 1.30 +#include "ucln_in.h" 1.31 +#include "uspoof_impl.h" 1.32 +#include "umutex.h" 1.33 + 1.34 + 1.35 +#if !UCONFIG_NO_NORMALIZATION 1.36 + 1.37 +U_NAMESPACE_USE 1.38 + 1.39 + 1.40 +// 1.41 +// Static Objects used by the spoof impl, their thread safe initialization and their cleanup. 1.42 +// 1.43 +static UnicodeSet *gInclusionSet = NULL; 1.44 +static UnicodeSet *gRecommendedSet = NULL; 1.45 +static const Normalizer2 *gNfdNormalizer = NULL; 1.46 +static UMutex gInitMutex = U_MUTEX_INITIALIZER; 1.47 + 1.48 +static UBool U_CALLCONV 1.49 +uspoof_cleanup(void) { 1.50 + delete gInclusionSet; 1.51 + gInclusionSet = NULL; 1.52 + delete gRecommendedSet; 1.53 + gRecommendedSet = NULL; 1.54 + gNfdNormalizer = NULL; 1.55 + return TRUE; 1.56 +} 1.57 + 1.58 +static void initializeStatics() { 1.59 + Mutex m(&gInitMutex); 1.60 + UErrorCode status = U_ZERO_ERROR; 1.61 + if (gInclusionSet == NULL) { 1.62 + gInclusionSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\ 1.63 + \\-.\\u00B7\\u05F3\\u05F4\\u0F0B\\u200C\\u200D\\u2019]"), status); 1.64 + gRecommendedSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\ 1.65 + [0-z\\u00C0-\\u017E\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\ 1.66 + \\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F5\\u01F8-\\u021B\\u021E\ 1.67 + \\u021F\\u0226-\\u0233\\u02BB\\u02BC\\u02EC\\u0300-\\u0304\ 1.68 + \\u0306-\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\ 1.69 + \\u0328\\u032D\\u032E\\u0330\\u0331\\u0335\\u0338\\u0339\ 1.70 + \\u0342-\\u0345\\u037B-\\u03CE\\u03FC-\\u045F\\u048A-\\u0525\ 1.71 + \\u0531-\\u0586\\u05D0-\\u05F2\\u0621-\\u063F\\u0641-\\u0655\ 1.72 + \\u0660-\\u0669\\u0670-\\u068D\\u068F-\\u06D5\\u06E5\\u06E6\ 1.73 + \\u06EE-\\u06FF\\u0750-\\u07B1\\u0901-\\u0939\\u093C-\\u094D\ 1.74 + \\u0950\\u0960-\\u0972\\u0979-\\u0A4D\\u0A5C-\\u0A74\\u0A81-\ 1.75 + \\u0B43\\u0B47-\\u0B61\\u0B66-\\u0C56\\u0C60\\u0C61\\u0C66-\ 1.76 + \\u0CD6\\u0CE0-\\u0CEF\\u0D02-\\u0D28\\u0D2A-\\u0D39\\u0D3D-\ 1.77 + \\u0D43\\u0D46-\\u0D4D\\u0D57-\\u0D61\\u0D66-\\u0D8E\\u0D91-\ 1.78 + \\u0DA5\\u0DA7-\\u0DDE\\u0DF2\\u0E01-\\u0ED9\\u0F00\\u0F20-\ 1.79 + \\u0F8B\\u0F90-\\u109D\\u10D0-\\u10F0\\u10F7-\\u10FA\\u1200-\ 1.80 + \\u135A\\u135F\\u1380-\\u138F\\u1401-\\u167F\\u1780-\\u17A2\ 1.81 + \\u17A5-\\u17A7\\u17A9-\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7-\ 1.82 + \\u17DC\\u17E0-\\u17E9\\u1810-\\u18A8\\u18AA-\\u18F5\\u1E00-\ 1.83 + \\u1E99\\u1F00-\\u1FFC\\u2D30-\\u2D65\\u2D80-\\u2DDE\\u3005-\ 1.84 + \\u3007\\u3041-\\u31B7\\u3400-\\u9FCB\\uA000-\\uA48C\\uA67F\ 1.85 + \\uA717-\\uA71F\\uA788\\uAA60-\\uAA7B\\uAC00-\\uD7A3\\uFA0E-\ 1.86 + \\uFA29\\U00020000-\ 1.87 + \\U0002B734]-[[:Cn:][:nfkcqc=n:][:XIDC=n:]]]"), status); 1.88 + gNfdNormalizer = Normalizer2::getNFDInstance(status); 1.89 + } 1.90 + ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, uspoof_cleanup); 1.91 + 1.92 + return; 1.93 +} 1.94 + 1.95 + 1.96 +U_CAPI USpoofChecker * U_EXPORT2 1.97 +uspoof_open(UErrorCode *status) { 1.98 + if (U_FAILURE(*status)) { 1.99 + return NULL; 1.100 + } 1.101 + initializeStatics(); 1.102 + SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status); 1.103 + if (U_FAILURE(*status)) { 1.104 + delete si; 1.105 + si = NULL; 1.106 + } 1.107 + return reinterpret_cast<USpoofChecker *>(si); 1.108 +} 1.109 + 1.110 + 1.111 +U_CAPI USpoofChecker * U_EXPORT2 1.112 +uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength, 1.113 + UErrorCode *status) { 1.114 + if (U_FAILURE(*status)) { 1.115 + return NULL; 1.116 + } 1.117 + initializeStatics(); 1.118 + SpoofData *sd = new SpoofData(data, length, *status); 1.119 + SpoofImpl *si = new SpoofImpl(sd, *status); 1.120 + if (U_FAILURE(*status)) { 1.121 + delete sd; 1.122 + delete si; 1.123 + return NULL; 1.124 + } 1.125 + if (sd == NULL || si == NULL) { 1.126 + *status = U_MEMORY_ALLOCATION_ERROR; 1.127 + delete sd; 1.128 + delete si; 1.129 + return NULL; 1.130 + } 1.131 + 1.132 + if (pActualLength != NULL) { 1.133 + *pActualLength = sd->fRawData->fLength; 1.134 + } 1.135 + return reinterpret_cast<USpoofChecker *>(si); 1.136 +} 1.137 + 1.138 + 1.139 +U_CAPI USpoofChecker * U_EXPORT2 1.140 +uspoof_clone(const USpoofChecker *sc, UErrorCode *status) { 1.141 + const SpoofImpl *src = SpoofImpl::validateThis(sc, *status); 1.142 + if (src == NULL) { 1.143 + return NULL; 1.144 + } 1.145 + SpoofImpl *result = new SpoofImpl(*src, *status); // copy constructor 1.146 + if (U_FAILURE(*status)) { 1.147 + delete result; 1.148 + result = NULL; 1.149 + } 1.150 + return reinterpret_cast<USpoofChecker *>(result); 1.151 +} 1.152 + 1.153 + 1.154 +U_CAPI void U_EXPORT2 1.155 +uspoof_close(USpoofChecker *sc) { 1.156 + UErrorCode status = U_ZERO_ERROR; 1.157 + SpoofImpl *This = SpoofImpl::validateThis(sc, status); 1.158 + delete This; 1.159 +} 1.160 + 1.161 + 1.162 +U_CAPI void U_EXPORT2 1.163 +uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) { 1.164 + SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 1.165 + if (This == NULL) { 1.166 + return; 1.167 + } 1.168 + 1.169 + // Verify that the requested checks are all ones (bits) that 1.170 + // are acceptable, known values. 1.171 + if (checks & ~USPOOF_ALL_CHECKS) { 1.172 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.173 + return; 1.174 + } 1.175 + 1.176 + This->fChecks = checks; 1.177 +} 1.178 + 1.179 + 1.180 +U_CAPI int32_t U_EXPORT2 1.181 +uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) { 1.182 + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 1.183 + if (This == NULL) { 1.184 + return 0; 1.185 + } 1.186 + return This->fChecks; 1.187 +} 1.188 + 1.189 +U_CAPI void U_EXPORT2 1.190 +uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel) { 1.191 + UErrorCode status = U_ZERO_ERROR; 1.192 + SpoofImpl *This = SpoofImpl::validateThis(sc, status); 1.193 + if (This != NULL) { 1.194 + This->fRestrictionLevel = restrictionLevel; 1.195 + } 1.196 +} 1.197 + 1.198 +U_CAPI URestrictionLevel U_EXPORT2 1.199 +uspoof_getRestrictionLevel(const USpoofChecker *sc) { 1.200 + UErrorCode status = U_ZERO_ERROR; 1.201 + const SpoofImpl *This = SpoofImpl::validateThis(sc, status); 1.202 + if (This == NULL) { 1.203 + return USPOOF_UNRESTRICTIVE; 1.204 + } 1.205 + return This->fRestrictionLevel; 1.206 +} 1.207 + 1.208 +U_CAPI void U_EXPORT2 1.209 +uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) { 1.210 + SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 1.211 + if (This == NULL) { 1.212 + return; 1.213 + } 1.214 + This->setAllowedLocales(localesList, *status); 1.215 +} 1.216 + 1.217 +U_CAPI const char * U_EXPORT2 1.218 +uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) { 1.219 + SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 1.220 + if (This == NULL) { 1.221 + return NULL; 1.222 + } 1.223 + return This->getAllowedLocales(*status); 1.224 +} 1.225 + 1.226 + 1.227 +U_CAPI const USet * U_EXPORT2 1.228 +uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) { 1.229 + const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status); 1.230 + return result->toUSet(); 1.231 +} 1.232 + 1.233 +U_CAPI const UnicodeSet * U_EXPORT2 1.234 +uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) { 1.235 + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 1.236 + if (This == NULL) { 1.237 + return NULL; 1.238 + } 1.239 + return This->fAllowedCharsSet; 1.240 +} 1.241 + 1.242 + 1.243 +U_CAPI void U_EXPORT2 1.244 +uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) { 1.245 + const UnicodeSet *set = UnicodeSet::fromUSet(chars); 1.246 + uspoof_setAllowedUnicodeSet(sc, set, status); 1.247 +} 1.248 + 1.249 + 1.250 +U_CAPI void U_EXPORT2 1.251 +uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) { 1.252 + SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 1.253 + if (This == NULL) { 1.254 + return; 1.255 + } 1.256 + if (chars->isBogus()) { 1.257 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.258 + return; 1.259 + } 1.260 + UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone()); 1.261 + if (clonedSet == NULL || clonedSet->isBogus()) { 1.262 + *status = U_MEMORY_ALLOCATION_ERROR; 1.263 + return; 1.264 + } 1.265 + clonedSet->freeze(); 1.266 + delete This->fAllowedCharsSet; 1.267 + This->fAllowedCharsSet = clonedSet; 1.268 + This->fChecks |= USPOOF_CHAR_LIMIT; 1.269 +} 1.270 + 1.271 + 1.272 +U_CAPI int32_t U_EXPORT2 1.273 +uspoof_check(const USpoofChecker *sc, 1.274 + const UChar *id, int32_t length, 1.275 + int32_t *position, 1.276 + UErrorCode *status) { 1.277 + 1.278 + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 1.279 + if (This == NULL) { 1.280 + return 0; 1.281 + } 1.282 + if (length < -1) { 1.283 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.284 + return 0; 1.285 + } 1.286 + UnicodeString idStr((length == -1), id, length); // Aliasing constructor. 1.287 + int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status); 1.288 + return result; 1.289 +} 1.290 + 1.291 + 1.292 +U_CAPI int32_t U_EXPORT2 1.293 +uspoof_checkUTF8(const USpoofChecker *sc, 1.294 + const char *id, int32_t length, 1.295 + int32_t *position, 1.296 + UErrorCode *status) { 1.297 + 1.298 + if (U_FAILURE(*status)) { 1.299 + return 0; 1.300 + } 1.301 + UnicodeString idStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id))); 1.302 + int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status); 1.303 + return result; 1.304 +} 1.305 + 1.306 + 1.307 +U_CAPI int32_t U_EXPORT2 1.308 +uspoof_areConfusable(const USpoofChecker *sc, 1.309 + const UChar *id1, int32_t length1, 1.310 + const UChar *id2, int32_t length2, 1.311 + UErrorCode *status) { 1.312 + SpoofImpl::validateThis(sc, *status); 1.313 + if (U_FAILURE(*status)) { 1.314 + return 0; 1.315 + } 1.316 + if (length1 < -1 || length2 < -1) { 1.317 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.318 + return 0; 1.319 + } 1.320 + 1.321 + UnicodeString id1Str((length1==-1), id1, length1); // Aliasing constructor 1.322 + UnicodeString id2Str((length2==-1), id2, length2); // Aliasing constructor 1.323 + return uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status); 1.324 +} 1.325 + 1.326 + 1.327 +U_CAPI int32_t U_EXPORT2 1.328 +uspoof_areConfusableUTF8(const USpoofChecker *sc, 1.329 + const char *id1, int32_t length1, 1.330 + const char *id2, int32_t length2, 1.331 + UErrorCode *status) { 1.332 + SpoofImpl::validateThis(sc, *status); 1.333 + if (U_FAILURE(*status)) { 1.334 + return 0; 1.335 + } 1.336 + if (length1 < -1 || length2 < -1) { 1.337 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.338 + return 0; 1.339 + } 1.340 + UnicodeString id1Str = UnicodeString::fromUTF8(StringPiece(id1, length1>=0? length1 : uprv_strlen(id1))); 1.341 + UnicodeString id2Str = UnicodeString::fromUTF8(StringPiece(id2, length2>=0? length2 : uprv_strlen(id2))); 1.342 + int32_t results = uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status); 1.343 + return results; 1.344 +} 1.345 + 1.346 + 1.347 +U_CAPI int32_t U_EXPORT2 1.348 +uspoof_areConfusableUnicodeString(const USpoofChecker *sc, 1.349 + const icu::UnicodeString &id1, 1.350 + const icu::UnicodeString &id2, 1.351 + UErrorCode *status) { 1.352 + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 1.353 + if (U_FAILURE(*status)) { 1.354 + return 0; 1.355 + } 1.356 + // 1.357 + // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable, 1.358 + // and for definitions of the types (single, whole, mixed-script) of confusables. 1.359 + 1.360 + // We only care about a few of the check flags. Ignore the others. 1.361 + // If no tests relavant to this function have been specified, return an error. 1.362 + // TODO: is this really the right thing to do? It's probably an error on the caller's part, 1.363 + // but logically we would just return 0 (no error). 1.364 + if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | 1.365 + USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) { 1.366 + *status = U_INVALID_STATE_ERROR; 1.367 + return 0; 1.368 + } 1.369 + int32_t flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE; 1.370 + 1.371 + int32_t result = 0; 1.372 + IdentifierInfo *identifierInfo = This->getIdentifierInfo(*status); 1.373 + if (U_FAILURE(*status)) { 1.374 + return 0; 1.375 + } 1.376 + identifierInfo->setIdentifier(id1, *status); 1.377 + int32_t id1ScriptCount = identifierInfo->getScriptCount(); 1.378 + identifierInfo->setIdentifier(id2, *status); 1.379 + int32_t id2ScriptCount = identifierInfo->getScriptCount(); 1.380 + This->releaseIdentifierInfo(identifierInfo); 1.381 + identifierInfo = NULL; 1.382 + 1.383 + if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) { 1.384 + UnicodeString id1Skeleton; 1.385 + UnicodeString id2Skeleton; 1.386 + if (id1ScriptCount <= 1 && id2ScriptCount <= 1) { 1.387 + flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE; 1.388 + uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status); 1.389 + uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status); 1.390 + if (id1Skeleton == id2Skeleton) { 1.391 + result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE; 1.392 + } 1.393 + } 1.394 + } 1.395 + 1.396 + if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) { 1.397 + // If the two inputs are single script confusable they cannot also be 1.398 + // mixed or whole script confusable, according to the UAX39 definitions. 1.399 + // So we can skip those tests. 1.400 + return result; 1.401 + } 1.402 + 1.403 + // Two identifiers are whole script confusable if each is of a single script 1.404 + // and they are mixed script confusable. 1.405 + UBool possiblyWholeScriptConfusables = 1.406 + id1ScriptCount <= 1 && id2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE); 1.407 + 1.408 + // 1.409 + // Mixed Script Check 1.410 + // 1.411 + if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) { 1.412 + // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us 1.413 + // the mixed script table skeleton, which is what we want. 1.414 + // The Any Case / Lower Case bit in the skelton flags was set at the top of the function. 1.415 + UnicodeString id1Skeleton; 1.416 + UnicodeString id2Skeleton; 1.417 + flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE; 1.418 + uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status); 1.419 + uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status); 1.420 + if (id1Skeleton == id2Skeleton) { 1.421 + result |= USPOOF_MIXED_SCRIPT_CONFUSABLE; 1.422 + if (possiblyWholeScriptConfusables) { 1.423 + result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE; 1.424 + } 1.425 + } 1.426 + } 1.427 + 1.428 + return result; 1.429 +} 1.430 + 1.431 + 1.432 + 1.433 + 1.434 +U_CAPI int32_t U_EXPORT2 1.435 +uspoof_checkUnicodeString(const USpoofChecker *sc, 1.436 + const icu::UnicodeString &id, 1.437 + int32_t *position, 1.438 + UErrorCode *status) { 1.439 + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 1.440 + if (This == NULL) { 1.441 + return 0; 1.442 + } 1.443 + int32_t result = 0; 1.444 + 1.445 + IdentifierInfo *identifierInfo = NULL; 1.446 + if ((This->fChecks) & (USPOOF_RESTRICTION_LEVEL | USPOOF_MIXED_NUMBERS)) { 1.447 + identifierInfo = This->getIdentifierInfo(*status); 1.448 + if (U_FAILURE(*status)) { 1.449 + goto cleanupAndReturn; 1.450 + } 1.451 + identifierInfo->setIdentifier(id, *status); 1.452 + identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet); 1.453 + } 1.454 + 1.455 + 1.456 + if ((This->fChecks) & USPOOF_RESTRICTION_LEVEL) { 1.457 + URestrictionLevel idRestrictionLevel = identifierInfo->getRestrictionLevel(*status); 1.458 + if (idRestrictionLevel > This->fRestrictionLevel) { 1.459 + result |= USPOOF_RESTRICTION_LEVEL; 1.460 + } 1.461 + if (This->fChecks & USPOOF_AUX_INFO) { 1.462 + result |= idRestrictionLevel; 1.463 + } 1.464 + } 1.465 + 1.466 + if ((This->fChecks) & USPOOF_MIXED_NUMBERS) { 1.467 + const UnicodeSet *numerics = identifierInfo->getNumerics(); 1.468 + if (numerics->size() > 1) { 1.469 + result |= USPOOF_MIXED_NUMBERS; 1.470 + } 1.471 + 1.472 + // TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier. 1.473 + // We have no easy way to do the same in C. 1.474 + // if (checkResult != null) { 1.475 + // checkResult.numerics = numerics; 1.476 + // } 1.477 + } 1.478 + 1.479 + 1.480 + if (This->fChecks & (USPOOF_CHAR_LIMIT)) { 1.481 + int32_t i; 1.482 + UChar32 c; 1.483 + int32_t length = id.length(); 1.484 + for (i=0; i<length ;) { 1.485 + c = id.char32At(i); 1.486 + i += U16_LENGTH(c); 1.487 + if (!This->fAllowedCharsSet->contains(c)) { 1.488 + result |= USPOOF_CHAR_LIMIT; 1.489 + break; 1.490 + } 1.491 + } 1.492 + } 1.493 + 1.494 + if (This->fChecks & 1.495 + (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) { 1.496 + // These are the checks that need to be done on NFD input 1.497 + UnicodeString nfdText; 1.498 + gNfdNormalizer->normalize(id, nfdText, *status); 1.499 + int32_t nfdLength = nfdText.length(); 1.500 + 1.501 + if (This->fChecks & USPOOF_INVISIBLE) { 1.502 + 1.503 + // scan for more than one occurence of the same non-spacing mark 1.504 + // in a sequence of non-spacing marks. 1.505 + int32_t i; 1.506 + UChar32 c; 1.507 + UChar32 firstNonspacingMark = 0; 1.508 + UBool haveMultipleMarks = FALSE; 1.509 + UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence. 1.510 + 1.511 + for (i=0; i<nfdLength ;) { 1.512 + c = nfdText.char32At(i); 1.513 + i += U16_LENGTH(c); 1.514 + if (u_charType(c) != U_NON_SPACING_MARK) { 1.515 + firstNonspacingMark = 0; 1.516 + if (haveMultipleMarks) { 1.517 + marksSeenSoFar.clear(); 1.518 + haveMultipleMarks = FALSE; 1.519 + } 1.520 + continue; 1.521 + } 1.522 + if (firstNonspacingMark == 0) { 1.523 + firstNonspacingMark = c; 1.524 + continue; 1.525 + } 1.526 + if (!haveMultipleMarks) { 1.527 + marksSeenSoFar.add(firstNonspacingMark); 1.528 + haveMultipleMarks = TRUE; 1.529 + } 1.530 + if (marksSeenSoFar.contains(c)) { 1.531 + // report the error, and stop scanning. 1.532 + // No need to find more than the first failure. 1.533 + result |= USPOOF_INVISIBLE; 1.534 + break; 1.535 + } 1.536 + marksSeenSoFar.add(c); 1.537 + } 1.538 + } 1.539 + 1.540 + 1.541 + if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) { 1.542 + // The basic test is the same for both whole and mixed script confusables. 1.543 + // Compute the set of scripts that every input character has a confusable in. 1.544 + // For this computation an input character is always considered to be 1.545 + // confusable with itself in its own script. 1.546 + // 1.547 + // If the number of such scripts is two or more, and the input consisted of 1.548 + // characters all from a single script, we have a whole script confusable. 1.549 + // (The two scripts will be the original script and the one that is confusable) 1.550 + // 1.551 + // If the number of such scripts >= one, and the original input contained characters from 1.552 + // more than one script, we have a mixed script confusable. (We can transform 1.553 + // some of the characters, and end up with a visually similar string all in 1.554 + // one script.) 1.555 + 1.556 + if (identifierInfo == NULL) { 1.557 + identifierInfo = This->getIdentifierInfo(*status); 1.558 + if (U_FAILURE(*status)) { 1.559 + goto cleanupAndReturn; 1.560 + } 1.561 + identifierInfo->setIdentifier(id, *status); 1.562 + } 1.563 + 1.564 + int32_t scriptCount = identifierInfo->getScriptCount(); 1.565 + 1.566 + ScriptSet scripts; 1.567 + This->wholeScriptCheck(nfdText, &scripts, *status); 1.568 + int32_t confusableScriptCount = scripts.countMembers(); 1.569 + //printf("confusableScriptCount = %d\n", confusableScriptCount); 1.570 + 1.571 + if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) && 1.572 + confusableScriptCount >= 2 && 1.573 + scriptCount == 1) { 1.574 + result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE; 1.575 + } 1.576 + 1.577 + if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) && 1.578 + confusableScriptCount >= 1 && 1.579 + scriptCount > 1) { 1.580 + result |= USPOOF_MIXED_SCRIPT_CONFUSABLE; 1.581 + } 1.582 + } 1.583 + } 1.584 + 1.585 +cleanupAndReturn: 1.586 + This->releaseIdentifierInfo(identifierInfo); 1.587 + if (position != NULL) { 1.588 + *position = 0; 1.589 + } 1.590 + return result; 1.591 +} 1.592 + 1.593 + 1.594 +U_CAPI int32_t U_EXPORT2 1.595 +uspoof_getSkeleton(const USpoofChecker *sc, 1.596 + uint32_t type, 1.597 + const UChar *id, int32_t length, 1.598 + UChar *dest, int32_t destCapacity, 1.599 + UErrorCode *status) { 1.600 + 1.601 + SpoofImpl::validateThis(sc, *status); 1.602 + if (U_FAILURE(*status)) { 1.603 + return 0; 1.604 + } 1.605 + if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) { 1.606 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.607 + return 0; 1.608 + } 1.609 + 1.610 + UnicodeString idStr((length==-1), id, length); // Aliasing constructor 1.611 + UnicodeString destStr; 1.612 + uspoof_getSkeletonUnicodeString(sc, type, idStr, destStr, status); 1.613 + destStr.extract(dest, destCapacity, *status); 1.614 + return destStr.length(); 1.615 +} 1.616 + 1.617 + 1.618 + 1.619 +U_I18N_API UnicodeString & U_EXPORT2 1.620 +uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, 1.621 + uint32_t type, 1.622 + const UnicodeString &id, 1.623 + UnicodeString &dest, 1.624 + UErrorCode *status) { 1.625 + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 1.626 + if (U_FAILURE(*status)) { 1.627 + return dest; 1.628 + } 1.629 + 1.630 + int32_t tableMask = 0; 1.631 + switch (type) { 1.632 + case 0: 1.633 + tableMask = USPOOF_ML_TABLE_FLAG; 1.634 + break; 1.635 + case USPOOF_SINGLE_SCRIPT_CONFUSABLE: 1.636 + tableMask = USPOOF_SL_TABLE_FLAG; 1.637 + break; 1.638 + case USPOOF_ANY_CASE: 1.639 + tableMask = USPOOF_MA_TABLE_FLAG; 1.640 + break; 1.641 + case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE: 1.642 + tableMask = USPOOF_SA_TABLE_FLAG; 1.643 + break; 1.644 + default: 1.645 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.646 + return dest; 1.647 + } 1.648 + 1.649 + UnicodeString nfdId; 1.650 + gNfdNormalizer->normalize(id, nfdId, *status); 1.651 + 1.652 + // Apply the skeleton mapping to the NFD normalized input string 1.653 + // Accumulate the skeleton, possibly unnormalized, in a UnicodeString. 1.654 + int32_t inputIndex = 0; 1.655 + UnicodeString skelStr; 1.656 + int32_t normalizedLen = nfdId.length(); 1.657 + for (inputIndex=0; inputIndex < normalizedLen; ) { 1.658 + UChar32 c = nfdId.char32At(inputIndex); 1.659 + inputIndex += U16_LENGTH(c); 1.660 + This->confusableLookup(c, tableMask, skelStr); 1.661 + } 1.662 + 1.663 + gNfdNormalizer->normalize(skelStr, dest, *status); 1.664 + return dest; 1.665 +} 1.666 + 1.667 + 1.668 +U_CAPI int32_t U_EXPORT2 1.669 +uspoof_getSkeletonUTF8(const USpoofChecker *sc, 1.670 + uint32_t type, 1.671 + const char *id, int32_t length, 1.672 + char *dest, int32_t destCapacity, 1.673 + UErrorCode *status) { 1.674 + SpoofImpl::validateThis(sc, *status); 1.675 + if (U_FAILURE(*status)) { 1.676 + return 0; 1.677 + } 1.678 + if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) { 1.679 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.680 + return 0; 1.681 + } 1.682 + 1.683 + UnicodeString srcStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id))); 1.684 + UnicodeString destStr; 1.685 + uspoof_getSkeletonUnicodeString(sc, type, srcStr, destStr, status); 1.686 + if (U_FAILURE(*status)) { 1.687 + return 0; 1.688 + } 1.689 + 1.690 + int32_t lengthInUTF8 = 0; 1.691 + u_strToUTF8(dest, destCapacity, &lengthInUTF8, 1.692 + destStr.getBuffer(), destStr.length(), status); 1.693 + return lengthInUTF8; 1.694 +} 1.695 + 1.696 + 1.697 +U_CAPI int32_t U_EXPORT2 1.698 +uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) { 1.699 + SpoofImpl *This = SpoofImpl::validateThis(sc, *status); 1.700 + if (This == NULL) { 1.701 + U_ASSERT(U_FAILURE(*status)); 1.702 + return 0; 1.703 + } 1.704 + int32_t dataSize = This->fSpoofData->fRawData->fLength; 1.705 + if (capacity < dataSize) { 1.706 + *status = U_BUFFER_OVERFLOW_ERROR; 1.707 + return dataSize; 1.708 + } 1.709 + uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize); 1.710 + return dataSize; 1.711 +} 1.712 + 1.713 +U_CAPI const USet * U_EXPORT2 1.714 +uspoof_getInclusionSet(UErrorCode *) { 1.715 + initializeStatics(); 1.716 + return gInclusionSet->toUSet(); 1.717 +} 1.718 + 1.719 +U_CAPI const USet * U_EXPORT2 1.720 +uspoof_getRecommendedSet(UErrorCode *) { 1.721 + initializeStatics(); 1.722 + return gRecommendedSet->toUSet(); 1.723 +} 1.724 + 1.725 +U_I18N_API const UnicodeSet * U_EXPORT2 1.726 +uspoof_getInclusionUnicodeSet(UErrorCode *) { 1.727 + initializeStatics(); 1.728 + return gInclusionSet; 1.729 +} 1.730 + 1.731 +U_I18N_API const UnicodeSet * U_EXPORT2 1.732 +uspoof_getRecommendedUnicodeSet(UErrorCode *) { 1.733 + initializeStatics(); 1.734 + return gRecommendedSet; 1.735 +} 1.736 + 1.737 + 1.738 + 1.739 +#endif // !UCONFIG_NO_NORMALIZATION