intl/icu/source/i18n/uspoof.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/uspoof.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,736 @@
     1.4 +/*
     1.5 +***************************************************************************
     1.6 +* Copyright (C) 2008-2013, International Business Machines Corporation
     1.7 +* and others. All Rights Reserved.
     1.8 +***************************************************************************
     1.9 +*   file name:  uspoof.cpp
    1.10 +*   encoding:   US-ASCII
    1.11 +*   tab size:   8 (not used)
    1.12 +*   indentation:4
    1.13 +*
    1.14 +*   created on: 2008Feb13
    1.15 +*   created by: Andy Heninger
    1.16 +*
    1.17 +*   Unicode Spoof Detection
    1.18 +*/
    1.19 +#include "unicode/utypes.h"
    1.20 +#include "unicode/normalizer2.h"
    1.21 +#include "unicode/uspoof.h"
    1.22 +#include "unicode/ustring.h"
    1.23 +#include "unicode/utf16.h"
    1.24 +#include "cmemory.h"
    1.25 +#include "cstring.h"
    1.26 +#include "identifier_info.h"
    1.27 +#include "mutex.h"
    1.28 +#include "scriptset.h"
    1.29 +#include "uassert.h"
    1.30 +#include "ucln_in.h"
    1.31 +#include "uspoof_impl.h"
    1.32 +#include "umutex.h"
    1.33 +
    1.34 +
    1.35 +#if !UCONFIG_NO_NORMALIZATION
    1.36 +
    1.37 +U_NAMESPACE_USE
    1.38 +
    1.39 +
    1.40 +//
    1.41 +// Static Objects used by the spoof impl, their thread safe initialization and their cleanup.
    1.42 +//
// Lazily created by initializeStatics(); deleted by uspoof_cleanup().
static UnicodeSet *gInclusionSet = NULL;
// Lazily created by initializeStatics(); deleted by uspoof_cleanup().
static UnicodeSet *gRecommendedSet = NULL;
// Obtained from Normalizer2::getNFDInstance() in initializeStatics().
// uspoof_cleanup() only resets the pointer; it does not delete the object.
static const Normalizer2 *gNfdNormalizer = NULL;
// Held by initializeStatics() while the statics above are being set up.
static UMutex gInitMutex = U_MUTEX_INITIALIZER;
    1.47 +
    1.48 +static UBool U_CALLCONV
    1.49 +uspoof_cleanup(void) {
    1.50 +    delete gInclusionSet;
    1.51 +    gInclusionSet = NULL;
    1.52 +    delete gRecommendedSet;
    1.53 +    gRecommendedSet = NULL;
    1.54 +    gNfdNormalizer = NULL;
    1.55 +    return TRUE;
    1.56 +}
    1.57 +
    1.58 +static void initializeStatics() {
    1.59 +    Mutex m(&gInitMutex);
    1.60 +    UErrorCode status = U_ZERO_ERROR;
    1.61 +    if (gInclusionSet == NULL) {
    1.62 +        gInclusionSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\
    1.63 +            \\-.\\u00B7\\u05F3\\u05F4\\u0F0B\\u200C\\u200D\\u2019]"), status);
    1.64 +        gRecommendedSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\
    1.65 +            [0-z\\u00C0-\\u017E\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\
    1.66 +            \\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F5\\u01F8-\\u021B\\u021E\
    1.67 +            \\u021F\\u0226-\\u0233\\u02BB\\u02BC\\u02EC\\u0300-\\u0304\
    1.68 +            \\u0306-\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\
    1.69 +            \\u0328\\u032D\\u032E\\u0330\\u0331\\u0335\\u0338\\u0339\
    1.70 +            \\u0342-\\u0345\\u037B-\\u03CE\\u03FC-\\u045F\\u048A-\\u0525\
    1.71 +            \\u0531-\\u0586\\u05D0-\\u05F2\\u0621-\\u063F\\u0641-\\u0655\
    1.72 +            \\u0660-\\u0669\\u0670-\\u068D\\u068F-\\u06D5\\u06E5\\u06E6\
    1.73 +            \\u06EE-\\u06FF\\u0750-\\u07B1\\u0901-\\u0939\\u093C-\\u094D\
    1.74 +            \\u0950\\u0960-\\u0972\\u0979-\\u0A4D\\u0A5C-\\u0A74\\u0A81-\
    1.75 +            \\u0B43\\u0B47-\\u0B61\\u0B66-\\u0C56\\u0C60\\u0C61\\u0C66-\
    1.76 +            \\u0CD6\\u0CE0-\\u0CEF\\u0D02-\\u0D28\\u0D2A-\\u0D39\\u0D3D-\
    1.77 +            \\u0D43\\u0D46-\\u0D4D\\u0D57-\\u0D61\\u0D66-\\u0D8E\\u0D91-\
    1.78 +            \\u0DA5\\u0DA7-\\u0DDE\\u0DF2\\u0E01-\\u0ED9\\u0F00\\u0F20-\
    1.79 +            \\u0F8B\\u0F90-\\u109D\\u10D0-\\u10F0\\u10F7-\\u10FA\\u1200-\
    1.80 +            \\u135A\\u135F\\u1380-\\u138F\\u1401-\\u167F\\u1780-\\u17A2\
    1.81 +            \\u17A5-\\u17A7\\u17A9-\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7-\
    1.82 +            \\u17DC\\u17E0-\\u17E9\\u1810-\\u18A8\\u18AA-\\u18F5\\u1E00-\
    1.83 +            \\u1E99\\u1F00-\\u1FFC\\u2D30-\\u2D65\\u2D80-\\u2DDE\\u3005-\
    1.84 +            \\u3007\\u3041-\\u31B7\\u3400-\\u9FCB\\uA000-\\uA48C\\uA67F\
    1.85 +            \\uA717-\\uA71F\\uA788\\uAA60-\\uAA7B\\uAC00-\\uD7A3\\uFA0E-\
    1.86 +            \\uFA29\\U00020000-\
    1.87 +            \\U0002B734]-[[:Cn:][:nfkcqc=n:][:XIDC=n:]]]"), status);
    1.88 +        gNfdNormalizer = Normalizer2::getNFDInstance(status);
    1.89 +    }
    1.90 +    ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, uspoof_cleanup);
    1.91 +
    1.92 +    return;
    1.93 +}
    1.94 +
    1.95 +
    1.96 +U_CAPI USpoofChecker * U_EXPORT2
    1.97 +uspoof_open(UErrorCode *status) {
    1.98 +    if (U_FAILURE(*status)) {
    1.99 +        return NULL;
   1.100 +    }
   1.101 +    initializeStatics();
   1.102 +    SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status);
   1.103 +    if (U_FAILURE(*status)) {
   1.104 +        delete si;
   1.105 +        si = NULL;
   1.106 +    }
   1.107 +    return reinterpret_cast<USpoofChecker *>(si);
   1.108 +}
   1.109 +
   1.110 +
   1.111 +U_CAPI USpoofChecker * U_EXPORT2
   1.112 +uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
   1.113 +                          UErrorCode *status) {
   1.114 +    if (U_FAILURE(*status)) {
   1.115 +        return NULL;
   1.116 +    }
   1.117 +    initializeStatics();
   1.118 +    SpoofData *sd = new SpoofData(data, length, *status);
   1.119 +    SpoofImpl *si = new SpoofImpl(sd, *status);
   1.120 +    if (U_FAILURE(*status)) {
   1.121 +        delete sd;
   1.122 +        delete si;
   1.123 +        return NULL;
   1.124 +    }
   1.125 +    if (sd == NULL || si == NULL) {
   1.126 +        *status = U_MEMORY_ALLOCATION_ERROR;
   1.127 +        delete sd;
   1.128 +        delete si;
   1.129 +        return NULL;
   1.130 +    }
   1.131 +        
   1.132 +    if (pActualLength != NULL) {
   1.133 +        *pActualLength = sd->fRawData->fLength;
   1.134 +    }
   1.135 +    return reinterpret_cast<USpoofChecker *>(si);
   1.136 +}
   1.137 +
   1.138 +
   1.139 +U_CAPI USpoofChecker * U_EXPORT2
   1.140 +uspoof_clone(const USpoofChecker *sc, UErrorCode *status) {
   1.141 +    const SpoofImpl *src = SpoofImpl::validateThis(sc, *status);
   1.142 +    if (src == NULL) {
   1.143 +        return NULL;
   1.144 +    }
   1.145 +    SpoofImpl *result = new SpoofImpl(*src, *status);   // copy constructor
   1.146 +    if (U_FAILURE(*status)) {
   1.147 +        delete result;
   1.148 +        result = NULL;
   1.149 +    }
   1.150 +    return reinterpret_cast<USpoofChecker *>(result);
   1.151 +}
   1.152 +
   1.153 +
   1.154 +U_CAPI void U_EXPORT2
   1.155 +uspoof_close(USpoofChecker *sc) {
   1.156 +    UErrorCode status = U_ZERO_ERROR;
   1.157 +    SpoofImpl *This = SpoofImpl::validateThis(sc, status);
   1.158 +    delete This;
   1.159 +}
   1.160 +
   1.161 +
   1.162 +U_CAPI void U_EXPORT2
   1.163 +uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) {
   1.164 +    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
   1.165 +    if (This == NULL) {
   1.166 +        return;
   1.167 +    }
   1.168 +
   1.169 +    // Verify that the requested checks are all ones (bits) that 
   1.170 +    //   are acceptable, known values.
   1.171 +    if (checks & ~USPOOF_ALL_CHECKS) {
   1.172 +        *status = U_ILLEGAL_ARGUMENT_ERROR; 
   1.173 +        return;
   1.174 +    }
   1.175 +
   1.176 +    This->fChecks = checks;
   1.177 +}
   1.178 +
   1.179 +
   1.180 +U_CAPI int32_t U_EXPORT2
   1.181 +uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) {
   1.182 +    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
   1.183 +    if (This == NULL) {
   1.184 +        return 0;
   1.185 +    }
   1.186 +    return This->fChecks;
   1.187 +}
   1.188 +
   1.189 +U_CAPI void U_EXPORT2
   1.190 +uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel) {
   1.191 +    UErrorCode status = U_ZERO_ERROR;
   1.192 +    SpoofImpl *This = SpoofImpl::validateThis(sc, status);
   1.193 +    if (This != NULL) {
   1.194 +        This->fRestrictionLevel = restrictionLevel;
   1.195 +    }
   1.196 +}
   1.197 +
   1.198 +U_CAPI URestrictionLevel U_EXPORT2
   1.199 +uspoof_getRestrictionLevel(const USpoofChecker *sc) {
   1.200 +    UErrorCode status = U_ZERO_ERROR;
   1.201 +    const SpoofImpl *This = SpoofImpl::validateThis(sc, status);
   1.202 +    if (This == NULL) {
   1.203 +        return USPOOF_UNRESTRICTIVE;
   1.204 +    }
   1.205 +    return This->fRestrictionLevel;
   1.206 +}
   1.207 +
   1.208 +U_CAPI void U_EXPORT2
   1.209 +uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) {
   1.210 +    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
   1.211 +    if (This == NULL) {
   1.212 +        return;
   1.213 +    }
   1.214 +    This->setAllowedLocales(localesList, *status);
   1.215 +}
   1.216 +
   1.217 +U_CAPI const char * U_EXPORT2
   1.218 +uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) {
   1.219 +    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
   1.220 +    if (This == NULL) {
   1.221 +        return NULL;
   1.222 +    }
   1.223 +    return This->getAllowedLocales(*status);
   1.224 +}
   1.225 +
   1.226 +
   1.227 +U_CAPI const USet * U_EXPORT2
   1.228 +uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) {
   1.229 +    const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status);
   1.230 +    return result->toUSet();
   1.231 +}
   1.232 +
   1.233 +U_CAPI const UnicodeSet * U_EXPORT2
   1.234 +uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) {
   1.235 +    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
   1.236 +    if (This == NULL) {
   1.237 +        return NULL;
   1.238 +    }
   1.239 +    return This->fAllowedCharsSet;
   1.240 +}
   1.241 +
   1.242 +
   1.243 +U_CAPI void U_EXPORT2
   1.244 +uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) {
   1.245 +    const UnicodeSet *set = UnicodeSet::fromUSet(chars);
   1.246 +    uspoof_setAllowedUnicodeSet(sc, set, status);
   1.247 +}
   1.248 +
   1.249 +
   1.250 +U_CAPI void U_EXPORT2
   1.251 +uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) {
   1.252 +    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
   1.253 +    if (This == NULL) {
   1.254 +        return;
   1.255 +    }
   1.256 +    if (chars->isBogus()) {
   1.257 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.258 +        return;
   1.259 +    }
   1.260 +    UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone());
   1.261 +    if (clonedSet == NULL || clonedSet->isBogus()) {
   1.262 +        *status = U_MEMORY_ALLOCATION_ERROR;
   1.263 +        return;
   1.264 +    }
   1.265 +    clonedSet->freeze();
   1.266 +    delete This->fAllowedCharsSet;
   1.267 +    This->fAllowedCharsSet = clonedSet;
   1.268 +    This->fChecks |= USPOOF_CHAR_LIMIT;
   1.269 +}
   1.270 +
   1.271 +
   1.272 +U_CAPI int32_t U_EXPORT2
   1.273 +uspoof_check(const USpoofChecker *sc,
   1.274 +             const UChar *id, int32_t length,
   1.275 +             int32_t *position,
   1.276 +             UErrorCode *status) {
   1.277 +             
   1.278 +    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
   1.279 +    if (This == NULL) {
   1.280 +        return 0;
   1.281 +    }
   1.282 +    if (length < -1) {
   1.283 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.284 +        return 0;
   1.285 +    }
   1.286 +    UnicodeString idStr((length == -1), id, length);  // Aliasing constructor.
   1.287 +    int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
   1.288 +    return result;
   1.289 +}
   1.290 +
   1.291 +
   1.292 +U_CAPI int32_t U_EXPORT2
   1.293 +uspoof_checkUTF8(const USpoofChecker *sc,
   1.294 +                 const char *id, int32_t length,
   1.295 +                 int32_t *position,
   1.296 +                 UErrorCode *status) {
   1.297 +
   1.298 +    if (U_FAILURE(*status)) {
   1.299 +        return 0;
   1.300 +    }
   1.301 +    UnicodeString idStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id)));
   1.302 +    int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
   1.303 +    return result;
   1.304 +}
   1.305 +
   1.306 +
   1.307 +U_CAPI int32_t U_EXPORT2
   1.308 +uspoof_areConfusable(const USpoofChecker *sc,
   1.309 +                     const UChar *id1, int32_t length1,
   1.310 +                     const UChar *id2, int32_t length2,
   1.311 +                     UErrorCode *status) {
   1.312 +    SpoofImpl::validateThis(sc, *status);
   1.313 +    if (U_FAILURE(*status)) {
   1.314 +        return 0;
   1.315 +    }
   1.316 +    if (length1 < -1 || length2 < -1) {
   1.317 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.318 +        return 0;
   1.319 +    }
   1.320 +        
   1.321 +    UnicodeString id1Str((length1==-1), id1, length1);  // Aliasing constructor
   1.322 +    UnicodeString id2Str((length2==-1), id2, length2);  // Aliasing constructor
   1.323 +    return uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status);
   1.324 +}
   1.325 +
   1.326 +
   1.327 +U_CAPI int32_t U_EXPORT2
   1.328 +uspoof_areConfusableUTF8(const USpoofChecker *sc,
   1.329 +                         const char *id1, int32_t length1,
   1.330 +                         const char *id2, int32_t length2,
   1.331 +                         UErrorCode *status) {
   1.332 +    SpoofImpl::validateThis(sc, *status);
   1.333 +    if (U_FAILURE(*status)) {
   1.334 +        return 0;
   1.335 +    }
   1.336 +    if (length1 < -1 || length2 < -1) {
   1.337 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.338 +        return 0;
   1.339 +    }
   1.340 +    UnicodeString id1Str = UnicodeString::fromUTF8(StringPiece(id1, length1>=0? length1 : uprv_strlen(id1)));
   1.341 +    UnicodeString id2Str = UnicodeString::fromUTF8(StringPiece(id2, length2>=0? length2 : uprv_strlen(id2)));
   1.342 +    int32_t results = uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status);
   1.343 +    return results;
   1.344 +}
   1.345 + 
   1.346 +
   1.347 +U_CAPI int32_t U_EXPORT2
   1.348 +uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
   1.349 +                                  const icu::UnicodeString &id1,
   1.350 +                                  const icu::UnicodeString &id2,
   1.351 +                                  UErrorCode *status) {
   1.352 +    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
   1.353 +    if (U_FAILURE(*status)) {
   1.354 +        return 0;
   1.355 +    }
   1.356 +    // 
   1.357 +    // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
   1.358 +    //   and for definitions of the types (single, whole, mixed-script) of confusables.
   1.359 +    
   1.360 +    // We only care about a few of the check flags.  Ignore the others.
   1.361 +    // If no tests relavant to this function have been specified, return an error.
   1.362 +    // TODO:  is this really the right thing to do?  It's probably an error on the caller's part,
   1.363 +    //        but logically we would just return 0 (no error).
   1.364 +    if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | 
   1.365 +                          USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
   1.366 +        *status = U_INVALID_STATE_ERROR;
   1.367 +        return 0;
   1.368 +    }
   1.369 +    int32_t  flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
   1.370 +
   1.371 +    int32_t  result = 0;
   1.372 +    IdentifierInfo *identifierInfo = This->getIdentifierInfo(*status);
   1.373 +    if (U_FAILURE(*status)) {
   1.374 +        return 0;
   1.375 +    }
   1.376 +    identifierInfo->setIdentifier(id1, *status);
   1.377 +    int32_t id1ScriptCount = identifierInfo->getScriptCount();
   1.378 +    identifierInfo->setIdentifier(id2, *status);
   1.379 +    int32_t id2ScriptCount = identifierInfo->getScriptCount();
   1.380 +    This->releaseIdentifierInfo(identifierInfo);
   1.381 +    identifierInfo = NULL;
   1.382 +
   1.383 +    if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
   1.384 +        UnicodeString   id1Skeleton;
   1.385 +        UnicodeString   id2Skeleton;
   1.386 +        if (id1ScriptCount <= 1 && id2ScriptCount <= 1) {
   1.387 +            flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
   1.388 +            uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
   1.389 +            uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
   1.390 +            if (id1Skeleton == id2Skeleton) {
   1.391 +                result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
   1.392 +            }
   1.393 +        }
   1.394 +    }
   1.395 +
   1.396 +    if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
   1.397 +         // If the two inputs are single script confusable they cannot also be
   1.398 +         // mixed or whole script confusable, according to the UAX39 definitions.
   1.399 +         // So we can skip those tests.
   1.400 +         return result;
   1.401 +    }
   1.402 +
   1.403 +    // Two identifiers are whole script confusable if each is of a single script 
   1.404 +    // and they are mixed script confusable.
   1.405 +    UBool possiblyWholeScriptConfusables = 
   1.406 +        id1ScriptCount <= 1 && id2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
   1.407 +
   1.408 +    //
   1.409 +    // Mixed Script Check
   1.410 +    //
   1.411 +    if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) {
   1.412 +        // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
   1.413 +        // the mixed script table skeleton, which is what we want.
   1.414 +        // The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
   1.415 +        UnicodeString id1Skeleton;
   1.416 +        UnicodeString id2Skeleton;
   1.417 +        flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
   1.418 +        uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
   1.419 +        uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
   1.420 +        if (id1Skeleton == id2Skeleton) {
   1.421 +            result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
   1.422 +            if (possiblyWholeScriptConfusables) {
   1.423 +                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
   1.424 +            }
   1.425 +        }
   1.426 +    }
   1.427 +
   1.428 +    return result;
   1.429 +}
   1.430 +
   1.431 +
   1.432 +
   1.433 +
   1.434 +U_CAPI int32_t U_EXPORT2
   1.435 +uspoof_checkUnicodeString(const USpoofChecker *sc,
   1.436 +                          const icu::UnicodeString &id, 
   1.437 +                          int32_t *position,
   1.438 +                          UErrorCode *status) {
   1.439 +    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
   1.440 +    if (This == NULL) {
   1.441 +        return 0;
   1.442 +    }
   1.443 +    int32_t result = 0;
   1.444 +
   1.445 +    IdentifierInfo *identifierInfo = NULL;
   1.446 +    if ((This->fChecks) & (USPOOF_RESTRICTION_LEVEL | USPOOF_MIXED_NUMBERS)) {
   1.447 +        identifierInfo = This->getIdentifierInfo(*status);
   1.448 +        if (U_FAILURE(*status)) {
   1.449 +            goto cleanupAndReturn;
   1.450 +        }
   1.451 +        identifierInfo->setIdentifier(id, *status);
   1.452 +        identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet);
   1.453 +    }
   1.454 +
   1.455 +
   1.456 +    if ((This->fChecks) & USPOOF_RESTRICTION_LEVEL) {
   1.457 +        URestrictionLevel idRestrictionLevel = identifierInfo->getRestrictionLevel(*status);
   1.458 +        if (idRestrictionLevel > This->fRestrictionLevel) {
   1.459 +            result |= USPOOF_RESTRICTION_LEVEL;
   1.460 +        }
   1.461 +        if (This->fChecks & USPOOF_AUX_INFO) {
   1.462 +            result |= idRestrictionLevel;
   1.463 +        }
   1.464 +    }
   1.465 +
   1.466 +    if ((This->fChecks) & USPOOF_MIXED_NUMBERS) {
   1.467 +        const UnicodeSet *numerics = identifierInfo->getNumerics();
   1.468 +        if (numerics->size() > 1) {
   1.469 +            result |= USPOOF_MIXED_NUMBERS;
   1.470 +        }
   1.471 +
   1.472 +        // TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier.
   1.473 +        //       We have no easy way to do the same in C.
   1.474 +        // if (checkResult != null) {
   1.475 +        //     checkResult.numerics = numerics;
   1.476 +        // }
   1.477 +    }
   1.478 +
   1.479 +
   1.480 +    if (This->fChecks & (USPOOF_CHAR_LIMIT)) {
   1.481 +        int32_t i;
   1.482 +        UChar32 c;
   1.483 +        int32_t length = id.length();
   1.484 +        for (i=0; i<length ;) {
   1.485 +            c = id.char32At(i);
   1.486 +            i += U16_LENGTH(c);
   1.487 +            if (!This->fAllowedCharsSet->contains(c)) {
   1.488 +                result |= USPOOF_CHAR_LIMIT;
   1.489 +                break;
   1.490 +            }
   1.491 +        }
   1.492 +    }
   1.493 +
   1.494 +    if (This->fChecks & 
   1.495 +        (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
   1.496 +        // These are the checks that need to be done on NFD input
   1.497 +        UnicodeString nfdText;
   1.498 +        gNfdNormalizer->normalize(id, nfdText, *status);
   1.499 +        int32_t nfdLength = nfdText.length();
   1.500 +
   1.501 +        if (This->fChecks & USPOOF_INVISIBLE) {
   1.502 +           
   1.503 +            // scan for more than one occurence of the same non-spacing mark
   1.504 +            // in a sequence of non-spacing marks.
   1.505 +            int32_t     i;
   1.506 +            UChar32     c;
   1.507 +            UChar32     firstNonspacingMark = 0;
   1.508 +            UBool       haveMultipleMarks = FALSE;  
   1.509 +            UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.
   1.510 +            
   1.511 +            for (i=0; i<nfdLength ;) {
   1.512 +                c = nfdText.char32At(i);
   1.513 +                i += U16_LENGTH(c);
   1.514 +                if (u_charType(c) != U_NON_SPACING_MARK) {
   1.515 +                    firstNonspacingMark = 0;
   1.516 +                    if (haveMultipleMarks) {
   1.517 +                        marksSeenSoFar.clear();
   1.518 +                        haveMultipleMarks = FALSE;
   1.519 +                    }
   1.520 +                    continue;
   1.521 +                }
   1.522 +                if (firstNonspacingMark == 0) {
   1.523 +                    firstNonspacingMark = c;
   1.524 +                    continue;
   1.525 +                }
   1.526 +                if (!haveMultipleMarks) {
   1.527 +                    marksSeenSoFar.add(firstNonspacingMark);
   1.528 +                    haveMultipleMarks = TRUE;
   1.529 +                }
   1.530 +                if (marksSeenSoFar.contains(c)) {
   1.531 +                    // report the error, and stop scanning.
   1.532 +                    // No need to find more than the first failure.
   1.533 +                    result |= USPOOF_INVISIBLE;
   1.534 +                    break;
   1.535 +                }
   1.536 +                marksSeenSoFar.add(c);
   1.537 +            }
   1.538 +        }
   1.539 +       
   1.540 +        
   1.541 +        if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
   1.542 +            // The basic test is the same for both whole and mixed script confusables.
   1.543 +            // Compute the set of scripts that every input character has a confusable in.
   1.544 +            // For this computation an input character is always considered to be
   1.545 +            // confusable with itself in its own script.
   1.546 +            //
   1.547 +            // If the number of such scripts is two or more, and the input consisted of
   1.548 +            // characters all from a single script, we have a whole script confusable.
   1.549 +            // (The two scripts will be the original script and the one that is confusable)
   1.550 +            //
   1.551 +            // If the number of such scripts >= one, and the original input contained characters from
   1.552 +            // more than one script, we have a mixed script confusable.  (We can transform
   1.553 +            // some of the characters, and end up with a visually similar string all in
   1.554 +            // one script.)
   1.555 +
   1.556 +            if (identifierInfo == NULL) {
   1.557 +                identifierInfo = This->getIdentifierInfo(*status);
   1.558 +                if (U_FAILURE(*status)) {
   1.559 +                    goto cleanupAndReturn;
   1.560 +                }
   1.561 +                identifierInfo->setIdentifier(id, *status);
   1.562 +            }
   1.563 +
   1.564 +            int32_t scriptCount = identifierInfo->getScriptCount();
   1.565 +            
   1.566 +            ScriptSet scripts;
   1.567 +            This->wholeScriptCheck(nfdText, &scripts, *status);
   1.568 +            int32_t confusableScriptCount = scripts.countMembers();
   1.569 +            //printf("confusableScriptCount = %d\n", confusableScriptCount);
   1.570 +            
   1.571 +            if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
   1.572 +                confusableScriptCount >= 2 &&
   1.573 +                scriptCount == 1) {
   1.574 +                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
   1.575 +            }
   1.576 +        
   1.577 +            if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
   1.578 +                confusableScriptCount >= 1 &&
   1.579 +                scriptCount > 1) {
   1.580 +                result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
   1.581 +            }
   1.582 +        }
   1.583 +    }
   1.584 +
   1.585 +cleanupAndReturn:
   1.586 +    This->releaseIdentifierInfo(identifierInfo);
   1.587 +    if (position != NULL) {
   1.588 +        *position = 0;
   1.589 +    }
   1.590 +    return result;
   1.591 +}
   1.592 +
   1.593 +
   1.594 +U_CAPI int32_t U_EXPORT2
   1.595 +uspoof_getSkeleton(const USpoofChecker *sc,
   1.596 +                   uint32_t type,
   1.597 +                   const UChar *id,  int32_t length,
   1.598 +                   UChar *dest, int32_t destCapacity,
   1.599 +                   UErrorCode *status) {
   1.600 +
   1.601 +    SpoofImpl::validateThis(sc, *status);
   1.602 +    if (U_FAILURE(*status)) {
   1.603 +        return 0;
   1.604 +    }
   1.605 +    if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) {
   1.606 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.607 +        return 0;
   1.608 +    }
   1.609 +
   1.610 +    UnicodeString idStr((length==-1), id, length);  // Aliasing constructor
   1.611 +    UnicodeString destStr;
   1.612 +    uspoof_getSkeletonUnicodeString(sc, type, idStr, destStr, status);
   1.613 +    destStr.extract(dest, destCapacity, *status);
   1.614 +    return destStr.length();
   1.615 +}
   1.616 +
   1.617 +
   1.618 +
   1.619 +U_I18N_API UnicodeString &  U_EXPORT2
   1.620 +uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
   1.621 +                                uint32_t type,
   1.622 +                                const UnicodeString &id,
   1.623 +                                UnicodeString &dest,
   1.624 +                                UErrorCode *status) {
   1.625 +    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
   1.626 +    if (U_FAILURE(*status)) {
   1.627 +        return dest;
   1.628 +    }
   1.629 +
   1.630 +   int32_t tableMask = 0;
   1.631 +   switch (type) {
   1.632 +      case 0:
   1.633 +        tableMask = USPOOF_ML_TABLE_FLAG;
   1.634 +        break;
   1.635 +      case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
   1.636 +        tableMask = USPOOF_SL_TABLE_FLAG;
   1.637 +        break;
   1.638 +      case USPOOF_ANY_CASE:
   1.639 +        tableMask = USPOOF_MA_TABLE_FLAG;
   1.640 +        break;
   1.641 +      case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
   1.642 +        tableMask = USPOOF_SA_TABLE_FLAG;
   1.643 +        break;
   1.644 +      default:
   1.645 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.646 +        return dest;
   1.647 +    }
   1.648 +
   1.649 +    UnicodeString nfdId;
   1.650 +    gNfdNormalizer->normalize(id, nfdId, *status);
   1.651 +
   1.652 +    // Apply the skeleton mapping to the NFD normalized input string
   1.653 +    // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
   1.654 +    int32_t inputIndex = 0;
   1.655 +    UnicodeString skelStr;
   1.656 +    int32_t normalizedLen = nfdId.length();
   1.657 +    for (inputIndex=0; inputIndex < normalizedLen; ) {
   1.658 +        UChar32 c = nfdId.char32At(inputIndex);
   1.659 +        inputIndex += U16_LENGTH(c);
   1.660 +        This->confusableLookup(c, tableMask, skelStr);
   1.661 +    }
   1.662 +
   1.663 +    gNfdNormalizer->normalize(skelStr, dest, *status);
   1.664 +    return dest;
   1.665 +}
   1.666 +
   1.667 +
   1.668 +U_CAPI int32_t U_EXPORT2
   1.669 +uspoof_getSkeletonUTF8(const USpoofChecker *sc,
   1.670 +                       uint32_t type,
   1.671 +                       const char *id,  int32_t length,
   1.672 +                       char *dest, int32_t destCapacity,
   1.673 +                       UErrorCode *status) {
   1.674 +    SpoofImpl::validateThis(sc, *status);
   1.675 +    if (U_FAILURE(*status)) {
   1.676 +        return 0;
   1.677 +    }
   1.678 +    if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) {
   1.679 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.680 +        return 0;
   1.681 +    }
   1.682 +
   1.683 +    UnicodeString srcStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id)));
   1.684 +    UnicodeString destStr;
   1.685 +    uspoof_getSkeletonUnicodeString(sc, type, srcStr, destStr, status);
   1.686 +    if (U_FAILURE(*status)) {
   1.687 +        return 0;
   1.688 +    }
   1.689 +
   1.690 +    int32_t lengthInUTF8 = 0;
   1.691 +    u_strToUTF8(dest, destCapacity, &lengthInUTF8,
   1.692 +                destStr.getBuffer(), destStr.length(), status);
   1.693 +    return lengthInUTF8;
   1.694 +}
   1.695 +
   1.696 +
   1.697 +U_CAPI int32_t U_EXPORT2
   1.698 +uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) {
   1.699 +    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
   1.700 +    if (This == NULL) {
   1.701 +        U_ASSERT(U_FAILURE(*status));
   1.702 +        return 0;
   1.703 +    }
   1.704 +    int32_t dataSize = This->fSpoofData->fRawData->fLength;
   1.705 +    if (capacity < dataSize) {
   1.706 +        *status = U_BUFFER_OVERFLOW_ERROR;
   1.707 +        return dataSize;
   1.708 +    }
   1.709 +    uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);
   1.710 +    return dataSize;
   1.711 +}
   1.712 +
   1.713 +U_CAPI const USet * U_EXPORT2
   1.714 +uspoof_getInclusionSet(UErrorCode *) {
   1.715 +    initializeStatics();
   1.716 +    return gInclusionSet->toUSet();
   1.717 +}
   1.718 +
   1.719 +U_CAPI const USet * U_EXPORT2
   1.720 +uspoof_getRecommendedSet(UErrorCode *) {
   1.721 +    initializeStatics();
   1.722 +    return gRecommendedSet->toUSet();
   1.723 +}
   1.724 +
   1.725 +U_I18N_API const UnicodeSet * U_EXPORT2
   1.726 +uspoof_getInclusionUnicodeSet(UErrorCode *) {
   1.727 +    initializeStatics();
   1.728 +    return gInclusionSet;
   1.729 +}
   1.730 +
   1.731 +U_I18N_API const UnicodeSet * U_EXPORT2
   1.732 +uspoof_getRecommendedUnicodeSet(UErrorCode *) {
   1.733 +    initializeStatics();
   1.734 +    return gRecommendedSet;
   1.735 +}
   1.736 +
   1.737 +
   1.738 +
   1.739 +#endif // !UCONFIG_NO_NORMALIZATION

mercurial