intl/icu/source/common/usprep.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2003-2013, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: usprep.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2003jul2
michael@0 14 * created by: Ram Viswanadha
michael@0 15 */
michael@0 16
michael@0 17 #include "unicode/utypes.h"
michael@0 18
michael@0 19 #if !UCONFIG_NO_IDNA
michael@0 20
michael@0 21 #include "unicode/usprep.h"
michael@0 22
michael@0 23 #include "unicode/unorm.h"
michael@0 24 #include "unicode/ustring.h"
michael@0 25 #include "unicode/uchar.h"
michael@0 26 #include "unicode/uversion.h"
michael@0 27 #include "umutex.h"
michael@0 28 #include "cmemory.h"
michael@0 29 #include "sprpimpl.h"
michael@0 30 #include "ustr_imp.h"
michael@0 31 #include "uhash.h"
michael@0 32 #include "cstring.h"
michael@0 33 #include "udataswp.h"
michael@0 34 #include "ucln_cmn.h"
michael@0 35 #include "ubidi_props.h"
michael@0 36
michael@0 37 U_NAMESPACE_USE
michael@0 38
michael@0 39 U_CDECL_BEGIN
michael@0 40
michael@0 41 /*
michael@0 42 Static cache for already opened StringPrep profiles
michael@0 43 */
michael@0 44 static UHashtable *SHARED_DATA_HASHTABLE = NULL;
michael@0 45 static icu::UInitOnce gSharedDataInitOnce;
michael@0 46
michael@0 47 static UMutex usprepMutex = U_MUTEX_INITIALIZER;
michael@0 48
michael@0 49 /* format version of spp file */
michael@0 50 //static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
michael@0 51
michael@0 52 /* the Unicode version of the sprep data */
michael@0 53 static UVersionInfo dataVersion={ 0, 0, 0, 0 };
michael@0 54
michael@0 55 /* Profile names must be aligned to UStringPrepProfileType: usprep_openByType() uses the enum value as the index into this table */
michael@0 56 static const char * const PROFILE_NAMES[] = {
michael@0 57 "rfc3491", /* USPREP_RFC3491_NAMEPREP */
michael@0 58 "rfc3530cs", /* USPREP_RFC3530_NFS4_CS_PREP */
michael@0 59 "rfc3530csci", /* USPREP_RFC3530_NFS4_CS_PREP_CI */
michael@0 60 "rfc3491", /* USPREP_RFC3530_NFS4_CIS_PREP */
michael@0 61 "rfc3530mixp", /* USPREP_RFC3530_NFS4_MIXED_PREP_PREFIX */
michael@0 62 "rfc3491", /* USPREP_RFC3530_NFS4_MIXED_PREP_SUFFIX */
michael@0 63 "rfc3722", /* USPREP_RFC3722_ISCSI */
michael@0 64 "rfc3920node", /* USPREP_RFC3920_NODEPREP */
michael@0 65 "rfc3920res", /* USPREP_RFC3920_RESOURCEPREP */
michael@0 66 "rfc4011", /* USPREP_RFC4011_MIB */
michael@0 67 "rfc4013", /* USPREP_RFC4013_SASLPREP */
michael@0 68 "rfc4505", /* USPREP_RFC4505_TRACE */
michael@0 69 "rfc4518", /* USPREP_RFC4518_LDAP */
michael@0 70 "rfc4518ci", /* USPREP_RFC4518_LDAP_CI */
michael@0 71 };
michael@0 72
michael@0 73 static UBool U_CALLCONV
michael@0 74 isSPrepAcceptable(void * /* context */,
michael@0 75 const char * /* type */,
michael@0 76 const char * /* name */,
michael@0 77 const UDataInfo *pInfo) {
michael@0 78 if(
michael@0 79 pInfo->size>=20 &&
michael@0 80 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
michael@0 81 pInfo->charsetFamily==U_CHARSET_FAMILY &&
michael@0 82 pInfo->dataFormat[0]==0x53 && /* dataFormat="SPRP" */
michael@0 83 pInfo->dataFormat[1]==0x50 &&
michael@0 84 pInfo->dataFormat[2]==0x52 &&
michael@0 85 pInfo->dataFormat[3]==0x50 &&
michael@0 86 pInfo->formatVersion[0]==3 &&
michael@0 87 pInfo->formatVersion[2]==UTRIE_SHIFT &&
michael@0 88 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
michael@0 89 ) {
michael@0 90 //uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
michael@0 91 uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
michael@0 92 return TRUE;
michael@0 93 } else {
michael@0 94 return FALSE;
michael@0 95 }
michael@0 96 }
michael@0 97
michael@0 98 static int32_t U_CALLCONV
michael@0 99 getSPrepFoldingOffset(uint32_t data) {
michael@0 100
michael@0 101 return (int32_t)data;
michael@0 102
michael@0 103 }
michael@0 104
michael@0 105 /* hashes an entry */
michael@0 106 static int32_t U_CALLCONV
michael@0 107 hashEntry(const UHashTok parm) {
michael@0 108 UStringPrepKey *b = (UStringPrepKey *)parm.pointer;
michael@0 109 UHashTok namekey, pathkey;
michael@0 110 namekey.pointer = b->name;
michael@0 111 pathkey.pointer = b->path;
michael@0 112 return uhash_hashChars(namekey)+37*uhash_hashChars(pathkey);
michael@0 113 }
michael@0 114
michael@0 115 /* compares two entries */
michael@0 116 static UBool U_CALLCONV
michael@0 117 compareEntries(const UHashTok p1, const UHashTok p2) {
michael@0 118 UStringPrepKey *b1 = (UStringPrepKey *)p1.pointer;
michael@0 119 UStringPrepKey *b2 = (UStringPrepKey *)p2.pointer;
michael@0 120 UHashTok name1, name2, path1, path2;
michael@0 121 name1.pointer = b1->name;
michael@0 122 name2.pointer = b2->name;
michael@0 123 path1.pointer = b1->path;
michael@0 124 path2.pointer = b2->path;
michael@0 125 return ((UBool)(uhash_compareChars(name1, name2) &
michael@0 126 uhash_compareChars(path1, path2)));
michael@0 127 }
michael@0 128
michael@0 129 static void
michael@0 130 usprep_unload(UStringPrepProfile* data){
michael@0 131 udata_close(data->sprepData);
michael@0 132 }
michael@0 133
michael@0 134 static int32_t
michael@0 135 usprep_internal_flushCache(UBool noRefCount){
michael@0 136 UStringPrepProfile *profile = NULL;
michael@0 137 UStringPrepKey *key = NULL;
michael@0 138 int32_t pos = -1;
michael@0 139 int32_t deletedNum = 0;
michael@0 140 const UHashElement *e;
michael@0 141
michael@0 142 /*
michael@0 143 * if shared data hasn't even been lazy evaluated yet
michael@0 144 * return 0
michael@0 145 */
michael@0 146 umtx_lock(&usprepMutex);
michael@0 147 if (SHARED_DATA_HASHTABLE == NULL) {
michael@0 148 umtx_unlock(&usprepMutex);
michael@0 149 return 0;
michael@0 150 }
michael@0 151
michael@0 152 /*creates an enumeration to iterate through every element in the table */
michael@0 153 while ((e = uhash_nextElement(SHARED_DATA_HASHTABLE, &pos)) != NULL)
michael@0 154 {
michael@0 155 profile = (UStringPrepProfile *) e->value.pointer;
michael@0 156 key = (UStringPrepKey *) e->key.pointer;
michael@0 157
michael@0 158 if ((noRefCount== FALSE && profile->refCount == 0) ||
michael@0 159 noRefCount== TRUE) {
michael@0 160 deletedNum++;
michael@0 161 uhash_removeElement(SHARED_DATA_HASHTABLE, e);
michael@0 162
michael@0 163 /* unload the data */
michael@0 164 usprep_unload(profile);
michael@0 165
michael@0 166 if(key->name != NULL) {
michael@0 167 uprv_free(key->name);
michael@0 168 key->name=NULL;
michael@0 169 }
michael@0 170 if(key->path != NULL) {
michael@0 171 uprv_free(key->path);
michael@0 172 key->path=NULL;
michael@0 173 }
michael@0 174 uprv_free(profile);
michael@0 175 uprv_free(key);
michael@0 176 }
michael@0 177
michael@0 178 }
michael@0 179 umtx_unlock(&usprepMutex);
michael@0 180
michael@0 181 return deletedNum;
michael@0 182 }
michael@0 183
michael@0 184 /* Works just like ucnv_flushCache()
michael@0 185 static int32_t
michael@0 186 usprep_flushCache(){
michael@0 187 return usprep_internal_flushCache(FALSE);
michael@0 188 }
michael@0 189 */
michael@0 190
michael@0 191 static UBool U_CALLCONV usprep_cleanup(void){
michael@0 192 if (SHARED_DATA_HASHTABLE != NULL) {
michael@0 193 usprep_internal_flushCache(TRUE);
michael@0 194 if (SHARED_DATA_HASHTABLE != NULL && uhash_count(SHARED_DATA_HASHTABLE) == 0) {
michael@0 195 uhash_close(SHARED_DATA_HASHTABLE);
michael@0 196 SHARED_DATA_HASHTABLE = NULL;
michael@0 197 }
michael@0 198 }
michael@0 199 gSharedDataInitOnce.reset();
michael@0 200 return (SHARED_DATA_HASHTABLE == NULL);
michael@0 201 }
michael@0 202 U_CDECL_END
michael@0 203
michael@0 204
michael@0 205 /** Initializes the cache for resources */
michael@0 206 static void U_CALLCONV
michael@0 207 createCache(UErrorCode &status) {
michael@0 208 SHARED_DATA_HASHTABLE = uhash_open(hashEntry, compareEntries, NULL, &status);
michael@0 209 if (U_FAILURE(status)) {
michael@0 210 SHARED_DATA_HASHTABLE = NULL;
michael@0 211 }
michael@0 212 ucln_common_registerCleanup(UCLN_COMMON_USPREP, usprep_cleanup);
michael@0 213 }
michael@0 214
michael@0 215 static void
michael@0 216 initCache(UErrorCode *status) {
michael@0 217 umtx_initOnce(gSharedDataInitOnce, &createCache, *status);
michael@0 218 }
michael@0 219
michael@0 220 static UBool U_CALLCONV
michael@0 221 loadData(UStringPrepProfile* profile,
michael@0 222 const char* path,
michael@0 223 const char* name,
michael@0 224 const char* type,
michael@0 225 UErrorCode* errorCode) {
michael@0 226 /* load Unicode SPREP data from file */
michael@0 227 UTrie _sprepTrie={ 0,0,0,0,0,0,0 };
michael@0 228 UDataMemory *dataMemory;
michael@0 229 const int32_t *p=NULL;
michael@0 230 const uint8_t *pb;
michael@0 231 UVersionInfo normUnicodeVersion;
michael@0 232 int32_t normUniVer, sprepUniVer, normCorrVer;
michael@0 233
michael@0 234 if(errorCode==NULL || U_FAILURE(*errorCode)) {
michael@0 235 return 0;
michael@0 236 }
michael@0 237
michael@0 238 /* open the data outside the mutex block */
michael@0 239 //TODO: change the path
michael@0 240 dataMemory=udata_openChoice(path, type, name, isSPrepAcceptable, NULL, errorCode);
michael@0 241 if(U_FAILURE(*errorCode)) {
michael@0 242 return FALSE;
michael@0 243 }
michael@0 244
michael@0 245 p=(const int32_t *)udata_getMemory(dataMemory);
michael@0 246 pb=(const uint8_t *)(p+_SPREP_INDEX_TOP);
michael@0 247 utrie_unserialize(&_sprepTrie, pb, p[_SPREP_INDEX_TRIE_SIZE], errorCode);
michael@0 248 _sprepTrie.getFoldingOffset=getSPrepFoldingOffset;
michael@0 249
michael@0 250
michael@0 251 if(U_FAILURE(*errorCode)) {
michael@0 252 udata_close(dataMemory);
michael@0 253 return FALSE;
michael@0 254 }
michael@0 255
michael@0 256 /* in the mutex block, set the data for this process */
michael@0 257 umtx_lock(&usprepMutex);
michael@0 258 if(profile->sprepData==NULL) {
michael@0 259 profile->sprepData=dataMemory;
michael@0 260 dataMemory=NULL;
michael@0 261 uprv_memcpy(&profile->indexes, p, sizeof(profile->indexes));
michael@0 262 uprv_memcpy(&profile->sprepTrie, &_sprepTrie, sizeof(UTrie));
michael@0 263 } else {
michael@0 264 p=(const int32_t *)udata_getMemory(profile->sprepData);
michael@0 265 }
michael@0 266 umtx_unlock(&usprepMutex);
michael@0 267 /* initialize some variables */
michael@0 268 profile->mappingData=(uint16_t *)((uint8_t *)(p+_SPREP_INDEX_TOP)+profile->indexes[_SPREP_INDEX_TRIE_SIZE]);
michael@0 269
michael@0 270 u_getUnicodeVersion(normUnicodeVersion);
michael@0 271 normUniVer = (normUnicodeVersion[0] << 24) + (normUnicodeVersion[1] << 16) +
michael@0 272 (normUnicodeVersion[2] << 8 ) + (normUnicodeVersion[3]);
michael@0 273 sprepUniVer = (dataVersion[0] << 24) + (dataVersion[1] << 16) +
michael@0 274 (dataVersion[2] << 8 ) + (dataVersion[3]);
michael@0 275 normCorrVer = profile->indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION];
michael@0 276
michael@0 277 if(U_FAILURE(*errorCode)){
michael@0 278 udata_close(dataMemory);
michael@0 279 return FALSE;
michael@0 280 }
michael@0 281 if( normUniVer < sprepUniVer && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
michael@0 282 normUniVer < normCorrVer && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
michael@0 283 ((profile->indexes[_SPREP_OPTIONS] & _SPREP_NORMALIZATION_ON) > 0) /* normalization turned on*/
michael@0 284 ){
michael@0 285 *errorCode = U_INVALID_FORMAT_ERROR;
michael@0 286 udata_close(dataMemory);
michael@0 287 return FALSE;
michael@0 288 }
michael@0 289 profile->isDataLoaded = TRUE;
michael@0 290
michael@0 291 /* if a different thread set it first, then close the extra data */
michael@0 292 if(dataMemory!=NULL) {
michael@0 293 udata_close(dataMemory); /* NULL if it was set correctly */
michael@0 294 }
michael@0 295
michael@0 296
michael@0 297 return profile->isDataLoaded;
michael@0 298 }
michael@0 299
michael@0 300 static UStringPrepProfile*
michael@0 301 usprep_getProfile(const char* path,
michael@0 302 const char* name,
michael@0 303 UErrorCode *status){
michael@0 304
michael@0 305 UStringPrepProfile* profile = NULL;
michael@0 306
michael@0 307 initCache(status);
michael@0 308
michael@0 309 if(U_FAILURE(*status)){
michael@0 310 return NULL;
michael@0 311 }
michael@0 312
michael@0 313 UStringPrepKey stackKey;
michael@0 314 /*
michael@0 315 * const is cast way to save malloc, strcpy and free calls
michael@0 316 * we use the passed in pointers for fetching the data from the
michael@0 317 * hash table which is safe
michael@0 318 */
michael@0 319 stackKey.name = (char*) name;
michael@0 320 stackKey.path = (char*) path;
michael@0 321
michael@0 322 /* fetch the data from the cache */
michael@0 323 umtx_lock(&usprepMutex);
michael@0 324 profile = (UStringPrepProfile*) (uhash_get(SHARED_DATA_HASHTABLE,&stackKey));
michael@0 325 if(profile != NULL) {
michael@0 326 profile->refCount++;
michael@0 327 }
michael@0 328 umtx_unlock(&usprepMutex);
michael@0 329
michael@0 330 if(profile == NULL) {
michael@0 331 /* else load the data and put the data in the cache */
michael@0 332 LocalMemory<UStringPrepProfile> newProfile;
michael@0 333 if(newProfile.allocateInsteadAndReset() == NULL) {
michael@0 334 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 335 return NULL;
michael@0 336 }
michael@0 337
michael@0 338 /* load the data */
michael@0 339 if(!loadData(newProfile.getAlias(), path, name, _SPREP_DATA_TYPE, status) || U_FAILURE(*status) ){
michael@0 340 return NULL;
michael@0 341 }
michael@0 342
michael@0 343 /* get the options */
michael@0 344 newProfile->doNFKC = (UBool)((newProfile->indexes[_SPREP_OPTIONS] & _SPREP_NORMALIZATION_ON) > 0);
michael@0 345 newProfile->checkBiDi = (UBool)((newProfile->indexes[_SPREP_OPTIONS] & _SPREP_CHECK_BIDI_ON) > 0);
michael@0 346
michael@0 347 if(newProfile->checkBiDi) {
michael@0 348 newProfile->bdp = ubidi_getSingleton();
michael@0 349 }
michael@0 350
michael@0 351 LocalMemory<UStringPrepKey> key;
michael@0 352 LocalMemory<char> keyName;
michael@0 353 LocalMemory<char> keyPath;
michael@0 354 if( key.allocateInsteadAndReset() == NULL ||
michael@0 355 keyName.allocateInsteadAndCopy(uprv_strlen(name)+1) == NULL ||
michael@0 356 (path != NULL &&
michael@0 357 keyPath.allocateInsteadAndCopy(uprv_strlen(path)+1) == NULL)
michael@0 358 ) {
michael@0 359 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 360 usprep_unload(newProfile.getAlias());
michael@0 361 return NULL;
michael@0 362 }
michael@0 363
michael@0 364 umtx_lock(&usprepMutex);
michael@0 365 // If another thread already inserted the same key/value, refcount and cleanup our thread data
michael@0 366 profile = (UStringPrepProfile*) (uhash_get(SHARED_DATA_HASHTABLE,&stackKey));
michael@0 367 if(profile != NULL) {
michael@0 368 profile->refCount++;
michael@0 369 usprep_unload(newProfile.getAlias());
michael@0 370 }
michael@0 371 else {
michael@0 372 /* initialize the key members */
michael@0 373 key->name = keyName.orphan();
michael@0 374 uprv_strcpy(key->name, name);
michael@0 375 if(path != NULL){
michael@0 376 key->path = keyPath.orphan();
michael@0 377 uprv_strcpy(key->path, path);
michael@0 378 }
michael@0 379 profile = newProfile.orphan();
michael@0 380
michael@0 381 /* add the data object to the cache */
michael@0 382 profile->refCount = 1;
michael@0 383 uhash_put(SHARED_DATA_HASHTABLE, key.orphan(), profile, status);
michael@0 384 }
michael@0 385 umtx_unlock(&usprepMutex);
michael@0 386 }
michael@0 387
michael@0 388 return profile;
michael@0 389 }
michael@0 390
michael@0 391 U_CAPI UStringPrepProfile* U_EXPORT2
michael@0 392 usprep_open(const char* path,
michael@0 393 const char* name,
michael@0 394 UErrorCode* status){
michael@0 395
michael@0 396 if(status == NULL || U_FAILURE(*status)){
michael@0 397 return NULL;
michael@0 398 }
michael@0 399
michael@0 400 /* initialize the profile struct members */
michael@0 401 return usprep_getProfile(path,name,status);
michael@0 402 }
michael@0 403
michael@0 404 U_CAPI UStringPrepProfile* U_EXPORT2
michael@0 405 usprep_openByType(UStringPrepProfileType type,
michael@0 406 UErrorCode* status) {
michael@0 407 if(status == NULL || U_FAILURE(*status)){
michael@0 408 return NULL;
michael@0 409 }
michael@0 410 int32_t index = (int32_t)type;
michael@0 411 if (index < 0 || index >= (int32_t)(sizeof(PROFILE_NAMES)/sizeof(PROFILE_NAMES[0]))) {
michael@0 412 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 413 return NULL;
michael@0 414 }
michael@0 415 return usprep_open(NULL, PROFILE_NAMES[index], status);
michael@0 416 }
michael@0 417
michael@0 418 U_CAPI void U_EXPORT2
michael@0 419 usprep_close(UStringPrepProfile* profile){
michael@0 420 if(profile==NULL){
michael@0 421 return;
michael@0 422 }
michael@0 423
michael@0 424 umtx_lock(&usprepMutex);
michael@0 425 /* decrement the ref count*/
michael@0 426 if(profile->refCount > 0){
michael@0 427 profile->refCount--;
michael@0 428 }
michael@0 429 umtx_unlock(&usprepMutex);
michael@0 430
michael@0 431 }
michael@0 432
michael@0 433 U_CFUNC void
michael@0 434 uprv_syntaxError(const UChar* rules,
michael@0 435 int32_t pos,
michael@0 436 int32_t rulesLen,
michael@0 437 UParseError* parseError){
michael@0 438 if(parseError == NULL){
michael@0 439 return;
michael@0 440 }
michael@0 441 parseError->offset = pos;
michael@0 442 parseError->line = 0 ; // we are not using line numbers
michael@0 443
michael@0 444 // for pre-context
michael@0 445 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
michael@0 446 int32_t limit = pos;
michael@0 447
michael@0 448 u_memcpy(parseError->preContext,rules+start,limit-start);
michael@0 449 //null terminate the buffer
michael@0 450 parseError->preContext[limit-start] = 0;
michael@0 451
michael@0 452 // for post-context; include error rules[pos]
michael@0 453 start = pos;
michael@0 454 limit = start + (U_PARSE_CONTEXT_LEN-1);
michael@0 455 if (limit > rulesLen) {
michael@0 456 limit = rulesLen;
michael@0 457 }
michael@0 458 if (start < rulesLen) {
michael@0 459 u_memcpy(parseError->postContext,rules+start,limit-start);
michael@0 460 }
michael@0 461 //null terminate the buffer
michael@0 462 parseError->postContext[limit-start]= 0;
michael@0 463 }
michael@0 464
michael@0 465
michael@0 466 static inline UStringPrepType
michael@0 467 getValues(uint16_t trieWord, int16_t& value, UBool& isIndex){
michael@0 468
michael@0 469 UStringPrepType type;
michael@0 470 if(trieWord == 0){
michael@0 471 /*
michael@0 472 * Initial value stored in the mapping table
michael@0 473 * just return USPREP_TYPE_LIMIT .. so that
michael@0 474 * the source codepoint is copied to the destination
michael@0 475 */
michael@0 476 type = USPREP_TYPE_LIMIT;
michael@0 477 isIndex =FALSE;
michael@0 478 value = 0;
michael@0 479 }else if(trieWord >= _SPREP_TYPE_THRESHOLD){
michael@0 480 type = (UStringPrepType) (trieWord - _SPREP_TYPE_THRESHOLD);
michael@0 481 isIndex =FALSE;
michael@0 482 value = 0;
michael@0 483 }else{
michael@0 484 /* get the type */
michael@0 485 type = USPREP_MAP;
michael@0 486 /* ascertain if the value is index or delta */
michael@0 487 if(trieWord & 0x02){
michael@0 488 isIndex = TRUE;
michael@0 489 value = trieWord >> 2; //mask off the lower 2 bits and shift
michael@0 490 }else{
michael@0 491 isIndex = FALSE;
michael@0 492 value = (int16_t)trieWord;
michael@0 493 value = (value >> 2);
michael@0 494 }
michael@0 495
michael@0 496 if((trieWord>>2) == _SPREP_MAX_INDEX_VALUE){
michael@0 497 type = USPREP_DELETE;
michael@0 498 isIndex =FALSE;
michael@0 499 value = 0;
michael@0 500 }
michael@0 501 }
michael@0 502 return type;
michael@0 503 }
michael@0 504
michael@0 505
michael@0 506
michael@0 507 static int32_t
michael@0 508 usprep_map( const UStringPrepProfile* profile,
michael@0 509 const UChar* src, int32_t srcLength,
michael@0 510 UChar* dest, int32_t destCapacity,
michael@0 511 int32_t options,
michael@0 512 UParseError* parseError,
michael@0 513 UErrorCode* status ){
michael@0 514
michael@0 515 uint16_t result;
michael@0 516 int32_t destIndex=0;
michael@0 517 int32_t srcIndex;
michael@0 518 UBool allowUnassigned = (UBool) ((options & USPREP_ALLOW_UNASSIGNED)>0);
michael@0 519 UStringPrepType type;
michael@0 520 int16_t value;
michael@0 521 UBool isIndex;
michael@0 522 const int32_t* indexes = profile->indexes;
michael@0 523
michael@0 524 // no error checking the caller check for error and arguments
michael@0 525 // no string length check the caller finds out the string length
michael@0 526
michael@0 527 for(srcIndex=0;srcIndex<srcLength;){
michael@0 528 UChar32 ch;
michael@0 529
michael@0 530 U16_NEXT(src,srcIndex,srcLength,ch);
michael@0 531
michael@0 532 result=0;
michael@0 533
michael@0 534 UTRIE_GET16(&profile->sprepTrie,ch,result);
michael@0 535
michael@0 536 type = getValues(result, value, isIndex);
michael@0 537
michael@0 538 // check if the source codepoint is unassigned
michael@0 539 if(type == USPREP_UNASSIGNED && allowUnassigned == FALSE){
michael@0 540
michael@0 541 uprv_syntaxError(src,srcIndex-U16_LENGTH(ch), srcLength,parseError);
michael@0 542 *status = U_STRINGPREP_UNASSIGNED_ERROR;
michael@0 543 return 0;
michael@0 544
michael@0 545 }else if(type == USPREP_MAP){
michael@0 546
michael@0 547 int32_t index, length;
michael@0 548
michael@0 549 if(isIndex){
michael@0 550 index = value;
michael@0 551 if(index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] &&
michael@0 552 index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){
michael@0 553 length = 1;
michael@0 554 }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] &&
michael@0 555 index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){
michael@0 556 length = 2;
michael@0 557 }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] &&
michael@0 558 index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){
michael@0 559 length = 3;
michael@0 560 }else{
michael@0 561 length = profile->mappingData[index++];
michael@0 562
michael@0 563 }
michael@0 564
michael@0 565 /* copy mapping to destination */
michael@0 566 for(int32_t i=0; i< length; i++){
michael@0 567 if(destIndex < destCapacity ){
michael@0 568 dest[destIndex] = profile->mappingData[index+i];
michael@0 569 }
michael@0 570 destIndex++; /* for pre-flighting */
michael@0 571 }
michael@0 572 continue;
michael@0 573 }else{
michael@0 574 // subtract the delta to arrive at the code point
michael@0 575 ch -= value;
michael@0 576 }
michael@0 577
michael@0 578 }else if(type==USPREP_DELETE){
michael@0 579 // just consume the codepoint and contine
michael@0 580 continue;
michael@0 581 }
michael@0 582 //copy the code point into destination
michael@0 583 if(ch <= 0xFFFF){
michael@0 584 if(destIndex < destCapacity ){
michael@0 585 dest[destIndex] = (UChar)ch;
michael@0 586 }
michael@0 587 destIndex++;
michael@0 588 }else{
michael@0 589 if(destIndex+1 < destCapacity ){
michael@0 590 dest[destIndex] = U16_LEAD(ch);
michael@0 591 dest[destIndex+1] = U16_TRAIL(ch);
michael@0 592 }
michael@0 593 destIndex +=2;
michael@0 594 }
michael@0 595
michael@0 596 }
michael@0 597
michael@0 598 return u_terminateUChars(dest, destCapacity, destIndex, status);
michael@0 599 }
michael@0 600
michael@0 601
michael@0 602 static int32_t
michael@0 603 usprep_normalize( const UChar* src, int32_t srcLength,
michael@0 604 UChar* dest, int32_t destCapacity,
michael@0 605 UErrorCode* status ){
michael@0 606 return unorm_normalize(
michael@0 607 src, srcLength,
michael@0 608 UNORM_NFKC, UNORM_UNICODE_3_2,
michael@0 609 dest, destCapacity,
michael@0 610 status);
michael@0 611 }
michael@0 612
michael@0 613
michael@0 614 /*
michael@0 615 1) Map -- For each character in the input, check if it has a mapping
michael@0 616 and, if so, replace it with its mapping.
michael@0 617
michael@0 618 2) Normalize -- Possibly normalize the result of step 1 using Unicode
michael@0 619 normalization.
michael@0 620
michael@0 621 3) Prohibit -- Check for any characters that are not allowed in the
michael@0 622 output. If any are found, return an error.
michael@0 623
michael@0 624 4) Check bidi -- Possibly check for right-to-left characters, and if
michael@0 625 any are found, make sure that the whole string satisfies the
michael@0 626 requirements for bidirectional strings. If the string does not
michael@0 627 satisfy the requirements for bidirectional strings, return an
michael@0 628 error.
michael@0 629 [Unicode3.2] defines several bidirectional categories; each character
michael@0 630 has one bidirectional category assigned to it. For the purposes of
michael@0 631 the requirements below, an "RandALCat character" is a character that
michael@0 632 has Unicode bidirectional categories "R" or "AL"; an "LCat character"
michael@0 633 is a character that has Unicode bidirectional category "L". Note
michael@0 634
michael@0 635
michael@0 636 that there are many characters which fall in neither of the above
michael@0 637 definitions; Latin digits (<U+0030> through <U+0039>) are examples of
michael@0 638 this because they have bidirectional category "EN".
michael@0 639
michael@0 640 In any profile that specifies bidirectional character handling, all
michael@0 641 three of the following requirements MUST be met:
michael@0 642
michael@0 643 1) The characters in section 5.8 MUST be prohibited.
michael@0 644
michael@0 645 2) If a string contains any RandALCat character, the string MUST NOT
michael@0 646 contain any LCat character.
michael@0 647
michael@0 648 3) If a string contains any RandALCat character, a RandALCat
michael@0 649 character MUST be the first character of the string, and a
michael@0 650 RandALCat character MUST be the last character of the string.
michael@0 651 */
michael@0 652
michael@0 653 #define MAX_STACK_BUFFER_SIZE 300
michael@0 654
michael@0 655
michael@0 656 U_CAPI int32_t U_EXPORT2
michael@0 657 usprep_prepare( const UStringPrepProfile* profile,
michael@0 658 const UChar* src, int32_t srcLength,
michael@0 659 UChar* dest, int32_t destCapacity,
michael@0 660 int32_t options,
michael@0 661 UParseError* parseError,
michael@0 662 UErrorCode* status ){
michael@0 663
michael@0 664 // check error status
michael@0 665 if(status == NULL || U_FAILURE(*status)){
michael@0 666 return 0;
michael@0 667 }
michael@0 668
michael@0 669 //check arguments
michael@0 670 if(profile==NULL || src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
michael@0 671 *status=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 672 return 0;
michael@0 673 }
michael@0 674
michael@0 675 UChar b1Stack[MAX_STACK_BUFFER_SIZE], b2Stack[MAX_STACK_BUFFER_SIZE];
michael@0 676 UChar *b1 = b1Stack, *b2 = b2Stack;
michael@0 677 int32_t b1Len, b2Len=0,
michael@0 678 b1Capacity = MAX_STACK_BUFFER_SIZE ,
michael@0 679 b2Capacity = MAX_STACK_BUFFER_SIZE;
michael@0 680 uint16_t result;
michael@0 681 int32_t b2Index = 0;
michael@0 682 UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT;
michael@0 683 UBool leftToRight=FALSE, rightToLeft=FALSE;
michael@0 684 int32_t rtlPos =-1, ltrPos =-1;
michael@0 685
michael@0 686 //get the string length
michael@0 687 if(srcLength == -1){
michael@0 688 srcLength = u_strlen(src);
michael@0 689 }
michael@0 690 // map
michael@0 691 b1Len = usprep_map(profile, src, srcLength, b1, b1Capacity, options, parseError, status);
michael@0 692
michael@0 693 if(*status == U_BUFFER_OVERFLOW_ERROR){
michael@0 694 // redo processing of string
michael@0 695 /* we do not have enough room so grow the buffer*/
michael@0 696 b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
michael@0 697 if(b1==NULL){
michael@0 698 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 699 goto CLEANUP;
michael@0 700 }
michael@0 701
michael@0 702 *status = U_ZERO_ERROR; // reset error
michael@0 703
michael@0 704 b1Len = usprep_map(profile, src, srcLength, b1, b1Len, options, parseError, status);
michael@0 705
michael@0 706 }
michael@0 707
michael@0 708 // normalize
michael@0 709 if(profile->doNFKC == TRUE){
michael@0 710 b2Len = usprep_normalize(b1,b1Len, b2,b2Capacity,status);
michael@0 711
michael@0 712 if(*status == U_BUFFER_OVERFLOW_ERROR){
michael@0 713 // redo processing of string
michael@0 714 /* we do not have enough room so grow the buffer*/
michael@0 715 b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
michael@0 716 if(b2==NULL){
michael@0 717 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 718 goto CLEANUP;
michael@0 719 }
michael@0 720
michael@0 721 *status = U_ZERO_ERROR; // reset error
michael@0 722
michael@0 723 b2Len = usprep_normalize(b1,b1Len, b2,b2Len,status);
michael@0 724
michael@0 725 }
michael@0 726
michael@0 727 }else{
michael@0 728 b2 = b1;
michael@0 729 b2Len = b1Len;
michael@0 730 }
michael@0 731
michael@0 732
michael@0 733 if(U_FAILURE(*status)){
michael@0 734 goto CLEANUP;
michael@0 735 }
michael@0 736
michael@0 737 UChar32 ch;
michael@0 738 UStringPrepType type;
michael@0 739 int16_t value;
michael@0 740 UBool isIndex;
michael@0 741
michael@0 742 // Prohibit and checkBiDi in one pass
michael@0 743 for(b2Index=0; b2Index<b2Len;){
michael@0 744
michael@0 745 ch = 0;
michael@0 746
michael@0 747 U16_NEXT(b2, b2Index, b2Len, ch);
michael@0 748
michael@0 749 UTRIE_GET16(&profile->sprepTrie,ch,result);
michael@0 750
michael@0 751 type = getValues(result, value, isIndex);
michael@0 752
michael@0 753 if( type == USPREP_PROHIBITED ||
michael@0 754 ((result < _SPREP_TYPE_THRESHOLD) && (result & 0x01) /* first bit says it the code point is prohibited*/)
michael@0 755 ){
michael@0 756 *status = U_STRINGPREP_PROHIBITED_ERROR;
michael@0 757 uprv_syntaxError(b1, b2Index-U16_LENGTH(ch), b2Len, parseError);
michael@0 758 goto CLEANUP;
michael@0 759 }
michael@0 760
michael@0 761 if(profile->checkBiDi) {
michael@0 762 direction = ubidi_getClass(profile->bdp, ch);
michael@0 763 if(firstCharDir == U_CHAR_DIRECTION_COUNT){
michael@0 764 firstCharDir = direction;
michael@0 765 }
michael@0 766 if(direction == U_LEFT_TO_RIGHT){
michael@0 767 leftToRight = TRUE;
michael@0 768 ltrPos = b2Index-1;
michael@0 769 }
michael@0 770 if(direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC){
michael@0 771 rightToLeft = TRUE;
michael@0 772 rtlPos = b2Index-1;
michael@0 773 }
michael@0 774 }
michael@0 775 }
michael@0 776 if(profile->checkBiDi == TRUE){
michael@0 777 // satisfy 2
michael@0 778 if( leftToRight == TRUE && rightToLeft == TRUE){
michael@0 779 *status = U_STRINGPREP_CHECK_BIDI_ERROR;
michael@0 780 uprv_syntaxError(b2,(rtlPos>ltrPos) ? rtlPos : ltrPos, b2Len, parseError);
michael@0 781 goto CLEANUP;
michael@0 782 }
michael@0 783
michael@0 784 //satisfy 3
michael@0 785 if( rightToLeft == TRUE &&
michael@0 786 !((firstCharDir == U_RIGHT_TO_LEFT || firstCharDir == U_RIGHT_TO_LEFT_ARABIC) &&
michael@0 787 (direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC))
michael@0 788 ){
michael@0 789 *status = U_STRINGPREP_CHECK_BIDI_ERROR;
michael@0 790 uprv_syntaxError(b2, rtlPos, b2Len, parseError);
michael@0 791 return FALSE;
michael@0 792 }
michael@0 793 }
michael@0 794 if(b2Len>0 && b2Len <= destCapacity){
michael@0 795 uprv_memmove(dest,b2, b2Len*U_SIZEOF_UCHAR);
michael@0 796 }
michael@0 797
michael@0 798 CLEANUP:
michael@0 799 if(b1!=b1Stack){
michael@0 800 uprv_free(b1);
michael@0 801 b1=NULL;
michael@0 802 }
michael@0 803
michael@0 804 if(b2!=b1Stack && b2!=b2Stack && b2!=b1 /* b1 should not be freed twice */){
michael@0 805 uprv_free(b2);
michael@0 806 b2=NULL;
michael@0 807 }
michael@0 808 return u_terminateUChars(dest, destCapacity, b2Len, status);
michael@0 809 }
michael@0 810
michael@0 811
michael@0 812 /* data swapping ------------------------------------------------------------ */
michael@0 813
michael@0 814 U_CAPI int32_t U_EXPORT2
michael@0 815 usprep_swap(const UDataSwapper *ds,
michael@0 816 const void *inData, int32_t length, void *outData,
michael@0 817 UErrorCode *pErrorCode) {
michael@0 818 const UDataInfo *pInfo;
michael@0 819 int32_t headerSize;
michael@0 820
michael@0 821 const uint8_t *inBytes;
michael@0 822 uint8_t *outBytes;
michael@0 823
michael@0 824 const int32_t *inIndexes;
michael@0 825 int32_t indexes[16];
michael@0 826
michael@0 827 int32_t i, offset, count, size;
michael@0 828
michael@0 829 /* udata_swapDataHeader checks the arguments */
michael@0 830 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
michael@0 831 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
michael@0 832 return 0;
michael@0 833 }
michael@0 834
michael@0 835 /* check data format and format version */
michael@0 836 pInfo=(const UDataInfo *)((const char *)inData+4);
michael@0 837 if(!(
michael@0 838 pInfo->dataFormat[0]==0x53 && /* dataFormat="SPRP" */
michael@0 839 pInfo->dataFormat[1]==0x50 &&
michael@0 840 pInfo->dataFormat[2]==0x52 &&
michael@0 841 pInfo->dataFormat[3]==0x50 &&
michael@0 842 pInfo->formatVersion[0]==3
michael@0 843 )) {
michael@0 844 udata_printError(ds, "usprep_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as StringPrep .spp data\n",
michael@0 845 pInfo->dataFormat[0], pInfo->dataFormat[1],
michael@0 846 pInfo->dataFormat[2], pInfo->dataFormat[3],
michael@0 847 pInfo->formatVersion[0]);
michael@0 848 *pErrorCode=U_UNSUPPORTED_ERROR;
michael@0 849 return 0;
michael@0 850 }
michael@0 851
michael@0 852 inBytes=(const uint8_t *)inData+headerSize;
michael@0 853 outBytes=(uint8_t *)outData+headerSize;
michael@0 854
michael@0 855 inIndexes=(const int32_t *)inBytes;
michael@0 856
michael@0 857 if(length>=0) {
michael@0 858 length-=headerSize;
michael@0 859 if(length<16*4) {
michael@0 860 udata_printError(ds, "usprep_swap(): too few bytes (%d after header) for StringPrep .spp data\n",
michael@0 861 length);
michael@0 862 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 863 return 0;
michael@0 864 }
michael@0 865 }
michael@0 866
michael@0 867 /* read the first 16 indexes (ICU 2.8/format version 3: _SPREP_INDEX_TOP==16, might grow) */
michael@0 868 for(i=0; i<16; ++i) {
michael@0 869 indexes[i]=udata_readInt32(ds, inIndexes[i]);
michael@0 870 }
michael@0 871
michael@0 872 /* calculate the total length of the data */
michael@0 873 size=
michael@0 874 16*4+ /* size of indexes[] */
michael@0 875 indexes[_SPREP_INDEX_TRIE_SIZE]+
michael@0 876 indexes[_SPREP_INDEX_MAPPING_DATA_SIZE];
michael@0 877
michael@0 878 if(length>=0) {
michael@0 879 if(length<size) {
michael@0 880 udata_printError(ds, "usprep_swap(): too few bytes (%d after header) for all of StringPrep .spp data\n",
michael@0 881 length);
michael@0 882 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 883 return 0;
michael@0 884 }
michael@0 885
michael@0 886 /* copy the data for inaccessible bytes */
michael@0 887 if(inBytes!=outBytes) {
michael@0 888 uprv_memcpy(outBytes, inBytes, size);
michael@0 889 }
michael@0 890
michael@0 891 offset=0;
michael@0 892
michael@0 893 /* swap the int32_t indexes[] */
michael@0 894 count=16*4;
michael@0 895 ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
michael@0 896 offset+=count;
michael@0 897
michael@0 898 /* swap the UTrie */
michael@0 899 count=indexes[_SPREP_INDEX_TRIE_SIZE];
michael@0 900 utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
michael@0 901 offset+=count;
michael@0 902
michael@0 903 /* swap the uint16_t mappingTable[] */
michael@0 904 count=indexes[_SPREP_INDEX_MAPPING_DATA_SIZE];
michael@0 905 ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
michael@0 906 offset+=count;
michael@0 907 }
michael@0 908
michael@0 909 return headerSize+size;
michael@0 910 }
michael@0 911
michael@0 912 #endif /* #if !UCONFIG_NO_IDNA */

mercurial