1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/ucol_elm.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,2070 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2001-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: ucaelems.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created 02/22/2001 1.17 +* created by: Vladimir Weinstein 1.18 +* 1.19 +* This program reads the Franctional UCA table and generates 1.20 +* internal format for UCA table as well as inverse UCA table. 1.21 +* It then writes binary files containing the data: ucadata.dat 1.22 +* & invuca.dat 1.23 +* 1.24 +* date name comments 1.25 +* 03/02/2001 synwee added setMaxExpansion 1.26 +* 03/07/2001 synwee merged UCA's maxexpansion and tailoring's 1.27 +*/ 1.28 + 1.29 +#include "unicode/utypes.h" 1.30 + 1.31 +#if !UCONFIG_NO_COLLATION 1.32 + 1.33 +#include "unicode/uchar.h" 1.34 +#include "unicode/unistr.h" 1.35 +#include "unicode/ucoleitr.h" 1.36 +#include "unicode/normlzr.h" 1.37 +#include "unicode/utf16.h" 1.38 +#include "normalizer2impl.h" 1.39 +#include "ucol_elm.h" 1.40 +#include "ucol_tok.h" 1.41 +#include "ucol_cnt.h" 1.42 +#include "unicode/caniter.h" 1.43 +#include "cmemory.h" 1.44 +#include "uassert.h" 1.45 + 1.46 +U_NAMESPACE_USE 1.47 + 1.48 +static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status); 1.49 + 1.50 +U_CDECL_BEGIN 1.51 +static int32_t U_CALLCONV 1.52 +prefixLookupHash(const UHashTok e) { 1.53 + UCAElements *element = (UCAElements *)e.pointer; 1.54 + UChar buf[256]; 1.55 + UHashTok key; 1.56 + key.pointer = buf; 1.57 + uprv_memcpy(buf, element->cPoints, element->cSize*sizeof(UChar)); 1.58 + buf[element->cSize] = 0; 1.59 + //key.pointer = element->cPoints; 1.60 + //element->cPoints[element->cSize] = 0; 1.61 + return uhash_hashUChars(key); 1.62 +} 1.63 + 1.64 +static int8_t U_CALLCONV 1.65 +prefixLookupComp(const UHashTok e1, const UHashTok e2) { 1.66 + UCAElements *element1 = (UCAElements *)e1.pointer; 1.67 + UCAElements *element2 = (UCAElements *)e2.pointer; 1.68 + 1.69 + UChar buf1[256]; 1.70 + UHashTok key1; 1.71 + key1.pointer = buf1; 1.72 + uprv_memcpy(buf1, element1->cPoints, element1->cSize*sizeof(UChar)); 1.73 + buf1[element1->cSize] = 0; 1.74 + 1.75 + UChar buf2[256]; 1.76 + UHashTok key2; 1.77 + key2.pointer = buf2; 1.78 + uprv_memcpy(buf2, element2->cPoints, element2->cSize*sizeof(UChar)); 1.79 + buf2[element2->cSize] = 0; 1.80 + 1.81 + return uhash_compareUChars(key1, key2); 1.82 +} 1.83 +U_CDECL_END 1.84 + 1.85 +static int32_t uprv_uca_addExpansion(ExpansionTable *expansions, uint32_t value, UErrorCode *status) { 1.86 + if(U_FAILURE(*status)) { 1.87 + return 0; 1.88 + } 1.89 + if(expansions->CEs == NULL) { 1.90 + expansions->CEs = (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE*sizeof(uint32_t)); 1.91 + /* test for NULL */ 1.92 + if (expansions->CEs == NULL) { 1.93 + *status = U_MEMORY_ALLOCATION_ERROR; 1.94 + return 0; 1.95 + } 1.96 + expansions->size = INIT_EXP_TABLE_SIZE; 1.97 + expansions->position = 0; 1.98 + } 1.99 + 1.100 + if(expansions->position == expansions->size) { 1.101 + uint32_t *newData = (uint32_t *)uprv_realloc(expansions->CEs, 2*expansions->size*sizeof(uint32_t)); 1.102 + if(newData == NULL) { 1.103 +#ifdef UCOL_DEBUG 1.104 + fprintf(stderr, "out of memory for expansions\n"); 1.105 +#endif 1.106 + *status = U_MEMORY_ALLOCATION_ERROR; 1.107 + return -1; 1.108 + } 1.109 + expansions->CEs = newData; 1.110 + expansions->size *= 2; 1.111 + } 1.112 + 1.113 + expansions->CEs[expansions->position] = value; 1.114 + return(expansions->position++); 1.115 +} 1.116 + 1.117 +U_CAPI tempUCATable* U_EXPORT2 1.118 +uprv_uca_initTempTable(UCATableHeader *image, UColOptionSet *opts, const UCollator *UCA, UColCETags initTag, UColCETags supplementaryInitTag, UErrorCode *status) { 1.119 + MaxJamoExpansionTable *maxjet; 1.120 + MaxExpansionTable *maxet; 1.121 + tempUCATable *t = (tempUCATable *)uprv_malloc(sizeof(tempUCATable)); 1.122 + /* test for NULL */ 1.123 + if (t == NULL) { 1.124 + *status = U_MEMORY_ALLOCATION_ERROR; 1.125 + return NULL; 1.126 + } 1.127 + uprv_memset(t, 0, sizeof(tempUCATable)); 1.128 + 1.129 + maxet = (MaxExpansionTable *)uprv_malloc(sizeof(MaxExpansionTable)); 1.130 + if (maxet == NULL) { 1.131 + goto allocation_failure; 1.132 + } 1.133 + uprv_memset(maxet, 0, sizeof(MaxExpansionTable)); 1.134 + t->maxExpansions = maxet; 1.135 + 1.136 + maxjet = (MaxJamoExpansionTable *)uprv_malloc(sizeof(MaxJamoExpansionTable)); 1.137 + if (maxjet == NULL) { 1.138 + goto allocation_failure; 1.139 + } 1.140 + uprv_memset(maxjet, 0, sizeof(MaxJamoExpansionTable)); 1.141 + t->maxJamoExpansions = maxjet; 1.142 + 1.143 + t->image = image; 1.144 + t->options = opts; 1.145 + 1.146 + t->UCA = UCA; 1.147 + t->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable)); 1.148 + /* test for NULL */ 1.149 + if (t->expansions == NULL) { 1.150 + goto allocation_failure; 1.151 + } 1.152 + uprv_memset(t->expansions, 0, sizeof(ExpansionTable)); 1.153 + 1.154 + t->mapping = utrie_open(NULL, NULL, UCOL_ELM_TRIE_CAPACITY, 1.155 + UCOL_SPECIAL_FLAG | (initTag<<24), 1.156 + UCOL_SPECIAL_FLAG | (supplementaryInitTag << 24), 1.157 + TRUE); // Do your own mallocs for the structure, array and have linear Latin 1 1.158 + if (U_FAILURE(*status)) { 1.159 + goto allocation_failure; 1.160 + } 1.161 + t->prefixLookup = uhash_open(prefixLookupHash, prefixLookupComp, NULL, status); 1.162 + if (U_FAILURE(*status)) { 1.163 + goto allocation_failure; 1.164 + } 1.165 + uhash_setValueDeleter(t->prefixLookup, uprv_free); 1.166 + 1.167 + t->contractions = uprv_cnttab_open(t->mapping, status); 1.168 + if (U_FAILURE(*status)) { 1.169 + goto cleanup; 1.170 + } 1.171 + 1.172 + /* copy UCA's maxexpansion and merge as we go along */ 1.173 + if (UCA != NULL) { 1.174 + /* adding an extra initial value for easier manipulation */ 1.175 + maxet->size = (int32_t)(UCA->lastEndExpansionCE - UCA->endExpansionCE) + 2; 1.176 + maxet->position = maxet->size - 1; 1.177 + maxet->endExpansionCE = 1.178 + (uint32_t *)uprv_malloc(sizeof(uint32_t) * maxet->size); 1.179 + /* test for NULL */ 1.180 + if (maxet->endExpansionCE == NULL) { 1.181 + goto allocation_failure; 1.182 + } 1.183 + maxet->expansionCESize = 1.184 + (uint8_t *)uprv_malloc(sizeof(uint8_t) * maxet->size); 1.185 + /* test for NULL */ 1.186 + if (maxet->expansionCESize == NULL) { 1.187 + goto allocation_failure; 1.188 + } 1.189 + /* initialized value */ 1.190 + *(maxet->endExpansionCE) = 0; 1.191 + *(maxet->expansionCESize) = 0; 1.192 + uprv_memcpy(maxet->endExpansionCE + 1, UCA->endExpansionCE, 1.193 + sizeof(uint32_t) * (maxet->size - 1)); 1.194 + uprv_memcpy(maxet->expansionCESize + 1, UCA->expansionCESize, 1.195 + sizeof(uint8_t) * (maxet->size - 1)); 1.196 + } 1.197 + else { 1.198 + maxet->size = 0; 1.199 + } 1.200 + maxjet->endExpansionCE = NULL; 1.201 + maxjet->isV = NULL; 1.202 + maxjet->size = 0; 1.203 + maxjet->position = 0; 1.204 + maxjet->maxLSize = 1; 1.205 + maxjet->maxVSize = 1; 1.206 + maxjet->maxTSize = 1; 1.207 + 1.208 + t->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 1.209 + /* test for NULL */ 1.210 + if (t->unsafeCP == NULL) { 1.211 + goto allocation_failure; 1.212 + } 1.213 + t->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 1.214 + /* test for NULL */ 1.215 + if (t->contrEndCP == NULL) { 1.216 + goto allocation_failure; 1.217 + } 1.218 + uprv_memset(t->unsafeCP, 0, UCOL_UNSAFECP_TABLE_SIZE); 1.219 + uprv_memset(t->contrEndCP, 0, UCOL_UNSAFECP_TABLE_SIZE); 1.220 + t->cmLookup = NULL; 1.221 + return t; 1.222 + 1.223 +allocation_failure: 1.224 + *status = U_MEMORY_ALLOCATION_ERROR; 1.225 +cleanup: 1.226 + uprv_uca_closeTempTable(t); 1.227 + return NULL; 1.228 +} 1.229 + 1.230 +static tempUCATable* U_EXPORT2 1.231 +uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) { 1.232 + if(U_FAILURE(*status)) { 1.233 + return NULL; 1.234 + } 1.235 + 1.236 + tempUCATable *r = (tempUCATable *)uprv_malloc(sizeof(tempUCATable)); 1.237 + /* test for NULL */ 1.238 + if (r == NULL) { 1.239 + *status = U_MEMORY_ALLOCATION_ERROR; 1.240 + return NULL; 1.241 + } 1.242 + uprv_memset(r, 0, sizeof(tempUCATable)); 1.243 + 1.244 + /* mapping */ 1.245 + if(t->mapping != NULL) { 1.246 + /*r->mapping = ucmpe32_clone(t->mapping, status);*/ 1.247 + r->mapping = utrie_clone(NULL, t->mapping, NULL, 0); 1.248 + } 1.249 + 1.250 + // a hashing clone function would be very nice. We have none currently... 1.251 + // However, we should be good, as closing should not produce any prefixed elements. 1.252 + r->prefixLookup = NULL; // prefixes are not used in closing 1.253 + 1.254 + /* expansions */ 1.255 + if(t->expansions != NULL) { 1.256 + r->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable)); 1.257 + /* test for NULL */ 1.258 + if (r->expansions == NULL) { 1.259 + *status = U_MEMORY_ALLOCATION_ERROR; 1.260 + goto cleanup; 1.261 + } 1.262 + r->expansions->position = t->expansions->position; 1.263 + r->expansions->size = t->expansions->size; 1.264 + if(t->expansions->CEs != NULL) { 1.265 + r->expansions->CEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->expansions->size); 1.266 + /* test for NULL */ 1.267 + if (r->expansions->CEs == NULL) { 1.268 + *status = U_MEMORY_ALLOCATION_ERROR; 1.269 + goto cleanup; 1.270 + } 1.271 + uprv_memcpy(r->expansions->CEs, t->expansions->CEs, sizeof(uint32_t)*t->expansions->position); 1.272 + } else { 1.273 + r->expansions->CEs = NULL; 1.274 + } 1.275 + } 1.276 + 1.277 + if(t->contractions != NULL) { 1.278 + r->contractions = uprv_cnttab_clone(t->contractions, status); 1.279 + // Check for cloning failure. 1.280 + if (r->contractions == NULL) { 1.281 + *status = U_MEMORY_ALLOCATION_ERROR; 1.282 + goto cleanup; 1.283 + } 1.284 + r->contractions->mapping = r->mapping; 1.285 + } 1.286 + 1.287 + if(t->maxExpansions != NULL) { 1.288 + r->maxExpansions = (MaxExpansionTable *)uprv_malloc(sizeof(MaxExpansionTable)); 1.289 + /* test for NULL */ 1.290 + if (r->maxExpansions == NULL) { 1.291 + *status = U_MEMORY_ALLOCATION_ERROR; 1.292 + goto cleanup; 1.293 + } 1.294 + r->maxExpansions->size = t->maxExpansions->size; 1.295 + r->maxExpansions->position = t->maxExpansions->position; 1.296 + if(t->maxExpansions->endExpansionCE != NULL) { 1.297 + r->maxExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxExpansions->size); 1.298 + /* test for NULL */ 1.299 + if (r->maxExpansions->endExpansionCE == NULL) { 1.300 + *status = U_MEMORY_ALLOCATION_ERROR; 1.301 + goto cleanup; 1.302 + } 1.303 + uprv_memset(r->maxExpansions->endExpansionCE, 0xDB, sizeof(uint32_t)*t->maxExpansions->size); 1.304 + uprv_memcpy(r->maxExpansions->endExpansionCE, t->maxExpansions->endExpansionCE, t->maxExpansions->position*sizeof(uint32_t)); 1.305 + } else { 1.306 + r->maxExpansions->endExpansionCE = NULL; 1.307 + } 1.308 + if(t->maxExpansions->expansionCESize != NULL) { 1.309 + r->maxExpansions->expansionCESize = (uint8_t *)uprv_malloc(sizeof(uint8_t)*t->maxExpansions->size); 1.310 + /* test for NULL */ 1.311 + if (r->maxExpansions->expansionCESize == NULL) { 1.312 + *status = U_MEMORY_ALLOCATION_ERROR; 1.313 + goto cleanup; 1.314 + } 1.315 + uprv_memset(r->maxExpansions->expansionCESize, 0xDB, sizeof(uint8_t)*t->maxExpansions->size); 1.316 + uprv_memcpy(r->maxExpansions->expansionCESize, t->maxExpansions->expansionCESize, t->maxExpansions->position*sizeof(uint8_t)); 1.317 + } else { 1.318 + r->maxExpansions->expansionCESize = NULL; 1.319 + } 1.320 + } 1.321 + 1.322 + if(t->maxJamoExpansions != NULL) { 1.323 + r->maxJamoExpansions = (MaxJamoExpansionTable *)uprv_malloc(sizeof(MaxJamoExpansionTable)); 1.324 + /* test for NULL */ 1.325 + if (r->maxJamoExpansions == NULL) { 1.326 + *status = U_MEMORY_ALLOCATION_ERROR; 1.327 + goto cleanup; 1.328 + } 1.329 + r->maxJamoExpansions->size = t->maxJamoExpansions->size; 1.330 + r->maxJamoExpansions->position = t->maxJamoExpansions->position; 1.331 + r->maxJamoExpansions->maxLSize = t->maxJamoExpansions->maxLSize; 1.332 + r->maxJamoExpansions->maxVSize = t->maxJamoExpansions->maxVSize; 1.333 + r->maxJamoExpansions->maxTSize = t->maxJamoExpansions->maxTSize; 1.334 + if(t->maxJamoExpansions->size != 0) { 1.335 + r->maxJamoExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxJamoExpansions->size); 1.336 + /* test for NULL */ 1.337 + if (r->maxJamoExpansions->endExpansionCE == NULL) { 1.338 + *status = U_MEMORY_ALLOCATION_ERROR; 1.339 + goto cleanup; 1.340 + } 1.341 + uprv_memcpy(r->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->position*sizeof(uint32_t)); 1.342 + r->maxJamoExpansions->isV = (UBool *)uprv_malloc(sizeof(UBool)*t->maxJamoExpansions->size); 1.343 + /* test for NULL */ 1.344 + if (r->maxJamoExpansions->isV == NULL) { 1.345 + *status = U_MEMORY_ALLOCATION_ERROR; 1.346 + goto cleanup; 1.347 + } 1.348 + uprv_memcpy(r->maxJamoExpansions->isV, t->maxJamoExpansions->isV, t->maxJamoExpansions->position*sizeof(UBool)); 1.349 + } else { 1.350 + r->maxJamoExpansions->endExpansionCE = NULL; 1.351 + r->maxJamoExpansions->isV = NULL; 1.352 + } 1.353 + } 1.354 + 1.355 + if(t->unsafeCP != NULL) { 1.356 + r->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 1.357 + /* test for NULL */ 1.358 + if (r->unsafeCP == NULL) { 1.359 + *status = U_MEMORY_ALLOCATION_ERROR; 1.360 + goto cleanup; 1.361 + } 1.362 + uprv_memcpy(r->unsafeCP, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE); 1.363 + } 1.364 + 1.365 + if(t->contrEndCP != NULL) { 1.366 + r->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 1.367 + /* test for NULL */ 1.368 + if (r->contrEndCP == NULL) { 1.369 + *status = U_MEMORY_ALLOCATION_ERROR; 1.370 + goto cleanup; 1.371 + } 1.372 + uprv_memcpy(r->contrEndCP, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE); 1.373 + } 1.374 + 1.375 + r->UCA = t->UCA; 1.376 + r->image = t->image; 1.377 + r->options = t->options; 1.378 + 1.379 + return r; 1.380 +cleanup: 1.381 + uprv_uca_closeTempTable(t); 1.382 + return NULL; 1.383 +} 1.384 + 1.385 + 1.386 +U_CAPI void U_EXPORT2 1.387 +uprv_uca_closeTempTable(tempUCATable *t) { 1.388 + if(t != NULL) { 1.389 + if (t->expansions != NULL) { 1.390 + uprv_free(t->expansions->CEs); 1.391 + uprv_free(t->expansions); 1.392 + } 1.393 + if(t->contractions != NULL) { 1.394 + uprv_cnttab_close(t->contractions); 1.395 + } 1.396 + if (t->mapping != NULL) { 1.397 + utrie_close(t->mapping); 1.398 + } 1.399 + 1.400 + if(t->prefixLookup != NULL) { 1.401 + uhash_close(t->prefixLookup); 1.402 + } 1.403 + 1.404 + if (t->maxExpansions != NULL) { 1.405 + uprv_free(t->maxExpansions->endExpansionCE); 1.406 + uprv_free(t->maxExpansions->expansionCESize); 1.407 + uprv_free(t->maxExpansions); 1.408 + } 1.409 + 1.410 + if (t->maxJamoExpansions->size > 0) { 1.411 + uprv_free(t->maxJamoExpansions->endExpansionCE); 1.412 + uprv_free(t->maxJamoExpansions->isV); 1.413 + } 1.414 + uprv_free(t->maxJamoExpansions); 1.415 + 1.416 + uprv_free(t->unsafeCP); 1.417 + uprv_free(t->contrEndCP); 1.418 + 1.419 + if (t->cmLookup != NULL) { 1.420 + uprv_free(t->cmLookup->cPoints); 1.421 + uprv_free(t->cmLookup); 1.422 + } 1.423 + 1.424 + uprv_free(t); 1.425 + } 1.426 +} 1.427 + 1.428 +/** 1.429 +* Looks for the maximum length of all expansion sequences ending with the same 1.430 +* collation element. The size required for maxexpansion and maxsize is 1.431 +* returned if the arrays are too small. 1.432 +* @param endexpansion the last expansion collation element to be added 1.433 +* @param expansionsize size of the expansion 1.434 +* @param maxexpansion data structure to store the maximum expansion data. 1.435 +* @param status error status 1.436 +* @returns size of the maxexpansion and maxsize used. 1.437 +*/ 1.438 +static int uprv_uca_setMaxExpansion(uint32_t endexpansion, 1.439 + uint8_t expansionsize, 1.440 + MaxExpansionTable *maxexpansion, 1.441 + UErrorCode *status) 1.442 +{ 1.443 + if (maxexpansion->size == 0) { 1.444 + /* we'll always make the first element 0, for easier manipulation */ 1.445 + maxexpansion->endExpansionCE = 1.446 + (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(int32_t)); 1.447 + /* test for NULL */ 1.448 + if (maxexpansion->endExpansionCE == NULL) { 1.449 + *status = U_MEMORY_ALLOCATION_ERROR; 1.450 + return 0; 1.451 + } 1.452 + *(maxexpansion->endExpansionCE) = 0; 1.453 + maxexpansion->expansionCESize = 1.454 + (uint8_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(uint8_t)); 1.455 + /* test for NULL */; 1.456 + if (maxexpansion->expansionCESize == NULL) { 1.457 + *status = U_MEMORY_ALLOCATION_ERROR; 1.458 + return 0; 1.459 + } 1.460 + *(maxexpansion->expansionCESize) = 0; 1.461 + maxexpansion->size = INIT_EXP_TABLE_SIZE; 1.462 + maxexpansion->position = 0; 1.463 + } 1.464 + 1.465 + if (maxexpansion->position + 1 == maxexpansion->size) { 1.466 + uint32_t *neweece = (uint32_t *)uprv_realloc(maxexpansion->endExpansionCE, 1.467 + 2 * maxexpansion->size * sizeof(uint32_t)); 1.468 + if (neweece == NULL) { 1.469 + *status = U_MEMORY_ALLOCATION_ERROR; 1.470 + return 0; 1.471 + } 1.472 + maxexpansion->endExpansionCE = neweece; 1.473 + 1.474 + uint8_t *neweces = (uint8_t *)uprv_realloc(maxexpansion->expansionCESize, 1.475 + 2 * maxexpansion->size * sizeof(uint8_t)); 1.476 + if (neweces == NULL) { 1.477 + *status = U_MEMORY_ALLOCATION_ERROR; 1.478 + return 0; 1.479 + } 1.480 + maxexpansion->expansionCESize = neweces; 1.481 + maxexpansion->size *= 2; 1.482 + } 1.483 + 1.484 + uint32_t *pendexpansionce = maxexpansion->endExpansionCE; 1.485 + uint8_t *pexpansionsize = maxexpansion->expansionCESize; 1.486 + int pos = maxexpansion->position; 1.487 + 1.488 + uint32_t *start = pendexpansionce; 1.489 + uint32_t *limit = pendexpansionce + pos; 1.490 + 1.491 + /* using binary search to determine if last expansion element is 1.492 + already in the array */ 1.493 + uint32_t *mid; 1.494 + int result = -1; 1.495 + while (start < limit - 1) { 1.496 + mid = start + ((limit - start) >> 1); 1.497 + if (endexpansion <= *mid) { 1.498 + limit = mid; 1.499 + } 1.500 + else { 1.501 + start = mid; 1.502 + } 1.503 + } 1.504 + 1.505 + if (*start == endexpansion) { 1.506 + result = (int)(start - pendexpansionce); 1.507 + } 1.508 + else if (*limit == endexpansion) { 1.509 + result = (int)(limit - pendexpansionce); 1.510 + } 1.511 + 1.512 + if (result > -1) { 1.513 + /* found the ce in expansion, we'll just modify the size if it is 1.514 + smaller */ 1.515 + uint8_t *currentsize = pexpansionsize + result; 1.516 + if (*currentsize < expansionsize) { 1.517 + *currentsize = expansionsize; 1.518 + } 1.519 + } 1.520 + else { 1.521 + /* we'll need to squeeze the value into the array. 1.522 + initial implementation. */ 1.523 + /* shifting the subarray down by 1 */ 1.524 + int shiftsize = (int)((pendexpansionce + pos) - start); 1.525 + uint32_t *shiftpos = start + 1; 1.526 + uint8_t *sizeshiftpos = pexpansionsize + (shiftpos - pendexpansionce); 1.527 + 1.528 + /* okay need to rearrange the array into sorted order */ 1.529 + if (shiftsize == 0 /*|| *(pendexpansionce + pos) < endexpansion*/) { /* the commented part is actually both redundant and dangerous */ 1.530 + *(pendexpansionce + pos + 1) = endexpansion; 1.531 + *(pexpansionsize + pos + 1) = expansionsize; 1.532 + } 1.533 + else { 1.534 + uprv_memmove(shiftpos + 1, shiftpos, shiftsize * sizeof(int32_t)); 1.535 + uprv_memmove(sizeshiftpos + 1, sizeshiftpos, 1.536 + shiftsize * sizeof(uint8_t)); 1.537 + *shiftpos = endexpansion; 1.538 + *sizeshiftpos = expansionsize; 1.539 + } 1.540 + maxexpansion->position ++; 1.541 + 1.542 +#ifdef UCOL_DEBUG 1.543 + int temp; 1.544 + UBool found = FALSE; 1.545 + for (temp = 0; temp < maxexpansion->position; temp ++) { 1.546 + if (pendexpansionce[temp] >= pendexpansionce[temp + 1]) { 1.547 + fprintf(stderr, "expansions %d\n", temp); 1.548 + } 1.549 + if (pendexpansionce[temp] == endexpansion) { 1.550 + found =TRUE; 1.551 + if (pexpansionsize[temp] < expansionsize) { 1.552 + fprintf(stderr, "expansions size %d\n", temp); 1.553 + } 1.554 + } 1.555 + } 1.556 + if (pendexpansionce[temp] == endexpansion) { 1.557 + found =TRUE; 1.558 + if (pexpansionsize[temp] < expansionsize) { 1.559 + fprintf(stderr, "expansions size %d\n", temp); 1.560 + } 1.561 + } 1.562 + if (!found) 1.563 + fprintf(stderr, "expansion not found %d\n", temp); 1.564 +#endif 1.565 + } 1.566 + 1.567 + return maxexpansion->position; 1.568 +} 1.569 + 1.570 +/** 1.571 +* Sets the maximum length of all jamo expansion sequences ending with the same 1.572 +* collation element. The size required for maxexpansion and maxsize is 1.573 +* returned if the arrays are too small. 1.574 +* @param ch the jamo codepoint 1.575 +* @param endexpansion the last expansion collation element to be added 1.576 +* @param expansionsize size of the expansion 1.577 +* @param maxexpansion data structure to store the maximum expansion data. 1.578 +* @param status error status 1.579 +* @returns size of the maxexpansion and maxsize used. 1.580 +*/ 1.581 +static int uprv_uca_setMaxJamoExpansion(UChar ch, 1.582 + uint32_t endexpansion, 1.583 + uint8_t expansionsize, 1.584 + MaxJamoExpansionTable *maxexpansion, 1.585 + UErrorCode *status) 1.586 +{ 1.587 + UBool isV = TRUE; 1.588 + if (((uint32_t)ch - 0x1100) <= (0x1112 - 0x1100)) { 1.589 + /* determines L for Jamo, doesn't need to store this since it is never 1.590 + at the end of a expansion */ 1.591 + if (maxexpansion->maxLSize < expansionsize) { 1.592 + maxexpansion->maxLSize = expansionsize; 1.593 + } 1.594 + return maxexpansion->position; 1.595 + } 1.596 + 1.597 + if (((uint32_t)ch - 0x1161) <= (0x1175 - 0x1161)) { 1.598 + /* determines V for Jamo */ 1.599 + if (maxexpansion->maxVSize < expansionsize) { 1.600 + maxexpansion->maxVSize = expansionsize; 1.601 + } 1.602 + } 1.603 + 1.604 + if (((uint32_t)ch - 0x11A8) <= (0x11C2 - 0x11A8)) { 1.605 + isV = FALSE; 1.606 + /* determines T for Jamo */ 1.607 + if (maxexpansion->maxTSize < expansionsize) { 1.608 + maxexpansion->maxTSize = expansionsize; 1.609 + } 1.610 + } 1.611 + 1.612 + if (maxexpansion->size == 0) { 1.613 + /* we'll always make the first element 0, for easier manipulation */ 1.614 + maxexpansion->endExpansionCE = 1.615 + (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(uint32_t)); 1.616 + /* test for NULL */; 1.617 + if (maxexpansion->endExpansionCE == NULL) { 1.618 + *status = U_MEMORY_ALLOCATION_ERROR; 1.619 + return 0; 1.620 + } 1.621 + *(maxexpansion->endExpansionCE) = 0; 1.622 + maxexpansion->isV = 1.623 + (UBool *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(UBool)); 1.624 + /* test for NULL */; 1.625 + if (maxexpansion->isV == NULL) { 1.626 + *status = U_MEMORY_ALLOCATION_ERROR; 1.627 + uprv_free(maxexpansion->endExpansionCE); 1.628 + maxexpansion->endExpansionCE = NULL; 1.629 + return 0; 1.630 + } 1.631 + *(maxexpansion->isV) = 0; 1.632 + maxexpansion->size = INIT_EXP_TABLE_SIZE; 1.633 + maxexpansion->position = 0; 1.634 + } 1.635 + 1.636 + if (maxexpansion->position + 1 == maxexpansion->size) { 1.637 + maxexpansion->size *= 2; 1.638 + maxexpansion->endExpansionCE = (uint32_t *)uprv_realloc(maxexpansion->endExpansionCE, 1.639 + maxexpansion->size * sizeof(uint32_t)); 1.640 + if (maxexpansion->endExpansionCE == NULL) { 1.641 +#ifdef UCOL_DEBUG 1.642 + fprintf(stderr, "out of memory for maxExpansions\n"); 1.643 +#endif 1.644 + *status = U_MEMORY_ALLOCATION_ERROR; 1.645 + return 0; 1.646 + } 1.647 + maxexpansion->isV = (UBool *)uprv_realloc(maxexpansion->isV, 1.648 + maxexpansion->size * sizeof(UBool)); 1.649 + if (maxexpansion->isV == NULL) { 1.650 +#ifdef UCOL_DEBUG 1.651 + fprintf(stderr, "out of memory for maxExpansions\n"); 1.652 +#endif 1.653 + *status = U_MEMORY_ALLOCATION_ERROR; 1.654 + uprv_free(maxexpansion->endExpansionCE); 1.655 + maxexpansion->endExpansionCE = NULL; 1.656 + return 0; 1.657 + } 1.658 + } 1.659 + 1.660 + uint32_t *pendexpansionce = maxexpansion->endExpansionCE; 1.661 + int pos = maxexpansion->position; 1.662 + 1.663 + while (pos > 0) { 1.664 + pos --; 1.665 + if (*(pendexpansionce + pos) == endexpansion) { 1.666 + return maxexpansion->position; 1.667 + } 1.668 + } 1.669 + 1.670 + *(pendexpansionce + maxexpansion->position) = endexpansion; 1.671 + *(maxexpansion->isV + maxexpansion->position) = isV; 1.672 + maxexpansion->position ++; 1.673 + 1.674 + return maxexpansion->position; 1.675 +} 1.676 + 1.677 + 1.678 +static void ContrEndCPSet(uint8_t *table, UChar c) { 1.679 + uint32_t hash; 1.680 + uint8_t *htByte; 1.681 + 1.682 + hash = c; 1.683 + if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 1.684 + hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 1.685 + } 1.686 + htByte = &table[hash>>3]; 1.687 + *htByte |= (1 << (hash & 7)); 1.688 +} 1.689 + 1.690 + 1.691 +static void unsafeCPSet(uint8_t *table, UChar c) { 1.692 + uint32_t hash; 1.693 + uint8_t *htByte; 1.694 + 1.695 + hash = c; 1.696 + if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 1.697 + if (hash >= 0xd800 && hash <= 0xf8ff) { 1.698 + /* Part of a surrogate, or in private use area. */ 1.699 + /* These don't go in the table */ 1.700 + return; 1.701 + } 1.702 + hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 1.703 + } 1.704 + htByte = &table[hash>>3]; 1.705 + *htByte |= (1 << (hash & 7)); 1.706 +} 1.707 + 1.708 +static void 1.709 +uprv_uca_createCMTable(tempUCATable *t, int32_t noOfCM, UErrorCode *status) { 1.710 + t->cmLookup = (CombinClassTable *)uprv_malloc(sizeof(CombinClassTable)); 1.711 + if (t->cmLookup==NULL) { 1.712 + *status = U_MEMORY_ALLOCATION_ERROR; 1.713 + return; 1.714 + } 1.715 + t->cmLookup->cPoints=(UChar *)uprv_malloc(noOfCM*sizeof(UChar)); 1.716 + if (t->cmLookup->cPoints ==NULL) { 1.717 + uprv_free(t->cmLookup); 1.718 + t->cmLookup = NULL; 1.719 + *status = U_MEMORY_ALLOCATION_ERROR; 1.720 + return; 1.721 + } 1.722 + 1.723 + t->cmLookup->size=noOfCM; 1.724 + uprv_memset(t->cmLookup->index, 0, sizeof(t->cmLookup->index)); 1.725 + 1.726 + return; 1.727 +} 1.728 + 1.729 +static void 1.730 +uprv_uca_copyCMTable(tempUCATable *t, UChar *cm, uint16_t *index) { 1.731 + int32_t count=0; 1.732 + 1.733 + for (int32_t i=0; i<256; ++i) { 1.734 + if (index[i]>0) { 1.735 + // cPoints is ordered by combining class value. 1.736 + uprv_memcpy(t->cmLookup->cPoints+count, cm+(i<<8), index[i]*sizeof(UChar)); 1.737 + count += index[i]; 1.738 + } 1.739 + t->cmLookup->index[i]=count; 1.740 + } 1.741 + return; 1.742 +} 1.743 + 1.744 +/* 1. to the UnsafeCP hash table, add all chars with combining class != 0 */ 1.745 +/* 2. build combining marks table for all chars with combining class != 0 */ 1.746 +static void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t, UErrorCode *status) { 1.747 + 1.748 + UChar c; 1.749 + uint16_t fcd; // Hi byte is lead combining class. lo byte is trailing combing class. 1.750 + UBool buildCMTable = (t->cmLookup==NULL); // flag for building combining class table 1.751 + UChar *cm=NULL; 1.752 + uint16_t index[256]; 1.753 + int32_t count=0; 1.754 + const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); 1.755 + if (U_FAILURE(*status)) { 1.756 + return; 1.757 + } 1.758 + 1.759 + if (buildCMTable) { 1.760 + if (cm==NULL) { 1.761 + cm = (UChar *)uprv_malloc(sizeof(UChar)*UCOL_MAX_CM_TAB); 1.762 + if (cm==NULL) { 1.763 + *status = U_MEMORY_ALLOCATION_ERROR; 1.764 + return; 1.765 + } 1.766 + } 1.767 + uprv_memset(index, 0, sizeof(index)); 1.768 + } 1.769 + for (c=0; c<0xffff; c++) { 1.770 + if (U16_IS_LEAD(c)) { 1.771 + fcd = 0; 1.772 + if (nfcImpl->singleLeadMightHaveNonZeroFCD16(c)) { 1.773 + UChar32 supp = U16_GET_SUPPLEMENTARY(c, 0xdc00); 1.774 + UChar32 suppLimit = supp + 0x400; 1.775 + while (supp < suppLimit) { 1.776 + fcd |= nfcImpl->getFCD16FromNormData(supp++); 1.777 + } 1.778 + } 1.779 + } else { 1.780 + fcd = nfcImpl->getFCD16(c); 1.781 + } 1.782 + if (fcd >= 0x100 || // if the leading combining class(c) > 0 || 1.783 + (U16_IS_LEAD(c) && fcd != 0)) {// c is a leading surrogate with some FCD data 1.784 + if (buildCMTable) { 1.785 + uint32_t cClass = fcd & 0xff; 1.786 + //uint32_t temp=(cClass<<8)+index[cClass]; 1.787 + cm[(cClass<<8)+index[cClass]] = c; // 1.788 + index[cClass]++; 1.789 + count++; 1.790 + } 1.791 + unsafeCPSet(t->unsafeCP, c); 1.792 + } 1.793 + } 1.794 + 1.795 + // copy to cm table 1.796 + if (buildCMTable) { 1.797 + uprv_uca_createCMTable(t, count, status); 1.798 + if(U_FAILURE(*status)) { 1.799 + if (cm!=NULL) { 1.800 + uprv_free(cm); 1.801 + } 1.802 + return; 1.803 + } 1.804 + uprv_uca_copyCMTable(t, cm, index); 1.805 + } 1.806 + 1.807 + if(t->prefixLookup != NULL) { 1.808 + int32_t i = -1; 1.809 + const UHashElement *e = NULL; 1.810 + UCAElements *element = NULL; 1.811 + UChar NFCbuf[256]; 1.812 + while((e = uhash_nextElement(t->prefixLookup, &i)) != NULL) { 1.813 + element = (UCAElements *)e->value.pointer; 1.814 + // codepoints here are in the NFD form. We need to add the 1.815 + // first code point of the NFC form to unsafe, because 1.816 + // strcoll needs to backup over them. 1.817 + unorm_normalize(element->cPoints, element->cSize, UNORM_NFC, 0, 1.818 + NFCbuf, 256, status); 1.819 + unsafeCPSet(t->unsafeCP, NFCbuf[0]); 1.820 + } 1.821 + } 1.822 + 1.823 + if (cm!=NULL) { 1.824 + uprv_free(cm); 1.825 + } 1.826 +} 1.827 + 1.828 +static uint32_t uprv_uca_addPrefix(tempUCATable *t, uint32_t CE, 1.829 + UCAElements *element, UErrorCode *status) 1.830 +{ 1.831 + // currently the longest prefix we're supporting in Japanese is two characters 1.832 + // long. Although this table could quite easily mimic complete contraction stuff 1.833 + // there is no good reason to make a general solution, as it would require some 1.834 + // error prone messing. 1.835 + CntTable *contractions = t->contractions; 1.836 + UChar32 cp; 1.837 + uint32_t cpsize = 0; 1.838 + UChar *oldCP = element->cPoints; 1.839 + uint32_t oldCPSize = element->cSize; 1.840 + 1.841 + 1.842 + contractions->currentTag = SPEC_PROC_TAG; 1.843 + 1.844 + // here, we will normalize & add prefix to the table. 1.845 + uint32_t j = 0; 1.846 +#ifdef UCOL_DEBUG 1.847 + for(j=0; j<element->cSize; j++) { 1.848 + fprintf(stdout, "CP: %04X ", element->cPoints[j]); 1.849 + } 1.850 + fprintf(stdout, "El: %08X Pref: ", CE); 1.851 + for(j=0; j<element->prefixSize; j++) { 1.852 + fprintf(stdout, "%04X ", element->prefix[j]); 1.853 + } 1.854 + fprintf(stdout, "%08X ", element->mapCE); 1.855 +#endif 1.856 + 1.857 + for (j = 1; j<element->prefixSize; j++) { /* First add NFD prefix chars to unsafe CP hash table */ 1.858 + // Unless it is a trail surrogate, which is handled algoritmically and 1.859 + // shouldn't take up space in the table. 1.860 + if(!(U16_IS_TRAIL(element->prefix[j]))) { 1.861 + unsafeCPSet(t->unsafeCP, element->prefix[j]); 1.862 + } 1.863 + } 1.864 + 1.865 + UChar tempPrefix = 0; 1.866 + 1.867 + for(j = 0; j < /*nfcSize*/element->prefixSize/2; j++) { // prefixes are going to be looked up backwards 1.868 + // therefore, we will promptly reverse the prefix buffer... 1.869 + tempPrefix = *(/*nfcBuffer*/element->prefix+element->prefixSize-j-1); 1.870 + *(/*nfcBuffer*/element->prefix+element->prefixSize-j-1) = element->prefix[j]; 1.871 + element->prefix[j] = tempPrefix; 1.872 + } 1.873 + 1.874 +#ifdef UCOL_DEBUG 1.875 + fprintf(stdout, "Reversed: "); 1.876 + for(j=0; j<element->prefixSize; j++) { 1.877 + fprintf(stdout, "%04X ", element->prefix[j]); 1.878 + } 1.879 + fprintf(stdout, "%08X\n", element->mapCE); 1.880 +#endif 1.881 + 1.882 + // the first codepoint is also unsafe, as it forms a 'contraction' with the prefix 1.883 + if(!(U16_IS_TRAIL(element->cPoints[0]))) { 1.884 + unsafeCPSet(t->unsafeCP, element->cPoints[0]); 1.885 + } 1.886 + 1.887 + // Maybe we need this... To handle prefixes completely in the forward direction... 1.888 + //if(element->cSize == 1) { 1.889 + // if(!(U16_IS_TRAIL(element->cPoints[0]))) { 1.890 + // ContrEndCPSet(t->contrEndCP, element->cPoints[0]); 1.891 + // } 1.892 + //} 1.893 + 1.894 + element->cPoints = element->prefix; 1.895 + element->cSize = element->prefixSize; 1.896 + 1.897 + // Add the last char of the contraction to the contraction-end hash table. 1.898 + // unless it is a trail surrogate, which is handled algorithmically and 1.899 + // shouldn't be in the table 1.900 + if(!(U16_IS_TRAIL(element->cPoints[element->cSize -1]))) { 1.901 + ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]); 1.902 + } 1.903 + 1.904 + // First we need to check if contractions starts with a surrogate 1.905 + U16_NEXT(element->cPoints, cpsize, element->cSize, cp); 1.906 + 1.907 + // If there are any Jamos in the contraction, we should turn on special 1.908 + // processing for Jamos 1.909 + if(UCOL_ISJAMO(element->prefix[0])) { 1.910 + t->image->jamoSpecial = TRUE; 1.911 + } 1.912 + /* then we need to deal with it */ 1.913 + /* we could aready have something in table - or we might not */ 1.914 + 1.915 + if(!isPrefix(CE)) { 1.916 + /* if it wasn't contraction, we wouldn't end up here*/ 1.917 + int32_t firstContractionOffset = 0; 1.918 + firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status); 1.919 + uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); 1.920 + uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->prefix, newCE, status); 1.921 + uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status); 1.922 + CE = constructContractCE(SPEC_PROC_TAG, firstContractionOffset); 1.923 + } else { /* we are adding to existing contraction */ 1.924 + /* there were already some elements in the table, so we need to add a new contraction */ 1.925 + /* Two things can happen here: either the codepoint is already in the table, or it is not */ 1.926 + int32_t position = uprv_cnttab_findCP(contractions, CE, *element->prefix, status); 1.927 + if(position > 0) { /* if it is we just continue down the chain */ 1.928 + uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status); 1.929 + uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status); 1.930 + uprv_cnttab_setContraction(contractions, CE, position, *(element->prefix), newCE, status); 1.931 + } else { /* if it isn't, we will have to create a new sequence */ 1.932 + uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); 1.933 + uprv_cnttab_insertContraction(contractions, CE, *(element->prefix), element->mapCE, status); 1.934 + } 1.935 + } 1.936 + 1.937 + element->cPoints = oldCP; 1.938 + element->cSize = oldCPSize; 1.939 + 1.940 + return CE; 1.941 +} 1.942 + 1.943 +// Note regarding surrogate handling: We are interested only in the single 1.944 +// or leading surrogates in a contraction. If a surrogate is somewhere else 1.945 +// in the contraction, it is going to be handled as a pair of code units, 1.946 +// as it doesn't affect the performance AND handling surrogates specially 1.947 +// would complicate code way too much. 1.948 +static uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE, 1.949 + UCAElements *element, UErrorCode *status) 1.950 +{ 1.951 + CntTable *contractions = t->contractions; 1.952 + UChar32 cp; 1.953 + uint32_t cpsize = 0; 1.954 + 1.955 + contractions->currentTag = CONTRACTION_TAG; 1.956 + 1.957 + // First we need to check if contractions starts with a surrogate 1.958 + U16_NEXT(element->cPoints, cpsize, element->cSize, cp); 1.959 + 1.960 + if(cpsize<element->cSize) { // This is a real contraction, if there are other characters after the first 1.961 + uint32_t j = 0; 1.962 + for (j=1; j<element->cSize; j++) { /* First add contraction chars to unsafe CP hash table */ 1.963 + // Unless it is a trail surrogate, which is handled algoritmically and 1.964 + // shouldn't take up space in the table. 1.965 + if(!(U16_IS_TRAIL(element->cPoints[j]))) { 1.966 + unsafeCPSet(t->unsafeCP, element->cPoints[j]); 1.967 + } 1.968 + } 1.969 + // Add the last char of the contraction to the contraction-end hash table. 1.970 + // unless it is a trail surrogate, which is handled algorithmically and 1.971 + // shouldn't be in the table 1.972 + if(!(U16_IS_TRAIL(element->cPoints[element->cSize -1]))) { 1.973 + ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]); 1.974 + } 1.975 + 1.976 + // If there are any Jamos in the contraction, we should turn on special 1.977 + // processing for Jamos 1.978 + if(UCOL_ISJAMO(element->cPoints[0])) { 1.979 + t->image->jamoSpecial = TRUE; 1.980 + } 1.981 + /* then we need to deal with it */ 1.982 + /* we could aready have something in table - or we might not */ 1.983 + element->cPoints+=cpsize; 1.984 + element->cSize-=cpsize; 1.985 + if(!isContraction(CE)) { 1.986 + /* if it wasn't contraction, we wouldn't end up here*/ 1.987 + int32_t firstContractionOffset = 0; 1.988 + firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status); 1.989 + uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); 1.990 + uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status); 1.991 + uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status); 1.992 + CE = constructContractCE(CONTRACTION_TAG, firstContractionOffset); 1.993 + } else { /* we are adding to existing contraction */ 1.994 + /* there were already some elements in the table, so we need to add a new contraction */ 1.995 + /* Two things can happen here: either the codepoint is already in the table, or it is not */ 1.996 + int32_t position = uprv_cnttab_findCP(contractions, CE, *element->cPoints, status); 1.997 + if(position > 0) { /* if it is we just continue down the chain */ 1.998 + uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status); 1.999 + uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status); 1.1000 + uprv_cnttab_setContraction(contractions, CE, position, *(element->cPoints), newCE, status); 1.1001 + } else { /* if it isn't, we will have to create a new sequence */ 1.1002 + uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); 1.1003 + uprv_cnttab_insertContraction(contractions, CE, *(element->cPoints), newCE, status); 1.1004 + } 1.1005 + } 1.1006 + element->cPoints-=cpsize; 1.1007 + element->cSize+=cpsize; 1.1008 + /*ucmpe32_set(t->mapping, cp, CE);*/ 1.1009 + utrie_set32(t->mapping, cp, CE); 1.1010 + } else if(!isContraction(CE)) { /* this is just a surrogate, and there is no contraction */ 1.1011 + /*ucmpe32_set(t->mapping, cp, element->mapCE);*/ 1.1012 + utrie_set32(t->mapping, cp, element->mapCE); 1.1013 + } else { /* fill out the first stage of the contraction with the surrogate CE */ 1.1014 + uprv_cnttab_changeContraction(contractions, CE, 0, element->mapCE, status); 1.1015 + uprv_cnttab_changeContraction(contractions, CE, 0xFFFF, element->mapCE, status); 1.1016 + } 1.1017 + return CE; 1.1018 +} 1.1019 + 1.1020 + 1.1021 +static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status) { 1.1022 + int32_t firstContractionOffset = 0; 1.1023 + // uint32_t contractionElement = UCOL_NOT_FOUND; 1.1024 + 1.1025 + if(U_FAILURE(*status)) { 1.1026 + return UCOL_NOT_FOUND; 1.1027 + } 1.1028 + 1.1029 + /* end of recursion */ 1.1030 + if(element->cSize == 1) { 1.1031 + if(isCntTableElement(existingCE) && ((UColCETags)getCETag(existingCE) == contractions->currentTag)) { 1.1032 + uprv_cnttab_changeContraction(contractions, existingCE, 0, element->mapCE, status); 1.1033 + uprv_cnttab_changeContraction(contractions, existingCE, 0xFFFF, element->mapCE, status); 1.1034 + return existingCE; 1.1035 + } else { 1.1036 + return element->mapCE; /*can't do just that. existingCe might be a contraction, meaning that we need to do another step */ 1.1037 + } 1.1038 + } 1.1039 + 1.1040 + /* this recursion currently feeds on the only element we have... We will have to copy it in order to accomodate */ 1.1041 + /* for both backward and forward cycles */ 1.1042 + 1.1043 + /* we encountered either an empty space or a non-contraction element */ 1.1044 + /* this means we are constructing a new contraction sequence */ 1.1045 + element->cPoints++; 1.1046 + element->cSize--; 1.1047 + if(!isCntTableElement(existingCE)) { 1.1048 + /* if it wasn't contraction, we wouldn't end up here*/ 1.1049 + firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, existingCE, status); 1.1050 + uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); 1.1051 + uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status); 1.1052 + uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, status); 1.1053 + existingCE = constructContractCE(contractions->currentTag, firstContractionOffset); 1.1054 + } else { /* we are adding to existing contraction */ 1.1055 + /* there were already some elements in the table, so we need to add a new contraction */ 1.1056 + /* Two things can happen here: either the codepoint is already in the table, or it is not */ 1.1057 + int32_t position = uprv_cnttab_findCP(contractions, existingCE, *element->cPoints, status); 1.1058 + if(position > 0) { /* if it is we just continue down the chain */ 1.1059 + uint32_t eCE = uprv_cnttab_getCE(contractions, existingCE, position, status); 1.1060 + uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status); 1.1061 + uprv_cnttab_setContraction(contractions, existingCE, position, *(element->cPoints), newCE, status); 1.1062 + } else { /* if it isn't, we will have to create a new sequence */ 1.1063 + uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); 1.1064 + uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), newCE, status); 1.1065 + } 1.1066 + } 1.1067 + element->cPoints--; 1.1068 + element->cSize++; 1.1069 + return existingCE; 1.1070 +} 1.1071 + 1.1072 +static uint32_t uprv_uca_finalizeAddition(tempUCATable *t, UCAElements *element, UErrorCode *status) { 1.1073 + uint32_t CE = UCOL_NOT_FOUND; 1.1074 + // This should add a completely ignorable element to the 1.1075 + // unsafe table, so that backward iteration will skip 1.1076 + // over it when treating contractions. 1.1077 + uint32_t i = 0; 1.1078 + if(element->mapCE == 0) { 1.1079 + for(i = 0; i < element->cSize; i++) { 1.1080 + if(!U16_IS_TRAIL(element->cPoints[i])) { 1.1081 + unsafeCPSet(t->unsafeCP, element->cPoints[i]); 1.1082 + } 1.1083 + } 1.1084 + } 1.1085 + if(element->cSize > 1) { /* we're adding a contraction */ 1.1086 + uint32_t i = 0; 1.1087 + UChar32 cp; 1.1088 + 1.1089 + U16_NEXT(element->cPoints, i, element->cSize, cp); 1.1090 + /*CE = ucmpe32_get(t->mapping, cp);*/ 1.1091 + CE = utrie_get32(t->mapping, cp, NULL); 1.1092 + 1.1093 + CE = uprv_uca_addContraction(t, CE, element, status); 1.1094 + } else { /* easy case, */ 1.1095 + /*CE = ucmpe32_get(t->mapping, element->cPoints[0]);*/ 1.1096 + CE = utrie_get32(t->mapping, element->cPoints[0], NULL); 1.1097 + 1.1098 + if( CE != UCOL_NOT_FOUND) { 1.1099 + if(isCntTableElement(CE) /*isContraction(CE)*/) { /* adding a non contraction element (thai, expansion, single) to already existing contraction */ 1.1100 + if(!isPrefix(element->mapCE)) { // we cannot reenter prefix elements - as we are going to create a dead loop 1.1101 + // Only expansions and regular CEs can go here... Contractions will never happen in this place 1.1102 + uprv_cnttab_setContraction(t->contractions, CE, 0, 0, element->mapCE, status); 1.1103 + /* This loop has to change the CE at the end of contraction REDO!*/ 1.1104 + uprv_cnttab_changeLastCE(t->contractions, CE, element->mapCE, status); 1.1105 + } 1.1106 + } else { 1.1107 + /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/ 1.1108 + utrie_set32(t->mapping, element->cPoints[0], element->mapCE); 1.1109 + if ((element->prefixSize!=0) && (!isSpecial(CE) || (getCETag(CE)!=IMPLICIT_TAG))) { 1.1110 + UCAElements *origElem = (UCAElements *)uprv_malloc(sizeof(UCAElements)); 1.1111 + /* test for NULL */ 1.1112 + if (origElem== NULL) { 1.1113 + *status = U_MEMORY_ALLOCATION_ERROR; 1.1114 + return 0; 1.1115 + } 1.1116 + /* copy the original UCA value */ 1.1117 + origElem->prefixSize = 0; 1.1118 + origElem->prefix = NULL; 1.1119 + origElem->cPoints = origElem->uchars; 1.1120 + origElem->cPoints[0] = element->cPoints[0]; 1.1121 + origElem->cSize = 1; 1.1122 + origElem->CEs[0]=CE; 1.1123 + origElem->mapCE=CE; 1.1124 + origElem->noOfCEs=1; 1.1125 + uprv_uca_finalizeAddition(t, origElem, status); 1.1126 + uprv_free(origElem); 1.1127 + } 1.1128 +#ifdef UCOL_DEBUG 1.1129 + fprintf(stderr, "Warning - trying to overwrite existing data %08X for cp %04X with %08X\n", CE, element->cPoints[0], element->CEs[0]); 1.1130 + //*status = U_ILLEGAL_ARGUMENT_ERROR; 1.1131 +#endif 1.1132 + } 1.1133 + } else { 1.1134 + /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/ 1.1135 + utrie_set32(t->mapping, element->cPoints[0], element->mapCE); 1.1136 + } 1.1137 + } 1.1138 + return CE; 1.1139 +} 1.1140 + 1.1141 +/* This adds a read element, while testing for existence */ 1.1142 +U_CAPI uint32_t U_EXPORT2 1.1143 +uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status) { 1.1144 + U_NAMESPACE_USE 1.1145 + 1.1146 + ExpansionTable *expansions = t->expansions; 1.1147 + 1.1148 + uint32_t i = 1; 1.1149 + uint32_t expansion = 0; 1.1150 + uint32_t CE; 1.1151 + 1.1152 + if(U_FAILURE(*status)) { 1.1153 + return 0xFFFF; 1.1154 + } 1.1155 + 1.1156 + element->mapCE = 0; // clear mapCE so that we can catch expansions 1.1157 + 1.1158 + if(element->noOfCEs == 1) { 1.1159 + element->mapCE = element->CEs[0]; 1.1160 + } else { 1.1161 + /* ICU 2.1 long primaries */ 1.1162 + /* unfortunately, it looks like we have to look for a long primary here */ 1.1163 + /* since in canonical closure we are going to hit some long primaries from */ 1.1164 + /* the first phase, and they will come back as continuations/expansions */ 1.1165 + /* destroying the effect of the previous opitimization */ 1.1166 + /* A long primary is a three byte primary with starting secondaries and tertiaries */ 1.1167 + /* It can appear in long runs of only primary differences (like east Asian tailorings) */ 1.1168 + /* also, it should not be an expansion, as expansions would break with this */ 1.1169 + // This part came in from ucol_bld.cpp 1.1170 + //if(tok->expansion == 0 1.1171 + //&& noOfBytes[0] == 3 && noOfBytes[1] == 1 && noOfBytes[2] == 1 1.1172 + //&& CEparts[1] == (UCOL_BYTE_COMMON << 24) && CEparts[2] == (UCOL_BYTE_COMMON << 24)) { 1.1173 + /* we will construct a special CE that will go unchanged to the table */ 1.1174 + if(element->noOfCEs == 2 // a two CE expansion 1.1175 + && isContinuation(element->CEs[1]) // which is a continuation 1.1176 + && (element->CEs[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER))) == 0 // that has only primaries in continuation, 1.1177 + && (((element->CEs[0]>>8) & 0xFF) == UCOL_BYTE_COMMON) // a common secondary 1.1178 + && ((element->CEs[0] & 0xFF) == UCOL_BYTE_COMMON) // and a common tertiary 1.1179 + ) 1.1180 + { 1.1181 +#ifdef UCOL_DEBUG 1.1182 + fprintf(stdout, "Long primary %04X\n", element->cPoints[0]); 1.1183 +#endif 1.1184 + element->mapCE = UCOL_SPECIAL_FLAG | (LONG_PRIMARY_TAG<<24) // a long primary special 1.1185 + | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary 1.1186 + | ((element->CEs[1]>>24) & 0xFF); // third byte of primary 1.1187 + } 1.1188 + else { 1.1189 + expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT) 1.1190 + | (((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4) 1.1191 + & 0xFFFFF0)); 1.1192 + 1.1193 + for(i = 1; i<element->noOfCEs; i++) { 1.1194 + uprv_uca_addExpansion(expansions, element->CEs[i], status); 1.1195 + } 1.1196 + if(element->noOfCEs <= 0xF) { 1.1197 + expansion |= element->noOfCEs; 1.1198 + } else { 1.1199 + uprv_uca_addExpansion(expansions, 0, status); 1.1200 + } 1.1201 + element->mapCE = expansion; 1.1202 + uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1], 1.1203 + (uint8_t)element->noOfCEs, 1.1204 + t->maxExpansions, 1.1205 + status); 1.1206 + if(UCOL_ISJAMO(element->cPoints[0])) { 1.1207 + t->image->jamoSpecial = TRUE; 1.1208 + uprv_uca_setMaxJamoExpansion(element->cPoints[0], 1.1209 + element->CEs[element->noOfCEs - 1], 1.1210 + (uint8_t)element->noOfCEs, 1.1211 + t->maxJamoExpansions, 1.1212 + status); 1.1213 + } 1.1214 + if (U_FAILURE(*status)) { 1.1215 + return 0; 1.1216 + } 1.1217 + } 1.1218 + } 1.1219 + 1.1220 + // We treat digits differently - they are "uber special" and should be 1.1221 + // processed differently if numeric collation is on. 1.1222 + UChar32 uniChar = 0; 1.1223 + //printElement(element); 1.1224 + if ((element->cSize == 2) && U16_IS_LEAD(element->cPoints[0])){ 1.1225 + uniChar = U16_GET_SUPPLEMENTARY(element->cPoints[0], element->cPoints[1]); 1.1226 + } else if (element->cSize == 1){ 1.1227 + uniChar = element->cPoints[0]; 1.1228 + } 1.1229 + 1.1230 + // Here, we either have one normal CE OR mapCE is set. Therefore, we stuff only 1.1231 + // one element to the expansion buffer. When we encounter a digit and we don't 1.1232 + // do numeric collation, we will just pick the CE we have and break out of case 1.1233 + // (see ucol.cpp ucol_prv_getSpecialCE && ucol_prv_getSpecialPrevCE). If we picked 1.1234 + // a special, further processing will occur. If it's a simple CE, we'll return due 1.1235 + // to how the loop is constructed. 1.1236 + if (uniChar != 0 && u_isdigit(uniChar)){ 1.1237 + expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (DIGIT_TAG<<UCOL_TAG_SHIFT) | 1); // prepare the element 1.1238 + if(element->mapCE) { // if there is an expansion, we'll pick it here 1.1239 + expansion |= ((uprv_uca_addExpansion(expansions, element->mapCE, status)+(headersize>>2))<<4); 1.1240 + } else { 1.1241 + expansion |= ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4); 1.1242 + } 1.1243 + element->mapCE = expansion; 1.1244 + 1.1245 + // Need to go back to the beginning of the digit string if in the middle! 1.1246 + if(uniChar <= 0xFFFF) { // supplementaries are always unsafe. API takes UChars 1.1247 + unsafeCPSet(t->unsafeCP, (UChar)uniChar); 1.1248 + } 1.1249 + } 1.1250 + 1.1251 + // here we want to add the prefix structure. 1.1252 + // I will try to process it as a reverse contraction, if possible. 1.1253 + // prefix buffer is already reversed. 1.1254 + 1.1255 + if(element->prefixSize!=0) { 1.1256 + // We keep the seen prefix starter elements in a hashtable 1.1257 + // we need it to be able to distinguish between the simple 1.1258 + // codepoints and prefix starters. Also, we need to use it 1.1259 + // for canonical closure. 1.1260 + 1.1261 + UCAElements *composed = (UCAElements *)uprv_malloc(sizeof(UCAElements)); 1.1262 + /* test for NULL */ 1.1263 + if (composed == NULL) { 1.1264 + *status = U_MEMORY_ALLOCATION_ERROR; 1.1265 + return 0; 1.1266 + } 1.1267 + uprv_memcpy(composed, element, sizeof(UCAElements)); 1.1268 + composed->cPoints = composed->uchars; 1.1269 + composed->prefix = composed->prefixChars; 1.1270 + 1.1271 + composed->prefixSize = unorm_normalize(element->prefix, element->prefixSize, UNORM_NFC, 0, composed->prefix, 128, status); 1.1272 + 1.1273 + 1.1274 + if(t->prefixLookup != NULL) { 1.1275 + UCAElements *uCE = (UCAElements *)uhash_get(t->prefixLookup, element); 1.1276 + if(uCE != NULL) { // there is already a set of code points here 1.1277 + element->mapCE = uprv_uca_addPrefix(t, uCE->mapCE, element, status); 1.1278 + } else { // no code points, so this spot is clean 1.1279 + element->mapCE = uprv_uca_addPrefix(t, UCOL_NOT_FOUND, element, status); 1.1280 + uCE = (UCAElements *)uprv_malloc(sizeof(UCAElements)); 1.1281 + /* test for NULL */ 1.1282 + if (uCE == NULL) { 1.1283 + *status = U_MEMORY_ALLOCATION_ERROR; 1.1284 + return 0; 1.1285 + } 1.1286 + uprv_memcpy(uCE, element, sizeof(UCAElements)); 1.1287 + uCE->cPoints = uCE->uchars; 1.1288 + uhash_put(t->prefixLookup, uCE, uCE, status); 1.1289 + } 1.1290 + if(composed->prefixSize != element->prefixSize || uprv_memcmp(composed->prefix, element->prefix, element->prefixSize)) { 1.1291 + // do it! 1.1292 + composed->mapCE = uprv_uca_addPrefix(t, element->mapCE, composed, status); 1.1293 + } 1.1294 + } 1.1295 + uprv_free(composed); 1.1296 + } 1.1297 + 1.1298 + // We need to use the canonical iterator here 1.1299 + // the way we do it is to generate the canonically equivalent strings 1.1300 + // for the contraction and then add the sequences that pass FCD check 1.1301 + if(element->cSize > 1 && !(element->cSize==2 && U16_IS_LEAD(element->cPoints[0]) && U16_IS_TRAIL(element->cPoints[1]))) { // this is a contraction, we should check whether a composed form should also be included 1.1302 + UnicodeString source(element->cPoints, element->cSize); 1.1303 + CanonicalIterator it(source, *status); 1.1304 + source = it.next(); 1.1305 + while(!source.isBogus()) { 1.1306 + if(Normalizer::quickCheck(source, UNORM_FCD, *status) != UNORM_NO) { 1.1307 + element->cSize = source.extract(element->cPoints, 128, *status); 1.1308 + uprv_uca_finalizeAddition(t, element, status); 1.1309 + } 1.1310 + source = it.next(); 1.1311 + } 1.1312 + CE = element->mapCE; 1.1313 + } else { 1.1314 + CE = uprv_uca_finalizeAddition(t, element, status); 1.1315 + } 1.1316 + 1.1317 + return CE; 1.1318 +} 1.1319 + 1.1320 + 1.1321 +/*void uprv_uca_getMaxExpansionJamo(CompactEIntArray *mapping, */ 1.1322 +static void uprv_uca_getMaxExpansionJamo(UNewTrie *mapping, 1.1323 + MaxExpansionTable *maxexpansion, 1.1324 + MaxJamoExpansionTable *maxjamoexpansion, 1.1325 + UBool jamospecial, 1.1326 + UErrorCode *status) 1.1327 +{ 1.1328 + const uint32_t VBASE = 0x1161; 1.1329 + const uint32_t TBASE = 0x11A8; 1.1330 + const uint32_t VCOUNT = 21; 1.1331 + const uint32_t TCOUNT = 28; 1.1332 + 1.1333 + uint32_t v = VBASE + VCOUNT - 1; 1.1334 + uint32_t t = TBASE + TCOUNT - 1; 1.1335 + uint32_t ce; 1.1336 + 1.1337 + while (v >= VBASE) { 1.1338 + /*ce = ucmpe32_get(mapping, v);*/ 1.1339 + ce = utrie_get32(mapping, v, NULL); 1.1340 + if (ce < UCOL_SPECIAL_FLAG) { 1.1341 + uprv_uca_setMaxExpansion(ce, 2, maxexpansion, status); 1.1342 + } 1.1343 + v --; 1.1344 + } 1.1345 + 1.1346 + while (t >= TBASE) 1.1347 + { 1.1348 + /*ce = ucmpe32_get(mapping, t);*/ 1.1349 + ce = utrie_get32(mapping, t, NULL); 1.1350 + if (ce < UCOL_SPECIAL_FLAG) { 1.1351 + uprv_uca_setMaxExpansion(ce, 3, maxexpansion, status); 1.1352 + } 1.1353 + t --; 1.1354 + } 1.1355 + /* According to the docs, 99% of the time, the Jamo will not be special */ 1.1356 + if (jamospecial) { 1.1357 + /* gets the max expansion in all unicode characters */ 1.1358 + int count = maxjamoexpansion->position; 1.1359 + uint8_t maxTSize = (uint8_t)(maxjamoexpansion->maxLSize + 1.1360 + maxjamoexpansion->maxVSize + 1.1361 + maxjamoexpansion->maxTSize); 1.1362 + uint8_t maxVSize = (uint8_t)(maxjamoexpansion->maxLSize + 1.1363 + maxjamoexpansion->maxVSize); 1.1364 + 1.1365 + while (count > 0) { 1.1366 + count --; 1.1367 + if (*(maxjamoexpansion->isV + count) == TRUE) { 1.1368 + uprv_uca_setMaxExpansion( 1.1369 + *(maxjamoexpansion->endExpansionCE + count), 1.1370 + maxVSize, maxexpansion, status); 1.1371 + } 1.1372 + else { 1.1373 + uprv_uca_setMaxExpansion( 1.1374 + *(maxjamoexpansion->endExpansionCE + count), 1.1375 + maxTSize, maxexpansion, status); 1.1376 + } 1.1377 + } 1.1378 + } 1.1379 +} 1.1380 + 1.1381 +U_CDECL_BEGIN 1.1382 +static inline uint32_t U_CALLCONV 1.1383 +getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) 1.1384 +{ 1.1385 + uint32_t value; 1.1386 + uint32_t tag; 1.1387 + UChar32 limit; 1.1388 + UBool inBlockZero; 1.1389 + 1.1390 + limit=start+0x400; 1.1391 + while(start<limit) { 1.1392 + value=utrie_get32(trie, start, &inBlockZero); 1.1393 + tag = getCETag(value); 1.1394 + if(inBlockZero == TRUE) { 1.1395 + start+=UTRIE_DATA_BLOCK_LENGTH; 1.1396 + } else if(!(isSpecial(value) && (tag == IMPLICIT_TAG || tag == NOT_FOUND_TAG))) { 1.1397 + /* These are values that are starting in either UCA (IMPLICIT_TAG) or in the 1.1398 + * tailorings (NOT_FOUND_TAG). Presence of these tags means that there is 1.1399 + * nothing in this position and that it should be skipped. 1.1400 + */ 1.1401 +#ifdef UCOL_DEBUG 1.1402 + static int32_t count = 1; 1.1403 + fprintf(stdout, "%i, Folded %08X, value %08X\n", count++, start, value); 1.1404 +#endif 1.1405 + return (uint32_t)(UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24) | offset); 1.1406 + } else { 1.1407 + ++start; 1.1408 + } 1.1409 + } 1.1410 + return 0; 1.1411 +} 1.1412 +U_CDECL_END 1.1413 + 1.1414 +#ifdef UCOL_DEBUG 1.1415 +// This is a debug function to print the contents of a trie. 1.1416 +// It is used in conjuction with the code around utrie_unserialize call 1.1417 +UBool enumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) { 1.1418 + if(start<0x10000) { 1.1419 + fprintf(stdout, "%08X, %08X, %08X\n", start, limit, value); 1.1420 + } else { 1.1421 + fprintf(stdout, "%08X=%04X %04X, %08X=%04X %04X, %08X\n", start, U16_LEAD(start), U16_TRAIL(start), limit, U16_LEAD(limit), U16_TRAIL(limit), value); 1.1422 + } 1.1423 + return TRUE; 1.1424 +} 1.1425 + 1.1426 +int32_t 1.1427 +myGetFoldingOffset(uint32_t data) { 1.1428 + if(data > UCOL_NOT_FOUND && getCETag(data) == SURROGATE_TAG) { 1.1429 + return (data&0xFFFFFF); 1.1430 + } else { 1.1431 + return 0; 1.1432 + } 1.1433 +} 1.1434 +#endif 1.1435 + 1.1436 +U_CAPI UCATableHeader* U_EXPORT2 1.1437 +uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) { 1.1438 + /*CompactEIntArray *mapping = t->mapping;*/ 1.1439 + UNewTrie *mapping = t->mapping; 1.1440 + ExpansionTable *expansions = t->expansions; 1.1441 + CntTable *contractions = t->contractions; 1.1442 + MaxExpansionTable *maxexpansion = t->maxExpansions; 1.1443 + 1.1444 + if(U_FAILURE(*status)) { 1.1445 + return NULL; 1.1446 + } 1.1447 + 1.1448 + uint32_t beforeContractions = (uint32_t)((headersize+paddedsize(expansions->position*sizeof(uint32_t)))/sizeof(UChar)); 1.1449 + 1.1450 + int32_t contractionsSize = 0; 1.1451 + contractionsSize = uprv_cnttab_constructTable(contractions, beforeContractions, status); 1.1452 + 1.1453 + /* the following operation depends on the trie data. Therefore, we have to do it before */ 1.1454 + /* the trie is compacted */ 1.1455 + /* sets jamo expansions */ 1.1456 + uprv_uca_getMaxExpansionJamo(mapping, maxexpansion, t->maxJamoExpansions, 1.1457 + t->image->jamoSpecial, status); 1.1458 + 1.1459 + /*ucmpe32_compact(mapping);*/ 1.1460 + /*UMemoryStream *ms = uprv_mstrm_openNew(8192);*/ 1.1461 + /*int32_t mappingSize = ucmpe32_flattenMem(mapping, ms);*/ 1.1462 + /*const uint8_t *flattened = uprv_mstrm_getBuffer(ms, &mappingSize);*/ 1.1463 + 1.1464 + // After setting the jamo expansions, compact the trie and get the needed size 1.1465 + int32_t mappingSize = utrie_serialize(mapping, NULL, 0, getFoldedValue /*getFoldedValue*/, FALSE, status); 1.1466 + 1.1467 + uint32_t tableOffset = 0; 1.1468 + uint8_t *dataStart; 1.1469 + 1.1470 + /* TODO: LATIN1 array is now in the utrie - it should be removed from the calculation */ 1.1471 + 1.1472 + uint32_t toAllocate =(uint32_t)(headersize+ 1.1473 + paddedsize(expansions->position*sizeof(uint32_t))+ 1.1474 + paddedsize(mappingSize)+ 1.1475 + paddedsize(contractionsSize*(sizeof(UChar)+sizeof(uint32_t)))+ 1.1476 + //paddedsize(0x100*sizeof(uint32_t)) /* Latin1 is now included in the trie */ 1.1477 + /* maxexpansion array */ 1.1478 + + paddedsize(maxexpansion->position * sizeof(uint32_t)) + 1.1479 + /* maxexpansion size array */ 1.1480 + paddedsize(maxexpansion->position * sizeof(uint8_t)) + 1.1481 + paddedsize(UCOL_UNSAFECP_TABLE_SIZE) + /* Unsafe chars */ 1.1482 + paddedsize(UCOL_UNSAFECP_TABLE_SIZE)); /* Contraction Ending chars */ 1.1483 + 1.1484 + 1.1485 + dataStart = (uint8_t *)uprv_malloc(toAllocate); 1.1486 + /* test for NULL */ 1.1487 + if (dataStart == NULL) { 1.1488 + *status = U_MEMORY_ALLOCATION_ERROR; 1.1489 + return NULL; 1.1490 + } 1.1491 + 1.1492 + UCATableHeader *myData = (UCATableHeader *)dataStart; 1.1493 + // Please, do reset all the fields! 1.1494 + uprv_memset(dataStart, 0, toAllocate); 1.1495 + // Make sure we know this is reset 1.1496 + myData->magic = UCOL_HEADER_MAGIC; 1.1497 + myData->isBigEndian = U_IS_BIG_ENDIAN; 1.1498 + myData->charSetFamily = U_CHARSET_FAMILY; 1.1499 + myData->formatVersion[0] = UCA_FORMAT_VERSION_0; 1.1500 + myData->formatVersion[1] = UCA_FORMAT_VERSION_1; 1.1501 + myData->formatVersion[2] = UCA_FORMAT_VERSION_2; 1.1502 + myData->formatVersion[3] = UCA_FORMAT_VERSION_3; 1.1503 + myData->jamoSpecial = t->image->jamoSpecial; 1.1504 + 1.1505 + // Don't copy stuff from UCA header! 1.1506 + //uprv_memcpy(myData, t->image, sizeof(UCATableHeader)); 1.1507 + 1.1508 + myData->contractionSize = contractionsSize; 1.1509 + 1.1510 + tableOffset += (uint32_t)(paddedsize(sizeof(UCATableHeader))); 1.1511 + 1.1512 + myData->options = tableOffset; 1.1513 + uprv_memcpy(dataStart+tableOffset, t->options, sizeof(UColOptionSet)); 1.1514 + tableOffset += (uint32_t)(paddedsize(sizeof(UColOptionSet))); 1.1515 + 1.1516 + /* copy expansions */ 1.1517 + /*myData->expansion = (uint32_t *)dataStart+tableOffset;*/ 1.1518 + myData->expansion = tableOffset; 1.1519 + uprv_memcpy(dataStart+tableOffset, expansions->CEs, expansions->position*sizeof(uint32_t)); 1.1520 + tableOffset += (uint32_t)(paddedsize(expansions->position*sizeof(uint32_t))); 1.1521 + 1.1522 + /* contractions block */ 1.1523 + if(contractionsSize != 0) { 1.1524 + /* copy contraction index */ 1.1525 + /*myData->contractionIndex = (UChar *)(dataStart+tableOffset);*/ 1.1526 + myData->contractionIndex = tableOffset; 1.1527 + uprv_memcpy(dataStart+tableOffset, contractions->codePoints, contractionsSize*sizeof(UChar)); 1.1528 + tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(UChar))); 1.1529 + 1.1530 + /* copy contraction collation elements */ 1.1531 + /*myData->contractionCEs = (uint32_t *)(dataStart+tableOffset);*/ 1.1532 + myData->contractionCEs = tableOffset; 1.1533 + uprv_memcpy(dataStart+tableOffset, contractions->CEs, contractionsSize*sizeof(uint32_t)); 1.1534 + tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(uint32_t))); 1.1535 + } else { 1.1536 + myData->contractionIndex = 0; 1.1537 + myData->contractionCEs = 0; 1.1538 + } 1.1539 + 1.1540 + /* copy mapping table */ 1.1541 + /*myData->mappingPosition = dataStart+tableOffset;*/ 1.1542 + /*myData->mappingPosition = tableOffset;*/ 1.1543 + /*uprv_memcpy(dataStart+tableOffset, flattened, mappingSize);*/ 1.1544 + 1.1545 + myData->mappingPosition = tableOffset; 1.1546 + utrie_serialize(mapping, dataStart+tableOffset, toAllocate-tableOffset, getFoldedValue, FALSE, status); 1.1547 +#ifdef UCOL_DEBUG 1.1548 + // This is debug code to dump the contents of the trie. It needs two functions defined above 1.1549 + { 1.1550 + UTrie UCAt = { 0 }; 1.1551 + uint32_t trieWord; 1.1552 + utrie_unserialize(&UCAt, dataStart+tableOffset, 9999999, status); 1.1553 + UCAt.getFoldingOffset = myGetFoldingOffset; 1.1554 + if(U_SUCCESS(*status)) { 1.1555 + utrie_enum(&UCAt, NULL, enumRange, NULL); 1.1556 + } 1.1557 + trieWord = UTRIE_GET32_FROM_LEAD(&UCAt, 0xDC01); 1.1558 + } 1.1559 +#endif 1.1560 + tableOffset += paddedsize(mappingSize); 1.1561 + 1.1562 + 1.1563 + int32_t i = 0; 1.1564 + 1.1565 + /* copy max expansion table */ 1.1566 + myData->endExpansionCE = tableOffset; 1.1567 + myData->endExpansionCECount = maxexpansion->position - 1; 1.1568 + /* not copying the first element which is a dummy */ 1.1569 + uprv_memcpy(dataStart + tableOffset, maxexpansion->endExpansionCE + 1, 1.1570 + (maxexpansion->position - 1) * sizeof(uint32_t)); 1.1571 + tableOffset += (uint32_t)(paddedsize((maxexpansion->position)* sizeof(uint32_t))); 1.1572 + myData->expansionCESize = tableOffset; 1.1573 + uprv_memcpy(dataStart + tableOffset, maxexpansion->expansionCESize + 1, 1.1574 + (maxexpansion->position - 1) * sizeof(uint8_t)); 1.1575 + tableOffset += (uint32_t)(paddedsize((maxexpansion->position)* sizeof(uint8_t))); 1.1576 + 1.1577 + /* Unsafe chars table. Finish it off, then copy it. */ 1.1578 + uprv_uca_unsafeCPAddCCNZ(t, status); 1.1579 + if (t->UCA != 0) { /* Or in unsafebits from UCA, making a combined table. */ 1.1580 + for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) { 1.1581 + t->unsafeCP[i] |= t->UCA->unsafeCP[i]; 1.1582 + } 1.1583 + } 1.1584 + myData->unsafeCP = tableOffset; 1.1585 + uprv_memcpy(dataStart + tableOffset, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE); 1.1586 + tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE); 1.1587 + 1.1588 + 1.1589 + /* Finish building Contraction Ending chars hash table and then copy it out. */ 1.1590 + if (t->UCA != 0) { /* Or in unsafebits from UCA, making a combined table. */ 1.1591 + for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) { 1.1592 + t->contrEndCP[i] |= t->UCA->contrEndCP[i]; 1.1593 + } 1.1594 + } 1.1595 + myData->contrEndCP = tableOffset; 1.1596 + uprv_memcpy(dataStart + tableOffset, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE); 1.1597 + tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE); 1.1598 + 1.1599 + if(tableOffset != toAllocate) { 1.1600 +#ifdef UCOL_DEBUG 1.1601 + fprintf(stderr, "calculation screwup!!! Expected to write %i but wrote %i instead!!!\n", toAllocate, tableOffset); 1.1602 +#endif 1.1603 + *status = U_INTERNAL_PROGRAM_ERROR; 1.1604 + uprv_free(dataStart); 1.1605 + return 0; 1.1606 + } 1.1607 + 1.1608 + myData->size = tableOffset; 1.1609 + /* This should happen upon ressurection */ 1.1610 + /*const uint8_t *mapPosition = (uint8_t*)myData+myData->mappingPosition;*/ 1.1611 + /*uprv_mstrm_close(ms);*/ 1.1612 + return myData; 1.1613 +} 1.1614 + 1.1615 + 1.1616 +struct enumStruct { 1.1617 + tempUCATable *t; 1.1618 + UCollator *tempColl; 1.1619 + UCollationElements* colEl; 1.1620 + const Normalizer2Impl *nfcImpl; 1.1621 + UnicodeSet *closed; 1.1622 + int32_t noOfClosures; 1.1623 + UErrorCode *status; 1.1624 +}; 1.1625 +U_CDECL_BEGIN 1.1626 +static UBool U_CALLCONV 1.1627 +_enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 limit, UCharCategory type) { 1.1628 + 1.1629 + if (type != U_UNASSIGNED && type != U_PRIVATE_USE_CHAR) { // if the range is assigned - we might ommit more categories later 1.1630 + UErrorCode *status = ((enumStruct *)context)->status; 1.1631 + tempUCATable *t = ((enumStruct *)context)->t; 1.1632 + UCollator *tempColl = ((enumStruct *)context)->tempColl; 1.1633 + UCollationElements* colEl = ((enumStruct *)context)->colEl; 1.1634 + UCAElements el; 1.1635 + UChar decompBuffer[4]; 1.1636 + const UChar *decomp; 1.1637 + int32_t noOfDec = 0; 1.1638 + 1.1639 + UChar32 u32 = 0; 1.1640 + UChar comp[2]; 1.1641 + uint32_t len = 0; 1.1642 + 1.1643 + for(u32 = start; u32 < limit; u32++) { 1.1644 + decomp = ((enumStruct *)context)->nfcImpl-> 1.1645 + getDecomposition(u32, decompBuffer, noOfDec); 1.1646 + //if((noOfDec = unorm_normalize(comp, len, UNORM_NFD, 0, decomp, 256, status)) > 1 1.1647 + //|| (noOfDec == 1 && *decomp != (UChar)u32)) 1.1648 + if(decomp != NULL) 1.1649 + { 1.1650 + len = 0; 1.1651 + U16_APPEND_UNSAFE(comp, len, u32); 1.1652 + if(ucol_strcoll(tempColl, comp, len, decomp, noOfDec) != UCOL_EQUAL) { 1.1653 +#ifdef UCOL_DEBUG 1.1654 + fprintf(stderr, "Closure: U+%04X -> ", u32); 1.1655 + UChar32 c; 1.1656 + int32_t i = 0; 1.1657 + while(i < noOfDec) { 1.1658 + U16_NEXT(decomp, i, noOfDec, c); 1.1659 + fprintf(stderr, "%04X ", c); 1.1660 + } 1.1661 + fprintf(stderr, "\n"); 1.1662 + // print CEs for code point vs. decomposition 1.1663 + fprintf(stderr, "U+%04X CEs: ", u32); 1.1664 + UCollationElements *iter = ucol_openElements(tempColl, comp, len, status); 1.1665 + int32_t ce; 1.1666 + while((ce = ucol_next(iter, status)) != UCOL_NULLORDER) { 1.1667 + fprintf(stderr, "%08X ", ce); 1.1668 + } 1.1669 + fprintf(stderr, "\nDecomp CEs: "); 1.1670 + ucol_setText(iter, decomp, noOfDec, status); 1.1671 + while((ce = ucol_next(iter, status)) != UCOL_NULLORDER) { 1.1672 + fprintf(stderr, "%08X ", ce); 1.1673 + } 1.1674 + fprintf(stderr, "\n"); 1.1675 + ucol_closeElements(iter); 1.1676 +#endif 1.1677 + if(((enumStruct *)context)->closed != NULL) { 1.1678 + ((enumStruct *)context)->closed->add(u32); 1.1679 + } 1.1680 + ((enumStruct *)context)->noOfClosures++; 1.1681 + el.cPoints = (UChar *)decomp; 1.1682 + el.cSize = noOfDec; 1.1683 + el.noOfCEs = 0; 1.1684 + el.prefix = el.prefixChars; 1.1685 + el.prefixSize = 0; 1.1686 + 1.1687 + UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, &el); 1.1688 + el.cPoints = comp; 1.1689 + el.cSize = len; 1.1690 + el.prefix = el.prefixChars; 1.1691 + el.prefixSize = 0; 1.1692 + if(prefix == NULL) { 1.1693 + el.noOfCEs = 0; 1.1694 + ucol_setText(colEl, decomp, noOfDec, status); 1.1695 + while((el.CEs[el.noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1.1696 + el.noOfCEs++; 1.1697 + } 1.1698 + } else { 1.1699 + el.noOfCEs = 1; 1.1700 + el.CEs[0] = prefix->mapCE; 1.1701 + // This character uses a prefix. We have to add it 1.1702 + // to the unsafe table, as it decomposed form is already 1.1703 + // in. In Japanese, this happens for \u309e & \u30fe 1.1704 + // Since unsafeCPSet is static in ucol_elm, we are going 1.1705 + // to wrap it up in the uprv_uca_unsafeCPAddCCNZ function 1.1706 + } 1.1707 + uprv_uca_addAnElement(t, &el, status); 1.1708 + } 1.1709 + } 1.1710 + } 1.1711 + } 1.1712 + return TRUE; 1.1713 +} 1.1714 +U_CDECL_END 1.1715 + 1.1716 +static void 1.1717 +uprv_uca_setMapCE(tempUCATable *t, UCAElements *element, UErrorCode *status) { 1.1718 + uint32_t expansion = 0; 1.1719 + int32_t j; 1.1720 + 1.1721 + ExpansionTable *expansions = t->expansions; 1.1722 + if(element->noOfCEs == 2 // a two CE expansion 1.1723 + && isContinuation(element->CEs[1]) // which is a continuation 1.1724 + && (element->CEs[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER))) == 0 // that has only primaries in continuation, 1.1725 + && (((element->CEs[0]>>8) & 0xFF) == UCOL_BYTE_COMMON) // a common secondary 1.1726 + && ((element->CEs[0] & 0xFF) == UCOL_BYTE_COMMON) // and a common tertiary 1.1727 + ) { 1.1728 + element->mapCE = UCOL_SPECIAL_FLAG | (LONG_PRIMARY_TAG<<24) // a long primary special 1.1729 + | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary 1.1730 + | ((element->CEs[1]>>24) & 0xFF); // third byte of primary 1.1731 + } else { 1.1732 + expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT) 1.1733 + | (((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4) 1.1734 + & 0xFFFFF0)); 1.1735 + 1.1736 + for(j = 1; j<(int32_t)element->noOfCEs; j++) { 1.1737 + uprv_uca_addExpansion(expansions, element->CEs[j], status); 1.1738 + } 1.1739 + if(element->noOfCEs <= 0xF) { 1.1740 + expansion |= element->noOfCEs; 1.1741 + } else { 1.1742 + uprv_uca_addExpansion(expansions, 0, status); 1.1743 + } 1.1744 + element->mapCE = expansion; 1.1745 + uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1], 1.1746 + (uint8_t)element->noOfCEs, 1.1747 + t->maxExpansions, 1.1748 + status); 1.1749 + } 1.1750 +} 1.1751 + 1.1752 +static void 1.1753 +uprv_uca_addFCD4AccentedContractions(tempUCATable *t, 1.1754 + UCollationElements* colEl, 1.1755 + UChar *data, 1.1756 + int32_t len, 1.1757 + UCAElements *el, 1.1758 + UErrorCode *status) { 1.1759 + UChar decomp[256], comp[256]; 1.1760 + int32_t decLen, compLen; 1.1761 + 1.1762 + decLen = unorm_normalize(data, len, UNORM_NFD, 0, decomp, 256, status); 1.1763 + compLen = unorm_normalize(data, len, UNORM_NFC, 0, comp, 256, status); 1.1764 + decomp[decLen] = comp[compLen] = 0; 1.1765 + 1.1766 + el->cPoints = decomp; 1.1767 + el->cSize = decLen; 1.1768 + el->noOfCEs = 0; 1.1769 + el->prefixSize = 0; 1.1770 + el->prefix = el->prefixChars; 1.1771 + 1.1772 + UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, el); 1.1773 + el->cPoints = comp; 1.1774 + el->cSize = compLen; 1.1775 + el->prefix = el->prefixChars; 1.1776 + el->prefixSize = 0; 1.1777 + if(prefix == NULL) { 1.1778 + el->noOfCEs = 0; 1.1779 + ucol_setText(colEl, decomp, decLen, status); 1.1780 + while((el->CEs[el->noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1.1781 + el->noOfCEs++; 1.1782 + } 1.1783 + uprv_uca_setMapCE(t, el, status); 1.1784 + uprv_uca_addAnElement(t, el, status); 1.1785 + } 1.1786 + el->cPoints=NULL; /* don't leak reference to stack */ 1.1787 +} 1.1788 + 1.1789 +static void 1.1790 +uprv_uca_addMultiCMContractions(tempUCATable *t, 1.1791 + UCollationElements* colEl, 1.1792 + tempTailorContext *c, 1.1793 + UCAElements *el, 1.1794 + UErrorCode *status) { 1.1795 + CombinClassTable *cmLookup = t->cmLookup; 1.1796 + UChar newDecomp[256]; 1.1797 + int32_t maxComp, newDecLen; 1.1798 + const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); 1.1799 + if (U_FAILURE(*status)) { 1.1800 + return; 1.1801 + } 1.1802 + int16_t curClass = nfcImpl->getFCD16(c->tailoringCM) & 0xff; 1.1803 + CompData *precomp = c->precomp; 1.1804 + int32_t compLen = c->compLen; 1.1805 + UChar *comp = c->comp; 1.1806 + maxComp = c->precompLen; 1.1807 + 1.1808 + for (int32_t j=0; j < maxComp; j++) { 1.1809 + int32_t count=0; 1.1810 + do { 1.1811 + if ( count == 0 ) { // Decompose the saved precomposed char. 1.1812 + UChar temp[2]; 1.1813 + temp[0]=precomp[j].cp; 1.1814 + temp[1]=0; 1.1815 + newDecLen = unorm_normalize(temp, 1, UNORM_NFD, 0, 1.1816 + newDecomp, sizeof(newDecomp)/sizeof(UChar), status); 1.1817 + newDecomp[newDecLen++] = cmLookup->cPoints[c->cmPos]; 1.1818 + } 1.1819 + else { // swap 2 combining marks when they are equal. 1.1820 + uprv_memcpy(newDecomp, c->decomp, sizeof(UChar)*(c->decompLen)); 1.1821 + newDecLen = c->decompLen; 1.1822 + newDecomp[newDecLen++] = precomp[j].cClass; 1.1823 + } 1.1824 + newDecomp[newDecLen] = 0; 1.1825 + compLen = unorm_normalize(newDecomp, newDecLen, UNORM_NFC, 0, 1.1826 + comp, 256, status); 1.1827 + if (compLen==1) { 1.1828 + comp[compLen++] = newDecomp[newDecLen++] = c->tailoringCM; 1.1829 + comp[compLen] = newDecomp[newDecLen] = 0; 1.1830 + el->cPoints = newDecomp; 1.1831 + el->cSize = newDecLen; 1.1832 + 1.1833 + UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, el); 1.1834 + el->cPoints = c->comp; 1.1835 + el->cSize = compLen; 1.1836 + el->prefix = el->prefixChars; 1.1837 + el->prefixSize = 0; 1.1838 + if(prefix == NULL) { 1.1839 + el->noOfCEs = 0; 1.1840 + ucol_setText(colEl, newDecomp, newDecLen, status); 1.1841 + while((el->CEs[el->noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1.1842 + el->noOfCEs++; 1.1843 + } 1.1844 + uprv_uca_setMapCE(t, el, status); 1.1845 + uprv_uca_finalizeAddition(t, el, status); 1.1846 + 1.1847 + // Save the current precomposed char and its class to find any 1.1848 + // other combining mark combinations. 1.1849 + precomp[c->precompLen].cp=comp[0]; 1.1850 + precomp[c->precompLen].cClass = curClass; 1.1851 + c->precompLen++; 1.1852 + } 1.1853 + } 1.1854 + } while (++count<2 && (precomp[j].cClass == curClass)); 1.1855 + } 1.1856 + 1.1857 +} 1.1858 + 1.1859 +static void 1.1860 +uprv_uca_addTailCanonicalClosures(tempUCATable *t, 1.1861 + UCollationElements* colEl, 1.1862 + UChar baseCh, 1.1863 + UChar cMark, 1.1864 + UCAElements *el, 1.1865 + UErrorCode *status) { 1.1866 + CombinClassTable *cmLookup = t->cmLookup; 1.1867 + const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); 1.1868 + if (U_FAILURE(*status)) { 1.1869 + return; 1.1870 + } 1.1871 + int16_t maxIndex = nfcImpl->getFCD16(cMark) & 0xff; 1.1872 + UCAElements element; 1.1873 + uint16_t *index; 1.1874 + UChar decomp[256]; 1.1875 + UChar comp[256]; 1.1876 + CompData precomp[256]; // precomposed array 1.1877 + int32_t precompLen = 0; // count for precomp 1.1878 + int32_t i, len, decompLen, replacedPos; 1.1879 + tempTailorContext c; 1.1880 + 1.1881 + if ( cmLookup == NULL ) { 1.1882 + return; 1.1883 + } 1.1884 + index = cmLookup->index; 1.1885 + int32_t cClass=nfcImpl->getFCD16(cMark) & 0xff; 1.1886 + maxIndex = (int32_t)index[(nfcImpl->getFCD16(cMark) & 0xff)-1]; 1.1887 + c.comp = comp; 1.1888 + c.decomp = decomp; 1.1889 + c.precomp = precomp; 1.1890 + c.tailoringCM = cMark; 1.1891 + 1.1892 + if (cClass>0) { 1.1893 + maxIndex = (int32_t)index[cClass-1]; 1.1894 + } 1.1895 + else { 1.1896 + maxIndex=0; 1.1897 + } 1.1898 + decomp[0]=baseCh; 1.1899 + for ( i=0; i<maxIndex ; i++ ) { 1.1900 + decomp[1] = cmLookup->cPoints[i]; 1.1901 + decomp[2]=0; 1.1902 + decompLen=2; 1.1903 + len = unorm_normalize(decomp, decompLen, UNORM_NFC, 0, comp, 256, status); 1.1904 + if (len==1) { 1.1905 + // Save the current precomposed char and its class to find any 1.1906 + // other combining mark combinations. 1.1907 + precomp[precompLen].cp=comp[0]; 1.1908 + precomp[precompLen].cClass = 1.1909 + index[nfcImpl->getFCD16(decomp[1]) & 0xff]; 1.1910 + precompLen++; 1.1911 + replacedPos=0; 1.1912 + for (decompLen=0; decompLen< (int32_t)el->cSize; decompLen++) { 1.1913 + decomp[decompLen] = el->cPoints[decompLen]; 1.1914 + if (decomp[decompLen]==cMark) { 1.1915 + replacedPos = decompLen; // record the position for later use 1.1916 + } 1.1917 + } 1.1918 + if ( replacedPos != 0 ) { 1.1919 + decomp[replacedPos]=cmLookup->cPoints[i]; 1.1920 + } 1.1921 + decomp[decompLen] = 0; 1.1922 + len = unorm_normalize(decomp, decompLen, UNORM_NFC, 0, comp, 256, status); 1.1923 + comp[len++] = decomp[decompLen++] = cMark; 1.1924 + comp[len] = decomp[decompLen] = 0; 1.1925 + element.cPoints = decomp; 1.1926 + element.cSize = decompLen; 1.1927 + element.noOfCEs = 0; 1.1928 + element.prefix = el->prefixChars; 1.1929 + element.prefixSize = 0; 1.1930 + 1.1931 + UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, &element); 1.1932 + element.cPoints = comp; 1.1933 + element.cSize = len; 1.1934 + element.prefix = el->prefixChars; 1.1935 + element.prefixSize = 0; 1.1936 + if(prefix == NULL) { 1.1937 + element.noOfCEs = 0; 1.1938 + ucol_setText(colEl, decomp, decompLen, status); 1.1939 + while((element.CEs[element.noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1.1940 + element.noOfCEs++; 1.1941 + } 1.1942 + uprv_uca_setMapCE(t, &element, status); 1.1943 + uprv_uca_finalizeAddition(t, &element, status); 1.1944 + } 1.1945 + 1.1946 + // This is a fix for tailoring contractions with accented 1.1947 + // character at the end of contraction string. 1.1948 + if ((len>2) && 1.1949 + (nfcImpl->getFCD16(comp[len-2]) & 0xff00)==0) { 1.1950 + uprv_uca_addFCD4AccentedContractions(t, colEl, comp, len, &element, status); 1.1951 + } 1.1952 + 1.1953 + if (precompLen >1) { 1.1954 + c.compLen = len; 1.1955 + c.decompLen = decompLen; 1.1956 + c.precompLen = precompLen; 1.1957 + c.cmPos = i; 1.1958 + uprv_uca_addMultiCMContractions(t, colEl, &c, &element, status); 1.1959 + precompLen = c.precompLen; 1.1960 + } 1.1961 + } 1.1962 + } 1.1963 +} 1.1964 + 1.1965 +U_CFUNC int32_t U_EXPORT2 1.1966 +uprv_uca_canonicalClosure(tempUCATable *t, 1.1967 + UColTokenParser *src, 1.1968 + UnicodeSet *closed, 1.1969 + UErrorCode *status) 1.1970 +{ 1.1971 + enumStruct context; 1.1972 + context.closed = closed; 1.1973 + context.noOfClosures = 0; 1.1974 + UCAElements el; 1.1975 + UColToken *tok; 1.1976 + uint32_t i = 0, j = 0; 1.1977 + UChar baseChar, firstCM; 1.1978 + context.nfcImpl=Normalizer2Factory::getNFCImpl(*status); 1.1979 + if(U_FAILURE(*status)) { 1.1980 + return 0; 1.1981 + } 1.1982 + 1.1983 + UCollator *tempColl = NULL; 1.1984 + tempUCATable *tempTable = uprv_uca_cloneTempTable(t, status); 1.1985 + // Check for null pointer 1.1986 + if (U_FAILURE(*status)) { 1.1987 + return 0; 1.1988 + } 1.1989 + 1.1990 + UCATableHeader *tempData = uprv_uca_assembleTable(tempTable, status); 1.1991 + tempColl = ucol_initCollator(tempData, 0, t->UCA, status); 1.1992 + if ( tempTable->cmLookup != NULL ) { 1.1993 + t->cmLookup = tempTable->cmLookup; // copy over to t 1.1994 + tempTable->cmLookup = NULL; 1.1995 + } 1.1996 + uprv_uca_closeTempTable(tempTable); 1.1997 + 1.1998 + if(U_SUCCESS(*status)) { 1.1999 + tempColl->ucaRules = NULL; 1.2000 + tempColl->actualLocale = NULL; 1.2001 + tempColl->validLocale = NULL; 1.2002 + tempColl->requestedLocale = NULL; 1.2003 + tempColl->hasRealData = TRUE; 1.2004 + tempColl->freeImageOnClose = TRUE; 1.2005 + } else if(tempData != 0) { 1.2006 + uprv_free(tempData); 1.2007 + } 1.2008 + 1.2009 + /* produce canonical closure */ 1.2010 + UCollationElements* colEl = ucol_openElements(tempColl, NULL, 0, status); 1.2011 + // Check for null pointer 1.2012 + if (U_FAILURE(*status)) { 1.2013 + return 0; 1.2014 + } 1.2015 + context.t = t; 1.2016 + context.tempColl = tempColl; 1.2017 + context.colEl = colEl; 1.2018 + context.status = status; 1.2019 + u_enumCharTypes(_enumCategoryRangeClosureCategory, &context); 1.2020 + 1.2021 + if ( (src==NULL) || !src->buildCCTabFlag ) { 1.2022 + ucol_closeElements(colEl); 1.2023 + ucol_close(tempColl); 1.2024 + return context.noOfClosures; // no extra contraction needed to add 1.2025 + } 1.2026 + 1.2027 + for (i=0; i < src->resultLen; i++) { 1.2028 + baseChar = firstCM= (UChar)0; 1.2029 + tok = src->lh[i].first; 1.2030 + while (tok != NULL && U_SUCCESS(*status)) { 1.2031 + el.prefix = el.prefixChars; 1.2032 + el.cPoints = el.uchars; 1.2033 + if(tok->prefix != 0) { 1.2034 + el.prefixSize = tok->prefix>>24; 1.2035 + uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar)); 1.2036 + 1.2037 + el.cSize = (tok->source >> 24)-(tok->prefix>>24); 1.2038 + uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar)); 1.2039 + } else { 1.2040 + el.prefixSize = 0; 1.2041 + *el.prefix = 0; 1.2042 + 1.2043 + el.cSize = (tok->source >> 24); 1.2044 + uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); 1.2045 + } 1.2046 + if(src->UCA != NULL) { 1.2047 + for(j = 0; j<el.cSize; j++) { 1.2048 + int16_t fcd = context.nfcImpl->getFCD16(el.cPoints[j]); 1.2049 + if ( (fcd & 0xff) == 0 ) { 1.2050 + baseChar = el.cPoints[j]; // last base character 1.2051 + firstCM=0; // reset combining mark value 1.2052 + } 1.2053 + else { 1.2054 + if ( (baseChar!=0) && (firstCM==0) ) { 1.2055 + firstCM = el.cPoints[j]; // first combining mark 1.2056 + } 1.2057 + } 1.2058 + } 1.2059 + } 1.2060 + if ( (baseChar!= (UChar)0) && (firstCM != (UChar)0) ) { 1.2061 + // find all the canonical rules 1.2062 + uprv_uca_addTailCanonicalClosures(t, colEl, baseChar, firstCM, &el, status); 1.2063 + } 1.2064 + tok = tok->next; 1.2065 + } 1.2066 + } 1.2067 + ucol_closeElements(colEl); 1.2068 + ucol_close(tempColl); 1.2069 + 1.2070 + return context.noOfClosures; 1.2071 +} 1.2072 + 1.2073 +#endif /* #if !UCONFIG_NO_COLLATION */