intl/icu/source/i18n/ucol_elm.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/ucol_elm.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,2070 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2001-2012, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  ucaelems.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created 02/22/2001
    1.17 +*   created by: Vladimir Weinstein
    1.18 +*
    1.19 +*   This program reads the Franctional UCA table and generates
    1.20 +*   internal format for UCA table as well as inverse UCA table.
    1.21 +*   It then writes binary files containing the data: ucadata.dat 
    1.22 +*   & invuca.dat
    1.23 +* 
    1.24 +*   date        name       comments
    1.25 +*   03/02/2001  synwee     added setMaxExpansion
    1.26 +*   03/07/2001  synwee     merged UCA's maxexpansion and tailoring's
    1.27 +*/
    1.28 +
    1.29 +#include "unicode/utypes.h"
    1.30 +
    1.31 +#if !UCONFIG_NO_COLLATION
    1.32 +
    1.33 +#include "unicode/uchar.h"
    1.34 +#include "unicode/unistr.h"
    1.35 +#include "unicode/ucoleitr.h"
    1.36 +#include "unicode/normlzr.h"
    1.37 +#include "unicode/utf16.h"
    1.38 +#include "normalizer2impl.h"
    1.39 +#include "ucol_elm.h"
    1.40 +#include "ucol_tok.h"
    1.41 +#include "ucol_cnt.h"
    1.42 +#include "unicode/caniter.h"
    1.43 +#include "cmemory.h"
    1.44 +#include "uassert.h"
    1.45 +
    1.46 +U_NAMESPACE_USE
    1.47 +
    1.48 +static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status);
    1.49 +
    1.50 +U_CDECL_BEGIN
    1.51 +static int32_t U_CALLCONV
    1.52 +prefixLookupHash(const UHashTok e) {
    1.53 +    UCAElements *element = (UCAElements *)e.pointer;
    1.54 +    UChar buf[256];
    1.55 +    UHashTok key;
    1.56 +    key.pointer = buf;
    1.57 +    uprv_memcpy(buf, element->cPoints, element->cSize*sizeof(UChar));
    1.58 +    buf[element->cSize] = 0;
    1.59 +    //key.pointer = element->cPoints;
    1.60 +    //element->cPoints[element->cSize] = 0;
    1.61 +    return uhash_hashUChars(key);
    1.62 +}
    1.63 +
    1.64 +static int8_t U_CALLCONV
    1.65 +prefixLookupComp(const UHashTok e1, const UHashTok e2) {
    1.66 +    UCAElements *element1 = (UCAElements *)e1.pointer;
    1.67 +    UCAElements *element2 = (UCAElements *)e2.pointer;
    1.68 +
    1.69 +    UChar buf1[256];
    1.70 +    UHashTok key1;
    1.71 +    key1.pointer = buf1;
    1.72 +    uprv_memcpy(buf1, element1->cPoints, element1->cSize*sizeof(UChar));
    1.73 +    buf1[element1->cSize] = 0;
    1.74 +
    1.75 +    UChar buf2[256];
    1.76 +    UHashTok key2;
    1.77 +    key2.pointer = buf2;
    1.78 +    uprv_memcpy(buf2, element2->cPoints, element2->cSize*sizeof(UChar));
    1.79 +    buf2[element2->cSize] = 0;
    1.80 +
    1.81 +    return uhash_compareUChars(key1, key2);
    1.82 +}
    1.83 +U_CDECL_END
    1.84 +
    1.85 +static int32_t uprv_uca_addExpansion(ExpansionTable *expansions, uint32_t value, UErrorCode *status) {
    1.86 +    if(U_FAILURE(*status)) {
    1.87 +        return 0;
    1.88 +    }
    1.89 +    if(expansions->CEs == NULL) {
    1.90 +        expansions->CEs = (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE*sizeof(uint32_t));
    1.91 +        /* test for NULL */
    1.92 +        if (expansions->CEs == NULL) {
    1.93 +            *status = U_MEMORY_ALLOCATION_ERROR;
    1.94 +            return 0;
    1.95 +        }
    1.96 +        expansions->size = INIT_EXP_TABLE_SIZE;
    1.97 +        expansions->position = 0;
    1.98 +    }
    1.99 +
   1.100 +    if(expansions->position == expansions->size) {
   1.101 +        uint32_t *newData = (uint32_t *)uprv_realloc(expansions->CEs, 2*expansions->size*sizeof(uint32_t));
   1.102 +        if(newData == NULL) {
   1.103 +#ifdef UCOL_DEBUG
   1.104 +            fprintf(stderr, "out of memory for expansions\n");
   1.105 +#endif
   1.106 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.107 +            return -1;
   1.108 +        }
   1.109 +        expansions->CEs = newData;
   1.110 +        expansions->size *= 2;
   1.111 +    }
   1.112 +
   1.113 +    expansions->CEs[expansions->position] = value;
   1.114 +    return(expansions->position++);
   1.115 +}
   1.116 +
   1.117 +U_CAPI tempUCATable*  U_EXPORT2
   1.118 +uprv_uca_initTempTable(UCATableHeader *image, UColOptionSet *opts, const UCollator *UCA, UColCETags initTag, UColCETags supplementaryInitTag, UErrorCode *status) {
   1.119 +    MaxJamoExpansionTable *maxjet;
   1.120 +    MaxExpansionTable *maxet;
   1.121 +    tempUCATable *t = (tempUCATable *)uprv_malloc(sizeof(tempUCATable));
   1.122 +    /* test for NULL */
   1.123 +    if (t == NULL) {
   1.124 +        *status = U_MEMORY_ALLOCATION_ERROR;
   1.125 +        return NULL;
   1.126 +    }
   1.127 +    uprv_memset(t, 0, sizeof(tempUCATable));
   1.128 +
   1.129 +    maxet  = (MaxExpansionTable *)uprv_malloc(sizeof(MaxExpansionTable));
   1.130 +    if (maxet == NULL) {
   1.131 +        goto allocation_failure;
   1.132 +    }
   1.133 +    uprv_memset(maxet, 0, sizeof(MaxExpansionTable));
   1.134 +    t->maxExpansions       = maxet;
   1.135 +
   1.136 +    maxjet = (MaxJamoExpansionTable *)uprv_malloc(sizeof(MaxJamoExpansionTable));
   1.137 +    if (maxjet == NULL) {
   1.138 +        goto allocation_failure;
   1.139 +    }
   1.140 +    uprv_memset(maxjet, 0, sizeof(MaxJamoExpansionTable));
   1.141 +    t->maxJamoExpansions = maxjet;
   1.142 +
   1.143 +    t->image = image;
   1.144 +    t->options = opts;
   1.145 +
   1.146 +    t->UCA = UCA;
   1.147 +    t->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable));
   1.148 +    /* test for NULL */
   1.149 +    if (t->expansions == NULL) {
   1.150 +        goto allocation_failure;
   1.151 +    }
   1.152 +    uprv_memset(t->expansions, 0, sizeof(ExpansionTable));
   1.153 +
   1.154 +    t->mapping = utrie_open(NULL, NULL, UCOL_ELM_TRIE_CAPACITY,
   1.155 +        UCOL_SPECIAL_FLAG | (initTag<<24),
   1.156 +        UCOL_SPECIAL_FLAG | (supplementaryInitTag << 24),
   1.157 +        TRUE); // Do your own mallocs for the structure, array and have linear Latin 1
   1.158 +    if (U_FAILURE(*status)) {
   1.159 +        goto allocation_failure;
   1.160 +    }
   1.161 +    t->prefixLookup = uhash_open(prefixLookupHash, prefixLookupComp, NULL, status);
   1.162 +    if (U_FAILURE(*status)) {
   1.163 +        goto allocation_failure;
   1.164 +    }
   1.165 +    uhash_setValueDeleter(t->prefixLookup, uprv_free);
   1.166 +
   1.167 +    t->contractions = uprv_cnttab_open(t->mapping, status);
   1.168 +    if (U_FAILURE(*status)) {
   1.169 +        goto cleanup;
   1.170 +    }
   1.171 +
   1.172 +    /* copy UCA's maxexpansion and merge as we go along */
   1.173 +    if (UCA != NULL) {
   1.174 +        /* adding an extra initial value for easier manipulation */
   1.175 +        maxet->size            = (int32_t)(UCA->lastEndExpansionCE - UCA->endExpansionCE) + 2;
   1.176 +        maxet->position        = maxet->size - 1;
   1.177 +        maxet->endExpansionCE  = 
   1.178 +            (uint32_t *)uprv_malloc(sizeof(uint32_t) * maxet->size);
   1.179 +        /* test for NULL */
   1.180 +        if (maxet->endExpansionCE == NULL) {
   1.181 +            goto allocation_failure;
   1.182 +        }
   1.183 +        maxet->expansionCESize =
   1.184 +            (uint8_t *)uprv_malloc(sizeof(uint8_t) * maxet->size);
   1.185 +        /* test for NULL */
   1.186 +        if (maxet->expansionCESize == NULL) {
   1.187 +            goto allocation_failure;
   1.188 +        }
   1.189 +        /* initialized value */
   1.190 +        *(maxet->endExpansionCE)  = 0;
   1.191 +        *(maxet->expansionCESize) = 0;
   1.192 +        uprv_memcpy(maxet->endExpansionCE + 1, UCA->endExpansionCE, 
   1.193 +            sizeof(uint32_t) * (maxet->size - 1));
   1.194 +        uprv_memcpy(maxet->expansionCESize + 1, UCA->expansionCESize, 
   1.195 +            sizeof(uint8_t) * (maxet->size - 1));
   1.196 +    }
   1.197 +    else {
   1.198 +        maxet->size     = 0;
   1.199 +    }
   1.200 +    maxjet->endExpansionCE = NULL;
   1.201 +    maxjet->isV = NULL;
   1.202 +    maxjet->size = 0;
   1.203 +    maxjet->position = 0;
   1.204 +    maxjet->maxLSize = 1;
   1.205 +    maxjet->maxVSize = 1;
   1.206 +    maxjet->maxTSize = 1;
   1.207 +
   1.208 +    t->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE);
   1.209 +    /* test for NULL */
   1.210 +    if (t->unsafeCP == NULL) {
   1.211 +        goto allocation_failure;
   1.212 +    }
   1.213 +    t->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE);
   1.214 +    /* test for NULL */
   1.215 +    if (t->contrEndCP == NULL) {
   1.216 +        goto allocation_failure;
   1.217 +    }
   1.218 +    uprv_memset(t->unsafeCP, 0, UCOL_UNSAFECP_TABLE_SIZE);
   1.219 +    uprv_memset(t->contrEndCP, 0, UCOL_UNSAFECP_TABLE_SIZE);
   1.220 +    t->cmLookup = NULL;
   1.221 +    return t;
   1.222 +
   1.223 +allocation_failure:
   1.224 +    *status = U_MEMORY_ALLOCATION_ERROR;
   1.225 +cleanup:
   1.226 +    uprv_uca_closeTempTable(t);
   1.227 +    return NULL;
   1.228 +}
   1.229 +
   1.230 +static tempUCATable* U_EXPORT2
   1.231 +uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) {
   1.232 +    if(U_FAILURE(*status)) {
   1.233 +        return NULL;
   1.234 +    }
   1.235 +
   1.236 +    tempUCATable *r = (tempUCATable *)uprv_malloc(sizeof(tempUCATable));
   1.237 +    /* test for NULL */
   1.238 +    if (r == NULL) {
   1.239 +        *status = U_MEMORY_ALLOCATION_ERROR;
   1.240 +        return NULL;
   1.241 +    }
   1.242 +    uprv_memset(r, 0, sizeof(tempUCATable));
   1.243 +
   1.244 +    /* mapping */
   1.245 +    if(t->mapping != NULL) {
   1.246 +        /*r->mapping = ucmpe32_clone(t->mapping, status);*/
   1.247 +        r->mapping = utrie_clone(NULL, t->mapping, NULL, 0);
   1.248 +    }
   1.249 +
   1.250 +    // a hashing clone function would be very nice. We have none currently...
   1.251 +    // However, we should be good, as closing should not produce any prefixed elements.
   1.252 +    r->prefixLookup = NULL; // prefixes are not used in closing
   1.253 +
   1.254 +    /* expansions */
   1.255 +    if(t->expansions != NULL) {
   1.256 +        r->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable));
   1.257 +        /* test for NULL */
   1.258 +        if (r->expansions == NULL) {
   1.259 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.260 +            goto cleanup;
   1.261 +        }
   1.262 +        r->expansions->position = t->expansions->position;
   1.263 +        r->expansions->size = t->expansions->size;
   1.264 +        if(t->expansions->CEs != NULL) {
   1.265 +            r->expansions->CEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->expansions->size);
   1.266 +            /* test for NULL */
   1.267 +            if (r->expansions->CEs == NULL) {
   1.268 +                *status = U_MEMORY_ALLOCATION_ERROR;
   1.269 +                goto cleanup;
   1.270 +            }
   1.271 +            uprv_memcpy(r->expansions->CEs, t->expansions->CEs, sizeof(uint32_t)*t->expansions->position);
   1.272 +        } else {
   1.273 +            r->expansions->CEs = NULL;
   1.274 +        }
   1.275 +    }
   1.276 +
   1.277 +    if(t->contractions != NULL) {
   1.278 +        r->contractions = uprv_cnttab_clone(t->contractions, status);
   1.279 +        // Check for cloning failure.
   1.280 +        if (r->contractions == NULL) {
   1.281 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.282 +            goto cleanup;
   1.283 +        }
   1.284 +        r->contractions->mapping = r->mapping;
   1.285 +    }
   1.286 +
   1.287 +    if(t->maxExpansions != NULL) {
   1.288 +        r->maxExpansions = (MaxExpansionTable *)uprv_malloc(sizeof(MaxExpansionTable));
   1.289 +        /* test for NULL */
   1.290 +        if (r->maxExpansions == NULL) {
   1.291 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.292 +            goto cleanup;
   1.293 +        }
   1.294 +        r->maxExpansions->size = t->maxExpansions->size;
   1.295 +        r->maxExpansions->position = t->maxExpansions->position;
   1.296 +        if(t->maxExpansions->endExpansionCE != NULL) {
   1.297 +            r->maxExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxExpansions->size);
   1.298 +            /* test for NULL */
   1.299 +            if (r->maxExpansions->endExpansionCE == NULL) {
   1.300 +                *status = U_MEMORY_ALLOCATION_ERROR;
   1.301 +                goto cleanup;
   1.302 +            }
   1.303 +            uprv_memset(r->maxExpansions->endExpansionCE, 0xDB, sizeof(uint32_t)*t->maxExpansions->size);
   1.304 +            uprv_memcpy(r->maxExpansions->endExpansionCE, t->maxExpansions->endExpansionCE, t->maxExpansions->position*sizeof(uint32_t));
   1.305 +        } else {
   1.306 +            r->maxExpansions->endExpansionCE = NULL;
   1.307 +        }
   1.308 +        if(t->maxExpansions->expansionCESize != NULL) {
   1.309 +            r->maxExpansions->expansionCESize = (uint8_t *)uprv_malloc(sizeof(uint8_t)*t->maxExpansions->size);
   1.310 +            /* test for NULL */
   1.311 +            if (r->maxExpansions->expansionCESize == NULL) {
   1.312 +                *status = U_MEMORY_ALLOCATION_ERROR;
   1.313 +                goto cleanup;
   1.314 +            }
   1.315 +            uprv_memset(r->maxExpansions->expansionCESize, 0xDB, sizeof(uint8_t)*t->maxExpansions->size);
   1.316 +            uprv_memcpy(r->maxExpansions->expansionCESize, t->maxExpansions->expansionCESize, t->maxExpansions->position*sizeof(uint8_t));
   1.317 +        } else {
   1.318 +            r->maxExpansions->expansionCESize = NULL;
   1.319 +        }
   1.320 +    }
   1.321 +
   1.322 +    if(t->maxJamoExpansions != NULL) {
   1.323 +        r->maxJamoExpansions = (MaxJamoExpansionTable *)uprv_malloc(sizeof(MaxJamoExpansionTable));
   1.324 +        /* test for NULL */
   1.325 +        if (r->maxJamoExpansions == NULL) {
   1.326 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.327 +            goto cleanup;
   1.328 +        }
   1.329 +        r->maxJamoExpansions->size = t->maxJamoExpansions->size;
   1.330 +        r->maxJamoExpansions->position = t->maxJamoExpansions->position;
   1.331 +        r->maxJamoExpansions->maxLSize = t->maxJamoExpansions->maxLSize;
   1.332 +        r->maxJamoExpansions->maxVSize = t->maxJamoExpansions->maxVSize;
   1.333 +        r->maxJamoExpansions->maxTSize = t->maxJamoExpansions->maxTSize;
   1.334 +        if(t->maxJamoExpansions->size != 0) {
   1.335 +            r->maxJamoExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxJamoExpansions->size);
   1.336 +            /* test for NULL */
   1.337 +            if (r->maxJamoExpansions->endExpansionCE == NULL) {
   1.338 +                *status = U_MEMORY_ALLOCATION_ERROR;
   1.339 +                goto cleanup;
   1.340 +            }
   1.341 +            uprv_memcpy(r->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->position*sizeof(uint32_t));
   1.342 +            r->maxJamoExpansions->isV = (UBool *)uprv_malloc(sizeof(UBool)*t->maxJamoExpansions->size);
   1.343 +            /* test for NULL */
   1.344 +            if (r->maxJamoExpansions->isV == NULL) {
   1.345 +                *status = U_MEMORY_ALLOCATION_ERROR;
   1.346 +                goto cleanup;
   1.347 +            }
   1.348 +            uprv_memcpy(r->maxJamoExpansions->isV, t->maxJamoExpansions->isV, t->maxJamoExpansions->position*sizeof(UBool));
   1.349 +        } else {
   1.350 +            r->maxJamoExpansions->endExpansionCE = NULL;
   1.351 +            r->maxJamoExpansions->isV = NULL;
   1.352 +        }
   1.353 +    }
   1.354 +
   1.355 +    if(t->unsafeCP != NULL) {
   1.356 +        r->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE);
   1.357 +        /* test for NULL */
   1.358 +        if (r->unsafeCP == NULL) {
   1.359 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.360 +            goto cleanup;
   1.361 +        }
   1.362 +        uprv_memcpy(r->unsafeCP, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE);
   1.363 +    }
   1.364 +
   1.365 +    if(t->contrEndCP != NULL) {
   1.366 +        r->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE);
   1.367 +        /* test for NULL */
   1.368 +        if (r->contrEndCP == NULL) {
   1.369 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.370 +            goto cleanup;
   1.371 +        }
   1.372 +        uprv_memcpy(r->contrEndCP, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE);
   1.373 +    }
   1.374 +
   1.375 +    r->UCA = t->UCA;
   1.376 +    r->image = t->image;
   1.377 +    r->options = t->options;
   1.378 +
   1.379 +    return r;
   1.380 +cleanup:
   1.381 +    uprv_uca_closeTempTable(t);
   1.382 +    return NULL;
   1.383 +}
   1.384 +
   1.385 +
   1.386 +U_CAPI void  U_EXPORT2
   1.387 +uprv_uca_closeTempTable(tempUCATable *t) {
   1.388 +    if(t != NULL) {
   1.389 +        if (t->expansions != NULL) {
   1.390 +            uprv_free(t->expansions->CEs);
   1.391 +            uprv_free(t->expansions);
   1.392 +        }
   1.393 +        if(t->contractions != NULL) {
   1.394 +            uprv_cnttab_close(t->contractions);
   1.395 +        }
   1.396 +        if (t->mapping != NULL) {
   1.397 +            utrie_close(t->mapping);
   1.398 +        }
   1.399 +
   1.400 +        if(t->prefixLookup != NULL) {
   1.401 +            uhash_close(t->prefixLookup);
   1.402 +        }
   1.403 +
   1.404 +        if (t->maxExpansions != NULL) {
   1.405 +            uprv_free(t->maxExpansions->endExpansionCE);
   1.406 +            uprv_free(t->maxExpansions->expansionCESize);
   1.407 +            uprv_free(t->maxExpansions);
   1.408 +        }
   1.409 +
   1.410 +        if (t->maxJamoExpansions->size > 0) {
   1.411 +            uprv_free(t->maxJamoExpansions->endExpansionCE);
   1.412 +            uprv_free(t->maxJamoExpansions->isV);
   1.413 +        }
   1.414 +        uprv_free(t->maxJamoExpansions);
   1.415 +
   1.416 +        uprv_free(t->unsafeCP);
   1.417 +        uprv_free(t->contrEndCP);
   1.418 +        
   1.419 +        if (t->cmLookup != NULL) {
   1.420 +            uprv_free(t->cmLookup->cPoints);
   1.421 +            uprv_free(t->cmLookup);
   1.422 +        }
   1.423 +
   1.424 +        uprv_free(t);
   1.425 +    }
   1.426 +}
   1.427 +
   1.428 +/**
   1.429 +* Looks for the maximum length of all expansion sequences ending with the same
   1.430 +* collation element. The size required for maxexpansion and maxsize is 
   1.431 +* returned if the arrays are too small.
   1.432 +* @param endexpansion the last expansion collation element to be added
   1.433 +* @param expansionsize size of the expansion
   1.434 +* @param maxexpansion data structure to store the maximum expansion data.
   1.435 +* @param status error status
   1.436 +* @returns size of the maxexpansion and maxsize used.
   1.437 +*/
   1.438 +static int uprv_uca_setMaxExpansion(uint32_t           endexpansion,
   1.439 +                                    uint8_t            expansionsize,
   1.440 +                                    MaxExpansionTable *maxexpansion,
   1.441 +                                    UErrorCode        *status)
   1.442 +{
   1.443 +    if (maxexpansion->size == 0) {
   1.444 +        /* we'll always make the first element 0, for easier manipulation */
   1.445 +        maxexpansion->endExpansionCE = 
   1.446 +            (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(int32_t));
   1.447 +        /* test for NULL */
   1.448 +        if (maxexpansion->endExpansionCE == NULL) {
   1.449 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.450 +            return 0;
   1.451 +        }
   1.452 +        *(maxexpansion->endExpansionCE) = 0;
   1.453 +        maxexpansion->expansionCESize =
   1.454 +            (uint8_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(uint8_t));
   1.455 +        /* test for NULL */;
   1.456 +        if (maxexpansion->expansionCESize == NULL) {
   1.457 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.458 +            return 0;
   1.459 +        }
   1.460 +        *(maxexpansion->expansionCESize) = 0;
   1.461 +        maxexpansion->size     = INIT_EXP_TABLE_SIZE;
   1.462 +        maxexpansion->position = 0;
   1.463 +    }
   1.464 +
   1.465 +    if (maxexpansion->position + 1 == maxexpansion->size) {
   1.466 +        uint32_t *neweece = (uint32_t *)uprv_realloc(maxexpansion->endExpansionCE, 
   1.467 +            2 * maxexpansion->size * sizeof(uint32_t));
   1.468 +        if (neweece == NULL) {
   1.469 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.470 +            return 0;
   1.471 +        }
   1.472 +        maxexpansion->endExpansionCE  = neweece;
   1.473 +
   1.474 +        uint8_t  *neweces = (uint8_t *)uprv_realloc(maxexpansion->expansionCESize, 
   1.475 +            2 * maxexpansion->size * sizeof(uint8_t));
   1.476 +        if (neweces == NULL) {
   1.477 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.478 +            return 0;
   1.479 +        }
   1.480 +        maxexpansion->expansionCESize = neweces;
   1.481 +        maxexpansion->size *= 2;
   1.482 +    }
   1.483 +
   1.484 +    uint32_t *pendexpansionce = maxexpansion->endExpansionCE;
   1.485 +    uint8_t  *pexpansionsize  = maxexpansion->expansionCESize;
   1.486 +    int      pos              = maxexpansion->position;
   1.487 +
   1.488 +    uint32_t *start = pendexpansionce;
   1.489 +    uint32_t *limit = pendexpansionce + pos;
   1.490 +
   1.491 +    /* using binary search to determine if last expansion element is
   1.492 +    already in the array */
   1.493 +    uint32_t *mid;
   1.494 +    int       result = -1;
   1.495 +    while (start < limit - 1) {
   1.496 +        mid = start + ((limit - start) >> 1);
   1.497 +        if (endexpansion <= *mid) {
   1.498 +            limit = mid;
   1.499 +        }
   1.500 +        else {
   1.501 +            start = mid;
   1.502 +        }
   1.503 +    }
   1.504 +
   1.505 +    if (*start == endexpansion) {
   1.506 +        result = (int)(start - pendexpansionce);
   1.507 +    }
   1.508 +    else if (*limit == endexpansion) {
   1.509 +        result = (int)(limit - pendexpansionce);
   1.510 +    }
   1.511 +
   1.512 +    if (result > -1) {
   1.513 +        /* found the ce in expansion, we'll just modify the size if it is
   1.514 +        smaller */
   1.515 +        uint8_t *currentsize = pexpansionsize + result;
   1.516 +        if (*currentsize < expansionsize) {
   1.517 +            *currentsize = expansionsize;
   1.518 +        }
   1.519 +    }
   1.520 +    else {
   1.521 +        /* we'll need to squeeze the value into the array.
   1.522 +        initial implementation. */
   1.523 +        /* shifting the subarray down by 1 */
   1.524 +        int      shiftsize     = (int)((pendexpansionce + pos) - start);
   1.525 +        uint32_t *shiftpos     = start + 1;
   1.526 +        uint8_t  *sizeshiftpos = pexpansionsize + (shiftpos - pendexpansionce);
   1.527 +
   1.528 +        /* okay need to rearrange the array into sorted order */
   1.529 +        if (shiftsize == 0 /*|| *(pendexpansionce + pos) < endexpansion*/) { /* the commented part is actually both redundant and dangerous */
   1.530 +            *(pendexpansionce + pos + 1) = endexpansion;
   1.531 +            *(pexpansionsize + pos + 1)  = expansionsize;
   1.532 +        }
   1.533 +        else {
   1.534 +            uprv_memmove(shiftpos + 1, shiftpos, shiftsize * sizeof(int32_t));
   1.535 +            uprv_memmove(sizeshiftpos + 1, sizeshiftpos, 
   1.536 +                shiftsize * sizeof(uint8_t));
   1.537 +            *shiftpos     = endexpansion;
   1.538 +            *sizeshiftpos = expansionsize;
   1.539 +        }
   1.540 +        maxexpansion->position ++;
   1.541 +
   1.542 +#ifdef UCOL_DEBUG
   1.543 +        int   temp;
   1.544 +        UBool found = FALSE;
   1.545 +        for (temp = 0; temp < maxexpansion->position; temp ++) {
   1.546 +            if (pendexpansionce[temp] >= pendexpansionce[temp + 1]) {
   1.547 +                fprintf(stderr, "expansions %d\n", temp);
   1.548 +            }
   1.549 +            if (pendexpansionce[temp] == endexpansion) {
   1.550 +                found =TRUE;
   1.551 +                if (pexpansionsize[temp] < expansionsize) {
   1.552 +                    fprintf(stderr, "expansions size %d\n", temp);
   1.553 +                }
   1.554 +            }
   1.555 +        }
   1.556 +        if (pendexpansionce[temp] == endexpansion) {
   1.557 +            found =TRUE;
   1.558 +            if (pexpansionsize[temp] < expansionsize) {
   1.559 +                fprintf(stderr, "expansions size %d\n", temp);
   1.560 +            }
   1.561 +        }
   1.562 +        if (!found)
   1.563 +            fprintf(stderr, "expansion not found %d\n", temp);
   1.564 +#endif
   1.565 +    }
   1.566 +
   1.567 +    return maxexpansion->position;
   1.568 +}
   1.569 +
   1.570 +/**
   1.571 +* Sets the maximum length of all jamo expansion sequences ending with the same
   1.572 +* collation element. The size required for maxexpansion and maxsize is 
   1.573 +* returned if the arrays are too small.
   1.574 +* @param ch the jamo codepoint
   1.575 +* @param endexpansion the last expansion collation element to be added
   1.576 +* @param expansionsize size of the expansion
   1.577 +* @param maxexpansion data structure to store the maximum expansion data.
   1.578 +* @param status error status
   1.579 +* @returns size of the maxexpansion and maxsize used.
   1.580 +*/
   1.581 +static int uprv_uca_setMaxJamoExpansion(UChar                  ch,
   1.582 +                                        uint32_t               endexpansion,
   1.583 +                                        uint8_t                expansionsize,
   1.584 +                                        MaxJamoExpansionTable *maxexpansion,
   1.585 +                                        UErrorCode            *status)
   1.586 +{
   1.587 +    UBool isV = TRUE;
   1.588 +    if (((uint32_t)ch - 0x1100) <= (0x1112 - 0x1100)) {
   1.589 +        /* determines L for Jamo, doesn't need to store this since it is never
   1.590 +        at the end of a expansion */
   1.591 +        if (maxexpansion->maxLSize < expansionsize) {
   1.592 +            maxexpansion->maxLSize = expansionsize;
   1.593 +        }
   1.594 +        return maxexpansion->position;
   1.595 +    }
   1.596 +
   1.597 +    if (((uint32_t)ch - 0x1161) <= (0x1175 - 0x1161)) {
   1.598 +        /* determines V for Jamo */
   1.599 +        if (maxexpansion->maxVSize < expansionsize) {
   1.600 +            maxexpansion->maxVSize = expansionsize;
   1.601 +        }
   1.602 +    }
   1.603 +
   1.604 +    if (((uint32_t)ch - 0x11A8) <= (0x11C2 - 0x11A8)) {
   1.605 +        isV = FALSE;
   1.606 +        /* determines T for Jamo */
   1.607 +        if (maxexpansion->maxTSize < expansionsize) {
   1.608 +            maxexpansion->maxTSize = expansionsize;
   1.609 +        }
   1.610 +    }
   1.611 +
   1.612 +    if (maxexpansion->size == 0) {
   1.613 +        /* we'll always make the first element 0, for easier manipulation */
   1.614 +        maxexpansion->endExpansionCE = 
   1.615 +            (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(uint32_t));
   1.616 +        /* test for NULL */;
   1.617 +        if (maxexpansion->endExpansionCE == NULL) {
   1.618 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.619 +            return 0;
   1.620 +        }
   1.621 +        *(maxexpansion->endExpansionCE) = 0;
   1.622 +        maxexpansion->isV = 
   1.623 +            (UBool *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(UBool));
   1.624 +        /* test for NULL */;
   1.625 +        if (maxexpansion->isV == NULL) {
   1.626 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.627 +            uprv_free(maxexpansion->endExpansionCE);
   1.628 +            maxexpansion->endExpansionCE = NULL;
   1.629 +            return 0;
   1.630 +        }
   1.631 +        *(maxexpansion->isV) = 0;
   1.632 +        maxexpansion->size     = INIT_EXP_TABLE_SIZE;
   1.633 +        maxexpansion->position = 0;
   1.634 +    }
   1.635 +
   1.636 +    if (maxexpansion->position + 1 == maxexpansion->size) {
   1.637 +        maxexpansion->size *= 2;
   1.638 +        maxexpansion->endExpansionCE = (uint32_t *)uprv_realloc(maxexpansion->endExpansionCE, 
   1.639 +            maxexpansion->size * sizeof(uint32_t));
   1.640 +        if (maxexpansion->endExpansionCE == NULL) {
   1.641 +#ifdef UCOL_DEBUG
   1.642 +            fprintf(stderr, "out of memory for maxExpansions\n");
   1.643 +#endif
   1.644 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.645 +            return 0;
   1.646 +        }
   1.647 +        maxexpansion->isV  = (UBool *)uprv_realloc(maxexpansion->isV, 
   1.648 +            maxexpansion->size * sizeof(UBool));
   1.649 +        if (maxexpansion->isV == NULL) {
   1.650 +#ifdef UCOL_DEBUG
   1.651 +            fprintf(stderr, "out of memory for maxExpansions\n");
   1.652 +#endif
   1.653 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.654 +            uprv_free(maxexpansion->endExpansionCE);
   1.655 +            maxexpansion->endExpansionCE = NULL;
   1.656 +            return 0;
   1.657 +        }
   1.658 +    }
   1.659 +
   1.660 +    uint32_t *pendexpansionce = maxexpansion->endExpansionCE;
   1.661 +    int       pos             = maxexpansion->position;
   1.662 +
   1.663 +    while (pos > 0) {
   1.664 +        pos --;
   1.665 +        if (*(pendexpansionce + pos) == endexpansion) {
   1.666 +            return maxexpansion->position;
   1.667 +        }
   1.668 +    }
   1.669 +
   1.670 +    *(pendexpansionce + maxexpansion->position) = endexpansion;
   1.671 +    *(maxexpansion->isV + maxexpansion->position) = isV;
   1.672 +    maxexpansion->position ++;
   1.673 +
   1.674 +    return maxexpansion->position;
   1.675 +}
   1.676 +
   1.677 +
   1.678 +static void ContrEndCPSet(uint8_t *table, UChar c) {
   1.679 +    uint32_t    hash;
   1.680 +    uint8_t     *htByte;
   1.681 +
   1.682 +    hash = c;
   1.683 +    if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
   1.684 +        hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
   1.685 +    }
   1.686 +    htByte = &table[hash>>3];
   1.687 +    *htByte |= (1 << (hash & 7));
   1.688 +}
   1.689 +
   1.690 +
   1.691 +static void unsafeCPSet(uint8_t *table, UChar c) {
   1.692 +    uint32_t    hash;
   1.693 +    uint8_t     *htByte;
   1.694 +
   1.695 +    hash = c;
   1.696 +    if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
   1.697 +        if (hash >= 0xd800 && hash <= 0xf8ff) {
   1.698 +            /*  Part of a surrogate, or in private use area.            */
   1.699 +            /*   These don't go in the table                            */
   1.700 +            return;
   1.701 +        }
   1.702 +        hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
   1.703 +    }
   1.704 +    htByte = &table[hash>>3];
   1.705 +    *htByte |= (1 << (hash & 7));
   1.706 +}
   1.707 +
   1.708 +static void
   1.709 +uprv_uca_createCMTable(tempUCATable *t, int32_t noOfCM, UErrorCode *status) {
   1.710 +    t->cmLookup = (CombinClassTable *)uprv_malloc(sizeof(CombinClassTable));
   1.711 +    if (t->cmLookup==NULL) {
   1.712 +        *status = U_MEMORY_ALLOCATION_ERROR;
   1.713 +        return;
   1.714 +    }
   1.715 +    t->cmLookup->cPoints=(UChar *)uprv_malloc(noOfCM*sizeof(UChar));
   1.716 +    if (t->cmLookup->cPoints ==NULL) {
   1.717 +        uprv_free(t->cmLookup);
   1.718 +        t->cmLookup = NULL;
   1.719 +        *status = U_MEMORY_ALLOCATION_ERROR;
   1.720 +        return;
   1.721 +    }
   1.722 +
   1.723 +    t->cmLookup->size=noOfCM;
   1.724 +    uprv_memset(t->cmLookup->index, 0, sizeof(t->cmLookup->index));
   1.725 +
   1.726 +    return;
   1.727 +}
   1.728 +
   1.729 +static void
   1.730 +uprv_uca_copyCMTable(tempUCATable *t, UChar *cm, uint16_t *index) {
   1.731 +    int32_t count=0;
   1.732 +
   1.733 +    for (int32_t i=0; i<256; ++i) {
   1.734 +        if (index[i]>0) {
   1.735 +            // cPoints is ordered by combining class value.
   1.736 +            uprv_memcpy(t->cmLookup->cPoints+count, cm+(i<<8), index[i]*sizeof(UChar));
   1.737 +            count += index[i];
   1.738 +        }
   1.739 +        t->cmLookup->index[i]=count;
   1.740 +    }
   1.741 +    return;
   1.742 +}
   1.743 +
   1.744 +/* 1. to the UnsafeCP hash table, add all chars with combining class != 0     */
   1.745 +/* 2. build combining marks table for all chars with combining class != 0     */
   1.746 +static void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t, UErrorCode *status) {
   1.747 +
   1.748 +    UChar              c;
   1.749 +    uint16_t           fcd;     // Hi byte is lead combining class. lo byte is trailing combing class.
   1.750 +    UBool buildCMTable = (t->cmLookup==NULL); // flag for building combining class table
   1.751 +    UChar *cm=NULL;
   1.752 +    uint16_t index[256];
   1.753 +    int32_t count=0;
   1.754 +    const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
   1.755 +    if (U_FAILURE(*status)) {
   1.756 +        return;
   1.757 +    }
   1.758 +
   1.759 +    if (buildCMTable) {
   1.760 +        if (cm==NULL) {
   1.761 +            cm = (UChar *)uprv_malloc(sizeof(UChar)*UCOL_MAX_CM_TAB);
   1.762 +            if (cm==NULL) {
   1.763 +                *status = U_MEMORY_ALLOCATION_ERROR;
   1.764 +                return;
   1.765 +            }
   1.766 +        }
   1.767 +        uprv_memset(index, 0, sizeof(index));
   1.768 +    }
   1.769 +    for (c=0; c<0xffff; c++) {
   1.770 +        if (U16_IS_LEAD(c)) {
   1.771 +            fcd = 0;
   1.772 +            if (nfcImpl->singleLeadMightHaveNonZeroFCD16(c)) {
   1.773 +                UChar32 supp = U16_GET_SUPPLEMENTARY(c, 0xdc00);
   1.774 +                UChar32 suppLimit = supp + 0x400;
   1.775 +                while (supp < suppLimit) {
   1.776 +                    fcd |= nfcImpl->getFCD16FromNormData(supp++);
   1.777 +                }
   1.778 +            }
   1.779 +        } else {
   1.780 +            fcd = nfcImpl->getFCD16(c);
   1.781 +        }
   1.782 +        if (fcd >= 0x100 ||               // if the leading combining class(c) > 0 ||
   1.783 +            (U16_IS_LEAD(c) && fcd != 0)) {//    c is a leading surrogate with some FCD data
   1.784 +            if (buildCMTable) {
   1.785 +                uint32_t cClass = fcd & 0xff;
   1.786 +                //uint32_t temp=(cClass<<8)+index[cClass];
   1.787 +                cm[(cClass<<8)+index[cClass]] = c; //
   1.788 +                index[cClass]++;
   1.789 +                count++;
   1.790 +            }
   1.791 +            unsafeCPSet(t->unsafeCP, c);
   1.792 +        }
   1.793 +    }
   1.794 +
   1.795 +    // copy to cm table
   1.796 +    if (buildCMTable) {
   1.797 +        uprv_uca_createCMTable(t, count, status);
   1.798 +        if(U_FAILURE(*status)) {
   1.799 +            if (cm!=NULL) {
   1.800 +                uprv_free(cm);
   1.801 +            }
   1.802 +            return;
   1.803 +        }
   1.804 +        uprv_uca_copyCMTable(t, cm, index);
   1.805 +    }
   1.806 +
   1.807 +    if(t->prefixLookup != NULL) {
   1.808 +        int32_t i = -1;
   1.809 +        const UHashElement *e = NULL;
   1.810 +        UCAElements *element = NULL;
   1.811 +        UChar NFCbuf[256];
   1.812 +        while((e = uhash_nextElement(t->prefixLookup, &i)) != NULL) {
   1.813 +            element = (UCAElements *)e->value.pointer;
   1.814 +            // codepoints here are in the NFD form. We need to add the
   1.815 +            // first code point of the NFC form to unsafe, because
   1.816 +            // strcoll needs to backup over them.
   1.817 +            unorm_normalize(element->cPoints, element->cSize, UNORM_NFC, 0,
   1.818 +                NFCbuf, 256, status);
   1.819 +            unsafeCPSet(t->unsafeCP, NFCbuf[0]);
   1.820 +        }
   1.821 +    }
   1.822 +
   1.823 +    if (cm!=NULL) {
   1.824 +        uprv_free(cm);
   1.825 +    }
   1.826 +}
   1.827 +
   1.828 +static uint32_t uprv_uca_addPrefix(tempUCATable *t, uint32_t CE,
   1.829 +                                   UCAElements *element, UErrorCode *status)
   1.830 +{
   1.831 +    // currently the longest prefix we're supporting in Japanese is two characters
   1.832 +    // long. Although this table could quite easily mimic complete contraction stuff
   1.833 +    // there is no good reason to make a general solution, as it would require some
   1.834 +    // error prone messing.
   1.835 +    CntTable *contractions = t->contractions;
   1.836 +    UChar32 cp;
   1.837 +    uint32_t cpsize = 0;
   1.838 +    UChar *oldCP = element->cPoints;
   1.839 +    uint32_t oldCPSize = element->cSize;
   1.840 +
   1.841 +
   1.842 +    contractions->currentTag = SPEC_PROC_TAG;
   1.843 +
   1.844 +    // here, we will normalize & add prefix to the table.
   1.845 +    uint32_t j = 0;
   1.846 +#ifdef UCOL_DEBUG
   1.847 +    for(j=0; j<element->cSize; j++) {
   1.848 +        fprintf(stdout, "CP: %04X ", element->cPoints[j]);
   1.849 +    }
   1.850 +    fprintf(stdout, "El: %08X Pref: ", CE);
   1.851 +    for(j=0; j<element->prefixSize; j++) {
   1.852 +        fprintf(stdout, "%04X ", element->prefix[j]);
   1.853 +    }
   1.854 +    fprintf(stdout, "%08X ", element->mapCE);
   1.855 +#endif
   1.856 +
   1.857 +    for (j = 1; j<element->prefixSize; j++) {   /* First add NFD prefix chars to unsafe CP hash table */
   1.858 +        // Unless it is a trail surrogate, which is handled algoritmically and
   1.859 +        // shouldn't take up space in the table.
   1.860 +        if(!(U16_IS_TRAIL(element->prefix[j]))) {
   1.861 +            unsafeCPSet(t->unsafeCP, element->prefix[j]);
   1.862 +        }
   1.863 +    }
   1.864 +
   1.865 +    UChar tempPrefix = 0;
   1.866 +
   1.867 +    for(j = 0; j < /*nfcSize*/element->prefixSize/2; j++) { // prefixes are going to be looked up backwards
   1.868 +        // therefore, we will promptly reverse the prefix buffer...
   1.869 +        tempPrefix = *(/*nfcBuffer*/element->prefix+element->prefixSize-j-1);
   1.870 +        *(/*nfcBuffer*/element->prefix+element->prefixSize-j-1) = element->prefix[j];
   1.871 +        element->prefix[j] = tempPrefix;
   1.872 +    }
   1.873 +
   1.874 +#ifdef UCOL_DEBUG
   1.875 +    fprintf(stdout, "Reversed: ");
   1.876 +    for(j=0; j<element->prefixSize; j++) {
   1.877 +        fprintf(stdout, "%04X ", element->prefix[j]);
   1.878 +    }
   1.879 +    fprintf(stdout, "%08X\n", element->mapCE);
   1.880 +#endif
   1.881 +
   1.882 +    // the first codepoint is also unsafe, as it forms a 'contraction' with the prefix
   1.883 +    if(!(U16_IS_TRAIL(element->cPoints[0]))) {
   1.884 +        unsafeCPSet(t->unsafeCP, element->cPoints[0]);
   1.885 +    }
   1.886 +
   1.887 +    // Maybe we need this... To handle prefixes completely in the forward direction...
   1.888 +    //if(element->cSize == 1) {
   1.889 +    //  if(!(U16_IS_TRAIL(element->cPoints[0]))) {
   1.890 +    //    ContrEndCPSet(t->contrEndCP, element->cPoints[0]);
   1.891 +    //  }
   1.892 +    //}
   1.893 +
   1.894 +    element->cPoints = element->prefix;
   1.895 +    element->cSize = element->prefixSize;
   1.896 +
   1.897 +    // Add the last char of the contraction to the contraction-end hash table.
   1.898 +    // unless it is a trail surrogate, which is handled algorithmically and
   1.899 +    // shouldn't be in the table
   1.900 +    if(!(U16_IS_TRAIL(element->cPoints[element->cSize -1]))) {
   1.901 +        ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]);
   1.902 +    }
   1.903 +
   1.904 +    // First we need to check if contractions starts with a surrogate
   1.905 +    U16_NEXT(element->cPoints, cpsize, element->cSize, cp);
   1.906 +
   1.907 +    // If there are any Jamos in the contraction, we should turn on special
   1.908 +    // processing for Jamos
   1.909 +    if(UCOL_ISJAMO(element->prefix[0])) {
   1.910 +        t->image->jamoSpecial = TRUE;
   1.911 +    }
   1.912 +    /* then we need to deal with it */
   1.913 +    /* we could aready have something in table - or we might not */
   1.914 +
   1.915 +    if(!isPrefix(CE)) {
   1.916 +        /* if it wasn't contraction, we wouldn't end up here*/
   1.917 +        int32_t firstContractionOffset = 0;
   1.918 +        firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status);
   1.919 +        uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
   1.920 +        uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->prefix, newCE, status);
   1.921 +        uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status);
   1.922 +        CE =  constructContractCE(SPEC_PROC_TAG, firstContractionOffset);
   1.923 +    } else { /* we are adding to existing contraction */
   1.924 +        /* there were already some elements in the table, so we need to add a new contraction */
   1.925 +        /* Two things can happen here: either the codepoint is already in the table, or it is not */
   1.926 +        int32_t position = uprv_cnttab_findCP(contractions, CE, *element->prefix, status);
   1.927 +        if(position > 0) {       /* if it is we just continue down the chain */
   1.928 +            uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status);
   1.929 +            uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
   1.930 +            uprv_cnttab_setContraction(contractions, CE, position, *(element->prefix), newCE, status);
   1.931 +        } else {                  /* if it isn't, we will have to create a new sequence */
   1.932 +            uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
   1.933 +            uprv_cnttab_insertContraction(contractions, CE, *(element->prefix), element->mapCE, status);
   1.934 +        }
   1.935 +    }
   1.936 +
   1.937 +    element->cPoints = oldCP;
   1.938 +    element->cSize = oldCPSize;
   1.939 +
   1.940 +    return CE;
   1.941 +}
   1.942 +
   1.943 +// Note regarding surrogate handling: We are interested only in the single
   1.944 +// or leading surrogates in a contraction. If a surrogate is somewhere else
   1.945 +// in the contraction, it is going to be handled as a pair of code units,
   1.946 +// as it doesn't affect the performance AND handling surrogates specially
   1.947 +// would complicate code way too much.
   1.948 +static uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE, 
   1.949 +                                        UCAElements *element, UErrorCode *status)
   1.950 +{
   1.951 +    CntTable *contractions = t->contractions;
   1.952 +    UChar32 cp;
   1.953 +    uint32_t cpsize = 0;
   1.954 +
   1.955 +    contractions->currentTag = CONTRACTION_TAG;
   1.956 +
   1.957 +    // First we need to check if contractions starts with a surrogate
   1.958 +    U16_NEXT(element->cPoints, cpsize, element->cSize, cp);
   1.959 +
   1.960 +    if(cpsize<element->cSize) { // This is a real contraction, if there are other characters after the first
   1.961 +        uint32_t j = 0;
   1.962 +        for (j=1; j<element->cSize; j++) {   /* First add contraction chars to unsafe CP hash table */
   1.963 +            // Unless it is a trail surrogate, which is handled algoritmically and 
   1.964 +            // shouldn't take up space in the table.
   1.965 +            if(!(U16_IS_TRAIL(element->cPoints[j]))) {
   1.966 +                unsafeCPSet(t->unsafeCP, element->cPoints[j]);
   1.967 +            }
   1.968 +        }
   1.969 +        // Add the last char of the contraction to the contraction-end hash table.
   1.970 +        // unless it is a trail surrogate, which is handled algorithmically and 
   1.971 +        // shouldn't be in the table
   1.972 +        if(!(U16_IS_TRAIL(element->cPoints[element->cSize -1]))) {
   1.973 +            ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]);
   1.974 +        }
   1.975 +
   1.976 +        // If there are any Jamos in the contraction, we should turn on special 
   1.977 +        // processing for Jamos
   1.978 +        if(UCOL_ISJAMO(element->cPoints[0])) {
   1.979 +            t->image->jamoSpecial = TRUE;
   1.980 +        }
   1.981 +        /* then we need to deal with it */
   1.982 +        /* we could aready have something in table - or we might not */
   1.983 +        element->cPoints+=cpsize;
   1.984 +        element->cSize-=cpsize;
   1.985 +        if(!isContraction(CE)) { 
   1.986 +            /* if it wasn't contraction, we wouldn't end up here*/
   1.987 +            int32_t firstContractionOffset = 0;
   1.988 +            firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status);
   1.989 +            uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
   1.990 +            uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status);
   1.991 +            uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status);
   1.992 +            CE =  constructContractCE(CONTRACTION_TAG, firstContractionOffset);
   1.993 +        } else { /* we are adding to existing contraction */
   1.994 +            /* there were already some elements in the table, so we need to add a new contraction */
   1.995 +            /* Two things can happen here: either the codepoint is already in the table, or it is not */
   1.996 +            int32_t position = uprv_cnttab_findCP(contractions, CE, *element->cPoints, status);
   1.997 +            if(position > 0) {       /* if it is we just continue down the chain */
   1.998 +                uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status);
   1.999 +                uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
  1.1000 +                uprv_cnttab_setContraction(contractions, CE, position, *(element->cPoints), newCE, status);
  1.1001 +            } else {                  /* if it isn't, we will have to create a new sequence */
  1.1002 +                uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
  1.1003 +                uprv_cnttab_insertContraction(contractions, CE, *(element->cPoints), newCE, status);
  1.1004 +            }
  1.1005 +        }
  1.1006 +        element->cPoints-=cpsize;
  1.1007 +        element->cSize+=cpsize;
  1.1008 +        /*ucmpe32_set(t->mapping, cp, CE);*/
  1.1009 +        utrie_set32(t->mapping, cp, CE);
  1.1010 +    } else if(!isContraction(CE)) { /* this is just a surrogate, and there is no contraction */
  1.1011 +        /*ucmpe32_set(t->mapping, cp, element->mapCE);*/
  1.1012 +        utrie_set32(t->mapping, cp, element->mapCE);
  1.1013 +    } else { /* fill out the first stage of the contraction with the surrogate CE */
  1.1014 +        uprv_cnttab_changeContraction(contractions, CE, 0, element->mapCE, status);
  1.1015 +        uprv_cnttab_changeContraction(contractions, CE, 0xFFFF, element->mapCE, status);
  1.1016 +    }
  1.1017 +    return CE;
  1.1018 +}
  1.1019 +
  1.1020 +
  1.1021 +static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status) {
  1.1022 +    int32_t firstContractionOffset = 0;
  1.1023 +    //    uint32_t contractionElement = UCOL_NOT_FOUND;
  1.1024 +
  1.1025 +    if(U_FAILURE(*status)) {
  1.1026 +        return UCOL_NOT_FOUND;
  1.1027 +    }
  1.1028 +
  1.1029 +    /* end of recursion */
  1.1030 +    if(element->cSize == 1) {
  1.1031 +        if(isCntTableElement(existingCE) && ((UColCETags)getCETag(existingCE) == contractions->currentTag)) {
  1.1032 +            uprv_cnttab_changeContraction(contractions, existingCE, 0, element->mapCE, status);
  1.1033 +            uprv_cnttab_changeContraction(contractions, existingCE, 0xFFFF, element->mapCE, status);
  1.1034 +            return existingCE;
  1.1035 +        } else {
  1.1036 +            return element->mapCE; /*can't do just that. existingCe might be a contraction, meaning that we need to do another step */
  1.1037 +        }
  1.1038 +    }
  1.1039 +
  1.1040 +    /* this recursion currently feeds on the only element we have... We will have to copy it in order to accomodate */
  1.1041 +    /* for both backward and forward cycles */
  1.1042 +
  1.1043 +    /* we encountered either an empty space or a non-contraction element */
  1.1044 +    /* this means we are constructing a new contraction sequence */
  1.1045 +    element->cPoints++;
  1.1046 +    element->cSize--;
  1.1047 +    if(!isCntTableElement(existingCE)) { 
  1.1048 +        /* if it wasn't contraction, we wouldn't end up here*/
  1.1049 +        firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, existingCE, status);
  1.1050 +        uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
  1.1051 +        uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status);
  1.1052 +        uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, status);
  1.1053 +        existingCE =  constructContractCE(contractions->currentTag, firstContractionOffset);
  1.1054 +    } else { /* we are adding to existing contraction */
  1.1055 +        /* there were already some elements in the table, so we need to add a new contraction */
  1.1056 +        /* Two things can happen here: either the codepoint is already in the table, or it is not */
  1.1057 +        int32_t position = uprv_cnttab_findCP(contractions, existingCE, *element->cPoints, status);
  1.1058 +        if(position > 0) {       /* if it is we just continue down the chain */
  1.1059 +            uint32_t eCE = uprv_cnttab_getCE(contractions, existingCE, position, status);
  1.1060 +            uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
  1.1061 +            uprv_cnttab_setContraction(contractions, existingCE, position, *(element->cPoints), newCE, status);
  1.1062 +        } else {                  /* if it isn't, we will have to create a new sequence */
  1.1063 +            uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
  1.1064 +            uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), newCE, status);
  1.1065 +        }
  1.1066 +    }
  1.1067 +    element->cPoints--;
  1.1068 +    element->cSize++;
  1.1069 +    return existingCE;
  1.1070 +}
  1.1071 +
  1.1072 +static uint32_t uprv_uca_finalizeAddition(tempUCATable *t, UCAElements *element, UErrorCode *status) {
  1.1073 +    uint32_t CE = UCOL_NOT_FOUND;
  1.1074 +    // This should add a completely ignorable element to the 
  1.1075 +    // unsafe table, so that backward iteration will skip
  1.1076 +    // over it when treating contractions.
  1.1077 +    uint32_t i = 0;
  1.1078 +    if(element->mapCE == 0) {
  1.1079 +        for(i = 0; i < element->cSize; i++) {
  1.1080 +            if(!U16_IS_TRAIL(element->cPoints[i])) {
  1.1081 +                unsafeCPSet(t->unsafeCP, element->cPoints[i]);
  1.1082 +            }
  1.1083 +        }
  1.1084 +    }
  1.1085 +    if(element->cSize > 1) { /* we're adding a contraction */
  1.1086 +        uint32_t i = 0;
  1.1087 +        UChar32 cp;
  1.1088 +
  1.1089 +        U16_NEXT(element->cPoints, i, element->cSize, cp);
  1.1090 +        /*CE = ucmpe32_get(t->mapping, cp);*/
  1.1091 +        CE = utrie_get32(t->mapping, cp, NULL);
  1.1092 +
  1.1093 +        CE = uprv_uca_addContraction(t, CE, element, status);
  1.1094 +    } else { /* easy case, */
  1.1095 +        /*CE = ucmpe32_get(t->mapping, element->cPoints[0]);*/
  1.1096 +        CE = utrie_get32(t->mapping, element->cPoints[0], NULL);
  1.1097 +
  1.1098 +        if( CE != UCOL_NOT_FOUND) {
  1.1099 +            if(isCntTableElement(CE) /*isContraction(CE)*/) { /* adding a non contraction element (thai, expansion, single) to already existing contraction */
  1.1100 +                if(!isPrefix(element->mapCE)) { // we cannot reenter prefix elements - as we are going to create a dead loop
  1.1101 +                    // Only expansions and regular CEs can go here... Contractions will never happen in this place
  1.1102 +                    uprv_cnttab_setContraction(t->contractions, CE, 0, 0, element->mapCE, status);
  1.1103 +                    /* This loop has to change the CE at the end of contraction REDO!*/
  1.1104 +                    uprv_cnttab_changeLastCE(t->contractions, CE, element->mapCE, status);
  1.1105 +                }
  1.1106 +            } else {
  1.1107 +                /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/
  1.1108 +                utrie_set32(t->mapping, element->cPoints[0], element->mapCE);
  1.1109 +                if ((element->prefixSize!=0) && (!isSpecial(CE) || (getCETag(CE)!=IMPLICIT_TAG))) {
  1.1110 +                    UCAElements *origElem = (UCAElements *)uprv_malloc(sizeof(UCAElements));
  1.1111 +                    /* test for NULL */
  1.1112 +                    if (origElem== NULL) {
  1.1113 +                        *status = U_MEMORY_ALLOCATION_ERROR;
  1.1114 +                        return 0;
  1.1115 +                    }
  1.1116 +                    /* copy the original UCA value */
  1.1117 +                    origElem->prefixSize = 0;
  1.1118 +                    origElem->prefix = NULL;
  1.1119 +                    origElem->cPoints = origElem->uchars;
  1.1120 +                    origElem->cPoints[0] = element->cPoints[0];
  1.1121 +                    origElem->cSize = 1;
  1.1122 +                    origElem->CEs[0]=CE;
  1.1123 +                    origElem->mapCE=CE;
  1.1124 +                    origElem->noOfCEs=1;
  1.1125 +                    uprv_uca_finalizeAddition(t, origElem, status);
  1.1126 +                    uprv_free(origElem);
  1.1127 +                }
  1.1128 +#ifdef UCOL_DEBUG
  1.1129 +                fprintf(stderr, "Warning - trying to overwrite existing data %08X for cp %04X with %08X\n", CE, element->cPoints[0], element->CEs[0]);
  1.1130 +                //*status = U_ILLEGAL_ARGUMENT_ERROR;
  1.1131 +#endif
  1.1132 +            }
  1.1133 +        } else {
  1.1134 +            /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/
  1.1135 +            utrie_set32(t->mapping, element->cPoints[0], element->mapCE);
  1.1136 +        }
  1.1137 +    }
  1.1138 +    return CE;
  1.1139 +}
  1.1140 +
  1.1141 +/* This adds a read element, while testing for existence */
  1.1142 +U_CAPI uint32_t  U_EXPORT2
  1.1143 +uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status) {
  1.1144 +    U_NAMESPACE_USE
  1.1145 +
  1.1146 +    ExpansionTable *expansions = t->expansions;
  1.1147 +
  1.1148 +    uint32_t i = 1;
  1.1149 +    uint32_t expansion = 0;
  1.1150 +    uint32_t CE;
  1.1151 +
  1.1152 +    if(U_FAILURE(*status)) {
  1.1153 +        return 0xFFFF;
  1.1154 +    }
  1.1155 +
  1.1156 +    element->mapCE = 0; // clear mapCE so that we can catch expansions
  1.1157 +
  1.1158 +    if(element->noOfCEs == 1) {
  1.1159 +        element->mapCE = element->CEs[0];      
  1.1160 +    } else {     
  1.1161 +        /* ICU 2.1 long primaries */
  1.1162 +        /* unfortunately, it looks like we have to look for a long primary here */
  1.1163 +        /* since in canonical closure we are going to hit some long primaries from */
  1.1164 +        /* the first phase, and they will come back as continuations/expansions */
  1.1165 +        /* destroying the effect of the previous opitimization */
  1.1166 +        /* A long primary is a three byte primary with starting secondaries and tertiaries */
  1.1167 +        /* It can appear in long runs of only primary differences (like east Asian tailorings) */
  1.1168 +        /* also, it should not be an expansion, as expansions would break with this */
  1.1169 +        // This part came in from ucol_bld.cpp
  1.1170 +        //if(tok->expansion == 0
  1.1171 +        //&& noOfBytes[0] == 3 && noOfBytes[1] == 1 && noOfBytes[2] == 1
  1.1172 +        //&& CEparts[1] == (UCOL_BYTE_COMMON << 24) && CEparts[2] == (UCOL_BYTE_COMMON << 24)) {
  1.1173 +        /* we will construct a special CE that will go unchanged to the table */
  1.1174 +        if(element->noOfCEs == 2 // a two CE expansion 
  1.1175 +            && isContinuation(element->CEs[1]) // which  is a continuation
  1.1176 +            && (element->CEs[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER))) == 0 // that has only primaries in continuation,
  1.1177 +            && (((element->CEs[0]>>8) & 0xFF) == UCOL_BYTE_COMMON) // a common secondary
  1.1178 +            && ((element->CEs[0] & 0xFF) == UCOL_BYTE_COMMON) // and a common tertiary
  1.1179 +            )
  1.1180 +        {
  1.1181 +#ifdef UCOL_DEBUG
  1.1182 +            fprintf(stdout, "Long primary %04X\n", element->cPoints[0]);
  1.1183 +#endif
  1.1184 +            element->mapCE = UCOL_SPECIAL_FLAG | (LONG_PRIMARY_TAG<<24) // a long primary special
  1.1185 +                | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary
  1.1186 +                | ((element->CEs[1]>>24) & 0xFF);   // third byte of primary
  1.1187 +        }
  1.1188 +        else {
  1.1189 +            expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT) 
  1.1190 +                | (((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
  1.1191 +                   & 0xFFFFF0));
  1.1192 +
  1.1193 +            for(i = 1; i<element->noOfCEs; i++) {
  1.1194 +                uprv_uca_addExpansion(expansions, element->CEs[i], status);
  1.1195 +            }
  1.1196 +            if(element->noOfCEs <= 0xF) {
  1.1197 +                expansion |= element->noOfCEs;
  1.1198 +            } else {
  1.1199 +                uprv_uca_addExpansion(expansions, 0, status);
  1.1200 +            }
  1.1201 +            element->mapCE = expansion;
  1.1202 +            uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1],
  1.1203 +                (uint8_t)element->noOfCEs,
  1.1204 +                t->maxExpansions,
  1.1205 +                status);
  1.1206 +            if(UCOL_ISJAMO(element->cPoints[0])) {
  1.1207 +                t->image->jamoSpecial = TRUE;
  1.1208 +                uprv_uca_setMaxJamoExpansion(element->cPoints[0],
  1.1209 +                    element->CEs[element->noOfCEs - 1],
  1.1210 +                    (uint8_t)element->noOfCEs,
  1.1211 +                    t->maxJamoExpansions,
  1.1212 +                    status);
  1.1213 +            }
  1.1214 +            if (U_FAILURE(*status)) {
  1.1215 +                return 0;
  1.1216 +            }
  1.1217 +        }
  1.1218 +    }
  1.1219 +
  1.1220 +    // We treat digits differently - they are "uber special" and should be
  1.1221 +    // processed differently if numeric collation is on. 
  1.1222 +    UChar32 uniChar = 0;
  1.1223 +    //printElement(element);
  1.1224 +    if ((element->cSize == 2) && U16_IS_LEAD(element->cPoints[0])){
  1.1225 +        uniChar = U16_GET_SUPPLEMENTARY(element->cPoints[0], element->cPoints[1]);
  1.1226 +    } else if (element->cSize == 1){
  1.1227 +        uniChar = element->cPoints[0];
  1.1228 +    }
  1.1229 +
  1.1230 +    // Here, we either have one normal CE OR mapCE is set. Therefore, we stuff only
  1.1231 +    // one element to the expansion buffer. When we encounter a digit and we don't 
  1.1232 +    // do numeric collation, we will just pick the CE we have and break out of case
  1.1233 +    // (see ucol.cpp ucol_prv_getSpecialCE && ucol_prv_getSpecialPrevCE). If we picked
  1.1234 +    // a special, further processing will occur. If it's a simple CE, we'll return due
  1.1235 +    // to how the loop is constructed.
  1.1236 +    if (uniChar != 0 && u_isdigit(uniChar)){
  1.1237 +        expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (DIGIT_TAG<<UCOL_TAG_SHIFT) | 1); // prepare the element
  1.1238 +        if(element->mapCE) { // if there is an expansion, we'll pick it here
  1.1239 +            expansion |= ((uprv_uca_addExpansion(expansions, element->mapCE, status)+(headersize>>2))<<4);
  1.1240 +        } else {
  1.1241 +            expansion |= ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4);
  1.1242 +        }
  1.1243 +        element->mapCE = expansion;
  1.1244 +
  1.1245 +        // Need to go back to the beginning of the digit string if in the middle!
  1.1246 +        if(uniChar <= 0xFFFF) { // supplementaries are always unsafe. API takes UChars
  1.1247 +            unsafeCPSet(t->unsafeCP, (UChar)uniChar);
  1.1248 +        }
  1.1249 +    }
  1.1250 +
  1.1251 +    // here we want to add the prefix structure.
  1.1252 +    // I will try to process it as a reverse contraction, if possible.
  1.1253 +    // prefix buffer is already reversed.
  1.1254 +
  1.1255 +    if(element->prefixSize!=0) {
  1.1256 +        // We keep the seen prefix starter elements in a hashtable
  1.1257 +        // we need it to be able to distinguish between the simple
  1.1258 +        // codepoints and prefix starters. Also, we need to use it
  1.1259 +        // for canonical closure.
  1.1260 +
  1.1261 +        UCAElements *composed = (UCAElements *)uprv_malloc(sizeof(UCAElements));
  1.1262 +        /* test for NULL */
  1.1263 +        if (composed == NULL) {
  1.1264 +            *status = U_MEMORY_ALLOCATION_ERROR;
  1.1265 +            return 0;
  1.1266 +        }
  1.1267 +        uprv_memcpy(composed, element, sizeof(UCAElements));
  1.1268 +        composed->cPoints = composed->uchars;
  1.1269 +        composed->prefix = composed->prefixChars;
  1.1270 +
  1.1271 +        composed->prefixSize = unorm_normalize(element->prefix, element->prefixSize, UNORM_NFC, 0, composed->prefix, 128, status);
  1.1272 +
  1.1273 +
  1.1274 +        if(t->prefixLookup != NULL) {
  1.1275 +            UCAElements *uCE = (UCAElements *)uhash_get(t->prefixLookup, element);
  1.1276 +            if(uCE != NULL) { // there is already a set of code points here
  1.1277 +                element->mapCE = uprv_uca_addPrefix(t, uCE->mapCE, element, status);
  1.1278 +            } else { // no code points, so this spot is clean
  1.1279 +                element->mapCE = uprv_uca_addPrefix(t, UCOL_NOT_FOUND, element, status);
  1.1280 +                uCE = (UCAElements *)uprv_malloc(sizeof(UCAElements));
  1.1281 +                /* test for NULL */
  1.1282 +                if (uCE == NULL) {
  1.1283 +                    *status = U_MEMORY_ALLOCATION_ERROR;
  1.1284 +                    return 0;
  1.1285 +                }
  1.1286 +                uprv_memcpy(uCE, element, sizeof(UCAElements));
  1.1287 +                uCE->cPoints = uCE->uchars;
  1.1288 +                uhash_put(t->prefixLookup, uCE, uCE, status);
  1.1289 +            }
  1.1290 +            if(composed->prefixSize != element->prefixSize || uprv_memcmp(composed->prefix, element->prefix, element->prefixSize)) {
  1.1291 +                // do it!
  1.1292 +                composed->mapCE = uprv_uca_addPrefix(t, element->mapCE, composed, status);
  1.1293 +            }
  1.1294 +        }
  1.1295 +        uprv_free(composed);
  1.1296 +    }
  1.1297 +
  1.1298 +    // We need to use the canonical iterator here
  1.1299 +    // the way we do it is to generate the canonically equivalent strings 
  1.1300 +    // for the contraction and then add the sequences that pass FCD check
  1.1301 +    if(element->cSize > 1 && !(element->cSize==2 && U16_IS_LEAD(element->cPoints[0]) && U16_IS_TRAIL(element->cPoints[1]))) { // this is a contraction, we should check whether a composed form should also be included
  1.1302 +        UnicodeString source(element->cPoints, element->cSize);
  1.1303 +        CanonicalIterator it(source, *status);
  1.1304 +        source = it.next();
  1.1305 +        while(!source.isBogus()) {
  1.1306 +            if(Normalizer::quickCheck(source, UNORM_FCD, *status) != UNORM_NO) {
  1.1307 +                element->cSize = source.extract(element->cPoints, 128, *status);
  1.1308 +                uprv_uca_finalizeAddition(t, element, status);
  1.1309 +            }
  1.1310 +            source = it.next();
  1.1311 +        }
  1.1312 +        CE = element->mapCE;
  1.1313 +    } else {
  1.1314 +        CE = uprv_uca_finalizeAddition(t, element, status);  
  1.1315 +    }
  1.1316 +
  1.1317 +    return CE;
  1.1318 +}
  1.1319 +
  1.1320 +
  1.1321 +/*void uprv_uca_getMaxExpansionJamo(CompactEIntArray       *mapping, */
  1.1322 +static void uprv_uca_getMaxExpansionJamo(UNewTrie       *mapping, 
  1.1323 +                                         MaxExpansionTable     *maxexpansion,
  1.1324 +                                         MaxJamoExpansionTable *maxjamoexpansion,
  1.1325 +                                         UBool                  jamospecial,
  1.1326 +                                         UErrorCode            *status)
  1.1327 +{
  1.1328 +    const uint32_t VBASE  = 0x1161;
  1.1329 +    const uint32_t TBASE  = 0x11A8;
  1.1330 +    const uint32_t VCOUNT = 21;
  1.1331 +    const uint32_t TCOUNT = 28;
  1.1332 +
  1.1333 +    uint32_t v = VBASE + VCOUNT - 1;
  1.1334 +    uint32_t t = TBASE + TCOUNT - 1;
  1.1335 +    uint32_t ce;
  1.1336 +
  1.1337 +    while (v >= VBASE) {
  1.1338 +        /*ce = ucmpe32_get(mapping, v);*/
  1.1339 +        ce = utrie_get32(mapping, v, NULL);
  1.1340 +        if (ce < UCOL_SPECIAL_FLAG) {
  1.1341 +            uprv_uca_setMaxExpansion(ce, 2, maxexpansion, status);
  1.1342 +        }
  1.1343 +        v --;
  1.1344 +    }
  1.1345 +
  1.1346 +    while (t >= TBASE)
  1.1347 +    {
  1.1348 +        /*ce = ucmpe32_get(mapping, t);*/
  1.1349 +        ce = utrie_get32(mapping, t, NULL);
  1.1350 +        if (ce < UCOL_SPECIAL_FLAG) {
  1.1351 +            uprv_uca_setMaxExpansion(ce, 3, maxexpansion, status);
  1.1352 +        }
  1.1353 +        t --;
  1.1354 +    }
  1.1355 +    /*  According to the docs, 99% of the time, the Jamo will not be special */
  1.1356 +    if (jamospecial) {
  1.1357 +        /* gets the max expansion in all unicode characters */
  1.1358 +        int     count    = maxjamoexpansion->position;
  1.1359 +        uint8_t maxTSize = (uint8_t)(maxjamoexpansion->maxLSize + 
  1.1360 +            maxjamoexpansion->maxVSize +
  1.1361 +            maxjamoexpansion->maxTSize);
  1.1362 +        uint8_t maxVSize = (uint8_t)(maxjamoexpansion->maxLSize + 
  1.1363 +            maxjamoexpansion->maxVSize);
  1.1364 +
  1.1365 +        while (count > 0) {
  1.1366 +            count --;
  1.1367 +            if (*(maxjamoexpansion->isV + count) == TRUE) {
  1.1368 +                uprv_uca_setMaxExpansion(
  1.1369 +                    *(maxjamoexpansion->endExpansionCE + count), 
  1.1370 +                    maxVSize, maxexpansion, status);
  1.1371 +            }
  1.1372 +            else {
  1.1373 +                uprv_uca_setMaxExpansion(
  1.1374 +                    *(maxjamoexpansion->endExpansionCE + count), 
  1.1375 +                    maxTSize, maxexpansion, status);
  1.1376 +            }
  1.1377 +        }
  1.1378 +    }
  1.1379 +}
  1.1380 +
  1.1381 +U_CDECL_BEGIN
  1.1382 +static inline uint32_t U_CALLCONV
  1.1383 +getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset)
  1.1384 +{
  1.1385 +    uint32_t value;
  1.1386 +    uint32_t tag;
  1.1387 +    UChar32 limit;
  1.1388 +    UBool inBlockZero;
  1.1389 +
  1.1390 +    limit=start+0x400;
  1.1391 +    while(start<limit) {
  1.1392 +        value=utrie_get32(trie, start, &inBlockZero);
  1.1393 +        tag = getCETag(value);
  1.1394 +        if(inBlockZero == TRUE) {
  1.1395 +            start+=UTRIE_DATA_BLOCK_LENGTH;
  1.1396 +        } else if(!(isSpecial(value) && (tag == IMPLICIT_TAG || tag == NOT_FOUND_TAG))) {
  1.1397 +            /* These are values that are starting in either UCA (IMPLICIT_TAG) or in the 
  1.1398 +            * tailorings (NOT_FOUND_TAG). Presence of these tags means that there is 
  1.1399 +            * nothing in this position and that it should be skipped.
  1.1400 +            */
  1.1401 +#ifdef UCOL_DEBUG
  1.1402 +            static int32_t count = 1;
  1.1403 +            fprintf(stdout, "%i, Folded %08X, value %08X\n", count++, start, value);
  1.1404 +#endif
  1.1405 +            return (uint32_t)(UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24) | offset);
  1.1406 +        } else {
  1.1407 +            ++start;
  1.1408 +        }
  1.1409 +    }
  1.1410 +    return 0;
  1.1411 +}
  1.1412 +U_CDECL_END
  1.1413 +
  1.1414 +#ifdef UCOL_DEBUG
  1.1415 +// This is a debug function to print the contents of a trie.
  1.1416 +// It is used in conjuction with the code around utrie_unserialize call
  1.1417 +UBool enumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
  1.1418 +    if(start<0x10000) {
  1.1419 +        fprintf(stdout, "%08X, %08X, %08X\n", start, limit, value);
  1.1420 +    } else {
  1.1421 +        fprintf(stdout, "%08X=%04X %04X, %08X=%04X %04X, %08X\n", start, U16_LEAD(start), U16_TRAIL(start), limit, U16_LEAD(limit), U16_TRAIL(limit), value);
  1.1422 +    }
  1.1423 +    return TRUE;
  1.1424 +}
  1.1425 +
  1.1426 +int32_t 
  1.1427 +myGetFoldingOffset(uint32_t data) {
  1.1428 +    if(data > UCOL_NOT_FOUND && getCETag(data) == SURROGATE_TAG) {
  1.1429 +        return (data&0xFFFFFF);
  1.1430 +    } else {
  1.1431 +        return 0;
  1.1432 +    }
  1.1433 +}
  1.1434 +#endif
  1.1435 +
  1.1436 +U_CAPI UCATableHeader* U_EXPORT2
  1.1437 +uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
  1.1438 +    /*CompactEIntArray *mapping = t->mapping;*/
  1.1439 +    UNewTrie *mapping = t->mapping;
  1.1440 +    ExpansionTable *expansions = t->expansions;
  1.1441 +    CntTable *contractions = t->contractions;
  1.1442 +    MaxExpansionTable *maxexpansion = t->maxExpansions;
  1.1443 +
  1.1444 +    if(U_FAILURE(*status)) {
  1.1445 +        return NULL;
  1.1446 +    }
  1.1447 +
  1.1448 +    uint32_t beforeContractions = (uint32_t)((headersize+paddedsize(expansions->position*sizeof(uint32_t)))/sizeof(UChar));
  1.1449 +
  1.1450 +    int32_t contractionsSize = 0;
  1.1451 +    contractionsSize = uprv_cnttab_constructTable(contractions, beforeContractions, status);
  1.1452 +
  1.1453 +    /* the following operation depends on the trie data. Therefore, we have to do it before */
  1.1454 +    /* the trie is compacted */
  1.1455 +    /* sets jamo expansions */
  1.1456 +    uprv_uca_getMaxExpansionJamo(mapping, maxexpansion, t->maxJamoExpansions,
  1.1457 +        t->image->jamoSpecial, status);
  1.1458 +
  1.1459 +    /*ucmpe32_compact(mapping);*/
  1.1460 +    /*UMemoryStream *ms = uprv_mstrm_openNew(8192);*/
  1.1461 +    /*int32_t mappingSize = ucmpe32_flattenMem(mapping, ms);*/
  1.1462 +    /*const uint8_t *flattened = uprv_mstrm_getBuffer(ms, &mappingSize);*/
  1.1463 +
  1.1464 +    // After setting the jamo expansions, compact the trie and get the needed size
  1.1465 +    int32_t mappingSize = utrie_serialize(mapping, NULL, 0, getFoldedValue /*getFoldedValue*/, FALSE, status);
  1.1466 +
  1.1467 +    uint32_t tableOffset = 0;
  1.1468 +    uint8_t *dataStart;
  1.1469 +
  1.1470 +    /* TODO: LATIN1 array is now in the utrie - it should be removed from the calculation */
  1.1471 +
  1.1472 +    uint32_t toAllocate =(uint32_t)(headersize+                                    
  1.1473 +        paddedsize(expansions->position*sizeof(uint32_t))+
  1.1474 +        paddedsize(mappingSize)+
  1.1475 +        paddedsize(contractionsSize*(sizeof(UChar)+sizeof(uint32_t)))+
  1.1476 +        //paddedsize(0x100*sizeof(uint32_t))  /* Latin1 is now included in the trie */
  1.1477 +        /* maxexpansion array */
  1.1478 +        + paddedsize(maxexpansion->position * sizeof(uint32_t)) +
  1.1479 +        /* maxexpansion size array */
  1.1480 +        paddedsize(maxexpansion->position * sizeof(uint8_t)) +
  1.1481 +        paddedsize(UCOL_UNSAFECP_TABLE_SIZE) +   /*  Unsafe chars             */
  1.1482 +        paddedsize(UCOL_UNSAFECP_TABLE_SIZE));    /*  Contraction Ending chars */
  1.1483 +
  1.1484 +
  1.1485 +    dataStart = (uint8_t *)uprv_malloc(toAllocate);
  1.1486 +    /* test for NULL */
  1.1487 +    if (dataStart == NULL) {
  1.1488 +        *status = U_MEMORY_ALLOCATION_ERROR;
  1.1489 +        return NULL;
  1.1490 +    }
  1.1491 +
  1.1492 +    UCATableHeader *myData = (UCATableHeader *)dataStart;
  1.1493 +    // Please, do reset all the fields!
  1.1494 +    uprv_memset(dataStart, 0, toAllocate);
  1.1495 +    // Make sure we know this is reset
  1.1496 +    myData->magic = UCOL_HEADER_MAGIC;
  1.1497 +    myData->isBigEndian = U_IS_BIG_ENDIAN;
  1.1498 +    myData->charSetFamily = U_CHARSET_FAMILY;
  1.1499 +    myData->formatVersion[0] = UCA_FORMAT_VERSION_0;
  1.1500 +    myData->formatVersion[1] = UCA_FORMAT_VERSION_1;
  1.1501 +    myData->formatVersion[2] = UCA_FORMAT_VERSION_2;
  1.1502 +    myData->formatVersion[3] = UCA_FORMAT_VERSION_3;
  1.1503 +    myData->jamoSpecial = t->image->jamoSpecial;
  1.1504 +
  1.1505 +    // Don't copy stuff from UCA header!
  1.1506 +    //uprv_memcpy(myData, t->image, sizeof(UCATableHeader));
  1.1507 +
  1.1508 +    myData->contractionSize = contractionsSize;
  1.1509 +
  1.1510 +    tableOffset += (uint32_t)(paddedsize(sizeof(UCATableHeader)));
  1.1511 +
  1.1512 +    myData->options = tableOffset;
  1.1513 +    uprv_memcpy(dataStart+tableOffset, t->options, sizeof(UColOptionSet));
  1.1514 +    tableOffset += (uint32_t)(paddedsize(sizeof(UColOptionSet)));
  1.1515 +
  1.1516 +    /* copy expansions */
  1.1517 +    /*myData->expansion = (uint32_t *)dataStart+tableOffset;*/
  1.1518 +    myData->expansion = tableOffset;
  1.1519 +    uprv_memcpy(dataStart+tableOffset, expansions->CEs, expansions->position*sizeof(uint32_t));
  1.1520 +    tableOffset += (uint32_t)(paddedsize(expansions->position*sizeof(uint32_t)));
  1.1521 +
  1.1522 +    /* contractions block */
  1.1523 +    if(contractionsSize != 0) {
  1.1524 +        /* copy contraction index */
  1.1525 +        /*myData->contractionIndex = (UChar *)(dataStart+tableOffset);*/
  1.1526 +        myData->contractionIndex = tableOffset;
  1.1527 +        uprv_memcpy(dataStart+tableOffset, contractions->codePoints, contractionsSize*sizeof(UChar));
  1.1528 +        tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(UChar)));
  1.1529 +
  1.1530 +        /* copy contraction collation elements */
  1.1531 +        /*myData->contractionCEs = (uint32_t *)(dataStart+tableOffset);*/
  1.1532 +        myData->contractionCEs = tableOffset;
  1.1533 +        uprv_memcpy(dataStart+tableOffset, contractions->CEs, contractionsSize*sizeof(uint32_t));
  1.1534 +        tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(uint32_t)));
  1.1535 +    } else {
  1.1536 +        myData->contractionIndex = 0;
  1.1537 +        myData->contractionCEs = 0;
  1.1538 +    }
  1.1539 +
  1.1540 +    /* copy mapping table */
  1.1541 +    /*myData->mappingPosition = dataStart+tableOffset;*/
  1.1542 +    /*myData->mappingPosition = tableOffset;*/
  1.1543 +    /*uprv_memcpy(dataStart+tableOffset, flattened, mappingSize);*/
  1.1544 +
  1.1545 +    myData->mappingPosition = tableOffset;
  1.1546 +    utrie_serialize(mapping, dataStart+tableOffset, toAllocate-tableOffset, getFoldedValue, FALSE, status);
  1.1547 +#ifdef UCOL_DEBUG
  1.1548 +    // This is debug code to dump the contents of the trie. It needs two functions defined above
  1.1549 +    {
  1.1550 +        UTrie UCAt = { 0 };
  1.1551 +        uint32_t trieWord;
  1.1552 +        utrie_unserialize(&UCAt, dataStart+tableOffset, 9999999, status);
  1.1553 +        UCAt.getFoldingOffset = myGetFoldingOffset;
  1.1554 +        if(U_SUCCESS(*status)) {
  1.1555 +            utrie_enum(&UCAt, NULL, enumRange, NULL);
  1.1556 +        }
  1.1557 +        trieWord = UTRIE_GET32_FROM_LEAD(&UCAt, 0xDC01);
  1.1558 +    }
  1.1559 +#endif
  1.1560 +    tableOffset += paddedsize(mappingSize);
  1.1561 +
  1.1562 +
  1.1563 +    int32_t i = 0;
  1.1564 +
  1.1565 +    /* copy max expansion table */
  1.1566 +    myData->endExpansionCE      = tableOffset;
  1.1567 +    myData->endExpansionCECount = maxexpansion->position - 1;
  1.1568 +    /* not copying the first element which is a dummy */
  1.1569 +    uprv_memcpy(dataStart + tableOffset, maxexpansion->endExpansionCE + 1, 
  1.1570 +        (maxexpansion->position - 1) * sizeof(uint32_t));
  1.1571 +    tableOffset += (uint32_t)(paddedsize((maxexpansion->position)* sizeof(uint32_t)));
  1.1572 +    myData->expansionCESize = tableOffset;
  1.1573 +    uprv_memcpy(dataStart + tableOffset, maxexpansion->expansionCESize + 1, 
  1.1574 +        (maxexpansion->position - 1) * sizeof(uint8_t));
  1.1575 +    tableOffset += (uint32_t)(paddedsize((maxexpansion->position)* sizeof(uint8_t)));
  1.1576 +
  1.1577 +    /* Unsafe chars table.  Finish it off, then copy it. */
  1.1578 +    uprv_uca_unsafeCPAddCCNZ(t, status);
  1.1579 +    if (t->UCA != 0) {              /* Or in unsafebits from UCA, making a combined table.    */
  1.1580 +        for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) {    
  1.1581 +            t->unsafeCP[i] |= t->UCA->unsafeCP[i];
  1.1582 +        }
  1.1583 +    }
  1.1584 +    myData->unsafeCP = tableOffset;
  1.1585 +    uprv_memcpy(dataStart + tableOffset, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE);
  1.1586 +    tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE);
  1.1587 +
  1.1588 +
  1.1589 +    /* Finish building Contraction Ending chars hash table and then copy it out.  */
  1.1590 +    if (t->UCA != 0) {              /* Or in unsafebits from UCA, making a combined table.    */
  1.1591 +        for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) {    
  1.1592 +            t->contrEndCP[i] |= t->UCA->contrEndCP[i];
  1.1593 +        }
  1.1594 +    }
  1.1595 +    myData->contrEndCP = tableOffset;
  1.1596 +    uprv_memcpy(dataStart + tableOffset, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE);
  1.1597 +    tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE);
  1.1598 +
  1.1599 +    if(tableOffset != toAllocate) {
  1.1600 +#ifdef UCOL_DEBUG
  1.1601 +        fprintf(stderr, "calculation screwup!!! Expected to write %i but wrote %i instead!!!\n", toAllocate, tableOffset);
  1.1602 +#endif
  1.1603 +        *status = U_INTERNAL_PROGRAM_ERROR;
  1.1604 +        uprv_free(dataStart);
  1.1605 +        return 0;
  1.1606 +    }
  1.1607 +
  1.1608 +    myData->size = tableOffset;
  1.1609 +    /* This should happen upon ressurection */
  1.1610 +    /*const uint8_t *mapPosition = (uint8_t*)myData+myData->mappingPosition;*/
  1.1611 +    /*uprv_mstrm_close(ms);*/
  1.1612 +    return myData;
  1.1613 +}
  1.1614 +
  1.1615 +
  1.1616 +struct enumStruct {
  1.1617 +    tempUCATable *t;
  1.1618 +    UCollator *tempColl;
  1.1619 +    UCollationElements* colEl;
  1.1620 +    const Normalizer2Impl *nfcImpl;
  1.1621 +    UnicodeSet *closed;
  1.1622 +    int32_t noOfClosures;
  1.1623 +    UErrorCode *status;
  1.1624 +};
  1.1625 +U_CDECL_BEGIN
  1.1626 +static UBool U_CALLCONV
  1.1627 +_enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
  1.1628 +
  1.1629 +    if (type != U_UNASSIGNED && type != U_PRIVATE_USE_CHAR) { // if the range is assigned - we might ommit more categories later
  1.1630 +        UErrorCode *status = ((enumStruct *)context)->status;
  1.1631 +        tempUCATable *t = ((enumStruct *)context)->t;
  1.1632 +        UCollator *tempColl = ((enumStruct *)context)->tempColl;
  1.1633 +        UCollationElements* colEl = ((enumStruct *)context)->colEl;
  1.1634 +        UCAElements el;
  1.1635 +        UChar decompBuffer[4];
  1.1636 +        const UChar *decomp;
  1.1637 +        int32_t noOfDec = 0;
  1.1638 +
  1.1639 +        UChar32 u32 = 0;
  1.1640 +        UChar comp[2];
  1.1641 +        uint32_t len = 0;
  1.1642 +
  1.1643 +        for(u32 = start; u32 < limit; u32++) {
  1.1644 +            decomp = ((enumStruct *)context)->nfcImpl->
  1.1645 +                getDecomposition(u32, decompBuffer, noOfDec);
  1.1646 +            //if((noOfDec = unorm_normalize(comp, len, UNORM_NFD, 0, decomp, 256, status)) > 1
  1.1647 +            //|| (noOfDec == 1 && *decomp != (UChar)u32))
  1.1648 +            if(decomp != NULL)
  1.1649 +            {
  1.1650 +                len = 0;
  1.1651 +                U16_APPEND_UNSAFE(comp, len, u32);
  1.1652 +                if(ucol_strcoll(tempColl, comp, len, decomp, noOfDec) != UCOL_EQUAL) {
  1.1653 +#ifdef UCOL_DEBUG
  1.1654 +                    fprintf(stderr, "Closure: U+%04X -> ", u32);
  1.1655 +                    UChar32 c;
  1.1656 +                    int32_t i = 0;
  1.1657 +                    while(i < noOfDec) {
  1.1658 +                        U16_NEXT(decomp, i, noOfDec, c);
  1.1659 +                        fprintf(stderr, "%04X ", c);
  1.1660 +                    }
  1.1661 +                    fprintf(stderr, "\n");
  1.1662 +                    // print CEs for code point vs. decomposition
  1.1663 +                    fprintf(stderr, "U+%04X CEs: ", u32);
  1.1664 +                    UCollationElements *iter = ucol_openElements(tempColl, comp, len, status);
  1.1665 +                    int32_t ce;
  1.1666 +                    while((ce = ucol_next(iter, status)) != UCOL_NULLORDER) {
  1.1667 +                        fprintf(stderr, "%08X ", ce);
  1.1668 +                    }
  1.1669 +                    fprintf(stderr, "\nDecomp CEs: ");
  1.1670 +                    ucol_setText(iter, decomp, noOfDec, status);
  1.1671 +                    while((ce = ucol_next(iter, status)) != UCOL_NULLORDER) {
  1.1672 +                        fprintf(stderr, "%08X ", ce);
  1.1673 +                    }
  1.1674 +                    fprintf(stderr, "\n");
  1.1675 +                    ucol_closeElements(iter);
  1.1676 +#endif
  1.1677 +                    if(((enumStruct *)context)->closed != NULL) {
  1.1678 +                        ((enumStruct *)context)->closed->add(u32);
  1.1679 +                    }
  1.1680 +                    ((enumStruct *)context)->noOfClosures++;
  1.1681 +                    el.cPoints = (UChar *)decomp;
  1.1682 +                    el.cSize = noOfDec;
  1.1683 +                    el.noOfCEs = 0;
  1.1684 +                    el.prefix = el.prefixChars;
  1.1685 +                    el.prefixSize = 0;
  1.1686 +
  1.1687 +                    UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, &el);
  1.1688 +                    el.cPoints = comp;
  1.1689 +                    el.cSize = len;
  1.1690 +                    el.prefix = el.prefixChars;
  1.1691 +                    el.prefixSize = 0;
  1.1692 +                    if(prefix == NULL) {
  1.1693 +                        el.noOfCEs = 0;
  1.1694 +                        ucol_setText(colEl, decomp, noOfDec, status);
  1.1695 +                        while((el.CEs[el.noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) {
  1.1696 +                            el.noOfCEs++;
  1.1697 +                        }
  1.1698 +                    } else {
  1.1699 +                        el.noOfCEs = 1;
  1.1700 +                        el.CEs[0] = prefix->mapCE;
  1.1701 +                        // This character uses a prefix. We have to add it 
  1.1702 +                        // to the unsafe table, as it decomposed form is already
  1.1703 +                        // in. In Japanese, this happens for \u309e & \u30fe
  1.1704 +                        // Since unsafeCPSet is static in ucol_elm, we are going
  1.1705 +                        // to wrap it up in the uprv_uca_unsafeCPAddCCNZ function
  1.1706 +                    }
  1.1707 +                    uprv_uca_addAnElement(t, &el, status);
  1.1708 +                }
  1.1709 +            }
  1.1710 +        }
  1.1711 +    }
  1.1712 +    return TRUE;
  1.1713 +}
  1.1714 +U_CDECL_END
  1.1715 +
  1.1716 +static void
  1.1717 +uprv_uca_setMapCE(tempUCATable *t, UCAElements *element, UErrorCode *status) {
  1.1718 +    uint32_t expansion = 0;
  1.1719 +    int32_t j;
  1.1720 +
  1.1721 +    ExpansionTable *expansions = t->expansions;
  1.1722 +    if(element->noOfCEs == 2 // a two CE expansion
  1.1723 +        && isContinuation(element->CEs[1]) // which  is a continuation
  1.1724 +        && (element->CEs[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER))) == 0 // that has only primaries in continuation,
  1.1725 +        && (((element->CEs[0]>>8) & 0xFF) == UCOL_BYTE_COMMON) // a common secondary
  1.1726 +        && ((element->CEs[0] & 0xFF) == UCOL_BYTE_COMMON) // and a common tertiary
  1.1727 +        ) {
  1.1728 +            element->mapCE = UCOL_SPECIAL_FLAG | (LONG_PRIMARY_TAG<<24) // a long primary special
  1.1729 +                | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary
  1.1730 +                | ((element->CEs[1]>>24) & 0xFF);   // third byte of primary
  1.1731 +        } else {
  1.1732 +            expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT)
  1.1733 +                | (((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
  1.1734 +                   & 0xFFFFF0));
  1.1735 +
  1.1736 +            for(j = 1; j<(int32_t)element->noOfCEs; j++) {
  1.1737 +                uprv_uca_addExpansion(expansions, element->CEs[j], status);
  1.1738 +            }
  1.1739 +            if(element->noOfCEs <= 0xF) {
  1.1740 +                expansion |= element->noOfCEs;
  1.1741 +            } else {
  1.1742 +                uprv_uca_addExpansion(expansions, 0, status);
  1.1743 +            }
  1.1744 +            element->mapCE = expansion;
  1.1745 +            uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1],
  1.1746 +                (uint8_t)element->noOfCEs,
  1.1747 +                t->maxExpansions,
  1.1748 +                status);
  1.1749 +        }
  1.1750 +}
  1.1751 +
  1.1752 +static void
  1.1753 +uprv_uca_addFCD4AccentedContractions(tempUCATable *t,
  1.1754 +                                      UCollationElements* colEl,
  1.1755 +                                      UChar *data,
  1.1756 +                                      int32_t len,
  1.1757 +                                      UCAElements *el,
  1.1758 +                                      UErrorCode *status) {
  1.1759 +    UChar decomp[256], comp[256];
  1.1760 +    int32_t decLen, compLen;
  1.1761 +
  1.1762 +    decLen = unorm_normalize(data, len, UNORM_NFD, 0, decomp, 256, status);
  1.1763 +    compLen = unorm_normalize(data, len, UNORM_NFC, 0, comp, 256, status);
  1.1764 +    decomp[decLen] = comp[compLen] = 0;
  1.1765 +
  1.1766 +    el->cPoints = decomp;
  1.1767 +    el->cSize = decLen;
  1.1768 +    el->noOfCEs = 0;
  1.1769 +    el->prefixSize = 0;
  1.1770 +    el->prefix = el->prefixChars;
  1.1771 +
  1.1772 +    UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, el);
  1.1773 +    el->cPoints = comp;
  1.1774 +    el->cSize = compLen;
  1.1775 +    el->prefix = el->prefixChars;
  1.1776 +    el->prefixSize = 0;
  1.1777 +    if(prefix == NULL) {
  1.1778 +        el->noOfCEs = 0;
  1.1779 +        ucol_setText(colEl, decomp, decLen, status);
  1.1780 +        while((el->CEs[el->noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) {
  1.1781 +            el->noOfCEs++;
  1.1782 +        }
  1.1783 +        uprv_uca_setMapCE(t, el, status);
  1.1784 +        uprv_uca_addAnElement(t, el, status);
  1.1785 +    }
  1.1786 +    el->cPoints=NULL; /* don't leak reference to stack */
  1.1787 +}
  1.1788 +
  1.1789 +static void
  1.1790 +uprv_uca_addMultiCMContractions(tempUCATable *t,
  1.1791 +                                UCollationElements* colEl,
  1.1792 +                                tempTailorContext *c,
  1.1793 +                                UCAElements *el,
  1.1794 +                                UErrorCode *status) {
  1.1795 +    CombinClassTable *cmLookup = t->cmLookup;
  1.1796 +    UChar  newDecomp[256];
  1.1797 +    int32_t maxComp, newDecLen;
  1.1798 +    const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
  1.1799 +    if (U_FAILURE(*status)) {
  1.1800 +        return;
  1.1801 +    }
  1.1802 +    int16_t curClass = nfcImpl->getFCD16(c->tailoringCM) & 0xff;
  1.1803 +    CompData *precomp = c->precomp;
  1.1804 +    int32_t  compLen = c->compLen;
  1.1805 +    UChar *comp = c->comp;
  1.1806 +    maxComp = c->precompLen;
  1.1807 +
  1.1808 +    for (int32_t j=0; j < maxComp; j++) {
  1.1809 +        int32_t count=0;
  1.1810 +        do {
  1.1811 +            if ( count == 0 ) {  // Decompose the saved precomposed char.
  1.1812 +                UChar temp[2];
  1.1813 +                temp[0]=precomp[j].cp;
  1.1814 +                temp[1]=0;
  1.1815 +                newDecLen = unorm_normalize(temp, 1, UNORM_NFD, 0,
  1.1816 +                            newDecomp, sizeof(newDecomp)/sizeof(UChar), status);
  1.1817 +                newDecomp[newDecLen++] = cmLookup->cPoints[c->cmPos];
  1.1818 +            }
  1.1819 +            else {  // swap 2 combining marks when they are equal.
  1.1820 +                uprv_memcpy(newDecomp, c->decomp, sizeof(UChar)*(c->decompLen));
  1.1821 +                newDecLen = c->decompLen;
  1.1822 +                newDecomp[newDecLen++] = precomp[j].cClass;
  1.1823 +            }
  1.1824 +            newDecomp[newDecLen] = 0;
  1.1825 +            compLen = unorm_normalize(newDecomp, newDecLen, UNORM_NFC, 0,
  1.1826 +                              comp, 256, status);
  1.1827 +            if (compLen==1) {
  1.1828 +                comp[compLen++] = newDecomp[newDecLen++] = c->tailoringCM;
  1.1829 +                comp[compLen] = newDecomp[newDecLen] = 0;
  1.1830 +                el->cPoints = newDecomp;
  1.1831 +                el->cSize = newDecLen;
  1.1832 +
  1.1833 +                UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, el);
  1.1834 +                el->cPoints = c->comp;
  1.1835 +                el->cSize = compLen;
  1.1836 +                el->prefix = el->prefixChars;
  1.1837 +                el->prefixSize = 0;
  1.1838 +                if(prefix == NULL) {
  1.1839 +                    el->noOfCEs = 0;
  1.1840 +                    ucol_setText(colEl, newDecomp, newDecLen, status);
  1.1841 +                    while((el->CEs[el->noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) {
  1.1842 +                        el->noOfCEs++;
  1.1843 +                    }
  1.1844 +                    uprv_uca_setMapCE(t, el, status);
  1.1845 +                    uprv_uca_finalizeAddition(t, el, status);
  1.1846 +
  1.1847 +                    // Save the current precomposed char and its class to find any
  1.1848 +                    // other combining mark combinations.
  1.1849 +                    precomp[c->precompLen].cp=comp[0];
  1.1850 +                    precomp[c->precompLen].cClass = curClass;
  1.1851 +                    c->precompLen++;
  1.1852 +                }
  1.1853 +            }
  1.1854 +        } while (++count<2 && (precomp[j].cClass == curClass));
  1.1855 +    }
  1.1856 +
  1.1857 +}
  1.1858 +
  1.1859 +static void
  1.1860 +uprv_uca_addTailCanonicalClosures(tempUCATable *t,
  1.1861 +                                  UCollationElements* colEl,
  1.1862 +                                  UChar baseCh,
  1.1863 +                                  UChar cMark,
  1.1864 +                                  UCAElements *el,
  1.1865 +                                  UErrorCode *status) {
  1.1866 +    CombinClassTable *cmLookup = t->cmLookup;
  1.1867 +    const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
  1.1868 +    if (U_FAILURE(*status)) {
  1.1869 +        return;
  1.1870 +    }
  1.1871 +    int16_t maxIndex = nfcImpl->getFCD16(cMark) & 0xff;
  1.1872 +    UCAElements element;
  1.1873 +    uint16_t *index;
  1.1874 +    UChar  decomp[256];
  1.1875 +    UChar  comp[256];
  1.1876 +    CompData precomp[256];   // precomposed array
  1.1877 +    int32_t  precompLen = 0; // count for precomp
  1.1878 +    int32_t i, len, decompLen, replacedPos;
  1.1879 +    tempTailorContext c;
  1.1880 +
  1.1881 +    if ( cmLookup == NULL ) {
  1.1882 +        return;
  1.1883 +    }
  1.1884 +    index = cmLookup->index;
  1.1885 +    int32_t cClass=nfcImpl->getFCD16(cMark) & 0xff;
  1.1886 +    maxIndex = (int32_t)index[(nfcImpl->getFCD16(cMark) & 0xff)-1];
  1.1887 +    c.comp = comp;
  1.1888 +    c.decomp = decomp;
  1.1889 +    c.precomp = precomp;
  1.1890 +    c.tailoringCM =  cMark;
  1.1891 +
  1.1892 +    if (cClass>0) {
  1.1893 +        maxIndex = (int32_t)index[cClass-1];
  1.1894 +    }
  1.1895 +    else {
  1.1896 +        maxIndex=0;
  1.1897 +    }
  1.1898 +    decomp[0]=baseCh;
  1.1899 +    for ( i=0; i<maxIndex ; i++ ) {
  1.1900 +        decomp[1] = cmLookup->cPoints[i];
  1.1901 +        decomp[2]=0;
  1.1902 +        decompLen=2;
  1.1903 +        len = unorm_normalize(decomp, decompLen, UNORM_NFC, 0, comp, 256, status);
  1.1904 +        if (len==1) {
  1.1905 +            // Save the current precomposed char and its class to find any
  1.1906 +            // other combining mark combinations.
  1.1907 +            precomp[precompLen].cp=comp[0];
  1.1908 +            precomp[precompLen].cClass =
  1.1909 +                       index[nfcImpl->getFCD16(decomp[1]) & 0xff];
  1.1910 +            precompLen++;
  1.1911 +            replacedPos=0;
  1.1912 +            for (decompLen=0; decompLen< (int32_t)el->cSize; decompLen++) {
  1.1913 +                decomp[decompLen] = el->cPoints[decompLen];
  1.1914 +                if (decomp[decompLen]==cMark) {
  1.1915 +                    replacedPos = decompLen;  // record the position for later use
  1.1916 +                }
  1.1917 +            }
  1.1918 +            if ( replacedPos != 0 ) {
  1.1919 +                decomp[replacedPos]=cmLookup->cPoints[i];
  1.1920 +            }
  1.1921 +            decomp[decompLen] = 0;
  1.1922 +            len = unorm_normalize(decomp, decompLen, UNORM_NFC, 0, comp, 256, status);
  1.1923 +            comp[len++] = decomp[decompLen++] = cMark;
  1.1924 +            comp[len] = decomp[decompLen] = 0;
  1.1925 +            element.cPoints = decomp;
  1.1926 +            element.cSize = decompLen;
  1.1927 +            element.noOfCEs = 0;
  1.1928 +            element.prefix = el->prefixChars;
  1.1929 +            element.prefixSize = 0;
  1.1930 +
  1.1931 +            UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, &element);
  1.1932 +            element.cPoints = comp;
  1.1933 +            element.cSize = len;
  1.1934 +            element.prefix = el->prefixChars;
  1.1935 +            element.prefixSize = 0;
  1.1936 +            if(prefix == NULL) {
  1.1937 +                element.noOfCEs = 0;
  1.1938 +                ucol_setText(colEl, decomp, decompLen, status);
  1.1939 +                while((element.CEs[element.noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) {
  1.1940 +                    element.noOfCEs++;
  1.1941 +                }
  1.1942 +                uprv_uca_setMapCE(t, &element, status);
  1.1943 +                uprv_uca_finalizeAddition(t, &element, status);
  1.1944 +            }
  1.1945 +
  1.1946 +            // This is a fix for tailoring contractions with accented
  1.1947 +            // character at the end of contraction string.
  1.1948 +            if ((len>2) && 
  1.1949 +                (nfcImpl->getFCD16(comp[len-2]) & 0xff00)==0) {
  1.1950 +                uprv_uca_addFCD4AccentedContractions(t, colEl, comp, len, &element, status);
  1.1951 +            }
  1.1952 +
  1.1953 +            if (precompLen >1) {
  1.1954 +                c.compLen = len;
  1.1955 +                c.decompLen = decompLen;
  1.1956 +                c.precompLen = precompLen;
  1.1957 +                c.cmPos = i;
  1.1958 +                uprv_uca_addMultiCMContractions(t, colEl, &c, &element, status);
  1.1959 +                precompLen = c.precompLen;
  1.1960 +            }
  1.1961 +        }
  1.1962 +    }
  1.1963 +}
  1.1964 +
  1.1965 +U_CFUNC int32_t U_EXPORT2
  1.1966 +uprv_uca_canonicalClosure(tempUCATable *t,
  1.1967 +                          UColTokenParser *src,
  1.1968 +                          UnicodeSet *closed,
  1.1969 +                          UErrorCode *status)
  1.1970 +{
  1.1971 +    enumStruct context;
  1.1972 +    context.closed = closed;
  1.1973 +    context.noOfClosures = 0;
  1.1974 +    UCAElements el;
  1.1975 +    UColToken *tok;
  1.1976 +    uint32_t i = 0, j = 0;
  1.1977 +    UChar  baseChar, firstCM;
  1.1978 +    context.nfcImpl=Normalizer2Factory::getNFCImpl(*status);
  1.1979 +    if(U_FAILURE(*status)) {
  1.1980 +        return 0;
  1.1981 +    }
  1.1982 +
  1.1983 +    UCollator *tempColl = NULL;
  1.1984 +    tempUCATable *tempTable = uprv_uca_cloneTempTable(t, status);
  1.1985 +    // Check for null pointer
  1.1986 +    if (U_FAILURE(*status)) {
  1.1987 +        return 0;
  1.1988 +    }
  1.1989 +
  1.1990 +    UCATableHeader *tempData = uprv_uca_assembleTable(tempTable, status);
  1.1991 +    tempColl = ucol_initCollator(tempData, 0, t->UCA, status);
  1.1992 +    if ( tempTable->cmLookup != NULL ) {
  1.1993 +        t->cmLookup = tempTable->cmLookup;  // copy over to t
  1.1994 +        tempTable->cmLookup = NULL;
  1.1995 +    }
  1.1996 +    uprv_uca_closeTempTable(tempTable);
  1.1997 +
  1.1998 +    if(U_SUCCESS(*status)) {
  1.1999 +        tempColl->ucaRules = NULL;
  1.2000 +        tempColl->actualLocale = NULL;
  1.2001 +        tempColl->validLocale = NULL;
  1.2002 +        tempColl->requestedLocale = NULL;
  1.2003 +        tempColl->hasRealData = TRUE;
  1.2004 +        tempColl->freeImageOnClose = TRUE;
  1.2005 +    } else if(tempData != 0) {
  1.2006 +        uprv_free(tempData);
  1.2007 +    }
  1.2008 +
  1.2009 +    /* produce canonical closure */
  1.2010 +    UCollationElements* colEl = ucol_openElements(tempColl, NULL, 0, status);
  1.2011 +    // Check for null pointer
  1.2012 +    if (U_FAILURE(*status)) {
  1.2013 +        return 0;
  1.2014 +    }
  1.2015 +    context.t = t;
  1.2016 +    context.tempColl = tempColl;
  1.2017 +    context.colEl = colEl;
  1.2018 +    context.status = status;
  1.2019 +    u_enumCharTypes(_enumCategoryRangeClosureCategory, &context);
  1.2020 +
  1.2021 +    if ( (src==NULL) || !src->buildCCTabFlag ) {
  1.2022 +        ucol_closeElements(colEl);
  1.2023 +        ucol_close(tempColl);
  1.2024 +        return context.noOfClosures;  // no extra contraction needed to add
  1.2025 +    }
  1.2026 +
  1.2027 +    for (i=0; i < src->resultLen; i++) {
  1.2028 +        baseChar = firstCM= (UChar)0;
  1.2029 +        tok = src->lh[i].first;
  1.2030 +        while (tok != NULL && U_SUCCESS(*status)) {
  1.2031 +            el.prefix = el.prefixChars;
  1.2032 +            el.cPoints = el.uchars;
  1.2033 +            if(tok->prefix != 0) {
  1.2034 +                el.prefixSize = tok->prefix>>24;
  1.2035 +                uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
  1.2036 +
  1.2037 +                el.cSize = (tok->source >> 24)-(tok->prefix>>24);
  1.2038 +                uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
  1.2039 +            } else {
  1.2040 +                el.prefixSize = 0;
  1.2041 +                *el.prefix = 0;
  1.2042 +
  1.2043 +                el.cSize = (tok->source >> 24);
  1.2044 +                uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
  1.2045 +            }
  1.2046 +            if(src->UCA != NULL) {
  1.2047 +                for(j = 0; j<el.cSize; j++) {
  1.2048 +                    int16_t fcd = context.nfcImpl->getFCD16(el.cPoints[j]);
  1.2049 +                    if ( (fcd & 0xff) == 0 ) {
  1.2050 +                        baseChar = el.cPoints[j];  // last base character
  1.2051 +                        firstCM=0;  // reset combining mark value
  1.2052 +                    }
  1.2053 +                    else {
  1.2054 +                        if ( (baseChar!=0) && (firstCM==0) ) {
  1.2055 +                            firstCM = el.cPoints[j];  // first combining mark
  1.2056 +                        }
  1.2057 +                    }
  1.2058 +                }
  1.2059 +            }
  1.2060 +            if ( (baseChar!= (UChar)0) && (firstCM != (UChar)0) ) {
  1.2061 +                // find all the canonical rules
  1.2062 +                uprv_uca_addTailCanonicalClosures(t, colEl, baseChar, firstCM, &el, status);
  1.2063 +            }
  1.2064 +            tok = tok->next;
  1.2065 +        }
  1.2066 +    }
  1.2067 +    ucol_closeElements(colEl);
  1.2068 +    ucol_close(tempColl);
  1.2069 +
  1.2070 +    return context.noOfClosures;
  1.2071 +}
  1.2072 +
  1.2073 +#endif /* #if !UCONFIG_NO_COLLATION */

mercurial