1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ucasemap.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,537 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2005-2011, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: ucasemap.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2005may06 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* Case mapping service object and functions using it. 1.20 +*/ 1.21 + 1.22 +#include "unicode/utypes.h" 1.23 +#include "unicode/brkiter.h" 1.24 +#include "unicode/ubrk.h" 1.25 +#include "unicode/uloc.h" 1.26 +#include "unicode/ustring.h" 1.27 +#include "unicode/ucasemap.h" 1.28 +#if !UCONFIG_NO_BREAK_ITERATION 1.29 +#include "unicode/utext.h" 1.30 +#endif 1.31 +#include "unicode/utf.h" 1.32 +#include "unicode/utf8.h" 1.33 +#include "unicode/utf16.h" 1.34 +#include "cmemory.h" 1.35 +#include "cstring.h" 1.36 +#include "ucase.h" 1.37 +#include "ustr_imp.h" 1.38 + 1.39 +U_NAMESPACE_USE 1.40 + 1.41 +/* UCaseMap service object -------------------------------------------------- */ 1.42 + 1.43 +U_CAPI UCaseMap * U_EXPORT2 1.44 +ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) { 1.45 + UCaseMap *csm; 1.46 + 1.47 + if(U_FAILURE(*pErrorCode)) { 1.48 + return NULL; 1.49 + } 1.50 + 1.51 + csm=(UCaseMap *)uprv_malloc(sizeof(UCaseMap)); 1.52 + if(csm==NULL) { 1.53 + return NULL; 1.54 + } 1.55 + uprv_memset(csm, 0, sizeof(UCaseMap)); 1.56 + 1.57 + csm->csp=ucase_getSingleton(); 1.58 + ucasemap_setLocale(csm, locale, pErrorCode); 1.59 + if(U_FAILURE(*pErrorCode)) { 1.60 + uprv_free(csm); 1.61 + return NULL; 1.62 + } 1.63 + 1.64 + csm->options=options; 1.65 + return csm; 1.66 +} 1.67 + 1.68 +U_CAPI void U_EXPORT2 1.69 +ucasemap_close(UCaseMap *csm) { 1.70 + if(csm!=NULL) { 1.71 +#if !UCONFIG_NO_BREAK_ITERATION 1.72 + // Do not call ubrk_close() so that we do not depend on all of the BreakIterator code. 1.73 + delete reinterpret_cast<BreakIterator *>(csm->iter); 1.74 +#endif 1.75 + uprv_free(csm); 1.76 + } 1.77 +} 1.78 + 1.79 +U_CAPI const char * U_EXPORT2 1.80 +ucasemap_getLocale(const UCaseMap *csm) { 1.81 + return csm->locale; 1.82 +} 1.83 + 1.84 +U_CAPI uint32_t U_EXPORT2 1.85 +ucasemap_getOptions(const UCaseMap *csm) { 1.86 + return csm->options; 1.87 +} 1.88 + 1.89 +U_CAPI void U_EXPORT2 1.90 +ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { 1.91 + int32_t length; 1.92 + 1.93 + if(U_FAILURE(*pErrorCode)) { 1.94 + return; 1.95 + } 1.96 + 1.97 + length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode); 1.98 + if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) { 1.99 + *pErrorCode=U_ZERO_ERROR; 1.100 + /* we only really need the language code for case mappings */ 1.101 + length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode); 1.102 + } 1.103 + if(length==sizeof(csm->locale)) { 1.104 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.105 + } 1.106 + csm->locCache=0; 1.107 + if(U_SUCCESS(*pErrorCode)) { 1.108 + ucase_getCaseLocale(csm->locale, &csm->locCache); 1.109 + } else { 1.110 + csm->locale[0]=0; 1.111 + } 1.112 +} 1.113 + 1.114 +U_CAPI void U_EXPORT2 1.115 +ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode * /*pErrorCode*/) { 1.116 + csm->options=options; 1.117 +} 1.118 + 1.119 +/* UTF-8 string case mappings ----------------------------------------------- */ 1.120 + 1.121 +/* TODO(markus): Move to a new, separate utf8case.c file. */ 1.122 + 1.123 +/* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */ 1.124 +static inline int32_t 1.125 +appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity, 1.126 + int32_t result, const UChar *s) { 1.127 + UChar32 c; 1.128 + int32_t length, destLength; 1.129 + UErrorCode errorCode; 1.130 + 1.131 + /* decode the result */ 1.132 + if(result<0) { 1.133 + /* (not) original code point */ 1.134 + c=~result; 1.135 + length=-1; 1.136 + } else if(result<=UCASE_MAX_STRING_LENGTH) { 1.137 + c=U_SENTINEL; 1.138 + length=result; 1.139 + } else { 1.140 + c=result; 1.141 + length=-1; 1.142 + } 1.143 + 1.144 + if(destIndex<destCapacity) { 1.145 + /* append the result */ 1.146 + if(length<0) { 1.147 + /* code point */ 1.148 + UBool isError=FALSE; 1.149 + U8_APPEND(dest, destIndex, destCapacity, c, isError); 1.150 + if(isError) { 1.151 + /* overflow, nothing written */ 1.152 + destIndex+=U8_LENGTH(c); 1.153 + } 1.154 + } else { 1.155 + /* string */ 1.156 + errorCode=U_ZERO_ERROR; 1.157 + u_strToUTF8( 1.158 + (char *)(dest+destIndex), destCapacity-destIndex, &destLength, 1.159 + s, length, 1.160 + &errorCode); 1.161 + destIndex+=destLength; 1.162 + /* we might have an overflow, but we know the actual length */ 1.163 + } 1.164 + } else { 1.165 + /* preflight */ 1.166 + if(length<0) { 1.167 + destIndex+=U8_LENGTH(c); 1.168 + } else { 1.169 + errorCode=U_ZERO_ERROR; 1.170 + u_strToUTF8( 1.171 + NULL, 0, &destLength, 1.172 + s, length, 1.173 + &errorCode); 1.174 + destIndex+=destLength; 1.175 + } 1.176 + } 1.177 + return destIndex; 1.178 +} 1.179 + 1.180 +static UChar32 U_CALLCONV 1.181 +utf8_caseContextIterator(void *context, int8_t dir) { 1.182 + UCaseContext *csc=(UCaseContext *)context; 1.183 + UChar32 c; 1.184 + 1.185 + if(dir<0) { 1.186 + /* reset for backward iteration */ 1.187 + csc->index=csc->cpStart; 1.188 + csc->dir=dir; 1.189 + } else if(dir>0) { 1.190 + /* reset for forward iteration */ 1.191 + csc->index=csc->cpLimit; 1.192 + csc->dir=dir; 1.193 + } else { 1.194 + /* continue current iteration direction */ 1.195 + dir=csc->dir; 1.196 + } 1.197 + 1.198 + if(dir<0) { 1.199 + if(csc->start<csc->index) { 1.200 + U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c); 1.201 + return c; 1.202 + } 1.203 + } else { 1.204 + if(csc->index<csc->limit) { 1.205 + U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c); 1.206 + return c; 1.207 + } 1.208 + } 1.209 + return U_SENTINEL; 1.210 +} 1.211 + 1.212 +/* 1.213 + * Case-maps [srcStart..srcLimit[ but takes 1.214 + * context [0..srcLength[ into account. 1.215 + */ 1.216 +static int32_t 1.217 +_caseMap(const UCaseMap *csm, UCaseMapFull *map, 1.218 + uint8_t *dest, int32_t destCapacity, 1.219 + const uint8_t *src, UCaseContext *csc, 1.220 + int32_t srcStart, int32_t srcLimit, 1.221 + UErrorCode *pErrorCode) { 1.222 + const UChar *s; 1.223 + UChar32 c, c2 = 0; 1.224 + int32_t srcIndex, destIndex; 1.225 + int32_t locCache; 1.226 + 1.227 + locCache=csm->locCache; 1.228 + 1.229 + /* case mapping loop */ 1.230 + srcIndex=srcStart; 1.231 + destIndex=0; 1.232 + while(srcIndex<srcLimit) { 1.233 + csc->cpStart=srcIndex; 1.234 + U8_NEXT(src, srcIndex, srcLimit, c); 1.235 + csc->cpLimit=srcIndex; 1.236 + if(c<0) { 1.237 + int32_t i=csc->cpStart; 1.238 + while(destIndex<destCapacity && i<srcIndex) { 1.239 + dest[destIndex++]=src[i++]; 1.240 + } 1.241 + continue; 1.242 + } 1.243 + c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache); 1.244 + if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) { 1.245 + /* fast path version of appendResult() for ASCII results */ 1.246 + dest[destIndex++]=(uint8_t)c2; 1.247 + } else { 1.248 + destIndex=appendResult(dest, destIndex, destCapacity, c, s); 1.249 + } 1.250 + } 1.251 + 1.252 + if(destIndex>destCapacity) { 1.253 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.254 + } 1.255 + return destIndex; 1.256 +} 1.257 + 1.258 +#if !UCONFIG_NO_BREAK_ITERATION 1.259 + 1.260 +U_CFUNC int32_t U_CALLCONV 1.261 +ucasemap_internalUTF8ToTitle(const UCaseMap *csm, 1.262 + uint8_t *dest, int32_t destCapacity, 1.263 + const uint8_t *src, int32_t srcLength, 1.264 + UErrorCode *pErrorCode) { 1.265 + const UChar *s; 1.266 + UChar32 c; 1.267 + int32_t prev, titleStart, titleLimit, idx, destIndex, length; 1.268 + UBool isFirstIndex; 1.269 + 1.270 + if(U_FAILURE(*pErrorCode)) { 1.271 + return 0; 1.272 + } 1.273 + 1.274 + // Use the C++ abstract base class to minimize dependencies. 1.275 + // TODO: Change UCaseMap.iter to store a BreakIterator directly. 1.276 + BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter); 1.277 + 1.278 + /* set up local variables */ 1.279 + int32_t locCache=csm->locCache; 1.280 + UCaseContext csc=UCASECONTEXT_INITIALIZER; 1.281 + csc.p=(void *)src; 1.282 + csc.limit=srcLength; 1.283 + destIndex=0; 1.284 + prev=0; 1.285 + isFirstIndex=TRUE; 1.286 + 1.287 + /* titlecasing loop */ 1.288 + while(prev<srcLength) { 1.289 + /* find next index where to titlecase */ 1.290 + if(isFirstIndex) { 1.291 + isFirstIndex=FALSE; 1.292 + idx=bi->first(); 1.293 + } else { 1.294 + idx=bi->next(); 1.295 + } 1.296 + if(idx==UBRK_DONE || idx>srcLength) { 1.297 + idx=srcLength; 1.298 + } 1.299 + 1.300 + /* 1.301 + * Unicode 4 & 5 section 3.13 Default Case Operations: 1.302 + * 1.303 + * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex 1.304 + * #29, "Text Boundaries." Between each pair of word boundaries, find the first 1.305 + * cased character F. If F exists, map F to default_title(F); then map each 1.306 + * subsequent character C to default_lower(C). 1.307 + * 1.308 + * In this implementation, segment [prev..index[ into 3 parts: 1.309 + * a) uncased characters (copy as-is) [prev..titleStart[ 1.310 + * b) first case letter (titlecase) [titleStart..titleLimit[ 1.311 + * c) subsequent characters (lowercase) [titleLimit..index[ 1.312 + */ 1.313 + if(prev<idx) { 1.314 + /* find and copy uncased characters [prev..titleStart[ */ 1.315 + titleStart=titleLimit=prev; 1.316 + U8_NEXT(src, titleLimit, idx, c); 1.317 + if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) { 1.318 + /* Adjust the titlecasing index (titleStart) to the next cased character. */ 1.319 + for(;;) { 1.320 + titleStart=titleLimit; 1.321 + if(titleLimit==idx) { 1.322 + /* 1.323 + * only uncased characters in [prev..index[ 1.324 + * stop with titleStart==titleLimit==index 1.325 + */ 1.326 + break; 1.327 + } 1.328 + U8_NEXT(src, titleLimit, idx, c); 1.329 + if(UCASE_NONE!=ucase_getType(csm->csp, c)) { 1.330 + break; /* cased letter at [titleStart..titleLimit[ */ 1.331 + } 1.332 + } 1.333 + length=titleStart-prev; 1.334 + if(length>0) { 1.335 + if((destIndex+length)<=destCapacity) { 1.336 + uprv_memcpy(dest+destIndex, src+prev, length); 1.337 + } 1.338 + destIndex+=length; 1.339 + } 1.340 + } 1.341 + 1.342 + if(titleStart<titleLimit) { 1.343 + /* titlecase c which is from [titleStart..titleLimit[ */ 1.344 + csc.cpStart=titleStart; 1.345 + csc.cpLimit=titleLimit; 1.346 + c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, &csc, &s, csm->locale, &locCache); 1.347 + destIndex=appendResult(dest, destIndex, destCapacity, c, s); 1.348 + 1.349 + /* Special case Dutch IJ titlecasing */ 1.350 + if ( titleStart+1 < idx && 1.351 + ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_DUTCH && 1.352 + ( src[titleStart] == 0x0049 || src[titleStart] == 0x0069 ) && 1.353 + ( src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A )) { 1.354 + c=0x004A; 1.355 + destIndex=appendResult(dest, destIndex, destCapacity, c, s); 1.356 + titleLimit++; 1.357 + } 1.358 + /* lowercase [titleLimit..index[ */ 1.359 + if(titleLimit<idx) { 1.360 + if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) { 1.361 + /* Normal operation: Lowercase the rest of the word. */ 1.362 + destIndex+= 1.363 + _caseMap( 1.364 + csm, ucase_toFullLower, 1.365 + dest+destIndex, destCapacity-destIndex, 1.366 + src, &csc, 1.367 + titleLimit, idx, 1.368 + pErrorCode); 1.369 + } else { 1.370 + /* Optionally just copy the rest of the word unchanged. */ 1.371 + length=idx-titleLimit; 1.372 + if((destIndex+length)<=destCapacity) { 1.373 + uprv_memcpy(dest+destIndex, src+titleLimit, length); 1.374 + } 1.375 + destIndex+=length; 1.376 + } 1.377 + } 1.378 + } 1.379 + } 1.380 + 1.381 + prev=idx; 1.382 + } 1.383 + 1.384 + if(destIndex>destCapacity) { 1.385 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.386 + } 1.387 + return destIndex; 1.388 +} 1.389 + 1.390 +#endif 1.391 + 1.392 +static int32_t U_CALLCONV 1.393 +ucasemap_internalUTF8ToLower(const UCaseMap *csm, 1.394 + uint8_t *dest, int32_t destCapacity, 1.395 + const uint8_t *src, int32_t srcLength, 1.396 + UErrorCode *pErrorCode) { 1.397 + UCaseContext csc=UCASECONTEXT_INITIALIZER; 1.398 + csc.p=(void *)src; 1.399 + csc.limit=srcLength; 1.400 + return _caseMap( 1.401 + csm, ucase_toFullLower, 1.402 + dest, destCapacity, 1.403 + src, &csc, 0, srcLength, 1.404 + pErrorCode); 1.405 +} 1.406 + 1.407 +static int32_t U_CALLCONV 1.408 +ucasemap_internalUTF8ToUpper(const UCaseMap *csm, 1.409 + uint8_t *dest, int32_t destCapacity, 1.410 + const uint8_t *src, int32_t srcLength, 1.411 + UErrorCode *pErrorCode) { 1.412 + UCaseContext csc=UCASECONTEXT_INITIALIZER; 1.413 + csc.p=(void *)src; 1.414 + csc.limit=srcLength; 1.415 + return _caseMap( 1.416 + csm, ucase_toFullUpper, 1.417 + dest, destCapacity, 1.418 + src, &csc, 0, srcLength, 1.419 + pErrorCode); 1.420 +} 1.421 + 1.422 +static int32_t 1.423 +utf8_foldCase(const UCaseProps *csp, 1.424 + uint8_t *dest, int32_t destCapacity, 1.425 + const uint8_t *src, int32_t srcLength, 1.426 + uint32_t options, 1.427 + UErrorCode *pErrorCode) { 1.428 + int32_t srcIndex, destIndex; 1.429 + 1.430 + const UChar *s; 1.431 + UChar32 c, c2; 1.432 + int32_t start; 1.433 + 1.434 + /* case mapping loop */ 1.435 + srcIndex=destIndex=0; 1.436 + while(srcIndex<srcLength) { 1.437 + start=srcIndex; 1.438 + U8_NEXT(src, srcIndex, srcLength, c); 1.439 + if(c<0) { 1.440 + while(destIndex<destCapacity && start<srcIndex) { 1.441 + dest[destIndex++]=src[start++]; 1.442 + } 1.443 + continue; 1.444 + } 1.445 + c=ucase_toFullFolding(csp, c, &s, options); 1.446 + if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) { 1.447 + /* fast path version of appendResult() for ASCII results */ 1.448 + dest[destIndex++]=(uint8_t)c2; 1.449 + } else { 1.450 + destIndex=appendResult(dest, destIndex, destCapacity, c, s); 1.451 + } 1.452 + } 1.453 + 1.454 + if(destIndex>destCapacity) { 1.455 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.456 + } 1.457 + return destIndex; 1.458 +} 1.459 + 1.460 +static int32_t U_CALLCONV 1.461 +ucasemap_internalUTF8Fold(const UCaseMap *csm, 1.462 + uint8_t *dest, int32_t destCapacity, 1.463 + const uint8_t *src, int32_t srcLength, 1.464 + UErrorCode *pErrorCode) { 1.465 + return utf8_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode); 1.466 +} 1.467 + 1.468 +U_CFUNC int32_t 1.469 +ucasemap_mapUTF8(const UCaseMap *csm, 1.470 + uint8_t *dest, int32_t destCapacity, 1.471 + const uint8_t *src, int32_t srcLength, 1.472 + UTF8CaseMapper *stringCaseMapper, 1.473 + UErrorCode *pErrorCode) { 1.474 + int32_t destLength; 1.475 + 1.476 + /* check argument values */ 1.477 + if(U_FAILURE(*pErrorCode)) { 1.478 + return 0; 1.479 + } 1.480 + if( destCapacity<0 || 1.481 + (dest==NULL && destCapacity>0) || 1.482 + src==NULL || 1.483 + srcLength<-1 1.484 + ) { 1.485 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.486 + return 0; 1.487 + } 1.488 + 1.489 + /* get the string length */ 1.490 + if(srcLength==-1) { 1.491 + srcLength=(int32_t)uprv_strlen((const char *)src); 1.492 + } 1.493 + 1.494 + /* check for overlapping source and destination */ 1.495 + if( dest!=NULL && 1.496 + ((src>=dest && src<(dest+destCapacity)) || 1.497 + (dest>=src && dest<(src+srcLength))) 1.498 + ) { 1.499 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.500 + return 0; 1.501 + } 1.502 + 1.503 + destLength=stringCaseMapper(csm, dest, destCapacity, src, srcLength, pErrorCode); 1.504 + return u_terminateChars((char *)dest, destCapacity, destLength, pErrorCode); 1.505 +} 1.506 + 1.507 +/* public API functions */ 1.508 + 1.509 +U_CAPI int32_t U_EXPORT2 1.510 +ucasemap_utf8ToLower(const UCaseMap *csm, 1.511 + char *dest, int32_t destCapacity, 1.512 + const char *src, int32_t srcLength, 1.513 + UErrorCode *pErrorCode) { 1.514 + return ucasemap_mapUTF8(csm, 1.515 + (uint8_t *)dest, destCapacity, 1.516 + (const uint8_t *)src, srcLength, 1.517 + ucasemap_internalUTF8ToLower, pErrorCode); 1.518 +} 1.519 + 1.520 +U_CAPI int32_t U_EXPORT2 1.521 +ucasemap_utf8ToUpper(const UCaseMap *csm, 1.522 + char *dest, int32_t destCapacity, 1.523 + const char *src, int32_t srcLength, 1.524 + UErrorCode *pErrorCode) { 1.525 + return ucasemap_mapUTF8(csm, 1.526 + (uint8_t *)dest, destCapacity, 1.527 + (const uint8_t *)src, srcLength, 1.528 + ucasemap_internalUTF8ToUpper, pErrorCode); 1.529 +} 1.530 + 1.531 +U_CAPI int32_t U_EXPORT2 1.532 +ucasemap_utf8FoldCase(const UCaseMap *csm, 1.533 + char *dest, int32_t destCapacity, 1.534 + const char *src, int32_t srcLength, 1.535 + UErrorCode *pErrorCode) { 1.536 + return ucasemap_mapUTF8(csm, 1.537 + (uint8_t *)dest, destCapacity, 1.538 + (const uint8_t *)src, srcLength, 1.539 + ucasemap_internalUTF8Fold, pErrorCode); 1.540 +}