intl/icu/source/common/ucasemap.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2005-2011, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  ucasemap.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2005may06
    14 *   created by: Markus W. Scherer
    15 *
    16 *   Case mapping service object and functions using it.
    17 */
    19 #include "unicode/utypes.h"
    20 #include "unicode/brkiter.h"
    21 #include "unicode/ubrk.h"
    22 #include "unicode/uloc.h"
    23 #include "unicode/ustring.h"
    24 #include "unicode/ucasemap.h"
    25 #if !UCONFIG_NO_BREAK_ITERATION
    26 #include "unicode/utext.h"
    27 #endif
    28 #include "unicode/utf.h"
    29 #include "unicode/utf8.h"
    30 #include "unicode/utf16.h"
    31 #include "cmemory.h"
    32 #include "cstring.h"
    33 #include "ucase.h"
    34 #include "ustr_imp.h"
    36 U_NAMESPACE_USE
    38 /* UCaseMap service object -------------------------------------------------- */
    40 U_CAPI UCaseMap * U_EXPORT2
    41 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
    42     UCaseMap *csm;
    44     if(U_FAILURE(*pErrorCode)) {
    45         return NULL;
    46     }
    48     csm=(UCaseMap *)uprv_malloc(sizeof(UCaseMap));
    49     if(csm==NULL) {
    50         return NULL;
    51     }
    52     uprv_memset(csm, 0, sizeof(UCaseMap));
    54     csm->csp=ucase_getSingleton();
    55     ucasemap_setLocale(csm, locale, pErrorCode);
    56     if(U_FAILURE(*pErrorCode)) {
    57         uprv_free(csm);
    58         return NULL;
    59     }
    61     csm->options=options;
    62     return csm;
    63 }
    65 U_CAPI void U_EXPORT2
    66 ucasemap_close(UCaseMap *csm) {
    67     if(csm!=NULL) {
    68 #if !UCONFIG_NO_BREAK_ITERATION
    69         // Do not call ubrk_close() so that we do not depend on all of the BreakIterator code.
    70         delete reinterpret_cast<BreakIterator *>(csm->iter);
    71 #endif
    72         uprv_free(csm);
    73     }
    74 }
    76 U_CAPI const char * U_EXPORT2
    77 ucasemap_getLocale(const UCaseMap *csm) {
    78     return csm->locale;
    79 }
    81 U_CAPI uint32_t U_EXPORT2
    82 ucasemap_getOptions(const UCaseMap *csm) {
    83     return csm->options;
    84 }
    86 U_CAPI void U_EXPORT2
    87 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
    88     int32_t length;
    90     if(U_FAILURE(*pErrorCode)) {
    91         return;
    92     }
    94     length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
    95     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
    96         *pErrorCode=U_ZERO_ERROR;
    97         /* we only really need the language code for case mappings */
    98         length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
    99     }
   100     if(length==sizeof(csm->locale)) {
   101         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   102     }
   103     csm->locCache=0;
   104     if(U_SUCCESS(*pErrorCode)) {
   105         ucase_getCaseLocale(csm->locale, &csm->locCache);
   106     } else {
   107         csm->locale[0]=0;
   108     }
   109 }
   111 U_CAPI void U_EXPORT2
   112 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode * /*pErrorCode*/) {
   113     csm->options=options;
   114 }
   116 /* UTF-8 string case mappings ----------------------------------------------- */
   118 /* TODO(markus): Move to a new, separate utf8case.c file. */
   120 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
   121 static inline int32_t
   122 appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
   123              int32_t result, const UChar *s) {
   124     UChar32 c;
   125     int32_t length, destLength;
   126     UErrorCode errorCode;
   128     /* decode the result */
   129     if(result<0) {
   130         /* (not) original code point */
   131         c=~result;
   132         length=-1;
   133     } else if(result<=UCASE_MAX_STRING_LENGTH) {
   134         c=U_SENTINEL;
   135         length=result;
   136     } else {
   137         c=result;
   138         length=-1;
   139     }
   141     if(destIndex<destCapacity) {
   142         /* append the result */
   143         if(length<0) {
   144             /* code point */
   145             UBool isError=FALSE;
   146             U8_APPEND(dest, destIndex, destCapacity, c, isError);
   147             if(isError) {
   148                 /* overflow, nothing written */
   149                 destIndex+=U8_LENGTH(c);
   150             }
   151         } else {
   152             /* string */
   153             errorCode=U_ZERO_ERROR;
   154             u_strToUTF8(
   155                 (char *)(dest+destIndex), destCapacity-destIndex, &destLength,
   156                 s, length,
   157                 &errorCode);
   158             destIndex+=destLength;
   159             /* we might have an overflow, but we know the actual length */
   160         }
   161     } else {
   162         /* preflight */
   163         if(length<0) {
   164             destIndex+=U8_LENGTH(c);
   165         } else {
   166             errorCode=U_ZERO_ERROR;
   167             u_strToUTF8(
   168                 NULL, 0, &destLength,
   169                 s, length,
   170                 &errorCode);
   171             destIndex+=destLength;
   172         }
   173     }
   174     return destIndex;
   175 }
   177 static UChar32 U_CALLCONV
   178 utf8_caseContextIterator(void *context, int8_t dir) {
   179     UCaseContext *csc=(UCaseContext *)context;
   180     UChar32 c;
   182     if(dir<0) {
   183         /* reset for backward iteration */
   184         csc->index=csc->cpStart;
   185         csc->dir=dir;
   186     } else if(dir>0) {
   187         /* reset for forward iteration */
   188         csc->index=csc->cpLimit;
   189         csc->dir=dir;
   190     } else {
   191         /* continue current iteration direction */
   192         dir=csc->dir;
   193     }
   195     if(dir<0) {
   196         if(csc->start<csc->index) {
   197             U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
   198             return c;
   199         }
   200     } else {
   201         if(csc->index<csc->limit) {
   202             U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
   203             return c;
   204         }
   205     }
   206     return U_SENTINEL;
   207 }
   209 /*
   210  * Case-maps [srcStart..srcLimit[ but takes
   211  * context [0..srcLength[ into account.
   212  */
   213 static int32_t
   214 _caseMap(const UCaseMap *csm, UCaseMapFull *map,
   215          uint8_t *dest, int32_t destCapacity,
   216          const uint8_t *src, UCaseContext *csc,
   217          int32_t srcStart, int32_t srcLimit,
   218          UErrorCode *pErrorCode) {
   219     const UChar *s;
   220     UChar32 c, c2 = 0;
   221     int32_t srcIndex, destIndex;
   222     int32_t locCache;
   224     locCache=csm->locCache;
   226     /* case mapping loop */
   227     srcIndex=srcStart;
   228     destIndex=0;
   229     while(srcIndex<srcLimit) {
   230         csc->cpStart=srcIndex;
   231         U8_NEXT(src, srcIndex, srcLimit, c);
   232         csc->cpLimit=srcIndex;
   233         if(c<0) {
   234             int32_t i=csc->cpStart;
   235             while(destIndex<destCapacity && i<srcIndex) {
   236                 dest[destIndex++]=src[i++];
   237             }
   238             continue;
   239         }
   240         c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache);
   241         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
   242             /* fast path version of appendResult() for ASCII results */
   243             dest[destIndex++]=(uint8_t)c2;
   244         } else {
   245             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
   246         }
   247     }
   249     if(destIndex>destCapacity) {
   250         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   251     }
   252     return destIndex;
   253 }
   255 #if !UCONFIG_NO_BREAK_ITERATION
   257 U_CFUNC int32_t U_CALLCONV
   258 ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
   259          uint8_t *dest, int32_t destCapacity,
   260          const uint8_t *src, int32_t srcLength,
   261          UErrorCode *pErrorCode) {
       /*
        * Titlecases the UTF-8 text src[0..srcLength[ into dest.
        *
        * Segment boundaries come from the iterator stored in csm->iter.
        * Within each segment the first cased character is titlecased and the
        * rest is lowercased, or copied unchanged with U_TITLECASE_NO_LOWERCASE.
        *
        * Returns 0 if *pErrorCode already indicates a failure. Otherwise
        * returns the number of bytes the complete result needs and sets
        * U_BUFFER_OVERFLOW_ERROR if that exceeds destCapacity.
        *
        * NOTE(review): csm->iter is dereferenced without a NULL check;
        * presumably the caller has installed an iterator over src - confirm.
        */
   262     const UChar *s;
   263     UChar32 c;
   264     int32_t prev, titleStart, titleLimit, idx, destIndex, length;
   265     UBool isFirstIndex;
   267     if(U_FAILURE(*pErrorCode)) {
   268         return 0;
   269     }
   271     // Use the C++ abstract base class to minimize dependencies.
   272     // TODO: Change UCaseMap.iter to store a BreakIterator directly.
   273     BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter);
   275     /* set up local variables */
           /* local copy of the locale cache: csm is const and is not updated */
   276     int32_t locCache=csm->locCache;
   277     UCaseContext csc=UCASECONTEXT_INITIALIZER;
   278     csc.p=(void *)src;
   279     csc.limit=srcLength;
   280     destIndex=0;
   281     prev=0;
   282     isFirstIndex=TRUE;
   284     /* titlecasing loop */
   285     while(prev<srcLength) {
   286         /* find next index where to titlecase */
   287         if(isFirstIndex) {
   288             isFirstIndex=FALSE;
   289             idx=bi->first();
   290         } else {
   291             idx=bi->next();
   292         }
               /* clamp to the end of the text when the iterator is done or runs past it */
   293         if(idx==UBRK_DONE || idx>srcLength) {
   294             idx=srcLength;
   295         }
   297         /*
   298          * Unicode 4 & 5 section 3.13 Default Case Operations:
   299          *
   300          * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
   301          * #29, "Text Boundaries." Between each pair of word boundaries, find the first
   302          * cased character F. If F exists, map F to default_title(F); then map each
   303          * subsequent character C to default_lower(C).
   304          *
   305          * In this implementation, segment [prev..index[ into 3 parts:
   306          * a) uncased characters (copy as-is) [prev..titleStart[
   307          * b) first case letter (titlecase)         [titleStart..titleLimit[
   308          * c) subsequent characters (lowercase)                 [titleLimit..index[
   309          */
   310         if(prev<idx) {
   311             /* find and copy uncased characters [prev..titleStart[ */
   312             titleStart=titleLimit=prev;
   313             U8_NEXT(src, titleLimit, idx, c);
                   /* with U_TITLECASE_NO_BREAK_ADJUSTMENT the first code point is titlecased even if uncased */
   314             if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
   315                 /* Adjust the titlecasing index (titleStart) to the next cased character. */
   316                 for(;;) {
   317                     titleStart=titleLimit;
   318                     if(titleLimit==idx) {
   319                         /*
   320                          * only uncased characters in [prev..index[
   321                          * stop with titleStart==titleLimit==index
   322                          */
   323                         break;
   324                     }
   325                     U8_NEXT(src, titleLimit, idx, c);
   326                     if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
   327                         break; /* cased letter at [titleStart..titleLimit[ */
   328                     }
   329                 }
   330                 length=titleStart-prev;
   331                 if(length>0) {
                           /* copy only if the whole run fits; destIndex advances either way so the needed length is still counted */
   332                     if((destIndex+length)<=destCapacity) {
   333                         uprv_memcpy(dest+destIndex, src+prev, length);
   334                     }
   335                     destIndex+=length;
   336                 }
   337             }
   339             if(titleStart<titleLimit) {
   340                 /* titlecase c which is from [titleStart..titleLimit[ */
   341                 csc.cpStart=titleStart;
   342                 csc.cpLimit=titleLimit;
   343                 c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, &csc, &s, csm->locale, &locCache);
   344                 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
   346                 /* Special case Dutch IJ titlecasing */
                       /* if the segment starts with I/i followed by J/j, also emit 'J' (U+004A) and step over the source j */
   347                 if ( titleStart+1 < idx && 
   348                      ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_DUTCH &&
   349                      ( src[titleStart] == 0x0049 || src[titleStart] == 0x0069 ) &&
   350                      ( src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A )) { 
   351                             c=0x004A;
   352                             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
   353                             titleLimit++;
   354                 }
   355                 /* lowercase [titleLimit..index[ */
   356                 if(titleLimit<idx) {
   357                     if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
   358                         /* Normal operation: Lowercase the rest of the word. */
                               /* _caseMap() returns a length relative to dest+destIndex, hence the += */
   359                         destIndex+=
   360                             _caseMap(
   361                                 csm, ucase_toFullLower,
   362                                 dest+destIndex, destCapacity-destIndex,
   363                                 src, &csc,
   364                                 titleLimit, idx,
   365                                 pErrorCode);
   366                     } else {
   367                         /* Optionally just copy the rest of the word unchanged. */
   368                         length=idx-titleLimit;
   369                         if((destIndex+length)<=destCapacity) {
   370                             uprv_memcpy(dest+destIndex, src+titleLimit, length);
   371                         }
   372                         destIndex+=length;
   373                     }
   374                 }
   375             }
   376         }
   378         prev=idx;
   379     }
   381     if(destIndex>destCapacity) {
   382         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   383     }
   384     return destIndex;
   385 }
   387 #endif
   389 static int32_t U_CALLCONV
   390 ucasemap_internalUTF8ToLower(const UCaseMap *csm,
   391                              uint8_t *dest, int32_t destCapacity,
   392                              const uint8_t *src, int32_t srcLength,
   393                              UErrorCode *pErrorCode) {
   394     UCaseContext csc=UCASECONTEXT_INITIALIZER;
   395     csc.p=(void *)src;
   396     csc.limit=srcLength;
   397     return _caseMap(
   398         csm, ucase_toFullLower,
   399         dest, destCapacity,
   400         src, &csc, 0, srcLength,
   401         pErrorCode);
   402 }
   404 static int32_t U_CALLCONV
   405 ucasemap_internalUTF8ToUpper(const UCaseMap *csm,
   406                              uint8_t *dest, int32_t destCapacity,
   407                              const uint8_t *src, int32_t srcLength,
   408                              UErrorCode *pErrorCode) {
   409     UCaseContext csc=UCASECONTEXT_INITIALIZER;
   410     csc.p=(void *)src;
   411     csc.limit=srcLength;
   412     return _caseMap(
   413         csm, ucase_toFullUpper,
   414         dest, destCapacity,
   415         src, &csc, 0, srcLength,
   416         pErrorCode);
   417 }
   419 static int32_t
   420 utf8_foldCase(const UCaseProps *csp,
   421               uint8_t *dest, int32_t destCapacity,
   422               const uint8_t *src, int32_t srcLength,
   423               uint32_t options,
   424               UErrorCode *pErrorCode) {
   425     int32_t srcIndex, destIndex;
   427     const UChar *s;
   428     UChar32 c, c2;
   429     int32_t start;
   431     /* case mapping loop */
   432     srcIndex=destIndex=0;
   433     while(srcIndex<srcLength) {
   434         start=srcIndex;
   435         U8_NEXT(src, srcIndex, srcLength, c);
   436         if(c<0) {
   437             while(destIndex<destCapacity && start<srcIndex) {
   438                 dest[destIndex++]=src[start++];
   439             }
   440             continue;
   441         }
   442         c=ucase_toFullFolding(csp, c, &s, options);
   443         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
   444             /* fast path version of appendResult() for ASCII results */
   445             dest[destIndex++]=(uint8_t)c2;
   446         } else {
   447             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
   448         }
   449     }
   451     if(destIndex>destCapacity) {
   452         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   453     }
   454     return destIndex;
   455 }
   457 static int32_t U_CALLCONV
   458 ucasemap_internalUTF8Fold(const UCaseMap *csm,
   459                           uint8_t *dest, int32_t destCapacity,
   460                           const uint8_t *src, int32_t srcLength,
   461                           UErrorCode *pErrorCode) {
   462     return utf8_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode);
   463 }
   465 U_CFUNC int32_t
   466 ucasemap_mapUTF8(const UCaseMap *csm,
   467                  uint8_t *dest, int32_t destCapacity,
   468                  const uint8_t *src, int32_t srcLength,
   469                  UTF8CaseMapper *stringCaseMapper,
   470                  UErrorCode *pErrorCode) {
   471     int32_t destLength;
   473     /* check argument values */
   474     if(U_FAILURE(*pErrorCode)) {
   475         return 0;
   476     }
   477     if( destCapacity<0 ||
   478         (dest==NULL && destCapacity>0) ||
   479         src==NULL ||
   480         srcLength<-1
   481     ) {
   482         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   483         return 0;
   484     }
   486     /* get the string length */
   487     if(srcLength==-1) {
   488         srcLength=(int32_t)uprv_strlen((const char *)src);
   489     }
   491     /* check for overlapping source and destination */
   492     if( dest!=NULL &&
   493         ((src>=dest && src<(dest+destCapacity)) ||
   494          (dest>=src && dest<(src+srcLength)))
   495     ) {
   496         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   497         return 0;
   498     }
   500     destLength=stringCaseMapper(csm, dest, destCapacity, src, srcLength, pErrorCode);
   501     return u_terminateChars((char *)dest, destCapacity, destLength, pErrorCode);
   502 }
   504 /* public API functions */
   506 U_CAPI int32_t U_EXPORT2
   507 ucasemap_utf8ToLower(const UCaseMap *csm,
   508                      char *dest, int32_t destCapacity,
   509                      const char *src, int32_t srcLength,
   510                      UErrorCode *pErrorCode) {
   511     return ucasemap_mapUTF8(csm,
   512                    (uint8_t *)dest, destCapacity,
   513                    (const uint8_t *)src, srcLength,
   514                    ucasemap_internalUTF8ToLower, pErrorCode);
   515 }
   517 U_CAPI int32_t U_EXPORT2
   518 ucasemap_utf8ToUpper(const UCaseMap *csm,
   519                      char *dest, int32_t destCapacity,
   520                      const char *src, int32_t srcLength,
   521                      UErrorCode *pErrorCode) {
   522     return ucasemap_mapUTF8(csm,
   523                    (uint8_t *)dest, destCapacity,
   524                    (const uint8_t *)src, srcLength,
   525                    ucasemap_internalUTF8ToUpper, pErrorCode);
   526 }
   528 U_CAPI int32_t U_EXPORT2
   529 ucasemap_utf8FoldCase(const UCaseMap *csm,
   530                       char *dest, int32_t destCapacity,
   531                       const char *src, int32_t srcLength,
   532                       UErrorCode *pErrorCode) {
   533     return ucasemap_mapUTF8(csm,
   534                    (uint8_t *)dest, destCapacity,
   535                    (const uint8_t *)src, srcLength,
   536                    ucasemap_internalUTF8Fold, pErrorCode);
   537 }

mercurial