michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2001-2011, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: ustrcase.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2002feb20 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * Implementation file for string casing C API functions. michael@0: * Uses functions from uchar.c for basic functionality that requires access michael@0: * to the Unicode Character Database (uprops.dat). michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/brkiter.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/ucasemap.h" michael@0: #include "unicode/ubrk.h" michael@0: #include "unicode/utf.h" michael@0: #include "unicode/utf16.h" michael@0: #include "cmemory.h" michael@0: #include "ucase.h" michael@0: #include "ustr_imp.h" michael@0: michael@0: #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) michael@0: michael@0: U_NAMESPACE_USE michael@0: michael@0: /* string casing ------------------------------------------------------------ */ michael@0: michael@0: /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */ michael@0: static inline int32_t michael@0: appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, michael@0: int32_t result, const UChar *s) { michael@0: UChar32 c; michael@0: int32_t length; michael@0: michael@0: /* decode the result */ michael@0: if(result<0) { michael@0: /* (not) original code point */ michael@0: c=~result; michael@0: length=-1; michael@0: } else if(result<=UCASE_MAX_STRING_LENGTH) { michael@0: c=U_SENTINEL; michael@0: length=result; michael@0: } else { michael@0: c=result; michael@0: length=-1; michael@0: } michael@0: michael@0: if(destIndex0) { michael@0: dest[destIndex++]=*s++; michael@0: --length; michael@0: } michael@0: } else { michael@0: /* overflow */ michael@0: destIndex+=length; michael@0: } michael@0: } michael@0: } else { michael@0: /* preflight */ michael@0: if(length<0) { michael@0: destIndex+=U16_LENGTH(c); michael@0: } else { michael@0: destIndex+=length; michael@0: } michael@0: } michael@0: return destIndex; michael@0: } michael@0: michael@0: static UChar32 U_CALLCONV michael@0: utf16_caseContextIterator(void *context, int8_t dir) { michael@0: UCaseContext *csc=(UCaseContext *)context; michael@0: UChar32 c; michael@0: michael@0: if(dir<0) { michael@0: /* reset for backward iteration */ michael@0: csc->index=csc->cpStart; michael@0: csc->dir=dir; michael@0: } else if(dir>0) { michael@0: /* reset for forward iteration */ michael@0: csc->index=csc->cpLimit; michael@0: csc->dir=dir; michael@0: } else { michael@0: /* continue current iteration direction */ michael@0: dir=csc->dir; michael@0: } michael@0: michael@0: if(dir<0) { michael@0: if(csc->startindex) { michael@0: U16_PREV((const UChar *)csc->p, csc->start, csc->index, c); michael@0: return c; michael@0: } michael@0: } else { michael@0: if(csc->indexlimit) { michael@0: U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c); michael@0: return c; michael@0: } michael@0: } michael@0: return U_SENTINEL; michael@0: } michael@0: michael@0: /* michael@0: * Case-maps [srcStart..srcLimit[ but takes michael@0: * context [0..srcLength[ into account. michael@0: */ michael@0: static int32_t michael@0: _caseMap(const UCaseMap *csm, UCaseMapFull *map, michael@0: UChar *dest, int32_t destCapacity, michael@0: const UChar *src, UCaseContext *csc, michael@0: int32_t srcStart, int32_t srcLimit, michael@0: UErrorCode *pErrorCode) { michael@0: const UChar *s; michael@0: UChar32 c, c2 = 0; michael@0: int32_t srcIndex, destIndex; michael@0: int32_t locCache; michael@0: michael@0: locCache=csm->locCache; michael@0: michael@0: /* case mapping loop */ michael@0: srcIndex=srcStart; michael@0: destIndex=0; michael@0: while(srcIndexcpStart=srcIndex; michael@0: U16_NEXT(src, srcIndex, srcLimit, c); michael@0: csc->cpLimit=srcIndex; michael@0: c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache); michael@0: if((destIndexdestCapacity) { michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: } michael@0: return destIndex; michael@0: } michael@0: michael@0: #if !UCONFIG_NO_BREAK_ITERATION michael@0: michael@0: U_CFUNC int32_t U_CALLCONV michael@0: ustrcase_internalToTitle(const UCaseMap *csm, michael@0: UChar *dest, int32_t destCapacity, michael@0: const UChar *src, int32_t srcLength, michael@0: UErrorCode *pErrorCode) { michael@0: const UChar *s; michael@0: UChar32 c; michael@0: int32_t prev, titleStart, titleLimit, idx, destIndex, length; michael@0: UBool isFirstIndex; michael@0: michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: michael@0: // Use the C++ abstract base class to minimize dependencies. michael@0: // TODO: Change UCaseMap.iter to store a BreakIterator directly. michael@0: BreakIterator *bi=reinterpret_cast(csm->iter); michael@0: michael@0: /* set up local variables */ michael@0: int32_t locCache=csm->locCache; michael@0: UCaseContext csc=UCASECONTEXT_INITIALIZER; michael@0: csc.p=(void *)src; michael@0: csc.limit=srcLength; michael@0: destIndex=0; michael@0: prev=0; michael@0: isFirstIndex=TRUE; michael@0: michael@0: /* titlecasing loop */ michael@0: while(prevfirst(); michael@0: } else { michael@0: idx=bi->next(); michael@0: } michael@0: if(idx==UBRK_DONE || idx>srcLength) { michael@0: idx=srcLength; michael@0: } michael@0: michael@0: /* michael@0: * Unicode 4 & 5 section 3.13 Default Case Operations: michael@0: * michael@0: * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex michael@0: * #29, "Text Boundaries." Between each pair of word boundaries, find the first michael@0: * cased character F. If F exists, map F to default_title(F); then map each michael@0: * subsequent character C to default_lower(C). michael@0: * michael@0: * In this implementation, segment [prev..index[ into 3 parts: michael@0: * a) uncased characters (copy as-is) [prev..titleStart[ michael@0: * b) first case letter (titlecase) [titleStart..titleLimit[ michael@0: * c) subsequent characters (lowercase) [titleLimit..index[ michael@0: */ michael@0: if(prevoptions&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) { michael@0: /* Adjust the titlecasing index (titleStart) to the next cased character. */ michael@0: for(;;) { michael@0: titleStart=titleLimit; michael@0: if(titleLimit==idx) { michael@0: /* michael@0: * only uncased characters in [prev..index[ michael@0: * stop with titleStart==titleLimit==index michael@0: */ michael@0: break; michael@0: } michael@0: U16_NEXT(src, titleLimit, idx, c); michael@0: if(UCASE_NONE!=ucase_getType(csm->csp, c)) { michael@0: break; /* cased letter at [titleStart..titleLimit[ */ michael@0: } michael@0: } michael@0: length=titleStart-prev; michael@0: if(length>0) { michael@0: if((destIndex+length)<=destCapacity) { michael@0: uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR); michael@0: } michael@0: destIndex+=length; michael@0: } michael@0: } michael@0: michael@0: if(titleStartcsp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache); michael@0: destIndex=appendResult(dest, destIndex, destCapacity, c, s); michael@0: michael@0: /* Special case Dutch IJ titlecasing */ michael@0: if ( titleStart+1 < idx && michael@0: ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH && michael@0: ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) && michael@0: ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) { michael@0: c=(UChar32) 0x004A; michael@0: destIndex=appendResult(dest, destIndex, destCapacity, c, s); michael@0: titleLimit++; michael@0: } michael@0: michael@0: /* lowercase [titleLimit..index[ */ michael@0: if(titleLimitoptions&U_TITLECASE_NO_LOWERCASE)==0) { michael@0: /* Normal operation: Lowercase the rest of the word. */ michael@0: destIndex+= michael@0: _caseMap( michael@0: csm, ucase_toFullLower, michael@0: dest+destIndex, destCapacity-destIndex, michael@0: src, &csc, michael@0: titleLimit, idx, michael@0: pErrorCode); michael@0: } else { michael@0: /* Optionally just copy the rest of the word unchanged. */ michael@0: length=idx-titleLimit; michael@0: if((destIndex+length)<=destCapacity) { michael@0: uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR); michael@0: } michael@0: destIndex+=length; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: prev=idx; michael@0: } michael@0: michael@0: if(destIndex>destCapacity) { michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: } michael@0: return destIndex; michael@0: } michael@0: michael@0: #endif // !UCONFIG_NO_BREAK_ITERATION michael@0: michael@0: /* functions available in the common library (for unistr_case.cpp) */ michael@0: michael@0: U_CFUNC int32_t U_CALLCONV michael@0: ustrcase_internalToLower(const UCaseMap *csm, michael@0: UChar *dest, int32_t destCapacity, michael@0: const UChar *src, int32_t srcLength, michael@0: UErrorCode *pErrorCode) { michael@0: UCaseContext csc=UCASECONTEXT_INITIALIZER; michael@0: csc.p=(void *)src; michael@0: csc.limit=srcLength; michael@0: return _caseMap( michael@0: csm, ucase_toFullLower, michael@0: dest, destCapacity, michael@0: src, &csc, 0, srcLength, michael@0: pErrorCode); michael@0: } michael@0: michael@0: U_CFUNC int32_t U_CALLCONV michael@0: ustrcase_internalToUpper(const UCaseMap *csm, michael@0: UChar *dest, int32_t destCapacity, michael@0: const UChar *src, int32_t srcLength, michael@0: UErrorCode *pErrorCode) { michael@0: UCaseContext csc=UCASECONTEXT_INITIALIZER; michael@0: csc.p=(void *)src; michael@0: csc.limit=srcLength; michael@0: return _caseMap( michael@0: csm, ucase_toFullUpper, michael@0: dest, destCapacity, michael@0: src, &csc, 0, srcLength, michael@0: pErrorCode); michael@0: } michael@0: michael@0: static int32_t michael@0: ustr_foldCase(const UCaseProps *csp, michael@0: UChar *dest, int32_t destCapacity, michael@0: const UChar *src, int32_t srcLength, michael@0: uint32_t options, michael@0: UErrorCode *pErrorCode) { michael@0: int32_t srcIndex, destIndex; michael@0: michael@0: const UChar *s; michael@0: UChar32 c, c2 = 0; michael@0: michael@0: /* case mapping loop */ michael@0: srcIndex=destIndex=0; michael@0: while(srcIndexdestCapacity) { michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: } michael@0: return destIndex; michael@0: } michael@0: michael@0: U_CFUNC int32_t U_CALLCONV michael@0: ustrcase_internalFold(const UCaseMap *csm, michael@0: UChar *dest, int32_t destCapacity, michael@0: const UChar *src, int32_t srcLength, michael@0: UErrorCode *pErrorCode) { michael@0: return ustr_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode); michael@0: } michael@0: michael@0: U_CFUNC int32_t michael@0: ustrcase_map(const UCaseMap *csm, michael@0: UChar *dest, int32_t destCapacity, michael@0: const UChar *src, int32_t srcLength, michael@0: UStringCaseMapper *stringCaseMapper, michael@0: UErrorCode *pErrorCode) { michael@0: UChar buffer[300]; michael@0: UChar *temp; michael@0: michael@0: int32_t destLength; michael@0: michael@0: /* check argument values */ michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: if( destCapacity<0 || michael@0: (dest==NULL && destCapacity>0) || michael@0: src==NULL || michael@0: srcLength<-1 michael@0: ) { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: /* get the string length */ michael@0: if(srcLength==-1) { michael@0: srcLength=u_strlen(src); michael@0: } michael@0: michael@0: /* check for overlapping source and destination */ michael@0: if( dest!=NULL && michael@0: ((src>=dest && src<(dest+destCapacity)) || michael@0: (dest>=src && dest<(src+srcLength))) michael@0: ) { michael@0: /* overlap: provide a temporary destination buffer and later copy the result */ michael@0: if(destCapacity<=LENGTHOF(buffer)) { michael@0: /* the stack buffer is large enough */ michael@0: temp=buffer; michael@0: } else { michael@0: /* allocate a buffer */ michael@0: temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR); michael@0: if(temp==NULL) { michael@0: *pErrorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return 0; michael@0: } michael@0: } michael@0: } else { michael@0: temp=dest; michael@0: } michael@0: michael@0: destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, pErrorCode); michael@0: if(temp!=dest) { michael@0: /* copy the result string to the destination buffer */ michael@0: if(destLength>0) { michael@0: int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity; michael@0: if(copyLength>0) { michael@0: uprv_memmove(dest, temp, copyLength*U_SIZEOF_UCHAR); michael@0: } michael@0: } michael@0: if(temp!=buffer) { michael@0: uprv_free(temp); michael@0: } michael@0: } michael@0: michael@0: return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); michael@0: } michael@0: michael@0: /* public API functions */ michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_strFoldCase(UChar *dest, int32_t destCapacity, michael@0: const UChar *src, int32_t srcLength, michael@0: uint32_t options, michael@0: UErrorCode *pErrorCode) { michael@0: UCaseMap csm=UCASEMAP_INITIALIZER; michael@0: csm.csp=ucase_getSingleton(); michael@0: csm.options=options; michael@0: return ustrcase_map( michael@0: &csm, michael@0: dest, destCapacity, michael@0: src, srcLength, michael@0: ustrcase_internalFold, pErrorCode); michael@0: } michael@0: michael@0: /* case-insensitive string comparisons -------------------------------------- */ michael@0: michael@0: /* michael@0: * This function is a copy of unorm_cmpEquivFold() minus the parts for michael@0: * canonical equivalence. michael@0: * Keep the functions in sync, and see there for how this works. michael@0: * The duplication is for modularization: michael@0: * It makes caseless (but not canonical caseless) matches independent of michael@0: * the normalization code. michael@0: */ michael@0: michael@0: /* stack element for previous-level source/decomposition pointers */ michael@0: struct CmpEquivLevel { michael@0: const UChar *start, *s, *limit; michael@0: }; michael@0: typedef struct CmpEquivLevel CmpEquivLevel; michael@0: michael@0: /* internal function */ michael@0: U_CFUNC int32_t michael@0: u_strcmpFold(const UChar *s1, int32_t length1, michael@0: const UChar *s2, int32_t length2, michael@0: uint32_t options, michael@0: UErrorCode *pErrorCode) { michael@0: const UCaseProps *csp; michael@0: michael@0: /* current-level start/limit - s1/s2 as current */ michael@0: const UChar *start1, *start2, *limit1, *limit2; michael@0: michael@0: /* case folding variables */ michael@0: const UChar *p; michael@0: int32_t length; michael@0: michael@0: /* stacks of previous-level start/current/limit */ michael@0: CmpEquivLevel stack1[2], stack2[2]; michael@0: michael@0: /* case folding buffers, only use current-level start/limit */ michael@0: UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; michael@0: michael@0: /* track which is the current level per string */ michael@0: int32_t level1, level2; michael@0: michael@0: /* current code units, and code points for lookups */ michael@0: UChar32 c1, c2, cp1, cp2; michael@0: michael@0: /* no argument error checking because this itself is not an API */ michael@0: michael@0: /* michael@0: * assume that at least the option U_COMPARE_IGNORE_CASE is set michael@0: * otherwise this function would have to behave exactly as uprv_strCompare() michael@0: */ michael@0: csp=ucase_getSingleton(); michael@0: if(U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: michael@0: /* initialize */ michael@0: start1=s1; michael@0: if(length1==-1) { michael@0: limit1=NULL; michael@0: } else { michael@0: limit1=s1+length1; michael@0: } michael@0: michael@0: start2=s2; michael@0: if(length2==-1) { michael@0: limit2=NULL; michael@0: } else { michael@0: limit2=s2+length2; michael@0: } michael@0: michael@0: level1=level2=0; michael@0: c1=c2=-1; michael@0: michael@0: /* comparison loop */ michael@0: for(;;) { michael@0: /* michael@0: * here a code unit value of -1 means "get another code unit" michael@0: * below it will mean "this source is finished" michael@0: */ michael@0: michael@0: if(c1<0) { michael@0: /* get next code unit from string 1, post-increment */ michael@0: for(;;) { michael@0: if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { michael@0: if(level1==0) { michael@0: c1=-1; michael@0: break; michael@0: } michael@0: } else { michael@0: ++s1; michael@0: break; michael@0: } michael@0: michael@0: /* reached end of level buffer, pop one level */ michael@0: do { michael@0: --level1; michael@0: start1=stack1[level1].start; /*Not uninitialized*/ michael@0: } while(start1==NULL); michael@0: s1=stack1[level1].s; /*Not uninitialized*/ michael@0: limit1=stack1[level1].limit; /*Not uninitialized*/ michael@0: } michael@0: } michael@0: michael@0: if(c2<0) { michael@0: /* get next code unit from string 2, post-increment */ michael@0: for(;;) { michael@0: if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { michael@0: if(level2==0) { michael@0: c2=-1; michael@0: break; michael@0: } michael@0: } else { michael@0: ++s2; michael@0: break; michael@0: } michael@0: michael@0: /* reached end of level buffer, pop one level */ michael@0: do { michael@0: --level2; michael@0: start2=stack2[level2].start; /*Not uninitialized*/ michael@0: } while(start2==NULL); michael@0: s2=stack2[level2].s; /*Not uninitialized*/ michael@0: limit2=stack2[level2].limit; /*Not uninitialized*/ michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * compare c1 and c2 michael@0: * either variable c1, c2 is -1 only if the corresponding string is finished michael@0: */ michael@0: if(c1==c2) { michael@0: if(c1<0) { michael@0: return 0; /* c1==c2==-1 indicating end of strings */ michael@0: } michael@0: c1=c2=-1; /* make us fetch new code units */ michael@0: continue; michael@0: } else if(c1<0) { michael@0: return -1; /* string 1 ends before string 2 */ michael@0: } else if(c2<0) { michael@0: return 1; /* string 2 ends before string 1 */ michael@0: } michael@0: /* c1!=c2 && c1>=0 && c2>=0 */ michael@0: michael@0: /* get complete code points for c1, c2 for lookups if either is a surrogate */ michael@0: cp1=c1; michael@0: if(U_IS_SURROGATE(c1)) { michael@0: UChar c; michael@0: michael@0: if(U_IS_SURROGATE_LEAD(c1)) { michael@0: if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { michael@0: /* advance ++s1; only below if cp1 decomposes/case-folds */ michael@0: cp1=U16_GET_SUPPLEMENTARY(c1, c); michael@0: } michael@0: } else /* isTrail(c1) */ { michael@0: if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { michael@0: cp1=U16_GET_SUPPLEMENTARY(c, c1); michael@0: } michael@0: } michael@0: } michael@0: michael@0: cp2=c2; michael@0: if(U_IS_SURROGATE(c2)) { michael@0: UChar c; michael@0: michael@0: if(U_IS_SURROGATE_LEAD(c2)) { michael@0: if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { michael@0: /* advance ++s2; only below if cp2 decomposes/case-folds */ michael@0: cp2=U16_GET_SUPPLEMENTARY(c2, c); michael@0: } michael@0: } else /* isTrail(c2) */ { michael@0: if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { michael@0: cp2=U16_GET_SUPPLEMENTARY(c, c2); michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * go down one level for each string michael@0: * continue with the main loop as soon as there is a real change michael@0: */ michael@0: michael@0: if( level1==0 && michael@0: (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0 michael@0: ) { michael@0: /* cp1 case-folds to the code point "length" or to p[length] */ michael@0: if(U_IS_SURROGATE(c1)) { michael@0: if(U_IS_SURROGATE_LEAD(c1)) { michael@0: /* advance beyond source surrogate pair if it case-folds */ michael@0: ++s1; michael@0: } else /* isTrail(c1) */ { michael@0: /* michael@0: * we got a supplementary code point when hitting its trail surrogate, michael@0: * therefore the lead surrogate must have been the same as in the other string; michael@0: * compare this decomposition with the lead surrogate in the other string michael@0: * remember that this simulates bulk text replacement: michael@0: * the decomposition would replace the entire code point michael@0: */ michael@0: --s2; michael@0: c2=*(s2-1); michael@0: } michael@0: } michael@0: michael@0: /* push current level pointers */ michael@0: stack1[0].start=start1; michael@0: stack1[0].s=s1; michael@0: stack1[0].limit=limit1; michael@0: ++level1; michael@0: michael@0: /* copy the folding result to fold1[] */ michael@0: if(length<=UCASE_MAX_STRING_LENGTH) { michael@0: u_memcpy(fold1, p, length); michael@0: } else { michael@0: int32_t i=0; michael@0: U16_APPEND_UNSAFE(fold1, i, length); michael@0: length=i; michael@0: } michael@0: michael@0: /* set next level pointers to case folding */ michael@0: start1=s1=fold1; michael@0: limit1=fold1+length; michael@0: michael@0: /* get ready to read from decomposition, continue with loop */ michael@0: c1=-1; michael@0: continue; michael@0: } michael@0: michael@0: if( level2==0 && michael@0: (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0 michael@0: ) { michael@0: /* cp2 case-folds to the code point "length" or to p[length] */ michael@0: if(U_IS_SURROGATE(c2)) { michael@0: if(U_IS_SURROGATE_LEAD(c2)) { michael@0: /* advance beyond source surrogate pair if it case-folds */ michael@0: ++s2; michael@0: } else /* isTrail(c2) */ { michael@0: /* michael@0: * we got a supplementary code point when hitting its trail surrogate, michael@0: * therefore the lead surrogate must have been the same as in the other string; michael@0: * compare this decomposition with the lead surrogate in the other string michael@0: * remember that this simulates bulk text replacement: michael@0: * the decomposition would replace the entire code point michael@0: */ michael@0: --s1; michael@0: c1=*(s1-1); michael@0: } michael@0: } michael@0: michael@0: /* push current level pointers */ michael@0: stack2[0].start=start2; michael@0: stack2[0].s=s2; michael@0: stack2[0].limit=limit2; michael@0: ++level2; michael@0: michael@0: /* copy the folding result to fold2[] */ michael@0: if(length<=UCASE_MAX_STRING_LENGTH) { michael@0: u_memcpy(fold2, p, length); michael@0: } else { michael@0: int32_t i=0; michael@0: U16_APPEND_UNSAFE(fold2, i, length); michael@0: length=i; michael@0: } michael@0: michael@0: /* set next level pointers to case folding */ michael@0: start2=s2=fold2; michael@0: limit2=fold2+length; michael@0: michael@0: /* get ready to read from decomposition, continue with loop */ michael@0: c2=-1; michael@0: continue; michael@0: } michael@0: michael@0: /* michael@0: * no decomposition/case folding, max level for both sides: michael@0: * return difference result michael@0: * michael@0: * code point order comparison must not just return cp1-cp2 michael@0: * because when single surrogates are present then the surrogate pairs michael@0: * that formed cp1 and cp2 may be from different string indexes michael@0: * michael@0: * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units michael@0: * c1=d800 cp1=10001 c2=dc00 cp2=10000 michael@0: * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } michael@0: * michael@0: * therefore, use same fix-up as in ustring.c/uprv_strCompare() michael@0: * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ michael@0: * so we have slightly different pointer/start/limit comparisons here michael@0: */ michael@0: michael@0: if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { michael@0: /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ michael@0: if( michael@0: (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || michael@0: (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) michael@0: ) { michael@0: /* part of a surrogate pair, leave >=d800 */ michael@0: } else { michael@0: /* BMP code point - may be surrogate code point - make =d800 */ michael@0: } else { michael@0: /* BMP code point - may be surrogate code point - make