michael@0: /* michael@0: ****************************************************************************** michael@0: * michael@0: * Copyright (C) 1998-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ****************************************************************************** michael@0: * michael@0: * File ustring.cpp michael@0: * michael@0: * Modification History: michael@0: * michael@0: * Date Name Description michael@0: * 12/07/98 bertrand Creation. michael@0: ****************************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/putil.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/utf16.h" michael@0: #include "cstring.h" michael@0: #include "cwchar.h" michael@0: #include "cmemory.h" michael@0: #include "ustr_imp.h" michael@0: michael@0: /* ANSI string.h - style functions ------------------------------------------ */ michael@0: michael@0: /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */ michael@0: #define U_BMP_MAX 0xffff michael@0: michael@0: /* Forward binary string search functions ----------------------------------- */ michael@0: michael@0: /* michael@0: * Test if a substring match inside a string is at code point boundaries. michael@0: * All pointers refer to the same buffer. michael@0: * The limit pointer may be NULL, all others must be real pointers. michael@0: */ michael@0: static inline UBool michael@0: isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) { michael@0: if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) { michael@0: /* the leading edge of the match is in the middle of a surrogate pair */ michael@0: return FALSE; michael@0: } michael@0: if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) { michael@0: /* the trailing edge of the match is in the middle of a surrogate pair */ michael@0: return FALSE; michael@0: } michael@0: return TRUE; michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_strFindFirst(const UChar *s, int32_t length, michael@0: const UChar *sub, int32_t subLength) { michael@0: const UChar *start, *p, *q, *subLimit; michael@0: UChar c, cs, cq; michael@0: michael@0: if(sub==NULL || subLength<-1) { michael@0: return (UChar *)s; michael@0: } michael@0: if(s==NULL || length<-1) { michael@0: return NULL; michael@0: } michael@0: michael@0: start=s; michael@0: michael@0: if(length<0 && subLength<0) { michael@0: /* both strings are NUL-terminated */ michael@0: if((cs=*sub++)==0) { michael@0: return (UChar *)s; michael@0: } michael@0: if(*sub==0 && !U16_IS_SURROGATE(cs)) { michael@0: /* the substring consists of a single, non-surrogate BMP code point */ michael@0: return u_strchr(s, cs); michael@0: } michael@0: michael@0: while((c=*s++)!=0) { michael@0: if(c==cs) { michael@0: /* found first substring UChar, compare rest */ michael@0: p=s; michael@0: q=sub; michael@0: for(;;) { michael@0: if((cq=*q)==0) { michael@0: if(isMatchAtCPBoundary(start, s-1, p, NULL)) { michael@0: return (UChar *)(s-1); /* well-formed match */ michael@0: } else { michael@0: break; /* no match because surrogate pair is split */ michael@0: } michael@0: } michael@0: if((c=*p)==0) { michael@0: return NULL; /* no match, and none possible after s */ michael@0: } michael@0: if(c!=cq) { michael@0: break; /* no match */ michael@0: } michael@0: ++p; michael@0: ++q; michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* not found */ michael@0: return NULL; michael@0: } michael@0: michael@0: if(subLength<0) { michael@0: subLength=u_strlen(sub); michael@0: } michael@0: if(subLength==0) { michael@0: return (UChar *)s; michael@0: } michael@0: michael@0: /* get sub[0] to search for it fast */ michael@0: cs=*sub++; michael@0: --subLength; michael@0: subLimit=sub+subLength; michael@0: michael@0: if(subLength==0 && !U16_IS_SURROGATE(cs)) { michael@0: /* the substring consists of a single, non-surrogate BMP code point */ michael@0: return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length); michael@0: } michael@0: michael@0: if(length<0) { michael@0: /* s is NUL-terminated */ michael@0: while((c=*s++)!=0) { michael@0: if(c==cs) { michael@0: /* found first substring UChar, compare rest */ michael@0: p=s; michael@0: q=sub; michael@0: for(;;) { michael@0: if(q==subLimit) { michael@0: if(isMatchAtCPBoundary(start, s-1, p, NULL)) { michael@0: return (UChar *)(s-1); /* well-formed match */ michael@0: } else { michael@0: break; /* no match because surrogate pair is split */ michael@0: } michael@0: } michael@0: if((c=*p)==0) { michael@0: return NULL; /* no match, and none possible after s */ michael@0: } michael@0: if(c!=*q) { michael@0: break; /* no match */ michael@0: } michael@0: ++p; michael@0: ++q; michael@0: } michael@0: } michael@0: } michael@0: } else { michael@0: const UChar *limit, *preLimit; michael@0: michael@0: /* subLength was decremented above */ michael@0: if(length<=subLength) { michael@0: return NULL; /* s is shorter than sub */ michael@0: } michael@0: michael@0: limit=s+length; michael@0: michael@0: /* the substring must start before preLimit */ michael@0: preLimit=limit-subLength; michael@0: michael@0: while(s!=preLimit) { michael@0: c=*s++; michael@0: if(c==cs) { michael@0: /* found first substring UChar, compare rest */ michael@0: p=s; michael@0: q=sub; michael@0: for(;;) { michael@0: if(q==subLimit) { michael@0: if(isMatchAtCPBoundary(start, s-1, p, limit)) { michael@0: return (UChar *)(s-1); /* well-formed match */ michael@0: } else { michael@0: break; /* no match because surrogate pair is split */ michael@0: } michael@0: } michael@0: if(*p!=*q) { michael@0: break; /* no match */ michael@0: } michael@0: ++p; michael@0: ++q; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* not found */ michael@0: return NULL; michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_strstr(const UChar *s, const UChar *substring) { michael@0: return u_strFindFirst(s, -1, substring, -1); michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_strchr(const UChar *s, UChar c) { michael@0: if(U16_IS_SURROGATE(c)) { michael@0: /* make sure to not find half of a surrogate pair */ michael@0: return u_strFindFirst(s, -1, &c, 1); michael@0: } else { michael@0: UChar cs; michael@0: michael@0: /* trivial search for a BMP code point */ michael@0: for(;;) { michael@0: if((cs=*s)==c) { michael@0: return (UChar *)s; michael@0: } michael@0: if(cs==0) { michael@0: return NULL; michael@0: } michael@0: ++s; michael@0: } michael@0: } michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_strchr32(const UChar *s, UChar32 c) { michael@0: if((uint32_t)c<=U_BMP_MAX) { michael@0: /* find BMP code point */ michael@0: return u_strchr(s, (UChar)c); michael@0: } else if((uint32_t)c<=UCHAR_MAX_VALUE) { michael@0: /* find supplementary code point as surrogate pair */ michael@0: UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); michael@0: michael@0: while((cs=*s++)!=0) { michael@0: if(cs==lead && *s==trail) { michael@0: return (UChar *)(s-1); michael@0: } michael@0: } michael@0: return NULL; michael@0: } else { michael@0: /* not a Unicode code point, not findable */ michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_memchr(const UChar *s, UChar c, int32_t count) { michael@0: if(count<=0) { michael@0: return NULL; /* no string */ michael@0: } else if(U16_IS_SURROGATE(c)) { michael@0: /* make sure to not find half of a surrogate pair */ michael@0: return u_strFindFirst(s, count, &c, 1); michael@0: } else { michael@0: /* trivial search for a BMP code point */ michael@0: const UChar *limit=s+count; michael@0: do { michael@0: if(*s==c) { michael@0: return (UChar *)s; michael@0: } michael@0: } while(++s!=limit); michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_memchr32(const UChar *s, UChar32 c, int32_t count) { michael@0: if((uint32_t)c<=U_BMP_MAX) { michael@0: /* find BMP code point */ michael@0: return u_memchr(s, (UChar)c, count); michael@0: } else if(count<2) { michael@0: /* too short for a surrogate pair */ michael@0: return NULL; michael@0: } else if((uint32_t)c<=UCHAR_MAX_VALUE) { michael@0: /* find supplementary code point as surrogate pair */ michael@0: const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */ michael@0: UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); michael@0: michael@0: do { michael@0: if(*s==lead && *(s+1)==trail) { michael@0: return (UChar *)s; michael@0: } michael@0: } while(++s!=limit); michael@0: return NULL; michael@0: } else { michael@0: /* not a Unicode code point, not findable */ michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: /* Backward binary string search functions ---------------------------------- */ michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_strFindLast(const UChar *s, int32_t length, michael@0: const UChar *sub, int32_t subLength) { michael@0: const UChar *start, *limit, *p, *q, *subLimit; michael@0: UChar c, cs; michael@0: michael@0: if(sub==NULL || subLength<-1) { michael@0: return (UChar *)s; michael@0: } michael@0: if(s==NULL || length<-1) { michael@0: return NULL; michael@0: } michael@0: michael@0: /* michael@0: * This implementation is more lazy than the one for u_strFindFirst(): michael@0: * There is no special search code for NUL-terminated strings. michael@0: * It does not seem to be worth it for searching substrings to michael@0: * search forward and find all matches like in u_strrchr() and similar. michael@0: * Therefore, we simply get both string lengths and search backward. michael@0: * michael@0: * markus 2002oct23 michael@0: */ michael@0: michael@0: if(subLength<0) { michael@0: subLength=u_strlen(sub); michael@0: } michael@0: if(subLength==0) { michael@0: return (UChar *)s; michael@0: } michael@0: michael@0: /* get sub[subLength-1] to search for it fast */ michael@0: subLimit=sub+subLength; michael@0: cs=*(--subLimit); michael@0: --subLength; michael@0: michael@0: if(subLength==0 && !U16_IS_SURROGATE(cs)) { michael@0: /* the substring consists of a single, non-surrogate BMP code point */ michael@0: return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length); michael@0: } michael@0: michael@0: if(length<0) { michael@0: length=u_strlen(s); michael@0: } michael@0: michael@0: /* subLength was decremented above */ michael@0: if(length<=subLength) { michael@0: return NULL; /* s is shorter than sub */ michael@0: } michael@0: michael@0: start=s; michael@0: limit=s+length; michael@0: michael@0: /* the substring must start no later than s+subLength */ michael@0: s+=subLength; michael@0: michael@0: while(s!=limit) { michael@0: c=*(--limit); michael@0: if(c==cs) { michael@0: /* found last substring UChar, compare rest */ michael@0: p=limit; michael@0: q=subLimit; michael@0: for(;;) { michael@0: if(q==sub) { michael@0: if(isMatchAtCPBoundary(start, p, limit+1, start+length)) { michael@0: return (UChar *)p; /* well-formed match */ michael@0: } else { michael@0: break; /* no match because surrogate pair is split */ michael@0: } michael@0: } michael@0: if(*(--p)!=*(--q)) { michael@0: break; /* no match */ michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* not found */ michael@0: return NULL; michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_strrstr(const UChar *s, const UChar *substring) { michael@0: return u_strFindLast(s, -1, substring, -1); michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_strrchr(const UChar *s, UChar c) { michael@0: if(U16_IS_SURROGATE(c)) { michael@0: /* make sure to not find half of a surrogate pair */ michael@0: return u_strFindLast(s, -1, &c, 1); michael@0: } else { michael@0: const UChar *result=NULL; michael@0: UChar cs; michael@0: michael@0: /* trivial search for a BMP code point */ michael@0: for(;;) { michael@0: if((cs=*s)==c) { michael@0: result=s; michael@0: } michael@0: if(cs==0) { michael@0: return (UChar *)result; michael@0: } michael@0: ++s; michael@0: } michael@0: } michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_strrchr32(const UChar *s, UChar32 c) { michael@0: if((uint32_t)c<=U_BMP_MAX) { michael@0: /* find BMP code point */ michael@0: return u_strrchr(s, (UChar)c); michael@0: } else if((uint32_t)c<=UCHAR_MAX_VALUE) { michael@0: /* find supplementary code point as surrogate pair */ michael@0: const UChar *result=NULL; michael@0: UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); michael@0: michael@0: while((cs=*s++)!=0) { michael@0: if(cs==lead && *s==trail) { michael@0: result=s-1; michael@0: } michael@0: } michael@0: return (UChar *)result; michael@0: } else { michael@0: /* not a Unicode code point, not findable */ michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_memrchr(const UChar *s, UChar c, int32_t count) { michael@0: if(count<=0) { michael@0: return NULL; /* no string */ michael@0: } else if(U16_IS_SURROGATE(c)) { michael@0: /* make sure to not find half of a surrogate pair */ michael@0: return u_strFindLast(s, count, &c, 1); michael@0: } else { michael@0: /* trivial search for a BMP code point */ michael@0: const UChar *limit=s+count; michael@0: do { michael@0: if(*(--limit)==c) { michael@0: return (UChar *)limit; michael@0: } michael@0: } while(s!=limit); michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_memrchr32(const UChar *s, UChar32 c, int32_t count) { michael@0: if((uint32_t)c<=U_BMP_MAX) { michael@0: /* find BMP code point */ michael@0: return u_memrchr(s, (UChar)c, count); michael@0: } else if(count<2) { michael@0: /* too short for a surrogate pair */ michael@0: return NULL; michael@0: } else if((uint32_t)c<=UCHAR_MAX_VALUE) { michael@0: /* find supplementary code point as surrogate pair */ michael@0: const UChar *limit=s+count-1; michael@0: UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); michael@0: michael@0: do { michael@0: if(*limit==trail && *(limit-1)==lead) { michael@0: return (UChar *)(limit-1); michael@0: } michael@0: } while(s!=--limit); michael@0: return NULL; michael@0: } else { michael@0: /* not a Unicode code point, not findable */ michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: /* Tokenization functions --------------------------------------------------- */ michael@0: michael@0: /* michael@0: * Match each code point in a string against each code point in the matchSet. michael@0: * Return the index of the first string code point that michael@0: * is (polarity==TRUE) or is not (FALSE) contained in the matchSet. michael@0: * Return -(string length)-1 if there is no such code point. michael@0: */ michael@0: static int32_t michael@0: _matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) { michael@0: int32_t matchLen, matchBMPLen, strItr, matchItr; michael@0: UChar32 stringCh, matchCh; michael@0: UChar c, c2; michael@0: michael@0: /* first part of matchSet contains only BMP code points */ michael@0: matchBMPLen = 0; michael@0: while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) { michael@0: ++matchBMPLen; michael@0: } michael@0: michael@0: /* second part of matchSet contains BMP and supplementary code points */ michael@0: matchLen = matchBMPLen; michael@0: while(matchSet[matchLen] != 0) { michael@0: ++matchLen; michael@0: } michael@0: michael@0: for(strItr = 0; (c = string[strItr]) != 0;) { michael@0: ++strItr; michael@0: if(U16_IS_SINGLE(c)) { michael@0: if(polarity) { michael@0: for(matchItr = 0; matchItr < matchLen; ++matchItr) { michael@0: if(c == matchSet[matchItr]) { michael@0: return strItr - 1; /* one matches */ michael@0: } michael@0: } michael@0: } else { michael@0: for(matchItr = 0; matchItr < matchLen; ++matchItr) { michael@0: if(c == matchSet[matchItr]) { michael@0: goto endloop; michael@0: } michael@0: } michael@0: return strItr - 1; /* none matches */ michael@0: } michael@0: } else { michael@0: /* michael@0: * No need to check for string length before U16_IS_TRAIL michael@0: * because c2 could at worst be the terminating NUL. michael@0: */ michael@0: if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) { michael@0: ++strItr; michael@0: stringCh = U16_GET_SUPPLEMENTARY(c, c2); michael@0: } else { michael@0: stringCh = c; /* unpaired trail surrogate */ michael@0: } michael@0: michael@0: if(polarity) { michael@0: for(matchItr = matchBMPLen; matchItr < matchLen;) { michael@0: U16_NEXT(matchSet, matchItr, matchLen, matchCh); michael@0: if(stringCh == matchCh) { michael@0: return strItr - U16_LENGTH(stringCh); /* one matches */ michael@0: } michael@0: } michael@0: } else { michael@0: for(matchItr = matchBMPLen; matchItr < matchLen;) { michael@0: U16_NEXT(matchSet, matchItr, matchLen, matchCh); michael@0: if(stringCh == matchCh) { michael@0: goto endloop; michael@0: } michael@0: } michael@0: return strItr - U16_LENGTH(stringCh); /* none matches */ michael@0: } michael@0: } michael@0: endloop: michael@0: /* wish C had continue with labels like Java... */; michael@0: } michael@0: michael@0: /* Didn't find it. */ michael@0: return -strItr-1; michael@0: } michael@0: michael@0: /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_strpbrk(const UChar *string, const UChar *matchSet) michael@0: { michael@0: int32_t idx = _matchFromSet(string, matchSet, TRUE); michael@0: if(idx >= 0) { michael@0: return (UChar *)string + idx; michael@0: } else { michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_strcspn(const UChar *string, const UChar *matchSet) michael@0: { michael@0: int32_t idx = _matchFromSet(string, matchSet, TRUE); michael@0: if(idx >= 0) { michael@0: return idx; michael@0: } else { michael@0: return -idx - 1; /* == u_strlen(string) */ michael@0: } michael@0: } michael@0: michael@0: /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_strspn(const UChar *string, const UChar *matchSet) michael@0: { michael@0: int32_t idx = _matchFromSet(string, matchSet, FALSE); michael@0: if(idx >= 0) { michael@0: return idx; michael@0: } else { michael@0: return -idx - 1; /* == u_strlen(string) */ michael@0: } michael@0: } michael@0: michael@0: /* ----- Text manipulation functions --- */ michael@0: michael@0: U_CAPI UChar* U_EXPORT2 michael@0: u_strtok_r(UChar *src, michael@0: const UChar *delim, michael@0: UChar **saveState) michael@0: { michael@0: UChar *tokSource; michael@0: UChar *nextToken; michael@0: uint32_t nonDelimIdx; michael@0: michael@0: /* If saveState is NULL, the user messed up. */ michael@0: if (src != NULL) { michael@0: tokSource = src; michael@0: *saveState = src; /* Set to "src" in case there are no delimiters */ michael@0: } michael@0: else if (*saveState) { michael@0: tokSource = *saveState; michael@0: } michael@0: else { michael@0: /* src == NULL && *saveState == NULL */ michael@0: /* This shouldn't happen. We already finished tokenizing. */ michael@0: return NULL; michael@0: } michael@0: michael@0: /* Skip initial delimiters */ michael@0: nonDelimIdx = u_strspn(tokSource, delim); michael@0: tokSource = &tokSource[nonDelimIdx]; michael@0: michael@0: if (*tokSource) { michael@0: nextToken = u_strpbrk(tokSource, delim); michael@0: if (nextToken != NULL) { michael@0: /* Create a token */ michael@0: *(nextToken++) = 0; michael@0: *saveState = nextToken; michael@0: return tokSource; michael@0: } michael@0: else if (*saveState) { michael@0: /* Return the last token */ michael@0: *saveState = NULL; michael@0: return tokSource; michael@0: } michael@0: } michael@0: else { michael@0: /* No tokens were found. Only delimiters were left. */ michael@0: *saveState = NULL; michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: /* Miscellaneous functions -------------------------------------------------- */ michael@0: michael@0: U_CAPI UChar* U_EXPORT2 michael@0: u_strcat(UChar *dst, michael@0: const UChar *src) michael@0: { michael@0: UChar *anchor = dst; /* save a pointer to start of dst */ michael@0: michael@0: while(*dst != 0) { /* To end of first string */ michael@0: ++dst; michael@0: } michael@0: while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ michael@0: } michael@0: michael@0: return anchor; michael@0: } michael@0: michael@0: U_CAPI UChar* U_EXPORT2 michael@0: u_strncat(UChar *dst, michael@0: const UChar *src, michael@0: int32_t n ) michael@0: { michael@0: if(n > 0) { michael@0: UChar *anchor = dst; /* save a pointer to start of dst */ michael@0: michael@0: while(*dst != 0) { /* To end of first string */ michael@0: ++dst; michael@0: } michael@0: while((*dst = *src) != 0) { /* copy string 2 over */ michael@0: ++dst; michael@0: if(--n == 0) { michael@0: *dst = 0; michael@0: break; michael@0: } michael@0: ++src; michael@0: } michael@0: michael@0: return anchor; michael@0: } else { michael@0: return dst; michael@0: } michael@0: } michael@0: michael@0: /* ----- Text property functions --- */ michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_strcmp(const UChar *s1, michael@0: const UChar *s2) michael@0: { michael@0: UChar c1, c2; michael@0: michael@0: for(;;) { michael@0: c1=*s1++; michael@0: c2=*s2++; michael@0: if (c1 != c2 || c1 == 0) { michael@0: break; michael@0: } michael@0: } michael@0: return (int32_t)c1 - (int32_t)c2; michael@0: } michael@0: michael@0: U_CFUNC int32_t U_EXPORT2 michael@0: uprv_strCompare(const UChar *s1, int32_t length1, michael@0: const UChar *s2, int32_t length2, michael@0: UBool strncmpStyle, UBool codePointOrder) { michael@0: const UChar *start1, *start2, *limit1, *limit2; michael@0: UChar c1, c2; michael@0: michael@0: /* setup for fix-up */ michael@0: start1=s1; michael@0: start2=s2; michael@0: michael@0: /* compare identical prefixes - they do not need to be fixed up */ michael@0: if(length1<0 && length2<0) { michael@0: /* strcmp style, both NUL-terminated */ michael@0: if(s1==s2) { michael@0: return 0; michael@0: } michael@0: michael@0: for(;;) { michael@0: c1=*s1; michael@0: c2=*s2; michael@0: if(c1!=c2) { michael@0: break; michael@0: } michael@0: if(c1==0) { michael@0: return 0; michael@0: } michael@0: ++s1; michael@0: ++s2; michael@0: } michael@0: michael@0: /* setup for fix-up */ michael@0: limit1=limit2=NULL; michael@0: } else if(strncmpStyle) { michael@0: /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */ michael@0: if(s1==s2) { michael@0: return 0; michael@0: } michael@0: michael@0: limit1=start1+length1; michael@0: michael@0: for(;;) { michael@0: /* both lengths are same, check only one limit */ michael@0: if(s1==limit1) { michael@0: return 0; michael@0: } michael@0: michael@0: c1=*s1; michael@0: c2=*s2; michael@0: if(c1!=c2) { michael@0: break; michael@0: } michael@0: if(c1==0) { michael@0: return 0; michael@0: } michael@0: ++s1; michael@0: ++s2; michael@0: } michael@0: michael@0: /* setup for fix-up */ michael@0: limit2=start2+length1; /* use length1 here, too, to enforce assumption */ michael@0: } else { michael@0: /* memcmp/UnicodeString style, both length-specified */ michael@0: int32_t lengthResult; michael@0: michael@0: if(length1<0) { michael@0: length1=u_strlen(s1); michael@0: } michael@0: if(length2<0) { michael@0: length2=u_strlen(s2); michael@0: } michael@0: michael@0: /* limit1=start1+min(lenght1, length2) */ michael@0: if(length1length2 */ { michael@0: lengthResult=1; michael@0: limit1=start1+length2; michael@0: } michael@0: michael@0: if(s1==s2) { michael@0: return lengthResult; michael@0: } michael@0: michael@0: for(;;) { michael@0: /* check pseudo-limit */ michael@0: if(s1==limit1) { michael@0: return lengthResult; michael@0: } michael@0: michael@0: c1=*s1; michael@0: c2=*s2; michael@0: if(c1!=c2) { michael@0: break; michael@0: } michael@0: ++s1; michael@0: ++s2; michael@0: } michael@0: michael@0: /* setup for fix-up */ michael@0: limit1=start1+length1; michael@0: limit2=start2+length2; michael@0: } michael@0: michael@0: /* if both values are in or above the surrogate range, fix them up */ michael@0: if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { michael@0: /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ michael@0: if( michael@0: (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) || michael@0: (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1))) michael@0: ) { michael@0: /* part of a surrogate pair, leave >=d800 */ michael@0: } else { michael@0: /* BMP code point - may be surrogate code point - make =d800 */ michael@0: } else { michael@0: /* BMP code point - may be surrogate code point - make move(iter1, 0, UITER_START); michael@0: iter2->move(iter2, 0, UITER_START); michael@0: michael@0: /* compare identical prefixes - they do not need to be fixed up */ michael@0: for(;;) { michael@0: c1=iter1->next(iter1); michael@0: c2=iter2->next(iter2); michael@0: if(c1!=c2) { michael@0: break; michael@0: } michael@0: if(c1==-1) { michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: /* if both values are in or above the surrogate range, fix them up */ michael@0: if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { michael@0: /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ michael@0: if( michael@0: (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) || michael@0: (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1)))) michael@0: ) { michael@0: /* part of a surrogate pair, leave >=d800 */ michael@0: } else { michael@0: /* BMP code point - may be surrogate code point - make current(iter2))) || michael@0: (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2)))) michael@0: ) { michael@0: /* part of a surrogate pair, leave >=d800 */ michael@0: } else { michael@0: /* BMP code point - may be surrogate code point - make =0. michael@0: * michael@0: * Consistently leaving them _behind_ the different units is not an option michael@0: * because the current "unit" is the end of the string if that is reached, michael@0: * and in such a case the iterator does not move. michael@0: * For example, when comparing "ab" with "abc", both iterators rest _on_ the end michael@0: * of their strings. Calling previous() on each does not move them to where michael@0: * the comparison fails. michael@0: * michael@0: * So the simplest semantics is to not define where the iterators end up. michael@0: * michael@0: * The following fragment is part of what would need to be done for backing up. michael@0: */ michael@0: void fragment { michael@0: /* iff a surrogate is part of a surrogate pair, leave >=d800 */ michael@0: if(c1<=0xdbff) { michael@0: if(!U16_IS_TRAIL(iter1->current(iter1))) { michael@0: /* lead surrogate code point - make getIndex(iter1, UITER_CURRENT); michael@0: iter1->previous(iter1); /* ==c1 */ michael@0: if(!U16_IS_LEAD(iter1->previous(iter1))) { michael@0: /* trail surrogate code point - make move(iter1, idx, UITER_ZERO); michael@0: } else /* 0xe000<=c1<=0xffff */ { michael@0: /* BMP code point - make 0) { michael@0: int32_t rc; michael@0: for(;;) { michael@0: rc = (int32_t)*s1 - (int32_t)*s2; michael@0: if(rc != 0 || *s1 == 0 || --n == 0) { michael@0: return rc; michael@0: } michael@0: ++s1; michael@0: ++s2; michael@0: } michael@0: } else { michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) { michael@0: return uprv_strCompare(s1, n, s2, n, TRUE, TRUE); michael@0: } michael@0: michael@0: U_CAPI UChar* U_EXPORT2 michael@0: u_strcpy(UChar *dst, michael@0: const UChar *src) michael@0: { michael@0: UChar *anchor = dst; /* save a pointer to start of dst */ michael@0: michael@0: while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ michael@0: } michael@0: michael@0: return anchor; michael@0: } michael@0: michael@0: U_CAPI UChar* U_EXPORT2 michael@0: u_strncpy(UChar *dst, michael@0: const UChar *src, michael@0: int32_t n) michael@0: { michael@0: UChar *anchor = dst; /* save a pointer to start of dst */ michael@0: michael@0: /* copy string 2 over */ michael@0: while(n > 0 && (*(dst++) = *(src++)) != 0) { michael@0: --n; michael@0: } michael@0: michael@0: return anchor; michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_strlen(const UChar *s) michael@0: { michael@0: #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR michael@0: return (int32_t)uprv_wcslen(s); michael@0: #else michael@0: const UChar *t = s; michael@0: while(*t != 0) { michael@0: ++t; michael@0: } michael@0: return t - s; michael@0: #endif michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_countChar32(const UChar *s, int32_t length) { michael@0: int32_t count; michael@0: michael@0: if(s==NULL || length<-1) { michael@0: return 0; michael@0: } michael@0: michael@0: count=0; michael@0: if(length>=0) { michael@0: while(length>0) { michael@0: ++count; michael@0: if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) { michael@0: s+=2; michael@0: length-=2; michael@0: } else { michael@0: ++s; michael@0: --length; michael@0: } michael@0: } michael@0: } else /* length==-1 */ { michael@0: UChar c; michael@0: michael@0: for(;;) { michael@0: if((c=*s++)==0) { michael@0: break; michael@0: } michael@0: ++count; michael@0: michael@0: /* michael@0: * sufficient to look ahead one because of UTF-16; michael@0: * safe to look ahead one because at worst that would be the terminating NUL michael@0: */ michael@0: if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { michael@0: ++s; michael@0: } michael@0: } michael@0: } michael@0: return count; michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) { michael@0: michael@0: if(number<0) { michael@0: return TRUE; michael@0: } michael@0: if(s==NULL || length<-1) { michael@0: return FALSE; michael@0: } michael@0: michael@0: if(length==-1) { michael@0: /* s is NUL-terminated */ michael@0: UChar c; michael@0: michael@0: /* count code points until they exceed */ michael@0: for(;;) { michael@0: if((c=*s++)==0) { michael@0: return FALSE; michael@0: } michael@0: if(number==0) { michael@0: return TRUE; michael@0: } michael@0: if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { michael@0: ++s; michael@0: } michael@0: --number; michael@0: } michael@0: } else { michael@0: /* length>=0 known */ michael@0: const UChar *limit; michael@0: int32_t maxSupplementary; michael@0: michael@0: /* s contains at least (length+1)/2 code points: <=2 UChars per cp */ michael@0: if(((length+1)/2)>number) { michael@0: return TRUE; michael@0: } michael@0: michael@0: /* check if s does not even contain enough UChars */ michael@0: maxSupplementary=length-number; michael@0: if(maxSupplementary<=0) { michael@0: return FALSE; michael@0: } michael@0: /* there are maxSupplementary=length-number more UChars than asked-for code points */ michael@0: michael@0: /* michael@0: * count code points until they exceed and also check that there are michael@0: * no more than maxSupplementary supplementary code points (UChar pairs) michael@0: */ michael@0: limit=s+length; michael@0: for(;;) { michael@0: if(s==limit) { michael@0: return FALSE; michael@0: } michael@0: if(number==0) { michael@0: return TRUE; michael@0: } michael@0: if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) { michael@0: ++s; michael@0: if(--maxSupplementary<=0) { michael@0: /* too many pairs - too few code points */ michael@0: return FALSE; michael@0: } michael@0: } michael@0: --number; michael@0: } michael@0: } michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_memcpy(UChar *dest, const UChar *src, int32_t count) { michael@0: if(count > 0) { michael@0: uprv_memcpy(dest, src, count*U_SIZEOF_UCHAR); michael@0: } michael@0: return dest; michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_memmove(UChar *dest, const UChar *src, int32_t count) { michael@0: if(count > 0) { michael@0: uprv_memmove(dest, src, count*U_SIZEOF_UCHAR); michael@0: } michael@0: return dest; michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_memset(UChar *dest, UChar c, int32_t count) { michael@0: if(count > 0) { michael@0: UChar *ptr = dest; michael@0: UChar *limit = dest + count; michael@0: michael@0: while (ptr < limit) { michael@0: *(ptr++) = c; michael@0: } michael@0: } michael@0: return dest; michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) { michael@0: if(count > 0) { michael@0: const UChar *limit = buf1 + count; michael@0: int32_t result; michael@0: michael@0: while (buf1 < limit) { michael@0: result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2; michael@0: if (result != 0) { michael@0: return result; michael@0: } michael@0: buf1++; michael@0: buf2++; michael@0: } michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) { michael@0: return uprv_strCompare(s1, count, s2, count, FALSE, TRUE); michael@0: } michael@0: michael@0: /* u_unescape & support fns ------------------------------------------------- */ michael@0: michael@0: /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ michael@0: static const UChar UNESCAPE_MAP[] = { michael@0: /*" 0x22, 0x22 */ michael@0: /*' 0x27, 0x27 */ michael@0: /*? 0x3F, 0x3F */ michael@0: /*\ 0x5C, 0x5C */ michael@0: /*a*/ 0x61, 0x07, michael@0: /*b*/ 0x62, 0x08, michael@0: /*e*/ 0x65, 0x1b, michael@0: /*f*/ 0x66, 0x0c, michael@0: /*n*/ 0x6E, 0x0a, michael@0: /*r*/ 0x72, 0x0d, michael@0: /*t*/ 0x74, 0x09, michael@0: /*v*/ 0x76, 0x0b michael@0: }; michael@0: enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) }; michael@0: michael@0: /* Convert one octal digit to a numeric value 0..7, or -1 on failure */ michael@0: static int8_t _digit8(UChar c) { michael@0: if (c >= 0x0030 && c <= 0x0037) { michael@0: return (int8_t)(c - 0x0030); michael@0: } michael@0: return -1; michael@0: } michael@0: michael@0: /* Convert one hex digit to a numeric value 0..F, or -1 on failure */ michael@0: static int8_t _digit16(UChar c) { michael@0: if (c >= 0x0030 && c <= 0x0039) { michael@0: return (int8_t)(c - 0x0030); michael@0: } michael@0: if (c >= 0x0041 && c <= 0x0046) { michael@0: return (int8_t)(c - (0x0041 - 10)); michael@0: } michael@0: if (c >= 0x0061 && c <= 0x0066) { michael@0: return (int8_t)(c - (0x0061 - 10)); michael@0: } michael@0: return -1; michael@0: } michael@0: michael@0: /* Parse a single escape sequence. Although this method deals in michael@0: * UChars, it does not use C++ or UnicodeString. This allows it to michael@0: * be used from C contexts. */ michael@0: U_CAPI UChar32 U_EXPORT2 michael@0: u_unescapeAt(UNESCAPE_CHAR_AT charAt, michael@0: int32_t *offset, michael@0: int32_t length, michael@0: void *context) { michael@0: michael@0: int32_t start = *offset; michael@0: UChar c; michael@0: UChar32 result = 0; michael@0: int8_t n = 0; michael@0: int8_t minDig = 0; michael@0: int8_t maxDig = 0; michael@0: int8_t bitsPerDigit = 4; michael@0: int8_t dig; michael@0: int32_t i; michael@0: UBool braces = FALSE; michael@0: michael@0: /* Check that offset is in range */ michael@0: if (*offset < 0 || *offset >= length) { michael@0: goto err; michael@0: } michael@0: michael@0: /* Fetch first UChar after '\\' */ michael@0: c = charAt((*offset)++, context); michael@0: michael@0: /* Convert hexadecimal and octal escapes */ michael@0: switch (c) { michael@0: case 0x0075 /*'u'*/: michael@0: minDig = maxDig = 4; michael@0: break; michael@0: case 0x0055 /*'U'*/: michael@0: minDig = maxDig = 8; michael@0: break; michael@0: case 0x0078 /*'x'*/: michael@0: minDig = 1; michael@0: if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) { michael@0: ++(*offset); michael@0: braces = TRUE; michael@0: maxDig = 8; michael@0: } else { michael@0: maxDig = 2; michael@0: } michael@0: break; michael@0: default: michael@0: dig = _digit8(c); michael@0: if (dig >= 0) { michael@0: minDig = 1; michael@0: maxDig = 3; michael@0: n = 1; /* Already have first octal digit */ michael@0: bitsPerDigit = 3; michael@0: result = dig; michael@0: } michael@0: break; michael@0: } michael@0: if (minDig != 0) { michael@0: while (*offset < length && n < maxDig) { michael@0: c = charAt(*offset, context); michael@0: dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c)); michael@0: if (dig < 0) { michael@0: break; michael@0: } michael@0: result = (result << bitsPerDigit) | dig; michael@0: ++(*offset); michael@0: ++n; michael@0: } michael@0: if (n < minDig) { michael@0: goto err; michael@0: } michael@0: if (braces) { michael@0: if (c != 0x7D /*}*/) { michael@0: goto err; michael@0: } michael@0: ++(*offset); michael@0: } michael@0: if (result < 0 || result >= 0x110000) { michael@0: goto err; michael@0: } michael@0: /* If an escape sequence specifies a lead surrogate, see if michael@0: * there is a trail surrogate after it, either as an escape or michael@0: * as a literal. If so, join them up into a supplementary. michael@0: */ michael@0: if (*offset < length && U16_IS_LEAD(result)) { michael@0: int32_t ahead = *offset + 1; michael@0: c = charAt(*offset, context); michael@0: if (c == 0x5C /*'\\'*/ && ahead < length) { michael@0: c = (UChar) u_unescapeAt(charAt, &ahead, length, context); michael@0: } michael@0: if (U16_IS_TRAIL(c)) { michael@0: *offset = ahead; michael@0: result = U16_GET_SUPPLEMENTARY(result, c); michael@0: } michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: /* Convert C-style escapes in table */ michael@0: for (i=0; i destCapacity) { michael@0: srcLen = destCapacity; michael@0: } michael@0: u_charsToUChars(src, dest, srcLen); michael@0: } michael@0: michael@0: /* Do an invariant conversion of char* -> UChar*, with escape parsing */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_unescape(const char *src, UChar *dest, int32_t destCapacity) { michael@0: const char *segment = src; michael@0: int32_t i = 0; michael@0: char c; michael@0: michael@0: while ((c=*src) != 0) { michael@0: /* '\\' intentionally written as compiler-specific michael@0: * character constant to correspond to compiler-specific michael@0: * char* constants. */ michael@0: if (c == '\\') { michael@0: int32_t lenParsed = 0; michael@0: UChar32 c32; michael@0: if (src != segment) { michael@0: if (dest != NULL) { michael@0: _appendUChars(dest + i, destCapacity - i, michael@0: segment, (int32_t)(src - segment)); michael@0: } michael@0: i += (int32_t)(src - segment); michael@0: } michael@0: ++src; /* advance past '\\' */ michael@0: c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src); michael@0: if (lenParsed == 0) { michael@0: goto err; michael@0: } michael@0: src += lenParsed; /* advance past escape seq. */ michael@0: if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) { michael@0: U16_APPEND_UNSAFE(dest, i, c32); michael@0: } else { michael@0: i += U16_LENGTH(c32); michael@0: } michael@0: segment = src; michael@0: } else { michael@0: ++src; michael@0: } michael@0: } michael@0: if (src != segment) { michael@0: if (dest != NULL) { michael@0: _appendUChars(dest + i, destCapacity - i, michael@0: segment, (int32_t)(src - segment)); michael@0: } michael@0: i += (int32_t)(src - segment); michael@0: } michael@0: if (dest != NULL && i < destCapacity) { michael@0: dest[i] = 0; michael@0: } michael@0: return i; michael@0: michael@0: err: michael@0: if (dest != NULL && destCapacity > 0) { michael@0: *dest = 0; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: /* NUL-termination of strings ----------------------------------------------- */ michael@0: michael@0: /** michael@0: * NUL-terminate a string no matter what its type. michael@0: * Set warning and error codes accordingly. michael@0: */ michael@0: #define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \ michael@0: if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) { \ michael@0: /* not a public function, so no complete argument checking */ \ michael@0: \ michael@0: if(length<0) { \ michael@0: /* assume that the caller handles this */ \ michael@0: } else if(lengthdestCapacity */ { \ michael@0: /* even the string itself did not fit - set an error code */ \ michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; \ michael@0: } \ michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { michael@0: __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); michael@0: return length; michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { michael@0: __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); michael@0: return length; michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { michael@0: __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); michael@0: return length; michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { michael@0: __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); michael@0: return length; michael@0: } michael@0: michael@0: // Compute the hash code for a string -------------------------------------- *** michael@0: michael@0: // Moved here from uhash.c so that UnicodeString::hashCode() does not depend michael@0: // on UHashtable code. michael@0: michael@0: /* michael@0: Compute the hash by iterating sparsely over about 32 (up to 63) michael@0: characters spaced evenly through the string. For each character, michael@0: multiply the previous hash value by a prime number and add the new michael@0: character in, like a linear congruential random number generator, michael@0: producing a pseudorandom deterministic value well distributed over michael@0: the output range. [LIU] michael@0: */ michael@0: michael@0: #define STRING_HASH(TYPE, STR, STRLEN, DEREF) \ michael@0: int32_t hash = 0; \ michael@0: const TYPE *p = (const TYPE*) STR; \ michael@0: if (p != NULL) { \ michael@0: int32_t len = (int32_t)(STRLEN); \ michael@0: int32_t inc = ((len - 32) / 32) + 1; \ michael@0: const TYPE *limit = p + len; \ michael@0: while (p