1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/util.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,409 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (c) 2001-2011, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* Date Name Description 1.10 +* 11/19/2001 aliu Creation. 1.11 +********************************************************************** 1.12 +*/ 1.13 + 1.14 +#include "unicode/unimatch.h" 1.15 +#include "unicode/utf16.h" 1.16 +#include "patternprops.h" 1.17 +#include "util.h" 1.18 + 1.19 +// Define UChar constants using hex for EBCDIC compatibility 1.20 + 1.21 +static const UChar BACKSLASH = 0x005C; /*\*/ 1.22 +static const UChar UPPER_U = 0x0055; /*U*/ 1.23 +static const UChar LOWER_U = 0x0075; /*u*/ 1.24 +static const UChar APOSTROPHE = 0x0027; // '\'' 1.25 +static const UChar SPACE = 0x0020; // ' ' 1.26 + 1.27 +// "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 1.28 +static const UChar DIGITS[] = { 1.29 + 48,49,50,51,52,53,54,55,56,57, 1.30 + 65,66,67,68,69,70,71,72,73,74, 1.31 + 75,76,77,78,79,80,81,82,83,84, 1.32 + 85,86,87,88,89,90 1.33 +}; 1.34 + 1.35 +U_NAMESPACE_BEGIN 1.36 + 1.37 +UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n, 1.38 + int32_t radix, int32_t minDigits) { 1.39 + if (radix < 2 || radix > 36) { 1.40 + // Bogus radix 1.41 + return result.append((UChar)63/*?*/); 1.42 + } 1.43 + // Handle negatives 1.44 + if (n < 0) { 1.45 + n = -n; 1.46 + result.append((UChar)45/*-*/); 1.47 + } 1.48 + // First determine the number of digits 1.49 + int32_t nn = n; 1.50 + int32_t r = 1; 1.51 + while (nn >= radix) { 1.52 + nn /= radix; 1.53 + r *= radix; 1.54 + --minDigits; 1.55 + } 1.56 + // Now generate the digits 1.57 + while (--minDigits > 0) { 1.58 + result.append(DIGITS[0]); 1.59 + } 1.60 + while (r > 0) { 1.61 + int32_t digit = n / r; 1.62 + result.append(DIGITS[digit]); 1.63 + n -= digit * r; 1.64 + r /= radix; 1.65 + } 1.66 + return result; 1.67 +} 1.68 + 1.69 +/** 1.70 + * Return true if the character is NOT printable ASCII. 1.71 + */ 1.72 +UBool ICU_Utility::isUnprintable(UChar32 c) { 1.73 + return !(c >= 0x20 && c <= 0x7E); 1.74 +} 1.75 + 1.76 +/** 1.77 + * Escape unprintable characters using \uxxxx notation for U+0000 to 1.78 + * U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is 1.79 + * printable ASCII, then do nothing and return FALSE. Otherwise, 1.80 + * append the escaped notation and return TRUE. 1.81 + */ 1.82 +UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) { 1.83 + if (isUnprintable(c)) { 1.84 + result.append(BACKSLASH); 1.85 + if (c & ~0xFFFF) { 1.86 + result.append(UPPER_U); 1.87 + result.append(DIGITS[0xF&(c>>28)]); 1.88 + result.append(DIGITS[0xF&(c>>24)]); 1.89 + result.append(DIGITS[0xF&(c>>20)]); 1.90 + result.append(DIGITS[0xF&(c>>16)]); 1.91 + } else { 1.92 + result.append(LOWER_U); 1.93 + } 1.94 + result.append(DIGITS[0xF&(c>>12)]); 1.95 + result.append(DIGITS[0xF&(c>>8)]); 1.96 + result.append(DIGITS[0xF&(c>>4)]); 1.97 + result.append(DIGITS[0xF&c]); 1.98 + return TRUE; 1.99 + } 1.100 + return FALSE; 1.101 +} 1.102 + 1.103 +/** 1.104 + * Returns the index of a character, ignoring quoted text. 1.105 + * For example, in the string "abc'hide'h", the 'h' in "hide" will not be 1.106 + * found by a search for 'h'. 1.107 + */ 1.108 +// FOR FUTURE USE. DISABLE FOR NOW for coverage reasons. 1.109 +/* 1.110 +int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text, 1.111 + int32_t start, int32_t limit, 1.112 + UChar charToFind) { 1.113 + for (int32_t i=start; i<limit; ++i) { 1.114 + UChar c = text.charAt(i); 1.115 + if (c == BACKSLASH) { 1.116 + ++i; 1.117 + } else if (c == APOSTROPHE) { 1.118 + while (++i < limit 1.119 + && text.charAt(i) != APOSTROPHE) {} 1.120 + } else if (c == charToFind) { 1.121 + return i; 1.122 + } 1.123 + } 1.124 + return -1; 1.125 +} 1.126 +*/ 1.127 + 1.128 +/** 1.129 + * Skip over a sequence of zero or more white space characters at pos. 1.130 + * @param advance if true, advance pos to the first non-white-space 1.131 + * character at or after pos, or str.length(), if there is none. 1.132 + * Otherwise leave pos unchanged. 1.133 + * @return the index of the first non-white-space character at or 1.134 + * after pos, or str.length(), if there is none. 1.135 + */ 1.136 +int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos, 1.137 + UBool advance) { 1.138 + int32_t p = pos; 1.139 + const UChar* s = str.getBuffer(); 1.140 + p = (int32_t)(PatternProps::skipWhiteSpace(s + p, str.length() - p) - s); 1.141 + if (advance) { 1.142 + pos = p; 1.143 + } 1.144 + return p; 1.145 +} 1.146 + 1.147 +/** 1.148 + * Skip over Pattern_White_Space in a Replaceable. 1.149 + * Skipping may be done in the forward or 1.150 + * reverse direction. In either case, the leftmost index will be 1.151 + * inclusive, and the rightmost index will be exclusive. That is, 1.152 + * given a range defined as [start, limit), the call 1.153 + * skipWhitespace(text, start, limit) will advance start past leading 1.154 + * whitespace, whereas the call skipWhitespace(text, limit, start), 1.155 + * will back up limit past trailing whitespace. 1.156 + * @param text the text to be analyzed 1.157 + * @param pos either the start or limit of a range of 'text', to skip 1.158 + * leading or trailing whitespace, respectively 1.159 + * @param stop either the limit or start of a range of 'text', to skip 1.160 + * leading or trailing whitespace, respectively 1.161 + * @return the new start or limit, depending on what was passed in to 1.162 + * 'pos' 1.163 + */ 1.164 +//?FOR FUTURE USE. DISABLE FOR NOW for coverage reasons. 1.165 +//?int32_t ICU_Utility::skipWhitespace(const Replaceable& text, 1.166 +//? int32_t pos, int32_t stop) { 1.167 +//? UChar32 c; 1.168 +//? UBool isForward = (stop >= pos); 1.169 +//? 1.170 +//? if (!isForward) { 1.171 +//? --pos; // pos is a limit, so back up by one 1.172 +//? } 1.173 +//? 1.174 +//? while (pos != stop && 1.175 +//? PatternProps::isWhiteSpace(c = text.char32At(pos))) { 1.176 +//? if (isForward) { 1.177 +//? pos += U16_LENGTH(c); 1.178 +//? } else { 1.179 +//? pos -= U16_LENGTH(c); 1.180 +//? } 1.181 +//? } 1.182 +//? 1.183 +//? if (!isForward) { 1.184 +//? ++pos; // make pos back into a limit 1.185 +//? } 1.186 +//? 1.187 +//? return pos; 1.188 +//?} 1.189 + 1.190 +/** 1.191 + * Parse a single non-whitespace character 'ch', optionally 1.192 + * preceded by whitespace. 1.193 + * @param id the string to be parsed 1.194 + * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the 1.195 + * offset of the first character to be parsed. On output, pos[0] 1.196 + * is the index after the last parsed character. If the parse 1.197 + * fails, pos[0] will be unchanged. 1.198 + * @param ch the non-whitespace character to be parsed. 1.199 + * @return true if 'ch' is seen preceded by zero or more 1.200 + * whitespace characters. 1.201 + */ 1.202 +UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) { 1.203 + int32_t start = pos; 1.204 + skipWhitespace(id, pos, TRUE); 1.205 + if (pos == id.length() || 1.206 + id.charAt(pos) != ch) { 1.207 + pos = start; 1.208 + return FALSE; 1.209 + } 1.210 + ++pos; 1.211 + return TRUE; 1.212 +} 1.213 + 1.214 +/** 1.215 + * Parse a pattern string within the given Replaceable and a parsing 1.216 + * pattern. Characters are matched literally and case-sensitively 1.217 + * except for the following special characters: 1.218 + * 1.219 + * ~ zero or more Pattern_White_Space chars 1.220 + * 1.221 + * If end of pattern is reached with all matches along the way, 1.222 + * pos is advanced to the first unparsed index and returned. 1.223 + * Otherwise -1 is returned. 1.224 + * @param pat pattern that controls parsing 1.225 + * @param text text to be parsed, starting at index 1.226 + * @param index offset to first character to parse 1.227 + * @param limit offset after last character to parse 1.228 + * @return index after last parsed character, or -1 on parse failure. 1.229 + */ 1.230 +int32_t ICU_Utility::parsePattern(const UnicodeString& pat, 1.231 + const Replaceable& text, 1.232 + int32_t index, 1.233 + int32_t limit) { 1.234 + int32_t ipat = 0; 1.235 + 1.236 + // empty pattern matches immediately 1.237 + if (ipat == pat.length()) { 1.238 + return index; 1.239 + } 1.240 + 1.241 + UChar32 cpat = pat.char32At(ipat); 1.242 + 1.243 + while (index < limit) { 1.244 + UChar32 c = text.char32At(index); 1.245 + 1.246 + // parse \s* 1.247 + if (cpat == 126 /*~*/) { 1.248 + if (PatternProps::isWhiteSpace(c)) { 1.249 + index += U16_LENGTH(c); 1.250 + continue; 1.251 + } else { 1.252 + if (++ipat == pat.length()) { 1.253 + return index; // success; c unparsed 1.254 + } 1.255 + // fall thru; process c again with next cpat 1.256 + } 1.257 + } 1.258 + 1.259 + // parse literal 1.260 + else if (c == cpat) { 1.261 + index += U16_LENGTH(c); 1.262 + ipat += U16_LENGTH(cpat); 1.263 + if (ipat == pat.length()) { 1.264 + return index; // success; c parsed 1.265 + } 1.266 + // fall thru; get next cpat 1.267 + } 1.268 + 1.269 + // match failure of literal 1.270 + else { 1.271 + return -1; 1.272 + } 1.273 + 1.274 + cpat = pat.char32At(ipat); 1.275 + } 1.276 + 1.277 + return -1; // text ended before end of pat 1.278 +} 1.279 + 1.280 +/** 1.281 + * Append a character to a rule that is being built up. To flush 1.282 + * the quoteBuf to rule, make one final call with isLiteral == TRUE. 1.283 + * If there is no final character, pass in (UChar32)-1 as c. 1.284 + * @param rule the string to append the character to 1.285 + * @param c the character to append, or (UChar32)-1 if none. 1.286 + * @param isLiteral if true, then the given character should not be 1.287 + * quoted or escaped. Usually this means it is a syntactic element 1.288 + * such as > or $ 1.289 + * @param escapeUnprintable if true, then unprintable characters 1.290 + * should be escaped using \uxxxx or \Uxxxxxxxx. These escapes will 1.291 + * appear outside of quotes. 1.292 + * @param quoteBuf a buffer which is used to build up quoted 1.293 + * substrings. The caller should initially supply an empty buffer, 1.294 + * and thereafter should not modify the buffer. The buffer should be 1.295 + * cleared out by, at the end, calling this method with a literal 1.296 + * character. 1.297 + */ 1.298 +void ICU_Utility::appendToRule(UnicodeString& rule, 1.299 + UChar32 c, 1.300 + UBool isLiteral, 1.301 + UBool escapeUnprintable, 1.302 + UnicodeString& quoteBuf) { 1.303 + // If we are escaping unprintables, then escape them outside 1.304 + // quotes. \u and \U are not recognized within quotes. The same 1.305 + // logic applies to literals, but literals are never escaped. 1.306 + if (isLiteral || 1.307 + (escapeUnprintable && ICU_Utility::isUnprintable(c))) { 1.308 + if (quoteBuf.length() > 0) { 1.309 + // We prefer backslash APOSTROPHE to double APOSTROPHE 1.310 + // (more readable, less similar to ") so if there are 1.311 + // double APOSTROPHEs at the ends, we pull them outside 1.312 + // of the quote. 1.313 + 1.314 + // If the first thing in the quoteBuf is APOSTROPHE 1.315 + // (doubled) then pull it out. 1.316 + while (quoteBuf.length() >= 2 && 1.317 + quoteBuf.charAt(0) == APOSTROPHE && 1.318 + quoteBuf.charAt(1) == APOSTROPHE) { 1.319 + rule.append(BACKSLASH).append(APOSTROPHE); 1.320 + quoteBuf.remove(0, 2); 1.321 + } 1.322 + // If the last thing in the quoteBuf is APOSTROPHE 1.323 + // (doubled) then remove and count it and add it after. 1.324 + int32_t trailingCount = 0; 1.325 + while (quoteBuf.length() >= 2 && 1.326 + quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE && 1.327 + quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) { 1.328 + quoteBuf.truncate(quoteBuf.length()-2); 1.329 + ++trailingCount; 1.330 + } 1.331 + if (quoteBuf.length() > 0) { 1.332 + rule.append(APOSTROPHE); 1.333 + rule.append(quoteBuf); 1.334 + rule.append(APOSTROPHE); 1.335 + quoteBuf.truncate(0); 1.336 + } 1.337 + while (trailingCount-- > 0) { 1.338 + rule.append(BACKSLASH).append(APOSTROPHE); 1.339 + } 1.340 + } 1.341 + if (c != (UChar32)-1) { 1.342 + /* Since spaces are ignored during parsing, they are 1.343 + * emitted only for readability. We emit one here 1.344 + * only if there isn't already one at the end of the 1.345 + * rule. 1.346 + */ 1.347 + if (c == SPACE) { 1.348 + int32_t len = rule.length(); 1.349 + if (len > 0 && rule.charAt(len-1) != c) { 1.350 + rule.append(c); 1.351 + } 1.352 + } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) { 1.353 + rule.append(c); 1.354 + } 1.355 + } 1.356 + } 1.357 + 1.358 + // Escape ' and '\' and don't begin a quote just for them 1.359 + else if (quoteBuf.length() == 0 && 1.360 + (c == APOSTROPHE || c == BACKSLASH)) { 1.361 + rule.append(BACKSLASH); 1.362 + rule.append(c); 1.363 + } 1.364 + 1.365 + // Specials (printable ascii that isn't [0-9a-zA-Z]) and 1.366 + // whitespace need quoting. Also append stuff to quotes if we are 1.367 + // building up a quoted substring already. 1.368 + else if (quoteBuf.length() > 0 || 1.369 + (c >= 0x0021 && c <= 0x007E && 1.370 + !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || 1.371 + (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || 1.372 + (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) || 1.373 + PatternProps::isWhiteSpace(c)) { 1.374 + quoteBuf.append(c); 1.375 + // Double ' within a quote 1.376 + if (c == APOSTROPHE) { 1.377 + quoteBuf.append(c); 1.378 + } 1.379 + } 1.380 + 1.381 + // Otherwise just append 1.382 + else { 1.383 + rule.append(c); 1.384 + } 1.385 +} 1.386 + 1.387 +void ICU_Utility::appendToRule(UnicodeString& rule, 1.388 + const UnicodeString& text, 1.389 + UBool isLiteral, 1.390 + UBool escapeUnprintable, 1.391 + UnicodeString& quoteBuf) { 1.392 + for (int32_t i=0; i<text.length(); ++i) { 1.393 + appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf); 1.394 + } 1.395 +} 1.396 + 1.397 +/** 1.398 + * Given a matcher reference, which may be null, append its 1.399 + * pattern as a literal to the given rule. 1.400 + */ 1.401 +void ICU_Utility::appendToRule(UnicodeString& rule, 1.402 + const UnicodeMatcher* matcher, 1.403 + UBool escapeUnprintable, 1.404 + UnicodeString& quoteBuf) { 1.405 + if (matcher != NULL) { 1.406 + UnicodeString pat; 1.407 + appendToRule(rule, matcher->toPattern(pat, escapeUnprintable), 1.408 + TRUE, escapeUnprintable, quoteBuf); 1.409 + } 1.410 +} 1.411 + 1.412 +U_NAMESPACE_END