intl/icu/source/common/util.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (c) 2001-2011, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * Date Name Description
michael@0 7 * 11/19/2001 aliu Creation.
michael@0 8 **********************************************************************
michael@0 9 */
michael@0 10
michael@0 11 #include "unicode/unimatch.h"
michael@0 12 #include "unicode/utf16.h"
michael@0 13 #include "patternprops.h"
michael@0 14 #include "util.h"
michael@0 15
michael@0 16 // Define UChar constants using hex for EBCDIC compatibility
michael@0 17
michael@0 18 static const UChar BACKSLASH = 0x005C; /*\*/
michael@0 19 static const UChar UPPER_U = 0x0055; /*U*/
michael@0 20 static const UChar LOWER_U = 0x0075; /*u*/
michael@0 21 static const UChar APOSTROPHE = 0x0027; // '\''
michael@0 22 static const UChar SPACE = 0x0020; // ' '
michael@0 23
michael@0 24 // "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
michael@0 25 static const UChar DIGITS[] = {
michael@0 26 48,49,50,51,52,53,54,55,56,57,
michael@0 27 65,66,67,68,69,70,71,72,73,74,
michael@0 28 75,76,77,78,79,80,81,82,83,84,
michael@0 29 85,86,87,88,89,90
michael@0 30 };
michael@0 31
michael@0 32 U_NAMESPACE_BEGIN
michael@0 33
michael@0 34 UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
michael@0 35 int32_t radix, int32_t minDigits) {
michael@0 36 if (radix < 2 || radix > 36) {
michael@0 37 // Bogus radix
michael@0 38 return result.append((UChar)63/*?*/);
michael@0 39 }
michael@0 40 // Handle negatives
michael@0 41 if (n < 0) {
michael@0 42 n = -n;
michael@0 43 result.append((UChar)45/*-*/);
michael@0 44 }
michael@0 45 // First determine the number of digits
michael@0 46 int32_t nn = n;
michael@0 47 int32_t r = 1;
michael@0 48 while (nn >= radix) {
michael@0 49 nn /= radix;
michael@0 50 r *= radix;
michael@0 51 --minDigits;
michael@0 52 }
michael@0 53 // Now generate the digits
michael@0 54 while (--minDigits > 0) {
michael@0 55 result.append(DIGITS[0]);
michael@0 56 }
michael@0 57 while (r > 0) {
michael@0 58 int32_t digit = n / r;
michael@0 59 result.append(DIGITS[digit]);
michael@0 60 n -= digit * r;
michael@0 61 r /= radix;
michael@0 62 }
michael@0 63 return result;
michael@0 64 }
michael@0 65
michael@0 66 /**
michael@0 67 * Return true if the character is NOT printable ASCII.
michael@0 68 */
michael@0 69 UBool ICU_Utility::isUnprintable(UChar32 c) {
michael@0 70 return !(c >= 0x20 && c <= 0x7E);
michael@0 71 }
michael@0 72
michael@0 73 /**
michael@0 74 * Escape unprintable characters using \uxxxx notation for U+0000 to
michael@0 75 * U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is
michael@0 76 * printable ASCII, then do nothing and return FALSE. Otherwise,
michael@0 77 * append the escaped notation and return TRUE.
michael@0 78 */
michael@0 79 UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
michael@0 80 if (isUnprintable(c)) {
michael@0 81 result.append(BACKSLASH);
michael@0 82 if (c & ~0xFFFF) {
michael@0 83 result.append(UPPER_U);
michael@0 84 result.append(DIGITS[0xF&(c>>28)]);
michael@0 85 result.append(DIGITS[0xF&(c>>24)]);
michael@0 86 result.append(DIGITS[0xF&(c>>20)]);
michael@0 87 result.append(DIGITS[0xF&(c>>16)]);
michael@0 88 } else {
michael@0 89 result.append(LOWER_U);
michael@0 90 }
michael@0 91 result.append(DIGITS[0xF&(c>>12)]);
michael@0 92 result.append(DIGITS[0xF&(c>>8)]);
michael@0 93 result.append(DIGITS[0xF&(c>>4)]);
michael@0 94 result.append(DIGITS[0xF&c]);
michael@0 95 return TRUE;
michael@0 96 }
michael@0 97 return FALSE;
michael@0 98 }
michael@0 99
michael@0 100 /**
michael@0 101 * Returns the index of a character, ignoring quoted text.
michael@0 102 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
michael@0 103 * found by a search for 'h'.
michael@0 104 */
michael@0 105 // FOR FUTURE USE. DISABLE FOR NOW for coverage reasons.
michael@0 106 /*
michael@0 107 int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,
michael@0 108 int32_t start, int32_t limit,
michael@0 109 UChar charToFind) {
michael@0 110 for (int32_t i=start; i<limit; ++i) {
michael@0 111 UChar c = text.charAt(i);
michael@0 112 if (c == BACKSLASH) {
michael@0 113 ++i;
michael@0 114 } else if (c == APOSTROPHE) {
michael@0 115 while (++i < limit
michael@0 116 && text.charAt(i) != APOSTROPHE) {}
michael@0 117 } else if (c == charToFind) {
michael@0 118 return i;
michael@0 119 }
michael@0 120 }
michael@0 121 return -1;
michael@0 122 }
michael@0 123 */
michael@0 124
michael@0 125 /**
michael@0 126 * Skip over a sequence of zero or more white space characters at pos.
michael@0 127 * @param advance if true, advance pos to the first non-white-space
michael@0 128 * character at or after pos, or str.length(), if there is none.
michael@0 129 * Otherwise leave pos unchanged.
michael@0 130 * @return the index of the first non-white-space character at or
michael@0 131 * after pos, or str.length(), if there is none.
michael@0 132 */
michael@0 133 int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
michael@0 134 UBool advance) {
michael@0 135 int32_t p = pos;
michael@0 136 const UChar* s = str.getBuffer();
michael@0 137 p = (int32_t)(PatternProps::skipWhiteSpace(s + p, str.length() - p) - s);
michael@0 138 if (advance) {
michael@0 139 pos = p;
michael@0 140 }
michael@0 141 return p;
michael@0 142 }
michael@0 143
michael@0 144 /**
michael@0 145 * Skip over Pattern_White_Space in a Replaceable.
michael@0 146 * Skipping may be done in the forward or
michael@0 147 * reverse direction. In either case, the leftmost index will be
michael@0 148 * inclusive, and the rightmost index will be exclusive. That is,
michael@0 149 * given a range defined as [start, limit), the call
michael@0 150 * skipWhitespace(text, start, limit) will advance start past leading
michael@0 151 * whitespace, whereas the call skipWhitespace(text, limit, start),
michael@0 152 * will back up limit past trailing whitespace.
michael@0 153 * @param text the text to be analyzed
michael@0 154 * @param pos either the start or limit of a range of 'text', to skip
michael@0 155 * leading or trailing whitespace, respectively
michael@0 156 * @param stop either the limit or start of a range of 'text', to skip
michael@0 157 * leading or trailing whitespace, respectively
michael@0 158 * @return the new start or limit, depending on what was passed in to
michael@0 159 * 'pos'
michael@0 160 */
michael@0 161 //?FOR FUTURE USE. DISABLE FOR NOW for coverage reasons.
michael@0 162 //?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,
michael@0 163 //? int32_t pos, int32_t stop) {
michael@0 164 //? UChar32 c;
michael@0 165 //? UBool isForward = (stop >= pos);
michael@0 166 //?
michael@0 167 //? if (!isForward) {
michael@0 168 //? --pos; // pos is a limit, so back up by one
michael@0 169 //? }
michael@0 170 //?
michael@0 171 //? while (pos != stop &&
michael@0 172 //? PatternProps::isWhiteSpace(c = text.char32At(pos))) {
michael@0 173 //? if (isForward) {
michael@0 174 //? pos += U16_LENGTH(c);
michael@0 175 //? } else {
michael@0 176 //? pos -= U16_LENGTH(c);
michael@0 177 //? }
michael@0 178 //? }
michael@0 179 //?
michael@0 180 //? if (!isForward) {
michael@0 181 //? ++pos; // make pos back into a limit
michael@0 182 //? }
michael@0 183 //?
michael@0 184 //? return pos;
michael@0 185 //?}
michael@0 186
michael@0 187 /**
michael@0 188 * Parse a single non-whitespace character 'ch', optionally
michael@0 189 * preceded by whitespace.
michael@0 190 * @param id the string to be parsed
michael@0 191 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the
michael@0 192 * offset of the first character to be parsed. On output, pos[0]
michael@0 193 * is the index after the last parsed character. If the parse
michael@0 194 * fails, pos[0] will be unchanged.
michael@0 195 * @param ch the non-whitespace character to be parsed.
michael@0 196 * @return true if 'ch' is seen preceded by zero or more
michael@0 197 * whitespace characters.
michael@0 198 */
michael@0 199 UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {
michael@0 200 int32_t start = pos;
michael@0 201 skipWhitespace(id, pos, TRUE);
michael@0 202 if (pos == id.length() ||
michael@0 203 id.charAt(pos) != ch) {
michael@0 204 pos = start;
michael@0 205 return FALSE;
michael@0 206 }
michael@0 207 ++pos;
michael@0 208 return TRUE;
michael@0 209 }
michael@0 210
michael@0 211 /**
michael@0 212 * Parse a pattern string within the given Replaceable and a parsing
michael@0 213 * pattern. Characters are matched literally and case-sensitively
michael@0 214 * except for the following special characters:
michael@0 215 *
michael@0 216 * ~ zero or more Pattern_White_Space chars
michael@0 217 *
michael@0 218 * If end of pattern is reached with all matches along the way,
michael@0 219 * pos is advanced to the first unparsed index and returned.
michael@0 220 * Otherwise -1 is returned.
michael@0 221 * @param pat pattern that controls parsing
michael@0 222 * @param text text to be parsed, starting at index
michael@0 223 * @param index offset to first character to parse
michael@0 224 * @param limit offset after last character to parse
michael@0 225 * @return index after last parsed character, or -1 on parse failure.
michael@0 226 */
michael@0 227 int32_t ICU_Utility::parsePattern(const UnicodeString& pat,
michael@0 228 const Replaceable& text,
michael@0 229 int32_t index,
michael@0 230 int32_t limit) {
michael@0 231 int32_t ipat = 0;
michael@0 232
michael@0 233 // empty pattern matches immediately
michael@0 234 if (ipat == pat.length()) {
michael@0 235 return index;
michael@0 236 }
michael@0 237
michael@0 238 UChar32 cpat = pat.char32At(ipat);
michael@0 239
michael@0 240 while (index < limit) {
michael@0 241 UChar32 c = text.char32At(index);
michael@0 242
michael@0 243 // parse \s*
michael@0 244 if (cpat == 126 /*~*/) {
michael@0 245 if (PatternProps::isWhiteSpace(c)) {
michael@0 246 index += U16_LENGTH(c);
michael@0 247 continue;
michael@0 248 } else {
michael@0 249 if (++ipat == pat.length()) {
michael@0 250 return index; // success; c unparsed
michael@0 251 }
michael@0 252 // fall thru; process c again with next cpat
michael@0 253 }
michael@0 254 }
michael@0 255
michael@0 256 // parse literal
michael@0 257 else if (c == cpat) {
michael@0 258 index += U16_LENGTH(c);
michael@0 259 ipat += U16_LENGTH(cpat);
michael@0 260 if (ipat == pat.length()) {
michael@0 261 return index; // success; c parsed
michael@0 262 }
michael@0 263 // fall thru; get next cpat
michael@0 264 }
michael@0 265
michael@0 266 // match failure of literal
michael@0 267 else {
michael@0 268 return -1;
michael@0 269 }
michael@0 270
michael@0 271 cpat = pat.char32At(ipat);
michael@0 272 }
michael@0 273
michael@0 274 return -1; // text ended before end of pat
michael@0 275 }
michael@0 276
michael@0 277 /**
michael@0 278 * Append a character to a rule that is being built up. To flush
michael@0 279 * the quoteBuf to rule, make one final call with isLiteral == TRUE.
michael@0 280 * If there is no final character, pass in (UChar32)-1 as c.
michael@0 281 * @param rule the string to append the character to
michael@0 282 * @param c the character to append, or (UChar32)-1 if none.
michael@0 283 * @param isLiteral if true, then the given character should not be
michael@0 284 * quoted or escaped. Usually this means it is a syntactic element
michael@0 285 * such as > or $
michael@0 286 * @param escapeUnprintable if true, then unprintable characters
michael@0 287 * should be escaped using \uxxxx or \Uxxxxxxxx. These escapes will
michael@0 288 * appear outside of quotes.
michael@0 289 * @param quoteBuf a buffer which is used to build up quoted
michael@0 290 * substrings. The caller should initially supply an empty buffer,
michael@0 291 * and thereafter should not modify the buffer. The buffer should be
michael@0 292 * cleared out by, at the end, calling this method with a literal
michael@0 293 * character.
michael@0 294 */
michael@0 295 void ICU_Utility::appendToRule(UnicodeString& rule,
michael@0 296 UChar32 c,
michael@0 297 UBool isLiteral,
michael@0 298 UBool escapeUnprintable,
michael@0 299 UnicodeString& quoteBuf) {
michael@0 300 // If we are escaping unprintables, then escape them outside
michael@0 301 // quotes. \u and \U are not recognized within quotes. The same
michael@0 302 // logic applies to literals, but literals are never escaped.
michael@0 303 if (isLiteral ||
michael@0 304 (escapeUnprintable && ICU_Utility::isUnprintable(c))) {
michael@0 305 if (quoteBuf.length() > 0) {
michael@0 306 // We prefer backslash APOSTROPHE to double APOSTROPHE
michael@0 307 // (more readable, less similar to ") so if there are
michael@0 308 // double APOSTROPHEs at the ends, we pull them outside
michael@0 309 // of the quote.
michael@0 310
michael@0 311 // If the first thing in the quoteBuf is APOSTROPHE
michael@0 312 // (doubled) then pull it out.
michael@0 313 while (quoteBuf.length() >= 2 &&
michael@0 314 quoteBuf.charAt(0) == APOSTROPHE &&
michael@0 315 quoteBuf.charAt(1) == APOSTROPHE) {
michael@0 316 rule.append(BACKSLASH).append(APOSTROPHE);
michael@0 317 quoteBuf.remove(0, 2);
michael@0 318 }
michael@0 319 // If the last thing in the quoteBuf is APOSTROPHE
michael@0 320 // (doubled) then remove and count it and add it after.
michael@0 321 int32_t trailingCount = 0;
michael@0 322 while (quoteBuf.length() >= 2 &&
michael@0 323 quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
michael@0 324 quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
michael@0 325 quoteBuf.truncate(quoteBuf.length()-2);
michael@0 326 ++trailingCount;
michael@0 327 }
michael@0 328 if (quoteBuf.length() > 0) {
michael@0 329 rule.append(APOSTROPHE);
michael@0 330 rule.append(quoteBuf);
michael@0 331 rule.append(APOSTROPHE);
michael@0 332 quoteBuf.truncate(0);
michael@0 333 }
michael@0 334 while (trailingCount-- > 0) {
michael@0 335 rule.append(BACKSLASH).append(APOSTROPHE);
michael@0 336 }
michael@0 337 }
michael@0 338 if (c != (UChar32)-1) {
michael@0 339 /* Since spaces are ignored during parsing, they are
michael@0 340 * emitted only for readability. We emit one here
michael@0 341 * only if there isn't already one at the end of the
michael@0 342 * rule.
michael@0 343 */
michael@0 344 if (c == SPACE) {
michael@0 345 int32_t len = rule.length();
michael@0 346 if (len > 0 && rule.charAt(len-1) != c) {
michael@0 347 rule.append(c);
michael@0 348 }
michael@0 349 } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) {
michael@0 350 rule.append(c);
michael@0 351 }
michael@0 352 }
michael@0 353 }
michael@0 354
michael@0 355 // Escape ' and '\' and don't begin a quote just for them
michael@0 356 else if (quoteBuf.length() == 0 &&
michael@0 357 (c == APOSTROPHE || c == BACKSLASH)) {
michael@0 358 rule.append(BACKSLASH);
michael@0 359 rule.append(c);
michael@0 360 }
michael@0 361
michael@0 362 // Specials (printable ascii that isn't [0-9a-zA-Z]) and
michael@0 363 // whitespace need quoting. Also append stuff to quotes if we are
michael@0 364 // building up a quoted substring already.
michael@0 365 else if (quoteBuf.length() > 0 ||
michael@0 366 (c >= 0x0021 && c <= 0x007E &&
michael@0 367 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
michael@0 368 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
michael@0 369 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
michael@0 370 PatternProps::isWhiteSpace(c)) {
michael@0 371 quoteBuf.append(c);
michael@0 372 // Double ' within a quote
michael@0 373 if (c == APOSTROPHE) {
michael@0 374 quoteBuf.append(c);
michael@0 375 }
michael@0 376 }
michael@0 377
michael@0 378 // Otherwise just append
michael@0 379 else {
michael@0 380 rule.append(c);
michael@0 381 }
michael@0 382 }
michael@0 383
michael@0 384 void ICU_Utility::appendToRule(UnicodeString& rule,
michael@0 385 const UnicodeString& text,
michael@0 386 UBool isLiteral,
michael@0 387 UBool escapeUnprintable,
michael@0 388 UnicodeString& quoteBuf) {
michael@0 389 for (int32_t i=0; i<text.length(); ++i) {
michael@0 390 appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);
michael@0 391 }
michael@0 392 }
michael@0 393
michael@0 394 /**
michael@0 395 * Given a matcher reference, which may be null, append its
michael@0 396 * pattern as a literal to the given rule.
michael@0 397 */
michael@0 398 void ICU_Utility::appendToRule(UnicodeString& rule,
michael@0 399 const UnicodeMatcher* matcher,
michael@0 400 UBool escapeUnprintable,
michael@0 401 UnicodeString& quoteBuf) {
michael@0 402 if (matcher != NULL) {
michael@0 403 UnicodeString pat;
michael@0 404 appendToRule(rule, matcher->toPattern(pat, escapeUnprintable),
michael@0 405 TRUE, escapeUnprintable, quoteBuf);
michael@0 406 }
michael@0 407 }
michael@0 408
michael@0 409 U_NAMESPACE_END

mercurial