intl/icu/source/common/util.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/util.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,409 @@
     1.4 +/*
     1.5 +**********************************************************************
     1.6 +*   Copyright (c) 2001-2011, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*   Date        Name        Description
    1.10 +*   11/19/2001  aliu        Creation.
    1.11 +**********************************************************************
    1.12 +*/
    1.13 +
    1.14 +#include "unicode/unimatch.h"
    1.15 +#include "unicode/utf16.h"
    1.16 +#include "patternprops.h"
    1.17 +#include "util.h"
    1.18 +
    1.19 +// Define UChar constants using hex for EBCDIC compatibility
    1.20 +
    1.21 +static const UChar BACKSLASH  = 0x005C; /*\*/
    1.22 +static const UChar UPPER_U    = 0x0055; /*U*/
    1.23 +static const UChar LOWER_U    = 0x0075; /*u*/
    1.24 +static const UChar APOSTROPHE = 0x0027; // '\''
    1.25 +static const UChar SPACE      = 0x0020; // ' '
    1.26 +
    1.27 +// "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    1.28 +static const UChar DIGITS[] = {
    1.29 +    48,49,50,51,52,53,54,55,56,57,
    1.30 +    65,66,67,68,69,70,71,72,73,74,
    1.31 +    75,76,77,78,79,80,81,82,83,84,
    1.32 +    85,86,87,88,89,90
    1.33 +};
    1.34 +
    1.35 +U_NAMESPACE_BEGIN
    1.36 +
    1.37 +UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
    1.38 +                                     int32_t radix, int32_t minDigits) {
    1.39 +    if (radix < 2 || radix > 36) {
    1.40 +        // Bogus radix
    1.41 +        return result.append((UChar)63/*?*/);
    1.42 +    }
    1.43 +    // Handle negatives
    1.44 +    if (n < 0) {
    1.45 +        n = -n;
    1.46 +        result.append((UChar)45/*-*/);
    1.47 +    }
    1.48 +    // First determine the number of digits
    1.49 +    int32_t nn = n;
    1.50 +    int32_t r = 1;
    1.51 +    while (nn >= radix) {
    1.52 +        nn /= radix;
    1.53 +        r *= radix;
    1.54 +        --minDigits;
    1.55 +    }
    1.56 +    // Now generate the digits
    1.57 +    while (--minDigits > 0) {
    1.58 +        result.append(DIGITS[0]);
    1.59 +    }
    1.60 +    while (r > 0) {
    1.61 +        int32_t digit = n / r;
    1.62 +        result.append(DIGITS[digit]);
    1.63 +        n -= digit * r;
    1.64 +        r /= radix;
    1.65 +    }
    1.66 +    return result;
    1.67 +}
    1.68 +
    1.69 +/**
    1.70 + * Return true if the character is NOT printable ASCII.
    1.71 + */
    1.72 +UBool ICU_Utility::isUnprintable(UChar32 c) {
    1.73 +    return !(c >= 0x20 && c <= 0x7E);
    1.74 +}
    1.75 +
    1.76 +/**
    1.77 + * Escape unprintable characters using \uxxxx notation for U+0000 to
    1.78 + * U+FFFF and \Uxxxxxxxx for U+10000 and above.  If the character is
    1.79 + * printable ASCII, then do nothing and return FALSE.  Otherwise,
    1.80 + * append the escaped notation and return TRUE.
    1.81 + */
    1.82 +UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
    1.83 +    if (isUnprintable(c)) {
    1.84 +        result.append(BACKSLASH);
    1.85 +        if (c & ~0xFFFF) {
    1.86 +            result.append(UPPER_U);
    1.87 +            result.append(DIGITS[0xF&(c>>28)]);
    1.88 +            result.append(DIGITS[0xF&(c>>24)]);
    1.89 +            result.append(DIGITS[0xF&(c>>20)]);
    1.90 +            result.append(DIGITS[0xF&(c>>16)]);
    1.91 +        } else {
    1.92 +            result.append(LOWER_U);
    1.93 +        }
    1.94 +        result.append(DIGITS[0xF&(c>>12)]);
    1.95 +        result.append(DIGITS[0xF&(c>>8)]);
    1.96 +        result.append(DIGITS[0xF&(c>>4)]);
    1.97 +        result.append(DIGITS[0xF&c]);
    1.98 +        return TRUE;
    1.99 +    }
   1.100 +    return FALSE;
   1.101 +}
   1.102 +
   1.103 +/**
   1.104 + * Returns the index of a character, ignoring quoted text.
   1.105 + * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
   1.106 + * found by a search for 'h'.
   1.107 + */
   1.108 +// FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
   1.109 +/*
   1.110 +int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,
   1.111 +                               int32_t start, int32_t limit,
   1.112 +                               UChar charToFind) {
   1.113 +    for (int32_t i=start; i<limit; ++i) {
   1.114 +        UChar c = text.charAt(i);
   1.115 +        if (c == BACKSLASH) {
   1.116 +            ++i;
   1.117 +        } else if (c == APOSTROPHE) {
   1.118 +            while (++i < limit
   1.119 +                   && text.charAt(i) != APOSTROPHE) {}
   1.120 +        } else if (c == charToFind) {
   1.121 +            return i;
   1.122 +        }
   1.123 +    }
   1.124 +    return -1;
   1.125 +}
   1.126 +*/
   1.127 +
   1.128 +/**
   1.129 + * Skip over a sequence of zero or more white space characters at pos.
   1.130 + * @param advance if true, advance pos to the first non-white-space
   1.131 + * character at or after pos, or str.length(), if there is none.
   1.132 + * Otherwise leave pos unchanged.
   1.133 + * @return the index of the first non-white-space character at or
   1.134 + * after pos, or str.length(), if there is none.
   1.135 + */
   1.136 +int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
   1.137 +                                    UBool advance) {
   1.138 +    int32_t p = pos;
   1.139 +    const UChar* s = str.getBuffer();
   1.140 +    p = (int32_t)(PatternProps::skipWhiteSpace(s + p, str.length() - p) - s);
   1.141 +    if (advance) {
   1.142 +        pos = p;
   1.143 +    }
   1.144 +    return p;
   1.145 +}
   1.146 +
   1.147 +/**
   1.148 + * Skip over Pattern_White_Space in a Replaceable.
   1.149 + * Skipping may be done in the forward or
   1.150 + * reverse direction.  In either case, the leftmost index will be
   1.151 + * inclusive, and the rightmost index will be exclusive.  That is,
   1.152 + * given a range defined as [start, limit), the call
   1.153 + * skipWhitespace(text, start, limit) will advance start past leading
   1.154 + * whitespace, whereas the call skipWhitespace(text, limit, start),
   1.155 + * will back up limit past trailing whitespace.
   1.156 + * @param text the text to be analyzed
   1.157 + * @param pos either the start or limit of a range of 'text', to skip
   1.158 + * leading or trailing whitespace, respectively
   1.159 + * @param stop either the limit or start of a range of 'text', to skip
   1.160 + * leading or trailing whitespace, respectively
   1.161 + * @return the new start or limit, depending on what was passed in to
   1.162 + * 'pos'
   1.163 + */
   1.164 +//?FOR FUTURE USE.  DISABLE FOR NOW for coverage reasons.
   1.165 +//?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,
   1.166 +//?                                    int32_t pos, int32_t stop) {
   1.167 +//?    UChar32 c;
   1.168 +//?    UBool isForward = (stop >= pos);
   1.169 +//?
   1.170 +//?    if (!isForward) {
   1.171 +//?        --pos; // pos is a limit, so back up by one
   1.172 +//?    }
   1.173 +//?    
   1.174 +//?    while (pos != stop &&
   1.175 +//?           PatternProps::isWhiteSpace(c = text.char32At(pos))) {
   1.176 +//?        if (isForward) {
   1.177 +//?            pos += U16_LENGTH(c);
   1.178 +//?        } else {
   1.179 +//?            pos -= U16_LENGTH(c);
   1.180 +//?        }
   1.181 +//?    }
   1.182 +//?
   1.183 +//?    if (!isForward) {
   1.184 +//?        ++pos; // make pos back into a limit
   1.185 +//?    }
   1.186 +//?
   1.187 +//?    return pos;
   1.188 +//?}
   1.189 +
   1.190 +/**
   1.191 + * Parse a single non-whitespace character 'ch', optionally
   1.192 + * preceded by whitespace.
   1.193 + * @param id the string to be parsed
   1.194 + * @param pos INPUT-OUTPUT parameter.  On input, pos[0] is the
   1.195 + * offset of the first character to be parsed.  On output, pos[0]
   1.196 + * is the index after the last parsed character.  If the parse
   1.197 + * fails, pos[0] will be unchanged.
   1.198 + * @param ch the non-whitespace character to be parsed.
   1.199 + * @return true if 'ch' is seen preceded by zero or more
   1.200 + * whitespace characters.
   1.201 + */
   1.202 +UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {
   1.203 +    int32_t start = pos;
   1.204 +    skipWhitespace(id, pos, TRUE);
   1.205 +    if (pos == id.length() ||
   1.206 +        id.charAt(pos) != ch) {
   1.207 +        pos = start;
   1.208 +        return FALSE;
   1.209 +    }
   1.210 +    ++pos;
   1.211 +    return TRUE;
   1.212 +}
   1.213 +
   1.214 +/**
   1.215 + * Parse a pattern string within the given Replaceable and a parsing
   1.216 + * pattern.  Characters are matched literally and case-sensitively
   1.217 + * except for the following special characters:
   1.218 + *
   1.219 + * ~  zero or more Pattern_White_Space chars
   1.220 + *
   1.221 + * If end of pattern is reached with all matches along the way,
   1.222 + * pos is advanced to the first unparsed index and returned.
   1.223 + * Otherwise -1 is returned.
   1.224 + * @param pat pattern that controls parsing
   1.225 + * @param text text to be parsed, starting at index
   1.226 + * @param index offset to first character to parse
   1.227 + * @param limit offset after last character to parse
   1.228 + * @return index after last parsed character, or -1 on parse failure.
   1.229 + */
   1.230 +int32_t ICU_Utility::parsePattern(const UnicodeString& pat,
   1.231 +                                  const Replaceable& text,
   1.232 +                                  int32_t index,
   1.233 +                                  int32_t limit) {
   1.234 +    int32_t ipat = 0;
   1.235 +
   1.236 +    // empty pattern matches immediately
   1.237 +    if (ipat == pat.length()) {
   1.238 +        return index;
   1.239 +    }
   1.240 +
   1.241 +    UChar32 cpat = pat.char32At(ipat);
   1.242 +
   1.243 +    while (index < limit) {
   1.244 +        UChar32 c = text.char32At(index);
   1.245 +
   1.246 +        // parse \s*
   1.247 +        if (cpat == 126 /*~*/) {
   1.248 +            if (PatternProps::isWhiteSpace(c)) {
   1.249 +                index += U16_LENGTH(c);
   1.250 +                continue;
   1.251 +            } else {
   1.252 +                if (++ipat == pat.length()) {
   1.253 +                    return index; // success; c unparsed
   1.254 +                }
   1.255 +                // fall thru; process c again with next cpat
   1.256 +            }
   1.257 +        }
   1.258 +
   1.259 +        // parse literal
   1.260 +        else if (c == cpat) {
   1.261 +            index += U16_LENGTH(c);
   1.262 +            ipat += U16_LENGTH(cpat);
   1.263 +            if (ipat == pat.length()) {
   1.264 +                return index; // success; c parsed
   1.265 +            }
   1.266 +            // fall thru; get next cpat
   1.267 +        }
   1.268 +
   1.269 +        // match failure of literal
   1.270 +        else {
   1.271 +            return -1;
   1.272 +        }
   1.273 +
   1.274 +        cpat = pat.char32At(ipat);
   1.275 +    }
   1.276 +
   1.277 +    return -1; // text ended before end of pat
   1.278 +}
   1.279 +
   1.280 +/**
   1.281 + * Append a character to a rule that is being built up.  To flush
   1.282 + * the quoteBuf to rule, make one final call with isLiteral == TRUE.
   1.283 + * If there is no final character, pass in (UChar32)-1 as c.
   1.284 + * @param rule the string to append the character to
   1.285 + * @param c the character to append, or (UChar32)-1 if none.
   1.286 + * @param isLiteral if true, then the given character should not be
   1.287 + * quoted or escaped.  Usually this means it is a syntactic element
   1.288 + * such as > or $
   1.289 + * @param escapeUnprintable if true, then unprintable characters
   1.290 + * should be escaped using \uxxxx or \Uxxxxxxxx.  These escapes will
   1.291 + * appear outside of quotes.
   1.292 + * @param quoteBuf a buffer which is used to build up quoted
   1.293 + * substrings.  The caller should initially supply an empty buffer,
   1.294 + * and thereafter should not modify the buffer.  The buffer should be
   1.295 + * cleared out by, at the end, calling this method with a literal
   1.296 + * character.
   1.297 + */
   1.298 +void ICU_Utility::appendToRule(UnicodeString& rule,
   1.299 +                               UChar32 c,
   1.300 +                               UBool isLiteral,
   1.301 +                               UBool escapeUnprintable,
   1.302 +                               UnicodeString& quoteBuf) {
   1.303 +    // If we are escaping unprintables, then escape them outside
   1.304 +    // quotes.  \u and \U are not recognized within quotes.  The same
   1.305 +    // logic applies to literals, but literals are never escaped.
   1.306 +    if (isLiteral ||
   1.307 +        (escapeUnprintable && ICU_Utility::isUnprintable(c))) {
   1.308 +        if (quoteBuf.length() > 0) {
   1.309 +            // We prefer backslash APOSTROPHE to double APOSTROPHE
   1.310 +            // (more readable, less similar to ") so if there are
   1.311 +            // double APOSTROPHEs at the ends, we pull them outside
   1.312 +            // of the quote.
   1.313 +
   1.314 +            // If the first thing in the quoteBuf is APOSTROPHE
   1.315 +            // (doubled) then pull it out.
   1.316 +            while (quoteBuf.length() >= 2 &&
   1.317 +                   quoteBuf.charAt(0) == APOSTROPHE &&
   1.318 +                   quoteBuf.charAt(1) == APOSTROPHE) {
   1.319 +                rule.append(BACKSLASH).append(APOSTROPHE);
   1.320 +                quoteBuf.remove(0, 2);
   1.321 +            }
   1.322 +            // If the last thing in the quoteBuf is APOSTROPHE
   1.323 +            // (doubled) then remove and count it and add it after.
   1.324 +            int32_t trailingCount = 0;
   1.325 +            while (quoteBuf.length() >= 2 &&
   1.326 +                   quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
   1.327 +                   quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
   1.328 +                quoteBuf.truncate(quoteBuf.length()-2);
   1.329 +                ++trailingCount;
   1.330 +            }
   1.331 +            if (quoteBuf.length() > 0) {
   1.332 +                rule.append(APOSTROPHE);
   1.333 +                rule.append(quoteBuf);
   1.334 +                rule.append(APOSTROPHE);
   1.335 +                quoteBuf.truncate(0);
   1.336 +            }
   1.337 +            while (trailingCount-- > 0) {
   1.338 +                rule.append(BACKSLASH).append(APOSTROPHE);
   1.339 +            }
   1.340 +        }
   1.341 +        if (c != (UChar32)-1) {
   1.342 +            /* Since spaces are ignored during parsing, they are
   1.343 +             * emitted only for readability.  We emit one here
   1.344 +             * only if there isn't already one at the end of the
   1.345 +             * rule.
   1.346 +             */
   1.347 +            if (c == SPACE) {
   1.348 +                int32_t len = rule.length();
   1.349 +                if (len > 0 && rule.charAt(len-1) != c) {
   1.350 +                    rule.append(c);
   1.351 +                }
   1.352 +            } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) {
   1.353 +                rule.append(c);
   1.354 +            }
   1.355 +        }
   1.356 +    }
   1.357 +
   1.358 +    // Escape ' and '\' and don't begin a quote just for them
   1.359 +    else if (quoteBuf.length() == 0 &&
   1.360 +             (c == APOSTROPHE || c == BACKSLASH)) {
   1.361 +        rule.append(BACKSLASH);
   1.362 +        rule.append(c);
   1.363 +    }
   1.364 +
   1.365 +    // Specials (printable ascii that isn't [0-9a-zA-Z]) and
   1.366 +    // whitespace need quoting.  Also append stuff to quotes if we are
   1.367 +    // building up a quoted substring already.
   1.368 +    else if (quoteBuf.length() > 0 ||
   1.369 +             (c >= 0x0021 && c <= 0x007E &&
   1.370 +              !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
   1.371 +                (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
   1.372 +                (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
   1.373 +             PatternProps::isWhiteSpace(c)) {
   1.374 +        quoteBuf.append(c);
   1.375 +        // Double ' within a quote
   1.376 +        if (c == APOSTROPHE) {
   1.377 +            quoteBuf.append(c);
   1.378 +        }
   1.379 +    }
   1.380 +    
   1.381 +    // Otherwise just append
   1.382 +    else {
   1.383 +        rule.append(c);
   1.384 +    }
   1.385 +}
   1.386 +
   1.387 +void ICU_Utility::appendToRule(UnicodeString& rule,
   1.388 +                               const UnicodeString& text,
   1.389 +                               UBool isLiteral,
   1.390 +                               UBool escapeUnprintable,
   1.391 +                               UnicodeString& quoteBuf) {
   1.392 +    for (int32_t i=0; i<text.length(); ++i) {
   1.393 +        appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);
   1.394 +    }
   1.395 +}
   1.396 +
   1.397 +/**
   1.398 + * Given a matcher reference, which may be null, append its
   1.399 + * pattern as a literal to the given rule.
   1.400 + */
   1.401 +void ICU_Utility::appendToRule(UnicodeString& rule,
   1.402 +                               const UnicodeMatcher* matcher,
   1.403 +                               UBool escapeUnprintable,
   1.404 +                               UnicodeString& quoteBuf) {
   1.405 +    if (matcher != NULL) {
   1.406 +        UnicodeString pat;
   1.407 +        appendToRule(rule, matcher->toPattern(pat, escapeUnprintable),
   1.408 +                     TRUE, escapeUnprintable, quoteBuf);
   1.409 +    }
   1.410 +}
   1.411 +
   1.412 +U_NAMESPACE_END

mercurial