1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/util_props.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,214 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (c) 2001-2011, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* Date Name Description 1.10 +* 11/19/2001 aliu Creation. 1.11 +********************************************************************** 1.12 +*/ 1.13 + 1.14 +#include "unicode/uchar.h" 1.15 +#include "unicode/utf16.h" 1.16 +#include "patternprops.h" 1.17 +#include "util.h" 1.18 + 1.19 +U_NAMESPACE_BEGIN 1.20 + 1.21 +/** 1.22 + * Parse an integer at pos, either of the form \d+ or of the form 1.23 + * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex, 1.24 + * or octal format. 1.25 + * @param pos INPUT-OUTPUT parameter. On input, the first 1.26 + * character to parse. On output, the character after the last 1.27 + * parsed character. 1.28 + */ 1.29 +int32_t ICU_Utility::parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit) { 1.30 + int32_t count = 0; 1.31 + int32_t value = 0; 1.32 + int32_t p = pos; 1.33 + int8_t radix = 10; 1.34 + 1.35 + if (p < limit && rule.charAt(p) == 48 /*0*/) { 1.36 + if (p+1 < limit && (rule.charAt(p+1) == 0x78 /*x*/ || rule.charAt(p+1) == 0x58 /*X*/)) { 1.37 + p += 2; 1.38 + radix = 16; 1.39 + } 1.40 + else { 1.41 + p++; 1.42 + count = 1; 1.43 + radix = 8; 1.44 + } 1.45 + } 1.46 + 1.47 + while (p < limit) { 1.48 + int32_t d = u_digit(rule.charAt(p++), radix); 1.49 + if (d < 0) { 1.50 + --p; 1.51 + break; 1.52 + } 1.53 + ++count; 1.54 + int32_t v = (value * radix) + d; 1.55 + if (v <= value) { 1.56 + // If there are too many input digits, at some point 1.57 + // the value will go negative, e.g., if we have seen 1.58 + // "0x8000000" already and there is another '0', when 1.59 + // we parse the next 0 the value will go negative. 1.60 + return 0; 1.61 + } 1.62 + value = v; 1.63 + } 1.64 + if (count > 0) { 1.65 + pos = p; 1.66 + } 1.67 + return value; 1.68 +} 1.69 + 1.70 +/** 1.71 + * Parse a pattern string starting at offset pos. Keywords are 1.72 + * matched case-insensitively. Spaces may be skipped and may be 1.73 + * optional or required. Integer values may be parsed, and if 1.74 + * they are, they will be returned in the given array. If 1.75 + * successful, the offset of the next non-space character is 1.76 + * returned. On failure, -1 is returned. 1.77 + * @param pattern must only contain lowercase characters, which 1.78 + * will match their uppercase equivalents as well. A space 1.79 + * character matches one or more required spaces. A '~' character 1.80 + * matches zero or more optional spaces. A '#' character matches 1.81 + * an integer and stores it in parsedInts, which the caller must 1.82 + * ensure has enough capacity. 1.83 + * @param parsedInts array to receive parsed integers. Caller 1.84 + * must ensure that parsedInts.length is >= the number of '#' 1.85 + * signs in 'pattern'. 1.86 + * @return the position after the last character parsed, or -1 if 1.87 + * the parse failed 1.88 + */ 1.89 +int32_t ICU_Utility::parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit, 1.90 + const UnicodeString& pattern, int32_t* parsedInts) { 1.91 + // TODO Update this to handle surrogates 1.92 + int32_t p; 1.93 + int32_t intCount = 0; // number of integers parsed 1.94 + for (int32_t i=0; i<pattern.length(); ++i) { 1.95 + UChar cpat = pattern.charAt(i); 1.96 + UChar c; 1.97 + switch (cpat) { 1.98 + case 32 /*' '*/: 1.99 + if (pos >= limit) { 1.100 + return -1; 1.101 + } 1.102 + c = rule.charAt(pos++); 1.103 + if (!PatternProps::isWhiteSpace(c)) { 1.104 + return -1; 1.105 + } 1.106 + // FALL THROUGH to skipWhitespace 1.107 + case 126 /*'~'*/: 1.108 + pos = skipWhitespace(rule, pos); 1.109 + break; 1.110 + case 35 /*'#'*/: 1.111 + p = pos; 1.112 + parsedInts[intCount++] = parseInteger(rule, p, limit); 1.113 + if (p == pos) { 1.114 + // Syntax error; failed to parse integer 1.115 + return -1; 1.116 + } 1.117 + pos = p; 1.118 + break; 1.119 + default: 1.120 + if (pos >= limit) { 1.121 + return -1; 1.122 + } 1.123 + c = (UChar) u_tolower(rule.charAt(pos++)); 1.124 + if (c != cpat) { 1.125 + return -1; 1.126 + } 1.127 + break; 1.128 + } 1.129 + } 1.130 + return pos; 1.131 +} 1.132 + 1.133 +/** 1.134 + * Parse a Unicode identifier from the given string at the given 1.135 + * position. Return the identifier, or an empty string if there 1.136 + * is no identifier. 1.137 + * @param str the string to parse 1.138 + * @param pos INPUT-OUPUT parameter. On INPUT, pos is the 1.139 + * first character to examine. It must be less than str.length(), 1.140 + * and it must not point to a whitespace character. That is, must 1.141 + * have pos < str.length(). On 1.142 + * OUTPUT, the position after the last parsed character. 1.143 + * @return the Unicode identifier, or an empty string if there is 1.144 + * no valid identifier at pos. 1.145 + */ 1.146 +UnicodeString ICU_Utility::parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos) { 1.147 + // assert(pos < str.length()); 1.148 + UnicodeString buf; 1.149 + int p = pos; 1.150 + while (p < str.length()) { 1.151 + UChar32 ch = str.char32At(p); 1.152 + if (buf.length() == 0) { 1.153 + if (u_isIDStart(ch)) { 1.154 + buf.append(ch); 1.155 + } else { 1.156 + buf.truncate(0); 1.157 + return buf; 1.158 + } 1.159 + } else { 1.160 + if (u_isIDPart(ch)) { 1.161 + buf.append(ch); 1.162 + } else { 1.163 + break; 1.164 + } 1.165 + } 1.166 + p += U16_LENGTH(ch); 1.167 + } 1.168 + pos = p; 1.169 + return buf; 1.170 +} 1.171 + 1.172 +/** 1.173 + * Parse an unsigned 31-bit integer at the given offset. Use 1.174 + * UCharacter.digit() to parse individual characters into digits. 1.175 + * @param text the text to be parsed 1.176 + * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the 1.177 + * offset within text at which to start parsing; it should point 1.178 + * to a valid digit. On exit, pos[0] is the offset after the last 1.179 + * parsed character. If the parse failed, it will be unchanged on 1.180 + * exit. Must be >= 0 on entry. 1.181 + * @param radix the radix in which to parse; must be >= 2 and <= 1.182 + * 36. 1.183 + * @return a non-negative parsed number, or -1 upon parse failure. 1.184 + * Parse fails if there are no digits, that is, if pos[0] does not 1.185 + * point to a valid digit on entry, or if the number to be parsed 1.186 + * does not fit into a 31-bit unsigned integer. 1.187 + */ 1.188 +int32_t ICU_Utility::parseNumber(const UnicodeString& text, 1.189 + int32_t& pos, int8_t radix) { 1.190 + // assert(pos[0] >= 0); 1.191 + // assert(radix >= 2); 1.192 + // assert(radix <= 36); 1.193 + int32_t n = 0; 1.194 + int32_t p = pos; 1.195 + while (p < text.length()) { 1.196 + UChar32 ch = text.char32At(p); 1.197 + int32_t d = u_digit(ch, radix); 1.198 + if (d < 0) { 1.199 + break; 1.200 + } 1.201 + n = radix*n + d; 1.202 + // ASSUME that when a 32-bit integer overflows it becomes 1.203 + // negative. E.g., 214748364 * 10 + 8 => negative value. 1.204 + if (n < 0) { 1.205 + return -1; 1.206 + } 1.207 + ++p; 1.208 + } 1.209 + if (p == pos) { 1.210 + return -1; 1.211 + } 1.212 + pos = p; 1.213 + return n; 1.214 +} 1.215 + 1.216 +U_NAMESPACE_END 1.217 +