Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2011, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ******************************************************************************* |
michael@0 | 8 | * file name: uniset_closure.cpp |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 2011may30 |
michael@0 | 14 | * created by: Markus W. Scherer |
michael@0 | 15 | * |
michael@0 | 16 | * UnicodeSet::closeOver() and related methods moved here from uniset_props.cpp |
michael@0 | 17 | * to simplify dependencies. |
michael@0 | 18 | * In particular, this depends on the BreakIterator, but the BreakIterator |
michael@0 | 19 | * code also builds UnicodeSets from patterns and needs uniset_props. |
michael@0 | 20 | */ |
michael@0 | 21 | |
michael@0 | 22 | #include "unicode/brkiter.h" |
michael@0 | 23 | #include "unicode/locid.h" |
michael@0 | 24 | #include "unicode/parsepos.h" |
michael@0 | 25 | #include "unicode/uniset.h" |
michael@0 | 26 | #include "cmemory.h" |
michael@0 | 27 | #include "ruleiter.h" |
michael@0 | 28 | #include "ucase.h" |
michael@0 | 29 | #include "util.h" |
michael@0 | 30 | #include "uvector.h" |
michael@0 | 31 | |
michael@0 | 32 | // initial storage. Must be >= 0 |
michael@0 | 33 | // *** same as in uniset.cpp ! *** |
michael@0 | 34 | #define START_EXTRA 16 |
michael@0 | 35 | |
michael@0 | 36 | U_NAMESPACE_BEGIN |
michael@0 | 37 | |
michael@0 | 38 | // TODO memory debugging provided inside uniset.cpp |
michael@0 | 39 | // could be made available here but probably obsolete with use of modern |
michael@0 | 40 | // memory leak checker tools |
michael@0 | 41 | #define _dbgct(me) |
michael@0 | 42 | |
michael@0 | 43 | //---------------------------------------------------------------- |
michael@0 | 44 | // Constructors &c |
michael@0 | 45 | //---------------------------------------------------------------- |
michael@0 | 46 | |
michael@0 | 47 | UnicodeSet::UnicodeSet(const UnicodeString& pattern, |
michael@0 | 48 | uint32_t options, |
michael@0 | 49 | const SymbolTable* symbols, |
michael@0 | 50 | UErrorCode& status) : |
michael@0 | 51 | len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), |
michael@0 | 52 | bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
michael@0 | 53 | fFlags(0) |
michael@0 | 54 | { |
michael@0 | 55 | if(U_SUCCESS(status)){ |
michael@0 | 56 | list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
michael@0 | 57 | /* test for NULL */ |
michael@0 | 58 | if(list == NULL) { |
michael@0 | 59 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 60 | }else{ |
michael@0 | 61 | allocateStrings(status); |
michael@0 | 62 | applyPattern(pattern, options, symbols, status); |
michael@0 | 63 | } |
michael@0 | 64 | } |
michael@0 | 65 | _dbgct(this); |
michael@0 | 66 | } |
michael@0 | 67 | |
michael@0 | 68 | UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, |
michael@0 | 69 | uint32_t options, |
michael@0 | 70 | const SymbolTable* symbols, |
michael@0 | 71 | UErrorCode& status) : |
michael@0 | 72 | len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), |
michael@0 | 73 | bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
michael@0 | 74 | fFlags(0) |
michael@0 | 75 | { |
michael@0 | 76 | if(U_SUCCESS(status)){ |
michael@0 | 77 | list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
michael@0 | 78 | /* test for NULL */ |
michael@0 | 79 | if(list == NULL) { |
michael@0 | 80 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 81 | }else{ |
michael@0 | 82 | allocateStrings(status); |
michael@0 | 83 | applyPattern(pattern, pos, options, symbols, status); |
michael@0 | 84 | } |
michael@0 | 85 | } |
michael@0 | 86 | _dbgct(this); |
michael@0 | 87 | } |
michael@0 | 88 | |
michael@0 | 89 | //---------------------------------------------------------------- |
michael@0 | 90 | // Public API |
michael@0 | 91 | //---------------------------------------------------------------- |
michael@0 | 92 | |
michael@0 | 93 | UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, |
michael@0 | 94 | uint32_t options, |
michael@0 | 95 | const SymbolTable* symbols, |
michael@0 | 96 | UErrorCode& status) { |
michael@0 | 97 | ParsePosition pos(0); |
michael@0 | 98 | applyPattern(pattern, pos, options, symbols, status); |
michael@0 | 99 | if (U_FAILURE(status)) return *this; |
michael@0 | 100 | |
michael@0 | 101 | int32_t i = pos.getIndex(); |
michael@0 | 102 | |
michael@0 | 103 | if (options & USET_IGNORE_SPACE) { |
michael@0 | 104 | // Skip over trailing whitespace |
michael@0 | 105 | ICU_Utility::skipWhitespace(pattern, i, TRUE); |
michael@0 | 106 | } |
michael@0 | 107 | |
michael@0 | 108 | if (i != pattern.length()) { |
michael@0 | 109 | status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 110 | } |
michael@0 | 111 | return *this; |
michael@0 | 112 | } |
michael@0 | 113 | |
michael@0 | 114 | UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, |
michael@0 | 115 | ParsePosition& pos, |
michael@0 | 116 | uint32_t options, |
michael@0 | 117 | const SymbolTable* symbols, |
michael@0 | 118 | UErrorCode& status) { |
michael@0 | 119 | if (U_FAILURE(status)) { |
michael@0 | 120 | return *this; |
michael@0 | 121 | } |
michael@0 | 122 | if (isFrozen()) { |
michael@0 | 123 | status = U_NO_WRITE_PERMISSION; |
michael@0 | 124 | return *this; |
michael@0 | 125 | } |
michael@0 | 126 | // Need to build the pattern in a temporary string because |
michael@0 | 127 | // _applyPattern calls add() etc., which set pat to empty. |
michael@0 | 128 | UnicodeString rebuiltPat; |
michael@0 | 129 | RuleCharacterIterator chars(pattern, symbols, pos); |
michael@0 | 130 | applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status); |
michael@0 | 131 | if (U_FAILURE(status)) return *this; |
michael@0 | 132 | if (chars.inVariable()) { |
michael@0 | 133 | // syntaxError(chars, "Extra chars in variable value"); |
michael@0 | 134 | status = U_MALFORMED_SET; |
michael@0 | 135 | return *this; |
michael@0 | 136 | } |
michael@0 | 137 | setPattern(rebuiltPat); |
michael@0 | 138 | return *this; |
michael@0 | 139 | } |
michael@0 | 140 | |
michael@0 | 141 | // USetAdder implementation |
michael@0 | 142 | // Does not use uset.h to reduce code dependencies |
michael@0 | 143 | static void U_CALLCONV |
michael@0 | 144 | _set_add(USet *set, UChar32 c) { |
michael@0 | 145 | ((UnicodeSet *)set)->add(c); |
michael@0 | 146 | } |
michael@0 | 147 | |
michael@0 | 148 | static void U_CALLCONV |
michael@0 | 149 | _set_addRange(USet *set, UChar32 start, UChar32 end) { |
michael@0 | 150 | ((UnicodeSet *)set)->add(start, end); |
michael@0 | 151 | } |
michael@0 | 152 | |
michael@0 | 153 | static void U_CALLCONV |
michael@0 | 154 | _set_addString(USet *set, const UChar *str, int32_t length) { |
michael@0 | 155 | ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); |
michael@0 | 156 | } |
michael@0 | 157 | |
michael@0 | 158 | //---------------------------------------------------------------- |
michael@0 | 159 | // Case folding API |
michael@0 | 160 | //---------------------------------------------------------------- |
michael@0 | 161 | |
michael@0 | 162 | // add the result of a full case mapping to the set |
michael@0 | 163 | // use str as a temporary string to avoid constructing one |
michael@0 | 164 | static inline void |
michael@0 | 165 | addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) { |
michael@0 | 166 | if(result >= 0) { |
michael@0 | 167 | if(result > UCASE_MAX_STRING_LENGTH) { |
michael@0 | 168 | // add a single-code point case mapping |
michael@0 | 169 | set.add(result); |
michael@0 | 170 | } else { |
michael@0 | 171 | // add a string case mapping from full with length result |
michael@0 | 172 | str.setTo((UBool)FALSE, full, result); |
michael@0 | 173 | set.add(str); |
michael@0 | 174 | } |
michael@0 | 175 | } |
michael@0 | 176 | // result < 0: the code point mapped to itself, no need to add it |
michael@0 | 177 | // see ucase.h |
michael@0 | 178 | } |
michael@0 | 179 | |
michael@0 | 180 | UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { |
michael@0 | 181 | if (isFrozen() || isBogus()) { |
michael@0 | 182 | return *this; |
michael@0 | 183 | } |
michael@0 | 184 | if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) { |
michael@0 | 185 | const UCaseProps *csp = ucase_getSingleton(); |
michael@0 | 186 | { |
michael@0 | 187 | UnicodeSet foldSet(*this); |
michael@0 | 188 | UnicodeString str; |
michael@0 | 189 | USetAdder sa = { |
michael@0 | 190 | foldSet.toUSet(), |
michael@0 | 191 | _set_add, |
michael@0 | 192 | _set_addRange, |
michael@0 | 193 | _set_addString, |
michael@0 | 194 | NULL, // don't need remove() |
michael@0 | 195 | NULL // don't need removeRange() |
michael@0 | 196 | }; |
michael@0 | 197 | |
michael@0 | 198 | // start with input set to guarantee inclusion |
michael@0 | 199 | // USET_CASE: remove strings because the strings will actually be reduced (folded); |
michael@0 | 200 | // therefore, start with no strings and add only those needed |
michael@0 | 201 | if (attribute & USET_CASE_INSENSITIVE) { |
michael@0 | 202 | foldSet.strings->removeAllElements(); |
michael@0 | 203 | } |
michael@0 | 204 | |
michael@0 | 205 | int32_t n = getRangeCount(); |
michael@0 | 206 | UChar32 result; |
michael@0 | 207 | const UChar *full; |
michael@0 | 208 | int32_t locCache = 0; |
michael@0 | 209 | |
michael@0 | 210 | for (int32_t i=0; i<n; ++i) { |
michael@0 | 211 | UChar32 start = getRangeStart(i); |
michael@0 | 212 | UChar32 end = getRangeEnd(i); |
michael@0 | 213 | |
michael@0 | 214 | if (attribute & USET_CASE_INSENSITIVE) { |
michael@0 | 215 | // full case closure |
michael@0 | 216 | for (UChar32 cp=start; cp<=end; ++cp) { |
michael@0 | 217 | ucase_addCaseClosure(csp, cp, &sa); |
michael@0 | 218 | } |
michael@0 | 219 | } else { |
michael@0 | 220 | // add case mappings |
michael@0 | 221 | // (does not add long s for regular s, or Kelvin for k, for example) |
michael@0 | 222 | for (UChar32 cp=start; cp<=end; ++cp) { |
michael@0 | 223 | result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache); |
michael@0 | 224 | addCaseMapping(foldSet, result, full, str); |
michael@0 | 225 | |
michael@0 | 226 | result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache); |
michael@0 | 227 | addCaseMapping(foldSet, result, full, str); |
michael@0 | 228 | |
michael@0 | 229 | result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache); |
michael@0 | 230 | addCaseMapping(foldSet, result, full, str); |
michael@0 | 231 | |
michael@0 | 232 | result = ucase_toFullFolding(csp, cp, &full, 0); |
michael@0 | 233 | addCaseMapping(foldSet, result, full, str); |
michael@0 | 234 | } |
michael@0 | 235 | } |
michael@0 | 236 | } |
michael@0 | 237 | if (strings != NULL && strings->size() > 0) { |
michael@0 | 238 | if (attribute & USET_CASE_INSENSITIVE) { |
michael@0 | 239 | for (int32_t j=0; j<strings->size(); ++j) { |
michael@0 | 240 | str = *(const UnicodeString *) strings->elementAt(j); |
michael@0 | 241 | str.foldCase(); |
michael@0 | 242 | if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) { |
michael@0 | 243 | foldSet.add(str); // does not map to code points: add the folded string itself |
michael@0 | 244 | } |
michael@0 | 245 | } |
michael@0 | 246 | } else { |
michael@0 | 247 | Locale root(""); |
michael@0 | 248 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 249 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 250 | BreakIterator *bi = BreakIterator::createWordInstance(root, status); |
michael@0 | 251 | if (U_SUCCESS(status)) { |
michael@0 | 252 | #endif |
michael@0 | 253 | const UnicodeString *pStr; |
michael@0 | 254 | |
michael@0 | 255 | for (int32_t j=0; j<strings->size(); ++j) { |
michael@0 | 256 | pStr = (const UnicodeString *) strings->elementAt(j); |
michael@0 | 257 | (str = *pStr).toLower(root); |
michael@0 | 258 | foldSet.add(str); |
michael@0 | 259 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 260 | (str = *pStr).toTitle(bi, root); |
michael@0 | 261 | foldSet.add(str); |
michael@0 | 262 | #endif |
michael@0 | 263 | (str = *pStr).toUpper(root); |
michael@0 | 264 | foldSet.add(str); |
michael@0 | 265 | (str = *pStr).foldCase(); |
michael@0 | 266 | foldSet.add(str); |
michael@0 | 267 | } |
michael@0 | 268 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 269 | } |
michael@0 | 270 | delete bi; |
michael@0 | 271 | #endif |
michael@0 | 272 | } |
michael@0 | 273 | } |
michael@0 | 274 | *this = foldSet; |
michael@0 | 275 | } |
michael@0 | 276 | } |
michael@0 | 277 | return *this; |
michael@0 | 278 | } |
michael@0 | 279 | |
michael@0 | 280 | U_NAMESPACE_END |