1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/uniset_closure.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,280 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2011, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: uniset_closure.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2011may30 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* UnicodeSet::closeOver() and related methods moved here from uniset_props.cpp 1.20 +* to simplify dependencies. 1.21 +* In particular, this depends on the BreakIterator, but the BreakIterator 1.22 +* code also builds UnicodeSets from patterns and needs uniset_props. 1.23 +*/ 1.24 + 1.25 +#include "unicode/brkiter.h" 1.26 +#include "unicode/locid.h" 1.27 +#include "unicode/parsepos.h" 1.28 +#include "unicode/uniset.h" 1.29 +#include "cmemory.h" 1.30 +#include "ruleiter.h" 1.31 +#include "ucase.h" 1.32 +#include "util.h" 1.33 +#include "uvector.h" 1.34 + 1.35 +// initial storage. Must be >= 0 1.36 +// *** same as in uniset.cpp ! *** 1.37 +#define START_EXTRA 16 1.38 + 1.39 +U_NAMESPACE_BEGIN 1.40 + 1.41 +// TODO memory debugging provided inside uniset.cpp 1.42 +// could be made available here but probably obsolete with use of modern 1.43 +// memory leak checker tools 1.44 +#define _dbgct(me) 1.45 + 1.46 +//---------------------------------------------------------------- 1.47 +// Constructors &c 1.48 +//---------------------------------------------------------------- 1.49 + 1.50 +UnicodeSet::UnicodeSet(const UnicodeString& pattern, 1.51 + uint32_t options, 1.52 + const SymbolTable* symbols, 1.53 + UErrorCode& status) : 1.54 + len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 1.55 + bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 1.56 + fFlags(0) 1.57 +{ 1.58 + if(U_SUCCESS(status)){ 1.59 + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 1.60 + /* test for NULL */ 1.61 + if(list == NULL) { 1.62 + status = U_MEMORY_ALLOCATION_ERROR; 1.63 + }else{ 1.64 + allocateStrings(status); 1.65 + applyPattern(pattern, options, symbols, status); 1.66 + } 1.67 + } 1.68 + _dbgct(this); 1.69 +} 1.70 + 1.71 +UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, 1.72 + uint32_t options, 1.73 + const SymbolTable* symbols, 1.74 + UErrorCode& status) : 1.75 + len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 1.76 + bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 1.77 + fFlags(0) 1.78 +{ 1.79 + if(U_SUCCESS(status)){ 1.80 + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 1.81 + /* test for NULL */ 1.82 + if(list == NULL) { 1.83 + status = U_MEMORY_ALLOCATION_ERROR; 1.84 + }else{ 1.85 + allocateStrings(status); 1.86 + applyPattern(pattern, pos, options, symbols, status); 1.87 + } 1.88 + } 1.89 + _dbgct(this); 1.90 +} 1.91 + 1.92 +//---------------------------------------------------------------- 1.93 +// Public API 1.94 +//---------------------------------------------------------------- 1.95 + 1.96 +UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 1.97 + uint32_t options, 1.98 + const SymbolTable* symbols, 1.99 + UErrorCode& status) { 1.100 + ParsePosition pos(0); 1.101 + applyPattern(pattern, pos, options, symbols, status); 1.102 + if (U_FAILURE(status)) return *this; 1.103 + 1.104 + int32_t i = pos.getIndex(); 1.105 + 1.106 + if (options & USET_IGNORE_SPACE) { 1.107 + // Skip over trailing whitespace 1.108 + ICU_Utility::skipWhitespace(pattern, i, TRUE); 1.109 + } 1.110 + 1.111 + if (i != pattern.length()) { 1.112 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.113 + } 1.114 + return *this; 1.115 +} 1.116 + 1.117 +UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 1.118 + ParsePosition& pos, 1.119 + uint32_t options, 1.120 + const SymbolTable* symbols, 1.121 + UErrorCode& status) { 1.122 + if (U_FAILURE(status)) { 1.123 + return *this; 1.124 + } 1.125 + if (isFrozen()) { 1.126 + status = U_NO_WRITE_PERMISSION; 1.127 + return *this; 1.128 + } 1.129 + // Need to build the pattern in a temporary string because 1.130 + // _applyPattern calls add() etc., which set pat to empty. 1.131 + UnicodeString rebuiltPat; 1.132 + RuleCharacterIterator chars(pattern, symbols, pos); 1.133 + applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status); 1.134 + if (U_FAILURE(status)) return *this; 1.135 + if (chars.inVariable()) { 1.136 + // syntaxError(chars, "Extra chars in variable value"); 1.137 + status = U_MALFORMED_SET; 1.138 + return *this; 1.139 + } 1.140 + setPattern(rebuiltPat); 1.141 + return *this; 1.142 +} 1.143 + 1.144 +// USetAdder implementation 1.145 +// Does not use uset.h to reduce code dependencies 1.146 +static void U_CALLCONV 1.147 +_set_add(USet *set, UChar32 c) { 1.148 + ((UnicodeSet *)set)->add(c); 1.149 +} 1.150 + 1.151 +static void U_CALLCONV 1.152 +_set_addRange(USet *set, UChar32 start, UChar32 end) { 1.153 + ((UnicodeSet *)set)->add(start, end); 1.154 +} 1.155 + 1.156 +static void U_CALLCONV 1.157 +_set_addString(USet *set, const UChar *str, int32_t length) { 1.158 + ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); 1.159 +} 1.160 + 1.161 +//---------------------------------------------------------------- 1.162 +// Case folding API 1.163 +//---------------------------------------------------------------- 1.164 + 1.165 +// add the result of a full case mapping to the set 1.166 +// use str as a temporary string to avoid constructing one 1.167 +static inline void 1.168 +addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) { 1.169 + if(result >= 0) { 1.170 + if(result > UCASE_MAX_STRING_LENGTH) { 1.171 + // add a single-code point case mapping 1.172 + set.add(result); 1.173 + } else { 1.174 + // add a string case mapping from full with length result 1.175 + str.setTo((UBool)FALSE, full, result); 1.176 + set.add(str); 1.177 + } 1.178 + } 1.179 + // result < 0: the code point mapped to itself, no need to add it 1.180 + // see ucase.h 1.181 +} 1.182 + 1.183 +UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { 1.184 + if (isFrozen() || isBogus()) { 1.185 + return *this; 1.186 + } 1.187 + if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) { 1.188 + const UCaseProps *csp = ucase_getSingleton(); 1.189 + { 1.190 + UnicodeSet foldSet(*this); 1.191 + UnicodeString str; 1.192 + USetAdder sa = { 1.193 + foldSet.toUSet(), 1.194 + _set_add, 1.195 + _set_addRange, 1.196 + _set_addString, 1.197 + NULL, // don't need remove() 1.198 + NULL // don't need removeRange() 1.199 + }; 1.200 + 1.201 + // start with input set to guarantee inclusion 1.202 + // USET_CASE: remove strings because the strings will actually be reduced (folded); 1.203 + // therefore, start with no strings and add only those needed 1.204 + if (attribute & USET_CASE_INSENSITIVE) { 1.205 + foldSet.strings->removeAllElements(); 1.206 + } 1.207 + 1.208 + int32_t n = getRangeCount(); 1.209 + UChar32 result; 1.210 + const UChar *full; 1.211 + int32_t locCache = 0; 1.212 + 1.213 + for (int32_t i=0; i<n; ++i) { 1.214 + UChar32 start = getRangeStart(i); 1.215 + UChar32 end = getRangeEnd(i); 1.216 + 1.217 + if (attribute & USET_CASE_INSENSITIVE) { 1.218 + // full case closure 1.219 + for (UChar32 cp=start; cp<=end; ++cp) { 1.220 + ucase_addCaseClosure(csp, cp, &sa); 1.221 + } 1.222 + } else { 1.223 + // add case mappings 1.224 + // (does not add long s for regular s, or Kelvin for k, for example) 1.225 + for (UChar32 cp=start; cp<=end; ++cp) { 1.226 + result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache); 1.227 + addCaseMapping(foldSet, result, full, str); 1.228 + 1.229 + result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache); 1.230 + addCaseMapping(foldSet, result, full, str); 1.231 + 1.232 + result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache); 1.233 + addCaseMapping(foldSet, result, full, str); 1.234 + 1.235 + result = ucase_toFullFolding(csp, cp, &full, 0); 1.236 + addCaseMapping(foldSet, result, full, str); 1.237 + } 1.238 + } 1.239 + } 1.240 + if (strings != NULL && strings->size() > 0) { 1.241 + if (attribute & USET_CASE_INSENSITIVE) { 1.242 + for (int32_t j=0; j<strings->size(); ++j) { 1.243 + str = *(const UnicodeString *) strings->elementAt(j); 1.244 + str.foldCase(); 1.245 + if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) { 1.246 + foldSet.add(str); // does not map to code points: add the folded string itself 1.247 + } 1.248 + } 1.249 + } else { 1.250 + Locale root(""); 1.251 +#if !UCONFIG_NO_BREAK_ITERATION 1.252 + UErrorCode status = U_ZERO_ERROR; 1.253 + BreakIterator *bi = BreakIterator::createWordInstance(root, status); 1.254 + if (U_SUCCESS(status)) { 1.255 +#endif 1.256 + const UnicodeString *pStr; 1.257 + 1.258 + for (int32_t j=0; j<strings->size(); ++j) { 1.259 + pStr = (const UnicodeString *) strings->elementAt(j); 1.260 + (str = *pStr).toLower(root); 1.261 + foldSet.add(str); 1.262 +#if !UCONFIG_NO_BREAK_ITERATION 1.263 + (str = *pStr).toTitle(bi, root); 1.264 + foldSet.add(str); 1.265 +#endif 1.266 + (str = *pStr).toUpper(root); 1.267 + foldSet.add(str); 1.268 + (str = *pStr).foldCase(); 1.269 + foldSet.add(str); 1.270 + } 1.271 +#if !UCONFIG_NO_BREAK_ITERATION 1.272 + } 1.273 + delete bi; 1.274 +#endif 1.275 + } 1.276 + } 1.277 + *this = foldSet; 1.278 + } 1.279 + } 1.280 + return *this; 1.281 +} 1.282 + 1.283 +U_NAMESPACE_END