michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2011, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: uniset_closure.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2011may30 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * UnicodeSet::closeOver() and related methods moved here from uniset_props.cpp michael@0: * to simplify dependencies. michael@0: * In particular, this depends on the BreakIterator, but the BreakIterator michael@0: * code also builds UnicodeSets from patterns and needs uniset_props. michael@0: */ michael@0: michael@0: #include "unicode/brkiter.h" michael@0: #include "unicode/locid.h" michael@0: #include "unicode/parsepos.h" michael@0: #include "unicode/uniset.h" michael@0: #include "cmemory.h" michael@0: #include "ruleiter.h" michael@0: #include "ucase.h" michael@0: #include "util.h" michael@0: #include "uvector.h" michael@0: michael@0: // initial storage. Must be >= 0 michael@0: // *** same as in uniset.cpp ! *** michael@0: #define START_EXTRA 16 michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: // TODO memory debugging provided inside uniset.cpp michael@0: // could be made available here but probably obsolete with use of modern michael@0: // memory leak checker tools michael@0: #define _dbgct(me) michael@0: michael@0: //---------------------------------------------------------------- michael@0: // Constructors &c michael@0: //---------------------------------------------------------------- michael@0: michael@0: UnicodeSet::UnicodeSet(const UnicodeString& pattern, michael@0: uint32_t options, michael@0: const SymbolTable* symbols, michael@0: UErrorCode& status) : michael@0: len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), michael@0: bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), michael@0: fFlags(0) michael@0: { michael@0: if(U_SUCCESS(status)){ michael@0: list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); michael@0: /* test for NULL */ michael@0: if(list == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: }else{ michael@0: allocateStrings(status); michael@0: applyPattern(pattern, options, symbols, status); michael@0: } michael@0: } michael@0: _dbgct(this); michael@0: } michael@0: michael@0: UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, michael@0: uint32_t options, michael@0: const SymbolTable* symbols, michael@0: UErrorCode& status) : michael@0: len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), michael@0: bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), michael@0: fFlags(0) michael@0: { michael@0: if(U_SUCCESS(status)){ michael@0: list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); michael@0: /* test for NULL */ michael@0: if(list == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: }else{ michael@0: allocateStrings(status); michael@0: applyPattern(pattern, pos, options, symbols, status); michael@0: } michael@0: } michael@0: _dbgct(this); michael@0: } michael@0: michael@0: //---------------------------------------------------------------- michael@0: // Public API michael@0: //---------------------------------------------------------------- michael@0: michael@0: UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, michael@0: uint32_t options, michael@0: const SymbolTable* symbols, michael@0: UErrorCode& status) { michael@0: ParsePosition pos(0); michael@0: applyPattern(pattern, pos, options, symbols, status); michael@0: if (U_FAILURE(status)) return *this; michael@0: michael@0: int32_t i = pos.getIndex(); michael@0: michael@0: if (options & USET_IGNORE_SPACE) { michael@0: // Skip over trailing whitespace michael@0: ICU_Utility::skipWhitespace(pattern, i, TRUE); michael@0: } michael@0: michael@0: if (i != pattern.length()) { michael@0: status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: } michael@0: return *this; michael@0: } michael@0: michael@0: UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, michael@0: ParsePosition& pos, michael@0: uint32_t options, michael@0: const SymbolTable* symbols, michael@0: UErrorCode& status) { michael@0: if (U_FAILURE(status)) { michael@0: return *this; michael@0: } michael@0: if (isFrozen()) { michael@0: status = U_NO_WRITE_PERMISSION; michael@0: return *this; michael@0: } michael@0: // Need to build the pattern in a temporary string because michael@0: // _applyPattern calls add() etc., which set pat to empty. michael@0: UnicodeString rebuiltPat; michael@0: RuleCharacterIterator chars(pattern, symbols, pos); michael@0: applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status); michael@0: if (U_FAILURE(status)) return *this; michael@0: if (chars.inVariable()) { michael@0: // syntaxError(chars, "Extra chars in variable value"); michael@0: status = U_MALFORMED_SET; michael@0: return *this; michael@0: } michael@0: setPattern(rebuiltPat); michael@0: return *this; michael@0: } michael@0: michael@0: // USetAdder implementation michael@0: // Does not use uset.h to reduce code dependencies michael@0: static void U_CALLCONV michael@0: _set_add(USet *set, UChar32 c) { michael@0: ((UnicodeSet *)set)->add(c); michael@0: } michael@0: michael@0: static void U_CALLCONV michael@0: _set_addRange(USet *set, UChar32 start, UChar32 end) { michael@0: ((UnicodeSet *)set)->add(start, end); michael@0: } michael@0: michael@0: static void U_CALLCONV michael@0: _set_addString(USet *set, const UChar *str, int32_t length) { michael@0: ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); michael@0: } michael@0: michael@0: //---------------------------------------------------------------- michael@0: // Case folding API michael@0: //---------------------------------------------------------------- michael@0: michael@0: // add the result of a full case mapping to the set michael@0: // use str as a temporary string to avoid constructing one michael@0: static inline void michael@0: addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) { michael@0: if(result >= 0) { michael@0: if(result > UCASE_MAX_STRING_LENGTH) { michael@0: // add a single-code point case mapping michael@0: set.add(result); michael@0: } else { michael@0: // add a string case mapping from full with length result michael@0: str.setTo((UBool)FALSE, full, result); michael@0: set.add(str); michael@0: } michael@0: } michael@0: // result < 0: the code point mapped to itself, no need to add it michael@0: // see ucase.h michael@0: } michael@0: michael@0: UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { michael@0: if (isFrozen() || isBogus()) { michael@0: return *this; michael@0: } michael@0: if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) { michael@0: const UCaseProps *csp = ucase_getSingleton(); michael@0: { michael@0: UnicodeSet foldSet(*this); michael@0: UnicodeString str; michael@0: USetAdder sa = { michael@0: foldSet.toUSet(), michael@0: _set_add, michael@0: _set_addRange, michael@0: _set_addString, michael@0: NULL, // don't need remove() michael@0: NULL // don't need removeRange() michael@0: }; michael@0: michael@0: // start with input set to guarantee inclusion michael@0: // USET_CASE: remove strings because the strings will actually be reduced (folded); michael@0: // therefore, start with no strings and add only those needed michael@0: if (attribute & USET_CASE_INSENSITIVE) { michael@0: foldSet.strings->removeAllElements(); michael@0: } michael@0: michael@0: int32_t n = getRangeCount(); michael@0: UChar32 result; michael@0: const UChar *full; michael@0: int32_t locCache = 0; michael@0: michael@0: for (int32_t i=0; isize() > 0) { michael@0: if (attribute & USET_CASE_INSENSITIVE) { michael@0: for (int32_t j=0; jsize(); ++j) { michael@0: str = *(const UnicodeString *) strings->elementAt(j); michael@0: str.foldCase(); michael@0: if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) { michael@0: foldSet.add(str); // does not map to code points: add the folded string itself michael@0: } michael@0: } michael@0: } else { michael@0: Locale root(""); michael@0: #if !UCONFIG_NO_BREAK_ITERATION michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: BreakIterator *bi = BreakIterator::createWordInstance(root, status); michael@0: if (U_SUCCESS(status)) { michael@0: #endif michael@0: const UnicodeString *pStr; michael@0: michael@0: for (int32_t j=0; jsize(); ++j) { michael@0: pStr = (const UnicodeString *) strings->elementAt(j); michael@0: (str = *pStr).toLower(root); michael@0: foldSet.add(str); michael@0: #if !UCONFIG_NO_BREAK_ITERATION michael@0: (str = *pStr).toTitle(bi, root); michael@0: foldSet.add(str); michael@0: #endif michael@0: (str = *pStr).toUpper(root); michael@0: foldSet.add(str); michael@0: (str = *pStr).foldCase(); michael@0: foldSet.add(str); michael@0: } michael@0: #if !UCONFIG_NO_BREAK_ITERATION michael@0: } michael@0: delete bi; michael@0: #endif michael@0: } michael@0: } michael@0: *this = foldSet; michael@0: } michael@0: } michael@0: return *this; michael@0: } michael@0: michael@0: U_NAMESPACE_END