michael@0: /* michael@0: ******************************************************************************* michael@0: * Copyright (C) 2010-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ******************************************************************************* michael@0: * file name: ucharstriebuilder.h michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2010nov14 michael@0: * created by: Markus W. Scherer michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/ucharstrie.h" michael@0: #include "unicode/ucharstriebuilder.h" michael@0: #include "unicode/unistr.h" michael@0: #include "unicode/ustring.h" michael@0: #include "cmemory.h" michael@0: #include "uarrsort.h" michael@0: #include "uassert.h" michael@0: #include "uhash.h" michael@0: #include "ustr_imp.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: /* michael@0: * Note: This builder implementation stores (string, value) pairs with full copies michael@0: * of the 16-bit-unit sequences, until the UCharsTrie is built. michael@0: * It might(!) take less memory if we collected the data in a temporary, dynamic trie. michael@0: */ michael@0: michael@0: class UCharsTrieElement : public UMemory { michael@0: public: michael@0: // Use compiler's default constructor, initializes nothing. michael@0: michael@0: void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode); michael@0: michael@0: UnicodeString getString(const UnicodeString &strings) const { michael@0: int32_t length=strings[stringOffset]; michael@0: return strings.tempSubString(stringOffset+1, length); michael@0: } michael@0: int32_t getStringLength(const UnicodeString &strings) const { michael@0: return strings[stringOffset]; michael@0: } michael@0: michael@0: UChar charAt(int32_t index, const UnicodeString &strings) const { michael@0: return strings[stringOffset+1+index]; michael@0: } michael@0: michael@0: int32_t getValue() const { return value; } michael@0: michael@0: int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const; michael@0: michael@0: private: michael@0: // The first strings unit contains the string length. michael@0: // (Compared with a stringLength field here, this saves 2 bytes per string.) michael@0: int32_t stringOffset; michael@0: int32_t value; michael@0: }; michael@0: michael@0: void michael@0: UCharsTrieElement::setTo(const UnicodeString &s, int32_t val, michael@0: UnicodeString &strings, UErrorCode &errorCode) { michael@0: if(U_FAILURE(errorCode)) { michael@0: return; michael@0: } michael@0: int32_t length=s.length(); michael@0: if(length>0xffff) { michael@0: // Too long: We store the length in 1 unit. michael@0: errorCode=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return; michael@0: } michael@0: stringOffset=strings.length(); michael@0: strings.append((UChar)length); michael@0: value=val; michael@0: strings.append(s); michael@0: } michael@0: michael@0: int32_t michael@0: UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const { michael@0: return getString(strings).compare(other.getString(strings)); michael@0: } michael@0: michael@0: UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/) michael@0: : elements(NULL), elementsCapacity(0), elementsLength(0), michael@0: uchars(NULL), ucharsCapacity(0), ucharsLength(0) {} michael@0: michael@0: UCharsTrieBuilder::~UCharsTrieBuilder() { michael@0: delete[] elements; michael@0: uprv_free(uchars); michael@0: } michael@0: michael@0: UCharsTrieBuilder & michael@0: UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) { michael@0: if(U_FAILURE(errorCode)) { michael@0: return *this; michael@0: } michael@0: if(ucharsLength>0) { michael@0: // Cannot add elements after building. michael@0: errorCode=U_NO_WRITE_PERMISSION; michael@0: return *this; michael@0: } michael@0: if(elementsLength==elementsCapacity) { michael@0: int32_t newCapacity; michael@0: if(elementsCapacity==0) { michael@0: newCapacity=1024; michael@0: } else { michael@0: newCapacity=4*elementsCapacity; michael@0: } michael@0: UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity]; michael@0: if(newElements==NULL) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return *this; michael@0: } michael@0: if(elementsLength>0) { michael@0: uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharsTrieElement)); michael@0: } michael@0: delete[] elements; michael@0: elements=newElements; michael@0: elementsCapacity=newCapacity; michael@0: } michael@0: elements[elementsLength++].setTo(s, value, strings, errorCode); michael@0: if(U_SUCCESS(errorCode) && strings.isBogus()) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: return *this; michael@0: } michael@0: michael@0: U_CDECL_BEGIN michael@0: michael@0: static int32_t U_CALLCONV michael@0: compareElementStrings(const void *context, const void *left, const void *right) { michael@0: const UnicodeString *strings=static_cast(context); michael@0: const UCharsTrieElement *leftElement=static_cast(left); michael@0: const UCharsTrieElement *rightElement=static_cast(right); michael@0: return leftElement->compareStringTo(*rightElement, *strings); michael@0: } michael@0: michael@0: U_CDECL_END michael@0: michael@0: UCharsTrie * michael@0: UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { michael@0: buildUChars(buildOption, errorCode); michael@0: UCharsTrie *newTrie=NULL; michael@0: if(U_SUCCESS(errorCode)) { michael@0: newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength)); michael@0: if(newTrie==NULL) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: } else { michael@0: uchars=NULL; // The new trie now owns the array. michael@0: ucharsCapacity=0; michael@0: } michael@0: } michael@0: return newTrie; michael@0: } michael@0: michael@0: UnicodeString & michael@0: UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result, michael@0: UErrorCode &errorCode) { michael@0: buildUChars(buildOption, errorCode); michael@0: if(U_SUCCESS(errorCode)) { michael@0: result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength); michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: void michael@0: UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { michael@0: if(U_FAILURE(errorCode)) { michael@0: return; michael@0: } michael@0: if(uchars!=NULL && ucharsLength>0) { michael@0: // Already built. michael@0: return; michael@0: } michael@0: if(ucharsLength==0) { michael@0: if(elementsLength==0) { michael@0: errorCode=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return; michael@0: } michael@0: if(strings.isBogus()) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement), michael@0: compareElementStrings, &strings, michael@0: FALSE, // need not be a stable sort michael@0: &errorCode); michael@0: if(U_FAILURE(errorCode)) { michael@0: return; michael@0: } michael@0: // Duplicate strings are not allowed. michael@0: UnicodeString prev=elements[0].getString(strings); michael@0: for(int32_t i=1; i(uprv_malloc(capacity*2)); michael@0: if(uchars==NULL) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: ucharsCapacity=0; michael@0: return; michael@0: } michael@0: ucharsCapacity=capacity; michael@0: } michael@0: StringTrieBuilder::build(buildOption, elementsLength, errorCode); michael@0: if(uchars==NULL) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: } michael@0: michael@0: int32_t michael@0: UCharsTrieBuilder::getElementStringLength(int32_t i) const { michael@0: return elements[i].getStringLength(strings); michael@0: } michael@0: michael@0: UChar michael@0: UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const { michael@0: return elements[i].charAt(unitIndex, strings); michael@0: } michael@0: michael@0: int32_t michael@0: UCharsTrieBuilder::getElementValue(int32_t i) const { michael@0: return elements[i].getValue(); michael@0: } michael@0: michael@0: int32_t michael@0: UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const { michael@0: const UCharsTrieElement &firstElement=elements[first]; michael@0: const UCharsTrieElement &lastElement=elements[last]; michael@0: int32_t minStringLength=firstElement.getStringLength(strings); michael@0: while(++unitIndex0); michael@0: return i; michael@0: } michael@0: michael@0: int32_t michael@0: UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const { michael@0: while(unit==elements[i].charAt(unitIndex, strings)) { michael@0: ++i; michael@0: } michael@0: return i; michael@0: } michael@0: michael@0: UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode) michael@0: : LinearMatchNode(len, nextNode), s(units) { michael@0: hash=hash*37+ustr_hashUCharsN(units, len); michael@0: } michael@0: michael@0: UBool michael@0: UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const { michael@0: if(this==&other) { michael@0: return TRUE; michael@0: } michael@0: if(!LinearMatchNode::operator==(other)) { michael@0: return FALSE; michael@0: } michael@0: const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other; michael@0: return 0==u_memcmp(s, o.s, length); michael@0: } michael@0: michael@0: void michael@0: UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) { michael@0: UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder; michael@0: next->write(builder); michael@0: b.write(s, length); michael@0: offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1); michael@0: } michael@0: michael@0: StringTrieBuilder::Node * michael@0: UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, michael@0: Node *nextNode) const { michael@0: return new UCTLinearMatchNode( michael@0: elements[i].getString(strings).getBuffer()+unitIndex, michael@0: length, michael@0: nextNode); michael@0: } michael@0: michael@0: UBool michael@0: UCharsTrieBuilder::ensureCapacity(int32_t length) { michael@0: if(uchars==NULL) { michael@0: return FALSE; // previous memory allocation had failed michael@0: } michael@0: if(length>ucharsCapacity) { michael@0: int32_t newCapacity=ucharsCapacity; michael@0: do { michael@0: newCapacity*=2; michael@0: } while(newCapacity<=length); michael@0: UChar *newUChars=static_cast(uprv_malloc(newCapacity*2)); michael@0: if(newUChars==NULL) { michael@0: // unable to allocate memory michael@0: uprv_free(uchars); michael@0: uchars=NULL; michael@0: ucharsCapacity=0; michael@0: return FALSE; michael@0: } michael@0: u_memcpy(newUChars+(newCapacity-ucharsLength), michael@0: uchars+(ucharsCapacity-ucharsLength), ucharsLength); michael@0: uprv_free(uchars); michael@0: uchars=newUChars; michael@0: ucharsCapacity=newCapacity; michael@0: } michael@0: return TRUE; michael@0: } michael@0: michael@0: int32_t michael@0: UCharsTrieBuilder::write(int32_t unit) { michael@0: int32_t newLength=ucharsLength+1; michael@0: if(ensureCapacity(newLength)) { michael@0: ucharsLength=newLength; michael@0: uchars[ucharsCapacity-ucharsLength]=(UChar)unit; michael@0: } michael@0: return ucharsLength; michael@0: } michael@0: michael@0: int32_t michael@0: UCharsTrieBuilder::write(const UChar *s, int32_t length) { michael@0: int32_t newLength=ucharsLength+length; michael@0: if(ensureCapacity(newLength)) { michael@0: ucharsLength=newLength; michael@0: u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length); michael@0: } michael@0: return ucharsLength; michael@0: } michael@0: michael@0: int32_t michael@0: UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) { michael@0: return write(elements[i].getString(strings).getBuffer()+unitIndex, length); michael@0: } michael@0: michael@0: int32_t michael@0: UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { michael@0: if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) { michael@0: return write(i|(isFinal<<15)); michael@0: } michael@0: UChar intUnits[3]; michael@0: int32_t length; michael@0: if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) { michael@0: intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead); michael@0: intUnits[1]=(UChar)((uint32_t)i>>16); michael@0: intUnits[2]=(UChar)i; michael@0: length=3; michael@0: // } else if(i<=UCharsTrie::kMaxOneUnitValue) { michael@0: // intUnits[0]=(UChar)(i); michael@0: // length=1; michael@0: } else { michael@0: intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16)); michael@0: intUnits[1]=(UChar)i; michael@0: length=2; michael@0: } michael@0: intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15)); michael@0: return write(intUnits, length); michael@0: } michael@0: michael@0: int32_t michael@0: UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { michael@0: if(!hasValue) { michael@0: return write(node); michael@0: } michael@0: UChar intUnits[3]; michael@0: int32_t length; michael@0: if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) { michael@0: intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead); michael@0: intUnits[1]=(UChar)((uint32_t)value>>16); michael@0: intUnits[2]=(UChar)value; michael@0: length=3; michael@0: } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) { michael@0: intUnits[0]=(UChar)((value+1)<<6); michael@0: length=1; michael@0: } else { michael@0: intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0)); michael@0: intUnits[1]=(UChar)value; michael@0: length=2; michael@0: } michael@0: intUnits[0]|=(UChar)node; michael@0: return write(intUnits, length); michael@0: } michael@0: michael@0: int32_t michael@0: UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) { michael@0: int32_t i=ucharsLength-jumpTarget; michael@0: U_ASSERT(i>=0); michael@0: if(i<=UCharsTrie::kMaxOneUnitDelta) { michael@0: return write(i); michael@0: } michael@0: UChar intUnits[3]; michael@0: int32_t length; michael@0: if(i<=UCharsTrie::kMaxTwoUnitDelta) { michael@0: intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16)); michael@0: length=1; michael@0: } else { michael@0: intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead); michael@0: intUnits[1]=(UChar)(i>>16); michael@0: length=2; michael@0: } michael@0: intUnits[length++]=(UChar)i; michael@0: return write(intUnits, length); michael@0: } michael@0: michael@0: U_NAMESPACE_END