michael@0: /* michael@0: ******************************************************************************* michael@0: * Copyright (C) 2010-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ******************************************************************************* michael@0: * file name: stringtriebuilder.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2010dec24 michael@0: * created by: Markus W. Scherer michael@0: */ michael@0: michael@0: #include "utypeinfo.h" // for 'typeid' to work michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/stringtriebuilder.h" michael@0: #include "uassert.h" michael@0: #include "uhash.h" michael@0: michael@0: U_CDECL_BEGIN michael@0: michael@0: static int32_t U_CALLCONV michael@0: hashStringTrieNode(const UHashTok key) { michael@0: return icu::StringTrieBuilder::hashNode(key.pointer); michael@0: } michael@0: michael@0: static UBool U_CALLCONV michael@0: equalStringTrieNodes(const UHashTok key1, const UHashTok key2) { michael@0: return icu::StringTrieBuilder::equalNodes(key1.pointer, key2.pointer); michael@0: } michael@0: michael@0: U_CDECL_END michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: StringTrieBuilder::StringTrieBuilder() : nodes(NULL) {} michael@0: michael@0: StringTrieBuilder::~StringTrieBuilder() { michael@0: deleteCompactBuilder(); michael@0: } michael@0: michael@0: void michael@0: StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) { michael@0: if(U_FAILURE(errorCode)) { michael@0: return; michael@0: } michael@0: nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, NULL, michael@0: sizeGuess, &errorCode); michael@0: if(U_SUCCESS(errorCode)) { michael@0: if(nodes==NULL) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: } else { michael@0: uhash_setKeyDeleter(nodes, uprv_deleteUObject); michael@0: } michael@0: } michael@0: } michael@0: michael@0: void michael@0: StringTrieBuilder::deleteCompactBuilder() { michael@0: uhash_close(nodes); michael@0: nodes=NULL; michael@0: } michael@0: michael@0: void michael@0: StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength, michael@0: UErrorCode &errorCode) { michael@0: if(buildOption==USTRINGTRIE_BUILD_FAST) { michael@0: writeNode(0, elementsLength, 0); michael@0: } else /* USTRINGTRIE_BUILD_SMALL */ { michael@0: createCompactBuilder(2*elementsLength, errorCode); michael@0: Node *root=makeNode(0, elementsLength, 0, errorCode); michael@0: if(U_SUCCESS(errorCode)) { michael@0: root->markRightEdgesFirst(-1); michael@0: root->write(*this); michael@0: } michael@0: deleteCompactBuilder(); michael@0: } michael@0: } michael@0: michael@0: // Requires startmaxLinearMatchLength) { michael@0: lastUnitIndex-=maxLinearMatchLength; michael@0: length-=maxLinearMatchLength; michael@0: writeElementUnits(start, lastUnitIndex, maxLinearMatchLength); michael@0: write(getMinLinearMatch()+maxLinearMatchLength-1); michael@0: } michael@0: writeElementUnits(start, unitIndex, length); michael@0: type=getMinLinearMatch()+length-1; michael@0: } else { michael@0: // Branch node. michael@0: int32_t length=countElementUnits(start, limit, unitIndex); michael@0: // length>=2 because minUnit!=maxUnit. michael@0: writeBranchSubNode(start, limit, unitIndex, length); michael@0: if(--lengthgetMaxBranchLinearSubNodeLength()) { michael@0: // Branch on the middle unit. michael@0: // First, find the middle unit. michael@0: int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); michael@0: // Encode the less-than branch first. michael@0: middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit michael@0: lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2); michael@0: ++ltLength; michael@0: // Continue for the greater-or-equal branch. michael@0: start=i; michael@0: length=length-length/2; michael@0: } michael@0: // For each unit, find its elements array start and whether it has a final value. michael@0: int32_t starts[kMaxBranchLinearSubNodeLength]; michael@0: UBool isFinal[kMaxBranchLinearSubNodeLength-1]; michael@0: int32_t unitNumber=0; michael@0: do { michael@0: int32_t i=starts[unitNumber]=start; michael@0: UChar unit=getElementUnit(i++, unitIndex); michael@0: i=indexOfElementWithNextUnit(i, unitIndex, unit); michael@0: isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start); michael@0: start=i; michael@0: } while(++unitNumber0); michael@0: // The maxUnit sub-node is written as the very last one because we do michael@0: // not jump for it at all. michael@0: unitNumber=length-1; michael@0: writeNode(start, limit, unitIndex+1); michael@0: int32_t offset=write(getElementUnit(start, unitIndex)); michael@0: // Write the rest of this node's unit-value pairs. michael@0: while(--unitNumber>=0) { michael@0: start=starts[unitNumber]; michael@0: int32_t value; michael@0: if(isFinal[unitNumber]) { michael@0: // Write the final value for the one string ending with this unit. michael@0: value=getElementValue(start); michael@0: } else { michael@0: // Write the delta to the start position of the sub-node. michael@0: value=offset-jumpTargets[unitNumber]; michael@0: } michael@0: writeValueAndFinal(value, isFinal[unitNumber]); michael@0: offset=write(getElementUnit(start, unitIndex)); michael@0: } michael@0: // Write the split-branch nodes. michael@0: while(ltLength>0) { michael@0: --ltLength; michael@0: writeDeltaTo(lessThan[ltLength]); michael@0: offset=write(middleUnits[ltLength]); michael@0: } michael@0: return offset; michael@0: } michael@0: michael@0: // Requires startmaxLinearMatchLength) { michael@0: lastUnitIndex-=maxLinearMatchLength; michael@0: length-=maxLinearMatchLength; michael@0: node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode); michael@0: nextNode=registerNode(node, errorCode); michael@0: } michael@0: node=createLinearMatchNode(start, unitIndex, length, nextNode); michael@0: } else { michael@0: // Branch node. michael@0: int32_t length=countElementUnits(start, limit, unitIndex); michael@0: // length>=2 because minUnit!=maxUnit. michael@0: Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode); michael@0: node=new BranchHeadNode(length, subNode); michael@0: } michael@0: if(hasValue && node!=NULL) { michael@0: if(matchNodesCanHaveValues()) { michael@0: ((ValueNode *)node)->setValue(value); michael@0: } else { michael@0: node=new IntermediateValueNode(value, registerNode(node, errorCode)); michael@0: } michael@0: } michael@0: return registerNode(node, errorCode); michael@0: } michael@0: michael@0: // startgetMaxBranchLinearSubNodeLength()) { michael@0: // Branch on the middle unit. michael@0: // First, find the middle unit. michael@0: int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); michael@0: // Create the less-than branch. michael@0: middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit michael@0: lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode); michael@0: ++ltLength; michael@0: // Continue for the greater-or-equal branch. michael@0: start=i; michael@0: length=length-length/2; michael@0: } michael@0: if(U_FAILURE(errorCode)) { michael@0: return NULL; michael@0: } michael@0: ListBranchNode *listNode=new ListBranchNode(); michael@0: if(listNode==NULL) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: // For each unit, find its elements array start and whether it has a final value. michael@0: int32_t unitNumber=0; michael@0: do { michael@0: int32_t i=start; michael@0: UChar unit=getElementUnit(i++, unitIndex); michael@0: i=indexOfElementWithNextUnit(i, unitIndex, unit); michael@0: if(start==i-1 && unitIndex+1==getElementStringLength(start)) { michael@0: listNode->add(unit, getElementValue(start)); michael@0: } else { michael@0: listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode)); michael@0: } michael@0: start=i; michael@0: } while(++unitNumberadd(unit, getElementValue(start)); michael@0: } else { michael@0: listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode)); michael@0: } michael@0: Node *node=registerNode(listNode, errorCode); michael@0: // Create the split-branch nodes. michael@0: while(ltLength>0) { michael@0: --ltLength; michael@0: node=registerNode( michael@0: new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode); michael@0: } michael@0: return node; michael@0: } michael@0: michael@0: StringTrieBuilder::Node * michael@0: StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) { michael@0: if(U_FAILURE(errorCode)) { michael@0: delete newNode; michael@0: return NULL; michael@0: } michael@0: if(newNode==NULL) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: const UHashElement *old=uhash_find(nodes, newNode); michael@0: if(old!=NULL) { michael@0: delete newNode; michael@0: return (Node *)old->key.pointer; michael@0: } michael@0: // If uhash_puti() returns a non-zero value from an equivalent, previously michael@0: // registered node, then uhash_find() failed to find that and we will leak newNode. michael@0: #if U_DEBUG michael@0: int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. michael@0: #endif michael@0: uhash_puti(nodes, newNode, 1, &errorCode); michael@0: U_ASSERT(oldValue==0); michael@0: if(U_FAILURE(errorCode)) { michael@0: delete newNode; michael@0: return NULL; michael@0: } michael@0: return newNode; michael@0: } michael@0: michael@0: StringTrieBuilder::Node * michael@0: StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) { michael@0: if(U_FAILURE(errorCode)) { michael@0: return NULL; michael@0: } michael@0: FinalValueNode key(value); michael@0: const UHashElement *old=uhash_find(nodes, &key); michael@0: if(old!=NULL) { michael@0: return (Node *)old->key.pointer; michael@0: } michael@0: Node *newNode=new FinalValueNode(value); michael@0: if(newNode==NULL) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return NULL; michael@0: } michael@0: // If uhash_puti() returns a non-zero value from an equivalent, previously michael@0: // registered node, then uhash_find() failed to find that and we will leak newNode. michael@0: #if U_DEBUG michael@0: int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. michael@0: #endif michael@0: uhash_puti(nodes, newNode, 1, &errorCode); michael@0: U_ASSERT(oldValue==0); michael@0: if(U_FAILURE(errorCode)) { michael@0: delete newNode; michael@0: return NULL; michael@0: } michael@0: return newNode; michael@0: } michael@0: michael@0: UBool michael@0: StringTrieBuilder::hashNode(const void *node) { michael@0: return ((const Node *)node)->hashCode(); michael@0: } michael@0: michael@0: UBool michael@0: StringTrieBuilder::equalNodes(const void *left, const void *right) { michael@0: return *(const Node *)left==*(const Node *)right; michael@0: } michael@0: michael@0: UBool michael@0: StringTrieBuilder::Node::operator==(const Node &other) const { michael@0: return this==&other || (typeid(*this)==typeid(other) && hash==other.hash); michael@0: } michael@0: michael@0: int32_t michael@0: StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) { michael@0: if(offset==0) { michael@0: offset=edgeNumber; michael@0: } michael@0: return edgeNumber; michael@0: } michael@0: michael@0: UBool michael@0: StringTrieBuilder::FinalValueNode::operator==(const Node &other) const { michael@0: if(this==&other) { michael@0: return TRUE; michael@0: } michael@0: if(!Node::operator==(other)) { michael@0: return FALSE; michael@0: } michael@0: const FinalValueNode &o=(const FinalValueNode &)other; michael@0: return value==o.value; michael@0: } michael@0: michael@0: void michael@0: StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) { michael@0: offset=builder.writeValueAndFinal(value, TRUE); michael@0: } michael@0: michael@0: UBool michael@0: StringTrieBuilder::ValueNode::operator==(const Node &other) const { michael@0: if(this==&other) { michael@0: return TRUE; michael@0: } michael@0: if(!Node::operator==(other)) { michael@0: return FALSE; michael@0: } michael@0: const ValueNode &o=(const ValueNode &)other; michael@0: return hasValue==o.hasValue && (!hasValue || value==o.value); michael@0: } michael@0: michael@0: UBool michael@0: StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const { michael@0: if(this==&other) { michael@0: return TRUE; michael@0: } michael@0: if(!ValueNode::operator==(other)) { michael@0: return FALSE; michael@0: } michael@0: const IntermediateValueNode &o=(const IntermediateValueNode &)other; michael@0: return next==o.next; michael@0: } michael@0: michael@0: int32_t michael@0: StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) { michael@0: if(offset==0) { michael@0: offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); michael@0: } michael@0: return edgeNumber; michael@0: } michael@0: michael@0: void michael@0: StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) { michael@0: next->write(builder); michael@0: offset=builder.writeValueAndFinal(value, FALSE); michael@0: } michael@0: michael@0: UBool michael@0: StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const { michael@0: if(this==&other) { michael@0: return TRUE; michael@0: } michael@0: if(!ValueNode::operator==(other)) { michael@0: return FALSE; michael@0: } michael@0: const LinearMatchNode &o=(const LinearMatchNode &)other; michael@0: return length==o.length && next==o.next; michael@0: } michael@0: michael@0: int32_t michael@0: StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) { michael@0: if(offset==0) { michael@0: offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); michael@0: } michael@0: return edgeNumber; michael@0: } michael@0: michael@0: UBool michael@0: StringTrieBuilder::ListBranchNode::operator==(const Node &other) const { michael@0: if(this==&other) { michael@0: return TRUE; michael@0: } michael@0: if(!Node::operator==(other)) { michael@0: return FALSE; michael@0: } michael@0: const ListBranchNode &o=(const ListBranchNode &)other; michael@0: for(int32_t i=0; imarkRightEdgesFirst(edgeNumber-step); michael@0: } michael@0: // For all but the rightmost edge, decrement the edge number. michael@0: step=1; michael@0: } while(i>0); michael@0: offset=edgeNumber; michael@0: } michael@0: return edgeNumber; michael@0: } michael@0: michael@0: void michael@0: StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) { michael@0: // Write the sub-nodes in reverse order: The jump lengths are deltas from michael@0: // after their own positions, so if we wrote the minUnit sub-node first, michael@0: // then its jump delta would be larger. michael@0: // Instead we write the minUnit sub-node last, for a shorter delta. michael@0: int32_t unitNumber=length-1; michael@0: Node *rightEdge=equal[unitNumber]; michael@0: int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset(); michael@0: do { michael@0: --unitNumber; michael@0: if(equal[unitNumber]!=NULL) { michael@0: equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder); michael@0: } michael@0: } while(unitNumber>0); michael@0: // The maxUnit sub-node is written as the very last one because we do michael@0: // not jump for it at all. michael@0: unitNumber=length-1; michael@0: if(rightEdge==NULL) { michael@0: builder.writeValueAndFinal(values[unitNumber], TRUE); michael@0: } else { michael@0: rightEdge->write(builder); michael@0: } michael@0: offset=builder.write(units[unitNumber]); michael@0: // Write the rest of this node's unit-value pairs. michael@0: while(--unitNumber>=0) { michael@0: int32_t value; michael@0: UBool isFinal; michael@0: if(equal[unitNumber]==NULL) { michael@0: // Write the final value for the one string ending with this unit. michael@0: value=values[unitNumber]; michael@0: isFinal=TRUE; michael@0: } else { michael@0: // Write the delta to the start position of the sub-node. michael@0: U_ASSERT(equal[unitNumber]->getOffset()>0); michael@0: value=offset-equal[unitNumber]->getOffset(); michael@0: isFinal=FALSE; michael@0: } michael@0: builder.writeValueAndFinal(value, isFinal); michael@0: offset=builder.write(units[unitNumber]); michael@0: } michael@0: } michael@0: michael@0: UBool michael@0: StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const { michael@0: if(this==&other) { michael@0: return TRUE; michael@0: } michael@0: if(!Node::operator==(other)) { michael@0: return FALSE; michael@0: } michael@0: const SplitBranchNode &o=(const SplitBranchNode &)other; michael@0: return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual; michael@0: } michael@0: michael@0: int32_t michael@0: StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) { michael@0: if(offset==0) { michael@0: firstEdgeNumber=edgeNumber; michael@0: edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber); michael@0: offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1); michael@0: } michael@0: return edgeNumber; michael@0: } michael@0: michael@0: void michael@0: StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) { michael@0: // Encode the less-than branch first. michael@0: lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder); michael@0: // Encode the greater-or-equal branch last because we do not jump for it at all. michael@0: greaterOrEqual->write(builder); michael@0: // Write this node. michael@0: U_ASSERT(lessThan->getOffset()>0); michael@0: builder.writeDeltaTo(lessThan->getOffset()); // less-than michael@0: offset=builder.write(unit); michael@0: } michael@0: michael@0: UBool michael@0: StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const { michael@0: if(this==&other) { michael@0: return TRUE; michael@0: } michael@0: if(!ValueNode::operator==(other)) { michael@0: return FALSE; michael@0: } michael@0: const BranchHeadNode &o=(const BranchHeadNode &)other; michael@0: return length==o.length && next==o.next; michael@0: } michael@0: michael@0: int32_t michael@0: StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) { michael@0: if(offset==0) { michael@0: offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); michael@0: } michael@0: return edgeNumber; michael@0: } michael@0: michael@0: void michael@0: StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) { michael@0: next->write(builder); michael@0: if(length<=builder.getMinLinearMatch()) { michael@0: offset=builder.writeValueAndType(hasValue, value, length-1); michael@0: } else { michael@0: builder.write(length-1); michael@0: offset=builder.writeValueAndType(hasValue, value, 0); michael@0: } michael@0: } michael@0: michael@0: U_NAMESPACE_END