michael@0: /* michael@0: ******************************************************************************* michael@0: * Copyright (C) 2010-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ******************************************************************************* michael@0: * file name: bytestriebuilder.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2010sep25 michael@0: * created by: Markus W. Scherer michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/bytestrie.h" michael@0: #include "unicode/bytestriebuilder.h" michael@0: #include "unicode/stringpiece.h" michael@0: #include "charstr.h" michael@0: #include "cmemory.h" michael@0: #include "uhash.h" michael@0: #include "uarrsort.h" michael@0: #include "uassert.h" michael@0: #include "ustr_imp.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: /* michael@0: * Note: This builder implementation stores (bytes, value) pairs with full copies michael@0: * of the byte sequences, until the BytesTrie is built. michael@0: * It might(!) take less memory if we collected the data in a temporary, dynamic trie. michael@0: */ michael@0: michael@0: class BytesTrieElement : public UMemory { michael@0: public: michael@0: // Use compiler's default constructor, initializes nothing. michael@0: michael@0: void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode); michael@0: michael@0: StringPiece getString(const CharString &strings) const { michael@0: int32_t offset=stringOffset; michael@0: int32_t length; michael@0: if(offset>=0) { michael@0: length=(uint8_t)strings[offset++]; michael@0: } else { michael@0: offset=~offset; michael@0: length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; michael@0: offset+=2; michael@0: } michael@0: return StringPiece(strings.data()+offset, length); michael@0: } michael@0: int32_t getStringLength(const CharString &strings) const { michael@0: int32_t offset=stringOffset; michael@0: if(offset>=0) { michael@0: return (uint8_t)strings[offset]; michael@0: } else { michael@0: offset=~offset; michael@0: return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; michael@0: } michael@0: } michael@0: michael@0: char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; } michael@0: michael@0: int32_t getValue() const { return value; } michael@0: michael@0: int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const; michael@0: michael@0: private: michael@0: const char *data(const CharString &strings) const { michael@0: int32_t offset=stringOffset; michael@0: if(offset>=0) { michael@0: ++offset; michael@0: } else { michael@0: offset=~offset+2; michael@0: } michael@0: return strings.data()+offset; michael@0: } michael@0: michael@0: // If the stringOffset is non-negative, then the first strings byte contains michael@0: // the string length. michael@0: // If the stringOffset is negative, then the first two strings bytes contain michael@0: // the string length (big-endian), and the offset needs to be bit-inverted. michael@0: // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.) michael@0: int32_t stringOffset; michael@0: int32_t value; michael@0: }; michael@0: michael@0: void michael@0: BytesTrieElement::setTo(const StringPiece &s, int32_t val, michael@0: CharString &strings, UErrorCode &errorCode) { michael@0: if(U_FAILURE(errorCode)) { michael@0: return; michael@0: } michael@0: int32_t length=s.length(); michael@0: if(length>0xffff) { michael@0: // Too long: We store the length in 1 or 2 bytes. michael@0: errorCode=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return; michael@0: } michael@0: int32_t offset=strings.length(); michael@0: if(length>0xff) { michael@0: offset=~offset; michael@0: strings.append((char)(length>>8), errorCode); michael@0: } michael@0: strings.append((char)length, errorCode); michael@0: stringOffset=offset; michael@0: value=val; michael@0: strings.append(s, errorCode); michael@0: } michael@0: michael@0: int32_t michael@0: BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const { michael@0: // TODO: add StringPiece::compare(), see ticket #8187 michael@0: StringPiece thisString=getString(strings); michael@0: StringPiece otherString=other.getString(strings); michael@0: int32_t lengthDiff=thisString.length()-otherString.length(); michael@0: int32_t commonLength; michael@0: if(lengthDiff<=0) { michael@0: commonLength=thisString.length(); michael@0: } else { michael@0: commonLength=otherString.length(); michael@0: } michael@0: int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength); michael@0: return diff!=0 ? diff : lengthDiff; michael@0: } michael@0: michael@0: BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode) michael@0: : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0), michael@0: bytes(NULL), bytesCapacity(0), bytesLength(0) { michael@0: if(U_FAILURE(errorCode)) { michael@0: return; michael@0: } michael@0: strings=new CharString(); michael@0: if(strings==NULL) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: } michael@0: michael@0: BytesTrieBuilder::~BytesTrieBuilder() { michael@0: delete strings; michael@0: delete[] elements; michael@0: uprv_free(bytes); michael@0: } michael@0: michael@0: BytesTrieBuilder & michael@0: BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) { michael@0: if(U_FAILURE(errorCode)) { michael@0: return *this; michael@0: } michael@0: if(bytesLength>0) { michael@0: // Cannot add elements after building. michael@0: errorCode=U_NO_WRITE_PERMISSION; michael@0: return *this; michael@0: } michael@0: if(elementsLength==elementsCapacity) { michael@0: int32_t newCapacity; michael@0: if(elementsCapacity==0) { michael@0: newCapacity=1024; michael@0: } else { michael@0: newCapacity=4*elementsCapacity; michael@0: } michael@0: BytesTrieElement *newElements=new BytesTrieElement[newCapacity]; michael@0: if(newElements==NULL) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: return *this; // error instead of dereferencing null michael@0: } michael@0: if(elementsLength>0) { michael@0: uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement)); michael@0: } michael@0: delete[] elements; michael@0: elements=newElements; michael@0: elementsCapacity=newCapacity; michael@0: } michael@0: elements[elementsLength++].setTo(s, value, *strings, errorCode); michael@0: return *this; michael@0: } michael@0: michael@0: U_CDECL_BEGIN michael@0: michael@0: static int32_t U_CALLCONV michael@0: compareElementStrings(const void *context, const void *left, const void *right) { michael@0: const CharString *strings=static_cast(context); michael@0: const BytesTrieElement *leftElement=static_cast(left); michael@0: const BytesTrieElement *rightElement=static_cast(right); michael@0: return leftElement->compareStringTo(*rightElement, *strings); michael@0: } michael@0: michael@0: U_CDECL_END michael@0: michael@0: BytesTrie * michael@0: BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { michael@0: buildBytes(buildOption, errorCode); michael@0: BytesTrie *newTrie=NULL; michael@0: if(U_SUCCESS(errorCode)) { michael@0: newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength)); michael@0: if(newTrie==NULL) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: } else { michael@0: bytes=NULL; // The new trie now owns the array. michael@0: bytesCapacity=0; michael@0: } michael@0: } michael@0: return newTrie; michael@0: } michael@0: michael@0: StringPiece michael@0: BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { michael@0: buildBytes(buildOption, errorCode); michael@0: StringPiece result; michael@0: if(U_SUCCESS(errorCode)) { michael@0: result.set(bytes+(bytesCapacity-bytesLength), bytesLength); michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: void michael@0: BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { michael@0: if(U_FAILURE(errorCode)) { michael@0: return; michael@0: } michael@0: if(bytes!=NULL && bytesLength>0) { michael@0: // Already built. michael@0: return; michael@0: } michael@0: if(bytesLength==0) { michael@0: if(elementsLength==0) { michael@0: errorCode=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return; michael@0: } michael@0: uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement), michael@0: compareElementStrings, strings, michael@0: FALSE, // need not be a stable sort michael@0: &errorCode); michael@0: if(U_FAILURE(errorCode)) { michael@0: return; michael@0: } michael@0: // Duplicate strings are not allowed. michael@0: StringPiece prev=elements[0].getString(*strings); michael@0: for(int32_t i=1; ilength(); michael@0: if(capacity<1024) { michael@0: capacity=1024; michael@0: } michael@0: if(bytesCapacity(uprv_malloc(capacity)); michael@0: if(bytes==NULL) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: bytesCapacity=0; michael@0: return; michael@0: } michael@0: bytesCapacity=capacity; michael@0: } michael@0: StringTrieBuilder::build(buildOption, elementsLength, errorCode); michael@0: if(bytes==NULL) { michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: } michael@0: michael@0: BytesTrieBuilder & michael@0: BytesTrieBuilder::clear() { michael@0: strings->clear(); michael@0: elementsLength=0; michael@0: bytesLength=0; michael@0: return *this; michael@0: } michael@0: michael@0: int32_t michael@0: BytesTrieBuilder::getElementStringLength(int32_t i) const { michael@0: return elements[i].getStringLength(*strings); michael@0: } michael@0: michael@0: UChar michael@0: BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const { michael@0: return (uint8_t)elements[i].charAt(byteIndex, *strings); michael@0: } michael@0: michael@0: int32_t michael@0: BytesTrieBuilder::getElementValue(int32_t i) const { michael@0: return elements[i].getValue(); michael@0: } michael@0: michael@0: int32_t michael@0: BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const { michael@0: const BytesTrieElement &firstElement=elements[first]; michael@0: const BytesTrieElement &lastElement=elements[last]; michael@0: int32_t minStringLength=firstElement.getStringLength(*strings); michael@0: while(++byteIndex0); michael@0: return i; michael@0: } michael@0: michael@0: int32_t michael@0: BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const { michael@0: char b=(char)byte; michael@0: while(b==elements[i].charAt(byteIndex, *strings)) { michael@0: ++i; michael@0: } michael@0: return i; michael@0: } michael@0: michael@0: BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode) michael@0: : LinearMatchNode(len, nextNode), s(bytes) { michael@0: hash=hash*37+ustr_hashCharsN(bytes, len); michael@0: } michael@0: michael@0: UBool michael@0: BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const { michael@0: if(this==&other) { michael@0: return TRUE; michael@0: } michael@0: if(!LinearMatchNode::operator==(other)) { michael@0: return FALSE; michael@0: } michael@0: const BTLinearMatchNode &o=(const BTLinearMatchNode &)other; michael@0: return 0==uprv_memcmp(s, o.s, length); michael@0: } michael@0: michael@0: void michael@0: BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) { michael@0: BytesTrieBuilder &b=(BytesTrieBuilder &)builder; michael@0: next->write(builder); michael@0: b.write(s, length); michael@0: offset=b.write(b.getMinLinearMatch()+length-1); michael@0: } michael@0: michael@0: StringTrieBuilder::Node * michael@0: BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length, michael@0: Node *nextNode) const { michael@0: return new BTLinearMatchNode( michael@0: elements[i].getString(*strings).data()+byteIndex, michael@0: length, michael@0: nextNode); michael@0: } michael@0: michael@0: UBool michael@0: BytesTrieBuilder::ensureCapacity(int32_t length) { michael@0: if(bytes==NULL) { michael@0: return FALSE; // previous memory allocation had failed michael@0: } michael@0: if(length>bytesCapacity) { michael@0: int32_t newCapacity=bytesCapacity; michael@0: do { michael@0: newCapacity*=2; michael@0: } while(newCapacity<=length); michael@0: char *newBytes=static_cast(uprv_malloc(newCapacity)); michael@0: if(newBytes==NULL) { michael@0: // unable to allocate memory michael@0: uprv_free(bytes); michael@0: bytes=NULL; michael@0: bytesCapacity=0; michael@0: return FALSE; michael@0: } michael@0: uprv_memcpy(newBytes+(newCapacity-bytesLength), michael@0: bytes+(bytesCapacity-bytesLength), bytesLength); michael@0: uprv_free(bytes); michael@0: bytes=newBytes; michael@0: bytesCapacity=newCapacity; michael@0: } michael@0: return TRUE; michael@0: } michael@0: michael@0: int32_t michael@0: BytesTrieBuilder::write(int32_t byte) { michael@0: int32_t newLength=bytesLength+1; michael@0: if(ensureCapacity(newLength)) { michael@0: bytesLength=newLength; michael@0: bytes[bytesCapacity-bytesLength]=(char)byte; michael@0: } michael@0: return bytesLength; michael@0: } michael@0: michael@0: int32_t michael@0: BytesTrieBuilder::write(const char *b, int32_t length) { michael@0: int32_t newLength=bytesLength+length; michael@0: if(ensureCapacity(newLength)) { michael@0: bytesLength=newLength; michael@0: uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length); michael@0: } michael@0: return bytesLength; michael@0: } michael@0: michael@0: int32_t michael@0: BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) { michael@0: return write(elements[i].getString(*strings).data()+byteIndex, length); michael@0: } michael@0: michael@0: int32_t michael@0: BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { michael@0: if(0<=i && i<=BytesTrie::kMaxOneByteValue) { michael@0: return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal); michael@0: } michael@0: char intBytes[5]; michael@0: int32_t length=1; michael@0: if(i<0 || i>0xffffff) { michael@0: intBytes[0]=(char)BytesTrie::kFiveByteValueLead; michael@0: intBytes[1]=(char)((uint32_t)i>>24); michael@0: intBytes[2]=(char)((uint32_t)i>>16); michael@0: intBytes[3]=(char)((uint32_t)i>>8); michael@0: intBytes[4]=(char)i; michael@0: length=5; michael@0: // } else if(i<=BytesTrie::kMaxOneByteValue) { michael@0: // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i); michael@0: } else { michael@0: if(i<=BytesTrie::kMaxTwoByteValue) { michael@0: intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8)); michael@0: } else { michael@0: if(i<=BytesTrie::kMaxThreeByteValue) { michael@0: intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16)); michael@0: } else { michael@0: intBytes[0]=(char)BytesTrie::kFourByteValueLead; michael@0: intBytes[1]=(char)(i>>16); michael@0: length=2; michael@0: } michael@0: intBytes[length++]=(char)(i>>8); michael@0: } michael@0: intBytes[length++]=(char)i; michael@0: } michael@0: intBytes[0]=(char)((intBytes[0]<<1)|isFinal); michael@0: return write(intBytes, length); michael@0: } michael@0: michael@0: int32_t michael@0: BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { michael@0: int32_t offset=write(node); michael@0: if(hasValue) { michael@0: offset=writeValueAndFinal(value, FALSE); michael@0: } michael@0: return offset; michael@0: } michael@0: michael@0: int32_t michael@0: BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) { michael@0: int32_t i=bytesLength-jumpTarget; michael@0: U_ASSERT(i>=0); michael@0: if(i<=BytesTrie::kMaxOneByteDelta) { michael@0: return write(i); michael@0: } michael@0: char intBytes[5]; michael@0: int32_t length; michael@0: if(i<=BytesTrie::kMaxTwoByteDelta) { michael@0: intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8)); michael@0: length=1; michael@0: } else { michael@0: if(i<=BytesTrie::kMaxThreeByteDelta) { michael@0: intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16)); michael@0: length=2; michael@0: } else { michael@0: if(i<=0xffffff) { michael@0: intBytes[0]=(char)BytesTrie::kFourByteDeltaLead; michael@0: length=3; michael@0: } else { michael@0: intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead; michael@0: intBytes[1]=(char)(i>>24); michael@0: length=4; michael@0: } michael@0: intBytes[1]=(char)(i>>16); michael@0: } michael@0: intBytes[1]=(char)(i>>8); michael@0: } michael@0: intBytes[length++]=(char)i; michael@0: return write(intBytes, length); michael@0: } michael@0: michael@0: U_NAMESPACE_END