michael@0: // michael@0: // rbbisetb.h michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (c) 2001-2005, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: */ michael@0: michael@0: #ifndef RBBISETB_H michael@0: #define RBBISETB_H michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/uobject.h" michael@0: #include "rbbirb.h" michael@0: #include "uvector.h" michael@0: michael@0: struct UNewTrie; michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: // michael@0: // RBBISetBuilder Derives the character categories used by the runtime RBBI engine michael@0: // from the Unicode Sets appearing in the source RBBI rules, and michael@0: // creates the TRIE table used to map from Unicode to the michael@0: // character categories. michael@0: // michael@0: michael@0: michael@0: // michael@0: // RangeDescriptor michael@0: // michael@0: // Each of the non-overlapping character ranges gets one of these descriptors. michael@0: // All of them are strung together in a linked list, which is kept in order michael@0: // (by character) michael@0: // michael@0: class RangeDescriptor : public UMemory { michael@0: public: michael@0: UChar32 fStartChar; // Start of range, unicode 32 bit value. michael@0: UChar32 fEndChar; // End of range, unicode 32 bit value. michael@0: int32_t fNum; // runtime-mapped input value for this range. michael@0: UVector *fIncludesSets; // vector of the the original michael@0: // Unicode sets that include this range. michael@0: // (Contains ptrs to uset nodes) michael@0: RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. michael@0: michael@0: RangeDescriptor(UErrorCode &status); michael@0: RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); michael@0: ~RangeDescriptor(); michael@0: void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with michael@0: // where appearing in the second (higher) part. michael@0: void setDictionaryFlag(); // Check whether this range appears as part of michael@0: // the Unicode set named "dictionary" michael@0: michael@0: private: michael@0: RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class michael@0: RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class michael@0: }; michael@0: michael@0: michael@0: // michael@0: // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. michael@0: // michael@0: // Starting with the rules parse tree from the scanner, michael@0: // michael@0: // - Enumerate the set of UnicodeSets that are referenced michael@0: // by the RBBI rules. michael@0: // - compute a derived set of non-overlapping UnicodeSets michael@0: // that will correspond to columns in the state table for michael@0: // the RBBI execution engine. michael@0: // - construct the trie table that maps input characters michael@0: // to set numbers in the non-overlapping set of sets. michael@0: // michael@0: michael@0: michael@0: class RBBISetBuilder : public UMemory { michael@0: public: michael@0: RBBISetBuilder(RBBIRuleBuilder *rb); michael@0: ~RBBISetBuilder(); michael@0: michael@0: void build(); michael@0: void addValToSets(UVector *sets, uint32_t val); michael@0: void addValToSet (RBBINode *usetNode, uint32_t val); michael@0: int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the michael@0: // runtime state machine, which are the same as michael@0: // columns in the DFA state table michael@0: int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. michael@0: void serializeTrie(uint8_t *where); // write out the serialized Trie. michael@0: UChar32 getFirstChar(int32_t val) const; michael@0: UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo michael@0: // character were encountered. michael@0: #ifdef RBBI_DEBUG michael@0: void printSets(); michael@0: void printRanges(); michael@0: void printRangeGroups(); michael@0: #else michael@0: #define printSets() michael@0: #define printRanges() michael@0: #define printRangeGroups() michael@0: #endif michael@0: michael@0: private: michael@0: void numberSets(); michael@0: michael@0: RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. michael@0: UErrorCode *fStatus; michael@0: michael@0: RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors michael@0: michael@0: UNewTrie *fTrie; // The mapping TRIE that is the end result of processing michael@0: uint32_t fTrieSize; // the Unicode Sets. michael@0: michael@0: // Groups correspond to character categories - michael@0: // groups of ranges that are in the same original UnicodeSets. michael@0: // fGroupCount is the index of the last used group. michael@0: // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. michael@0: // State table column 0 is not used. Column 1 is for end-of-input. michael@0: // column 2 is for group 0. Funny counting. michael@0: int32_t fGroupCount; michael@0: michael@0: UBool fSawBOF; michael@0: michael@0: RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class michael@0: RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class michael@0: }; michael@0: michael@0: michael@0: michael@0: U_NAMESPACE_END michael@0: #endif