1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/rbbisetb.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,130 @@ 1.4 +// 1.5 +// rbbisetb.h 1.6 +/* 1.7 +********************************************************************** 1.8 +* Copyright (c) 2001-2005, International Business Machines 1.9 +* Corporation and others. All Rights Reserved. 1.10 +********************************************************************** 1.11 +*/ 1.12 + 1.13 +#ifndef RBBISETB_H 1.14 +#define RBBISETB_H 1.15 + 1.16 +#include "unicode/utypes.h" 1.17 +#include "unicode/uobject.h" 1.18 +#include "rbbirb.h" 1.19 +#include "uvector.h" 1.20 + 1.21 +struct UNewTrie; // Forward declaration; this header only holds a pointer to it (RBBISetBuilder::fTrie). 1.22 + 1.23 +U_NAMESPACE_BEGIN 1.24 + 1.25 +// 1.26 +// RBBISetBuilder Derives the character categories used by the runtime RBBI engine 1.27 +// from the Unicode Sets appearing in the source RBBI rules, and 1.28 +// creates the TRIE table used to map from Unicode to the 1.29 +// character categories. 1.30 +// 1.31 + 1.32 + 1.33 +// 1.34 +// RangeDescriptor 1.35 +// 1.36 +// Each of the non-overlapping character ranges gets one of these descriptors. 1.37 +// All of them are strung together in a linked list, which is kept in order 1.38 +// (by character). 1.39 +// 1.40 +class RangeDescriptor : public UMemory { 1.41 +public: 1.42 + UChar32 fStartChar; // Start of range, Unicode 32 bit value. 1.43 + UChar32 fEndChar; // End of range, Unicode 32 bit value. 1.44 + int32_t fNum; // Runtime-mapped input value for this range. 1.45 + UVector *fIncludesSets; // Vector of the original 1.46 + // Unicode sets that include this range. 1.47 + // (Contains pointers to uset nodes.) 1.48 + RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. 
1.49 + 1.50 + RangeDescriptor(UErrorCode &status); 1.51 + RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); 1.52 + ~RangeDescriptor(); 1.53 + void split(UChar32 where, UErrorCode &status); // Split this range in two at "where", with 1.54 + // "where" appearing in the second (higher) part. 1.55 + void setDictionaryFlag(); // Check whether this range appears as part of 1.56 + // the Unicode set named "dictionary". 1.57 + 1.58 +private: 1.59 + RangeDescriptor(const RangeDescriptor &other); // Private: forbids copying of this class. 1.60 + RangeDescriptor &operator=(const RangeDescriptor &other); // Private: forbids copying of this class. 1.61 +}; 1.62 + 1.63 + 1.64 +// 1.65 +// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. 1.66 +// 1.67 +// Starting with the rules parse tree from the scanner, 1.68 +// 1.69 +// - Enumerate the set of UnicodeSets that are referenced 1.70 +// by the RBBI rules. 1.71 +// - Compute a derived set of non-overlapping UnicodeSets 1.72 +// that will correspond to columns in the state table for 1.73 +// the RBBI execution engine. 1.74 +// - Construct the trie table that maps input characters 1.75 +// to set numbers in the non-overlapping set of sets. 1.76 +// 1.77 + 1.78 + 1.79 +class RBBISetBuilder : public UMemory { 1.80 +public: 1.81 + RBBISetBuilder(RBBIRuleBuilder *rb); 1.82 + ~RBBISetBuilder(); 1.83 + 1.84 + void build(); // NOTE(review): presumably runs the steps listed in the class comment above - confirm in the implementation. 1.85 + void addValToSets(UVector *sets, uint32_t val); 1.86 + void addValToSet (RBBINode *usetNode, uint32_t val); 1.87 + int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the 1.88 + // runtime state machine, which are the same as 1.89 + // columns in the DFA state table. 1.90 + int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. 1.91 + void serializeTrie(uint8_t *where); // Write out the serialized Trie. 
1.92 + UChar32 getFirstChar(int32_t val) const; 1.93 + UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo 1.94 + // character were encountered. 1.95 +#ifdef RBBI_DEBUG // Debug-only printing methods. 1.96 + void printSets(); 1.97 + void printRanges(); 1.98 + void printRangeGroups(); 1.99 +#else // Without RBBI_DEBUG these names become empty function-like macros, so calls to them expand to nothing. NOTE: being macros, they are not scoped to this class. 1.100 + #define printSets() 1.101 + #define printRanges() 1.102 + #define printRangeGroups() 1.103 +#endif // RBBI_DEBUG 1.104 + 1.105 +private: 1.106 + void numberSets(); 1.107 + 1.108 + RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. 1.109 + UErrorCode *fStatus; // NOTE(review): presumably the error status shared with the rule builder - confirm in the implementation. 1.110 + 1.111 + RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors. 1.112 + 1.113 + UNewTrie *fTrie; // The mapping TRIE that is the end result of processing 1.114 + uint32_t fTrieSize; // the Unicode Sets. 1.115 + 1.116 + // Groups correspond to character categories - 1.117 + // groups of ranges that are in the same original UnicodeSets. 1.118 + // fGroupCount is the index of the last used group. 1.119 + // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. 1.120 + // State table column 0 is not used. Column 1 is for end-of-input. 1.121 + // Column 2 is for group 0. Funny counting. 1.122 + int32_t fGroupCount; 1.123 + 1.124 + UBool fSawBOF; // NOTE(review): presumably the flag reported by sawBOF() - confirm in the implementation. 1.125 + 1.126 + RBBISetBuilder(const RBBISetBuilder &other); // Private: forbids copying of this class. 1.127 + RBBISetBuilder &operator=(const RBBISetBuilder &other); // Private: forbids copying of this class. 1.128 +}; 1.129 + 1.130 + 1.131 + 1.132 +U_NAMESPACE_END 1.133 +#endif // RBBISETB_H