intl/icu/source/common/rbbisetb.h

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 //
michael@0 2 // rbbisetb.h
michael@0 3 /*
michael@0 4 **********************************************************************
michael@0 5 * Copyright (c) 2001-2005, International Business Machines
michael@0 6 * Corporation and others. All Rights Reserved.
michael@0 7 **********************************************************************
michael@0 8 */
michael@0 9
michael@0 10 #ifndef RBBISETB_H
michael@0 11 #define RBBISETB_H
michael@0 12
michael@0 13 #include "unicode/utypes.h"
michael@0 14 #include "unicode/uobject.h"
michael@0 15 #include "rbbirb.h"
michael@0 16 #include "uvector.h"
michael@0 17
michael@0 18 struct UNewTrie;
michael@0 19
michael@0 20 U_NAMESPACE_BEGIN
michael@0 21
michael@0 22 //
michael@0 23 // RBBISetBuilder Derives the character categories used by the runtime RBBI engine
michael@0 24 // from the Unicode Sets appearing in the source RBBI rules, and
michael@0 25 // creates the TRIE table used to map from Unicode to the
michael@0 26 // character categories.
michael@0 27 //
michael@0 28
michael@0 29
michael@0 30 //
michael@0 31 // RangeDescriptor
michael@0 32 //
michael@0 33 // Each of the non-overlapping character ranges gets one of these descriptors.
michael@0 34 // All of them are strung together in a linked list, which is kept in order
michael@0 35 // (by character)
michael@0 36 //
michael@0 37 class RangeDescriptor : public UMemory { // One node in the ordered linked list of non-overlapping character ranges.
michael@0 38 public:
michael@0 39 UChar32 fStartChar; // Start of range, unicode 32 bit value.
michael@0 40 UChar32 fEndChar; // End of range, unicode 32 bit value.
michael@0 41 int32_t fNum; // runtime-mapped input value for this range.
michael@0 42 UVector *fIncludesSets; // vector of the original
michael@0 43 // Unicode sets that include this range.
michael@0 44 // (Contains ptrs to uset nodes)
michael@0 45 RangeDescriptor *fNext; // Next RangeDescriptor in the linked list.
michael@0 46
michael@0 47 RangeDescriptor(UErrorCode &status); // status: in/out error code; presumably set on failure - confirm in rbbisetb.cpp.
michael@0 48 RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); // Public copy-style constructor that also takes an error code.
michael@0 49 ~RangeDescriptor();
michael@0 50 void split(UChar32 where, UErrorCode &status); // Split this range in two at "where", with
michael@0 51 // where appearing in the second (higher) part.
michael@0 52 void setDictionaryFlag(); // Check whether this range appears as part of
michael@0 53 // the Unicode set named "dictionary"
michael@0 54
michael@0 55 private:
michael@0 56 RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class (plain copy constructor is private)
michael@0 57 RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class (assignment is private)
michael@0 58 };
michael@0 59
michael@0 60
michael@0 61 //
michael@0 62 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules.
michael@0 63 //
michael@0 64 // Starting with the rules parse tree from the scanner,
michael@0 65 //
michael@0 66 // - Enumerate the set of UnicodeSets that are referenced
michael@0 67 // by the RBBI rules.
michael@0 68 // - compute a derived set of non-overlapping UnicodeSets
michael@0 69 // that will correspond to columns in the state table for
michael@0 70 // the RBBI execution engine.
michael@0 71 // - construct the trie table that maps input characters
michael@0 72 // to set numbers in the non-overlapping set of sets.
michael@0 73 //
michael@0 74
michael@0 75
michael@0 76 class RBBISetBuilder : public UMemory { // Processes the Unicode Sets referenced by RBBI rules into character categories and a trie.
michael@0 77 public:
michael@0 78 RBBISetBuilder(RBBIRuleBuilder *rb); // rb: the RBBI Rule Compiler that owns this builder (stored in fRB).
michael@0 79 ~RBBISetBuilder();
michael@0 80
michael@0 81 void build(); // NOTE(review): presumably runs the set-processing steps described above this class - confirm in rbbisetb.cpp.
michael@0 82 void addValToSets(UVector *sets, uint32_t val); // NOTE(review): presumably applies addValToSet to each uset node in sets - confirm.
michael@0 83 void addValToSet (RBBINode *usetNode, uint32_t val);
michael@0 84 int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
michael@0 85 // runtime state machine, which are the same as
michael@0 86 // columns in the DFA state table
michael@0 87 int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
michael@0 88 void serializeTrie(uint8_t *where); // write out the serialized Trie.
michael@0 89 UChar32 getFirstChar(int32_t val) const; // NOTE(review): presumably the first character mapped to category val - confirm.
michael@0 90 UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
michael@0 91 // character were encountered.
michael@0 92 #ifdef RBBI_DEBUG
michael@0 93 void printSets();
michael@0 94 void printRanges();
michael@0 95 void printRangeGroups();
michael@0 96 #else // Without RBBI_DEBUG the three print calls are defined as empty macros, so call sites expand to nothing.
michael@0 97 #define printSets()
michael@0 98 #define printRanges()
michael@0 99 #define printRangeGroups()
michael@0 100 #endif
michael@0 101
michael@0 102 private:
michael@0 103 void numberSets();
michael@0 104
michael@0 105 RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
michael@0 106 UErrorCode *fStatus; // NOTE(review): presumably points at the owning rule builder's error status - confirm.
michael@0 107
michael@0 108 RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
michael@0 109
michael@0 110 UNewTrie *fTrie; // The mapping TRIE that is the end result of processing
michael@0 111 uint32_t fTrieSize; // the Unicode Sets.
michael@0 112
michael@0 113 // Groups correspond to character categories -
michael@0 114 // groups of ranges that are in the same original UnicodeSets.
michael@0 115 // fGroupCount is the index of the last used group.
michael@0 116 // fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
michael@0 117 // State table column 0 is not used. Column 1 is for end-of-input.
michael@0 118 // column 2 is for group 0. Funny counting.
michael@0 119 int32_t fGroupCount;
michael@0 120
michael@0 121 UBool fSawBOF; // NOTE(review): presumably the flag reported by sawBOF() - confirm.
michael@0 122
michael@0 123 RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
michael@0 124 RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class
michael@0 125 };
michael@0 126
michael@0 127
michael@0 128
michael@0 129 U_NAMESPACE_END
michael@0 130 #endif

mercurial