1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/rbbirb.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,211 @@ 1.4 +// 1.5 +// rbbirb.h 1.6 +// 1.7 +// Copyright (C) 2002-2008, International Business Machines Corporation and others. 1.8 +// All Rights Reserved. 1.9 +// 1.10 +// This file contains declarations for several classes from the 1.11 +// Rule Based Break Iterator rule builder. 1.12 +// 1.13 + 1.14 + 1.15 +#ifndef RBBIRB_H 1.16 +#define RBBIRB_H 1.17 + 1.18 +#include "unicode/utypes.h" 1.19 +#include "unicode/uobject.h" 1.20 +#include "unicode/rbbi.h" 1.21 +#include "unicode/uniset.h" 1.22 +#include "unicode/parseerr.h" 1.23 +#include "uhash.h" 1.24 +#include "uvector.h" 1.25 +#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that 1.26 + // looks up references to $variables within a set. 1.27 + 1.28 + 1.29 + 1.30 +U_NAMESPACE_BEGIN 1.31 + 1.32 +class RBBIRuleScanner; 1.33 +struct RBBIRuleTableEl; 1.34 +class RBBISetBuilder; 1.35 +class RBBINode; 1.36 +class RBBITableBuilder; 1.37 + 1.38 + 1.39 + 1.40 +//-------------------------------------------------------------------------------- 1.41 +// 1.42 +// RBBISymbolTable. Implements SymbolTable interface that is used by the 1.43 +// UnicodeSet parser to resolve references to $variables. 1.44 +// 1.45 +//-------------------------------------------------------------------------------- 1.46 +class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one 1.47 +public: // of these structs for each entry. 1.48 + RBBISymbolTableEntry(); 1.49 + UnicodeString key; 1.50 + RBBINode *val; 1.51 + ~RBBISymbolTableEntry(); 1.52 + 1.53 +private: 1.54 + RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class 1.55 + RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class 1.56 +}; 1.57 + 1.58 + 1.59 +class RBBISymbolTable : public UMemory, public SymbolTable { 1.60 +private: 1.61 + const UnicodeString &fRules; 1.62 + UHashtable *fHashTable; 1.63 + RBBIRuleScanner *fRuleScanner; 1.64 + 1.65 + // These next two fields are part of the mechanism for passing references to 1.66 + // already-constructed UnicodeSets back to the UnicodeSet constructor 1.67 + // when the pattern includes $variable references. 1.68 + const UnicodeString ffffString; // = "/uffff" 1.69 + UnicodeSet *fCachedSetLookup; 1.70 + 1.71 +public: 1.72 + // API inherited from class SymbolTable 1.73 + virtual const UnicodeString* lookup(const UnicodeString& s) const; 1.74 + virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const; 1.75 + virtual UnicodeString parseReference(const UnicodeString& text, 1.76 + ParsePosition& pos, int32_t limit) const; 1.77 + 1.78 + // Additional Functions 1.79 + RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status); 1.80 + virtual ~RBBISymbolTable(); 1.81 + 1.82 + virtual RBBINode *lookupNode(const UnicodeString &key) const; 1.83 + virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err); 1.84 + 1.85 +#ifdef RBBI_DEBUG 1.86 + virtual void rbbiSymtablePrint() const; 1.87 +#else 1.88 + // A do-nothing inline function for non-debug builds. Member funcs can't be empty 1.89 + // or the call sites won't compile. 1.90 + int32_t fFakeField; 1.91 + #define rbbiSymtablePrint() fFakeField=0; 1.92 +#endif 1.93 + 1.94 +private: 1.95 + RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class 1.96 + RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class 1.97 +}; 1.98 + 1.99 + 1.100 +//-------------------------------------------------------------------------------- 1.101 +// 1.102 +// class RBBIRuleBuilder The top-level class handling RBBI rule compiling. 1.103 +// 1.104 +//-------------------------------------------------------------------------------- 1.105 +class RBBIRuleBuilder : public UMemory { 1.106 +public: 1.107 + 1.108 + // Create a rule based break iterator from a set of rules. 1.109 + // This function is the main entry point into the rule builder. The 1.110 + // public ICU API for creating RBBIs uses this function to do the actual work. 1.111 + // 1.112 + static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules, 1.113 + UParseError *parseError, 1.114 + UErrorCode &status); 1.115 + 1.116 +public: 1.117 + // The "public" functions and data members that appear below are accessed 1.118 + // (and shared) by the various parts that make up the rule builder. They 1.119 + // are NOT intended to be accessed by anything outside of the 1.120 + // rule builder implementation. 1.121 + RBBIRuleBuilder(const UnicodeString &rules, 1.122 + UParseError *parseErr, 1.123 + UErrorCode &status 1.124 + ); 1.125 + 1.126 + virtual ~RBBIRuleBuilder(); 1.127 + char *fDebugEnv; // controls debug trace output 1.128 + UErrorCode *fStatus; // Error reporting. Keeping status 1.129 + UParseError *fParseError; // here avoids passing it everywhere. 1.130 + const UnicodeString &fRules; // The rule string that we are compiling 1.131 + 1.132 + RBBIRuleScanner *fScanner; // The scanner. 1.133 + RBBINode *fForwardTree; // The parse trees, generated by the scanner, 1.134 + RBBINode *fReverseTree; // then manipulated by subsequent steps. 1.135 + RBBINode *fSafeFwdTree; 1.136 + RBBINode *fSafeRevTree; 1.137 + 1.138 + RBBINode **fDefaultTree; // For rules not qualified with a ! 1.139 + // the tree to which they belong to. 1.140 + 1.141 + UBool fChainRules; // True for chained Unicode TR style rules. 1.142 + // False for traditional regexp rules. 1.143 + 1.144 + UBool fLBCMNoChain; // True: suppress chaining of rules on 1.145 + // chars with LineBreak property == CM. 1.146 + 1.147 + UBool fLookAheadHardBreak; // True: Look ahead matches cause an 1.148 + // immediate break, no continuing for the 1.149 + // longest match. 1.150 + 1.151 + RBBISetBuilder *fSetBuilder; // Set and Character Category builder. 1.152 + UVector *fUSetNodes; // Vector of all uset nodes. 1.153 + 1.154 + RBBITableBuilder *fForwardTables; // State transition tables 1.155 + RBBITableBuilder *fReverseTables; 1.156 + RBBITableBuilder *fSafeFwdTables; 1.157 + RBBITableBuilder *fSafeRevTables; 1.158 + 1.159 + UVector *fRuleStatusVals; // The values that can be returned 1.160 + // from getRuleStatus(). 1.161 + 1.162 + RBBIDataHeader *flattenData(); // Create the flattened (runtime format) 1.163 + // data tables.. 1.164 +private: 1.165 + RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class 1.166 + RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class 1.167 +}; 1.168 + 1.169 + 1.170 + 1.171 + 1.172 +//---------------------------------------------------------------------------- 1.173 +// 1.174 +// RBBISetTableEl is an entry in the hash table of UnicodeSets that have 1.175 +// been encountered. The val Node will be of nodetype uset 1.176 +// and contain pointers to the actual UnicodeSets. 1.177 +// The Key is the source string for initializing the set. 1.178 +// 1.179 +// The hash table is used to avoid creating duplicate 1.180 +// unnamed (not $var references) UnicodeSets. 1.181 +// 1.182 +// Memory Management: 1.183 +// The Hash Table owns these RBBISetTableEl structs and 1.184 +// the key strings. It does NOT own the val nodes. 1.185 +// 1.186 +//---------------------------------------------------------------------------- 1.187 +struct RBBISetTableEl { 1.188 + UnicodeString *key; 1.189 + RBBINode *val; 1.190 +}; 1.191 + 1.192 + 1.193 +//---------------------------------------------------------------------------- 1.194 +// 1.195 +// RBBIDebugPrintf Printf equivalent, for debugging output. 1.196 +// Conditional compilation of the implementation lets us 1.197 +// get rid of the stdio dependency in environments where it 1.198 +// is unavailable. 1.199 +// 1.200 +//---------------------------------------------------------------------------- 1.201 +#ifdef RBBI_DEBUG 1.202 +#include <stdio.h> 1.203 +#define RBBIDebugPrintf printf 1.204 +#define RBBIDebugPuts puts 1.205 +#else 1.206 +#undef RBBIDebugPrintf 1.207 +#define RBBIDebugPuts(arg) 1.208 +#endif 1.209 + 1.210 +U_NAMESPACE_END 1.211 +#endif 1.212 + 1.213 + 1.214 +