|
/*
*******************************************************************************
*
*   Copyright (C) 1999-2013 International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  rbbidata.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   RBBI data formats  Includes
*
*                          Structs that describe the format of the Binary RBBI data,
*                          as it is stored in ICU's data file.
*
*      RBBIDataWrapper  -  Instances of this class sit between the
*                          raw data structs and the RulesBasedBreakIterator objects
*                          that are created by applications.  The wrapper class
*                          provides reference counting for the underlying data,
*                          and direct pointers to data that would not otherwise
*                          be accessible without ugly pointer arithmetic.  The
*                          wrapper does not attempt to provide any higher level
*                          abstractions for the data itself.
*
*                          There will be only one instance of RBBIDataWrapper for any
*                          set of RBBI run time data being shared by instances
*                          (clones) of RulesBasedBreakIterator.
*/
|
31 |
|
32 #ifndef __RBBIDATA_H__ |
|
33 #define __RBBIDATA_H__ |
|
34 |
|
35 #include "unicode/utypes.h" |
|
36 #include "unicode/udata.h" |
|
37 #include "udataswp.h" |
|
38 |
|
39 /** |
|
40 * Swap RBBI data. See udataswp.h. |
|
41 * @internal |
|
42 */ |
|
43 U_CAPI int32_t U_EXPORT2 |
|
44 ubrk_swap(const UDataSwapper *ds, |
|
45 const void *inData, int32_t length, void *outData, |
|
46 UErrorCode *pErrorCode); |
|
47 |
|
48 #ifdef __cplusplus |
|
49 |
|
50 #include "unicode/uobject.h" |
|
51 #include "unicode/unistr.h" |
|
52 #include "umutex.h" |
|
53 #include "utrie.h" |
|
54 |
|
55 U_NAMESPACE_BEGIN |
|
56 |
|
/*
 *   The following structs map exactly onto the raw data from the ICU common data file.
 *   Because of that, field order, field widths and array sizes must not be changed
 *   casually: doing so changes how existing binary data is interpreted.
 */
struct RBBIDataHeader {
    uint32_t         fMagic;           /*  == 0xb1a0  (hex digit one, not the letter 'l').   */
                                       /*     NOTE(review): confirm against the check in     */
                                       /*     the code that validates this header.           */
    uint8_t          fFormatVersion[4]; /* Data Format.  Same as the value in struct UDataInfo */
                                       /*   if there is one associated with this data.       */
                                       /*   (version originates in rbbi, is copied to UDataInfo) */
                                       /*   For ICU 3.2 and earlier, this field was          */
                                       /*       uint32_t  fVersion                           */
                                       /*   with a value of 1.                               */
    uint32_t         fLength;          /*  Total length in bytes of this RBBI Data,          */
                                       /*      including all sections, not just the header.  */
    uint32_t         fCatCount;        /*  Number of character categories.                   */

    /*                                                                        */
    /*  Offsets and sizes of each of the subsections within the RBBI data.    */
    /*  All offsets are bytes from the start of the RBBIDataHeader.           */
    /*  All sizes are in bytes.                                               */
    /*                                                                        */
    uint32_t         fFTable;          /*  Offset to the forward state transition table.     */
    uint32_t         fFTableLen;
    uint32_t         fRTable;          /*  Offset to the reverse state transition table.     */
    uint32_t         fRTableLen;
    uint32_t         fSFTable;         /*  Offset to the safe point forward transition table */
    uint32_t         fSFTableLen;
    uint32_t         fSRTable;         /*  Offset to the safe point reverse transition table */
    uint32_t         fSRTableLen;
    uint32_t         fTrie;            /*  Offset to Trie data for character categories      */
    uint32_t         fTrieLen;
    uint32_t         fRuleSource;      /*  Offset to the source for the break                */
    uint32_t         fRuleSourceLen;   /*    rules.  Stored UChar *.                         */
    uint32_t         fStatusTable;     /*  Offset to the table of rule status values         */
    uint32_t         fStatusTableLen;

    uint32_t         fReserved[6];     /*  Reserved for expansion                            */

};
|
95 |
|
96 |
|
97 |
|
/*
 *   One row of a state transition table.
 *   The declared size of fNextState is a placeholder only; see the note on that field.
 */
struct  RBBIStateTableRow {
    int16_t          fAccepting;    /*  Non-zero if this row is for an accepting state.   */
                                    /*  Value 0: not an accepting state.                  */
                                    /*       -1: Unconditional Accepting state.           */
                                    /*    positive:  Look-ahead match has completed.      */
                                    /*           Actual boundary position happened earlier */
                                    /*           Value here == fLookAhead in earlier      */
                                    /*              state, at actual boundary pos.        */
    int16_t          fLookAhead;    /*  Non-zero if this row is for a state that          */
                                    /*    corresponds to a '/' in the rule source.        */
                                    /*    Value is the same as the fAccepting             */
                                    /*      value for the rule (which will appear         */
                                    /*      in a different state).                        */
    int16_t          fTagIdx;       /*  Non-zero if this row covers a {tagged} position   */
                                    /*     from a rule.  Value is the index in the        */
                                    /*     StatusTable of the set of matching             */
                                    /*     tags (rule status values)                      */
    int16_t          fReserved;
    uint16_t         fNextState[2]; /*  Next State, indexed by char category.             */
                                    /*  This array does not really have two elements;     */
                                    /*    the array size is actually                      */
                                    /*    fData->fHeader->fCatCount.                      */
                                    /*  CAUTION:  see RBBITableBuilder::getTableSize()    */
                                    /*            before changing anything here.          */
};
|
122 |
|
123 |
|
/*
 *   Fixed-size header of a state table, followed directly by the row data.
 */
struct RBBIStateTable {
    uint32_t         fNumStates;    /*  Number of states.                                 */
    uint32_t         fRowLen;       /*  Length of a state table row, in bytes.            */
    uint32_t         fFlags;        /*  Option Flags for this state table                 */
    uint32_t         fReserved;     /*  reserved                                          */
    char             fTableData[4]; /*  First RBBIStateTableRow begins here.              */
                                    /*    (Declaring it as a char array simplifies the    */
                                    /*    otherwise ugly address arithmetic needed for    */
                                    /*    indexing variable length rows.)                 */
};
|
133 |
|
/*
 *   Option flag values for a state table.
 *   NOTE(review): presumably these are the bit values stored in
 *   RBBIStateTable::fFlags - confirm against the code that reads that field.
 */
typedef enum {
    RBBI_LOOKAHEAD_HARD_BREAK = 1,
    RBBI_BOF_REQUIRED = 2
} RBBIStateTableFlags;
|
138 |
|
139 |
|
140 /* */ |
|
141 /* The reference counting wrapper class */ |
|
142 /* */ |
|
143 class RBBIDataWrapper : public UMemory { |
|
144 public: |
|
145 enum EDontAdopt { |
|
146 kDontAdopt |
|
147 }; |
|
148 RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status); |
|
149 RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status); |
|
150 RBBIDataWrapper(UDataMemory* udm, UErrorCode &status); |
|
151 ~RBBIDataWrapper(); |
|
152 |
|
153 void init(const RBBIDataHeader *data, UErrorCode &status); |
|
154 RBBIDataWrapper *addReference(); |
|
155 void removeReference(); |
|
156 UBool operator ==(const RBBIDataWrapper &other) const; |
|
157 int32_t hashCode(); |
|
158 const UnicodeString &getRuleSourceString() const; |
|
159 #ifdef RBBI_DEBUG |
|
160 void printData(); |
|
161 void printTable(const char *heading, const RBBIStateTable *table); |
|
162 #else |
|
163 #define printData() |
|
164 #define printTable(heading, table) |
|
165 #endif |
|
166 |
|
167 /* */ |
|
168 /* Pointers to items within the data */ |
|
169 /* */ |
|
170 const RBBIDataHeader *fHeader; |
|
171 const RBBIStateTable *fForwardTable; |
|
172 const RBBIStateTable *fReverseTable; |
|
173 const RBBIStateTable *fSafeFwdTable; |
|
174 const RBBIStateTable *fSafeRevTable; |
|
175 const UChar *fRuleSource; |
|
176 const int32_t *fRuleStatusTable; |
|
177 |
|
178 /* number of int32_t values in the rule status table. Used to sanity check indexing */ |
|
179 int32_t fStatusMaxIdx; |
|
180 |
|
181 UTrie fTrie; |
|
182 |
|
183 private: |
|
184 u_atomic_int32_t fRefCount; |
|
185 UDataMemory *fUDataMem; |
|
186 UnicodeString fRuleString; |
|
187 UBool fDontFreeData; |
|
188 |
|
189 RBBIDataWrapper(const RBBIDataWrapper &other); /* forbid copying of this class */ |
|
190 RBBIDataWrapper &operator=(const RBBIDataWrapper &other); /* forbid copying of this class */ |
|
191 }; |
|
192 |
|
193 |
|
194 |
|
195 U_NAMESPACE_END |
|
196 |
|
197 #endif /* C++ */ |
|
198 |
|
199 #endif |