// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// State Table follower for scanning UTF-8 strings without converting to
// 32- or 16-bit Unicode values.
//
// Author: dsites@google.com (Dick Sites)
//
|
21 |
|
22 #ifndef UTIL_UTF8_UTF8STATETABLE_H_ |
|
23 #define UTIL_UTF8_UTF8STATETABLE_H_ |
|
24 |
|
25 #include <string> |
|
26 #include "integral_types.h" // for uint8, uint32, uint16 |
|
27 #include "stringpiece.h" |
|
28 |
|
29 |
|
30 namespace CLD2 { |
|
31 |
|
32 class OffsetMap; |
|
33 |
|
34 |
|
35 // These four-byte entries compactly encode how many bytes 0..255 to delete |
|
36 // in making a string replacement, how many bytes to add 0..255, and the offset |
|
37 // 0..64k-1 of the replacement string in remap_string. |
|
38 struct RemapEntry { |
|
39 uint8 delete_bytes; |
|
40 uint8 add_bytes; |
|
41 uint16 bytes_offset; |
|
42 }; |
|
43 |
|
// Exit type codes for state tables with one-byte entries.
// All but the first get stuffed into one-byte table entries; the first
// (kExitDstSpaceFull) is only generated by executable code.
// To distinguish them from next-state entries, the table-resident codes
// must be contiguous and all <= kExitNone. The values are written out
// explicitly so that the contiguity is visible at a glance.
typedef enum {
  kExitDstSpaceFull = 239,
  kExitIllegalStructure = 240,
  kExitOK = 241,
  kExitReject = 242,
  kExitReplace1 = 243,
  kExitReplace2 = 244,
  kExitReplace3 = 245,
  kExitReplace21 = 246,
  kExitReplace31 = 247,
  kExitReplace32 = 248,
  kExitReplaceOffset1 = 249,
  kExitReplaceOffset2 = 250,
  kExitReplace1S0 = 251,
  kExitSpecial = 252,
  kExitDoAgain = 253,
  kExitRejectAlt = 254,
  kExitNone = 255
} ExitReason;
|
67 |
|
// Exit type codes for state tables with two-byte entries.
// Same sequence of codes as ExitReason, with the first code at 0x7fff and
// the remaining codes contiguous from 0x8000 through kExitNone_2 (0x800f).
typedef enum {
  kExitDstSpaceFull_2 = 0x7fff,      // 32767
  kExitIllegalStructure_2 = 0x8000,  // 32768
  kExitOK_2 = 0x8001,                // 32769
  kExitReject_2 = 0x8002,
  kExitReplace1_2 = 0x8003,
  kExitReplace2_2 = 0x8004,
  kExitReplace3_2 = 0x8005,
  kExitReplace21_2 = 0x8006,
  kExitReplace31_2 = 0x8007,
  kExitReplace32_2 = 0x8008,
  kExitReplaceOffset1_2 = 0x8009,
  kExitReplaceOffset2_2 = 0x800a,
  kExitReplace1S0_2 = 0x800b,
  kExitSpecial_2 = 0x800c,
  kExitDoAgain_2 = 0x800d,
  kExitRejectAlt_2 = 0x800e,
  kExitNone_2 = 0x800f               // 32783
} ExitReason_2;
|
87 |
|
88 |
|
89 // This struct represents one entire state table. The three initialized byte |
|
90 // areas are state_table, remap_base, and remap_string. state0 and state0_size |
|
91 // give the byte offset and length within state_table of the initial state -- |
|
92 // table lookups are expected to start and end in this state, but for |
|
93 // truncated UTF-8 strings, may end in a different state. These allow a quick |
|
94 // test for that condition. entry_shift is 8 for tables subscripted by a full |
|
95 // byte value and 6 for space-optimized tables subscripted by only six |
|
96 // significant bits in UTF-8 continuation bytes. |
|
97 typedef struct { |
|
98 const uint32 state0; |
|
99 const uint32 state0_size; |
|
100 const uint32 total_size; |
|
101 const int max_expand; |
|
102 const int entry_shift; |
|
103 const int bytes_per_entry; |
|
104 const uint32 losub; |
|
105 const uint32 hiadd; |
|
106 const uint8* state_table; |
|
107 const RemapEntry* remap_base; |
|
108 const uint8* remap_string; |
|
109 const uint8* fast_state; |
|
110 } UTF8StateMachineObj; |
|
111 |
|
112 // Near-duplicate declaration for tables with two-byte entries |
|
113 typedef struct { |
|
114 const uint32 state0; |
|
115 const uint32 state0_size; |
|
116 const uint32 total_size; |
|
117 const int max_expand; |
|
118 const int entry_shift; |
|
119 const int bytes_per_entry; |
|
120 const uint32 losub; |
|
121 const uint32 hiadd; |
|
122 const unsigned short* state_table; |
|
123 const RemapEntry* remap_base; |
|
124 const uint8* remap_string; |
|
125 const uint8* fast_state; |
|
126 } UTF8StateMachineObj_2; |
|
127 |
|
128 |
|
129 typedef UTF8StateMachineObj UTF8PropObj; |
|
130 typedef UTF8StateMachineObj UTF8ScanObj; |
|
131 typedef UTF8StateMachineObj UTF8ReplaceObj; |
|
132 typedef UTF8StateMachineObj_2 UTF8PropObj_2; |
|
133 typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2; |
|
134 // NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2; |
|
135 |
|
136 |
|
137 // Look up property of one UTF-8 character and advance over it |
|
138 // Return 0 if input length is zero |
|
139 // Return 0 and advance one byte if input is ill-formed |
|
140 uint8 UTF8GenericProperty(const UTF8PropObj* st, |
|
141 const uint8** src, |
|
142 int* srclen); |
|
143 |
|
144 // Look up property of one UTF-8 character (assumed to be valid). |
|
145 // (This is a faster version of UTF8GenericProperty.) |
|
146 bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src); |
|
147 |
|
148 |
|
149 // BigOneByte versions are needed for tables > 240 states, but most |
|
150 // won't need the TwoByte versions. |
|
151 |
|
152 // Look up property of one UTF-8 character and advance over it |
|
153 // Return 0 if input length is zero |
|
154 // Return 0 and advance one byte if input is ill-formed |
|
155 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, |
|
156 const uint8** src, |
|
157 int* srclen); |
|
158 |
|
159 |
|
160 // TwoByte versions are needed for tables > 240 states that don't fit onto |
|
161 // BigOneByte -- rare ultimate fallback |
|
162 |
|
163 // Look up property of one UTF-8 character (assumed to be valid). |
|
164 // (This is a faster version of UTF8GenericProperty.) |
|
165 bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src); |
|
166 |
|
167 // Look up property of one UTF-8 character and advance over it |
|
168 // Return 0 if input length is zero |
|
169 // Return 0 and advance one byte if input is ill-formed |
|
170 uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st, |
|
171 const uint8** src, |
|
172 int* srclen); |
|
173 |
|
174 // Look up property of one UTF-8 character (assumed to be valid). |
|
175 // (This is a faster version of UTF8GenericProperty.) |
|
176 bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src); |
|
177 |
|
178 // Scan a UTF-8 stringpiece based on a state table. |
|
179 // Always scan complete UTF-8 characters |
|
180 // Set number of bytes scanned. Return reason for exiting |
|
181 int UTF8GenericScan(const UTF8ScanObj* st, |
|
182 const StringPiece& str, |
|
183 int* bytes_consumed); |
|
184 |
|
185 |
|
186 |
|
187 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece |
|
188 // and doing text replacements. |
|
189 // Always scan complete UTF-8 characters |
|
190 // Set number of bytes consumed from input, number filled to output. |
|
191 // Return reason for exiting |
|
192 // Also writes an optional OffsetMap. Pass NULL to skip writing one. |
|
193 int UTF8GenericReplace(const UTF8ReplaceObj* st, |
|
194 const StringPiece& istr, |
|
195 StringPiece& ostr, |
|
196 bool is_plain_text, |
|
197 int* bytes_consumed, |
|
198 int* bytes_filled, |
|
199 int* chars_changed, |
|
200 OffsetMap* offsetmap); |
|
201 |
|
202 // Older version without offsetmap |
|
203 int UTF8GenericReplace(const UTF8ReplaceObj* st, |
|
204 const StringPiece& istr, |
|
205 StringPiece& ostr, |
|
206 bool is_plain_text, |
|
207 int* bytes_consumed, |
|
208 int* bytes_filled, |
|
209 int* chars_changed); |
|
210 |
|
211 // Older version without is_plain_text or offsetmap |
|
212 int UTF8GenericReplace(const UTF8ReplaceObj* st, |
|
213 const StringPiece& istr, |
|
214 StringPiece& ostr, |
|
215 int* bytes_consumed, |
|
216 int* bytes_filled, |
|
217 int* chars_changed); |
|
218 |
|
219 |
|
220 // TwoByte version is needed for tables > about 256 states, such |
|
221 // as the table for full Unicode 4.1 canonical + compatibility mapping |
|
222 |
|
223 // Scan a UTF-8 stringpiece based on state table with two-byte entries, |
|
224 // copying to output stringpiece |
|
225 // and doing text replacements. |
|
226 // Always scan complete UTF-8 characters |
|
227 // Set number of bytes consumed from input, number filled to output. |
|
228 // Return reason for exiting |
|
229 // Also writes an optional OffsetMap. Pass NULL to skip writing one. |
|
230 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, |
|
231 const StringPiece& istr, |
|
232 StringPiece& ostr, |
|
233 bool is_plain_text, |
|
234 int* bytes_consumed, |
|
235 int* bytes_filled, |
|
236 int* chars_changed, |
|
237 OffsetMap* offsetmap); |
|
238 |
|
239 // Older version without offsetmap |
|
240 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, |
|
241 const StringPiece& istr, |
|
242 StringPiece& ostr, |
|
243 bool is_plain_text, |
|
244 int* bytes_consumed, |
|
245 int* bytes_filled, |
|
246 int* chars_changed); |
|
247 |
|
248 // Older version without is_plain_text or offsetmap |
|
249 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, |
|
250 const StringPiece& istr, |
|
251 StringPiece& ostr, |
|
252 int* bytes_consumed, |
|
253 int* bytes_filled, |
|
254 int* chars_changed); |
|
255 |
|
256 |
|
// Length in bytes of a UTF-8 character, indexed by its first byte.
// Bytes 0x00..0xBF map to 1 (this includes the continuation-byte range
// 0x80..0xBF), 0xC0..0xDF map to 2, 0xE0..0xEF map to 3, and 0xF0..0xFF
// map to 4.
static const unsigned char kUTF8LenTbl[256] = {
  // 0x00..0x7F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

  // 0x80..0xBF
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xC0..0xDF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  // 0xE0..0xEF, then 0xF0..0xFF
  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
};

// Return the kUTF8LenTbl entry for the byte that `in` points at.
// Exactly one byte is read; `in` must point at a readable byte.
//
// Declared static to match the internal linkage of kUTF8LenTbl: an
// external-linkage inline function that subscripts a per-translation-unit
// table would have definitions referring to different objects in each
// translation unit.
static inline int UTF8OneCharLen(const char* in) {
  // Index by the unsigned byte value so that bytes >= 0x80 never become a
  // negative subscript on platforms where plain char is signed.
  return kUTF8LenTbl[(unsigned char)*in];
}
|
272 |
|
273 // Adjust a stringpiece to encompass complete UTF-8 characters. |
|
274 // The data pointer will be increased by 0..3 bytes to get to a character |
|
275 // boundary, and the length will then be decreased by 0..3 bytes |
|
276 // to encompass the last complete character. |
|
277 // This is useful especially when a UTF-8 string must be put into a fixed- |
|
278 // maximum-size buffer cleanly, such as a MySQL buffer. |
|
279 void UTF8TrimToChars(StringPiece* istr); |
|
280 |
|
281 } // End namespace CLD2 |
|
282 |
|
283 #endif // UTIL_UTF8_UTF8STATETABLE_H_ |