|
1 // Copyright 2013 Google Inc. All Rights Reserved. |
|
2 // |
|
3 // Licensed under the Apache License, Version 2.0 (the "License"); |
|
4 // you may not use this file except in compliance with the License. |
|
5 // You may obtain a copy of the License at |
|
6 // |
|
7 // http://www.apache.org/licenses/LICENSE-2.0 |
|
8 // |
|
9 // Unless required by applicable law or agreed to in writing, software |
|
10 // distributed under the License is distributed on an "AS IS" BASIS, |
|
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
12 // See the License for the specific language governing permissions and |
|
13 // limitations under the License. |
|
14 |
|
15 // |
|
16 // State Table follower for scanning UTF-8 strings without converting to |
|
17 // 32- or 16-bit Unicode values. |
|
18 // |
|
19 |
|
20 #ifdef COMPILER_MSVC |
|
21 // MSVC warns: warning C4309: 'initializing' : truncation of constant value |
|
22 // But the value is in fact not truncated. 0xFF still comes out 0xFF at |
|
23 // runtime. |
|
24 #pragma warning ( disable : 4309 ) |
|
25 #endif |
|
26 |
|
27 #include "utf8statetable.h" |
|
28 |
|
29 #include <stdint.h> // for uintptr_t |
|
30 #include <string.h> // for NULL, memcpy, memmove |
|
31 |
|
32 #include "integral_types.h" // for uint8, uint32, int8 |
|
33 #include "stringpiece.h" |
|
34 #include "offsetmap.h" |
|
35 |
|
36 |
|
37 namespace CLD2 { |
|
38 |
|
// Flag bit kept in a remap entry's delete_bytes field. When it is set, a
// next-state byte is stored immediately after the replacement text.
static const int kReplaceAndResumeFlag = 1 << 7;

// Flag bit kept in a remap entry's add_bytes field. It marks the first entry
// of a pair and so distinguishes HTML replacement from plaintext replacement.
static const int kHtmlPlaintextFlag = 1 << 7;
|
44 |
|
45 |
|
46 /** |
|
47 * This code implements a little interpreter for UTF8 state |
|
48 * tables. There are three kinds of quite-similar state tables, |
|
49 * property, scanning, and replacement. Each state in one of |
|
50 * these tables consists of an array of 256 or 64 one-byte |
|
51 * entries. The state is subscripted by an incoming source byte, |
|
52 * and the entry either specifies the next state or specifies an |
|
53 * action. Space-optimized tables have full 256-entry states for |
|
54 * the first byte of a UTF-8 character, but only 64-entry states |
|
55 * for continuation bytes. Space-optimized tables may only be |
|
56 * used with source input that has been checked to be |
|
57 * structurally- (or stronger interchange-) valid. |
|
58 * |
|
59 * A property state table has an unsigned one-byte property for |
|
60 * each possible UTF-8 character. One-byte character properties |
|
61 * are in the state[0] array, while for other lengths the |
|
62 * state[0] array gives the next state, which contains the |
|
63 * property value for two-byte characters or yet another state |
|
64 * for longer ones. The code simply loads the right number of |
|
65 * next-state values, then returns the final byte as property |
|
66 * value. There are no actions specified in property tables. |
|
67 * States are typically shared for multi-byte UTF-8 characters |
|
68 * that all have the same property value. |
|
69 * |
|
70 * A scanning state table has entries that are either a |
|
71 * next-state specifier for bytes that are accepted by the |
|
72 * scanner, or an exit action for the last byte of each |
|
73 * character that is rejected by the scanner. |
|
74 * |
|
75 * Scanning long strings involves a tight loop that picks up one |
|
76 * byte at a time and follows next-state value back to state[0] |
|
77 * for each accepted UTF-8 character. Scanning stops at the end |
|
78 * of the string or at the first character encountered that has |
|
79 * an exit action such as "reject". Timing information is given |
|
80 * below. |
|
81 * |
|
82 * Since so much of Google's text is 7-bit-ASCII values |
|
83 * (approximately 94% of the bytes of web documents), the |
|
84 * scanning interpreter has two speed optimizations. One checks |
|
85 * 8 bytes at a time to see if they are all in the range lo..hi, |
|
86 * as specified in constants in the overall statetable object. |
|
87 * The check involves ORing together four 4-byte values that |
|
88 * overflow into the high bit of some byte when a byte is out of |
|
89 * range. For seven-bit-ASCII, lo is 0x20 and hi is 0x7E. This |
|
90 * loop is about 8x faster than the one-byte-at-a-time loop. |
|
91 * |
|
92 * If checking for exit bytes in the 0x00-0x1F and 7F range is |
|
93 * unneeded, an even faster loop just looks at the high bits of |
|
94 * 8 bytes at once, and is about 1.33x faster than the lo..hi |
|
95 * loop. |
|
96 * |
|
97 * Exit from the scanning routines backs up to the first byte of |
|
98 * the rejected character, so the text spanned is always a |
|
99 * complete number of UTF-8 characters. The normal scanning exit |
|
100 * is at the first rejected character, or at the end of the |
|
101 * input text. Scanning also exits on any detected ill-formed |
|
102 * character or at a special do-again action built into some |
|
103 * exit-optimized tables. The do-again action gets back to the |
|
104 * top of the scanning loop to retry eight-byte ASCII scans. It |
|
105 * is typically put into state tables after four seven-bit-ASCII |
|
106 * characters in a row are seen, to allow restarting the fast |
|
107 * scan after some slower processing of multi-byte characters. |
|
108 * |
|
109 * A replacement state table is similar to a scanning state |
|
110 * table but has more extensive actions. The default |
|
111 * byte-at-a-time loop copies one byte from source to |
|
112 * destination and goes to the next state. The replacement |
|
113 * actions overwrite 1-3 bytes of the destination with different |
|
114 * bytes, possibly shortening the output by 1 or 2 bytes. The |
|
115 * replacement bytes come from within the state table, from |
|
116 * dummy states inserted just after any state that contains a |
|
117 * replacement action. This gives a quick address calculation for |
|
118 * the replacement byte(s) and gives some cache locality. |
|
119 * |
|
120 * Additional replacement actions use one or two bytes from |
|
121 * within dummy states to index a side table of more-extensive |
|
122 * replacements. The side table specifies a length of 0..15 |
|
123 * destination bytes to overwrite and a length of 0..127 bytes |
|
124 * to overwrite them with, plus the actual replacement bytes. |
|
125 * |
|
126 * This side table uses one extra bit to specify a pair of |
|
127 * replacements, the first to be used in an HTML context and the |
|
128 * second to be used in a plaintext context. This allows |
|
 * replacements that are spelled with "&lt;" in the former
|
130 * context and "<" in the latter. |
|
131 * |
|
132 * The side table also uses an extra bit to specify a non-zero |
|
133 * next state after a replacement. This allows a combination |
|
134 * replacement and state change, used to implement a limited |
|
135 * version of the Boyer-Moore algorithm for multi-character |
|
136 * replacement without backtracking. This is useful when there |
|
137 * are overlapping replacements, such as ch => x and also c => |
|
138 * y, the latter to be used only if the character after c is not |
|
 * h. In this case, the state[0] table's entry for c would
|
140 * change c to y and also have a next-state of say n, and the |
|
141 * state[n] entry for h would specify a replacement of the two |
|
142 * bytes yh by x. No backtracking is needed. |
|
143 * |
|
144 * A replacement table may also include the exit actions of a |
|
145 * scanning state table, so some character sequences can |
|
146 * terminate early. |
|
147 * |
|
148 * During replacement, an optional data structure called an |
|
149 * offset map can be updated to reflect each change in length |
|
150 * between source and destination. This offset map can later be |
|
151 * used to map destination-string offsets to corresponding |
|
152 * source-string offsets or vice versa. |
|
153 * |
|
154 * The routines below also have variants in which state-table |
|
155 * entries are all two bytes instead of one byte. This allows |
|
156 * tables with more than 240 total states, but takes up twice as |
|
157 * much space per state. |
|
158 * |
|
159 **/ |
|
160 |
|
161 // Return true if current Tbl pointer is within state0 range |
|
162 // Note that unsigned compare checks both ends of range simultaneously |
|
163 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) { |
|
164 const uint8* Tbl0 = &st->state_table[st->state0]; |
|
165 return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size); |
|
166 } |
|
167 |
|
168 static inline bool InStateZero_2(const UTF8ReplaceObj_2* st, |
|
169 const unsigned short int* Tbl) { |
|
170 const unsigned short int* Tbl0 = &st->state_table[st->state0]; |
|
171 // Word difference, not byte difference |
|
172 return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size); |
|
173 } |
|
174 |
|
175 // UTF8PropObj, UTF8ScanObj, UTF8ReplaceObj are all typedefs of |
|
// UTF8StateMachineObj.
|
177 |
|
178 static bool IsPropObj(const UTF8StateMachineObj& obj) { |
|
179 return obj.fast_state == NULL |
|
180 && obj.max_expand == 0; |
|
181 } |
|
182 |
|
183 static bool IsPropObj_2(const UTF8StateMachineObj_2& obj) { |
|
184 return obj.fast_state == NULL |
|
185 && obj.max_expand == 0; |
|
186 } |
|
187 |
|
188 static bool IsScanObj(const UTF8StateMachineObj& obj) { |
|
189 return obj.fast_state != NULL |
|
190 && obj.max_expand == 0; |
|
191 } |
|
192 |
|
193 static bool IsReplaceObj(const UTF8StateMachineObj& obj) { |
|
194 // Normally, obj.fast_state != NULL, but the handwritten tables |
|
195 // in utf8statetable_unittest don't handle fast_states. |
|
196 return obj.max_expand > 0; |
|
197 } |
|
198 |
|
199 static bool IsReplaceObj_2(const UTF8StateMachineObj_2& obj) { |
|
200 return obj.max_expand > 0; |
|
201 } |
|
202 |
|
203 // Look up property of one UTF-8 character and advance over it |
|
204 // Return 0 if input length is zero |
|
205 // Return 0 and advance one byte if input is ill-formed |
|
206 uint8 UTF8GenericProperty(const UTF8PropObj* st, |
|
207 const uint8** src, |
|
208 int* srclen) { |
|
209 if (*srclen <= 0) { |
|
210 return 0; |
|
211 } |
|
212 |
|
213 const uint8* lsrc = *src; |
|
214 const uint8* Tbl_0 = &st->state_table[st->state0]; |
|
215 const uint8* Tbl = Tbl_0; |
|
216 int e; |
|
217 int eshift = st->entry_shift; |
|
218 |
|
219 // Short series of tests faster than switch, optimizes 7-bit ASCII |
|
220 unsigned char c = lsrc[0]; |
|
221 if (static_cast<signed char>(c) >= 0) { // one byte |
|
222 e = Tbl[c]; |
|
223 *src += 1; |
|
224 *srclen -= 1; |
|
225 } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes |
|
226 e = Tbl[c]; |
|
227 Tbl = &Tbl_0[e << eshift]; |
|
228 e = Tbl[lsrc[1]]; |
|
229 *src += 2; |
|
230 *srclen -= 2; |
|
231 } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes |
|
232 e = Tbl[c]; |
|
233 Tbl = &Tbl_0[e << eshift]; |
|
234 e = Tbl[lsrc[1]]; |
|
235 Tbl = &Tbl_0[e << eshift]; |
|
236 e = Tbl[lsrc[2]]; |
|
237 *src += 3; |
|
238 *srclen -= 3; |
|
239 }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes |
|
240 e = Tbl[c]; |
|
241 Tbl = &Tbl_0[e << eshift]; |
|
242 e = Tbl[lsrc[1]]; |
|
243 Tbl = &Tbl_0[e << eshift]; |
|
244 e = Tbl[lsrc[2]]; |
|
245 Tbl = &Tbl_0[e << eshift]; |
|
246 e = Tbl[lsrc[3]]; |
|
247 *src += 4; |
|
248 *srclen -= 4; |
|
249 } else { // Ill-formed |
|
250 e = 0; |
|
251 *src += 1; |
|
252 *srclen -= 1; |
|
253 } |
|
254 return e; |
|
255 } |
|
256 |
|
257 bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src) { |
|
258 const uint8* lsrc = reinterpret_cast<const uint8*>(src); |
|
259 const uint8* Tbl_0 = &st.state_table[st.state0]; |
|
260 const uint8* Tbl = Tbl_0; |
|
261 int e; |
|
262 int eshift = st.entry_shift; |
|
263 |
|
264 // Short series of tests faster than switch, optimizes 7-bit ASCII |
|
265 unsigned char c = lsrc[0]; |
|
266 if (static_cast<signed char>(c) >= 0) { // one byte |
|
267 e = Tbl[c]; |
|
268 } else if ((c & 0xe0) == 0xc0) { // two bytes |
|
269 e = Tbl[c]; |
|
270 Tbl = &Tbl_0[e << eshift]; |
|
271 e = Tbl[lsrc[1]]; |
|
272 } else if ((c & 0xf0) == 0xe0) { // three bytes |
|
273 e = Tbl[c]; |
|
274 Tbl = &Tbl_0[e << eshift]; |
|
275 e = Tbl[lsrc[1]]; |
|
276 Tbl = &Tbl_0[e << eshift]; |
|
277 e = Tbl[lsrc[2]]; |
|
278 } else { // four bytes |
|
279 e = Tbl[c]; |
|
280 Tbl = &Tbl_0[e << eshift]; |
|
281 e = Tbl[lsrc[1]]; |
|
282 Tbl = &Tbl_0[e << eshift]; |
|
283 e = Tbl[lsrc[2]]; |
|
284 Tbl = &Tbl_0[e << eshift]; |
|
285 e = Tbl[lsrc[3]]; |
|
286 } |
|
287 return e; |
|
288 } |
|
289 |
|
290 |
|
291 // BigOneByte versions are needed for tables > 240 states, but most |
|
292 // won't need the TwoByte versions. |
|
293 // Internally, to next-to-last offset is multiplied by 16 and the last |
|
294 // offset is relative instead of absolute. |
|
295 // Look up property of one UTF-8 character and advance over it |
|
296 // Return 0 if input length is zero |
|
297 // Return 0 and advance one byte if input is ill-formed |
|
298 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, |
|
299 const uint8** src, |
|
300 int* srclen) { |
|
301 if (*srclen <= 0) { |
|
302 return 0; |
|
303 } |
|
304 |
|
305 const uint8* lsrc = *src; |
|
306 const uint8* Tbl_0 = &st->state_table[st->state0]; |
|
307 const uint8* Tbl = Tbl_0; |
|
308 int e; |
|
309 int eshift = st->entry_shift; |
|
310 |
|
311 // Short series of tests faster than switch, optimizes 7-bit ASCII |
|
312 unsigned char c = lsrc[0]; |
|
313 if (static_cast<signed char>(c) >= 0) { // one byte |
|
314 e = Tbl[c]; |
|
315 *src += 1; |
|
316 *srclen -= 1; |
|
317 } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes |
|
318 e = Tbl[c]; |
|
319 Tbl = &Tbl_0[e << eshift]; |
|
320 e = Tbl[lsrc[1]]; |
|
321 *src += 2; |
|
322 *srclen -= 2; |
|
323 } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes |
|
324 e = Tbl[c]; |
|
325 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range |
|
326 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]]; |
|
327 Tbl = &Tbl[e << eshift]; // Relative +/- |
|
328 e = Tbl[lsrc[2]]; |
|
329 *src += 3; |
|
330 *srclen -= 3; |
|
331 }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes |
|
332 e = Tbl[c]; |
|
333 Tbl = &Tbl_0[e << eshift]; |
|
334 e = Tbl[lsrc[1]]; |
|
335 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range |
|
336 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]]; |
|
337 Tbl = &Tbl[e << eshift]; // Relative +/- |
|
338 e = Tbl[lsrc[3]]; |
|
339 *src += 4; |
|
340 *srclen -= 4; |
|
341 } else { // Ill-formed |
|
342 e = 0; |
|
343 *src += 1; |
|
344 *srclen -= 1; |
|
345 } |
|
346 return e; |
|
347 } |
|
348 |
|
349 // BigOneByte versions are needed for tables > 240 states, but most |
|
350 // won't need the TwoByte versions. |
|
351 bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src) { |
|
352 const uint8* lsrc = reinterpret_cast<const uint8*>(src); |
|
353 const uint8* Tbl_0 = &st.state_table[st.state0]; |
|
354 const uint8* Tbl = Tbl_0; |
|
355 int e; |
|
356 int eshift = st.entry_shift; |
|
357 |
|
358 // Short series of tests faster than switch, optimizes 7-bit ASCII |
|
359 unsigned char c = lsrc[0]; |
|
360 if (static_cast<signed char>(c) >= 0) { // one byte |
|
361 e = Tbl[c]; |
|
362 } else if ((c & 0xe0) == 0xc0) { // two bytes |
|
363 e = Tbl[c]; |
|
364 Tbl = &Tbl_0[e << eshift]; |
|
365 e = Tbl[lsrc[1]]; |
|
366 } else if ((c & 0xf0) == 0xe0) { // three bytes |
|
367 e = Tbl[c]; |
|
368 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range |
|
369 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]]; |
|
370 Tbl = &Tbl[e << eshift]; // Relative +/- |
|
371 e = Tbl[lsrc[2]]; |
|
372 } else { // four bytes |
|
373 e = Tbl[c]; |
|
374 Tbl = &Tbl_0[e << eshift]; |
|
375 e = Tbl[lsrc[1]]; |
|
376 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range |
|
377 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]]; |
|
378 Tbl = &Tbl[e << eshift]; // Relative +/- |
|
379 e = Tbl[lsrc[3]]; |
|
380 } |
|
381 return e; |
|
382 } |
|
383 |
|
384 |
|
385 // TwoByte versions are needed for tables > 240 states |
|
386 // Look up property of one UTF-8 character and advance over it |
|
387 // Return 0 if input length is zero |
|
388 // Return 0 and advance one byte if input is ill-formed |
|
389 uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st, |
|
390 const uint8** src, |
|
391 int* srclen) { |
|
392 if (*srclen <= 0) { |
|
393 return 0; |
|
394 } |
|
395 |
|
396 const uint8* lsrc = *src; |
|
397 const unsigned short* Tbl_0 = &st->state_table[st->state0]; |
|
398 const unsigned short* Tbl = Tbl_0; |
|
399 int e; |
|
400 int eshift = st->entry_shift; |
|
401 |
|
402 // Short series of tests faster than switch, optimizes 7-bit ASCII |
|
403 unsigned char c = lsrc[0]; |
|
404 if (static_cast<signed char>(c) >= 0) { // one byte |
|
405 e = Tbl[c]; |
|
406 *src += 1; |
|
407 *srclen -= 1; |
|
408 } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes |
|
409 e = Tbl[c]; |
|
410 Tbl = &Tbl_0[e << eshift]; |
|
411 e = Tbl[lsrc[1]]; |
|
412 *src += 2; |
|
413 *srclen -= 2; |
|
414 } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes |
|
415 e = Tbl[c]; |
|
416 Tbl = &Tbl_0[e << eshift]; |
|
417 e = Tbl[lsrc[1]]; |
|
418 Tbl = &Tbl_0[e << eshift]; |
|
419 e = Tbl[lsrc[2]]; |
|
420 *src += 3; |
|
421 *srclen -= 3; |
|
422 }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes |
|
423 e = Tbl[c]; |
|
424 Tbl = &Tbl_0[e << eshift]; |
|
425 e = Tbl[lsrc[1]]; |
|
426 Tbl = &Tbl_0[e << eshift]; |
|
427 e = Tbl[lsrc[2]]; |
|
428 Tbl = &Tbl_0[e << eshift]; |
|
429 e = Tbl[lsrc[3]]; |
|
430 *src += 4; |
|
431 *srclen -= 4; |
|
432 } else { // Ill-formed |
|
433 e = 0; |
|
434 *src += 1; |
|
435 *srclen -= 1; |
|
436 } |
|
437 return e; |
|
438 } |
|
439 |
|
440 // TwoByte versions are needed for tables > 240 states |
|
441 bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src) { |
|
442 const uint8* lsrc = reinterpret_cast<const uint8*>(src); |
|
443 const unsigned short* Tbl_0 = &st.state_table[st.state0]; |
|
444 const unsigned short* Tbl = Tbl_0; |
|
445 int e; |
|
446 int eshift = st.entry_shift; |
|
447 |
|
448 // Short series of tests faster than switch, optimizes 7-bit ASCII |
|
449 unsigned char c = lsrc[0]; |
|
450 if (static_cast<signed char>(c) >= 0) { // one byte |
|
451 e = Tbl[c]; |
|
452 } else if ((c & 0xe0) == 0xc0) { // two bytes |
|
453 e = Tbl[c]; |
|
454 Tbl = &Tbl_0[e << eshift]; |
|
455 e = Tbl[lsrc[1]]; |
|
456 } else if ((c & 0xf0) == 0xe0) { // three bytes |
|
457 e = Tbl[c]; |
|
458 Tbl = &Tbl_0[e << eshift]; |
|
459 e = Tbl[lsrc[1]]; |
|
460 Tbl = &Tbl_0[e << eshift]; |
|
461 e = Tbl[lsrc[2]]; |
|
462 } else { // four bytes |
|
463 e = Tbl[c]; |
|
464 Tbl = &Tbl_0[e << eshift]; |
|
465 e = Tbl[lsrc[1]]; |
|
466 Tbl = &Tbl_0[e << eshift]; |
|
467 e = Tbl[lsrc[2]]; |
|
468 Tbl = &Tbl_0[e << eshift]; |
|
469 e = Tbl[lsrc[3]]; |
|
470 } |
|
471 return e; |
|
472 } |
|
473 |
|
474 |
|
475 // Approximate speeds on 2.8 GHz Pentium 4: |
|
476 // GenericScan 1-byte loop 300 MB/sec * |
|
477 // GenericScan 4-byte loop 1200 MB/sec |
|
478 // GenericScan 8-byte loop 2400 MB/sec * |
|
479 // GenericScanFastAscii 4-byte loop 3000 MB/sec |
|
480 // GenericScanFastAscii 8-byte loop 3200 MB/sec * |
|
481 // |
|
482 // * Implemented below. FastAscii loop is memory-bandwidth constrained. |
|
483 |
|
484 // Scan a UTF-8 stringpiece based on state table. |
|
485 // Always scan complete UTF-8 characters |
|
486 // Set number of bytes scanned. Return reason for exiting |
|
487 int UTF8GenericScan(const UTF8ScanObj* st, |
|
488 const StringPiece& str, |
|
489 int* bytes_consumed) { |
|
490 int eshift = st->entry_shift; // 6 (space optimized) or 8 |
|
491 // int nEntries = (1 << eshift); // 64 or 256 entries per state |
|
492 |
|
493 const uint8* isrc = |
|
494 reinterpret_cast<const uint8*>(str.data()); |
|
495 const uint8* src = isrc; |
|
496 const int len = str.length(); |
|
497 const uint8* srclimit = isrc + len; |
|
498 const uint8* srclimit8 = srclimit - 7; |
|
499 *bytes_consumed = 0; |
|
500 if (len == 0) return kExitOK; |
|
501 |
|
502 const uint8* Tbl_0 = &st->state_table[st->state0]; |
|
503 |
|
504 DoAgain: |
|
505 // Do state-table scan |
|
506 int e = 0; |
|
507 uint8 c; |
|
508 |
|
509 // Do fast for groups of 8 identity bytes. |
|
510 // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop, |
|
511 // including slowing slightly on cr/lf/ht |
|
512 //---------------------------- |
|
513 const uint8* Tbl2 = &st->fast_state[0]; |
|
514 uint32 losub = st->losub; |
|
515 uint32 hiadd = st->hiadd; |
|
516 while (src < srclimit8) { |
|
517 uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0]; |
|
518 uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1]; |
|
519 src += 8; |
|
520 // This is a fast range check for all bytes in [lowsub..0x80-hiadd) |
|
521 uint32 temp = (s0123 - losub) | (s0123 + hiadd) | |
|
522 (s4567 - losub) | (s4567 + hiadd); |
|
523 if ((temp & 0x80808080) != 0) { |
|
524 // We typically end up here on cr/lf/ht; src was incremented |
|
525 int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) | |
|
526 (Tbl2[src[-6]] | Tbl2[src[-5]]); |
|
527 if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange |
|
528 e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) | |
|
529 (Tbl2[src[-2]] | Tbl2[src[-1]]); |
|
530 if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange |
|
531 // Else OK, go around again |
|
532 } |
|
533 } |
|
534 //---------------------------- |
|
535 |
|
536 // Byte-at-a-time scan |
|
537 //---------------------------- |
|
538 const uint8* Tbl = Tbl_0; |
|
539 while (src < srclimit) { |
|
540 c = *src; |
|
541 e = Tbl[c]; |
|
542 src++; |
|
543 if (e >= kExitIllegalStructure) {break;} |
|
544 Tbl = &Tbl_0[e << eshift]; |
|
545 } |
|
546 //---------------------------- |
|
547 |
|
548 |
|
549 // Exit possibilities: |
|
550 // Some exit code, !state0, back up over last char |
|
551 // Some exit code, state0, back up one byte exactly |
|
552 // source consumed, !state0, back up over partial char |
|
553 // source consumed, state0, exit OK |
|
554 // For illegal byte in state0, avoid backup up over PREVIOUS char |
|
555 // For truncated last char, back up to beginning of it |
|
556 |
|
557 if (e >= kExitIllegalStructure) { |
|
558 // Back up over exactly one byte of rejected/illegal UTF-8 character |
|
559 src--; |
|
560 // Back up more if needed |
|
561 if (!InStateZero(st, Tbl)) { |
|
562 do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); |
|
563 } |
|
564 } else if (!InStateZero(st, Tbl)) { |
|
565 // Back up over truncated UTF-8 character |
|
566 e = kExitIllegalStructure; |
|
567 do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); |
|
568 } else { |
|
569 // Normal termination, source fully consumed |
|
570 e = kExitOK; |
|
571 } |
|
572 |
|
573 if (e == kExitDoAgain) { |
|
574 // Loop back up to the fast scan |
|
575 goto DoAgain; |
|
576 } |
|
577 |
|
578 *bytes_consumed = src - isrc; |
|
579 return e; |
|
580 } |
|
581 |
|
582 // Scan a UTF-8 stringpiece based on state table. |
|
583 // Always scan complete UTF-8 characters |
|
584 // Set number of bytes scanned. Return reason for exiting |
|
585 // OPTIMIZED for case of 7-bit ASCII 0000..007f all valid |
|
586 int UTF8GenericScanFastAscii(const UTF8ScanObj* st, |
|
587 const StringPiece& str, |
|
588 int* bytes_consumed) { |
|
589 const uint8* isrc = |
|
590 reinterpret_cast<const uint8*>(str.data()); |
|
591 const uint8* src = isrc; |
|
592 const int len = str.length(); |
|
593 const uint8* srclimit = isrc + len; |
|
594 const uint8* srclimit8 = srclimit - 7; |
|
595 *bytes_consumed = 0; |
|
596 if (len == 0) return kExitOK; |
|
597 |
|
598 int n; |
|
599 int rest_consumed; |
|
600 int exit_reason; |
|
601 do { |
|
602 // Skip 8 bytes of ASCII at a whack; no endianness issue |
|
603 while ((src < srclimit8) && |
|
604 (((reinterpret_cast<const uint32*>(src)[0] | |
|
605 reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) { |
|
606 src += 8; |
|
607 } |
|
608 // Run state table on the rest |
|
609 n = src - isrc; |
|
610 StringPiece str2(str.data() + n, str.length() - n); |
|
611 exit_reason = UTF8GenericScan(st, str2, &rest_consumed); |
|
612 src += rest_consumed; |
|
613 } while ( exit_reason == kExitDoAgain ); |
|
614 |
|
615 *bytes_consumed = src - isrc; |
|
616 return exit_reason; |
|
617 } |
|
618 |
|
// Hook for the extra fixup step of a special exit action. The original note
// describes it as a hack to change halfwidth katakana to match an old
// UTF8CharToLower(); the body here performs no fixup and touches none of
// its arguments.
// Returns the number of src bytes skipped, which is always 0.
static int DoSpecialFixup(const unsigned char c,
                    const unsigned char** srcp, const unsigned char* srclimit,
                    unsigned char** dstp, unsigned char* dstlimit) {
  // Deliberately unused: nothing is read, written or advanced.
  (void)c;
  (void)srcp;
  (void)srclimit;
  (void)dstp;
  (void)dstlimit;

  const int src_bytes_skipped = 0;
  return src_bytes_skipped;
}
|
627 |
|
628 |
|
629 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece |
|
630 // and doing text replacements. |
|
631 // DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below |
|
632 // Needs caller to loop on kExitDoAgain |
|
633 static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st, |
|
634 const StringPiece& istr, |
|
635 StringPiece& ostr, |
|
636 bool is_plain_text, |
|
637 int* bytes_consumed, |
|
638 int* bytes_filled, |
|
639 int* chars_changed, |
|
640 OffsetMap* offsetmap) { |
|
641 int eshift = st->entry_shift; |
|
642 int nEntries = (1 << eshift); // 64 or 256 entries per state |
|
643 const uint8* isrc = reinterpret_cast<const uint8*>(istr.data()); |
|
644 const int ilen = istr.length(); |
|
645 const uint8* copystart = isrc; |
|
646 const uint8* src = isrc; |
|
647 const uint8* srclimit = src + ilen; |
|
648 *bytes_consumed = 0; |
|
649 *bytes_filled = 0; |
|
650 *chars_changed = 0; |
|
651 |
|
652 const uint8* odst = reinterpret_cast<const uint8*>(ostr.data()); |
|
653 const int olen = ostr.length(); |
|
654 uint8* dst = const_cast<uint8*>(odst); |
|
655 uint8* dstlimit = dst + olen; |
|
656 |
|
657 int total_changed = 0; |
|
658 |
|
659 // Invariant condition during replacements: |
|
660 // remaining dst size >= remaining src size |
|
661 if ((dstlimit - dst) < (srclimit - src)) { |
|
662 if (offsetmap != NULL) { |
|
663 offsetmap->Copy(src - copystart); |
|
664 copystart = src; |
|
665 } |
|
666 return kExitDstSpaceFull; |
|
667 } |
|
668 const uint8* Tbl_0 = &st->state_table[st->state0]; |
|
669 |
|
670 Do_state_table: |
|
671 // Do state-table scan, copying as we go |
|
672 const uint8* Tbl = Tbl_0; |
|
673 int e = 0; |
|
674 uint8 c = 0; |
|
675 |
|
676 Do_state_table_newe: |
|
677 |
|
678 //---------------------------- |
|
679 while (src < srclimit) { |
|
680 c = *src; |
|
681 e = Tbl[c]; |
|
682 *dst = c; |
|
683 src++; |
|
684 dst++; |
|
685 if (e >= kExitIllegalStructure) {break;} |
|
686 Tbl = &Tbl_0[e << eshift]; |
|
687 } |
|
688 //---------------------------- |
|
689 |
|
690 // Exit possibilities: |
|
691 // Replacement code, do the replacement and loop |
|
692 // Some other exit code, state0, back up one byte exactly |
|
693 // Some other exit code, !state0, back up over last char |
|
694 // source consumed, state0, exit OK |
|
695 // source consumed, !state0, back up over partial char |
|
696 // For illegal byte in state0, avoid backup up over PREVIOUS char |
|
697 // For truncated last char, back up to beginning of it |
|
698 |
|
699 if (e >= kExitIllegalStructure) { |
|
700 // Switch on exit code; most loop back to top |
|
701 int offset = 0; |
|
702 switch (e) { |
|
703 // These all make the output string the same size or shorter |
|
704 // No checking needed |
|
705 case kExitReplace31: // del 2, add 1 bytes to change |
|
706 dst -= 2; |
|
707 if (offsetmap != NULL) { |
|
708 offsetmap->Copy(src - copystart - 2); |
|
709 offsetmap->Delete(2); |
|
710 copystart = src; |
|
711 } |
|
712 dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; |
|
713 total_changed++; |
|
714 goto Do_state_table; |
|
715 case kExitReplace32: // del 3, add 2 bytes to change |
|
716 dst--; |
|
717 if (offsetmap != NULL) { |
|
718 offsetmap->Copy(src - copystart - 1); |
|
719 offsetmap->Delete(1); |
|
720 copystart = src; |
|
721 } |
|
722 dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)]; |
|
723 dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; |
|
724 total_changed++; |
|
725 goto Do_state_table; |
|
726 case kExitReplace21: // del 2, add 1 bytes to change |
|
727 dst--; |
|
728 if (offsetmap != NULL) { |
|
729 offsetmap->Copy(src - copystart - 1); |
|
730 offsetmap->Delete(1); |
|
731 copystart = src; |
|
732 } |
|
733 dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; |
|
734 total_changed++; |
|
735 goto Do_state_table; |
|
736 case kExitReplace3: // update 3 bytes to change |
|
737 dst[-3] = (unsigned char)Tbl[c + (nEntries * 3)]; |
|
738 // Fall into next case |
|
739 case kExitReplace2: // update 2 bytes to change |
|
740 dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)]; |
|
741 // Fall into next case |
|
742 case kExitReplace1: // update 1 byte to change |
|
743 dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; |
|
744 total_changed++; |
|
745 goto Do_state_table; |
|
746 case kExitReplace1S0: // update 1 byte to change, 256-entry state |
|
747 dst[-1] = (unsigned char)Tbl[c + (256 * 1)]; |
|
748 total_changed++; |
|
749 goto Do_state_table; |
|
750 // These can make the output string longer than the input |
|
751 case kExitReplaceOffset2: |
|
752 if ((nEntries != 256) && InStateZero(st, Tbl)) { |
|
753 // For space-optimized table, we need multiples of 256 bytes |
|
754 // in state0 and multiples of nEntries in other states |
|
755 offset += ((unsigned char)Tbl[c + (256 * 2)] << 8); |
|
756 } else { |
|
757 offset += ((unsigned char)Tbl[c + (nEntries * 2)] << 8); |
|
758 } |
|
759 // Fall into next case |
|
760 case kExitSpecial: // Apply special fixups [read: hacks] |
|
761 case kExitReplaceOffset1: |
|
762 if ((nEntries != 256) && InStateZero(st, Tbl)) { |
|
763 // For space-optimized table, we need multiples of 256 bytes |
|
764 // in state0 and multiples of nEntries in other states |
|
765 offset += (unsigned char)Tbl[c + (256 * 1)]; |
|
766 } else { |
|
767 offset += (unsigned char)Tbl[c + (nEntries * 1)]; |
|
768 } |
|
769 { |
|
770 const RemapEntry* re = &st->remap_base[offset]; |
|
771 int del_len = re->delete_bytes & ~kReplaceAndResumeFlag; |
|
772 int add_len = re->add_bytes & ~kHtmlPlaintextFlag; |
|
773 |
|
774 // Special-case non-HTML replacement of five sensitive entities |
|
775 // " & ' < > |
|
776 // 0022 0026 0027 003c 003e |
|
777 // A replacement creating one of these is expressed as a pair of |
|
778 // entries, one for HTML output and one for plaintext output. |
|
779 // The first of the pair has the high bit of add_bytes set. |
|
780 if (re->add_bytes & kHtmlPlaintextFlag) { |
|
781 // Use this entry for plain text |
|
782 if (!is_plain_text) { |
|
783 // Use very next entry for HTML text (same back/delete length) |
|
784 re = &st->remap_base[offset + 1]; |
|
785 add_len = re->add_bytes & ~kHtmlPlaintextFlag; |
|
786 } |
|
787 } |
|
788 |
|
789 int string_offset = re->bytes_offset; |
|
790 // After the replacement, need (dstlimit - newdst) >= (srclimit - src) |
|
791 uint8* newdst = dst - del_len + add_len; |
|
792 if ((dstlimit - newdst) < (srclimit - src)) { |
|
793 // Won't fit; don't do the replacement. Caller may realloc and retry |
|
794 e = kExitDstSpaceFull; |
|
795 break; // exit, backing up over this char for later retry |
|
796 } |
|
797 dst -= del_len; |
|
798 memcpy(dst, &st->remap_string[string_offset], add_len); |
|
799 dst += add_len; |
|
800 total_changed++; |
|
801 if (offsetmap != NULL) { |
|
802 if (add_len > del_len) { |
|
803 offsetmap->Copy(src - copystart); |
|
804 offsetmap->Insert(add_len - del_len); |
|
805 copystart = src; |
|
806 } else if (add_len < del_len) { |
|
807 offsetmap->Copy(src - copystart + add_len - del_len); |
|
808 offsetmap->Delete(del_len - add_len); |
|
809 copystart = src; |
|
810 } |
|
811 } |
|
812 if (re->delete_bytes & kReplaceAndResumeFlag) { |
|
813 // There is a non-zero target state at the end of the |
|
814 // replacement string |
|
815 e = st->remap_string[string_offset + add_len]; |
|
816 Tbl = &Tbl_0[e << eshift]; |
|
817 goto Do_state_table_newe; |
|
818 } |
|
819 } |
|
820 if (e == kExitRejectAlt) {break;} |
|
821 if (e != kExitSpecial) {goto Do_state_table;} |
|
822 |
|
823 // case kExitSpecial: // Apply special fixups [read: hacks] |
|
824 // In this routine, do either UTF8CharToLower() |
|
825 // fullwidth/halfwidth mapping or |
|
826 // voiced mapping or |
|
827 // semi-voiced mapping |
|
828 |
|
829 // First, do EXIT_REPLACE_OFFSET1 action (above) |
|
830 // Second: do additional code fixup |
|
831 { |
|
832 int srcdel = DoSpecialFixup(c, &src, srclimit, &dst, dstlimit); |
|
833 if (offsetmap != NULL) { |
|
834 if (srcdel != 0) { |
|
835 offsetmap->Copy(src - copystart - srcdel); |
|
836 offsetmap->Delete(srcdel); |
|
837 copystart = src; |
|
838 } |
|
839 } |
|
840 } |
|
841 goto Do_state_table; |
|
842 |
|
843 case kExitIllegalStructure: // structurally illegal byte; quit |
|
844 case kExitReject: // NUL or illegal code encountered; quit |
|
845 case kExitRejectAlt: // Apply replacement, then exit |
|
846 default: // and all other exits |
|
847 break; |
|
848 } // End switch (e) |
|
849 |
|
850 // Exit possibilities: |
|
851 // Some other exit code, state0, back up one byte exactly |
|
852 // Some other exit code, !state0, back up over last char |
|
853 |
|
854 // Back up over exactly one byte of rejected/illegal UTF-8 character |
|
855 src--; |
|
856 dst--; |
|
857 // Back up more if needed |
|
858 if (!InStateZero(st, Tbl)) { |
|
859 do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); |
|
860 } |
|
861 } else if (!InStateZero(st, Tbl)) { |
|
862 // src >= srclimit, !state0 |
|
863 // Back up over truncated UTF-8 character |
|
864 e = kExitIllegalStructure; |
|
865 do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); |
|
866 } else { |
|
867 // src >= srclimit, state0 |
|
868 // Normal termination, source fully consumed |
|
869 e = kExitOK; |
|
870 } |
|
871 |
|
872 if (offsetmap != NULL) { |
|
873 if (src > copystart) { |
|
874 offsetmap->Copy(src - copystart); |
|
875 copystart = src; |
|
876 } |
|
877 } |
|
878 |
|
879 // Possible return values here: |
|
880 // kExitDstSpaceFull caller may realloc and retry from middle |
|
881 // kExitIllegalStructure caller my overwrite/truncate |
|
882 // kExitOK all done and happy |
|
883 // kExitReject caller may overwrite/truncate |
|
884 // kExitDoAgain LOOP NOT DONE; caller must retry from middle |
|
885 // (may do fast ASCII loop first) |
|
886 // kExitPlaceholder -unused- |
|
887 // kExitNone -unused- |
|
888 *bytes_consumed = src - isrc; |
|
889 *bytes_filled = dst - odst; |
|
890 *chars_changed = total_changed; |
|
891 return e; |
|
892 } |
|
893 |
|
894 // TwoByte versions are needed for tables > 240 states, such |
|
895 // as the table for full Unicode 4.1 canonical + compatibility mapping |
|
896 |
|
897 // Scan a UTF-8 stringpiece based on state table with two-byte entries, |
|
898 // copying to output stringpiece |
|
899 // and doing text replacements. |
|
900 // DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below |
|
901 // Needs caller to loop on kExitDoAgain |
|
902 static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st, |
|
903 const StringPiece& istr, |
|
904 StringPiece& ostr, |
|
905 bool is_plain_text, |
|
906 int* bytes_consumed, |
|
907 int* bytes_filled, |
|
908 int* chars_changed, |
|
909 OffsetMap* offsetmap) { |
|
910 int eshift = st->entry_shift; |
|
911 int nEntries = (1 << eshift); // 64 or 256 entries per state |
|
912 const uint8* isrc = reinterpret_cast<const uint8*>(istr.data()); |
|
913 const int ilen = istr.length(); |
|
914 const uint8* copystart = isrc; |
|
915 const uint8* src = isrc; |
|
916 const uint8* srclimit = src + ilen; |
|
917 *bytes_consumed = 0; |
|
918 *bytes_filled = 0; |
|
919 *chars_changed = 0; |
|
920 |
|
921 const uint8* odst = reinterpret_cast<const uint8*>(ostr.data()); |
|
922 const int olen = ostr.length(); |
|
923 uint8* dst = const_cast<uint8*>(odst); |
|
924 uint8* dstlimit = dst + olen; |
|
925 |
|
926 *chars_changed = 0; |
|
927 |
|
928 int total_changed = 0; |
|
929 |
|
930 int src_lll = srclimit - src; |
|
931 int dst_lll = dstlimit - dst; |
|
932 |
|
933 |
|
934 // Invariant condition during replacements: |
|
935 // remaining dst size >= remaining src size |
|
936 if ((dstlimit - dst) < (srclimit - src)) { |
|
937 if (offsetmap != NULL) { |
|
938 offsetmap->Copy(src - copystart); |
|
939 copystart = src; |
|
940 } |
|
941 return kExitDstSpaceFull_2; |
|
942 } |
|
943 const unsigned short* Tbl_0 = &st->state_table[st->state0]; |
|
944 |
|
945 Do_state_table_2: |
|
946 // Do state-table scan, copying as we go |
|
947 const unsigned short* Tbl = Tbl_0; |
|
948 int e = 0; |
|
949 uint8 c = 0; |
|
950 |
|
951 Do_state_table_newe_2: |
|
952 |
|
953 //---------------------------- |
|
954 while (src < srclimit) { |
|
955 c = *src; |
|
956 e = Tbl[c]; |
|
957 *dst = c; |
|
958 src++; |
|
959 dst++; |
|
960 if (e >= kExitIllegalStructure_2) {break;} |
|
961 Tbl = &Tbl_0[e << eshift]; |
|
962 } |
|
963 //---------------------------- |
|
964 src_lll = src - isrc; |
|
965 dst_lll = dst - odst; |
|
966 |
|
967 // Exit possibilities: |
|
968 // Replacement code, do the replacement and loop |
|
969 // Some other exit code, state0, back up one byte exactly |
|
970 // Some other exit code, !state0, back up over last char |
|
971 // source consumed, state0, exit OK |
|
972 // source consumed, !state0, back up over partial char |
|
973 // For illegal byte in state0, avoid backup up over PREVIOUS char |
|
974 // For truncated last char, back up to beginning of it |
|
975 |
|
976 if (e >= kExitIllegalStructure_2) { |
|
977 // Switch on exit code; most loop back to top |
|
978 int offset = 0; |
|
979 switch (e) { |
|
980 // These all make the output string the same size or shorter |
|
981 // No checking needed |
|
982 case kExitReplace31_2: // del 2, add 1 bytes to change |
|
983 dst -= 2; |
|
984 if (offsetmap != NULL) { |
|
985 offsetmap->Copy(src - copystart - 2); |
|
986 offsetmap->Delete(2); |
|
987 copystart = src; |
|
988 } |
|
989 dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); |
|
990 total_changed++; |
|
991 goto Do_state_table_2; |
|
992 case kExitReplace32_2: // del 3, add 2 bytes to change |
|
993 dst--; |
|
994 if (offsetmap != NULL) { |
|
995 offsetmap->Copy(src - copystart - 1); |
|
996 offsetmap->Delete(1); |
|
997 copystart = src; |
|
998 } |
|
999 dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff); |
|
1000 dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); |
|
1001 total_changed++; |
|
1002 goto Do_state_table_2; |
|
1003 case kExitReplace21_2: // del 2, add 1 bytes to change |
|
1004 dst--; |
|
1005 if (offsetmap != NULL) { |
|
1006 offsetmap->Copy(src - copystart - 1); |
|
1007 offsetmap->Delete(1); |
|
1008 copystart = src; |
|
1009 } |
|
1010 dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); |
|
1011 total_changed++; |
|
1012 goto Do_state_table_2; |
|
1013 case kExitReplace3_2: // update 3 bytes to change |
|
1014 dst[-3] = (unsigned char)(Tbl[c + (nEntries * 2)] & 0xff); |
|
1015 // Fall into next case |
|
1016 case kExitReplace2_2: // update 2 bytes to change |
|
1017 dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff); |
|
1018 // Fall into next case |
|
1019 case kExitReplace1_2: // update 1 byte to change |
|
1020 dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); |
|
1021 total_changed++; |
|
1022 goto Do_state_table_2; |
|
1023 case kExitReplace1S0_2: // update 1 byte to change, 256-entry state |
|
1024 dst[-1] = (unsigned char)(Tbl[c + (256 * 1)] & 0xff); |
|
1025 total_changed++; |
|
1026 goto Do_state_table_2; |
|
1027 // These can make the output string longer than the input |
|
1028 case kExitReplaceOffset2_2: |
|
1029 if ((nEntries != 256) && InStateZero_2(st, Tbl)) { |
|
1030 // For space-optimized table, we need multiples of 256 bytes |
|
1031 // in state0 and multiples of nEntries in other states |
|
1032 offset += ((unsigned char)(Tbl[c + (256 * 1)] >> 8 & 0xff) << 8); |
|
1033 } else { |
|
1034 offset += ((unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff) << 8); |
|
1035 } |
|
1036 // Fall into next case |
|
1037 case kExitReplaceOffset1_2: |
|
1038 if ((nEntries != 256) && InStateZero_2(st, Tbl)) { |
|
1039 // For space-optimized table, we need multiples of 256 bytes |
|
1040 // in state0 and multiples of nEntries in other states |
|
1041 offset += (unsigned char)(Tbl[c + (256 * 1)] & 0xff); |
|
1042 } else { |
|
1043 offset += (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); |
|
1044 } |
|
1045 { |
|
1046 const RemapEntry* re = &st->remap_base[offset]; |
|
1047 int del_len = re->delete_bytes & ~kReplaceAndResumeFlag; |
|
1048 int add_len = re->add_bytes & ~kHtmlPlaintextFlag; |
|
1049 // Special-case non-HTML replacement of five sensitive entities |
|
1050 // " & ' < > |
|
1051 // 0022 0026 0027 003c 003e |
|
1052 // A replacement creating one of these is expressed as a pair of |
|
1053 // entries, one for HTML output and one for plaintext output. |
|
1054 // The first of the pair has the high bit of add_bytes set. |
|
1055 if (re->add_bytes & kHtmlPlaintextFlag) { |
|
1056 // Use this entry for plain text |
|
1057 if (!is_plain_text) { |
|
1058 // Use very next entry for HTML text (same back/delete length) |
|
1059 re = &st->remap_base[offset + 1]; |
|
1060 add_len = re->add_bytes & ~kHtmlPlaintextFlag; |
|
1061 } |
|
1062 } |
|
1063 |
|
1064 // After the replacement, need (dstlimit - dst) >= (srclimit - src) |
|
1065 int string_offset = re->bytes_offset; |
|
1066 // After the replacement, need (dstlimit - newdst) >= (srclimit - src) |
|
1067 uint8* newdst = dst - del_len + add_len; |
|
1068 if ((dstlimit - newdst) < (srclimit - src)) { |
|
1069 // Won't fit; don't do the replacement. Caller may realloc and retry |
|
1070 e = kExitDstSpaceFull_2; |
|
1071 break; // exit, backing up over this char for later retry |
|
1072 } |
|
1073 dst -= del_len; |
|
1074 memcpy(dst, &st->remap_string[string_offset], add_len); |
|
1075 dst += add_len; |
|
1076 if (offsetmap != NULL) { |
|
1077 if (add_len > del_len) { |
|
1078 offsetmap->Copy(src - copystart); |
|
1079 offsetmap->Insert(add_len - del_len); |
|
1080 copystart = src; |
|
1081 } else if (add_len < del_len) { |
|
1082 offsetmap->Copy(src - copystart + add_len - del_len); |
|
1083 offsetmap->Delete(del_len - add_len); |
|
1084 copystart = src; |
|
1085 } |
|
1086 } |
|
1087 if (re->delete_bytes & kReplaceAndResumeFlag) { |
|
1088 // There is a two-byte non-zero target state at the end of the |
|
1089 // replacement string |
|
1090 uint8 c1 = st->remap_string[string_offset + add_len]; |
|
1091 uint8 c2 = st->remap_string[string_offset + add_len + 1]; |
|
1092 e = (c1 << 8) | c2; |
|
1093 Tbl = &Tbl_0[e << eshift]; |
|
1094 total_changed++; |
|
1095 goto Do_state_table_newe_2; |
|
1096 } |
|
1097 } |
|
1098 total_changed++; |
|
1099 if (e == kExitRejectAlt_2) {break;} |
|
1100 goto Do_state_table_2; |
|
1101 |
|
1102 case kExitSpecial_2: // NO special fixups [read: hacks] |
|
1103 case kExitIllegalStructure_2: // structurally illegal byte; quit |
|
1104 case kExitReject_2: // NUL or illegal code encountered; quit |
|
1105 // and all other exits |
|
1106 default: |
|
1107 break; |
|
1108 } // End switch (e) |
|
1109 |
|
1110 // Exit possibilities: |
|
1111 // Some other exit code, state0, back up one byte exactly |
|
1112 // Some other exit code, !state0, back up over last char |
|
1113 |
|
1114 // Back up over exactly one byte of rejected/illegal UTF-8 character |
|
1115 src--; |
|
1116 dst--; |
|
1117 // Back up more if needed |
|
1118 if (!InStateZero_2(st, Tbl)) { |
|
1119 do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); |
|
1120 } |
|
1121 } else if (!InStateZero_2(st, Tbl)) { |
|
1122 // src >= srclimit, !state0 |
|
1123 // Back up over truncated UTF-8 character |
|
1124 e = kExitIllegalStructure_2; |
|
1125 |
|
1126 do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); |
|
1127 } else { |
|
1128 // src >= srclimit, state0 |
|
1129 // Normal termination, source fully consumed |
|
1130 e = kExitOK_2; |
|
1131 } |
|
1132 |
|
1133 if (offsetmap != NULL) { |
|
1134 if (src > copystart) { |
|
1135 offsetmap->Copy(src - copystart); |
|
1136 copystart = src; |
|
1137 } |
|
1138 } |
|
1139 |
|
1140 |
|
1141 // Possible return values here: |
|
1142 // kExitDstSpaceFull_2 caller may realloc and retry from middle |
|
1143 // kExitIllegalStructure_2 caller my overwrite/truncate |
|
1144 // kExitOK_2 all done and happy |
|
1145 // kExitReject_2 caller may overwrite/truncate |
|
1146 // kExitDoAgain_2 LOOP NOT DONE; caller must retry from middle |
|
1147 // (may do fast ASCII loop first) |
|
1148 // kExitPlaceholder_2 -unused- |
|
1149 // kExitNone_2 -unused- |
|
1150 *bytes_consumed = src - isrc; |
|
1151 *bytes_filled = dst - odst; |
|
1152 *chars_changed = total_changed; |
|
1153 return e; |
|
1154 } |
|
1155 |
|
1156 |
|
1157 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece |
|
1158 // and doing text replacements. |
|
1159 // Also writes an optional OffsetMap. Pass NULL to skip writing one. |
|
1160 // Always scan complete UTF-8 characters |
|
1161 // Set number of bytes consumed from input, number filled to output. |
|
1162 // Return reason for exiting |
|
1163 int UTF8GenericReplace(const UTF8ReplaceObj* st, |
|
1164 const StringPiece& istr, |
|
1165 StringPiece& ostr, |
|
1166 bool is_plain_text, |
|
1167 int* bytes_consumed, |
|
1168 int* bytes_filled, |
|
1169 int* chars_changed, |
|
1170 OffsetMap* offsetmap) { |
|
1171 StringPiece local_istr(istr.data(), istr.length()); |
|
1172 StringPiece local_ostr(ostr.data(), ostr.length()); |
|
1173 int total_consumed = 0; |
|
1174 int total_filled = 0; |
|
1175 int total_changed = 0; |
|
1176 int local_bytes_consumed, local_bytes_filled, local_chars_changed; |
|
1177 int e; |
|
1178 do { |
|
1179 e = UTF8GenericReplaceInternal(st, |
|
1180 local_istr, local_ostr, is_plain_text, |
|
1181 &local_bytes_consumed, &local_bytes_filled, |
|
1182 &local_chars_changed, |
|
1183 offsetmap); |
|
1184 local_istr.remove_prefix(local_bytes_consumed); |
|
1185 local_ostr.remove_prefix(local_bytes_filled); |
|
1186 total_consumed += local_bytes_consumed; |
|
1187 total_filled += local_bytes_filled; |
|
1188 total_changed += local_chars_changed; |
|
1189 } while ( e == kExitDoAgain ); |
|
1190 *bytes_consumed = total_consumed; |
|
1191 *bytes_filled = total_filled; |
|
1192 *chars_changed = total_changed; |
|
1193 return e; |
|
1194 } |
|
1195 |
|
1196 // Older version without offsetmap |
|
1197 int UTF8GenericReplace(const UTF8ReplaceObj* st, |
|
1198 const StringPiece& istr, |
|
1199 StringPiece& ostr, |
|
1200 bool is_plain_text, |
|
1201 int* bytes_consumed, |
|
1202 int* bytes_filled, |
|
1203 int* chars_changed) { |
|
1204 return UTF8GenericReplace(st, |
|
1205 istr, |
|
1206 ostr, |
|
1207 is_plain_text, |
|
1208 bytes_consumed, |
|
1209 bytes_filled, |
|
1210 chars_changed, |
|
1211 NULL); |
|
1212 } |
|
1213 |
|
1214 // Older version without is_plain_text or offsetmap |
|
1215 int UTF8GenericReplace(const UTF8ReplaceObj* st, |
|
1216 const StringPiece& istr, |
|
1217 StringPiece& ostr, |
|
1218 int* bytes_consumed, |
|
1219 int* bytes_filled, |
|
1220 int* chars_changed) { |
|
1221 bool is_plain_text = false; |
|
1222 return UTF8GenericReplace(st, |
|
1223 istr, |
|
1224 ostr, |
|
1225 is_plain_text, |
|
1226 bytes_consumed, |
|
1227 bytes_filled, |
|
1228 chars_changed, |
|
1229 NULL); |
|
1230 } |
|
1231 |
|
1232 // Scan a UTF-8 stringpiece based on state table with two-byte entries, |
|
1233 // copying to output stringpiece |
|
1234 // and doing text replacements. |
|
1235 // Also writes an optional OffsetMap. Pass NULL to skip writing one. |
|
1236 // Always scan complete UTF-8 characters |
|
1237 // Set number of bytes consumed from input, number filled to output. |
|
1238 // Return reason for exiting |
|
1239 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, |
|
1240 const StringPiece& istr, |
|
1241 StringPiece& ostr, |
|
1242 bool is_plain_text, |
|
1243 int* bytes_consumed, |
|
1244 int* bytes_filled, |
|
1245 int* chars_changed, |
|
1246 OffsetMap* offsetmap) { |
|
1247 StringPiece local_istr(istr.data(), istr.length()); |
|
1248 StringPiece local_ostr(ostr.data(), ostr.length()); |
|
1249 int total_consumed = 0; |
|
1250 int total_filled = 0; |
|
1251 int total_changed = 0; |
|
1252 int local_bytes_consumed, local_bytes_filled, local_chars_changed; |
|
1253 int e; |
|
1254 do { |
|
1255 e = UTF8GenericReplaceInternalTwoByte(st, |
|
1256 local_istr, local_ostr, is_plain_text, |
|
1257 &local_bytes_consumed, |
|
1258 &local_bytes_filled, |
|
1259 &local_chars_changed, |
|
1260 offsetmap); |
|
1261 local_istr.remove_prefix(local_bytes_consumed); |
|
1262 local_ostr.remove_prefix(local_bytes_filled); |
|
1263 total_consumed += local_bytes_consumed; |
|
1264 total_filled += local_bytes_filled; |
|
1265 total_changed += local_chars_changed; |
|
1266 } while ( e == kExitDoAgain_2 ); |
|
1267 *bytes_consumed = total_consumed; |
|
1268 *bytes_filled = total_filled; |
|
1269 *chars_changed = total_changed; |
|
1270 |
|
1271 return e - kExitOK_2 + kExitOK; |
|
1272 } |
|
1273 |
|
1274 // Older version without offsetmap |
|
1275 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, |
|
1276 const StringPiece& istr, |
|
1277 StringPiece& ostr, |
|
1278 bool is_plain_text, |
|
1279 int* bytes_consumed, |
|
1280 int* bytes_filled, |
|
1281 int* chars_changed) { |
|
1282 return UTF8GenericReplaceTwoByte(st, |
|
1283 istr, |
|
1284 ostr, |
|
1285 is_plain_text, |
|
1286 bytes_consumed, |
|
1287 bytes_filled, |
|
1288 chars_changed, |
|
1289 NULL); |
|
1290 } |
|
1291 |
|
1292 // Older version without is_plain_text or offsetmap |
|
1293 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, |
|
1294 const StringPiece& istr, |
|
1295 StringPiece& ostr, |
|
1296 int* bytes_consumed, |
|
1297 int* bytes_filled, |
|
1298 int* chars_changed) { |
|
1299 bool is_plain_text = false; |
|
1300 return UTF8GenericReplaceTwoByte(st, |
|
1301 istr, |
|
1302 ostr, |
|
1303 is_plain_text, |
|
1304 bytes_consumed, |
|
1305 bytes_filled, |
|
1306 chars_changed, |
|
1307 NULL); |
|
1308 } |
|
1309 |
|
1310 |
|
1311 |
|
1312 // Adjust a stringpiece to encompass complete UTF-8 characters. |
|
1313 // The data pointer will be increased by 0..3 bytes to get to a character |
|
1314 // boundary, and the length will then be decreased by 0..3 bytes |
|
1315 // to encompass the last complete character. |
|
1316 void UTF8TrimToChars(StringPiece* istr) { |
|
1317 const char* src = istr->data(); |
|
1318 int len = istr->length(); |
|
1319 // Exit if empty string |
|
1320 if (len == 0) { |
|
1321 return; |
|
1322 } |
|
1323 |
|
1324 // Exit on simple, common case |
|
1325 if ( ((src[0] & 0xc0) != 0x80) && |
|
1326 (static_cast<signed char>(src[len - 1]) >= 0) ) { |
|
1327 // First byte is not a continuation and last byte is 7-bit ASCII -- done |
|
1328 return; |
|
1329 } |
|
1330 |
|
1331 // Adjust the back end, len > 0 |
|
1332 const char* srclimit = src + len; |
|
1333 // Backscan over any ending continuation bytes to find last char start |
|
1334 const char* s = srclimit - 1; // Last byte of the string |
|
1335 while ((src <= s) && ((*s & 0xc0) == 0x80)) { |
|
1336 s--; |
|
1337 } |
|
1338 // Include entire last char if it fits |
|
1339 if (src <= s) { |
|
1340 int last_char_len = UTF8OneCharLen(s); |
|
1341 if (s + last_char_len <= srclimit) { |
|
1342 // Last char fits, so include it, else exclude it |
|
1343 s += last_char_len; |
|
1344 } |
|
1345 } |
|
1346 if (s != srclimit) { |
|
1347 // s is one byte beyond the last full character, if any |
|
1348 istr->remove_suffix(srclimit - s); |
|
1349 // Exit if now empty string |
|
1350 if (istr->length() == 0) { |
|
1351 return; |
|
1352 } |
|
1353 } |
|
1354 |
|
1355 // Adjust the front end, len > 0 |
|
1356 len = istr->length(); |
|
1357 srclimit = src + len; |
|
1358 s = src; // First byte of the string |
|
1359 // Scan over any beginning continuation bytes to find first char start |
|
1360 while ((s < srclimit) && ((*s & 0xc0) == 0x80)) { |
|
1361 s++; |
|
1362 } |
|
1363 if (s != src) { |
|
1364 // s is at the first full character, if any |
|
1365 istr->remove_prefix(s - src); |
|
1366 } |
|
1367 } |
|
1368 |
|
1369 } // End namespace CLD2 |