browser/components/translation/cld2/internal/utf8statetable.cc

changeset 0
6474c204b198
equal deleted inserted replaced
-1:000000000000 0:6c4c695433c4
1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 //
16 // State Table follower for scanning UTF-8 strings without converting to
17 // 32- or 16-bit Unicode values.
18 //
19
20 #ifdef COMPILER_MSVC
21 // MSVC warns: warning C4309: 'initializing' : truncation of constant value
22 // But the value is in fact not truncated. 0xFF still comes out 0xFF at
23 // runtime.
24 #pragma warning ( disable : 4309 )
25 #endif
26
27 #include "utf8statetable.h"
28
29 #include <stdint.h> // for uintptr_t
30 #include <string.h> // for NULL, memcpy, memmove
31
32 #include "integral_types.h" // for uint8, uint32, int8
33 #include "stringpiece.h"
34 #include "offsetmap.h"
35
36
37 namespace CLD2 {
38
// Flag bit masked out of a remap entry's delete_bytes field. When it is set,
// an optional next-state value follows the replacement text.
static const int kReplaceAndResumeFlag = 0x80;

// Flag bit masked out of a remap entry's add_bytes field. It distinguishes an
// HTML replacement from a plaintext replacement.
static const int kHtmlPlaintextFlag = 0x80;
44
45
46 /**
47 * This code implements a little interpreter for UTF8 state
48 * tables. There are three kinds of quite-similar state tables,
49 * property, scanning, and replacement. Each state in one of
50 * these tables consists of an array of 256 or 64 one-byte
51 * entries. The state is subscripted by an incoming source byte,
52 * and the entry either specifies the next state or specifies an
53 * action. Space-optimized tables have full 256-entry states for
54 * the first byte of a UTF-8 character, but only 64-entry states
55 * for continuation bytes. Space-optimized tables may only be
56 * used with source input that has been checked to be
57 * structurally- (or stronger interchange-) valid.
58 *
59 * A property state table has an unsigned one-byte property for
60 * each possible UTF-8 character. One-byte character properties
61 * are in the state[0] array, while for other lengths the
62 * state[0] array gives the next state, which contains the
63 * property value for two-byte characters or yet another state
64 * for longer ones. The code simply loads the right number of
65 * next-state values, then returns the final byte as property
66 * value. There are no actions specified in property tables.
67 * States are typically shared for multi-byte UTF-8 characters
68 * that all have the same property value.
69 *
70 * A scanning state table has entries that are either a
71 * next-state specifier for bytes that are accepted by the
72 * scanner, or an exit action for the last byte of each
73 * character that is rejected by the scanner.
74 *
75 * Scanning long strings involves a tight loop that picks up one
76 * byte at a time and follows next-state value back to state[0]
77 * for each accepted UTF-8 character. Scanning stops at the end
78 * of the string or at the first character encountered that has
79 * an exit action such as "reject". Timing information is given
80 * below.
81 *
82 * Since so much of Google's text is 7-bit-ASCII values
83 * (approximately 94% of the bytes of web documents), the
84 * scanning interpreter has two speed optimizations. One checks
85 * 8 bytes at a time to see if they are all in the range lo..hi,
86 * as specified in constants in the overall statetable object.
87 * The check involves ORing together four 4-byte values that
88 * overflow into the high bit of some byte when a byte is out of
89 * range. For seven-bit-ASCII, lo is 0x20 and hi is 0x7E. This
90 * loop is about 8x faster than the one-byte-at-a-time loop.
91 *
92 * If checking for exit bytes in the 0x00-0x1F and 7F range is
93 * unneeded, an even faster loop just looks at the high bits of
94 * 8 bytes at once, and is about 1.33x faster than the lo..hi
95 * loop.
96 *
97 * Exit from the scanning routines backs up to the first byte of
98 * the rejected character, so the text spanned is always a
99 * complete number of UTF-8 characters. The normal scanning exit
100 * is at the first rejected character, or at the end of the
101 * input text. Scanning also exits on any detected ill-formed
102 * character or at a special do-again action built into some
103 * exit-optimized tables. The do-again action gets back to the
104 * top of the scanning loop to retry eight-byte ASCII scans. It
105 * is typically put into state tables after four seven-bit-ASCII
106 * characters in a row are seen, to allow restarting the fast
107 * scan after some slower processing of multi-byte characters.
108 *
109 * A replacement state table is similar to a scanning state
110 * table but has more extensive actions. The default
111 * byte-at-a-time loop copies one byte from source to
112 * destination and goes to the next state. The replacement
113 * actions overwrite 1-3 bytes of the destination with different
114 * bytes, possibly shortening the output by 1 or 2 bytes. The
115 * replacement bytes come from within the state table, from
116 * dummy states inserted just after any state that contains a
117 * replacement action. This gives a quick address calculation for
118 * the replacement byte(s) and gives some cache locality.
119 *
120 * Additional replacement actions use one or two bytes from
121 * within dummy states to index a side table of more-extensive
122 * replacements. The side table specifies a length of 0..15
123 * destination bytes to overwrite and a length of 0..127 bytes
124 * to overwrite them with, plus the actual replacement bytes.
125 *
126 * This side table uses one extra bit to specify a pair of
127 * replacements, the first to be used in an HTML context and the
128 * second to be used in a plaintext context. This allows
129 * replacements that are spelled with "&lt;" in the former
130 * context and "<" in the latter.
131 *
132 * The side table also uses an extra bit to specify a non-zero
133 * next state after a replacement. This allows a combination
134 * replacement and state change, used to implement a limited
135 * version of the Boyer-Moore algorithm for multi-character
136 * replacement without backtracking. This is useful when there
137 * are overlapping replacements, such as ch => x and also c =>
138 * y, the latter to be used only if the character after c is not
139 * h. In this case, the state[0] table's entry for c would
140 * change c to y and also have a next-state of say n, and the
141 * state[n] entry for h would specify a replacement of the two
142 * bytes yh by x. No backtracking is needed.
143 *
144 * A replacement table may also include the exit actions of a
145 * scanning state table, so some character sequences can
146 * terminate early.
147 *
148 * During replacement, an optional data structure called an
149 * offset map can be updated to reflect each change in length
150 * between source and destination. This offset map can later be
151 * used to map destination-string offsets to corresponding
152 * source-string offsets or vice versa.
153 *
154 * The routines below also have variants in which state-table
155 * entries are all two bytes instead of one byte. This allows
156 * tables with more than 240 total states, but takes up twice as
157 * much space per state.
158 *
159 **/
160
161 // Return true if current Tbl pointer is within state0 range
162 // Note that unsigned compare checks both ends of range simultaneously
163 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
164 const uint8* Tbl0 = &st->state_table[st->state0];
165 return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
166 }
167
168 static inline bool InStateZero_2(const UTF8ReplaceObj_2* st,
169 const unsigned short int* Tbl) {
170 const unsigned short int* Tbl0 = &st->state_table[st->state0];
171 // Word difference, not byte difference
172 return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
173 }
174
175 // UTF8PropObj, UTF8ScanObj, UTF8ReplaceObj are all typedefs of
176 // UTF8StateMachineObj.
177
178 static bool IsPropObj(const UTF8StateMachineObj& obj) {
179 return obj.fast_state == NULL
180 && obj.max_expand == 0;
181 }
182
183 static bool IsPropObj_2(const UTF8StateMachineObj_2& obj) {
184 return obj.fast_state == NULL
185 && obj.max_expand == 0;
186 }
187
188 static bool IsScanObj(const UTF8StateMachineObj& obj) {
189 return obj.fast_state != NULL
190 && obj.max_expand == 0;
191 }
192
193 static bool IsReplaceObj(const UTF8StateMachineObj& obj) {
194 // Normally, obj.fast_state != NULL, but the handwritten tables
195 // in utf8statetable_unittest don't handle fast_states.
196 return obj.max_expand > 0;
197 }
198
199 static bool IsReplaceObj_2(const UTF8StateMachineObj_2& obj) {
200 return obj.max_expand > 0;
201 }
202
203 // Look up property of one UTF-8 character and advance over it
204 // Return 0 if input length is zero
205 // Return 0 and advance one byte if input is ill-formed
206 uint8 UTF8GenericProperty(const UTF8PropObj* st,
207 const uint8** src,
208 int* srclen) {
209 if (*srclen <= 0) {
210 return 0;
211 }
212
213 const uint8* lsrc = *src;
214 const uint8* Tbl_0 = &st->state_table[st->state0];
215 const uint8* Tbl = Tbl_0;
216 int e;
217 int eshift = st->entry_shift;
218
219 // Short series of tests faster than switch, optimizes 7-bit ASCII
220 unsigned char c = lsrc[0];
221 if (static_cast<signed char>(c) >= 0) { // one byte
222 e = Tbl[c];
223 *src += 1;
224 *srclen -= 1;
225 } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
226 e = Tbl[c];
227 Tbl = &Tbl_0[e << eshift];
228 e = Tbl[lsrc[1]];
229 *src += 2;
230 *srclen -= 2;
231 } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
232 e = Tbl[c];
233 Tbl = &Tbl_0[e << eshift];
234 e = Tbl[lsrc[1]];
235 Tbl = &Tbl_0[e << eshift];
236 e = Tbl[lsrc[2]];
237 *src += 3;
238 *srclen -= 3;
239 }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
240 e = Tbl[c];
241 Tbl = &Tbl_0[e << eshift];
242 e = Tbl[lsrc[1]];
243 Tbl = &Tbl_0[e << eshift];
244 e = Tbl[lsrc[2]];
245 Tbl = &Tbl_0[e << eshift];
246 e = Tbl[lsrc[3]];
247 *src += 4;
248 *srclen -= 4;
249 } else { // Ill-formed
250 e = 0;
251 *src += 1;
252 *srclen -= 1;
253 }
254 return e;
255 }
256
257 bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src) {
258 const uint8* lsrc = reinterpret_cast<const uint8*>(src);
259 const uint8* Tbl_0 = &st.state_table[st.state0];
260 const uint8* Tbl = Tbl_0;
261 int e;
262 int eshift = st.entry_shift;
263
264 // Short series of tests faster than switch, optimizes 7-bit ASCII
265 unsigned char c = lsrc[0];
266 if (static_cast<signed char>(c) >= 0) { // one byte
267 e = Tbl[c];
268 } else if ((c & 0xe0) == 0xc0) { // two bytes
269 e = Tbl[c];
270 Tbl = &Tbl_0[e << eshift];
271 e = Tbl[lsrc[1]];
272 } else if ((c & 0xf0) == 0xe0) { // three bytes
273 e = Tbl[c];
274 Tbl = &Tbl_0[e << eshift];
275 e = Tbl[lsrc[1]];
276 Tbl = &Tbl_0[e << eshift];
277 e = Tbl[lsrc[2]];
278 } else { // four bytes
279 e = Tbl[c];
280 Tbl = &Tbl_0[e << eshift];
281 e = Tbl[lsrc[1]];
282 Tbl = &Tbl_0[e << eshift];
283 e = Tbl[lsrc[2]];
284 Tbl = &Tbl_0[e << eshift];
285 e = Tbl[lsrc[3]];
286 }
287 return e;
288 }
289
290
291 // BigOneByte versions are needed for tables > 240 states, but most
292 // won't need the TwoByte versions.
293 // Internally, to next-to-last offset is multiplied by 16 and the last
294 // offset is relative instead of absolute.
295 // Look up property of one UTF-8 character and advance over it
296 // Return 0 if input length is zero
297 // Return 0 and advance one byte if input is ill-formed
298 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
299 const uint8** src,
300 int* srclen) {
301 if (*srclen <= 0) {
302 return 0;
303 }
304
305 const uint8* lsrc = *src;
306 const uint8* Tbl_0 = &st->state_table[st->state0];
307 const uint8* Tbl = Tbl_0;
308 int e;
309 int eshift = st->entry_shift;
310
311 // Short series of tests faster than switch, optimizes 7-bit ASCII
312 unsigned char c = lsrc[0];
313 if (static_cast<signed char>(c) >= 0) { // one byte
314 e = Tbl[c];
315 *src += 1;
316 *srclen -= 1;
317 } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
318 e = Tbl[c];
319 Tbl = &Tbl_0[e << eshift];
320 e = Tbl[lsrc[1]];
321 *src += 2;
322 *srclen -= 2;
323 } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
324 e = Tbl[c];
325 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
326 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
327 Tbl = &Tbl[e << eshift]; // Relative +/-
328 e = Tbl[lsrc[2]];
329 *src += 3;
330 *srclen -= 3;
331 }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
332 e = Tbl[c];
333 Tbl = &Tbl_0[e << eshift];
334 e = Tbl[lsrc[1]];
335 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
336 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
337 Tbl = &Tbl[e << eshift]; // Relative +/-
338 e = Tbl[lsrc[3]];
339 *src += 4;
340 *srclen -= 4;
341 } else { // Ill-formed
342 e = 0;
343 *src += 1;
344 *srclen -= 1;
345 }
346 return e;
347 }
348
349 // BigOneByte versions are needed for tables > 240 states, but most
350 // won't need the TwoByte versions.
351 bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src) {
352 const uint8* lsrc = reinterpret_cast<const uint8*>(src);
353 const uint8* Tbl_0 = &st.state_table[st.state0];
354 const uint8* Tbl = Tbl_0;
355 int e;
356 int eshift = st.entry_shift;
357
358 // Short series of tests faster than switch, optimizes 7-bit ASCII
359 unsigned char c = lsrc[0];
360 if (static_cast<signed char>(c) >= 0) { // one byte
361 e = Tbl[c];
362 } else if ((c & 0xe0) == 0xc0) { // two bytes
363 e = Tbl[c];
364 Tbl = &Tbl_0[e << eshift];
365 e = Tbl[lsrc[1]];
366 } else if ((c & 0xf0) == 0xe0) { // three bytes
367 e = Tbl[c];
368 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
369 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
370 Tbl = &Tbl[e << eshift]; // Relative +/-
371 e = Tbl[lsrc[2]];
372 } else { // four bytes
373 e = Tbl[c];
374 Tbl = &Tbl_0[e << eshift];
375 e = Tbl[lsrc[1]];
376 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
377 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
378 Tbl = &Tbl[e << eshift]; // Relative +/-
379 e = Tbl[lsrc[3]];
380 }
381 return e;
382 }
383
384
385 // TwoByte versions are needed for tables > 240 states
386 // Look up property of one UTF-8 character and advance over it
387 // Return 0 if input length is zero
388 // Return 0 and advance one byte if input is ill-formed
389 uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
390 const uint8** src,
391 int* srclen) {
392 if (*srclen <= 0) {
393 return 0;
394 }
395
396 const uint8* lsrc = *src;
397 const unsigned short* Tbl_0 = &st->state_table[st->state0];
398 const unsigned short* Tbl = Tbl_0;
399 int e;
400 int eshift = st->entry_shift;
401
402 // Short series of tests faster than switch, optimizes 7-bit ASCII
403 unsigned char c = lsrc[0];
404 if (static_cast<signed char>(c) >= 0) { // one byte
405 e = Tbl[c];
406 *src += 1;
407 *srclen -= 1;
408 } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
409 e = Tbl[c];
410 Tbl = &Tbl_0[e << eshift];
411 e = Tbl[lsrc[1]];
412 *src += 2;
413 *srclen -= 2;
414 } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
415 e = Tbl[c];
416 Tbl = &Tbl_0[e << eshift];
417 e = Tbl[lsrc[1]];
418 Tbl = &Tbl_0[e << eshift];
419 e = Tbl[lsrc[2]];
420 *src += 3;
421 *srclen -= 3;
422 }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
423 e = Tbl[c];
424 Tbl = &Tbl_0[e << eshift];
425 e = Tbl[lsrc[1]];
426 Tbl = &Tbl_0[e << eshift];
427 e = Tbl[lsrc[2]];
428 Tbl = &Tbl_0[e << eshift];
429 e = Tbl[lsrc[3]];
430 *src += 4;
431 *srclen -= 4;
432 } else { // Ill-formed
433 e = 0;
434 *src += 1;
435 *srclen -= 1;
436 }
437 return e;
438 }
439
440 // TwoByte versions are needed for tables > 240 states
441 bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src) {
442 const uint8* lsrc = reinterpret_cast<const uint8*>(src);
443 const unsigned short* Tbl_0 = &st.state_table[st.state0];
444 const unsigned short* Tbl = Tbl_0;
445 int e;
446 int eshift = st.entry_shift;
447
448 // Short series of tests faster than switch, optimizes 7-bit ASCII
449 unsigned char c = lsrc[0];
450 if (static_cast<signed char>(c) >= 0) { // one byte
451 e = Tbl[c];
452 } else if ((c & 0xe0) == 0xc0) { // two bytes
453 e = Tbl[c];
454 Tbl = &Tbl_0[e << eshift];
455 e = Tbl[lsrc[1]];
456 } else if ((c & 0xf0) == 0xe0) { // three bytes
457 e = Tbl[c];
458 Tbl = &Tbl_0[e << eshift];
459 e = Tbl[lsrc[1]];
460 Tbl = &Tbl_0[e << eshift];
461 e = Tbl[lsrc[2]];
462 } else { // four bytes
463 e = Tbl[c];
464 Tbl = &Tbl_0[e << eshift];
465 e = Tbl[lsrc[1]];
466 Tbl = &Tbl_0[e << eshift];
467 e = Tbl[lsrc[2]];
468 Tbl = &Tbl_0[e << eshift];
469 e = Tbl[lsrc[3]];
470 }
471 return e;
472 }
473
474
475 // Approximate speeds on 2.8 GHz Pentium 4:
476 // GenericScan 1-byte loop 300 MB/sec *
477 // GenericScan 4-byte loop 1200 MB/sec
478 // GenericScan 8-byte loop 2400 MB/sec *
479 // GenericScanFastAscii 4-byte loop 3000 MB/sec
480 // GenericScanFastAscii 8-byte loop 3200 MB/sec *
481 //
482 // * Implemented below. FastAscii loop is memory-bandwidth constrained.
483
484 // Scan a UTF-8 stringpiece based on state table.
485 // Always scan complete UTF-8 characters
486 // Set number of bytes scanned. Return reason for exiting
487 int UTF8GenericScan(const UTF8ScanObj* st,
488 const StringPiece& str,
489 int* bytes_consumed) {
490 int eshift = st->entry_shift; // 6 (space optimized) or 8
491 // int nEntries = (1 << eshift); // 64 or 256 entries per state
492
493 const uint8* isrc =
494 reinterpret_cast<const uint8*>(str.data());
495 const uint8* src = isrc;
496 const int len = str.length();
497 const uint8* srclimit = isrc + len;
498 const uint8* srclimit8 = srclimit - 7;
499 *bytes_consumed = 0;
500 if (len == 0) return kExitOK;
501
502 const uint8* Tbl_0 = &st->state_table[st->state0];
503
504 DoAgain:
505 // Do state-table scan
506 int e = 0;
507 uint8 c;
508
509 // Do fast for groups of 8 identity bytes.
510 // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
511 // including slowing slightly on cr/lf/ht
512 //----------------------------
513 const uint8* Tbl2 = &st->fast_state[0];
514 uint32 losub = st->losub;
515 uint32 hiadd = st->hiadd;
516 while (src < srclimit8) {
517 uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
518 uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
519 src += 8;
520 // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
521 uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
522 (s4567 - losub) | (s4567 + hiadd);
523 if ((temp & 0x80808080) != 0) {
524 // We typically end up here on cr/lf/ht; src was incremented
525 int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
526 (Tbl2[src[-6]] | Tbl2[src[-5]]);
527 if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange
528 e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
529 (Tbl2[src[-2]] | Tbl2[src[-1]]);
530 if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange
531 // Else OK, go around again
532 }
533 }
534 //----------------------------
535
536 // Byte-at-a-time scan
537 //----------------------------
538 const uint8* Tbl = Tbl_0;
539 while (src < srclimit) {
540 c = *src;
541 e = Tbl[c];
542 src++;
543 if (e >= kExitIllegalStructure) {break;}
544 Tbl = &Tbl_0[e << eshift];
545 }
546 //----------------------------
547
548
549 // Exit possibilities:
550 // Some exit code, !state0, back up over last char
551 // Some exit code, state0, back up one byte exactly
552 // source consumed, !state0, back up over partial char
553 // source consumed, state0, exit OK
554 // For illegal byte in state0, avoid backup up over PREVIOUS char
555 // For truncated last char, back up to beginning of it
556
557 if (e >= kExitIllegalStructure) {
558 // Back up over exactly one byte of rejected/illegal UTF-8 character
559 src--;
560 // Back up more if needed
561 if (!InStateZero(st, Tbl)) {
562 do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
563 }
564 } else if (!InStateZero(st, Tbl)) {
565 // Back up over truncated UTF-8 character
566 e = kExitIllegalStructure;
567 do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
568 } else {
569 // Normal termination, source fully consumed
570 e = kExitOK;
571 }
572
573 if (e == kExitDoAgain) {
574 // Loop back up to the fast scan
575 goto DoAgain;
576 }
577
578 *bytes_consumed = src - isrc;
579 return e;
580 }
581
582 // Scan a UTF-8 stringpiece based on state table.
583 // Always scan complete UTF-8 characters
584 // Set number of bytes scanned. Return reason for exiting
585 // OPTIMIZED for case of 7-bit ASCII 0000..007f all valid
586 int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
587 const StringPiece& str,
588 int* bytes_consumed) {
589 const uint8* isrc =
590 reinterpret_cast<const uint8*>(str.data());
591 const uint8* src = isrc;
592 const int len = str.length();
593 const uint8* srclimit = isrc + len;
594 const uint8* srclimit8 = srclimit - 7;
595 *bytes_consumed = 0;
596 if (len == 0) return kExitOK;
597
598 int n;
599 int rest_consumed;
600 int exit_reason;
601 do {
602 // Skip 8 bytes of ASCII at a whack; no endianness issue
603 while ((src < srclimit8) &&
604 (((reinterpret_cast<const uint32*>(src)[0] |
605 reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
606 src += 8;
607 }
608 // Run state table on the rest
609 n = src - isrc;
610 StringPiece str2(str.data() + n, str.length() - n);
611 exit_reason = UTF8GenericScan(st, str2, &rest_consumed);
612 src += rest_consumed;
613 } while ( exit_reason == kExitDoAgain );
614
615 *bytes_consumed = src - isrc;
616 return exit_reason;
617 }
618
// Hook for the halfwidth-katakana hack that matched an old UTF8CharToLower().
// This version performs no fixup: it reads and writes nothing through its
// arguments and always returns 0 as the number of src bytes skipped.
static int DoSpecialFixup(const unsigned char c,
                          const unsigned char** srcp,
                          const unsigned char* srclimit,
                          unsigned char** dstp,
                          unsigned char* dstlimit) {
  (void)c;
  (void)srcp;
  (void)srclimit;
  (void)dstp;
  (void)dstlimit;
  const int src_bytes_skipped = 0;
  return src_bytes_skipped;
}
627
628
629 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
630 // and doing text replacements.
631 // DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below
632 // Needs caller to loop on kExitDoAgain
633 static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st,
634 const StringPiece& istr,
635 StringPiece& ostr,
636 bool is_plain_text,
637 int* bytes_consumed,
638 int* bytes_filled,
639 int* chars_changed,
640 OffsetMap* offsetmap) {
641 int eshift = st->entry_shift;
642 int nEntries = (1 << eshift); // 64 or 256 entries per state
643 const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());
644 const int ilen = istr.length();
645 const uint8* copystart = isrc;
646 const uint8* src = isrc;
647 const uint8* srclimit = src + ilen;
648 *bytes_consumed = 0;
649 *bytes_filled = 0;
650 *chars_changed = 0;
651
652 const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());
653 const int olen = ostr.length();
654 uint8* dst = const_cast<uint8*>(odst);
655 uint8* dstlimit = dst + olen;
656
657 int total_changed = 0;
658
659 // Invariant condition during replacements:
660 // remaining dst size >= remaining src size
661 if ((dstlimit - dst) < (srclimit - src)) {
662 if (offsetmap != NULL) {
663 offsetmap->Copy(src - copystart);
664 copystart = src;
665 }
666 return kExitDstSpaceFull;
667 }
668 const uint8* Tbl_0 = &st->state_table[st->state0];
669
670 Do_state_table:
671 // Do state-table scan, copying as we go
672 const uint8* Tbl = Tbl_0;
673 int e = 0;
674 uint8 c = 0;
675
676 Do_state_table_newe:
677
678 //----------------------------
679 while (src < srclimit) {
680 c = *src;
681 e = Tbl[c];
682 *dst = c;
683 src++;
684 dst++;
685 if (e >= kExitIllegalStructure) {break;}
686 Tbl = &Tbl_0[e << eshift];
687 }
688 //----------------------------
689
690 // Exit possibilities:
691 // Replacement code, do the replacement and loop
692 // Some other exit code, state0, back up one byte exactly
693 // Some other exit code, !state0, back up over last char
694 // source consumed, state0, exit OK
695 // source consumed, !state0, back up over partial char
696 // For illegal byte in state0, avoid backup up over PREVIOUS char
697 // For truncated last char, back up to beginning of it
698
699 if (e >= kExitIllegalStructure) {
700 // Switch on exit code; most loop back to top
701 int offset = 0;
702 switch (e) {
703 // These all make the output string the same size or shorter
704 // No checking needed
705 case kExitReplace31: // del 2, add 1 bytes to change
706 dst -= 2;
707 if (offsetmap != NULL) {
708 offsetmap->Copy(src - copystart - 2);
709 offsetmap->Delete(2);
710 copystart = src;
711 }
712 dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
713 total_changed++;
714 goto Do_state_table;
715 case kExitReplace32: // del 3, add 2 bytes to change
716 dst--;
717 if (offsetmap != NULL) {
718 offsetmap->Copy(src - copystart - 1);
719 offsetmap->Delete(1);
720 copystart = src;
721 }
722 dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)];
723 dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
724 total_changed++;
725 goto Do_state_table;
726 case kExitReplace21: // del 2, add 1 bytes to change
727 dst--;
728 if (offsetmap != NULL) {
729 offsetmap->Copy(src - copystart - 1);
730 offsetmap->Delete(1);
731 copystart = src;
732 }
733 dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
734 total_changed++;
735 goto Do_state_table;
736 case kExitReplace3: // update 3 bytes to change
737 dst[-3] = (unsigned char)Tbl[c + (nEntries * 3)];
738 // Fall into next case
739 case kExitReplace2: // update 2 bytes to change
740 dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)];
741 // Fall into next case
742 case kExitReplace1: // update 1 byte to change
743 dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
744 total_changed++;
745 goto Do_state_table;
746 case kExitReplace1S0: // update 1 byte to change, 256-entry state
747 dst[-1] = (unsigned char)Tbl[c + (256 * 1)];
748 total_changed++;
749 goto Do_state_table;
750 // These can make the output string longer than the input
751 case kExitReplaceOffset2:
752 if ((nEntries != 256) && InStateZero(st, Tbl)) {
753 // For space-optimized table, we need multiples of 256 bytes
754 // in state0 and multiples of nEntries in other states
755 offset += ((unsigned char)Tbl[c + (256 * 2)] << 8);
756 } else {
757 offset += ((unsigned char)Tbl[c + (nEntries * 2)] << 8);
758 }
759 // Fall into next case
760 case kExitSpecial: // Apply special fixups [read: hacks]
761 case kExitReplaceOffset1:
762 if ((nEntries != 256) && InStateZero(st, Tbl)) {
763 // For space-optimized table, we need multiples of 256 bytes
764 // in state0 and multiples of nEntries in other states
765 offset += (unsigned char)Tbl[c + (256 * 1)];
766 } else {
767 offset += (unsigned char)Tbl[c + (nEntries * 1)];
768 }
769 {
770 const RemapEntry* re = &st->remap_base[offset];
771 int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;
772 int add_len = re->add_bytes & ~kHtmlPlaintextFlag;
773
774 // Special-case non-HTML replacement of five sensitive entities
775 // &quot; &amp; &apos; &lt; &gt;
776 // 0022 0026 0027 003c 003e
777 // A replacement creating one of these is expressed as a pair of
778 // entries, one for HTML output and one for plaintext output.
779 // The first of the pair has the high bit of add_bytes set.
780 if (re->add_bytes & kHtmlPlaintextFlag) {
781 // Use this entry for plain text
782 if (!is_plain_text) {
783 // Use very next entry for HTML text (same back/delete length)
784 re = &st->remap_base[offset + 1];
785 add_len = re->add_bytes & ~kHtmlPlaintextFlag;
786 }
787 }
788
789 int string_offset = re->bytes_offset;
790 // After the replacement, need (dstlimit - newdst) >= (srclimit - src)
791 uint8* newdst = dst - del_len + add_len;
792 if ((dstlimit - newdst) < (srclimit - src)) {
793 // Won't fit; don't do the replacement. Caller may realloc and retry
794 e = kExitDstSpaceFull;
795 break; // exit, backing up over this char for later retry
796 }
797 dst -= del_len;
798 memcpy(dst, &st->remap_string[string_offset], add_len);
799 dst += add_len;
800 total_changed++;
801 if (offsetmap != NULL) {
802 if (add_len > del_len) {
803 offsetmap->Copy(src - copystart);
804 offsetmap->Insert(add_len - del_len);
805 copystart = src;
806 } else if (add_len < del_len) {
807 offsetmap->Copy(src - copystart + add_len - del_len);
808 offsetmap->Delete(del_len - add_len);
809 copystart = src;
810 }
811 }
812 if (re->delete_bytes & kReplaceAndResumeFlag) {
813 // There is a non-zero target state at the end of the
814 // replacement string
815 e = st->remap_string[string_offset + add_len];
816 Tbl = &Tbl_0[e << eshift];
817 goto Do_state_table_newe;
818 }
819 }
820 if (e == kExitRejectAlt) {break;}
821 if (e != kExitSpecial) {goto Do_state_table;}
822
823 // case kExitSpecial: // Apply special fixups [read: hacks]
824 // In this routine, do either UTF8CharToLower()
825 // fullwidth/halfwidth mapping or
826 // voiced mapping or
827 // semi-voiced mapping
828
829 // First, do EXIT_REPLACE_OFFSET1 action (above)
830 // Second: do additional code fixup
831 {
832 int srcdel = DoSpecialFixup(c, &src, srclimit, &dst, dstlimit);
833 if (offsetmap != NULL) {
834 if (srcdel != 0) {
835 offsetmap->Copy(src - copystart - srcdel);
836 offsetmap->Delete(srcdel);
837 copystart = src;
838 }
839 }
840 }
841 goto Do_state_table;
842
843 case kExitIllegalStructure: // structurally illegal byte; quit
844 case kExitReject: // NUL or illegal code encountered; quit
845 case kExitRejectAlt: // Apply replacement, then exit
846 default: // and all other exits
847 break;
848 } // End switch (e)
849
850 // Exit possibilities:
851 // Some other exit code, state0, back up one byte exactly
852 // Some other exit code, !state0, back up over last char
853
854 // Back up over exactly one byte of rejected/illegal UTF-8 character
855 src--;
856 dst--;
857 // Back up more if needed
858 if (!InStateZero(st, Tbl)) {
859 do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
860 }
861 } else if (!InStateZero(st, Tbl)) {
862 // src >= srclimit, !state0
863 // Back up over truncated UTF-8 character
864 e = kExitIllegalStructure;
865 do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
866 } else {
867 // src >= srclimit, state0
868 // Normal termination, source fully consumed
869 e = kExitOK;
870 }
871
872 if (offsetmap != NULL) {
873 if (src > copystart) {
874 offsetmap->Copy(src - copystart);
875 copystart = src;
876 }
877 }
878
879 // Possible return values here:
880 // kExitDstSpaceFull caller may realloc and retry from middle
881 // kExitIllegalStructure caller my overwrite/truncate
882 // kExitOK all done and happy
883 // kExitReject caller may overwrite/truncate
884 // kExitDoAgain LOOP NOT DONE; caller must retry from middle
885 // (may do fast ASCII loop first)
886 // kExitPlaceholder -unused-
887 // kExitNone -unused-
888 *bytes_consumed = src - isrc;
889 *bytes_filled = dst - odst;
890 *chars_changed = total_changed;
891 return e;
892 }
893
894 // TwoByte versions are needed for tables > 240 states, such
895 // as the table for full Unicode 4.1 canonical + compatibility mapping
896
897 // Scan a UTF-8 stringpiece based on state table with two-byte entries,
898 // copying to output stringpiece
899 // and doing text replacements.
900 // DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below
901 // Needs caller to loop on kExitDoAgain
902 static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st,
903 const StringPiece& istr,
904 StringPiece& ostr,
905 bool is_plain_text,
906 int* bytes_consumed,
907 int* bytes_filled,
908 int* chars_changed,
909 OffsetMap* offsetmap) {
910 int eshift = st->entry_shift;
911 int nEntries = (1 << eshift); // 64 or 256 entries per state
912 const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());
913 const int ilen = istr.length();
914 const uint8* copystart = isrc;
915 const uint8* src = isrc;
916 const uint8* srclimit = src + ilen;
917 *bytes_consumed = 0;
918 *bytes_filled = 0;
919 *chars_changed = 0;
920
921 const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());
922 const int olen = ostr.length();
923 uint8* dst = const_cast<uint8*>(odst);
924 uint8* dstlimit = dst + olen;
925
926 *chars_changed = 0;
927
928 int total_changed = 0;
929
930 int src_lll = srclimit - src;
931 int dst_lll = dstlimit - dst;
932
933
934 // Invariant condition during replacements:
935 // remaining dst size >= remaining src size
936 if ((dstlimit - dst) < (srclimit - src)) {
937 if (offsetmap != NULL) {
938 offsetmap->Copy(src - copystart);
939 copystart = src;
940 }
941 return kExitDstSpaceFull_2;
942 }
943 const unsigned short* Tbl_0 = &st->state_table[st->state0];
944
945 Do_state_table_2:
946 // Do state-table scan, copying as we go
947 const unsigned short* Tbl = Tbl_0;
948 int e = 0;
949 uint8 c = 0;
950
951 Do_state_table_newe_2:
952
953 //----------------------------
954 while (src < srclimit) {
955 c = *src;
956 e = Tbl[c];
957 *dst = c;
958 src++;
959 dst++;
960 if (e >= kExitIllegalStructure_2) {break;}
961 Tbl = &Tbl_0[e << eshift];
962 }
963 //----------------------------
964 src_lll = src - isrc;
965 dst_lll = dst - odst;
966
967 // Exit possibilities:
968 // Replacement code, do the replacement and loop
969 // Some other exit code, state0, back up one byte exactly
970 // Some other exit code, !state0, back up over last char
971 // source consumed, state0, exit OK
972 // source consumed, !state0, back up over partial char
973 // For illegal byte in state0, avoid backup up over PREVIOUS char
974 // For truncated last char, back up to beginning of it
975
976 if (e >= kExitIllegalStructure_2) {
977 // Switch on exit code; most loop back to top
978 int offset = 0;
979 switch (e) {
980 // These all make the output string the same size or shorter
981 // No checking needed
982 case kExitReplace31_2: // del 2, add 1 bytes to change
983 dst -= 2;
984 if (offsetmap != NULL) {
985 offsetmap->Copy(src - copystart - 2);
986 offsetmap->Delete(2);
987 copystart = src;
988 }
989 dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
990 total_changed++;
991 goto Do_state_table_2;
992 case kExitReplace32_2: // del 3, add 2 bytes to change
993 dst--;
994 if (offsetmap != NULL) {
995 offsetmap->Copy(src - copystart - 1);
996 offsetmap->Delete(1);
997 copystart = src;
998 }
999 dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);
1000 dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
1001 total_changed++;
1002 goto Do_state_table_2;
1003 case kExitReplace21_2: // del 2, add 1 bytes to change
1004 dst--;
1005 if (offsetmap != NULL) {
1006 offsetmap->Copy(src - copystart - 1);
1007 offsetmap->Delete(1);
1008 copystart = src;
1009 }
1010 dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
1011 total_changed++;
1012 goto Do_state_table_2;
1013 case kExitReplace3_2: // update 3 bytes to change
1014 dst[-3] = (unsigned char)(Tbl[c + (nEntries * 2)] & 0xff);
1015 // Fall into next case
1016 case kExitReplace2_2: // update 2 bytes to change
1017 dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);
1018 // Fall into next case
1019 case kExitReplace1_2: // update 1 byte to change
1020 dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
1021 total_changed++;
1022 goto Do_state_table_2;
1023 case kExitReplace1S0_2: // update 1 byte to change, 256-entry state
1024 dst[-1] = (unsigned char)(Tbl[c + (256 * 1)] & 0xff);
1025 total_changed++;
1026 goto Do_state_table_2;
1027 // These can make the output string longer than the input
1028 case kExitReplaceOffset2_2:
1029 if ((nEntries != 256) && InStateZero_2(st, Tbl)) {
1030 // For space-optimized table, we need multiples of 256 bytes
1031 // in state0 and multiples of nEntries in other states
1032 offset += ((unsigned char)(Tbl[c + (256 * 1)] >> 8 & 0xff) << 8);
1033 } else {
1034 offset += ((unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff) << 8);
1035 }
1036 // Fall into next case
1037 case kExitReplaceOffset1_2:
1038 if ((nEntries != 256) && InStateZero_2(st, Tbl)) {
1039 // For space-optimized table, we need multiples of 256 bytes
1040 // in state0 and multiples of nEntries in other states
1041 offset += (unsigned char)(Tbl[c + (256 * 1)] & 0xff);
1042 } else {
1043 offset += (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
1044 }
1045 {
1046 const RemapEntry* re = &st->remap_base[offset];
1047 int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;
1048 int add_len = re->add_bytes & ~kHtmlPlaintextFlag;
1049 // Special-case non-HTML replacement of five sensitive entities
1050 // &quot; &amp; &apos; &lt; &gt;
1051 // 0022 0026 0027 003c 003e
1052 // A replacement creating one of these is expressed as a pair of
1053 // entries, one for HTML output and one for plaintext output.
1054 // The first of the pair has the high bit of add_bytes set.
1055 if (re->add_bytes & kHtmlPlaintextFlag) {
1056 // Use this entry for plain text
1057 if (!is_plain_text) {
1058 // Use very next entry for HTML text (same back/delete length)
1059 re = &st->remap_base[offset + 1];
1060 add_len = re->add_bytes & ~kHtmlPlaintextFlag;
1061 }
1062 }
1063
1064 // After the replacement, need (dstlimit - dst) >= (srclimit - src)
1065 int string_offset = re->bytes_offset;
1066 // After the replacement, need (dstlimit - newdst) >= (srclimit - src)
1067 uint8* newdst = dst - del_len + add_len;
1068 if ((dstlimit - newdst) < (srclimit - src)) {
1069 // Won't fit; don't do the replacement. Caller may realloc and retry
1070 e = kExitDstSpaceFull_2;
1071 break; // exit, backing up over this char for later retry
1072 }
1073 dst -= del_len;
1074 memcpy(dst, &st->remap_string[string_offset], add_len);
1075 dst += add_len;
1076 if (offsetmap != NULL) {
1077 if (add_len > del_len) {
1078 offsetmap->Copy(src - copystart);
1079 offsetmap->Insert(add_len - del_len);
1080 copystart = src;
1081 } else if (add_len < del_len) {
1082 offsetmap->Copy(src - copystart + add_len - del_len);
1083 offsetmap->Delete(del_len - add_len);
1084 copystart = src;
1085 }
1086 }
1087 if (re->delete_bytes & kReplaceAndResumeFlag) {
1088 // There is a two-byte non-zero target state at the end of the
1089 // replacement string
1090 uint8 c1 = st->remap_string[string_offset + add_len];
1091 uint8 c2 = st->remap_string[string_offset + add_len + 1];
1092 e = (c1 << 8) | c2;
1093 Tbl = &Tbl_0[e << eshift];
1094 total_changed++;
1095 goto Do_state_table_newe_2;
1096 }
1097 }
1098 total_changed++;
1099 if (e == kExitRejectAlt_2) {break;}
1100 goto Do_state_table_2;
1101
1102 case kExitSpecial_2: // NO special fixups [read: hacks]
1103 case kExitIllegalStructure_2: // structurally illegal byte; quit
1104 case kExitReject_2: // NUL or illegal code encountered; quit
1105 // and all other exits
1106 default:
1107 break;
1108 } // End switch (e)
1109
1110 // Exit possibilities:
1111 // Some other exit code, state0, back up one byte exactly
1112 // Some other exit code, !state0, back up over last char
1113
1114 // Back up over exactly one byte of rejected/illegal UTF-8 character
1115 src--;
1116 dst--;
1117 // Back up more if needed
1118 if (!InStateZero_2(st, Tbl)) {
1119 do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
1120 }
1121 } else if (!InStateZero_2(st, Tbl)) {
1122 // src >= srclimit, !state0
1123 // Back up over truncated UTF-8 character
1124 e = kExitIllegalStructure_2;
1125
1126 do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
1127 } else {
1128 // src >= srclimit, state0
1129 // Normal termination, source fully consumed
1130 e = kExitOK_2;
1131 }
1132
1133 if (offsetmap != NULL) {
1134 if (src > copystart) {
1135 offsetmap->Copy(src - copystart);
1136 copystart = src;
1137 }
1138 }
1139
1140
1141 // Possible return values here:
1142 // kExitDstSpaceFull_2 caller may realloc and retry from middle
1143 // kExitIllegalStructure_2 caller my overwrite/truncate
1144 // kExitOK_2 all done and happy
1145 // kExitReject_2 caller may overwrite/truncate
1146 // kExitDoAgain_2 LOOP NOT DONE; caller must retry from middle
1147 // (may do fast ASCII loop first)
1148 // kExitPlaceholder_2 -unused-
1149 // kExitNone_2 -unused-
1150 *bytes_consumed = src - isrc;
1151 *bytes_filled = dst - odst;
1152 *chars_changed = total_changed;
1153 return e;
1154 }
1155
1156
1157 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
1158 // and doing text replacements.
1159 // Also writes an optional OffsetMap. Pass NULL to skip writing one.
1160 // Always scan complete UTF-8 characters
1161 // Set number of bytes consumed from input, number filled to output.
1162 // Return reason for exiting
1163 int UTF8GenericReplace(const UTF8ReplaceObj* st,
1164 const StringPiece& istr,
1165 StringPiece& ostr,
1166 bool is_plain_text,
1167 int* bytes_consumed,
1168 int* bytes_filled,
1169 int* chars_changed,
1170 OffsetMap* offsetmap) {
1171 StringPiece local_istr(istr.data(), istr.length());
1172 StringPiece local_ostr(ostr.data(), ostr.length());
1173 int total_consumed = 0;
1174 int total_filled = 0;
1175 int total_changed = 0;
1176 int local_bytes_consumed, local_bytes_filled, local_chars_changed;
1177 int e;
1178 do {
1179 e = UTF8GenericReplaceInternal(st,
1180 local_istr, local_ostr, is_plain_text,
1181 &local_bytes_consumed, &local_bytes_filled,
1182 &local_chars_changed,
1183 offsetmap);
1184 local_istr.remove_prefix(local_bytes_consumed);
1185 local_ostr.remove_prefix(local_bytes_filled);
1186 total_consumed += local_bytes_consumed;
1187 total_filled += local_bytes_filled;
1188 total_changed += local_chars_changed;
1189 } while ( e == kExitDoAgain );
1190 *bytes_consumed = total_consumed;
1191 *bytes_filled = total_filled;
1192 *chars_changed = total_changed;
1193 return e;
1194 }
1195
1196 // Older version without offsetmap
1197 int UTF8GenericReplace(const UTF8ReplaceObj* st,
1198 const StringPiece& istr,
1199 StringPiece& ostr,
1200 bool is_plain_text,
1201 int* bytes_consumed,
1202 int* bytes_filled,
1203 int* chars_changed) {
1204 return UTF8GenericReplace(st,
1205 istr,
1206 ostr,
1207 is_plain_text,
1208 bytes_consumed,
1209 bytes_filled,
1210 chars_changed,
1211 NULL);
1212 }
1213
1214 // Older version without is_plain_text or offsetmap
1215 int UTF8GenericReplace(const UTF8ReplaceObj* st,
1216 const StringPiece& istr,
1217 StringPiece& ostr,
1218 int* bytes_consumed,
1219 int* bytes_filled,
1220 int* chars_changed) {
1221 bool is_plain_text = false;
1222 return UTF8GenericReplace(st,
1223 istr,
1224 ostr,
1225 is_plain_text,
1226 bytes_consumed,
1227 bytes_filled,
1228 chars_changed,
1229 NULL);
1230 }
1231
1232 // Scan a UTF-8 stringpiece based on state table with two-byte entries,
1233 // copying to output stringpiece
1234 // and doing text replacements.
1235 // Also writes an optional OffsetMap. Pass NULL to skip writing one.
1236 // Always scan complete UTF-8 characters
1237 // Set number of bytes consumed from input, number filled to output.
1238 // Return reason for exiting
1239 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
1240 const StringPiece& istr,
1241 StringPiece& ostr,
1242 bool is_plain_text,
1243 int* bytes_consumed,
1244 int* bytes_filled,
1245 int* chars_changed,
1246 OffsetMap* offsetmap) {
1247 StringPiece local_istr(istr.data(), istr.length());
1248 StringPiece local_ostr(ostr.data(), ostr.length());
1249 int total_consumed = 0;
1250 int total_filled = 0;
1251 int total_changed = 0;
1252 int local_bytes_consumed, local_bytes_filled, local_chars_changed;
1253 int e;
1254 do {
1255 e = UTF8GenericReplaceInternalTwoByte(st,
1256 local_istr, local_ostr, is_plain_text,
1257 &local_bytes_consumed,
1258 &local_bytes_filled,
1259 &local_chars_changed,
1260 offsetmap);
1261 local_istr.remove_prefix(local_bytes_consumed);
1262 local_ostr.remove_prefix(local_bytes_filled);
1263 total_consumed += local_bytes_consumed;
1264 total_filled += local_bytes_filled;
1265 total_changed += local_chars_changed;
1266 } while ( e == kExitDoAgain_2 );
1267 *bytes_consumed = total_consumed;
1268 *bytes_filled = total_filled;
1269 *chars_changed = total_changed;
1270
1271 return e - kExitOK_2 + kExitOK;
1272 }
1273
1274 // Older version without offsetmap
1275 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
1276 const StringPiece& istr,
1277 StringPiece& ostr,
1278 bool is_plain_text,
1279 int* bytes_consumed,
1280 int* bytes_filled,
1281 int* chars_changed) {
1282 return UTF8GenericReplaceTwoByte(st,
1283 istr,
1284 ostr,
1285 is_plain_text,
1286 bytes_consumed,
1287 bytes_filled,
1288 chars_changed,
1289 NULL);
1290 }
1291
1292 // Older version without is_plain_text or offsetmap
1293 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
1294 const StringPiece& istr,
1295 StringPiece& ostr,
1296 int* bytes_consumed,
1297 int* bytes_filled,
1298 int* chars_changed) {
1299 bool is_plain_text = false;
1300 return UTF8GenericReplaceTwoByte(st,
1301 istr,
1302 ostr,
1303 is_plain_text,
1304 bytes_consumed,
1305 bytes_filled,
1306 chars_changed,
1307 NULL);
1308 }
1309
1310
1311
1312 // Adjust a stringpiece to encompass complete UTF-8 characters.
1313 // The data pointer will be increased by 0..3 bytes to get to a character
1314 // boundary, and the length will then be decreased by 0..3 bytes
1315 // to encompass the last complete character.
1316 void UTF8TrimToChars(StringPiece* istr) {
1317 const char* src = istr->data();
1318 int len = istr->length();
1319 // Exit if empty string
1320 if (len == 0) {
1321 return;
1322 }
1323
1324 // Exit on simple, common case
1325 if ( ((src[0] & 0xc0) != 0x80) &&
1326 (static_cast<signed char>(src[len - 1]) >= 0) ) {
1327 // First byte is not a continuation and last byte is 7-bit ASCII -- done
1328 return;
1329 }
1330
1331 // Adjust the back end, len > 0
1332 const char* srclimit = src + len;
1333 // Backscan over any ending continuation bytes to find last char start
1334 const char* s = srclimit - 1; // Last byte of the string
1335 while ((src <= s) && ((*s & 0xc0) == 0x80)) {
1336 s--;
1337 }
1338 // Include entire last char if it fits
1339 if (src <= s) {
1340 int last_char_len = UTF8OneCharLen(s);
1341 if (s + last_char_len <= srclimit) {
1342 // Last char fits, so include it, else exclude it
1343 s += last_char_len;
1344 }
1345 }
1346 if (s != srclimit) {
1347 // s is one byte beyond the last full character, if any
1348 istr->remove_suffix(srclimit - s);
1349 // Exit if now empty string
1350 if (istr->length() == 0) {
1351 return;
1352 }
1353 }
1354
1355 // Adjust the front end, len > 0
1356 len = istr->length();
1357 srclimit = src + len;
1358 s = src; // First byte of the string
1359 // Scan over any beginning continuation bytes to find first char start
1360 while ((s < srclimit) && ((*s & 0xc0) == 0x80)) {
1361 s++;
1362 }
1363 if (s != src) {
1364 // s is at the first full character, if any
1365 istr->remove_prefix(s - src);
1366 }
1367 }
1368
1369 } // End namespace CLD2

mercurial