1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/utf8statetable.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1369 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// State Table follower for scanning UTF-8 strings without converting to 1.20 +// 32- or 16-bit Unicode values. 1.21 +// 1.22 + 1.23 +#ifdef COMPILER_MSVC 1.24 +// MSVC warns: warning C4309: 'initializing' : truncation of constant value 1.25 +// But the value is in fact not truncated. 0xFF still comes out 0xFF at 1.26 +// runtime. 1.27 +#pragma warning ( disable : 4309 ) 1.28 +#endif 1.29 + 1.30 +#include "utf8statetable.h" 1.31 + 1.32 +#include <stdint.h> // for uintptr_t 1.33 +#include <string.h> // for NULL, memcpy, memmove 1.34 + 1.35 +#include "integral_types.h" // for uint8, uint32, int8 1.36 +#include "stringpiece.h" 1.37 +#include "offsetmap.h" 1.38 + 1.39 + 1.40 +namespace CLD2 { 1.41 + 1.42 +static const int kReplaceAndResumeFlag = 0x80; // Bit in del byte to distinguish 1.43 + // optional next-state field 1.44 + // after replacement text 1.45 +static const int kHtmlPlaintextFlag = 0x80; // Bit in add byte to distinguish 1.46 + // HTML replacement vs. plaintext 1.47 + 1.48 + 1.49 +/** 1.50 + * This code implements a little interpreter for UTF8 state 1.51 + * tables. 
There are three kinds of quite-similar state tables, 1.52 + * property, scanning, and replacement. Each state in one of 1.53 + * these tables consists of an array of 256 or 64 one-byte 1.54 + * entries. The state is subscripted by an incoming source byte, 1.55 + * and the entry either specifies the next state or specifies an 1.56 + * action. Space-optimized tables have full 256-entry states for 1.57 + * the first byte of a UTF-8 character, but only 64-entry states 1.58 + * for continuation bytes. Space-optimized tables may only be 1.59 + * used with source input that has been checked to be 1.60 + * structurally- (or stronger interchange-) valid. 1.61 + * 1.62 + * A property state table has an unsigned one-byte property for 1.63 + * each possible UTF-8 character. One-byte character properties 1.64 + * are in the state[0] array, while for other lengths the 1.65 + * state[0] array gives the next state, which contains the 1.66 + * property value for two-byte characters or yet another state 1.67 + * for longer ones. The code simply loads the right number of 1.68 + * next-state values, then returns the final byte as property 1.69 + * value. There are no actions specified in property tables. 1.70 + * States are typically shared for multi-byte UTF-8 characters 1.71 + * that all have the same property value. 1.72 + * 1.73 + * A scanning state table has entries that are either a 1.74 + * next-state specifier for bytes that are accepted by the 1.75 + * scanner, or an exit action for the last byte of each 1.76 + * character that is rejected by the scanner. 1.77 + * 1.78 + * Scanning long strings involves a tight loop that picks up one 1.79 + * byte at a time and follows next-state value back to state[0] 1.80 + * for each accepted UTF-8 character. Scanning stops at the end 1.81 + * of the string or at the first character encountered that has 1.82 + * an exit action such as "reject". Timing information is given 1.83 + * below. 
1.84 + * 1.85 + * Since so much of Google's text is 7-bit-ASCII values 1.86 + * (approximately 94% of the bytes of web documents), the 1.87 + * scanning interpreter has two speed optimizations. One checks 1.88 + * 8 bytes at a time to see if they are all in the range lo..hi, 1.89 + * as specified in constants in the overall statetable object. 1.90 + * The check involves ORing together four 4-byte values that 1.91 + * overflow into the high bit of some byte when a byte is out of 1.92 + * range. For seven-bit-ASCII, lo is 0x20 and hi is 0x7E. This 1.93 + * loop is about 8x faster than the one-byte-at-a-time loop. 1.94 + * 1.95 + * If checking for exit bytes in the 0x00-0x1F and 7F range is 1.96 + * unneeded, an even faster loop just looks at the high bits of 1.97 + * 8 bytes at once, and is about 1.33x faster than the lo..hi 1.98 + * loop. 1.99 + * 1.100 + * Exit from the scanning routines backs up to the first byte of 1.101 + * the rejected character, so the text spanned is always a 1.102 + * complete number of UTF-8 characters. The normal scanning exit 1.103 + * is at the first rejected character, or at the end of the 1.104 + * input text. Scanning also exits on any detected ill-formed 1.105 + * character or at a special do-again action built into some 1.106 + * exit-optimized tables. The do-again action gets back to the 1.107 + * top of the scanning loop to retry eight-byte ASCII scans. It 1.108 + * is typically put into state tables after four seven-bit-ASCII 1.109 + * characters in a row are seen, to allow restarting the fast 1.110 + * scan after some slower processing of multi-byte characters. 1.111 + * 1.112 + * A replacement state table is similar to a scanning state 1.113 + * table but has more extensive actions. The default 1.114 + * byte-at-a-time loop copies one byte from source to 1.115 + * destination and goes to the next state. 
The replacement 1.116 + * actions overwrite 1-3 bytes of the destination with different 1.117 + * bytes, possibly shortening the output by 1 or 2 bytes. The 1.118 + * replacement bytes come from within the state table, from 1.119 + * dummy states inserted just after any state that contains a 1.120 + * replacement action. This gives a quick address calculation for 1.121 + * the replacement byte(s) and gives some cache locality. 1.122 + * 1.123 + * Additional replacement actions use one or two bytes from 1.124 + * within dummy states to index a side table of more-extensive 1.125 + * replacements. The side table specifies a length of 0..15 1.126 + * destination bytes to overwrite and a length of 0..127 bytes 1.127 + * to overwrite them with, plus the actual replacement bytes. 1.128 + * 1.129 + * This side table uses one extra bit to specify a pair of 1.130 + * replacements, the first to be used in an HTML context and the 1.131 + * second to be used in a plaintext context. This allows 1.132 + * replacements that are spelled with "&lt;" in the former 1.133 + * context and "<" in the latter. 1.134 + * 1.135 + * The side table also uses an extra bit to specify a non-zero 1.136 + * next state after a replacement. This allows a combination 1.137 + * replacement and state change, used to implement a limited 1.138 + * version of the Boyer-Moore algorithm for multi-character 1.139 + * replacement without backtracking. This is useful when there 1.140 + * are overlapping replacements, such as ch => x and also c => 1.141 + * y, the latter to be used only if the character after c is not 1.142 + * h. In this case, the state[0] table's entry for c would 1.143 + * change c to y and also have a next-state of say n, and the 1.144 + * state[n] entry for h would specify a replacement of the two 1.145 + * bytes yh by x. No backtracking is needed. 
1.146 + * 1.147 + * A replacement table may also include the exit actions of a 1.148 + * scanning state table, so some character sequences can 1.149 + * terminate early. 1.150 + * 1.151 + * During replacement, an optional data structure called an 1.152 + * offset map can be updated to reflect each change in length 1.153 + * between source and destination. This offset map can later be 1.154 + * used to map destination-string offsets to corresponding 1.155 + * source-string offsets or vice versa. 1.156 + * 1.157 + * The routines below also have variants in which state-table 1.158 + * entries are all two bytes instead of one byte. This allows 1.159 + * tables with more than 240 total states, but takes up twice as 1.160 + * much space per state. 1.161 + * 1.162 +**/ 1.163 + 1.164 +// Return true if current Tbl pointer is within state0 range 1.165 +// Note that unsigned compare checks both ends of range simultaneously 1.166 +static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) { 1.167 + const uint8* Tbl0 = &st->state_table[st->state0]; 1.168 + return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size); 1.169 +} 1.170 + 1.171 +static inline bool InStateZero_2(const UTF8ReplaceObj_2* st, 1.172 + const unsigned short int* Tbl) { 1.173 + const unsigned short int* Tbl0 = &st->state_table[st->state0]; 1.174 + // Word difference, not byte difference 1.175 + return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size); 1.176 +} 1.177 + 1.178 +// UTF8PropObj, UTF8ScanObj, UTF8ReplaceObj are all typedefs of 1.179 +// UTF8MachineObj. 
1.180 + 1.181 +static bool IsPropObj(const UTF8StateMachineObj& obj) { 1.182 + return obj.fast_state == NULL 1.183 + && obj.max_expand == 0; 1.184 +} 1.185 + 1.186 +static bool IsPropObj_2(const UTF8StateMachineObj_2& obj) { 1.187 + return obj.fast_state == NULL 1.188 + && obj.max_expand == 0; 1.189 +} 1.190 + 1.191 +static bool IsScanObj(const UTF8StateMachineObj& obj) { 1.192 + return obj.fast_state != NULL 1.193 + && obj.max_expand == 0; 1.194 +} 1.195 + 1.196 +static bool IsReplaceObj(const UTF8StateMachineObj& obj) { 1.197 + // Normally, obj.fast_state != NULL, but the handwritten tables 1.198 + // in utf8statetable_unittest don't handle fast_states. 1.199 + return obj.max_expand > 0; 1.200 +} 1.201 + 1.202 +static bool IsReplaceObj_2(const UTF8StateMachineObj_2& obj) { 1.203 + return obj.max_expand > 0; 1.204 +} 1.205 + 1.206 +// Look up property of one UTF-8 character and advance over it 1.207 +// Return 0 if input length is zero 1.208 +// Return 0 and advance one byte if input is ill-formed 1.209 +uint8 UTF8GenericProperty(const UTF8PropObj* st, 1.210 + const uint8** src, 1.211 + int* srclen) { 1.212 + if (*srclen <= 0) { 1.213 + return 0; 1.214 + } 1.215 + 1.216 + const uint8* lsrc = *src; 1.217 + const uint8* Tbl_0 = &st->state_table[st->state0]; 1.218 + const uint8* Tbl = Tbl_0; 1.219 + int e; 1.220 + int eshift = st->entry_shift; 1.221 + 1.222 + // Short series of tests faster than switch, optimizes 7-bit ASCII 1.223 + unsigned char c = lsrc[0]; 1.224 + if (static_cast<signed char>(c) >= 0) { // one byte 1.225 + e = Tbl[c]; 1.226 + *src += 1; 1.227 + *srclen -= 1; 1.228 + } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes 1.229 + e = Tbl[c]; 1.230 + Tbl = &Tbl_0[e << eshift]; 1.231 + e = Tbl[lsrc[1]]; 1.232 + *src += 2; 1.233 + *srclen -= 2; 1.234 + } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes 1.235 + e = Tbl[c]; 1.236 + Tbl = &Tbl_0[e << eshift]; 1.237 + e = Tbl[lsrc[1]]; 1.238 + Tbl = &Tbl_0[e << eshift]; 1.239 + 
e = Tbl[lsrc[2]]; 1.240 + *src += 3; 1.241 + *srclen -= 3; 1.242 + }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes 1.243 + e = Tbl[c]; 1.244 + Tbl = &Tbl_0[e << eshift]; 1.245 + e = Tbl[lsrc[1]]; 1.246 + Tbl = &Tbl_0[e << eshift]; 1.247 + e = Tbl[lsrc[2]]; 1.248 + Tbl = &Tbl_0[e << eshift]; 1.249 + e = Tbl[lsrc[3]]; 1.250 + *src += 4; 1.251 + *srclen -= 4; 1.252 + } else { // Ill-formed 1.253 + e = 0; 1.254 + *src += 1; 1.255 + *srclen -= 1; 1.256 + } 1.257 + return e; 1.258 +} 1.259 + 1.260 +bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src) { 1.261 + const uint8* lsrc = reinterpret_cast<const uint8*>(src); 1.262 + const uint8* Tbl_0 = &st.state_table[st.state0]; 1.263 + const uint8* Tbl = Tbl_0; 1.264 + int e; 1.265 + int eshift = st.entry_shift; 1.266 + 1.267 + // Short series of tests faster than switch, optimizes 7-bit ASCII 1.268 + unsigned char c = lsrc[0]; 1.269 + if (static_cast<signed char>(c) >= 0) { // one byte 1.270 + e = Tbl[c]; 1.271 + } else if ((c & 0xe0) == 0xc0) { // two bytes 1.272 + e = Tbl[c]; 1.273 + Tbl = &Tbl_0[e << eshift]; 1.274 + e = Tbl[lsrc[1]]; 1.275 + } else if ((c & 0xf0) == 0xe0) { // three bytes 1.276 + e = Tbl[c]; 1.277 + Tbl = &Tbl_0[e << eshift]; 1.278 + e = Tbl[lsrc[1]]; 1.279 + Tbl = &Tbl_0[e << eshift]; 1.280 + e = Tbl[lsrc[2]]; 1.281 + } else { // four bytes 1.282 + e = Tbl[c]; 1.283 + Tbl = &Tbl_0[e << eshift]; 1.284 + e = Tbl[lsrc[1]]; 1.285 + Tbl = &Tbl_0[e << eshift]; 1.286 + e = Tbl[lsrc[2]]; 1.287 + Tbl = &Tbl_0[e << eshift]; 1.288 + e = Tbl[lsrc[3]]; 1.289 + } 1.290 + return e; 1.291 +} 1.292 + 1.293 + 1.294 +// BigOneByte versions are needed for tables > 240 states, but most 1.295 +// won't need the TwoByte versions. 1.296 +// Internally, to next-to-last offset is multiplied by 16 and the last 1.297 +// offset is relative instead of absolute. 
1.298 +// Look up property of one UTF-8 character and advance over it 1.299 +// Return 0 if input length is zero 1.300 +// Return 0 and advance one byte if input is ill-formed 1.301 +uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, 1.302 + const uint8** src, 1.303 + int* srclen) { 1.304 + if (*srclen <= 0) { 1.305 + return 0; 1.306 + } 1.307 + 1.308 + const uint8* lsrc = *src; 1.309 + const uint8* Tbl_0 = &st->state_table[st->state0]; 1.310 + const uint8* Tbl = Tbl_0; 1.311 + int e; 1.312 + int eshift = st->entry_shift; 1.313 + 1.314 + // Short series of tests faster than switch, optimizes 7-bit ASCII 1.315 + unsigned char c = lsrc[0]; 1.316 + if (static_cast<signed char>(c) >= 0) { // one byte 1.317 + e = Tbl[c]; 1.318 + *src += 1; 1.319 + *srclen -= 1; 1.320 + } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes 1.321 + e = Tbl[c]; 1.322 + Tbl = &Tbl_0[e << eshift]; 1.323 + e = Tbl[lsrc[1]]; 1.324 + *src += 2; 1.325 + *srclen -= 2; 1.326 + } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes 1.327 + e = Tbl[c]; 1.328 + Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range 1.329 + e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]]; 1.330 + Tbl = &Tbl[e << eshift]; // Relative +/- 1.331 + e = Tbl[lsrc[2]]; 1.332 + *src += 3; 1.333 + *srclen -= 3; 1.334 + }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes 1.335 + e = Tbl[c]; 1.336 + Tbl = &Tbl_0[e << eshift]; 1.337 + e = Tbl[lsrc[1]]; 1.338 + Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range 1.339 + e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]]; 1.340 + Tbl = &Tbl[e << eshift]; // Relative +/- 1.341 + e = Tbl[lsrc[3]]; 1.342 + *src += 4; 1.343 + *srclen -= 4; 1.344 + } else { // Ill-formed 1.345 + e = 0; 1.346 + *src += 1; 1.347 + *srclen -= 1; 1.348 + } 1.349 + return e; 1.350 +} 1.351 + 1.352 +// BigOneByte versions are needed for tables > 240 states, but most 1.353 +// won't need the TwoByte versions. 
1.354 +bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src) { 1.355 + const uint8* lsrc = reinterpret_cast<const uint8*>(src); 1.356 + const uint8* Tbl_0 = &st.state_table[st.state0]; 1.357 + const uint8* Tbl = Tbl_0; 1.358 + int e; 1.359 + int eshift = st.entry_shift; 1.360 + 1.361 + // Short series of tests faster than switch, optimizes 7-bit ASCII 1.362 + unsigned char c = lsrc[0]; 1.363 + if (static_cast<signed char>(c) >= 0) { // one byte 1.364 + e = Tbl[c]; 1.365 + } else if ((c & 0xe0) == 0xc0) { // two bytes 1.366 + e = Tbl[c]; 1.367 + Tbl = &Tbl_0[e << eshift]; 1.368 + e = Tbl[lsrc[1]]; 1.369 + } else if ((c & 0xf0) == 0xe0) { // three bytes 1.370 + e = Tbl[c]; 1.371 + Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range 1.372 + e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]]; 1.373 + Tbl = &Tbl[e << eshift]; // Relative +/- 1.374 + e = Tbl[lsrc[2]]; 1.375 + } else { // four bytes 1.376 + e = Tbl[c]; 1.377 + Tbl = &Tbl_0[e << eshift]; 1.378 + e = Tbl[lsrc[1]]; 1.379 + Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range 1.380 + e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]]; 1.381 + Tbl = &Tbl[e << eshift]; // Relative +/- 1.382 + e = Tbl[lsrc[3]]; 1.383 + } 1.384 + return e; 1.385 +} 1.386 + 1.387 + 1.388 +// TwoByte versions are needed for tables > 240 states 1.389 +// Look up property of one UTF-8 character and advance over it 1.390 +// Return 0 if input length is zero 1.391 +// Return 0 and advance one byte if input is ill-formed 1.392 +uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st, 1.393 + const uint8** src, 1.394 + int* srclen) { 1.395 + if (*srclen <= 0) { 1.396 + return 0; 1.397 + } 1.398 + 1.399 + const uint8* lsrc = *src; 1.400 + const unsigned short* Tbl_0 = &st->state_table[st->state0]; 1.401 + const unsigned short* Tbl = Tbl_0; 1.402 + int e; 1.403 + int eshift = st->entry_shift; 1.404 + 1.405 + // Short series of tests faster than switch, optimizes 7-bit ASCII 1.406 + unsigned char c = lsrc[0]; 1.407 + if 
(static_cast<signed char>(c) >= 0) { // one byte 1.408 + e = Tbl[c]; 1.409 + *src += 1; 1.410 + *srclen -= 1; 1.411 + } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes 1.412 + e = Tbl[c]; 1.413 + Tbl = &Tbl_0[e << eshift]; 1.414 + e = Tbl[lsrc[1]]; 1.415 + *src += 2; 1.416 + *srclen -= 2; 1.417 + } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes 1.418 + e = Tbl[c]; 1.419 + Tbl = &Tbl_0[e << eshift]; 1.420 + e = Tbl[lsrc[1]]; 1.421 + Tbl = &Tbl_0[e << eshift]; 1.422 + e = Tbl[lsrc[2]]; 1.423 + *src += 3; 1.424 + *srclen -= 3; 1.425 + }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes 1.426 + e = Tbl[c]; 1.427 + Tbl = &Tbl_0[e << eshift]; 1.428 + e = Tbl[lsrc[1]]; 1.429 + Tbl = &Tbl_0[e << eshift]; 1.430 + e = Tbl[lsrc[2]]; 1.431 + Tbl = &Tbl_0[e << eshift]; 1.432 + e = Tbl[lsrc[3]]; 1.433 + *src += 4; 1.434 + *srclen -= 4; 1.435 + } else { // Ill-formed 1.436 + e = 0; 1.437 + *src += 1; 1.438 + *srclen -= 1; 1.439 + } 1.440 + return e; 1.441 +} 1.442 + 1.443 +// TwoByte versions are needed for tables > 240 states 1.444 +bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src) { 1.445 + const uint8* lsrc = reinterpret_cast<const uint8*>(src); 1.446 + const unsigned short* Tbl_0 = &st.state_table[st.state0]; 1.447 + const unsigned short* Tbl = Tbl_0; 1.448 + int e; 1.449 + int eshift = st.entry_shift; 1.450 + 1.451 + // Short series of tests faster than switch, optimizes 7-bit ASCII 1.452 + unsigned char c = lsrc[0]; 1.453 + if (static_cast<signed char>(c) >= 0) { // one byte 1.454 + e = Tbl[c]; 1.455 + } else if ((c & 0xe0) == 0xc0) { // two bytes 1.456 + e = Tbl[c]; 1.457 + Tbl = &Tbl_0[e << eshift]; 1.458 + e = Tbl[lsrc[1]]; 1.459 + } else if ((c & 0xf0) == 0xe0) { // three bytes 1.460 + e = Tbl[c]; 1.461 + Tbl = &Tbl_0[e << eshift]; 1.462 + e = Tbl[lsrc[1]]; 1.463 + Tbl = &Tbl_0[e << eshift]; 1.464 + e = Tbl[lsrc[2]]; 1.465 + } else { // four bytes 1.466 + e = Tbl[c]; 1.467 + Tbl = &Tbl_0[e << 
eshift]; 1.468 + e = Tbl[lsrc[1]]; 1.469 + Tbl = &Tbl_0[e << eshift]; 1.470 + e = Tbl[lsrc[2]]; 1.471 + Tbl = &Tbl_0[e << eshift]; 1.472 + e = Tbl[lsrc[3]]; 1.473 + } 1.474 + return e; 1.475 +} 1.476 + 1.477 + 1.478 +// Approximate speeds on 2.8 GHz Pentium 4: 1.479 +// GenericScan 1-byte loop 300 MB/sec * 1.480 +// GenericScan 4-byte loop 1200 MB/sec 1.481 +// GenericScan 8-byte loop 2400 MB/sec * 1.482 +// GenericScanFastAscii 4-byte loop 3000 MB/sec 1.483 +// GenericScanFastAscii 8-byte loop 3200 MB/sec * 1.484 +// 1.485 +// * Implemented below. FastAscii loop is memory-bandwidth constrained. 1.486 + 1.487 +// Scan a UTF-8 stringpiece based on state table. 1.488 +// Always scan complete UTF-8 characters 1.489 +// Set number of bytes scanned. Return reason for exiting 1.490 +int UTF8GenericScan(const UTF8ScanObj* st, 1.491 + const StringPiece& str, 1.492 + int* bytes_consumed) { 1.493 + int eshift = st->entry_shift; // 6 (space optimized) or 8 1.494 + // int nEntries = (1 << eshift); // 64 or 256 entries per state 1.495 + 1.496 + const uint8* isrc = 1.497 + reinterpret_cast<const uint8*>(str.data()); 1.498 + const uint8* src = isrc; 1.499 + const int len = str.length(); 1.500 + const uint8* srclimit = isrc + len; 1.501 + const uint8* srclimit8 = srclimit - 7; 1.502 + *bytes_consumed = 0; 1.503 + if (len == 0) return kExitOK; 1.504 + 1.505 + const uint8* Tbl_0 = &st->state_table[st->state0]; 1.506 + 1.507 +DoAgain: 1.508 + // Do state-table scan 1.509 + int e = 0; 1.510 + uint8 c; 1.511 + 1.512 + // Do fast for groups of 8 identity bytes. 
1.513 + // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop, 1.514 + // including slowing slightly on cr/lf/ht 1.515 + //---------------------------- 1.516 + const uint8* Tbl2 = &st->fast_state[0]; 1.517 + uint32 losub = st->losub; 1.518 + uint32 hiadd = st->hiadd; 1.519 + while (src < srclimit8) { 1.520 + uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0]; 1.521 + uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1]; 1.522 + src += 8; 1.523 + // This is a fast range check for all bytes in [lowsub..0x80-hiadd) 1.524 + uint32 temp = (s0123 - losub) | (s0123 + hiadd) | 1.525 + (s4567 - losub) | (s4567 + hiadd); 1.526 + if ((temp & 0x80808080) != 0) { 1.527 + // We typically end up here on cr/lf/ht; src was incremented 1.528 + int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) | 1.529 + (Tbl2[src[-6]] | Tbl2[src[-5]]); 1.530 + if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange 1.531 + e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) | 1.532 + (Tbl2[src[-2]] | Tbl2[src[-1]]); 1.533 + if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange 1.534 + // Else OK, go around again 1.535 + } 1.536 + } 1.537 + //---------------------------- 1.538 + 1.539 + // Byte-at-a-time scan 1.540 + //---------------------------- 1.541 + const uint8* Tbl = Tbl_0; 1.542 + while (src < srclimit) { 1.543 + c = *src; 1.544 + e = Tbl[c]; 1.545 + src++; 1.546 + if (e >= kExitIllegalStructure) {break;} 1.547 + Tbl = &Tbl_0[e << eshift]; 1.548 + } 1.549 + //---------------------------- 1.550 + 1.551 + 1.552 + // Exit possibilities: 1.553 + // Some exit code, !state0, back up over last char 1.554 + // Some exit code, state0, back up one byte exactly 1.555 + // source consumed, !state0, back up over partial char 1.556 + // source consumed, state0, exit OK 1.557 + // For illegal byte in state0, avoid backup up over PREVIOUS char 1.558 + // For truncated last char, back up to beginning of it 1.559 + 1.560 + if (e >= kExitIllegalStructure) { 1.561 + // Back up over exactly 
one byte of rejected/illegal UTF-8 character 1.562 + src--; 1.563 + // Back up more if needed 1.564 + if (!InStateZero(st, Tbl)) { 1.565 + do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 1.566 + } 1.567 + } else if (!InStateZero(st, Tbl)) { 1.568 + // Back up over truncated UTF-8 character 1.569 + e = kExitIllegalStructure; 1.570 + do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 1.571 + } else { 1.572 + // Normal termination, source fully consumed 1.573 + e = kExitOK; 1.574 + } 1.575 + 1.576 + if (e == kExitDoAgain) { 1.577 + // Loop back up to the fast scan 1.578 + goto DoAgain; 1.579 + } 1.580 + 1.581 + *bytes_consumed = src - isrc; 1.582 + return e; 1.583 +} 1.584 + 1.585 +// Scan a UTF-8 stringpiece based on state table. 1.586 +// Always scan complete UTF-8 characters 1.587 +// Set number of bytes scanned. Return reason for exiting 1.588 +// OPTIMIZED for case of 7-bit ASCII 0000..007f all valid 1.589 +int UTF8GenericScanFastAscii(const UTF8ScanObj* st, 1.590 + const StringPiece& str, 1.591 + int* bytes_consumed) { 1.592 + const uint8* isrc = 1.593 + reinterpret_cast<const uint8*>(str.data()); 1.594 + const uint8* src = isrc; 1.595 + const int len = str.length(); 1.596 + const uint8* srclimit = isrc + len; 1.597 + const uint8* srclimit8 = srclimit - 7; 1.598 + *bytes_consumed = 0; 1.599 + if (len == 0) return kExitOK; 1.600 + 1.601 + int n; 1.602 + int rest_consumed; 1.603 + int exit_reason; 1.604 + do { 1.605 + // Skip 8 bytes of ASCII at a whack; no endianness issue 1.606 + while ((src < srclimit8) && 1.607 + (((reinterpret_cast<const uint32*>(src)[0] | 1.608 + reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) { 1.609 + src += 8; 1.610 + } 1.611 + // Run state table on the rest 1.612 + n = src - isrc; 1.613 + StringPiece str2(str.data() + n, str.length() - n); 1.614 + exit_reason = UTF8GenericScan(st, str2, &rest_consumed); 1.615 + src += rest_consumed; 1.616 + } while ( exit_reason == kExitDoAgain ); 1.617 + 1.618 + 
*bytes_consumed = src - isrc; 1.619 + return exit_reason; 1.620 +} 1.621 + 1.622 +// Hack to change halfwidth katakana to match an old UTF8CharToLower() 1.623 + 1.624 +// Return number of src bytes skipped 1.625 +static int DoSpecialFixup(const unsigned char c, 1.626 + const unsigned char** srcp, const unsigned char* srclimit, 1.627 + unsigned char** dstp, unsigned char* dstlimit) { 1.628 + return 0; 1.629 +} 1.630 + 1.631 + 1.632 +// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece 1.633 +// and doing text replacements. 1.634 +// DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below 1.635 +// Needs caller to loop on kExitDoAgain 1.636 +static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st, 1.637 + const StringPiece& istr, 1.638 + StringPiece& ostr, 1.639 + bool is_plain_text, 1.640 + int* bytes_consumed, 1.641 + int* bytes_filled, 1.642 + int* chars_changed, 1.643 + OffsetMap* offsetmap) { 1.644 + int eshift = st->entry_shift; 1.645 + int nEntries = (1 << eshift); // 64 or 256 entries per state 1.646 + const uint8* isrc = reinterpret_cast<const uint8*>(istr.data()); 1.647 + const int ilen = istr.length(); 1.648 + const uint8* copystart = isrc; 1.649 + const uint8* src = isrc; 1.650 + const uint8* srclimit = src + ilen; 1.651 + *bytes_consumed = 0; 1.652 + *bytes_filled = 0; 1.653 + *chars_changed = 0; 1.654 + 1.655 + const uint8* odst = reinterpret_cast<const uint8*>(ostr.data()); 1.656 + const int olen = ostr.length(); 1.657 + uint8* dst = const_cast<uint8*>(odst); 1.658 + uint8* dstlimit = dst + olen; 1.659 + 1.660 + int total_changed = 0; 1.661 + 1.662 + // Invariant condition during replacements: 1.663 + // remaining dst size >= remaining src size 1.664 + if ((dstlimit - dst) < (srclimit - src)) { 1.665 + if (offsetmap != NULL) { 1.666 + offsetmap->Copy(src - copystart); 1.667 + copystart = src; 1.668 + } 1.669 + return kExitDstSpaceFull; 1.670 + } 1.671 + const uint8* Tbl_0 = &st->state_table[st->state0]; 1.672 + 1.673 + 
Do_state_table: 1.674 + // Do state-table scan, copying as we go 1.675 + const uint8* Tbl = Tbl_0; 1.676 + int e = 0; 1.677 + uint8 c = 0; 1.678 + 1.679 + Do_state_table_newe: 1.680 + 1.681 + //---------------------------- 1.682 + while (src < srclimit) { 1.683 + c = *src; 1.684 + e = Tbl[c]; 1.685 + *dst = c; 1.686 + src++; 1.687 + dst++; 1.688 + if (e >= kExitIllegalStructure) {break;} 1.689 + Tbl = &Tbl_0[e << eshift]; 1.690 + } 1.691 + //---------------------------- 1.692 + 1.693 + // Exit possibilities: 1.694 + // Replacement code, do the replacement and loop 1.695 + // Some other exit code, state0, back up one byte exactly 1.696 + // Some other exit code, !state0, back up over last char 1.697 + // source consumed, state0, exit OK 1.698 + // source consumed, !state0, back up over partial char 1.699 + // For illegal byte in state0, avoid backup up over PREVIOUS char 1.700 + // For truncated last char, back up to beginning of it 1.701 + 1.702 + if (e >= kExitIllegalStructure) { 1.703 + // Switch on exit code; most loop back to top 1.704 + int offset = 0; 1.705 + switch (e) { 1.706 + // These all make the output string the same size or shorter 1.707 + // No checking needed 1.708 + case kExitReplace31: // del 2, add 1 bytes to change 1.709 + dst -= 2; 1.710 + if (offsetmap != NULL) { 1.711 + offsetmap->Copy(src - copystart - 2); 1.712 + offsetmap->Delete(2); 1.713 + copystart = src; 1.714 + } 1.715 + dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; 1.716 + total_changed++; 1.717 + goto Do_state_table; 1.718 + case kExitReplace32: // del 3, add 2 bytes to change 1.719 + dst--; 1.720 + if (offsetmap != NULL) { 1.721 + offsetmap->Copy(src - copystart - 1); 1.722 + offsetmap->Delete(1); 1.723 + copystart = src; 1.724 + } 1.725 + dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)]; 1.726 + dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; 1.727 + total_changed++; 1.728 + goto Do_state_table; 1.729 + case kExitReplace21: // del 2, add 1 bytes to change 1.730 + dst--; 
1.731 + if (offsetmap != NULL) { 1.732 + offsetmap->Copy(src - copystart - 1); 1.733 + offsetmap->Delete(1); 1.734 + copystart = src; 1.735 + } 1.736 + dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; 1.737 + total_changed++; 1.738 + goto Do_state_table; 1.739 + case kExitReplace3: // update 3 bytes to change 1.740 + dst[-3] = (unsigned char)Tbl[c + (nEntries * 3)]; 1.741 + // Fall into next case 1.742 + case kExitReplace2: // update 2 bytes to change 1.743 + dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)]; 1.744 + // Fall into next case 1.745 + case kExitReplace1: // update 1 byte to change 1.746 + dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; 1.747 + total_changed++; 1.748 + goto Do_state_table; 1.749 + case kExitReplace1S0: // update 1 byte to change, 256-entry state 1.750 + dst[-1] = (unsigned char)Tbl[c + (256 * 1)]; 1.751 + total_changed++; 1.752 + goto Do_state_table; 1.753 + // These can make the output string longer than the input 1.754 + case kExitReplaceOffset2: 1.755 + if ((nEntries != 256) && InStateZero(st, Tbl)) { 1.756 + // For space-optimized table, we need multiples of 256 bytes 1.757 + // in state0 and multiples of nEntries in other states 1.758 + offset += ((unsigned char)Tbl[c + (256 * 2)] << 8); 1.759 + } else { 1.760 + offset += ((unsigned char)Tbl[c + (nEntries * 2)] << 8); 1.761 + } 1.762 + // Fall into next case 1.763 + case kExitSpecial: // Apply special fixups [read: hacks] 1.764 + case kExitReplaceOffset1: 1.765 + if ((nEntries != 256) && InStateZero(st, Tbl)) { 1.766 + // For space-optimized table, we need multiples of 256 bytes 1.767 + // in state0 and multiples of nEntries in other states 1.768 + offset += (unsigned char)Tbl[c + (256 * 1)]; 1.769 + } else { 1.770 + offset += (unsigned char)Tbl[c + (nEntries * 1)]; 1.771 + } 1.772 + { 1.773 + const RemapEntry* re = &st->remap_base[offset]; 1.774 + int del_len = re->delete_bytes & ~kReplaceAndResumeFlag; 1.775 + int add_len = re->add_bytes & ~kHtmlPlaintextFlag; 1.776 + 1.777 + 
// Special-case non-HTML replacement of five sensitive entities 1.778 + // " & ' < > 1.779 + // 0022 0026 0027 003c 003e 1.780 + // A replacement creating one of these is expressed as a pair of 1.781 + // entries, one for HTML output and one for plaintext output. 1.782 + // The first of the pair has the high bit of add_bytes set. 1.783 + if (re->add_bytes & kHtmlPlaintextFlag) { 1.784 + // Use this entry for plain text 1.785 + if (!is_plain_text) { 1.786 + // Use very next entry for HTML text (same back/delete length) 1.787 + re = &st->remap_base[offset + 1]; 1.788 + add_len = re->add_bytes & ~kHtmlPlaintextFlag; 1.789 + } 1.790 + } 1.791 + 1.792 + int string_offset = re->bytes_offset; 1.793 + // After the replacement, need (dstlimit - newdst) >= (srclimit - src) 1.794 + uint8* newdst = dst - del_len + add_len; 1.795 + if ((dstlimit - newdst) < (srclimit - src)) { 1.796 + // Won't fit; don't do the replacement. Caller may realloc and retry 1.797 + e = kExitDstSpaceFull; 1.798 + break; // exit, backing up over this char for later retry 1.799 + } 1.800 + dst -= del_len; 1.801 + memcpy(dst, &st->remap_string[string_offset], add_len); 1.802 + dst += add_len; 1.803 + total_changed++; 1.804 + if (offsetmap != NULL) { 1.805 + if (add_len > del_len) { 1.806 + offsetmap->Copy(src - copystart); 1.807 + offsetmap->Insert(add_len - del_len); 1.808 + copystart = src; 1.809 + } else if (add_len < del_len) { 1.810 + offsetmap->Copy(src - copystart + add_len - del_len); 1.811 + offsetmap->Delete(del_len - add_len); 1.812 + copystart = src; 1.813 + } 1.814 + } 1.815 + if (re->delete_bytes & kReplaceAndResumeFlag) { 1.816 + // There is a non-zero target state at the end of the 1.817 + // replacement string 1.818 + e = st->remap_string[string_offset + add_len]; 1.819 + Tbl = &Tbl_0[e << eshift]; 1.820 + goto Do_state_table_newe; 1.821 + } 1.822 + } 1.823 + if (e == kExitRejectAlt) {break;} 1.824 + if (e != kExitSpecial) {goto Do_state_table;} 1.825 + 1.826 + // case kExitSpecial: // 
Apply special fixups [read: hacks] 1.827 + // In this routine, do either UTF8CharToLower() 1.828 + // fullwidth/halfwidth mapping or 1.829 + // voiced mapping or 1.830 + // semi-voiced mapping 1.831 + 1.832 + // First, do EXIT_REPLACE_OFFSET1 action (above) 1.833 + // Second: do additional code fixup 1.834 + { 1.835 + int srcdel = DoSpecialFixup(c, &src, srclimit, &dst, dstlimit); 1.836 + if (offsetmap != NULL) { 1.837 + if (srcdel != 0) { 1.838 + offsetmap->Copy(src - copystart - srcdel); 1.839 + offsetmap->Delete(srcdel); 1.840 + copystart = src; 1.841 + } 1.842 + } 1.843 + } 1.844 + goto Do_state_table; 1.845 + 1.846 + case kExitIllegalStructure: // structurally illegal byte; quit 1.847 + case kExitReject: // NUL or illegal code encountered; quit 1.848 + case kExitRejectAlt: // Apply replacement, then exit 1.849 + default: // and all other exits 1.850 + break; 1.851 + } // End switch (e) 1.852 + 1.853 + // Exit possibilities: 1.854 + // Some other exit code, state0, back up one byte exactly 1.855 + // Some other exit code, !state0, back up over last char 1.856 + 1.857 + // Back up over exactly one byte of rejected/illegal UTF-8 character 1.858 + src--; 1.859 + dst--; 1.860 + // Back up more if needed 1.861 + if (!InStateZero(st, Tbl)) { 1.862 + do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 1.863 + } 1.864 + } else if (!InStateZero(st, Tbl)) { 1.865 + // src >= srclimit, !state0 1.866 + // Back up over truncated UTF-8 character 1.867 + e = kExitIllegalStructure; 1.868 + do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 1.869 + } else { 1.870 + // src >= srclimit, state0 1.871 + // Normal termination, source fully consumed 1.872 + e = kExitOK; 1.873 + } 1.874 + 1.875 + if (offsetmap != NULL) { 1.876 + if (src > copystart) { 1.877 + offsetmap->Copy(src - copystart); 1.878 + copystart = src; 1.879 + } 1.880 + } 1.881 + 1.882 + // Possible return values here: 1.883 + // kExitDstSpaceFull caller may realloc and retry from 
middle 1.884 + // kExitIllegalStructure caller my overwrite/truncate 1.885 + // kExitOK all done and happy 1.886 + // kExitReject caller may overwrite/truncate 1.887 + // kExitDoAgain LOOP NOT DONE; caller must retry from middle 1.888 + // (may do fast ASCII loop first) 1.889 + // kExitPlaceholder -unused- 1.890 + // kExitNone -unused- 1.891 + *bytes_consumed = src - isrc; 1.892 + *bytes_filled = dst - odst; 1.893 + *chars_changed = total_changed; 1.894 + return e; 1.895 +} 1.896 + 1.897 +// TwoByte versions are needed for tables > 240 states, such 1.898 +// as the table for full Unicode 4.1 canonical + compatibility mapping 1.899 + 1.900 +// Scan a UTF-8 stringpiece based on state table with two-byte entries, 1.901 +// copying to output stringpiece 1.902 +// and doing text replacements. 1.903 +// DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below 1.904 +// Needs caller to loop on kExitDoAgain 1.905 +static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st, 1.906 + const StringPiece& istr, 1.907 + StringPiece& ostr, 1.908 + bool is_plain_text, 1.909 + int* bytes_consumed, 1.910 + int* bytes_filled, 1.911 + int* chars_changed, 1.912 + OffsetMap* offsetmap) { 1.913 + int eshift = st->entry_shift; 1.914 + int nEntries = (1 << eshift); // 64 or 256 entries per state 1.915 + const uint8* isrc = reinterpret_cast<const uint8*>(istr.data()); 1.916 + const int ilen = istr.length(); 1.917 + const uint8* copystart = isrc; 1.918 + const uint8* src = isrc; 1.919 + const uint8* srclimit = src + ilen; 1.920 + *bytes_consumed = 0; 1.921 + *bytes_filled = 0; 1.922 + *chars_changed = 0; 1.923 + 1.924 + const uint8* odst = reinterpret_cast<const uint8*>(ostr.data()); 1.925 + const int olen = ostr.length(); 1.926 + uint8* dst = const_cast<uint8*>(odst); 1.927 + uint8* dstlimit = dst + olen; 1.928 + 1.929 + *chars_changed = 0; 1.930 + 1.931 + int total_changed = 0; 1.932 + 1.933 + int src_lll = srclimit - src; 1.934 + int dst_lll = dstlimit - dst; 1.935 + 1.936 + 1.937 
+ // Invariant condition during replacements: 1.938 + // remaining dst size >= remaining src size 1.939 + if ((dstlimit - dst) < (srclimit - src)) { 1.940 + if (offsetmap != NULL) { 1.941 + offsetmap->Copy(src - copystart); 1.942 + copystart = src; 1.943 + } 1.944 + return kExitDstSpaceFull_2; 1.945 + } 1.946 + const unsigned short* Tbl_0 = &st->state_table[st->state0]; 1.947 + 1.948 + Do_state_table_2: 1.949 + // Do state-table scan, copying as we go 1.950 + const unsigned short* Tbl = Tbl_0; 1.951 + int e = 0; 1.952 + uint8 c = 0; 1.953 + 1.954 + Do_state_table_newe_2: 1.955 + 1.956 + //---------------------------- 1.957 + while (src < srclimit) { 1.958 + c = *src; 1.959 + e = Tbl[c]; 1.960 + *dst = c; 1.961 + src++; 1.962 + dst++; 1.963 + if (e >= kExitIllegalStructure_2) {break;} 1.964 + Tbl = &Tbl_0[e << eshift]; 1.965 + } 1.966 + //---------------------------- 1.967 + src_lll = src - isrc; 1.968 + dst_lll = dst - odst; 1.969 + 1.970 + // Exit possibilities: 1.971 + // Replacement code, do the replacement and loop 1.972 + // Some other exit code, state0, back up one byte exactly 1.973 + // Some other exit code, !state0, back up over last char 1.974 + // source consumed, state0, exit OK 1.975 + // source consumed, !state0, back up over partial char 1.976 + // For illegal byte in state0, avoid backup up over PREVIOUS char 1.977 + // For truncated last char, back up to beginning of it 1.978 + 1.979 + if (e >= kExitIllegalStructure_2) { 1.980 + // Switch on exit code; most loop back to top 1.981 + int offset = 0; 1.982 + switch (e) { 1.983 + // These all make the output string the same size or shorter 1.984 + // No checking needed 1.985 + case kExitReplace31_2: // del 2, add 1 bytes to change 1.986 + dst -= 2; 1.987 + if (offsetmap != NULL) { 1.988 + offsetmap->Copy(src - copystart - 2); 1.989 + offsetmap->Delete(2); 1.990 + copystart = src; 1.991 + } 1.992 + dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); 1.993 + total_changed++; 1.994 + goto 
Do_state_table_2; 1.995 + case kExitReplace32_2: // del 3, add 2 bytes to change 1.996 + dst--; 1.997 + if (offsetmap != NULL) { 1.998 + offsetmap->Copy(src - copystart - 1); 1.999 + offsetmap->Delete(1); 1.1000 + copystart = src; 1.1001 + } 1.1002 + dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff); 1.1003 + dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); 1.1004 + total_changed++; 1.1005 + goto Do_state_table_2; 1.1006 + case kExitReplace21_2: // del 2, add 1 bytes to change 1.1007 + dst--; 1.1008 + if (offsetmap != NULL) { 1.1009 + offsetmap->Copy(src - copystart - 1); 1.1010 + offsetmap->Delete(1); 1.1011 + copystart = src; 1.1012 + } 1.1013 + dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); 1.1014 + total_changed++; 1.1015 + goto Do_state_table_2; 1.1016 + case kExitReplace3_2: // update 3 bytes to change 1.1017 + dst[-3] = (unsigned char)(Tbl[c + (nEntries * 2)] & 0xff); 1.1018 + // Fall into next case 1.1019 + case kExitReplace2_2: // update 2 bytes to change 1.1020 + dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff); 1.1021 + // Fall into next case 1.1022 + case kExitReplace1_2: // update 1 byte to change 1.1023 + dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); 1.1024 + total_changed++; 1.1025 + goto Do_state_table_2; 1.1026 + case kExitReplace1S0_2: // update 1 byte to change, 256-entry state 1.1027 + dst[-1] = (unsigned char)(Tbl[c + (256 * 1)] & 0xff); 1.1028 + total_changed++; 1.1029 + goto Do_state_table_2; 1.1030 + // These can make the output string longer than the input 1.1031 + case kExitReplaceOffset2_2: 1.1032 + if ((nEntries != 256) && InStateZero_2(st, Tbl)) { 1.1033 + // For space-optimized table, we need multiples of 256 bytes 1.1034 + // in state0 and multiples of nEntries in other states 1.1035 + offset += ((unsigned char)(Tbl[c + (256 * 1)] >> 8 & 0xff) << 8); 1.1036 + } else { 1.1037 + offset += ((unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff) << 8); 1.1038 + } 1.1039 + // Fall 
into next case 1.1040 + case kExitReplaceOffset1_2: 1.1041 + if ((nEntries != 256) && InStateZero_2(st, Tbl)) { 1.1042 + // For space-optimized table, we need multiples of 256 bytes 1.1043 + // in state0 and multiples of nEntries in other states 1.1044 + offset += (unsigned char)(Tbl[c + (256 * 1)] & 0xff); 1.1045 + } else { 1.1046 + offset += (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); 1.1047 + } 1.1048 + { 1.1049 + const RemapEntry* re = &st->remap_base[offset]; 1.1050 + int del_len = re->delete_bytes & ~kReplaceAndResumeFlag; 1.1051 + int add_len = re->add_bytes & ~kHtmlPlaintextFlag; 1.1052 + // Special-case non-HTML replacement of five sensitive entities 1.1053 + // " & ' < > 1.1054 + // 0022 0026 0027 003c 003e 1.1055 + // A replacement creating one of these is expressed as a pair of 1.1056 + // entries, one for HTML output and one for plaintext output. 1.1057 + // The first of the pair has the high bit of add_bytes set. 1.1058 + if (re->add_bytes & kHtmlPlaintextFlag) { 1.1059 + // Use this entry for plain text 1.1060 + if (!is_plain_text) { 1.1061 + // Use very next entry for HTML text (same back/delete length) 1.1062 + re = &st->remap_base[offset + 1]; 1.1063 + add_len = re->add_bytes & ~kHtmlPlaintextFlag; 1.1064 + } 1.1065 + } 1.1066 + 1.1067 + // After the replacement, need (dstlimit - dst) >= (srclimit - src) 1.1068 + int string_offset = re->bytes_offset; 1.1069 + // After the replacement, need (dstlimit - newdst) >= (srclimit - src) 1.1070 + uint8* newdst = dst - del_len + add_len; 1.1071 + if ((dstlimit - newdst) < (srclimit - src)) { 1.1072 + // Won't fit; don't do the replacement. 
Caller may realloc and retry 1.1073 + e = kExitDstSpaceFull_2; 1.1074 + break; // exit, backing up over this char for later retry 1.1075 + } 1.1076 + dst -= del_len; 1.1077 + memcpy(dst, &st->remap_string[string_offset], add_len); 1.1078 + dst += add_len; 1.1079 + if (offsetmap != NULL) { 1.1080 + if (add_len > del_len) { 1.1081 + offsetmap->Copy(src - copystart); 1.1082 + offsetmap->Insert(add_len - del_len); 1.1083 + copystart = src; 1.1084 + } else if (add_len < del_len) { 1.1085 + offsetmap->Copy(src - copystart + add_len - del_len); 1.1086 + offsetmap->Delete(del_len - add_len); 1.1087 + copystart = src; 1.1088 + } 1.1089 + } 1.1090 + if (re->delete_bytes & kReplaceAndResumeFlag) { 1.1091 + // There is a two-byte non-zero target state at the end of the 1.1092 + // replacement string 1.1093 + uint8 c1 = st->remap_string[string_offset + add_len]; 1.1094 + uint8 c2 = st->remap_string[string_offset + add_len + 1]; 1.1095 + e = (c1 << 8) | c2; 1.1096 + Tbl = &Tbl_0[e << eshift]; 1.1097 + total_changed++; 1.1098 + goto Do_state_table_newe_2; 1.1099 + } 1.1100 + } 1.1101 + total_changed++; 1.1102 + if (e == kExitRejectAlt_2) {break;} 1.1103 + goto Do_state_table_2; 1.1104 + 1.1105 + case kExitSpecial_2: // NO special fixups [read: hacks] 1.1106 + case kExitIllegalStructure_2: // structurally illegal byte; quit 1.1107 + case kExitReject_2: // NUL or illegal code encountered; quit 1.1108 + // and all other exits 1.1109 + default: 1.1110 + break; 1.1111 + } // End switch (e) 1.1112 + 1.1113 + // Exit possibilities: 1.1114 + // Some other exit code, state0, back up one byte exactly 1.1115 + // Some other exit code, !state0, back up over last char 1.1116 + 1.1117 + // Back up over exactly one byte of rejected/illegal UTF-8 character 1.1118 + src--; 1.1119 + dst--; 1.1120 + // Back up more if needed 1.1121 + if (!InStateZero_2(st, Tbl)) { 1.1122 + do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 1.1123 + } 1.1124 + } else if (!InStateZero_2(st, Tbl)) { 
1.1125 + // src >= srclimit, !state0 1.1126 + // Back up over truncated UTF-8 character 1.1127 + e = kExitIllegalStructure_2; 1.1128 + 1.1129 + do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); 1.1130 + } else { 1.1131 + // src >= srclimit, state0 1.1132 + // Normal termination, source fully consumed 1.1133 + e = kExitOK_2; 1.1134 + } 1.1135 + 1.1136 + if (offsetmap != NULL) { 1.1137 + if (src > copystart) { 1.1138 + offsetmap->Copy(src - copystart); 1.1139 + copystart = src; 1.1140 + } 1.1141 + } 1.1142 + 1.1143 + 1.1144 + // Possible return values here: 1.1145 + // kExitDstSpaceFull_2 caller may realloc and retry from middle 1.1146 + // kExitIllegalStructure_2 caller my overwrite/truncate 1.1147 + // kExitOK_2 all done and happy 1.1148 + // kExitReject_2 caller may overwrite/truncate 1.1149 + // kExitDoAgain_2 LOOP NOT DONE; caller must retry from middle 1.1150 + // (may do fast ASCII loop first) 1.1151 + // kExitPlaceholder_2 -unused- 1.1152 + // kExitNone_2 -unused- 1.1153 + *bytes_consumed = src - isrc; 1.1154 + *bytes_filled = dst - odst; 1.1155 + *chars_changed = total_changed; 1.1156 + return e; 1.1157 +} 1.1158 + 1.1159 + 1.1160 +// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece 1.1161 +// and doing text replacements. 1.1162 +// Also writes an optional OffsetMap. Pass NULL to skip writing one. 1.1163 +// Always scan complete UTF-8 characters 1.1164 +// Set number of bytes consumed from input, number filled to output. 
1.1165 +// Return reason for exiting 1.1166 +int UTF8GenericReplace(const UTF8ReplaceObj* st, 1.1167 + const StringPiece& istr, 1.1168 + StringPiece& ostr, 1.1169 + bool is_plain_text, 1.1170 + int* bytes_consumed, 1.1171 + int* bytes_filled, 1.1172 + int* chars_changed, 1.1173 + OffsetMap* offsetmap) { 1.1174 + StringPiece local_istr(istr.data(), istr.length()); 1.1175 + StringPiece local_ostr(ostr.data(), ostr.length()); 1.1176 + int total_consumed = 0; 1.1177 + int total_filled = 0; 1.1178 + int total_changed = 0; 1.1179 + int local_bytes_consumed, local_bytes_filled, local_chars_changed; 1.1180 + int e; 1.1181 + do { 1.1182 + e = UTF8GenericReplaceInternal(st, 1.1183 + local_istr, local_ostr, is_plain_text, 1.1184 + &local_bytes_consumed, &local_bytes_filled, 1.1185 + &local_chars_changed, 1.1186 + offsetmap); 1.1187 + local_istr.remove_prefix(local_bytes_consumed); 1.1188 + local_ostr.remove_prefix(local_bytes_filled); 1.1189 + total_consumed += local_bytes_consumed; 1.1190 + total_filled += local_bytes_filled; 1.1191 + total_changed += local_chars_changed; 1.1192 + } while ( e == kExitDoAgain ); 1.1193 + *bytes_consumed = total_consumed; 1.1194 + *bytes_filled = total_filled; 1.1195 + *chars_changed = total_changed; 1.1196 + return e; 1.1197 +} 1.1198 + 1.1199 +// Older version without offsetmap 1.1200 +int UTF8GenericReplace(const UTF8ReplaceObj* st, 1.1201 + const StringPiece& istr, 1.1202 + StringPiece& ostr, 1.1203 + bool is_plain_text, 1.1204 + int* bytes_consumed, 1.1205 + int* bytes_filled, 1.1206 + int* chars_changed) { 1.1207 + return UTF8GenericReplace(st, 1.1208 + istr, 1.1209 + ostr, 1.1210 + is_plain_text, 1.1211 + bytes_consumed, 1.1212 + bytes_filled, 1.1213 + chars_changed, 1.1214 + NULL); 1.1215 +} 1.1216 + 1.1217 +// Older version without is_plain_text or offsetmap 1.1218 +int UTF8GenericReplace(const UTF8ReplaceObj* st, 1.1219 + const StringPiece& istr, 1.1220 + StringPiece& ostr, 1.1221 + int* bytes_consumed, 1.1222 + int* bytes_filled, 
1.1223 + int* chars_changed) { 1.1224 + bool is_plain_text = false; 1.1225 + return UTF8GenericReplace(st, 1.1226 + istr, 1.1227 + ostr, 1.1228 + is_plain_text, 1.1229 + bytes_consumed, 1.1230 + bytes_filled, 1.1231 + chars_changed, 1.1232 + NULL); 1.1233 +} 1.1234 + 1.1235 +// Scan a UTF-8 stringpiece based on state table with two-byte entries, 1.1236 +// copying to output stringpiece 1.1237 +// and doing text replacements. 1.1238 +// Also writes an optional OffsetMap. Pass NULL to skip writing one. 1.1239 +// Always scan complete UTF-8 characters 1.1240 +// Set number of bytes consumed from input, number filled to output. 1.1241 +// Return reason for exiting 1.1242 +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, 1.1243 + const StringPiece& istr, 1.1244 + StringPiece& ostr, 1.1245 + bool is_plain_text, 1.1246 + int* bytes_consumed, 1.1247 + int* bytes_filled, 1.1248 + int* chars_changed, 1.1249 + OffsetMap* offsetmap) { 1.1250 + StringPiece local_istr(istr.data(), istr.length()); 1.1251 + StringPiece local_ostr(ostr.data(), ostr.length()); 1.1252 + int total_consumed = 0; 1.1253 + int total_filled = 0; 1.1254 + int total_changed = 0; 1.1255 + int local_bytes_consumed, local_bytes_filled, local_chars_changed; 1.1256 + int e; 1.1257 + do { 1.1258 + e = UTF8GenericReplaceInternalTwoByte(st, 1.1259 + local_istr, local_ostr, is_plain_text, 1.1260 + &local_bytes_consumed, 1.1261 + &local_bytes_filled, 1.1262 + &local_chars_changed, 1.1263 + offsetmap); 1.1264 + local_istr.remove_prefix(local_bytes_consumed); 1.1265 + local_ostr.remove_prefix(local_bytes_filled); 1.1266 + total_consumed += local_bytes_consumed; 1.1267 + total_filled += local_bytes_filled; 1.1268 + total_changed += local_chars_changed; 1.1269 + } while ( e == kExitDoAgain_2 ); 1.1270 + *bytes_consumed = total_consumed; 1.1271 + *bytes_filled = total_filled; 1.1272 + *chars_changed = total_changed; 1.1273 + 1.1274 + return e - kExitOK_2 + kExitOK; 1.1275 +} 1.1276 + 1.1277 +// Older version 
without offsetmap 1.1278 +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, 1.1279 + const StringPiece& istr, 1.1280 + StringPiece& ostr, 1.1281 + bool is_plain_text, 1.1282 + int* bytes_consumed, 1.1283 + int* bytes_filled, 1.1284 + int* chars_changed) { 1.1285 + return UTF8GenericReplaceTwoByte(st, 1.1286 + istr, 1.1287 + ostr, 1.1288 + is_plain_text, 1.1289 + bytes_consumed, 1.1290 + bytes_filled, 1.1291 + chars_changed, 1.1292 + NULL); 1.1293 +} 1.1294 + 1.1295 +// Older version without is_plain_text or offsetmap 1.1296 +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, 1.1297 + const StringPiece& istr, 1.1298 + StringPiece& ostr, 1.1299 + int* bytes_consumed, 1.1300 + int* bytes_filled, 1.1301 + int* chars_changed) { 1.1302 + bool is_plain_text = false; 1.1303 + return UTF8GenericReplaceTwoByte(st, 1.1304 + istr, 1.1305 + ostr, 1.1306 + is_plain_text, 1.1307 + bytes_consumed, 1.1308 + bytes_filled, 1.1309 + chars_changed, 1.1310 + NULL); 1.1311 +} 1.1312 + 1.1313 + 1.1314 + 1.1315 +// Adjust a stringpiece to encompass complete UTF-8 characters. 1.1316 +// The data pointer will be increased by 0..3 bytes to get to a character 1.1317 +// boundary, and the length will then be decreased by 0..3 bytes 1.1318 +// to encompass the last complete character. 
1.1319 +void UTF8TrimToChars(StringPiece* istr) { 1.1320 + const char* src = istr->data(); 1.1321 + int len = istr->length(); 1.1322 + // Exit if empty string 1.1323 + if (len == 0) { 1.1324 + return; 1.1325 + } 1.1326 + 1.1327 + // Exit on simple, common case 1.1328 + if ( ((src[0] & 0xc0) != 0x80) && 1.1329 + (static_cast<signed char>(src[len - 1]) >= 0) ) { 1.1330 + // First byte is not a continuation and last byte is 7-bit ASCII -- done 1.1331 + return; 1.1332 + } 1.1333 + 1.1334 + // Adjust the back end, len > 0 1.1335 + const char* srclimit = src + len; 1.1336 + // Backscan over any ending continuation bytes to find last char start 1.1337 + const char* s = srclimit - 1; // Last byte of the string 1.1338 + while ((src <= s) && ((*s & 0xc0) == 0x80)) { 1.1339 + s--; 1.1340 + } 1.1341 + // Include entire last char if it fits 1.1342 + if (src <= s) { 1.1343 + int last_char_len = UTF8OneCharLen(s); 1.1344 + if (s + last_char_len <= srclimit) { 1.1345 + // Last char fits, so include it, else exclude it 1.1346 + s += last_char_len; 1.1347 + } 1.1348 + } 1.1349 + if (s != srclimit) { 1.1350 + // s is one byte beyond the last full character, if any 1.1351 + istr->remove_suffix(srclimit - s); 1.1352 + // Exit if now empty string 1.1353 + if (istr->length() == 0) { 1.1354 + return; 1.1355 + } 1.1356 + } 1.1357 + 1.1358 + // Adjust the front end, len > 0 1.1359 + len = istr->length(); 1.1360 + srclimit = src + len; 1.1361 + s = src; // First byte of the string 1.1362 + // Scan over any beginning continuation bytes to find first char start 1.1363 + while ((s < srclimit) && ((*s & 0xc0) == 0x80)) { 1.1364 + s++; 1.1365 + } 1.1366 + if (s != src) { 1.1367 + // s is at the first full character, if any 1.1368 + istr->remove_prefix(s - src); 1.1369 + } 1.1370 +} 1.1371 + 1.1372 +} // End namespace CLD2