browser/components/translation/cld2/internal/utf8statetable.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // State Table follower for scanning UTF-8 strings without converting to
michael@0 17 // 32- or 16-bit Unicode values.
michael@0 18 //
michael@0 19
michael@0 20 #ifdef COMPILER_MSVC
michael@0 21 // MSVC warns: warning C4309: 'initializing' : truncation of constant value
michael@0 22 // But the value is in fact not truncated. 0xFF still comes out 0xFF at
michael@0 23 // runtime.
michael@0 24 #pragma warning ( disable : 4309 )
michael@0 25 #endif
michael@0 26
michael@0 27 #include "utf8statetable.h"
michael@0 28
michael@0 29 #include <stdint.h> // for uintptr_t
michael@0 30 #include <string.h> // for NULL, memcpy, memmove
michael@0 31
michael@0 32 #include "integral_types.h" // for uint8, uint32, int8
michael@0 33 #include "stringpiece.h"
michael@0 34 #include "offsetmap.h"
michael@0 35
michael@0 36
michael@0 37 namespace CLD2 {
michael@0 38
michael@0 39 static const int kReplaceAndResumeFlag = 0x80; // Bit in del byte to distinguish
michael@0 40 // optional next-state field
michael@0 41 // after replacement text
michael@0 42 static const int kHtmlPlaintextFlag = 0x80; // Bit in add byte to distinguish
michael@0 43 // HTML replacement vs. plaintext
michael@0 44
michael@0 45
michael@0 46 /**
michael@0 47 * This code implements a little interpreter for UTF8 state
michael@0 48 * tables. There are three kinds of quite-similar state tables,
michael@0 49 * property, scanning, and replacement. Each state in one of
michael@0 50 * these tables consists of an array of 256 or 64 one-byte
michael@0 51 * entries. The state is subscripted by an incoming source byte,
michael@0 52 * and the entry either specifies the next state or specifies an
michael@0 53 * action. Space-optimized tables have full 256-entry states for
michael@0 54 * the first byte of a UTF-8 character, but only 64-entry states
michael@0 55 * for continuation bytes. Space-optimized tables may only be
michael@0 56 * used with source input that has been checked to be
michael@0 57 * structurally- (or stronger interchange-) valid.
michael@0 58 *
michael@0 59 * A property state table has an unsigned one-byte property for
michael@0 60 * each possible UTF-8 character. One-byte character properties
michael@0 61 * are in the state[0] array, while for other lengths the
michael@0 62 * state[0] array gives the next state, which contains the
michael@0 63 * property value for two-byte characters or yet another state
michael@0 64 * for longer ones. The code simply loads the right number of
michael@0 65 * next-state values, then returns the final byte as property
michael@0 66 * value. There are no actions specified in property tables.
michael@0 67 * States are typically shared for multi-byte UTF-8 characters
michael@0 68 * that all have the same property value.
michael@0 69 *
michael@0 70 * A scanning state table has entries that are either a
michael@0 71 * next-state specifier for bytes that are accepted by the
michael@0 72 * scanner, or an exit action for the last byte of each
michael@0 73 * character that is rejected by the scanner.
michael@0 74 *
michael@0 75 * Scanning long strings involves a tight loop that picks up one
michael@0 76 * byte at a time and follows next-state value back to state[0]
michael@0 77 * for each accepted UTF-8 character. Scanning stops at the end
michael@0 78 * of the string or at the first character encountered that has
michael@0 79 * an exit action such as "reject". Timing information is given
michael@0 80 * below.
michael@0 81 *
michael@0 82 * Since so much of Google's text is 7-bit-ASCII values
michael@0 83 * (approximately 94% of the bytes of web documents), the
michael@0 84 * scanning interpreter has two speed optimizations. One checks
michael@0 85 * 8 bytes at a time to see if they are all in the range lo..hi,
michael@0 86 * as specified in constants in the overall statetable object.
michael@0 87 * The check involves ORing together four 4-byte values that
michael@0 88 * overflow into the high bit of some byte when a byte is out of
michael@0 89 * range. For seven-bit-ASCII, lo is 0x20 and hi is 0x7E. This
michael@0 90 * loop is about 8x faster than the one-byte-at-a-time loop.
michael@0 91 *
michael@0 92 * If checking for exit bytes in the 0x00-0x1F and 7F range is
michael@0 93 * unneeded, an even faster loop just looks at the high bits of
michael@0 94 * 8 bytes at once, and is about 1.33x faster than the lo..hi
michael@0 95 * loop.
michael@0 96 *
michael@0 97 * Exit from the scanning routines backs up to the first byte of
michael@0 98 * the rejected character, so the text spanned is always a
michael@0 99 * complete number of UTF-8 characters. The normal scanning exit
michael@0 100 * is at the first rejected character, or at the end of the
michael@0 101 * input text. Scanning also exits on any detected ill-formed
michael@0 102 * character or at a special do-again action built into some
michael@0 103 * exit-optimized tables. The do-again action gets back to the
michael@0 104 * top of the scanning loop to retry eight-byte ASCII scans. It
michael@0 105 * is typically put into state tables after four seven-bit-ASCII
michael@0 106 * characters in a row are seen, to allow restarting the fast
michael@0 107 * scan after some slower processing of multi-byte characters.
michael@0 108 *
michael@0 109 * A replacement state table is similar to a scanning state
michael@0 110 * table but has more extensive actions. The default
michael@0 111 * byte-at-a-time loop copies one byte from source to
michael@0 112 * destination and goes to the next state. The replacement
michael@0 113 * actions overwrite 1-3 bytes of the destination with different
michael@0 114 * bytes, possibly shortening the output by 1 or 2 bytes. The
michael@0 115 * replacement bytes come from within the state table, from
michael@0 116 * dummy states inserted just after any state that contains a
michael@0 117 * replacement action. This gives a quick address calculation for
michael@0 118 * the replacement byte(s) and gives some cache locality.
michael@0 119 *
michael@0 120 * Additional replacement actions use one or two bytes from
michael@0 121 * within dummy states to index a side table of more-extensive
michael@0 122 * replacements. The side table specifies a length of 0..15
michael@0 123 * destination bytes to overwrite and a length of 0..127 bytes
michael@0 124 * to overwrite them with, plus the actual replacement bytes.
michael@0 125 *
michael@0 126 * This side table uses one extra bit to specify a pair of
michael@0 127 * replacements, the first to be used in an HTML context and the
michael@0 128 * second to be used in a plaintext context. This allows
michael@0 129 * replacements that are spelled with "&lt;" in the former
michael@0 130 * context and "<" in the latter.
michael@0 131 *
michael@0 132 * The side table also uses an extra bit to specify a non-zero
michael@0 133 * next state after a replacement. This allows a combination
michael@0 134 * replacement and state change, used to implement a limited
michael@0 135 * version of the Boyer-Moore algorithm for multi-character
michael@0 136 * replacement without backtracking. This is useful when there
michael@0 137 * are overlapping replacements, such as ch => x and also c =>
michael@0 138 * y, the latter to be used only if the character after c is not
michael@0 139 * h. in this case, the state[0] table's entry for c would
michael@0 140 * change c to y and also have a next-state of say n, and the
michael@0 141 * state[n] entry for h would specify a replacement of the two
michael@0 142 * bytes yh by x. No backtracking is needed.
michael@0 143 *
michael@0 144 * A replacement table may also include the exit actions of a
michael@0 145 * scanning state table, so some character sequences can
michael@0 146 * terminate early.
michael@0 147 *
michael@0 148 * During replacement, an optional data structure called an
michael@0 149 * offset map can be updated to reflect each change in length
michael@0 150 * between source and destination. This offset map can later be
michael@0 151 * used to map destination-string offsets to corresponding
michael@0 152 * source-string offsets or vice versa.
michael@0 153 *
michael@0 154 * The routines below also have variants in which state-table
michael@0 155 * entries are all two bytes instead of one byte. This allows
michael@0 156 * tables with more than 240 total states, but takes up twice as
michael@0 157 * much space per state.
michael@0 158 *
michael@0 159 **/
michael@0 160
michael@0 161 // Return true if current Tbl pointer is within state0 range
michael@0 162 // Note that unsigned compare checks both ends of range simultaneously
michael@0 163 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
michael@0 164 const uint8* Tbl0 = &st->state_table[st->state0];
michael@0 165 return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
michael@0 166 }
michael@0 167
michael@0 168 static inline bool InStateZero_2(const UTF8ReplaceObj_2* st,
michael@0 169 const unsigned short int* Tbl) {
michael@0 170 const unsigned short int* Tbl0 = &st->state_table[st->state0];
michael@0 171 // Word difference, not byte difference
michael@0 172 return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
michael@0 173 }
michael@0 174
michael@0 175 // UTF8PropObj, UTF8ScanObj, UTF8ReplaceObj are all typedefs of
michael@0 176 // UTF8MachineObj.
michael@0 177
michael@0 178 static bool IsPropObj(const UTF8StateMachineObj& obj) {
michael@0 179 return obj.fast_state == NULL
michael@0 180 && obj.max_expand == 0;
michael@0 181 }
michael@0 182
michael@0 183 static bool IsPropObj_2(const UTF8StateMachineObj_2& obj) {
michael@0 184 return obj.fast_state == NULL
michael@0 185 && obj.max_expand == 0;
michael@0 186 }
michael@0 187
michael@0 188 static bool IsScanObj(const UTF8StateMachineObj& obj) {
michael@0 189 return obj.fast_state != NULL
michael@0 190 && obj.max_expand == 0;
michael@0 191 }
michael@0 192
michael@0 193 static bool IsReplaceObj(const UTF8StateMachineObj& obj) {
michael@0 194 // Normally, obj.fast_state != NULL, but the handwritten tables
michael@0 195 // in utf8statetable_unittest don't handle fast_states.
michael@0 196 return obj.max_expand > 0;
michael@0 197 }
michael@0 198
michael@0 199 static bool IsReplaceObj_2(const UTF8StateMachineObj_2& obj) {
michael@0 200 return obj.max_expand > 0;
michael@0 201 }
michael@0 202
michael@0 203 // Look up property of one UTF-8 character and advance over it
michael@0 204 // Return 0 if input length is zero
michael@0 205 // Return 0 and advance one byte if input is ill-formed
michael@0 206 uint8 UTF8GenericProperty(const UTF8PropObj* st,
michael@0 207 const uint8** src,
michael@0 208 int* srclen) {
michael@0 209 if (*srclen <= 0) {
michael@0 210 return 0;
michael@0 211 }
michael@0 212
michael@0 213 const uint8* lsrc = *src;
michael@0 214 const uint8* Tbl_0 = &st->state_table[st->state0];
michael@0 215 const uint8* Tbl = Tbl_0;
michael@0 216 int e;
michael@0 217 int eshift = st->entry_shift;
michael@0 218
michael@0 219 // Short series of tests faster than switch, optimizes 7-bit ASCII
michael@0 220 unsigned char c = lsrc[0];
michael@0 221 if (static_cast<signed char>(c) >= 0) { // one byte
michael@0 222 e = Tbl[c];
michael@0 223 *src += 1;
michael@0 224 *srclen -= 1;
michael@0 225 } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
michael@0 226 e = Tbl[c];
michael@0 227 Tbl = &Tbl_0[e << eshift];
michael@0 228 e = Tbl[lsrc[1]];
michael@0 229 *src += 2;
michael@0 230 *srclen -= 2;
michael@0 231 } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
michael@0 232 e = Tbl[c];
michael@0 233 Tbl = &Tbl_0[e << eshift];
michael@0 234 e = Tbl[lsrc[1]];
michael@0 235 Tbl = &Tbl_0[e << eshift];
michael@0 236 e = Tbl[lsrc[2]];
michael@0 237 *src += 3;
michael@0 238 *srclen -= 3;
michael@0 239 }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
michael@0 240 e = Tbl[c];
michael@0 241 Tbl = &Tbl_0[e << eshift];
michael@0 242 e = Tbl[lsrc[1]];
michael@0 243 Tbl = &Tbl_0[e << eshift];
michael@0 244 e = Tbl[lsrc[2]];
michael@0 245 Tbl = &Tbl_0[e << eshift];
michael@0 246 e = Tbl[lsrc[3]];
michael@0 247 *src += 4;
michael@0 248 *srclen -= 4;
michael@0 249 } else { // Ill-formed
michael@0 250 e = 0;
michael@0 251 *src += 1;
michael@0 252 *srclen -= 1;
michael@0 253 }
michael@0 254 return e;
michael@0 255 }
michael@0 256
michael@0 257 bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src) {
michael@0 258 const uint8* lsrc = reinterpret_cast<const uint8*>(src);
michael@0 259 const uint8* Tbl_0 = &st.state_table[st.state0];
michael@0 260 const uint8* Tbl = Tbl_0;
michael@0 261 int e;
michael@0 262 int eshift = st.entry_shift;
michael@0 263
michael@0 264 // Short series of tests faster than switch, optimizes 7-bit ASCII
michael@0 265 unsigned char c = lsrc[0];
michael@0 266 if (static_cast<signed char>(c) >= 0) { // one byte
michael@0 267 e = Tbl[c];
michael@0 268 } else if ((c & 0xe0) == 0xc0) { // two bytes
michael@0 269 e = Tbl[c];
michael@0 270 Tbl = &Tbl_0[e << eshift];
michael@0 271 e = Tbl[lsrc[1]];
michael@0 272 } else if ((c & 0xf0) == 0xe0) { // three bytes
michael@0 273 e = Tbl[c];
michael@0 274 Tbl = &Tbl_0[e << eshift];
michael@0 275 e = Tbl[lsrc[1]];
michael@0 276 Tbl = &Tbl_0[e << eshift];
michael@0 277 e = Tbl[lsrc[2]];
michael@0 278 } else { // four bytes
michael@0 279 e = Tbl[c];
michael@0 280 Tbl = &Tbl_0[e << eshift];
michael@0 281 e = Tbl[lsrc[1]];
michael@0 282 Tbl = &Tbl_0[e << eshift];
michael@0 283 e = Tbl[lsrc[2]];
michael@0 284 Tbl = &Tbl_0[e << eshift];
michael@0 285 e = Tbl[lsrc[3]];
michael@0 286 }
michael@0 287 return e;
michael@0 288 }
michael@0 289
michael@0 290
michael@0 291 // BigOneByte versions are needed for tables > 240 states, but most
michael@0 292 // won't need the TwoByte versions.
michael@0 293 // Internally, to next-to-last offset is multiplied by 16 and the last
michael@0 294 // offset is relative instead of absolute.
michael@0 295 // Look up property of one UTF-8 character and advance over it
michael@0 296 // Return 0 if input length is zero
michael@0 297 // Return 0 and advance one byte if input is ill-formed
michael@0 298 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
michael@0 299 const uint8** src,
michael@0 300 int* srclen) {
michael@0 301 if (*srclen <= 0) {
michael@0 302 return 0;
michael@0 303 }
michael@0 304
michael@0 305 const uint8* lsrc = *src;
michael@0 306 const uint8* Tbl_0 = &st->state_table[st->state0];
michael@0 307 const uint8* Tbl = Tbl_0;
michael@0 308 int e;
michael@0 309 int eshift = st->entry_shift;
michael@0 310
michael@0 311 // Short series of tests faster than switch, optimizes 7-bit ASCII
michael@0 312 unsigned char c = lsrc[0];
michael@0 313 if (static_cast<signed char>(c) >= 0) { // one byte
michael@0 314 e = Tbl[c];
michael@0 315 *src += 1;
michael@0 316 *srclen -= 1;
michael@0 317 } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
michael@0 318 e = Tbl[c];
michael@0 319 Tbl = &Tbl_0[e << eshift];
michael@0 320 e = Tbl[lsrc[1]];
michael@0 321 *src += 2;
michael@0 322 *srclen -= 2;
michael@0 323 } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
michael@0 324 e = Tbl[c];
michael@0 325 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
michael@0 326 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
michael@0 327 Tbl = &Tbl[e << eshift]; // Relative +/-
michael@0 328 e = Tbl[lsrc[2]];
michael@0 329 *src += 3;
michael@0 330 *srclen -= 3;
michael@0 331 }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
michael@0 332 e = Tbl[c];
michael@0 333 Tbl = &Tbl_0[e << eshift];
michael@0 334 e = Tbl[lsrc[1]];
michael@0 335 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
michael@0 336 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
michael@0 337 Tbl = &Tbl[e << eshift]; // Relative +/-
michael@0 338 e = Tbl[lsrc[3]];
michael@0 339 *src += 4;
michael@0 340 *srclen -= 4;
michael@0 341 } else { // Ill-formed
michael@0 342 e = 0;
michael@0 343 *src += 1;
michael@0 344 *srclen -= 1;
michael@0 345 }
michael@0 346 return e;
michael@0 347 }
michael@0 348
michael@0 349 // BigOneByte versions are needed for tables > 240 states, but most
michael@0 350 // won't need the TwoByte versions.
michael@0 351 bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src) {
michael@0 352 const uint8* lsrc = reinterpret_cast<const uint8*>(src);
michael@0 353 const uint8* Tbl_0 = &st.state_table[st.state0];
michael@0 354 const uint8* Tbl = Tbl_0;
michael@0 355 int e;
michael@0 356 int eshift = st.entry_shift;
michael@0 357
michael@0 358 // Short series of tests faster than switch, optimizes 7-bit ASCII
michael@0 359 unsigned char c = lsrc[0];
michael@0 360 if (static_cast<signed char>(c) >= 0) { // one byte
michael@0 361 e = Tbl[c];
michael@0 362 } else if ((c & 0xe0) == 0xc0) { // two bytes
michael@0 363 e = Tbl[c];
michael@0 364 Tbl = &Tbl_0[e << eshift];
michael@0 365 e = Tbl[lsrc[1]];
michael@0 366 } else if ((c & 0xf0) == 0xe0) { // three bytes
michael@0 367 e = Tbl[c];
michael@0 368 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
michael@0 369 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
michael@0 370 Tbl = &Tbl[e << eshift]; // Relative +/-
michael@0 371 e = Tbl[lsrc[2]];
michael@0 372 } else { // four bytes
michael@0 373 e = Tbl[c];
michael@0 374 Tbl = &Tbl_0[e << eshift];
michael@0 375 e = Tbl[lsrc[1]];
michael@0 376 Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
michael@0 377 e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
michael@0 378 Tbl = &Tbl[e << eshift]; // Relative +/-
michael@0 379 e = Tbl[lsrc[3]];
michael@0 380 }
michael@0 381 return e;
michael@0 382 }
michael@0 383
michael@0 384
michael@0 385 // TwoByte versions are needed for tables > 240 states
michael@0 386 // Look up property of one UTF-8 character and advance over it
michael@0 387 // Return 0 if input length is zero
michael@0 388 // Return 0 and advance one byte if input is ill-formed
michael@0 389 uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
michael@0 390 const uint8** src,
michael@0 391 int* srclen) {
michael@0 392 if (*srclen <= 0) {
michael@0 393 return 0;
michael@0 394 }
michael@0 395
michael@0 396 const uint8* lsrc = *src;
michael@0 397 const unsigned short* Tbl_0 = &st->state_table[st->state0];
michael@0 398 const unsigned short* Tbl = Tbl_0;
michael@0 399 int e;
michael@0 400 int eshift = st->entry_shift;
michael@0 401
michael@0 402 // Short series of tests faster than switch, optimizes 7-bit ASCII
michael@0 403 unsigned char c = lsrc[0];
michael@0 404 if (static_cast<signed char>(c) >= 0) { // one byte
michael@0 405 e = Tbl[c];
michael@0 406 *src += 1;
michael@0 407 *srclen -= 1;
michael@0 408 } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
michael@0 409 e = Tbl[c];
michael@0 410 Tbl = &Tbl_0[e << eshift];
michael@0 411 e = Tbl[lsrc[1]];
michael@0 412 *src += 2;
michael@0 413 *srclen -= 2;
michael@0 414 } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
michael@0 415 e = Tbl[c];
michael@0 416 Tbl = &Tbl_0[e << eshift];
michael@0 417 e = Tbl[lsrc[1]];
michael@0 418 Tbl = &Tbl_0[e << eshift];
michael@0 419 e = Tbl[lsrc[2]];
michael@0 420 *src += 3;
michael@0 421 *srclen -= 3;
michael@0 422 }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
michael@0 423 e = Tbl[c];
michael@0 424 Tbl = &Tbl_0[e << eshift];
michael@0 425 e = Tbl[lsrc[1]];
michael@0 426 Tbl = &Tbl_0[e << eshift];
michael@0 427 e = Tbl[lsrc[2]];
michael@0 428 Tbl = &Tbl_0[e << eshift];
michael@0 429 e = Tbl[lsrc[3]];
michael@0 430 *src += 4;
michael@0 431 *srclen -= 4;
michael@0 432 } else { // Ill-formed
michael@0 433 e = 0;
michael@0 434 *src += 1;
michael@0 435 *srclen -= 1;
michael@0 436 }
michael@0 437 return e;
michael@0 438 }
michael@0 439
michael@0 440 // TwoByte versions are needed for tables > 240 states
michael@0 441 bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src) {
michael@0 442 const uint8* lsrc = reinterpret_cast<const uint8*>(src);
michael@0 443 const unsigned short* Tbl_0 = &st.state_table[st.state0];
michael@0 444 const unsigned short* Tbl = Tbl_0;
michael@0 445 int e;
michael@0 446 int eshift = st.entry_shift;
michael@0 447
michael@0 448 // Short series of tests faster than switch, optimizes 7-bit ASCII
michael@0 449 unsigned char c = lsrc[0];
michael@0 450 if (static_cast<signed char>(c) >= 0) { // one byte
michael@0 451 e = Tbl[c];
michael@0 452 } else if ((c & 0xe0) == 0xc0) { // two bytes
michael@0 453 e = Tbl[c];
michael@0 454 Tbl = &Tbl_0[e << eshift];
michael@0 455 e = Tbl[lsrc[1]];
michael@0 456 } else if ((c & 0xf0) == 0xe0) { // three bytes
michael@0 457 e = Tbl[c];
michael@0 458 Tbl = &Tbl_0[e << eshift];
michael@0 459 e = Tbl[lsrc[1]];
michael@0 460 Tbl = &Tbl_0[e << eshift];
michael@0 461 e = Tbl[lsrc[2]];
michael@0 462 } else { // four bytes
michael@0 463 e = Tbl[c];
michael@0 464 Tbl = &Tbl_0[e << eshift];
michael@0 465 e = Tbl[lsrc[1]];
michael@0 466 Tbl = &Tbl_0[e << eshift];
michael@0 467 e = Tbl[lsrc[2]];
michael@0 468 Tbl = &Tbl_0[e << eshift];
michael@0 469 e = Tbl[lsrc[3]];
michael@0 470 }
michael@0 471 return e;
michael@0 472 }
michael@0 473
michael@0 474
michael@0 475 // Approximate speeds on 2.8 GHz Pentium 4:
michael@0 476 // GenericScan 1-byte loop 300 MB/sec *
michael@0 477 // GenericScan 4-byte loop 1200 MB/sec
michael@0 478 // GenericScan 8-byte loop 2400 MB/sec *
michael@0 479 // GenericScanFastAscii 4-byte loop 3000 MB/sec
michael@0 480 // GenericScanFastAscii 8-byte loop 3200 MB/sec *
michael@0 481 //
michael@0 482 // * Implemented below. FastAscii loop is memory-bandwidth constrained.
michael@0 483
michael@0 484 // Scan a UTF-8 stringpiece based on state table.
michael@0 485 // Always scan complete UTF-8 characters
michael@0 486 // Set number of bytes scanned. Return reason for exiting
michael@0 487 int UTF8GenericScan(const UTF8ScanObj* st,
michael@0 488 const StringPiece& str,
michael@0 489 int* bytes_consumed) {
michael@0 490 int eshift = st->entry_shift; // 6 (space optimized) or 8
michael@0 491 // int nEntries = (1 << eshift); // 64 or 256 entries per state
michael@0 492
michael@0 493 const uint8* isrc =
michael@0 494 reinterpret_cast<const uint8*>(str.data());
michael@0 495 const uint8* src = isrc;
michael@0 496 const int len = str.length();
michael@0 497 const uint8* srclimit = isrc + len;
michael@0 498 const uint8* srclimit8 = srclimit - 7;
michael@0 499 *bytes_consumed = 0;
michael@0 500 if (len == 0) return kExitOK;
michael@0 501
michael@0 502 const uint8* Tbl_0 = &st->state_table[st->state0];
michael@0 503
michael@0 504 DoAgain:
michael@0 505 // Do state-table scan
michael@0 506 int e = 0;
michael@0 507 uint8 c;
michael@0 508
michael@0 509 // Do fast for groups of 8 identity bytes.
michael@0 510 // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
michael@0 511 // including slowing slightly on cr/lf/ht
michael@0 512 //----------------------------
michael@0 513 const uint8* Tbl2 = &st->fast_state[0];
michael@0 514 uint32 losub = st->losub;
michael@0 515 uint32 hiadd = st->hiadd;
michael@0 516 while (src < srclimit8) {
michael@0 517 uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
michael@0 518 uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
michael@0 519 src += 8;
michael@0 520 // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
michael@0 521 uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
michael@0 522 (s4567 - losub) | (s4567 + hiadd);
michael@0 523 if ((temp & 0x80808080) != 0) {
michael@0 524 // We typically end up here on cr/lf/ht; src was incremented
michael@0 525 int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
michael@0 526 (Tbl2[src[-6]] | Tbl2[src[-5]]);
michael@0 527 if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange
michael@0 528 e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
michael@0 529 (Tbl2[src[-2]] | Tbl2[src[-1]]);
michael@0 530 if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange
michael@0 531 // Else OK, go around again
michael@0 532 }
michael@0 533 }
michael@0 534 //----------------------------
michael@0 535
michael@0 536 // Byte-at-a-time scan
michael@0 537 //----------------------------
michael@0 538 const uint8* Tbl = Tbl_0;
michael@0 539 while (src < srclimit) {
michael@0 540 c = *src;
michael@0 541 e = Tbl[c];
michael@0 542 src++;
michael@0 543 if (e >= kExitIllegalStructure) {break;}
michael@0 544 Tbl = &Tbl_0[e << eshift];
michael@0 545 }
michael@0 546 //----------------------------
michael@0 547
michael@0 548
michael@0 549 // Exit possibilities:
michael@0 550 // Some exit code, !state0, back up over last char
michael@0 551 // Some exit code, state0, back up one byte exactly
michael@0 552 // source consumed, !state0, back up over partial char
michael@0 553 // source consumed, state0, exit OK
michael@0 554 // For illegal byte in state0, avoid backup up over PREVIOUS char
michael@0 555 // For truncated last char, back up to beginning of it
michael@0 556
michael@0 557 if (e >= kExitIllegalStructure) {
michael@0 558 // Back up over exactly one byte of rejected/illegal UTF-8 character
michael@0 559 src--;
michael@0 560 // Back up more if needed
michael@0 561 if (!InStateZero(st, Tbl)) {
michael@0 562 do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
michael@0 563 }
michael@0 564 } else if (!InStateZero(st, Tbl)) {
michael@0 565 // Back up over truncated UTF-8 character
michael@0 566 e = kExitIllegalStructure;
michael@0 567 do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
michael@0 568 } else {
michael@0 569 // Normal termination, source fully consumed
michael@0 570 e = kExitOK;
michael@0 571 }
michael@0 572
michael@0 573 if (e == kExitDoAgain) {
michael@0 574 // Loop back up to the fast scan
michael@0 575 goto DoAgain;
michael@0 576 }
michael@0 577
michael@0 578 *bytes_consumed = src - isrc;
michael@0 579 return e;
michael@0 580 }
michael@0 581
michael@0 582 // Scan a UTF-8 stringpiece based on state table.
michael@0 583 // Always scan complete UTF-8 characters
michael@0 584 // Set number of bytes scanned. Return reason for exiting
michael@0 585 // OPTIMIZED for case of 7-bit ASCII 0000..007f all valid
michael@0 586 int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
michael@0 587 const StringPiece& str,
michael@0 588 int* bytes_consumed) {
michael@0 589 const uint8* isrc =
michael@0 590 reinterpret_cast<const uint8*>(str.data());
michael@0 591 const uint8* src = isrc;
michael@0 592 const int len = str.length();
michael@0 593 const uint8* srclimit = isrc + len;
michael@0 594 const uint8* srclimit8 = srclimit - 7;
michael@0 595 *bytes_consumed = 0;
michael@0 596 if (len == 0) return kExitOK;
michael@0 597
michael@0 598 int n;
michael@0 599 int rest_consumed;
michael@0 600 int exit_reason;
michael@0 601 do {
michael@0 602 // Skip 8 bytes of ASCII at a whack; no endianness issue
michael@0 603 while ((src < srclimit8) &&
michael@0 604 (((reinterpret_cast<const uint32*>(src)[0] |
michael@0 605 reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
michael@0 606 src += 8;
michael@0 607 }
michael@0 608 // Run state table on the rest
michael@0 609 n = src - isrc;
michael@0 610 StringPiece str2(str.data() + n, str.length() - n);
michael@0 611 exit_reason = UTF8GenericScan(st, str2, &rest_consumed);
michael@0 612 src += rest_consumed;
michael@0 613 } while ( exit_reason == kExitDoAgain );
michael@0 614
michael@0 615 *bytes_consumed = src - isrc;
michael@0 616 return exit_reason;
michael@0 617 }
michael@0 618
michael@0 619 // Hack to change halfwidth katakana to match an old UTF8CharToLower()
michael@0 620
michael@0 621 // Return number of src bytes skipped
michael@0 622 static int DoSpecialFixup(const unsigned char c,
michael@0 623 const unsigned char** srcp, const unsigned char* srclimit,
michael@0 624 unsigned char** dstp, unsigned char* dstlimit) {
michael@0 625 return 0;
michael@0 626 }
michael@0 627
michael@0 628
michael@0 629 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
michael@0 630 // and doing text replacements.
michael@0 631 // DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below
michael@0 632 // Needs caller to loop on kExitDoAgain
michael@0 633 static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st,
michael@0 634 const StringPiece& istr,
michael@0 635 StringPiece& ostr,
michael@0 636 bool is_plain_text,
michael@0 637 int* bytes_consumed,
michael@0 638 int* bytes_filled,
michael@0 639 int* chars_changed,
michael@0 640 OffsetMap* offsetmap) {
michael@0 641 int eshift = st->entry_shift;
michael@0 642 int nEntries = (1 << eshift); // 64 or 256 entries per state
michael@0 643 const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());
michael@0 644 const int ilen = istr.length();
michael@0 645 const uint8* copystart = isrc;
michael@0 646 const uint8* src = isrc;
michael@0 647 const uint8* srclimit = src + ilen;
michael@0 648 *bytes_consumed = 0;
michael@0 649 *bytes_filled = 0;
michael@0 650 *chars_changed = 0;
michael@0 651
michael@0 652 const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());
michael@0 653 const int olen = ostr.length();
michael@0 654 uint8* dst = const_cast<uint8*>(odst);
michael@0 655 uint8* dstlimit = dst + olen;
michael@0 656
michael@0 657 int total_changed = 0;
michael@0 658
michael@0 659 // Invariant condition during replacements:
michael@0 660 // remaining dst size >= remaining src size
michael@0 661 if ((dstlimit - dst) < (srclimit - src)) {
michael@0 662 if (offsetmap != NULL) {
michael@0 663 offsetmap->Copy(src - copystart);
michael@0 664 copystart = src;
michael@0 665 }
michael@0 666 return kExitDstSpaceFull;
michael@0 667 }
michael@0 668 const uint8* Tbl_0 = &st->state_table[st->state0];
michael@0 669
michael@0 670 Do_state_table:
michael@0 671 // Do state-table scan, copying as we go
michael@0 672 const uint8* Tbl = Tbl_0;
michael@0 673 int e = 0;
michael@0 674 uint8 c = 0;
michael@0 675
michael@0 676 Do_state_table_newe:
michael@0 677
michael@0 678 //----------------------------
michael@0 679 while (src < srclimit) {
michael@0 680 c = *src;
michael@0 681 e = Tbl[c];
michael@0 682 *dst = c;
michael@0 683 src++;
michael@0 684 dst++;
michael@0 685 if (e >= kExitIllegalStructure) {break;}
michael@0 686 Tbl = &Tbl_0[e << eshift];
michael@0 687 }
michael@0 688 //----------------------------
michael@0 689
michael@0 690 // Exit possibilities:
michael@0 691 // Replacement code, do the replacement and loop
michael@0 692 // Some other exit code, state0, back up one byte exactly
michael@0 693 // Some other exit code, !state0, back up over last char
michael@0 694 // source consumed, state0, exit OK
michael@0 695 // source consumed, !state0, back up over partial char
michael@0 696 // For illegal byte in state0, avoid backup up over PREVIOUS char
michael@0 697 // For truncated last char, back up to beginning of it
michael@0 698
michael@0 699 if (e >= kExitIllegalStructure) {
michael@0 700 // Switch on exit code; most loop back to top
michael@0 701 int offset = 0;
michael@0 702 switch (e) {
michael@0 703 // These all make the output string the same size or shorter
michael@0 704 // No checking needed
michael@0 705 case kExitReplace31: // del 2, add 1 bytes to change
michael@0 706 dst -= 2;
michael@0 707 if (offsetmap != NULL) {
michael@0 708 offsetmap->Copy(src - copystart - 2);
michael@0 709 offsetmap->Delete(2);
michael@0 710 copystart = src;
michael@0 711 }
michael@0 712 dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
michael@0 713 total_changed++;
michael@0 714 goto Do_state_table;
michael@0 715 case kExitReplace32: // del 3, add 2 bytes to change
michael@0 716 dst--;
michael@0 717 if (offsetmap != NULL) {
michael@0 718 offsetmap->Copy(src - copystart - 1);
michael@0 719 offsetmap->Delete(1);
michael@0 720 copystart = src;
michael@0 721 }
michael@0 722 dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)];
michael@0 723 dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
michael@0 724 total_changed++;
michael@0 725 goto Do_state_table;
michael@0 726 case kExitReplace21: // del 2, add 1 bytes to change
michael@0 727 dst--;
michael@0 728 if (offsetmap != NULL) {
michael@0 729 offsetmap->Copy(src - copystart - 1);
michael@0 730 offsetmap->Delete(1);
michael@0 731 copystart = src;
michael@0 732 }
michael@0 733 dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
michael@0 734 total_changed++;
michael@0 735 goto Do_state_table;
michael@0 736 case kExitReplace3: // update 3 bytes to change
michael@0 737 dst[-3] = (unsigned char)Tbl[c + (nEntries * 3)];
michael@0 738 // Fall into next case
michael@0 739 case kExitReplace2: // update 2 bytes to change
michael@0 740 dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)];
michael@0 741 // Fall into next case
michael@0 742 case kExitReplace1: // update 1 byte to change
michael@0 743 dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
michael@0 744 total_changed++;
michael@0 745 goto Do_state_table;
michael@0 746 case kExitReplace1S0: // update 1 byte to change, 256-entry state
michael@0 747 dst[-1] = (unsigned char)Tbl[c + (256 * 1)];
michael@0 748 total_changed++;
michael@0 749 goto Do_state_table;
michael@0 750 // These can make the output string longer than the input
michael@0 751 case kExitReplaceOffset2:
michael@0 752 if ((nEntries != 256) && InStateZero(st, Tbl)) {
michael@0 753 // For space-optimized table, we need multiples of 256 bytes
michael@0 754 // in state0 and multiples of nEntries in other states
michael@0 755 offset += ((unsigned char)Tbl[c + (256 * 2)] << 8);
michael@0 756 } else {
michael@0 757 offset += ((unsigned char)Tbl[c + (nEntries * 2)] << 8);
michael@0 758 }
michael@0 759 // Fall into next case
michael@0 760 case kExitSpecial: // Apply special fixups [read: hacks]
michael@0 761 case kExitReplaceOffset1:
michael@0 762 if ((nEntries != 256) && InStateZero(st, Tbl)) {
michael@0 763 // For space-optimized table, we need multiples of 256 bytes
michael@0 764 // in state0 and multiples of nEntries in other states
michael@0 765 offset += (unsigned char)Tbl[c + (256 * 1)];
michael@0 766 } else {
michael@0 767 offset += (unsigned char)Tbl[c + (nEntries * 1)];
michael@0 768 }
michael@0 769 {
michael@0 770 const RemapEntry* re = &st->remap_base[offset];
michael@0 771 int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;
michael@0 772 int add_len = re->add_bytes & ~kHtmlPlaintextFlag;
michael@0 773
michael@0 774 // Special-case non-HTML replacement of five sensitive entities
michael@0 775 // &quot; &amp; &apos; &lt; &gt;
michael@0 776 // 0022 0026 0027 003c 003e
michael@0 777 // A replacement creating one of these is expressed as a pair of
michael@0 778 // entries, one for HTML output and one for plaintext output.
michael@0 779 // The first of the pair has the high bit of add_bytes set.
michael@0 780 if (re->add_bytes & kHtmlPlaintextFlag) {
michael@0 781 // Use this entry for plain text
michael@0 782 if (!is_plain_text) {
michael@0 783 // Use very next entry for HTML text (same back/delete length)
michael@0 784 re = &st->remap_base[offset + 1];
michael@0 785 add_len = re->add_bytes & ~kHtmlPlaintextFlag;
michael@0 786 }
michael@0 787 }
michael@0 788
michael@0 789 int string_offset = re->bytes_offset;
michael@0 790 // After the replacement, need (dstlimit - newdst) >= (srclimit - src)
michael@0 791 uint8* newdst = dst - del_len + add_len;
michael@0 792 if ((dstlimit - newdst) < (srclimit - src)) {
michael@0 793 // Won't fit; don't do the replacement. Caller may realloc and retry
michael@0 794 e = kExitDstSpaceFull;
michael@0 795 break; // exit, backing up over this char for later retry
michael@0 796 }
michael@0 797 dst -= del_len;
michael@0 798 memcpy(dst, &st->remap_string[string_offset], add_len);
michael@0 799 dst += add_len;
michael@0 800 total_changed++;
michael@0 801 if (offsetmap != NULL) {
michael@0 802 if (add_len > del_len) {
michael@0 803 offsetmap->Copy(src - copystart);
michael@0 804 offsetmap->Insert(add_len - del_len);
michael@0 805 copystart = src;
michael@0 806 } else if (add_len < del_len) {
michael@0 807 offsetmap->Copy(src - copystart + add_len - del_len);
michael@0 808 offsetmap->Delete(del_len - add_len);
michael@0 809 copystart = src;
michael@0 810 }
michael@0 811 }
michael@0 812 if (re->delete_bytes & kReplaceAndResumeFlag) {
michael@0 813 // There is a non-zero target state at the end of the
michael@0 814 // replacement string
michael@0 815 e = st->remap_string[string_offset + add_len];
michael@0 816 Tbl = &Tbl_0[e << eshift];
michael@0 817 goto Do_state_table_newe;
michael@0 818 }
michael@0 819 }
michael@0 820 if (e == kExitRejectAlt) {break;}
michael@0 821 if (e != kExitSpecial) {goto Do_state_table;}
michael@0 822
michael@0 823 // case kExitSpecial: // Apply special fixups [read: hacks]
michael@0 824 // In this routine, do either UTF8CharToLower()
michael@0 825 // fullwidth/halfwidth mapping or
michael@0 826 // voiced mapping or
michael@0 827 // semi-voiced mapping
michael@0 828
michael@0 829 // First, do EXIT_REPLACE_OFFSET1 action (above)
michael@0 830 // Second: do additional code fixup
michael@0 831 {
michael@0 832 int srcdel = DoSpecialFixup(c, &src, srclimit, &dst, dstlimit);
michael@0 833 if (offsetmap != NULL) {
michael@0 834 if (srcdel != 0) {
michael@0 835 offsetmap->Copy(src - copystart - srcdel);
michael@0 836 offsetmap->Delete(srcdel);
michael@0 837 copystart = src;
michael@0 838 }
michael@0 839 }
michael@0 840 }
michael@0 841 goto Do_state_table;
michael@0 842
michael@0 843 case kExitIllegalStructure: // structurally illegal byte; quit
michael@0 844 case kExitReject: // NUL or illegal code encountered; quit
michael@0 845 case kExitRejectAlt: // Apply replacement, then exit
michael@0 846 default: // and all other exits
michael@0 847 break;
michael@0 848 } // End switch (e)
michael@0 849
michael@0 850 // Exit possibilities:
michael@0 851 // Some other exit code, state0, back up one byte exactly
michael@0 852 // Some other exit code, !state0, back up over last char
michael@0 853
michael@0 854 // Back up over exactly one byte of rejected/illegal UTF-8 character
michael@0 855 src--;
michael@0 856 dst--;
michael@0 857 // Back up more if needed
michael@0 858 if (!InStateZero(st, Tbl)) {
michael@0 859 do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
michael@0 860 }
michael@0 861 } else if (!InStateZero(st, Tbl)) {
michael@0 862 // src >= srclimit, !state0
michael@0 863 // Back up over truncated UTF-8 character
michael@0 864 e = kExitIllegalStructure;
michael@0 865 do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
michael@0 866 } else {
michael@0 867 // src >= srclimit, state0
michael@0 868 // Normal termination, source fully consumed
michael@0 869 e = kExitOK;
michael@0 870 }
michael@0 871
michael@0 872 if (offsetmap != NULL) {
michael@0 873 if (src > copystart) {
michael@0 874 offsetmap->Copy(src - copystart);
michael@0 875 copystart = src;
michael@0 876 }
michael@0 877 }
michael@0 878
michael@0 879 // Possible return values here:
michael@0 880 // kExitDstSpaceFull caller may realloc and retry from middle
michael@0 881 // kExitIllegalStructure caller my overwrite/truncate
michael@0 882 // kExitOK all done and happy
michael@0 883 // kExitReject caller may overwrite/truncate
michael@0 884 // kExitDoAgain LOOP NOT DONE; caller must retry from middle
michael@0 885 // (may do fast ASCII loop first)
michael@0 886 // kExitPlaceholder -unused-
michael@0 887 // kExitNone -unused-
michael@0 888 *bytes_consumed = src - isrc;
michael@0 889 *bytes_filled = dst - odst;
michael@0 890 *chars_changed = total_changed;
michael@0 891 return e;
michael@0 892 }
michael@0 893
michael@0 894 // TwoByte versions are needed for tables > 240 states, such
michael@0 895 // as the table for full Unicode 4.1 canonical + compatibility mapping
michael@0 896
michael@0 897 // Scan a UTF-8 stringpiece based on state table with two-byte entries,
michael@0 898 // copying to output stringpiece
michael@0 899 // and doing text replacements.
michael@0 900 // DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below
michael@0 901 // Needs caller to loop on kExitDoAgain
michael@0 902 static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st,
michael@0 903 const StringPiece& istr,
michael@0 904 StringPiece& ostr,
michael@0 905 bool is_plain_text,
michael@0 906 int* bytes_consumed,
michael@0 907 int* bytes_filled,
michael@0 908 int* chars_changed,
michael@0 909 OffsetMap* offsetmap) {
michael@0 910 int eshift = st->entry_shift;
michael@0 911 int nEntries = (1 << eshift); // 64 or 256 entries per state
michael@0 912 const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());
michael@0 913 const int ilen = istr.length();
michael@0 914 const uint8* copystart = isrc;
michael@0 915 const uint8* src = isrc;
michael@0 916 const uint8* srclimit = src + ilen;
michael@0 917 *bytes_consumed = 0;
michael@0 918 *bytes_filled = 0;
michael@0 919 *chars_changed = 0;
michael@0 920
michael@0 921 const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());
michael@0 922 const int olen = ostr.length();
michael@0 923 uint8* dst = const_cast<uint8*>(odst);
michael@0 924 uint8* dstlimit = dst + olen;
michael@0 925
michael@0 926 *chars_changed = 0;
michael@0 927
michael@0 928 int total_changed = 0;
michael@0 929
michael@0 930 int src_lll = srclimit - src;
michael@0 931 int dst_lll = dstlimit - dst;
michael@0 932
michael@0 933
michael@0 934 // Invariant condition during replacements:
michael@0 935 // remaining dst size >= remaining src size
michael@0 936 if ((dstlimit - dst) < (srclimit - src)) {
michael@0 937 if (offsetmap != NULL) {
michael@0 938 offsetmap->Copy(src - copystart);
michael@0 939 copystart = src;
michael@0 940 }
michael@0 941 return kExitDstSpaceFull_2;
michael@0 942 }
michael@0 943 const unsigned short* Tbl_0 = &st->state_table[st->state0];
michael@0 944
michael@0 945 Do_state_table_2:
michael@0 946 // Do state-table scan, copying as we go
michael@0 947 const unsigned short* Tbl = Tbl_0;
michael@0 948 int e = 0;
michael@0 949 uint8 c = 0;
michael@0 950
michael@0 951 Do_state_table_newe_2:
michael@0 952
michael@0 953 //----------------------------
michael@0 954 while (src < srclimit) {
michael@0 955 c = *src;
michael@0 956 e = Tbl[c];
michael@0 957 *dst = c;
michael@0 958 src++;
michael@0 959 dst++;
michael@0 960 if (e >= kExitIllegalStructure_2) {break;}
michael@0 961 Tbl = &Tbl_0[e << eshift];
michael@0 962 }
michael@0 963 //----------------------------
michael@0 964 src_lll = src - isrc;
michael@0 965 dst_lll = dst - odst;
michael@0 966
michael@0 967 // Exit possibilities:
michael@0 968 // Replacement code, do the replacement and loop
michael@0 969 // Some other exit code, state0, back up one byte exactly
michael@0 970 // Some other exit code, !state0, back up over last char
michael@0 971 // source consumed, state0, exit OK
michael@0 972 // source consumed, !state0, back up over partial char
michael@0 973 // For illegal byte in state0, avoid backup up over PREVIOUS char
michael@0 974 // For truncated last char, back up to beginning of it
michael@0 975
michael@0 976 if (e >= kExitIllegalStructure_2) {
michael@0 977 // Switch on exit code; most loop back to top
michael@0 978 int offset = 0;
michael@0 979 switch (e) {
michael@0 980 // These all make the output string the same size or shorter
michael@0 981 // No checking needed
michael@0 982 case kExitReplace31_2: // del 2, add 1 bytes to change
michael@0 983 dst -= 2;
michael@0 984 if (offsetmap != NULL) {
michael@0 985 offsetmap->Copy(src - copystart - 2);
michael@0 986 offsetmap->Delete(2);
michael@0 987 copystart = src;
michael@0 988 }
michael@0 989 dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
michael@0 990 total_changed++;
michael@0 991 goto Do_state_table_2;
michael@0 992 case kExitReplace32_2: // del 3, add 2 bytes to change
michael@0 993 dst--;
michael@0 994 if (offsetmap != NULL) {
michael@0 995 offsetmap->Copy(src - copystart - 1);
michael@0 996 offsetmap->Delete(1);
michael@0 997 copystart = src;
michael@0 998 }
michael@0 999 dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);
michael@0 1000 dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
michael@0 1001 total_changed++;
michael@0 1002 goto Do_state_table_2;
michael@0 1003 case kExitReplace21_2: // del 2, add 1 bytes to change
michael@0 1004 dst--;
michael@0 1005 if (offsetmap != NULL) {
michael@0 1006 offsetmap->Copy(src - copystart - 1);
michael@0 1007 offsetmap->Delete(1);
michael@0 1008 copystart = src;
michael@0 1009 }
michael@0 1010 dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
michael@0 1011 total_changed++;
michael@0 1012 goto Do_state_table_2;
michael@0 1013 case kExitReplace3_2: // update 3 bytes to change
michael@0 1014 dst[-3] = (unsigned char)(Tbl[c + (nEntries * 2)] & 0xff);
michael@0 1015 // Fall into next case
michael@0 1016 case kExitReplace2_2: // update 2 bytes to change
michael@0 1017 dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);
michael@0 1018 // Fall into next case
michael@0 1019 case kExitReplace1_2: // update 1 byte to change
michael@0 1020 dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
michael@0 1021 total_changed++;
michael@0 1022 goto Do_state_table_2;
michael@0 1023 case kExitReplace1S0_2: // update 1 byte to change, 256-entry state
michael@0 1024 dst[-1] = (unsigned char)(Tbl[c + (256 * 1)] & 0xff);
michael@0 1025 total_changed++;
michael@0 1026 goto Do_state_table_2;
michael@0 1027 // These can make the output string longer than the input
michael@0 1028 case kExitReplaceOffset2_2:
michael@0 1029 if ((nEntries != 256) && InStateZero_2(st, Tbl)) {
michael@0 1030 // For space-optimized table, we need multiples of 256 bytes
michael@0 1031 // in state0 and multiples of nEntries in other states
michael@0 1032 offset += ((unsigned char)(Tbl[c + (256 * 1)] >> 8 & 0xff) << 8);
michael@0 1033 } else {
michael@0 1034 offset += ((unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff) << 8);
michael@0 1035 }
michael@0 1036 // Fall into next case
michael@0 1037 case kExitReplaceOffset1_2:
michael@0 1038 if ((nEntries != 256) && InStateZero_2(st, Tbl)) {
michael@0 1039 // For space-optimized table, we need multiples of 256 bytes
michael@0 1040 // in state0 and multiples of nEntries in other states
michael@0 1041 offset += (unsigned char)(Tbl[c + (256 * 1)] & 0xff);
michael@0 1042 } else {
michael@0 1043 offset += (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
michael@0 1044 }
michael@0 1045 {
michael@0 1046 const RemapEntry* re = &st->remap_base[offset];
michael@0 1047 int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;
michael@0 1048 int add_len = re->add_bytes & ~kHtmlPlaintextFlag;
michael@0 1049 // Special-case non-HTML replacement of five sensitive entities
michael@0 1050 // &quot; &amp; &apos; &lt; &gt;
michael@0 1051 // 0022 0026 0027 003c 003e
michael@0 1052 // A replacement creating one of these is expressed as a pair of
michael@0 1053 // entries, one for HTML output and one for plaintext output.
michael@0 1054 // The first of the pair has the high bit of add_bytes set.
michael@0 1055 if (re->add_bytes & kHtmlPlaintextFlag) {
michael@0 1056 // Use this entry for plain text
michael@0 1057 if (!is_plain_text) {
michael@0 1058 // Use very next entry for HTML text (same back/delete length)
michael@0 1059 re = &st->remap_base[offset + 1];
michael@0 1060 add_len = re->add_bytes & ~kHtmlPlaintextFlag;
michael@0 1061 }
michael@0 1062 }
michael@0 1063
michael@0 1064 // After the replacement, need (dstlimit - dst) >= (srclimit - src)
michael@0 1065 int string_offset = re->bytes_offset;
michael@0 1066 // After the replacement, need (dstlimit - newdst) >= (srclimit - src)
michael@0 1067 uint8* newdst = dst - del_len + add_len;
michael@0 1068 if ((dstlimit - newdst) < (srclimit - src)) {
michael@0 1069 // Won't fit; don't do the replacement. Caller may realloc and retry
michael@0 1070 e = kExitDstSpaceFull_2;
michael@0 1071 break; // exit, backing up over this char for later retry
michael@0 1072 }
michael@0 1073 dst -= del_len;
michael@0 1074 memcpy(dst, &st->remap_string[string_offset], add_len);
michael@0 1075 dst += add_len;
michael@0 1076 if (offsetmap != NULL) {
michael@0 1077 if (add_len > del_len) {
michael@0 1078 offsetmap->Copy(src - copystart);
michael@0 1079 offsetmap->Insert(add_len - del_len);
michael@0 1080 copystart = src;
michael@0 1081 } else if (add_len < del_len) {
michael@0 1082 offsetmap->Copy(src - copystart + add_len - del_len);
michael@0 1083 offsetmap->Delete(del_len - add_len);
michael@0 1084 copystart = src;
michael@0 1085 }
michael@0 1086 }
michael@0 1087 if (re->delete_bytes & kReplaceAndResumeFlag) {
michael@0 1088 // There is a two-byte non-zero target state at the end of the
michael@0 1089 // replacement string
michael@0 1090 uint8 c1 = st->remap_string[string_offset + add_len];
michael@0 1091 uint8 c2 = st->remap_string[string_offset + add_len + 1];
michael@0 1092 e = (c1 << 8) | c2;
michael@0 1093 Tbl = &Tbl_0[e << eshift];
michael@0 1094 total_changed++;
michael@0 1095 goto Do_state_table_newe_2;
michael@0 1096 }
michael@0 1097 }
michael@0 1098 total_changed++;
michael@0 1099 if (e == kExitRejectAlt_2) {break;}
michael@0 1100 goto Do_state_table_2;
michael@0 1101
michael@0 1102 case kExitSpecial_2: // NO special fixups [read: hacks]
michael@0 1103 case kExitIllegalStructure_2: // structurally illegal byte; quit
michael@0 1104 case kExitReject_2: // NUL or illegal code encountered; quit
michael@0 1105 // and all other exits
michael@0 1106 default:
michael@0 1107 break;
michael@0 1108 } // End switch (e)
michael@0 1109
michael@0 1110 // Exit possibilities:
michael@0 1111 // Some other exit code, state0, back up one byte exactly
michael@0 1112 // Some other exit code, !state0, back up over last char
michael@0 1113
michael@0 1114 // Back up over exactly one byte of rejected/illegal UTF-8 character
michael@0 1115 src--;
michael@0 1116 dst--;
michael@0 1117 // Back up more if needed
michael@0 1118 if (!InStateZero_2(st, Tbl)) {
michael@0 1119 do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
michael@0 1120 }
michael@0 1121 } else if (!InStateZero_2(st, Tbl)) {
michael@0 1122 // src >= srclimit, !state0
michael@0 1123 // Back up over truncated UTF-8 character
michael@0 1124 e = kExitIllegalStructure_2;
michael@0 1125
michael@0 1126 do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
michael@0 1127 } else {
michael@0 1128 // src >= srclimit, state0
michael@0 1129 // Normal termination, source fully consumed
michael@0 1130 e = kExitOK_2;
michael@0 1131 }
michael@0 1132
michael@0 1133 if (offsetmap != NULL) {
michael@0 1134 if (src > copystart) {
michael@0 1135 offsetmap->Copy(src - copystart);
michael@0 1136 copystart = src;
michael@0 1137 }
michael@0 1138 }
michael@0 1139
michael@0 1140
michael@0 1141 // Possible return values here:
michael@0 1142 // kExitDstSpaceFull_2 caller may realloc and retry from middle
michael@0 1143 // kExitIllegalStructure_2 caller my overwrite/truncate
michael@0 1144 // kExitOK_2 all done and happy
michael@0 1145 // kExitReject_2 caller may overwrite/truncate
michael@0 1146 // kExitDoAgain_2 LOOP NOT DONE; caller must retry from middle
michael@0 1147 // (may do fast ASCII loop first)
michael@0 1148 // kExitPlaceholder_2 -unused-
michael@0 1149 // kExitNone_2 -unused-
michael@0 1150 *bytes_consumed = src - isrc;
michael@0 1151 *bytes_filled = dst - odst;
michael@0 1152 *chars_changed = total_changed;
michael@0 1153 return e;
michael@0 1154 }
michael@0 1155
michael@0 1156
michael@0 1157 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
michael@0 1158 // and doing text replacements.
michael@0 1159 // Also writes an optional OffsetMap. Pass NULL to skip writing one.
michael@0 1160 // Always scan complete UTF-8 characters
michael@0 1161 // Set number of bytes consumed from input, number filled to output.
michael@0 1162 // Return reason for exiting
michael@0 1163 int UTF8GenericReplace(const UTF8ReplaceObj* st,
michael@0 1164 const StringPiece& istr,
michael@0 1165 StringPiece& ostr,
michael@0 1166 bool is_plain_text,
michael@0 1167 int* bytes_consumed,
michael@0 1168 int* bytes_filled,
michael@0 1169 int* chars_changed,
michael@0 1170 OffsetMap* offsetmap) {
michael@0 1171 StringPiece local_istr(istr.data(), istr.length());
michael@0 1172 StringPiece local_ostr(ostr.data(), ostr.length());
michael@0 1173 int total_consumed = 0;
michael@0 1174 int total_filled = 0;
michael@0 1175 int total_changed = 0;
michael@0 1176 int local_bytes_consumed, local_bytes_filled, local_chars_changed;
michael@0 1177 int e;
michael@0 1178 do {
michael@0 1179 e = UTF8GenericReplaceInternal(st,
michael@0 1180 local_istr, local_ostr, is_plain_text,
michael@0 1181 &local_bytes_consumed, &local_bytes_filled,
michael@0 1182 &local_chars_changed,
michael@0 1183 offsetmap);
michael@0 1184 local_istr.remove_prefix(local_bytes_consumed);
michael@0 1185 local_ostr.remove_prefix(local_bytes_filled);
michael@0 1186 total_consumed += local_bytes_consumed;
michael@0 1187 total_filled += local_bytes_filled;
michael@0 1188 total_changed += local_chars_changed;
michael@0 1189 } while ( e == kExitDoAgain );
michael@0 1190 *bytes_consumed = total_consumed;
michael@0 1191 *bytes_filled = total_filled;
michael@0 1192 *chars_changed = total_changed;
michael@0 1193 return e;
michael@0 1194 }
michael@0 1195
michael@0 1196 // Older version without offsetmap
michael@0 1197 int UTF8GenericReplace(const UTF8ReplaceObj* st,
michael@0 1198 const StringPiece& istr,
michael@0 1199 StringPiece& ostr,
michael@0 1200 bool is_plain_text,
michael@0 1201 int* bytes_consumed,
michael@0 1202 int* bytes_filled,
michael@0 1203 int* chars_changed) {
michael@0 1204 return UTF8GenericReplace(st,
michael@0 1205 istr,
michael@0 1206 ostr,
michael@0 1207 is_plain_text,
michael@0 1208 bytes_consumed,
michael@0 1209 bytes_filled,
michael@0 1210 chars_changed,
michael@0 1211 NULL);
michael@0 1212 }
michael@0 1213
michael@0 1214 // Older version without is_plain_text or offsetmap
michael@0 1215 int UTF8GenericReplace(const UTF8ReplaceObj* st,
michael@0 1216 const StringPiece& istr,
michael@0 1217 StringPiece& ostr,
michael@0 1218 int* bytes_consumed,
michael@0 1219 int* bytes_filled,
michael@0 1220 int* chars_changed) {
michael@0 1221 bool is_plain_text = false;
michael@0 1222 return UTF8GenericReplace(st,
michael@0 1223 istr,
michael@0 1224 ostr,
michael@0 1225 is_plain_text,
michael@0 1226 bytes_consumed,
michael@0 1227 bytes_filled,
michael@0 1228 chars_changed,
michael@0 1229 NULL);
michael@0 1230 }
michael@0 1231
michael@0 1232 // Scan a UTF-8 stringpiece based on state table with two-byte entries,
michael@0 1233 // copying to output stringpiece
michael@0 1234 // and doing text replacements.
michael@0 1235 // Also writes an optional OffsetMap. Pass NULL to skip writing one.
michael@0 1236 // Always scan complete UTF-8 characters
michael@0 1237 // Set number of bytes consumed from input, number filled to output.
michael@0 1238 // Return reason for exiting
michael@0 1239 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
michael@0 1240 const StringPiece& istr,
michael@0 1241 StringPiece& ostr,
michael@0 1242 bool is_plain_text,
michael@0 1243 int* bytes_consumed,
michael@0 1244 int* bytes_filled,
michael@0 1245 int* chars_changed,
michael@0 1246 OffsetMap* offsetmap) {
michael@0 1247 StringPiece local_istr(istr.data(), istr.length());
michael@0 1248 StringPiece local_ostr(ostr.data(), ostr.length());
michael@0 1249 int total_consumed = 0;
michael@0 1250 int total_filled = 0;
michael@0 1251 int total_changed = 0;
michael@0 1252 int local_bytes_consumed, local_bytes_filled, local_chars_changed;
michael@0 1253 int e;
michael@0 1254 do {
michael@0 1255 e = UTF8GenericReplaceInternalTwoByte(st,
michael@0 1256 local_istr, local_ostr, is_plain_text,
michael@0 1257 &local_bytes_consumed,
michael@0 1258 &local_bytes_filled,
michael@0 1259 &local_chars_changed,
michael@0 1260 offsetmap);
michael@0 1261 local_istr.remove_prefix(local_bytes_consumed);
michael@0 1262 local_ostr.remove_prefix(local_bytes_filled);
michael@0 1263 total_consumed += local_bytes_consumed;
michael@0 1264 total_filled += local_bytes_filled;
michael@0 1265 total_changed += local_chars_changed;
michael@0 1266 } while ( e == kExitDoAgain_2 );
michael@0 1267 *bytes_consumed = total_consumed;
michael@0 1268 *bytes_filled = total_filled;
michael@0 1269 *chars_changed = total_changed;
michael@0 1270
michael@0 1271 return e - kExitOK_2 + kExitOK;
michael@0 1272 }
michael@0 1273
michael@0 1274 // Older version without offsetmap
michael@0 1275 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
michael@0 1276 const StringPiece& istr,
michael@0 1277 StringPiece& ostr,
michael@0 1278 bool is_plain_text,
michael@0 1279 int* bytes_consumed,
michael@0 1280 int* bytes_filled,
michael@0 1281 int* chars_changed) {
michael@0 1282 return UTF8GenericReplaceTwoByte(st,
michael@0 1283 istr,
michael@0 1284 ostr,
michael@0 1285 is_plain_text,
michael@0 1286 bytes_consumed,
michael@0 1287 bytes_filled,
michael@0 1288 chars_changed,
michael@0 1289 NULL);
michael@0 1290 }
michael@0 1291
michael@0 1292 // Older version without is_plain_text or offsetmap
michael@0 1293 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
michael@0 1294 const StringPiece& istr,
michael@0 1295 StringPiece& ostr,
michael@0 1296 int* bytes_consumed,
michael@0 1297 int* bytes_filled,
michael@0 1298 int* chars_changed) {
michael@0 1299 bool is_plain_text = false;
michael@0 1300 return UTF8GenericReplaceTwoByte(st,
michael@0 1301 istr,
michael@0 1302 ostr,
michael@0 1303 is_plain_text,
michael@0 1304 bytes_consumed,
michael@0 1305 bytes_filled,
michael@0 1306 chars_changed,
michael@0 1307 NULL);
michael@0 1308 }
michael@0 1309
michael@0 1310
michael@0 1311
michael@0 1312 // Adjust a stringpiece to encompass complete UTF-8 characters.
michael@0 1313 // The data pointer will be increased by 0..3 bytes to get to a character
michael@0 1314 // boundary, and the length will then be decreased by 0..3 bytes
michael@0 1315 // to encompass the last complete character.
michael@0 1316 void UTF8TrimToChars(StringPiece* istr) {
michael@0 1317 const char* src = istr->data();
michael@0 1318 int len = istr->length();
michael@0 1319 // Exit if empty string
michael@0 1320 if (len == 0) {
michael@0 1321 return;
michael@0 1322 }
michael@0 1323
michael@0 1324 // Exit on simple, common case
michael@0 1325 if ( ((src[0] & 0xc0) != 0x80) &&
michael@0 1326 (static_cast<signed char>(src[len - 1]) >= 0) ) {
michael@0 1327 // First byte is not a continuation and last byte is 7-bit ASCII -- done
michael@0 1328 return;
michael@0 1329 }
michael@0 1330
michael@0 1331 // Adjust the back end, len > 0
michael@0 1332 const char* srclimit = src + len;
michael@0 1333 // Backscan over any ending continuation bytes to find last char start
michael@0 1334 const char* s = srclimit - 1; // Last byte of the string
michael@0 1335 while ((src <= s) && ((*s & 0xc0) == 0x80)) {
michael@0 1336 s--;
michael@0 1337 }
michael@0 1338 // Include entire last char if it fits
michael@0 1339 if (src <= s) {
michael@0 1340 int last_char_len = UTF8OneCharLen(s);
michael@0 1341 if (s + last_char_len <= srclimit) {
michael@0 1342 // Last char fits, so include it, else exclude it
michael@0 1343 s += last_char_len;
michael@0 1344 }
michael@0 1345 }
michael@0 1346 if (s != srclimit) {
michael@0 1347 // s is one byte beyond the last full character, if any
michael@0 1348 istr->remove_suffix(srclimit - s);
michael@0 1349 // Exit if now empty string
michael@0 1350 if (istr->length() == 0) {
michael@0 1351 return;
michael@0 1352 }
michael@0 1353 }
michael@0 1354
michael@0 1355 // Adjust the front end, len > 0
michael@0 1356 len = istr->length();
michael@0 1357 srclimit = src + len;
michael@0 1358 s = src; // First byte of the string
michael@0 1359 // Scan over any beginning continuation bytes to find first char start
michael@0 1360 while ((s < srclimit) && ((*s & 0xc0) == 0x80)) {
michael@0 1361 s++;
michael@0 1362 }
michael@0 1363 if (s != src) {
michael@0 1364 // s is at the first full character, if any
michael@0 1365 istr->remove_prefix(s - src);
michael@0 1366 }
michael@0 1367 }
michael@0 1368
michael@0 1369 } // End namespace CLD2

mercurial