browser/components/translation/cld2/internal/utf8statetable.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/utf8statetable.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1369 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// State Table follower for scanning UTF-8 strings without converting to
    1.20 +// 32- or 16-bit Unicode values.
    1.21 +//
    1.22 +
    1.23 +#ifdef COMPILER_MSVC
    1.24 +// MSVC warns: warning C4309: 'initializing' : truncation of constant value
    1.25 +// But the value is in fact not truncated.  0xFF still comes out 0xFF at
    1.26 +// runtime.
    1.27 +#pragma warning ( disable : 4309 )
    1.28 +#endif
    1.29 +
    1.30 +#include "utf8statetable.h"
    1.31 +
    1.32 +#include <stdint.h>                     // for uintptr_t
    1.33 +#include <string.h>                     // for NULL, memcpy, memmove
    1.34 +
    1.35 +#include "integral_types.h"        // for uint8, uint32, int8
    1.36 +#include "stringpiece.h"
    1.37 +#include "offsetmap.h"
    1.38 +
    1.39 +
    1.40 +namespace CLD2 {
    1.41 +
    1.42 +static const int kReplaceAndResumeFlag = 0x80; // Bit in del byte that marks an
    1.43 +                                               // optional next-state field
    1.44 +                                               // following the replacement text
    1.45 +static const int kHtmlPlaintextFlag = 0x80;    // Bit in add byte that distinguishes
    1.46 +                                               // an HTML replacement from plaintext
    1.47 +
    1.48 +
    1.49 +/**
    1.50 + * This code implements a little interpreter for UTF8 state
    1.51 + * tables. There are three kinds of quite-similar state tables,
    1.52 + * property, scanning, and replacement. Each state in one of
    1.53 + * these tables consists of an array of 256 or 64 one-byte
    1.54 + * entries. The state is subscripted by an incoming source byte,
    1.55 + * and the entry either specifies the next state or specifies an
    1.56 + * action. Space-optimized tables have full 256-entry states for
    1.57 + * the first byte of a UTF-8 character, but only 64-entry states
    1.58 + * for continuation bytes. Space-optimized tables may only be
    1.59 + * used with source input that has been checked to be
    1.60 + * structurally- (or stronger interchange-) valid.
    1.61 + *
    1.62 + * A property state table has an unsigned one-byte property for
    1.63 + * each possible UTF-8 character. One-byte character properties
    1.64 + * are in the state[0] array, while for other lengths the
    1.65 + * state[0] array gives the next state, which contains the
    1.66 + * property value for two-byte characters or yet another state
    1.67 + * for longer ones. The code simply loads the right number of
    1.68 + * next-state values, then returns the final byte as property
    1.69 + * value. There are no actions specified in property tables.
    1.70 + * States are typically shared for multi-byte UTF-8 characters
    1.71 + * that all have the same property value.
    1.72 + *
    1.73 + * A scanning state table has entries that are either a
    1.74 + * next-state specifier for bytes that are accepted by the
    1.75 + * scanner, or an exit action for the last byte of each
    1.76 + * character that is rejected by the scanner.
    1.77 + *
    1.78 + * Scanning long strings involves a tight loop that picks up one
    1.79 + * byte at a time and follows next-state value back to state[0]
    1.80 + * for each accepted UTF-8 character. Scanning stops at the end
    1.81 + * of the string or at the first character encountered that has
    1.82 + * an exit action such as "reject". Timing information is given
    1.83 + * below.
    1.84 + *
    1.85 + * Since so much of Google's text is 7-bit-ASCII values
    1.86 + * (approximately 94% of the bytes of web documents), the
    1.87 + * scanning interpreter has two speed optimizations. One checks
    1.88 + * 8 bytes at a time to see if they are all in the range lo..hi,
    1.89 + * as specified in constants in the overall statetable object.
    1.90 + * The check involves ORing together four 4-byte values that
    1.91 + * overflow into the high bit of some byte when a byte is out of
    1.92 + * range. For seven-bit-ASCII, lo is 0x20 and hi is 0x7E. This
    1.93 + * loop is about 8x faster than the one-byte-at-a-time loop.
    1.94 + *
    1.95 + * If checking for exit bytes in the 0x00-0x1F and 7F range is
    1.96 + * unneeded, an even faster loop just looks at the high bits of
    1.97 + * 8 bytes at once, and is about 1.33x faster than the lo..hi
    1.98 + * loop.
    1.99 + *
   1.100 + * Exit from the scanning routines backs up to the first byte of
   1.101 + * the rejected character, so the text spanned is always a
   1.102 + * complete number of UTF-8 characters. The normal scanning exit
   1.103 + * is at the first rejected character, or at the end of the
   1.104 + * input text. Scanning also exits on any detected ill-formed
   1.105 + * character or at a special do-again action built into some
   1.106 + * exit-optimized tables. The do-again action gets back to the
   1.107 + * top of the scanning loop to retry eight-byte ASCII scans. It
   1.108 + * is typically put into state tables after four seven-bit-ASCII
   1.109 + * characters in a row are seen, to allow restarting the fast
   1.110 + * scan after some slower processing of multi-byte characters.
   1.111 + *
   1.112 + * A replacement state table is similar to a scanning state
   1.113 + * table but has more extensive actions. The default
   1.114 + * byte-at-a-time loop copies one byte from source to
   1.115 + * destination and goes to the next state. The replacement
   1.116 + * actions overwrite 1-3 bytes of the destination with different
   1.117 + * bytes, possibly shortening the output by 1 or 2 bytes. The
   1.118 + * replacement bytes come from within the state table, from
   1.119 + * dummy states inserted just after any state that contains a
   1.120 + * replacement action. This gives a quick address calculation for
   1.121 + * the replacement byte(s) and gives some cache locality.
   1.122 + *
   1.123 + * Additional replacement actions use one or two bytes from
   1.124 + * within dummy states to index a side table of more-extensive
   1.125 + * replacements. The side table specifies a length of 0..15
   1.126 + * destination bytes to overwrite and a length of 0..127 bytes
   1.127 + * to overwrite them with, plus the actual replacement bytes.
   1.128 + *
   1.129 + * This side table uses one extra bit to specify a pair of
   1.130 + * replacements, the first to be used in an HTML context and the
   1.131 + * second to be used in a plaintext context. This allows
   1.132 + * replacements that are spelled with "&lt;" in the former
   1.133 + * context and "<" in the latter.
   1.134 + *
   1.135 + * The side table also uses an extra bit to specify a non-zero
   1.136 + * next state after a replacement. This allows a combination
   1.137 + * replacement and state change, used to implement a limited
   1.138 + * version of the Boyer-Moore algorithm for multi-character
   1.139 + * replacement without backtracking. This is useful when there
   1.140 + * are overlapping replacements, such as ch => x and also c =>
   1.141 + * y, the latter to be used only if the character after c is not
   1.142 + * h. In this case, the state[0] table's entry for c would
   1.143 + * change c to y and also have a next-state of say n, and the
   1.144 + * state[n] entry for h would specify a replacement of the two
   1.145 + * bytes yh by x. No backtracking is needed.
   1.146 + *
   1.147 + * A replacement table may also include the exit actions of a
   1.148 + * scanning state table, so some character sequences can
   1.149 + * terminate early.
   1.150 + *
   1.151 + * During replacement, an optional data structure called an
   1.152 + * offset map can be updated to reflect each change in length
   1.153 + * between source and destination. This offset map can later be
   1.154 + * used to map destination-string offsets to corresponding
   1.155 + * source-string offsets or vice versa.
   1.156 + *
   1.157 + * The routines below also have variants in which state-table
   1.158 + * entries are all two bytes instead of one byte. This allows
   1.159 + * tables with more than 240 total states, but takes up twice as
   1.160 + * much space per state.
   1.161 + *
   1.162 +**/
   1.163 +
   1.164 +// Return true if the current Tbl pointer lies within the state0 range,
   1.165 +// i.e. 0 <= Tbl - Tbl0 < state0_size. One unsigned compare checks both ends.
   1.166 +static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
   1.167 +  const uint8* Tbl0 = &st->state_table[st->state0];  // start of state0
   1.168 +  return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);  // a negative difference wraps to a large value
   1.169 +}
   1.170 +
   1.171 +static inline bool InStateZero_2(const UTF8ReplaceObj_2* st,  // InStateZero for tables of two-byte entries
   1.172 +                                 const unsigned short int* Tbl) {
   1.173 +  const unsigned short int* Tbl0 =  &st->state_table[st->state0];  // start of state0
   1.174 +  // Pointer subtraction yields a word (entry) difference, not a byte difference
   1.175 +  return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);  // unsigned compare checks both ends
   1.176 +}
   1.177 +
   1.178 +// UTF8PropObj, UTF8ScanObj, UTF8ReplaceObj are all typedefs of
   1.179 +// UTF8StateMachineObj.
   1.180 +
   1.181 +static bool IsPropObj(const UTF8StateMachineObj& obj) {  // true if obj looks like a property table
   1.182 +  return obj.fast_state == NULL      // no fast-scan table
   1.183 +      && obj.max_expand == 0;        // and no expansion
   1.184 +}
   1.185 +
   1.186 +static bool IsPropObj_2(const UTF8StateMachineObj_2& obj) {  // same test as IsPropObj, for the _2 object type
   1.187 +  return obj.fast_state == NULL      // no fast-scan table
   1.188 +      && obj.max_expand == 0;        // and no expansion
   1.189 +}
   1.190 +
   1.191 +static bool IsScanObj(const UTF8StateMachineObj& obj) {  // true if obj looks like a scanning table
   1.192 +  return obj.fast_state != NULL      // has a fast-scan table
   1.193 +      && obj.max_expand == 0;        // but no expansion
   1.194 +}
   1.195 +
   1.196 +static bool IsReplaceObj(const UTF8StateMachineObj& obj) {  // true if obj looks like a replacement table
   1.197 +  // Normally, obj.fast_state != NULL, but the handwritten tables
   1.198 +  // in utf8statetable_unittest don't handle fast_states, so only max_expand is tested.
   1.199 +  return obj.max_expand > 0;
   1.200 +}
   1.201 +
   1.202 +static bool IsReplaceObj_2(const UTF8StateMachineObj_2& obj) {  // same test as IsReplaceObj, for the _2 object type
   1.203 +  return obj.max_expand > 0;  // fast_state is deliberately not consulted
   1.204 +}
   1.205 +
   1.206 +// Look up the property of one UTF-8 character and advance *src / *srclen over it.
   1.207 +// Return 0 without advancing if the input length is zero or negative.
   1.208 +// Return 0 and advance one byte if the input is ill-formed or truncated.
   1.209 +uint8 UTF8GenericProperty(const UTF8PropObj* st,
   1.210 +                          const uint8** src,
   1.211 +                          int* srclen) {
   1.212 +  if (*srclen <= 0) {
   1.213 +    return 0;
   1.214 +  }
   1.215 +
   1.216 +  const uint8* lsrc = *src;
   1.217 +  const uint8* Tbl_0 = &st->state_table[st->state0];  // base of state0
   1.218 +  const uint8* Tbl = Tbl_0;                           // current state
   1.219 +  int e;                                              // most recent table entry
   1.220 +  int eshift = st->entry_shift;                       // next state = Tbl_0 + (e << eshift)
   1.221 +
   1.222 +  // Short series of tests faster than switch, optimizes 7-bit ASCII.
   1.223 +  unsigned char c = lsrc[0];  // lead byte selects the character length
   1.224 +  if (static_cast<signed char>(c) >= 0) {           // one byte (high bit clear)
   1.225 +    e = Tbl[c];  // state0 entry is the property itself
   1.226 +    *src += 1;
   1.227 +    *srclen -= 1;
   1.228 +  } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes, both available
   1.229 +    e = Tbl[c];
   1.230 +    Tbl = &Tbl_0[e << eshift];
   1.231 +    e = Tbl[lsrc[1]];  // continuation byte is used as an index without validation
   1.232 +    *src += 2;
   1.233 +    *srclen -= 2;
   1.234 +  } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes, all available
   1.235 +    e = Tbl[c];
   1.236 +    Tbl = &Tbl_0[e << eshift];
   1.237 +    e = Tbl[lsrc[1]];
   1.238 +    Tbl = &Tbl_0[e << eshift];
   1.239 +    e = Tbl[lsrc[2]];
   1.240 +    *src += 3;
   1.241 +    *srclen -= 3;
   1.242 +  }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes, all available
   1.243 +    e = Tbl[c];
   1.244 +    Tbl = &Tbl_0[e << eshift];
   1.245 +    e = Tbl[lsrc[1]];
   1.246 +    Tbl = &Tbl_0[e << eshift];
   1.247 +    e = Tbl[lsrc[2]];
   1.248 +    Tbl = &Tbl_0[e << eshift];
   1.249 +    e = Tbl[lsrc[3]];
   1.250 +    *src += 4;
   1.251 +    *srclen -= 4;
   1.252 +  } else {                                                // Ill-formed lead byte or truncated character
   1.253 +    e = 0;
   1.254 +    *src += 1;  // skip exactly one byte
   1.255 +    *srclen -= 1;
   1.256 +  }
   1.257 +  return e;  // int entry narrowed to the uint8 return type
   1.258 +}
   1.259 +
   1.260 +bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src) {  // true if the character at src has a nonzero property
   1.261 +  const uint8* lsrc = reinterpret_cast<const uint8*>(src);
   1.262 +  const uint8* Tbl_0 = &st.state_table[st.state0];  // base of state0
   1.263 +  const uint8* Tbl = Tbl_0;                         // current state
   1.264 +  int e;                                            // most recent table entry
   1.265 +  int eshift = st.entry_shift;                      // next state = Tbl_0 + (e << eshift)
   1.266 +
   1.267 +  // Short series of tests faster than switch, optimizes 7-bit ASCII.
   1.268 +  // No length is passed in: up to four bytes are read based on the lead byte alone.
   1.269 +  if (static_cast<signed char>(c) >= 0) {           // one byte
   1.270 +    e = Tbl[c];
   1.271 +  } else if ((c & 0xe0) == 0xc0) {     // two bytes
   1.272 +    e = Tbl[c];
   1.273 +    Tbl = &Tbl_0[e << eshift];
   1.274 +    e = Tbl[lsrc[1]];
   1.275 +  } else if ((c & 0xf0) == 0xe0) {     // three bytes
   1.276 +    e = Tbl[c];
   1.277 +    Tbl = &Tbl_0[e << eshift];
   1.278 +    e = Tbl[lsrc[1]];
   1.279 +    Tbl = &Tbl_0[e << eshift];
   1.280 +    e = Tbl[lsrc[2]];
   1.281 +  } else {                             // four bytes (every remaining lead byte lands here)
   1.282 +    e = Tbl[c];
   1.283 +    Tbl = &Tbl_0[e << eshift];
   1.284 +    e = Tbl[lsrc[1]];
   1.285 +    Tbl = &Tbl_0[e << eshift];
   1.286 +    e = Tbl[lsrc[2]];
   1.287 +    Tbl = &Tbl_0[e << eshift];
   1.288 +    e = Tbl[lsrc[3]];
   1.289 +  }
   1.290 +  return e;  // converted to bool: nonzero entry -> true
   1.291 +}
   1.292 +
   1.293 +
   1.294 +// BigOneByte versions are needed for tables > 240 states, but most
   1.295 +// won't need the TwoByte versions.
   1.296 +// Internally, the next-to-last offset is multiplied by 16 and the last
   1.297 +// offset is relative (signed) instead of absolute.
   1.298 +// Look up the property of one UTF-8 character and advance *src / *srclen over it.
   1.299 +// Return 0 without advancing if the input length is zero or negative.
   1.300 +// Return 0 and advance one byte if the input is ill-formed or truncated.
   1.301 +uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
   1.302 +                          const uint8** src,
   1.303 +                          int* srclen) {
   1.304 +  if (*srclen <= 0) {
   1.305 +    return 0;
   1.306 +  }
   1.307 +
   1.308 +  const uint8* lsrc = *src;
   1.309 +  const uint8* Tbl_0 = &st->state_table[st->state0];  // base of state0
   1.310 +  const uint8* Tbl = Tbl_0;                           // current state
   1.311 +  int e;                                              // most recent table entry
   1.312 +  int eshift = st->entry_shift;                       // per-state shift for next-state offsets
   1.313 +
   1.314 +  // Short series of tests faster than switch, optimizes 7-bit ASCII.
   1.315 +  unsigned char c = lsrc[0];  // lead byte selects the character length
   1.316 +  if (static_cast<signed char>(c) >= 0) {           // one byte (high bit clear)
   1.317 +    e = Tbl[c];
   1.318 +    *src += 1;
   1.319 +    *srclen -= 1;
   1.320 +  } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes, both available
   1.321 +    e = Tbl[c];
   1.322 +    Tbl = &Tbl_0[e << eshift];
   1.323 +    e = Tbl[lsrc[1]];
   1.324 +    *src += 2;
   1.325 +    *srclen -= 2;
   1.326 +  } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes, all available
   1.327 +    e = Tbl[c];
   1.328 +    Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range (next-to-last offset)
   1.329 +    e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];  // entry read as signed
   1.330 +    Tbl = &Tbl[e << eshift];          // Relative +/- from the current state
   1.331 +    e = Tbl[lsrc[2]];
   1.332 +    *src += 3;
   1.333 +    *srclen -= 3;
   1.334 +  }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes, all available
   1.335 +    e = Tbl[c];
   1.336 +    Tbl = &Tbl_0[e << eshift];        // first hop uses the plain absolute offset
   1.337 +    e = Tbl[lsrc[1]];
   1.338 +    Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range (next-to-last offset)
   1.339 +    e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];  // entry read as signed
   1.340 +    Tbl = &Tbl[e << eshift];          // Relative +/- from the current state
   1.341 +    e = Tbl[lsrc[3]];
   1.342 +    *src += 4;
   1.343 +    *srclen -= 4;
   1.344 +  } else {                                                // Ill-formed lead byte or truncated character
   1.345 +    e = 0;
   1.346 +    *src += 1;  // skip exactly one byte
   1.347 +    *srclen -= 1;
   1.348 +  }
   1.349 +  return e;  // int entry narrowed to the uint8 return type
   1.350 +}
   1.351 +
   1.352 +// BigOneByte versions are needed for tables > 240 states, but most
   1.353 +// won't need the TwoByte versions. No length is passed: the lead byte alone decides how many bytes are read.
   1.354 +bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src) {
   1.355 +  const uint8* lsrc = reinterpret_cast<const uint8*>(src);
   1.356 +  const uint8* Tbl_0 = &st.state_table[st.state0];  // base of state0
   1.357 +  const uint8* Tbl = Tbl_0;                         // current state
   1.358 +  int e;                                            // most recent table entry
   1.359 +  int eshift = st.entry_shift;                      // per-state shift for next-state offsets
   1.360 +
   1.361 +  // Short series of tests faster than switch, optimizes 7-bit ASCII.
   1.362 +  unsigned char c = lsrc[0];
   1.363 +  if (static_cast<signed char>(c) >= 0) {           // one byte
   1.364 +    e = Tbl[c];
   1.365 +  } else if ((c & 0xe0) == 0xc0) {    // two bytes
   1.366 +    e = Tbl[c];
   1.367 +    Tbl = &Tbl_0[e << eshift];
   1.368 +    e = Tbl[lsrc[1]];
   1.369 +  } else if ((c & 0xf0) == 0xe0) {    // three bytes
   1.370 +    e = Tbl[c];
   1.371 +    Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range (next-to-last offset)
   1.372 +    e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];  // entry read as signed
   1.373 +    Tbl = &Tbl[e << eshift];          // Relative +/- from the current state
   1.374 +    e = Tbl[lsrc[2]];
   1.375 +  } else {                            // four bytes (every remaining lead byte lands here)
   1.376 +    e = Tbl[c];
   1.377 +    Tbl = &Tbl_0[e << eshift];        // first hop uses the plain absolute offset
   1.378 +    e = Tbl[lsrc[1]];
   1.379 +    Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range (next-to-last offset)
   1.380 +    e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];  // entry read as signed
   1.381 +    Tbl = &Tbl[e << eshift];          // Relative +/- from the current state
   1.382 +    e = Tbl[lsrc[3]];
   1.383 +  }
   1.384 +  return e;  // converted to bool: nonzero entry -> true
   1.385 +}
   1.386 +
   1.387 +
   1.388 +// TwoByte versions are needed for tables > 240 states; entries are unsigned short.
   1.389 +// Look up the property of one UTF-8 character and advance *src / *srclen over it.
   1.390 +// Return 0 without advancing if the input length is zero or negative.
   1.391 +// Return 0 and advance one byte if the input is ill-formed or truncated.
   1.392 +uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
   1.393 +                          const uint8** src,
   1.394 +                          int* srclen) {
   1.395 +  if (*srclen <= 0) {
   1.396 +    return 0;
   1.397 +  }
   1.398 +
   1.399 +  const uint8* lsrc = *src;
   1.400 +  const unsigned short* Tbl_0 = &st->state_table[st->state0];  // base of state0
   1.401 +  const unsigned short* Tbl = Tbl_0;                           // current state
   1.402 +  int e;                                                       // most recent table entry
   1.403 +  int eshift = st->entry_shift;                                // next state = Tbl_0 + (e << eshift)
   1.404 +
   1.405 +  // Short series of tests faster than switch, optimizes 7-bit ASCII.
   1.406 +  unsigned char c = lsrc[0];  // lead byte selects the character length
   1.407 +  if (static_cast<signed char>(c) >= 0) {           // one byte (high bit clear)
   1.408 +    e = Tbl[c];
   1.409 +    *src += 1;
   1.410 +    *srclen -= 1;
   1.411 +  } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes, both available
   1.412 +    e = Tbl[c];
   1.413 +    Tbl = &Tbl_0[e << eshift];
   1.414 +    e = Tbl[lsrc[1]];
   1.415 +    *src += 2;
   1.416 +    *srclen -= 2;
   1.417 +  } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes, all available
   1.418 +    e = Tbl[c];
   1.419 +    Tbl = &Tbl_0[e << eshift];
   1.420 +    e = Tbl[lsrc[1]];
   1.421 +    Tbl = &Tbl_0[e << eshift];
   1.422 +    e = Tbl[lsrc[2]];
   1.423 +    *src += 3;
   1.424 +    *srclen -= 3;
   1.425 +  }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes, all available
   1.426 +    e = Tbl[c];
   1.427 +    Tbl = &Tbl_0[e << eshift];
   1.428 +    e = Tbl[lsrc[1]];
   1.429 +    Tbl = &Tbl_0[e << eshift];
   1.430 +    e = Tbl[lsrc[2]];
   1.431 +    Tbl = &Tbl_0[e << eshift];
   1.432 +    e = Tbl[lsrc[3]];
   1.433 +    *src += 4;
   1.434 +    *srclen -= 4;
   1.435 +  } else {                                                // Ill-formed lead byte or truncated character
   1.436 +    e = 0;
   1.437 +    *src += 1;  // skip exactly one byte
   1.438 +    *srclen -= 1;
   1.439 +  }
   1.440 +  return e;  // 16-bit entry narrowed to the uint8 return type
   1.441 +}
   1.442 +
   1.443 +// TwoByte versions are needed for tables > 240 states. No length is passed: the lead byte alone decides how many bytes are read.
   1.444 +bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src) {
   1.445 +  const uint8* lsrc = reinterpret_cast<const uint8*>(src);
   1.446 +  const unsigned short* Tbl_0 = &st.state_table[st.state0];  // base of state0
   1.447 +  const unsigned short* Tbl = Tbl_0;                         // current state
   1.448 +  int e;                                                     // most recent table entry
   1.449 +  int eshift = st.entry_shift;                               // next state = Tbl_0 + (e << eshift)
   1.450 +
   1.451 +  // Short series of tests faster than switch, optimizes 7-bit ASCII.
   1.452 +  unsigned char c = lsrc[0];
   1.453 +  if (static_cast<signed char>(c) >= 0) {           // one byte
   1.454 +    e = Tbl[c];
   1.455 +  } else if ((c & 0xe0) == 0xc0) {     // two bytes
   1.456 +    e = Tbl[c];
   1.457 +    Tbl = &Tbl_0[e << eshift];
   1.458 +    e = Tbl[lsrc[1]];
   1.459 +  } else if ((c & 0xf0) == 0xe0) {     // three bytes
   1.460 +    e = Tbl[c];
   1.461 +    Tbl = &Tbl_0[e << eshift];
   1.462 +    e = Tbl[lsrc[1]];
   1.463 +    Tbl = &Tbl_0[e << eshift];
   1.464 +    e = Tbl[lsrc[2]];
   1.465 +  } else {                             // four bytes (every remaining lead byte lands here)
   1.466 +    e = Tbl[c];
   1.467 +    Tbl = &Tbl_0[e << eshift];
   1.468 +    e = Tbl[lsrc[1]];
   1.469 +    Tbl = &Tbl_0[e << eshift];
   1.470 +    e = Tbl[lsrc[2]];
   1.471 +    Tbl = &Tbl_0[e << eshift];
   1.472 +    e = Tbl[lsrc[3]];
   1.473 +  }
   1.474 +  return e;  // converted to bool: nonzero entry -> true
   1.475 +}
   1.476 +
   1.477 +
   1.478 +// Approximate speeds on 2.8 GHz Pentium 4:
   1.479 +//   GenericScan 1-byte loop           300 MB/sec *
   1.480 +//   GenericScan 4-byte loop          1200 MB/sec
   1.481 +//   GenericScan 8-byte loop          2400 MB/sec *
   1.482 +//   GenericScanFastAscii 4-byte loop 3000 MB/sec
   1.483 +//   GenericScanFastAscii 8-byte loop 3200 MB/sec *
   1.484 +//
   1.485 +// * Implemented below. FastAscii loop is memory-bandwidth constrained.
   1.486 +
   1.487 +// Scan a UTF-8 stringpiece based on a scanning state table.
   1.488 +// Always scan complete UTF-8 characters: on exit, src is backed up to a character start.
   1.489 +// Set *bytes_consumed to the number of bytes scanned. Return the reason for exiting.
   1.490 +int UTF8GenericScan(const UTF8ScanObj* st,
   1.491 +                    const StringPiece& str,
   1.492 +                    int* bytes_consumed) {
   1.493 +  int eshift = st->entry_shift;       // 6 (space optimized) or 8
   1.494 +  // int nEntries = (1 << eshift);       // 64 or 256 entries per state
   1.495 +
   1.496 +  const uint8* isrc =
   1.497 +    reinterpret_cast<const uint8*>(str.data());
   1.498 +  const uint8* src = isrc;
   1.499 +  const int len = str.length();
   1.500 +  const uint8* srclimit = isrc + len;
   1.501 +  const uint8* srclimit8 = srclimit - 7;  // fast loop runs only while src + 8 <= srclimit
   1.502 +  *bytes_consumed = 0;
   1.503 +  if (len == 0) return kExitOK;
   1.504 +
   1.505 +  const uint8* Tbl_0 = &st->state_table[st->state0];  // base of state0
   1.506 +
   1.507 +DoAgain:
   1.508 +  // Do state-table scan (re-entered from the bottom on kExitDoAgain)
   1.509 +  int e = 0;
   1.510 +  uint8 c;
   1.511 +
   1.512 +  // Do fast for groups of 8 identity bytes.
   1.513 +  // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
   1.514 +  // including slowing slightly on cr/lf/ht
   1.515 +  //----------------------------
   1.516 +  const uint8* Tbl2 = &st->fast_state[0];  // per-byte table: nonzero entry forces an exit from the fast loop
   1.517 +  uint32 losub = st->losub;
   1.518 +  uint32 hiadd = st->hiadd;
   1.519 +  while (src < srclimit8) {
   1.520 +    uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];  // NOTE(review): no alignment check on src before this 4-byte load -- confirm target tolerates it
   1.521 +    uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
   1.522 +    src += 8;
   1.523 +    // This is a fast range check for all bytes in [losub..0x80-hiadd)
   1.524 +    uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
   1.525 +                  (s4567 - losub) | (s4567 + hiadd);
   1.526 +    if ((temp & 0x80808080) != 0) {  // some byte's high bit got set: at least one byte may be out of range
   1.527 +      // We typically end up here on cr/lf/ht; src was incremented
   1.528 +      int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
   1.529 +                  (Tbl2[src[-6]] | Tbl2[src[-5]]);
   1.530 +      if (e0123 != 0) {src -= 8; break;}    // Exit on Non-interchange; rewind the whole group of 8
   1.531 +      e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
   1.532 +              (Tbl2[src[-2]] | Tbl2[src[-1]]);
   1.533 +      if (e0123 != 0) {src -= 4; break;}    // Exit on Non-interchange; rewind to the second group of 4
   1.534 +      // Else OK, go around again
   1.535 +    }
   1.536 +  }
   1.537 +  //----------------------------
   1.538 +
   1.539 +  // Byte-at-a-time scan, always started from state0
   1.540 +  //----------------------------
   1.541 +  const uint8* Tbl = Tbl_0;
   1.542 +  while (src < srclimit) {
   1.543 +    c = *src;
   1.544 +    e = Tbl[c];
   1.545 +    src++;
   1.546 +    if (e >= kExitIllegalStructure) {break;}  // entries at or above this value are exit codes, not states
   1.547 +    Tbl = &Tbl_0[e << eshift];  // follow the next-state entry
   1.548 +  }
   1.549 +  //----------------------------
   1.550 +
   1.551 +
   1.552 +  // Exit possibilities:
   1.553 +  //  Some exit code, !state0, back up over last char
   1.554 +  //  Some exit code, state0, back up one byte exactly
   1.555 +  //  source consumed, !state0, back up over partial char
   1.556 +  //  source consumed, state0, exit OK
   1.557 +  // For illegal byte in state0, avoid backing up over PREVIOUS char
   1.558 +  // For truncated last char, back up to beginning of it
   1.559 +
   1.560 +  if (e >= kExitIllegalStructure) {
   1.561 +    // Back up over exactly one byte of rejected/illegal UTF-8 character
   1.562 +    src--;
   1.563 +    // Back up more if needed
   1.564 +    if (!InStateZero(st, Tbl)) {
   1.565 +      do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));  // skip back over continuation bytes (10xxxxxx)
   1.566 +    }
   1.567 +  } else if (!InStateZero(st, Tbl)) {
   1.568 +    // Back up over truncated UTF-8 character
   1.569 +    e = kExitIllegalStructure;
   1.570 +    do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));  // skip back over continuation bytes (10xxxxxx)
   1.571 +  } else {
   1.572 +    // Normal termination, source fully consumed
   1.573 +    e = kExitOK;
   1.574 +  }
   1.575 +
   1.576 +  if (e == kExitDoAgain) {
   1.577 +    // Loop back up to the fast scan; src now points at the start of the character that triggered it
   1.578 +    goto DoAgain;
   1.579 +  }
   1.580 +
   1.581 +  *bytes_consumed = src - isrc;
   1.582 +  return e;
   1.583 +}
   1.584 +
   1.585 +// Scan a UTF-8 stringpiece based on a scanning state table.
   1.586 +// Always scan complete UTF-8 characters.
   1.587 +// Set *bytes_consumed to the number of bytes scanned. Return the reason for exiting.
   1.588 +// OPTIMIZED for case of 7-bit ASCII 0000..007f all valid
   1.589 +int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
   1.590 +                    const StringPiece& str,
   1.591 +                    int* bytes_consumed) {
   1.592 +  const uint8* isrc =
   1.593 +    reinterpret_cast<const uint8*>(str.data());
   1.594 +  const uint8* src = isrc;
   1.595 +  const int len = str.length();
   1.596 +  const uint8* srclimit = isrc + len;
   1.597 +  const uint8* srclimit8 = srclimit - 7;  // fast loop runs only while src + 8 <= srclimit
   1.598 +  *bytes_consumed = 0;
   1.599 +  if (len == 0) return kExitOK;
   1.600 +
   1.601 +  int n;              // bytes already skipped by the fast loop
   1.602 +  int rest_consumed;  // bytes consumed by UTF8GenericScan, relative to src
   1.603 +  int exit_reason;
   1.604 +  do {
   1.605 +    // Skip 8 bytes of ASCII at a whack; no endianness issue (only the high bit of each byte is tested)
   1.606 +    while ((src < srclimit8) &&
   1.607 +           (((reinterpret_cast<const uint32*>(src)[0] |
   1.608 +              reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
   1.609 +      src += 8;
   1.610 +    }
   1.611 +    // Run state table on the rest
   1.612 +    n = src - isrc;
   1.613 +    StringPiece str2(str.data() + n, str.length() - n);  // remaining input, starting at src
   1.614 +    exit_reason = UTF8GenericScan(st, str2, &rest_consumed);
   1.615 +    src += rest_consumed;
   1.616 +  } while ( exit_reason == kExitDoAgain );  // resume the fast skip if the scanner asks for it
   1.617 +
   1.618 +  *bytes_consumed = src - isrc;
   1.619 +  return exit_reason;
   1.620 +}
   1.621 +
   1.622 +// Hack to change halfwidth katakana to match an old UTF8CharToLower()
   1.623 +
   1.624 +// Return number of src bytes skipped
   1.625 +static int DoSpecialFixup(const unsigned char c,
   1.626 +                    const unsigned char** srcp, const unsigned char* srclimit,
   1.627 +                    unsigned char** dstp, unsigned char* dstlimit) {
   1.628 +  return 0;  // stub here: no argument is read or modified, so zero bytes are ever skipped
   1.629 +}
   1.630 +
   1.631 +
   1.632 +// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
   1.633 +//   and doing text replacements.
   1.634 +// DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below
   1.635 +//   Needs caller to loop on kExitDoAgain
   1.636 +static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st,
   1.637 +                    const StringPiece& istr,
   1.638 +                    StringPiece& ostr,
   1.639 +                    bool is_plain_text,
   1.640 +                    int* bytes_consumed,
   1.641 +                    int* bytes_filled,
   1.642 +                    int* chars_changed,
   1.643 +                    OffsetMap* offsetmap) {
   1.644 +  int eshift = st->entry_shift;
   1.645 +  int nEntries = (1 << eshift);       // 64 or 256 entries per state
   1.646 +  const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());
   1.647 +  const int ilen = istr.length();
   1.648 +  const uint8* copystart = isrc;
   1.649 +  const uint8* src = isrc;
   1.650 +  const uint8* srclimit = src + ilen;
   1.651 +  *bytes_consumed = 0;
   1.652 +  *bytes_filled = 0;
   1.653 +  *chars_changed = 0;
   1.654 +
   1.655 +  const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());
   1.656 +  const int olen = ostr.length();
   1.657 +  uint8* dst = const_cast<uint8*>(odst);
   1.658 +  uint8* dstlimit = dst + olen;
   1.659 +
   1.660 +  int total_changed = 0;
   1.661 +
   1.662 +  // Invariant condition during replacements:
   1.663 +  //  remaining dst size >= remaining src size
   1.664 +  if ((dstlimit - dst) < (srclimit - src)) {
   1.665 +    if (offsetmap != NULL) {
   1.666 +      offsetmap->Copy(src - copystart);
   1.667 +      copystart = src;
   1.668 +    }
   1.669 +    return kExitDstSpaceFull;
   1.670 +  }
   1.671 +  const uint8* Tbl_0 = &st->state_table[st->state0];
   1.672 +
   1.673 + Do_state_table:
   1.674 +  // Do state-table scan, copying as we go
   1.675 +  const uint8* Tbl = Tbl_0;
   1.676 +  int e = 0;
   1.677 +  uint8 c = 0;
   1.678 +
   1.679 + Do_state_table_newe:
   1.680 +
   1.681 +  //----------------------------
   1.682 +  while (src < srclimit) {
   1.683 +    c = *src;
   1.684 +    e = Tbl[c];
   1.685 +    *dst = c;
   1.686 +    src++;
   1.687 +    dst++;
   1.688 +    if (e >= kExitIllegalStructure) {break;}
   1.689 +    Tbl = &Tbl_0[e << eshift];
   1.690 +  }
   1.691 +  //----------------------------
   1.692 +
   1.693 +  // Exit possibilities:
   1.694 +  //  Replacement code, do the replacement and loop
   1.695 +  //  Some other exit code, state0, back up one byte exactly
   1.696 +  //  Some other exit code, !state0, back up over last char
   1.697 +  //  source consumed, state0, exit OK
   1.698 +  //  source consumed, !state0, back up over partial char
   1.699 +  // For illegal byte in state0, avoid backup up over PREVIOUS char
   1.700 +  // For truncated last char, back up to beginning of it
   1.701 +
   1.702 +  if (e >= kExitIllegalStructure) {
   1.703 +    // Switch on exit code; most loop back to top
   1.704 +    int offset = 0;
   1.705 +    switch (e) {
   1.706 +    // These all make the output string the same size or shorter
   1.707 +    // No checking needed
   1.708 +    case kExitReplace31:    // del 2, add 1 bytes to change
   1.709 +      dst -= 2;
   1.710 +      if (offsetmap != NULL) {
   1.711 +        offsetmap->Copy(src - copystart - 2);
   1.712 +        offsetmap->Delete(2);
   1.713 +        copystart = src;
   1.714 +      }
   1.715 +      dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
   1.716 +      total_changed++;
   1.717 +      goto Do_state_table;
   1.718 +    case kExitReplace32:    // del 3, add 2 bytes to change
   1.719 +      dst--;
   1.720 +      if (offsetmap != NULL) {
   1.721 +        offsetmap->Copy(src - copystart - 1);
   1.722 +        offsetmap->Delete(1);
   1.723 +        copystart = src;
   1.724 +      }
   1.725 +      dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)];
   1.726 +      dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
   1.727 +      total_changed++;
   1.728 +      goto Do_state_table;
   1.729 +    case kExitReplace21:    // del 2, add 1 bytes to change
   1.730 +      dst--;
   1.731 +      if (offsetmap != NULL) {
   1.732 +        offsetmap->Copy(src - copystart - 1);
   1.733 +        offsetmap->Delete(1);
   1.734 +        copystart = src;
   1.735 +      }
   1.736 +      dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
   1.737 +      total_changed++;
   1.738 +      goto Do_state_table;
   1.739 +    case kExitReplace3:    // update 3 bytes to change
   1.740 +      dst[-3] = (unsigned char)Tbl[c + (nEntries * 3)];
   1.741 +      // Fall into next case
   1.742 +    case kExitReplace2:    // update 2 bytes to change
   1.743 +      dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)];
   1.744 +      // Fall into next case
   1.745 +    case kExitReplace1:    // update 1 byte to change
   1.746 +      dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
   1.747 +      total_changed++;
   1.748 +      goto Do_state_table;
   1.749 +    case kExitReplace1S0:     // update 1 byte to change, 256-entry state
   1.750 +      dst[-1] = (unsigned char)Tbl[c + (256 * 1)];
   1.751 +      total_changed++;
   1.752 +      goto Do_state_table;
   1.753 +    // These can make the output string longer than the input
   1.754 +    case kExitReplaceOffset2:
   1.755 +      if ((nEntries != 256) && InStateZero(st, Tbl)) {
   1.756 +        // For space-optimized table, we need multiples of 256 bytes
   1.757 +        // in state0 and multiples of nEntries in other states
   1.758 +        offset += ((unsigned char)Tbl[c + (256 * 2)] << 8);
   1.759 +      } else {
   1.760 +        offset += ((unsigned char)Tbl[c + (nEntries * 2)] << 8);
   1.761 +      }
   1.762 +      // Fall into next case
   1.763 +    case kExitSpecial:      // Apply special fixups [read: hacks]
   1.764 +    case kExitReplaceOffset1:
   1.765 +      if ((nEntries != 256) && InStateZero(st, Tbl)) {
   1.766 +        // For space-optimized table, we need multiples of 256 bytes
   1.767 +        // in state0 and multiples of nEntries in other states
   1.768 +        offset += (unsigned char)Tbl[c + (256 * 1)];
   1.769 +      } else {
   1.770 +        offset += (unsigned char)Tbl[c + (nEntries * 1)];
   1.771 +      }
   1.772 +      {
   1.773 +        const RemapEntry* re = &st->remap_base[offset];
   1.774 +        int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;
   1.775 +        int add_len = re->add_bytes & ~kHtmlPlaintextFlag;
   1.776 +
   1.777 +        // Special-case non-HTML replacement of five sensitive entities
   1.778 +        //   &quot; &amp; &apos; &lt; &gt;
   1.779 +        //   0022   0026  0027   003c 003e
   1.780 +        // A replacement creating one of these is expressed as a pair of
   1.781 +        // entries, one for HTML output and one for plaintext output.
   1.782 +        // The first of the pair has the high bit of add_bytes set.
   1.783 +        if (re->add_bytes & kHtmlPlaintextFlag) {
   1.784 +          // Use this entry for plain text
   1.785 +          if (!is_plain_text) {
   1.786 +            // Use very next entry for HTML text (same back/delete length)
   1.787 +            re = &st->remap_base[offset + 1];
   1.788 +            add_len = re->add_bytes & ~kHtmlPlaintextFlag;
   1.789 +          }
   1.790 +        }
   1.791 +
   1.792 +        int string_offset = re->bytes_offset;
   1.793 +        // After the replacement, need (dstlimit - newdst) >= (srclimit - src)
   1.794 +        uint8* newdst = dst - del_len + add_len;
   1.795 +        if ((dstlimit - newdst) < (srclimit - src)) {
   1.796 +          // Won't fit; don't do the replacement. Caller may realloc and retry
   1.797 +          e = kExitDstSpaceFull;
   1.798 +          break;    // exit, backing up over this char for later retry
   1.799 +        }
   1.800 +        dst -= del_len;
   1.801 +        memcpy(dst, &st->remap_string[string_offset], add_len);
   1.802 +        dst += add_len;
   1.803 +        total_changed++;
   1.804 +        if (offsetmap != NULL) {
   1.805 +          if (add_len > del_len) {
   1.806 +            offsetmap->Copy(src - copystart);
   1.807 +            offsetmap->Insert(add_len - del_len);
   1.808 +            copystart = src;
   1.809 +          } else if (add_len < del_len) {
   1.810 +            offsetmap->Copy(src - copystart + add_len - del_len);
   1.811 +            offsetmap->Delete(del_len - add_len);
   1.812 +            copystart = src;
   1.813 +          }
   1.814 +        }
   1.815 +        if (re->delete_bytes & kReplaceAndResumeFlag) {
   1.816 +          // There is a non-zero  target state at the end of the
   1.817 +          // replacement string
   1.818 +          e = st->remap_string[string_offset + add_len];
   1.819 +          Tbl = &Tbl_0[e << eshift];
   1.820 +          goto Do_state_table_newe;
   1.821 +        }
   1.822 +      }
   1.823 +      if (e == kExitRejectAlt) {break;}
   1.824 +      if (e != kExitSpecial) {goto Do_state_table;}
   1.825 +
   1.826 +    // case kExitSpecial:      // Apply special fixups [read: hacks]
   1.827 +      // In this routine, do either UTF8CharToLower()
   1.828 +      //   fullwidth/halfwidth mapping or
   1.829 +      //   voiced mapping or
   1.830 +      //   semi-voiced mapping
   1.831 +
   1.832 +      // First, do EXIT_REPLACE_OFFSET1 action (above)
   1.833 +      // Second: do additional code fixup
   1.834 +      {
   1.835 +        int srcdel = DoSpecialFixup(c, &src, srclimit, &dst, dstlimit);
   1.836 +        if (offsetmap != NULL) {
   1.837 +          if (srcdel != 0) {
   1.838 +            offsetmap->Copy(src - copystart - srcdel);
   1.839 +            offsetmap->Delete(srcdel);
   1.840 +            copystart = src;
   1.841 +          }
   1.842 +        }
   1.843 +      }
   1.844 +      goto Do_state_table;
   1.845 +
   1.846 +    case kExitIllegalStructure:   // structurally illegal byte; quit
   1.847 +    case kExitReject:             // NUL or illegal code encountered; quit
   1.848 +    case kExitRejectAlt:          // Apply replacement, then exit
   1.849 +    default:                      // and all other exits
   1.850 +      break;
   1.851 +    }   // End switch (e)
   1.852 +
   1.853 +    // Exit possibilities:
   1.854 +    //  Some other exit code, state0, back up one byte exactly
   1.855 +    //  Some other exit code, !state0, back up over last char
   1.856 +
   1.857 +    // Back up over exactly one byte of rejected/illegal UTF-8 character
   1.858 +    src--;
   1.859 +    dst--;
   1.860 +    // Back up more if needed
   1.861 +    if (!InStateZero(st, Tbl)) {
   1.862 +      do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
   1.863 +    }
   1.864 +  } else if (!InStateZero(st, Tbl)) {
   1.865 +    // src >= srclimit, !state0
   1.866 +    // Back up over truncated UTF-8 character
   1.867 +    e = kExitIllegalStructure;
   1.868 +    do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
   1.869 +  } else {
   1.870 +    // src >= srclimit, state0
   1.871 +    // Normal termination, source fully consumed
   1.872 +    e = kExitOK;
   1.873 +  }
   1.874 +
   1.875 +  if (offsetmap != NULL) {
   1.876 +    if (src > copystart) {
   1.877 +      offsetmap->Copy(src - copystart);
   1.878 +      copystart = src;
   1.879 +    }
   1.880 +  }
   1.881 +
   1.882 +  // Possible return values here:
   1.883 +  //  kExitDstSpaceFull         caller may realloc and retry from middle
   1.884 +  //  kExitIllegalStructure     caller may overwrite/truncate
   1.885 +  //  kExitOK                   all done and happy
   1.886 +  //  kExitReject               caller may overwrite/truncate
   1.887 +  //  kExitDoAgain              LOOP NOT DONE; caller must retry from middle
   1.888 +  //                            (may do fast ASCII loop first)
   1.889 +  //  kExitPlaceholder          -unused-
   1.890 +  //  kExitNone                 -unused-
   1.891 +  *bytes_consumed = src - isrc;
   1.892 +  *bytes_filled = dst - odst;
   1.893 +  *chars_changed = total_changed;
   1.894 +  return e;
   1.895 +}
   1.896 +
   1.897 +// TwoByte versions are needed for tables > 240 states, such
   1.898 +// as the table for full Unicode 4.1 canonical + compatibility mapping
   1.899 +
   1.900 +// Scan a UTF-8 stringpiece based on state table with two-byte entries,
   1.901 +//   copying to output stringpiece
   1.902 +//   and doing text replacements.
   1.903 +// DO NOT CALL DIRECTLY. Use UTF8GenericReplaceTwoByte() below,
   1.904 +//   which loops while this routine returns kExitDoAgain_2.
   1.905 +// is_plain_text picks the plaintext vs. HTML entry of a paired remap.
   1.906 +// offsetmap, when non-NULL, records the Copy/Insert/Delete edits made.
   1.907 +// Sets *bytes_consumed, *bytes_filled, *chars_changed; returns a _2 exit code.
   1.908 +// If the output is smaller than the input on entry, returns
   1.909 +//   kExitDstSpaceFull_2 immediately with all three counts left at zero.
   1.910 +static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st,
   1.911 +                    const StringPiece& istr,
   1.912 +                    StringPiece& ostr,
   1.913 +                    bool is_plain_text,
   1.914 +                    int* bytes_consumed,
   1.915 +                    int* bytes_filled,
   1.916 +                    int* chars_changed,
   1.917 +                    OffsetMap* offsetmap) {
   1.918 +  int eshift = st->entry_shift;
   1.919 +  int nEntries = (1 << eshift);       // 64 or 256 entries per state
   1.920 +  const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());
   1.921 +  const int ilen = istr.length();
   1.922 +  const uint8* copystart = isrc;      // input not yet reported to offsetmap
   1.923 +  const uint8* src = isrc;
   1.924 +  const uint8* srclimit = src + ilen;
   1.925 +  // Zero the outputs up front so the early return below reports no progress
   1.926 +  *bytes_consumed = 0;
   1.927 +  *bytes_filled = 0;
   1.928 +  *chars_changed = 0;
   1.929 +
   1.930 +  const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());
   1.931 +  const int olen = ostr.length();
   1.932 +  // The output bytes are written through ostr's data pointer
   1.933 +  uint8* dst = const_cast<uint8*>(odst);
   1.934 +  uint8* dstlimit = dst + olen;
   1.935 +
   1.936 +  int total_changed = 0;
   1.937 +
   1.938 +  // Invariant condition during replacements:
   1.939 +  //  remaining dst size >= remaining src size
   1.940 +  if ((dstlimit - dst) < (srclimit - src)) {
   1.941 +    if (offsetmap != NULL) {
   1.942 +      offsetmap->Copy(src - copystart);
   1.943 +      copystart = src;
   1.944 +    }
   1.945 +    return kExitDstSpaceFull_2;
   1.946 +  }
   1.947 +  const unsigned short* Tbl_0 = &st->state_table[st->state0];
   1.948 +
   1.949 + Do_state_table_2:
   1.950 +  // Do state-table scan, copying as we go
   1.951 +  const unsigned short* Tbl = Tbl_0;
   1.952 +  int e = 0;
   1.953 +  uint8 c = 0;
   1.954 +
   1.955 + Do_state_table_newe_2:
   1.956 +  // Re-entry point after a replace-and-resume: Tbl is already set
   1.957 +
   1.958 +  //----------------------------
   1.959 +  while (src < srclimit) {
   1.960 +    c = *src;
   1.961 +    e = Tbl[c];
   1.962 +    *dst = c;
   1.963 +    src++;
   1.964 +    dst++;
   1.965 +    if (e >= kExitIllegalStructure_2) {break;}
   1.966 +    Tbl = &Tbl_0[e << eshift];
   1.967 +  }
   1.968 +  //----------------------------
   1.969 +
   1.970 +  // Exit possibilities:
   1.971 +  //  Replacement code, do the replacement and loop
   1.972 +  //  Some other exit code, state0, back up one byte exactly
   1.973 +  //  Some other exit code, !state0, back up over last char
   1.974 +  //  source consumed, state0, exit OK
   1.975 +  //  source consumed, !state0, back up over partial char
   1.976 +  // For illegal byte in state0, avoid backing up over PREVIOUS char
   1.977 +  // For truncated last char, back up to beginning of it
   1.978 +
   1.979 +  if (e >= kExitIllegalStructure_2) {
   1.980 +    // Switch on exit code; most loop back to top
   1.981 +    int offset = 0;                   // index into st->remap_base
   1.982 +    switch (e) {
   1.983 +    // These all make the output string the same size or shorter
   1.984 +    // No checking needed
   1.985 +    case kExitReplace31_2:    // del 2 bytes, rewrite the 1 that remains
   1.986 +      dst -= 2;
   1.987 +      if (offsetmap != NULL) {
   1.988 +        offsetmap->Copy(src - copystart - 2);
   1.989 +        offsetmap->Delete(2);
   1.990 +        copystart = src;
   1.991 +      }
   1.992 +      dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
   1.993 +      total_changed++;
   1.994 +      goto Do_state_table_2;
   1.995 +    case kExitReplace32_2:    // del 1 byte, rewrite the 2 that remain
   1.996 +      dst--;
   1.997 +      if (offsetmap != NULL) {
   1.998 +        offsetmap->Copy(src - copystart - 1);
   1.999 +        offsetmap->Delete(1);
  1.1000 +        copystart = src;
  1.1001 +      }
  1.1002 +      dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);
  1.1003 +      dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
  1.1004 +      total_changed++;
  1.1005 +      goto Do_state_table_2;
  1.1006 +    case kExitReplace21_2:    // del 1 byte, rewrite the 1 that remains
  1.1007 +      dst--;
  1.1008 +      if (offsetmap != NULL) {
  1.1009 +        offsetmap->Copy(src - copystart - 1);
  1.1010 +        offsetmap->Delete(1);
  1.1011 +        copystart = src;
  1.1012 +      }
  1.1013 +      dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
  1.1014 +      total_changed++;
  1.1015 +      goto Do_state_table_2;
  1.1016 +    case kExitReplace3_2:    // update 3 bytes to change
  1.1017 +      dst[-3] = (unsigned char)(Tbl[c + (nEntries * 2)] & 0xff);
  1.1018 +      // Fall into next case
  1.1019 +    case kExitReplace2_2:    // update 2 bytes to change
  1.1020 +      dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);
  1.1021 +      // Fall into next case
  1.1022 +    case kExitReplace1_2:    // update 1 byte to change
  1.1023 +      dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
  1.1024 +      total_changed++;
  1.1025 +      goto Do_state_table_2;
  1.1026 +    case kExitReplace1S0_2:     // update 1 byte to change, 256-entry state
  1.1027 +      dst[-1] = (unsigned char)(Tbl[c + (256 * 1)] & 0xff);
  1.1028 +      total_changed++;
  1.1029 +      goto Do_state_table_2;
  1.1030 +    // These can make the output string longer than the input
  1.1031 +    case kExitReplaceOffset2_2:
  1.1032 +      if ((nEntries != 256) && InStateZero_2(st, Tbl)) {
  1.1033 +        // For space-optimized table, we need multiples of 256 bytes
  1.1034 +        // in state0 and multiples of nEntries in other states
  1.1035 +        offset += ((unsigned char)(Tbl[c + (256 * 1)] >> 8 & 0xff) << 8);
  1.1036 +      } else {
  1.1037 +        offset += ((unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff) << 8);
  1.1038 +      }
  1.1039 +      // Fall into next case
  1.1040 +    case kExitReplaceOffset1_2:
  1.1041 +      if ((nEntries != 256) && InStateZero_2(st, Tbl)) {
  1.1042 +        // For space-optimized table, we need multiples of 256 bytes
  1.1043 +        // in state0 and multiples of nEntries in other states
  1.1044 +        offset += (unsigned char)(Tbl[c + (256 * 1)] & 0xff);
  1.1045 +      } else {
  1.1046 +        offset += (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
  1.1047 +      }
  1.1048 +      {
  1.1049 +        const RemapEntry* re = &st->remap_base[offset];
  1.1050 +        int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;
  1.1051 +        int add_len = re->add_bytes & ~kHtmlPlaintextFlag;
  1.1052 +        // Special-case non-HTML replacement of five sensitive entities
  1.1053 +        //   &quot; &amp; &apos; &lt; &gt;
  1.1054 +        //   0022   0026  0027   003c 003e
  1.1055 +        // A replacement creating one of these is expressed as a pair of
  1.1056 +        // entries, one for HTML output and one for plaintext output.
  1.1057 +        // The first of the pair has the high bit of add_bytes set.
  1.1058 +        if (re->add_bytes & kHtmlPlaintextFlag) {
  1.1059 +          // Use this entry for plain text
  1.1060 +          if (!is_plain_text) {
  1.1061 +            // Use very next entry for HTML text (same back/delete length)
  1.1062 +            re = &st->remap_base[offset + 1];
  1.1063 +            add_len = re->add_bytes & ~kHtmlPlaintextFlag;
  1.1064 +          }
  1.1065 +        }
  1.1066 +
  1.1067 +        // Where the replacement bytes start within st->remap_string
  1.1068 +        int string_offset = re->bytes_offset;
  1.1069 +        // After the replacement, need (dstlimit - newdst) >= (srclimit - src)
  1.1070 +        uint8* newdst = dst - del_len + add_len;
  1.1071 +        if ((dstlimit - newdst) < (srclimit - src)) {
  1.1072 +          // Won't fit; don't do the replacement. Caller may realloc and retry
  1.1073 +          e = kExitDstSpaceFull_2;
  1.1074 +          break;    // exit, backing up over this char for later retry
  1.1075 +        }
  1.1076 +        dst -= del_len;
  1.1077 +        memcpy(dst, &st->remap_string[string_offset], add_len);
  1.1078 +        dst += add_len;
  1.1079 +        if (offsetmap != NULL) {
  1.1080 +          if (add_len > del_len) {
  1.1081 +            offsetmap->Copy(src - copystart);
  1.1082 +            offsetmap->Insert(add_len - del_len);
  1.1083 +            copystart = src;
  1.1084 +          } else if (add_len < del_len) {
  1.1085 +            offsetmap->Copy(src - copystart + add_len - del_len);
  1.1086 +            offsetmap->Delete(del_len - add_len);
  1.1087 +            copystart = src;
  1.1088 +          }
  1.1089 +        }
  1.1090 +        if (re->delete_bytes & kReplaceAndResumeFlag) {
  1.1091 +          // There is a two-byte non-zero target state at the end of the
  1.1092 +          // replacement string
  1.1093 +          uint8 c1 = st->remap_string[string_offset + add_len];
  1.1094 +          uint8 c2 = st->remap_string[string_offset + add_len + 1];
  1.1095 +          e = (c1 << 8) | c2;
  1.1096 +          Tbl = &Tbl_0[e << eshift];
  1.1097 +          total_changed++;
  1.1098 +          goto Do_state_table_newe_2;
  1.1099 +        }
  1.1100 +      }
  1.1101 +      total_changed++;
  1.1102 +      if (e == kExitRejectAlt_2) {break;}
  1.1103 +      goto Do_state_table_2;
  1.1104 +
  1.1105 +    case kExitSpecial_2:           // NO special fixups [read: hacks]
  1.1106 +    case kExitIllegalStructure_2:  // structurally illegal byte; quit
  1.1107 +    case kExitReject_2:            // NUL or illegal code encountered; quit
  1.1108 +                                   // and all other exits
  1.1109 +    default:
  1.1110 +      break;
  1.1111 +    }   // End switch (e)
  1.1112 +
  1.1113 +    // Exit possibilities:
  1.1114 +    //  Some other exit code, state0, back up one byte exactly
  1.1115 +    //  Some other exit code, !state0, back up over last char
  1.1116 +
  1.1117 +    // Back up over exactly one byte of rejected/illegal UTF-8 character
  1.1118 +    src--;
  1.1119 +    dst--;
  1.1120 +    // Back up more if needed
  1.1121 +    if (!InStateZero_2(st, Tbl)) {
  1.1122 +      do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
  1.1123 +    }
  1.1124 +  } else if (!InStateZero_2(st, Tbl)) {
  1.1125 +    // src >= srclimit, !state0
  1.1126 +    // Back up over truncated UTF-8 character
  1.1127 +    e = kExitIllegalStructure_2;
  1.1128 +
  1.1129 +    do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
  1.1130 +  } else {
  1.1131 +    // src >= srclimit, state0
  1.1132 +    // Normal termination, source fully consumed
  1.1133 +    e = kExitOK_2;
  1.1134 +  }
  1.1135 +
  1.1136 +  if (offsetmap != NULL) {
  1.1137 +    if (src > copystart) {
  1.1138 +      offsetmap->Copy(src - copystart);
  1.1139 +      copystart = src;
  1.1140 +    }
  1.1141 +  }
  1.1142 +
  1.1143 +
  1.1144 +  // Possible return values here:
  1.1145 +  //  kExitDstSpaceFull_2         caller may realloc and retry from middle
  1.1146 +  //  kExitIllegalStructure_2     caller may overwrite/truncate
  1.1147 +  //  kExitOK_2                   all done and happy
  1.1148 +  //  kExitReject_2               caller may overwrite/truncate
  1.1149 +  //  kExitDoAgain_2              LOOP NOT DONE; caller must retry from middle
  1.1150 +  //                            (may do fast ASCII loop first)
  1.1151 +  //  kExitPlaceholder_2          -unused-
  1.1152 +  //  kExitNone_2                 -unused-
  1.1153 +  *bytes_consumed = src - isrc;
  1.1154 +  *bytes_filled = dst - odst;
  1.1155 +  *chars_changed = total_changed;
  1.1156 +  return e;
  1.1157 +}
  1.1158 +
  1.1159 +
  1.1160 +// Run the one-byte state table over istr, copying bytes into ostr's buffer
  1.1161 +//   and applying the table's text replacements along the way.
  1.1162 +// offsetmap is optional; pass NULL when no OffsetMap should be written.
  1.1163 +// Repeats the internal scan for as long as it reports kExitDoAgain,
  1.1164 +//   resuming each pass where the previous one stopped.
  1.1165 +// Stores the summed consumed/filled/changed counts; returns the last exit code.
  1.1166 +int UTF8GenericReplace(const UTF8ReplaceObj* st,
  1.1167 +                    const StringPiece& istr,
  1.1168 +                    StringPiece& ostr,
  1.1169 +                    bool is_plain_text,
  1.1170 +                    int* bytes_consumed,
  1.1171 +                    int* bytes_filled,
  1.1172 +                    int* chars_changed,
  1.1173 +                    OffsetMap* offsetmap) {
  1.1174 +  // Working views that shrink from the front after every pass
  1.1175 +  StringPiece src_left(istr.data(), istr.length());
  1.1176 +  StringPiece dst_left(ostr.data(), ostr.length());
  1.1177 +  int sum_consumed = 0, sum_filled = 0, sum_changed = 0;
  1.1178 +  int exit_code;
  1.1179 +  for (;;) {
  1.1180 +    int pass_consumed, pass_filled, pass_changed;
  1.1181 +    exit_code = UTF8GenericReplaceInternal(st, src_left, dst_left,
  1.1182 +                    is_plain_text,
  1.1183 +                    &pass_consumed, &pass_filled, &pass_changed,
  1.1184 +                    offsetmap);
  1.1185 +    src_left.remove_prefix(pass_consumed);
  1.1186 +    dst_left.remove_prefix(pass_filled);
  1.1187 +    sum_consumed += pass_consumed;
  1.1188 +    sum_filled += pass_filled;
  1.1189 +    sum_changed += pass_changed;
  1.1190 +    // Anything other than kExitDoAgain ends the loop
  1.1191 +    if (exit_code != kExitDoAgain) {break;}
  1.1192 +  }
  1.1193 +  *bytes_consumed = sum_consumed;
  1.1194 +  *bytes_filled = sum_filled;
  1.1195 +  *chars_changed = sum_changed;
  1.1196 +  return exit_code;
  1.1197 +}
  1.1198 +
  1.1199 +// Legacy overload taking no OffsetMap.
  1.1200 +// Forwards to the full version with a NULL offsetmap.
  1.1201 +int UTF8GenericReplace(const UTF8ReplaceObj* st,
  1.1202 +                    const StringPiece& istr,
  1.1203 +                    StringPiece& ostr,
  1.1204 +                    bool is_plain_text,
  1.1205 +                    int* bytes_consumed,
  1.1206 +                    int* bytes_filled,
  1.1207 +                    int* chars_changed) {
  1.1208 +  OffsetMap* const no_offsetmap = NULL;
  1.1209 +  return UTF8GenericReplace(st, istr, ostr,
  1.1210 +                    is_plain_text,
  1.1211 +                    bytes_consumed,
  1.1212 +                    bytes_filled,
  1.1213 +                    chars_changed,
  1.1214 +                    no_offsetmap);
  1.1215 +}
  1.1216 +
  1.1217 +// Oldest overload: takes neither is_plain_text nor offsetmap.
  1.1218 +// Forwards to the full version with is_plain_text = false and a NULL
  1.1219 +//   offsetmap.
  1.1220 +int UTF8GenericReplace(const UTF8ReplaceObj* st,
  1.1221 +                    const StringPiece& istr,
  1.1222 +                    StringPiece& ostr,
  1.1223 +                    int* bytes_consumed,
  1.1224 +                    int* bytes_filled,
  1.1225 +                    int* chars_changed) {
  1.1226 +  // Fixed arguments are annotated inline; the rest pass straight through
  1.1227 +  return UTF8GenericReplace(st, istr, ostr,
  1.1228 +                    false,              // is_plain_text
  1.1229 +                    bytes_consumed,
  1.1230 +                    bytes_filled,
  1.1231 +                    chars_changed,
  1.1232 +                    NULL);              // offsetmap
  1.1233 +}
  1.1234 +
  1.1235 +// Two-byte-table counterpart of UTF8GenericReplace(): runs the two-byte
  1.1236 +//   state table over istr, copying into ostr's buffer with replacements.
  1.1237 +// offsetmap is optional; pass NULL when no OffsetMap should be written.
  1.1238 +// Repeats the internal scan for as long as it reports kExitDoAgain_2,
  1.1239 +//   resuming each pass where the previous one stopped.
  1.1240 +// Stores the summed consumed/filled/changed counts. The returned code is
  1.1241 +//   the last _2 exit code shifted by (kExitOK - kExitOK_2).
  1.1242 +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
  1.1243 +                    const StringPiece& istr,
  1.1244 +                    StringPiece& ostr,
  1.1245 +                    bool is_plain_text,
  1.1246 +                    int* bytes_consumed,
  1.1247 +                    int* bytes_filled,
  1.1248 +                    int* chars_changed,
  1.1249 +                    OffsetMap* offsetmap) {
  1.1250 +  // Working views that shrink from the front after every pass
  1.1251 +  StringPiece src_left(istr.data(), istr.length());
  1.1252 +  StringPiece dst_left(ostr.data(), ostr.length());
  1.1253 +  int sum_consumed = 0, sum_filled = 0, sum_changed = 0;
  1.1254 +  int exit_code_2;
  1.1255 +  for (;;) {
  1.1256 +    int pass_consumed, pass_filled, pass_changed;
  1.1257 +    exit_code_2 = UTF8GenericReplaceInternalTwoByte(st, src_left, dst_left,
  1.1258 +                    is_plain_text,
  1.1259 +                    &pass_consumed, &pass_filled, &pass_changed,
  1.1260 +                    offsetmap);
  1.1261 +    src_left.remove_prefix(pass_consumed);
  1.1262 +    dst_left.remove_prefix(pass_filled);
  1.1263 +    sum_consumed += pass_consumed;
  1.1264 +    sum_filled += pass_filled;
  1.1265 +    sum_changed += pass_changed;
  1.1266 +    // Anything other than kExitDoAgain_2 ends the loop
  1.1267 +    if (exit_code_2 != kExitDoAgain_2) {break;}
  1.1268 +  }
  1.1269 +  *bytes_consumed = sum_consumed;
  1.1270 +  *bytes_filled = sum_filled;
  1.1271 +  *chars_changed = sum_changed;
  1.1272 +
  1.1273 +  // Shift the code so that kExitOK_2 comes back to the caller as kExitOK
  1.1274 +  return exit_code_2 - kExitOK_2 + kExitOK;
  1.1275 +}
  1.1276 +
  1.1277 +// Legacy two-byte overload taking no OffsetMap.
  1.1278 +// Forwards to the full version with a NULL offsetmap.
  1.1279 +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
  1.1280 +                    const StringPiece& istr,
  1.1281 +                    StringPiece& ostr,
  1.1282 +                    bool is_plain_text,
  1.1283 +                    int* bytes_consumed,
  1.1284 +                    int* bytes_filled,
  1.1285 +                    int* chars_changed) {
  1.1286 +  OffsetMap* const no_offsetmap = NULL;
  1.1287 +  return UTF8GenericReplaceTwoByte(st, istr, ostr,
  1.1288 +                    is_plain_text,
  1.1289 +                    bytes_consumed,
  1.1290 +                    bytes_filled,
  1.1291 +                    chars_changed,
  1.1292 +                    no_offsetmap);
  1.1293 +}
  1.1294 +
  1.1295 +// Oldest two-byte overload: takes neither is_plain_text nor offsetmap.
  1.1296 +// Forwards to the full version with is_plain_text = false and a NULL
  1.1297 +//   offsetmap.
  1.1298 +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
  1.1299 +                    const StringPiece& istr,
  1.1300 +                    StringPiece& ostr,
  1.1301 +                    int* bytes_consumed,
  1.1302 +                    int* bytes_filled,
  1.1303 +                    int* chars_changed) {
  1.1304 +  // Fixed arguments are annotated inline; the rest pass straight through
  1.1305 +  return UTF8GenericReplaceTwoByte(st, istr, ostr,
  1.1306 +                    false,              // is_plain_text
  1.1307 +                    bytes_consumed,
  1.1308 +                    bytes_filled,
  1.1309 +                    chars_changed,
  1.1310 +                    NULL);              // offsetmap
  1.1311 +}
  1.1312 +
  1.1313 +
  1.1314 +
  1.1315 +// Trim a stringpiece in place so that it covers only complete UTF-8
  1.1316 +// characters: a trailing partial character or stray continuation bytes are
  1.1317 +// dropped from the end, then leading continuation bytes are dropped from the
  1.1318 +// front. A piece consisting solely of continuation bytes becomes empty.
  1.1319 +void UTF8TrimToChars(StringPiece* istr) {
  1.1320 +  const char* src = istr->data();
  1.1321 +  int len = istr->length();
  1.1322 +  // Exit if empty string
  1.1323 +  if (len == 0) {return;}
  1.1324 +
  1.1325 +  // Exit on simple, common case
  1.1326 +  if ( ((src[0] & 0xc0) != 0x80) &&
  1.1327 +       (static_cast<signed char>(src[len - 1]) >= 0) ) {
  1.1328 +    // First byte is not a continuation and last byte is 7-bit ASCII -- done
  1.1329 +    return;
  1.1330 +  }
  1.1331 +
  1.1332 +  // Adjust the back end, len > 0
  1.1333 +  const char* srclimit = src + len;
  1.1334 +  // Backscan over any ending continuation bytes. s stays inside
  1.1335 +  // [src, srclimit]: it stops one past the last non-continuation byte, or at
  1.1336 +  // src when every byte is a continuation byte.
  1.1337 +  const char* s = srclimit;
  1.1338 +  while ((src < s) && ((s[-1] & 0xc0) == 0x80)) {
  1.1339 +    s--;
  1.1340 +  }
  1.1341 +  if (src < s) {
  1.1342 +    // s[-1] starts the last character; keep it only if all of it is present
  1.1343 +    const char* last_start = s - 1;
  1.1344 +    int last_char_len = UTF8OneCharLen(last_start);
  1.1345 +    if (last_char_len <= (srclimit - last_start)) {
  1.1346 +      s = last_start + last_char_len;
  1.1347 +    } else {
  1.1348 +      s = last_start;
  1.1349 +    }
  1.1350 +  }
  1.1351 +  if (s != srclimit) {
  1.1352 +    // s is one byte beyond the last full character, if any.
  1.1353 +    // srclimit - s is at most len, so the length cannot drop below zero.
  1.1354 +    istr->remove_suffix(srclimit - s);
  1.1355 +    // Exit if now empty string
  1.1356 +    if (istr->length() == 0) {return;}
  1.1357 +  }
  1.1358 +
  1.1359 +  // Adjust the front end, len > 0
  1.1360 +  len = istr->length();
  1.1361 +  srclimit = src + len;
  1.1362 +  s = src;                            // First byte of the string
  1.1363 +  // Scan over any beginning continuation bytes to find first char start
  1.1364 +  while ((s < srclimit) && ((*s & 0xc0) == 0x80)) {
  1.1365 +    s++;
  1.1366 +  }
  1.1367 +  if (s != src) {
  1.1368 +    istr->remove_prefix(s - src);
  1.1369 +  }
  1.1370 +}
  1.1371 +
  1.1372 +}       // End namespace CLD2

mercurial