michael@0: // Copyright 2013 Google Inc. All Rights Reserved.
michael@0: //
michael@0: // Licensed under the Apache License, Version 2.0 (the "License");
michael@0: // you may not use this file except in compliance with the License.
michael@0: // You may obtain a copy of the License at
michael@0: //
michael@0: //     http://www.apache.org/licenses/LICENSE-2.0
michael@0: //
michael@0: // Unless required by applicable law or agreed to in writing, software
michael@0: // distributed under the License is distributed on an "AS IS" BASIS,
michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0: // See the License for the specific language governing permissions and
michael@0: // limitations under the License.
michael@0: 
michael@0: //
michael@0: // State Table follower for scanning UTF-8 strings without converting to
michael@0: // 32- or 16-bit Unicode values.
michael@0: //
michael@0: 
michael@0: #ifdef COMPILER_MSVC
michael@0: // MSVC warns: warning C4309: 'initializing' : truncation of constant value
michael@0: // But the value is in fact not truncated.  0xFF still comes out 0xFF at
michael@0: // runtime.
michael@0: #pragma warning ( disable : 4309 )
michael@0: #endif
michael@0: 
michael@0: #include "utf8statetable.h"
michael@0: 
michael@0: #include <stdint.h>                     // for uintptr_t
michael@0: #include <string.h>                     // for NULL, memcpy, memmove
michael@0: 
michael@0: #include "integral_types.h"        // for uint8, uint32, int8
michael@0: #include "stringpiece.h"
michael@0: #include "offsetmap.h"
michael@0: 
michael@0: 
michael@0: namespace CLD2 {
michael@0: 
michael@0: static const int kReplaceAndResumeFlag = 0x80; // Bit in del byte that marks an
michael@0:                                                // optional next-state field
michael@0:                                                // after replacement text
michael@0: static const int kHtmlPlaintextFlag = 0x80;    // Bit in add byte that distinguishes
michael@0:                                                // HTML replacement vs. plaintext
michael@0: 
michael@0: 
michael@0: /**
michael@0:  * This code implements a little interpreter for UTF8 state
michael@0:  * tables. There are three kinds of quite-similar state tables,
michael@0:  * property, scanning, and replacement. Each state in one of
michael@0:  * these tables consists of an array of 256 or 64 one-byte
michael@0:  * entries. The state is subscripted by an incoming source byte,
michael@0:  * and the entry either specifies the next state or specifies an
michael@0:  * action. Space-optimized tables have full 256-entry states for
michael@0:  * the first byte of a UTF-8 character, but only 64-entry states
michael@0:  * for continuation bytes. Space-optimized tables may only be
michael@0:  * used with source input that has been checked to be
michael@0:  * structurally- (or stronger interchange-) valid.
michael@0:  *
michael@0:  * A property state table has an unsigned one-byte property for
michael@0:  * each possible UTF-8 character. One-byte character properties
michael@0:  * are in the state[0] array, while for other lengths the
michael@0:  * state[0] array gives the next state, which contains the
michael@0:  * property value for two-byte characters or yet another state
michael@0:  * for longer ones. The code simply loads the right number of
michael@0:  * next-state values, then returns the final byte as property
michael@0:  * value. There are no actions specified in property tables.
michael@0:  * States are typically shared for multi-byte UTF-8 characters
michael@0:  * that all have the same property value.
michael@0:  *
michael@0:  * A scanning state table has entries that are either a
michael@0:  * next-state specifier for bytes that are accepted by the
michael@0:  * scanner, or an exit action for the last byte of each
michael@0:  * character that is rejected by the scanner.
michael@0:  *
michael@0:  * Scanning long strings involves a tight loop that picks up one
michael@0:  * byte at a time and follows next-state value back to state[0]
michael@0:  * for each accepted UTF-8 character. Scanning stops at the end
michael@0:  * of the string or at the first character encountered that has
michael@0:  * an exit action such as "reject". Timing information is given
michael@0:  * below.
michael@0:  *
michael@0:  * Since so much of Google's text is 7-bit-ASCII values
michael@0:  * (approximately 94% of the bytes of web documents), the
michael@0:  * scanning interpreter has two speed optimizations. One checks
michael@0:  * 8 bytes at a time to see if they are all in the range lo..hi,
michael@0:  * as specified in constants in the overall statetable object.
michael@0:  * The check involves ORing together four 4-byte values that
michael@0:  * overflow into the high bit of some byte when a byte is out of
michael@0:  * range. For seven-bit-ASCII, lo is 0x20 and hi is 0x7E. This
michael@0:  * loop is about 8x faster than the one-byte-at-a-time loop.
michael@0:  *
michael@0:  * If checking for exit bytes in the 0x00-0x1F and 7F range is
michael@0:  * unneeded, an even faster loop just looks at the high bits of
michael@0:  * 8 bytes at once, and is about 1.33x faster than the lo..hi
michael@0:  * loop.
michael@0:  *
michael@0:  * Exit from the scanning routines backs up to the first byte of
michael@0:  * the rejected character, so the text spanned is always a
michael@0:  * complete number of UTF-8 characters. The normal scanning exit
michael@0:  * is at the first rejected character, or at the end of the
michael@0:  * input text. Scanning also exits on any detected ill-formed
michael@0:  * character or at a special do-again action built into some
michael@0:  * exit-optimized tables. The do-again action gets back to the
michael@0:  * top of the scanning loop to retry eight-byte ASCII scans. It
michael@0:  * is typically put into state tables after four seven-bit-ASCII
michael@0:  * characters in a row are seen, to allow restarting the fast
michael@0:  * scan after some slower processing of multi-byte characters.
michael@0:  *
michael@0:  * A replacement state table is similar to a scanning state
michael@0:  * table but has more extensive actions. The default
michael@0:  * byte-at-a-time loop copies one byte from source to
michael@0:  * destination and goes to the next state. The replacement
michael@0:  * actions overwrite 1-3 bytes of the destination with different
michael@0:  * bytes, possibly shortening the output by 1 or 2 bytes. The
michael@0:  * replacement bytes come from within the state table, from
michael@0:  * dummy states inserted just after any state that contains a
michael@0:  * replacement action. This gives a quick address calculation for
michael@0:  * the replacement byte(s) and gives some cache locality.
michael@0:  *
michael@0:  * Additional replacement actions use one or two bytes from
michael@0:  * within dummy states to index a side table of more-extensive
michael@0:  * replacements. The side table specifies a length of 0..15
michael@0:  * destination bytes to overwrite and a length of 0..127 bytes
michael@0:  * to overwrite them with, plus the actual replacement bytes.
michael@0:  *
michael@0:  * This side table uses one extra bit to specify a pair of
michael@0:  * replacements, the first to be used in an HTML context and the
michael@0:  * second to be used in a plaintext context. This allows
michael@0:  * replacements that are spelled with "&lt;" in the former
michael@0:  * context and "<" in the latter.
michael@0:  *
michael@0:  * The side table also uses an extra bit to specify a non-zero
michael@0:  * next state after a replacement. This allows a combination
michael@0:  * replacement and state change, used to implement a limited
michael@0:  * version of the Boyer-Moore algorithm for multi-character
michael@0:  * replacement without backtracking. This is useful when there
michael@0:  * are overlapping replacements, such as ch => x and also c =>
michael@0:  * y, the latter to be used only if the character after c is not
michael@0:  * h. In this case, the state[0] table's entry for c would
michael@0:  * change c to y and also have a next-state of say n, and the
michael@0:  * state[n] entry for h would specify a replacement of the two
michael@0:  * bytes yh by x. No backtracking is needed.
michael@0:  *
michael@0:  * A replacement table may also include the exit actions of a
michael@0:  * scanning state table, so some character sequences can
michael@0:  * terminate early.
michael@0:  *
michael@0:  * During replacement, an optional data structure called an
michael@0:  * offset map can be updated to reflect each change in length
michael@0:  * between source and destination. This offset map can later be
michael@0:  * used to map destination-string offsets to corresponding
michael@0:  * source-string offsets or vice versa.
michael@0:  *
michael@0:  * The routines below also have variants in which state-table
michael@0:  * entries are all two bytes instead of one byte. This allows
michael@0:  * tables with more than 240 total states, but takes up twice as
michael@0:  * much space per state.
michael@0:  *
michael@0: **/
michael@0: 
michael@0: // Tell whether Tbl points into the state0 portion of st's state table.
michael@0: // The pointer distance is converted to unsigned, so a pointer that lies
michael@0: // below the start of state0 wraps to a large value and is rejected by the
michael@0: // same single comparison that rejects a pointer past the end.
michael@0: static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
michael@0:   const uint8* const state0_begin = st->state_table + st->state0;
michael@0:   const uint32 distance = static_cast<uint32>(Tbl - state0_begin);
michael@0:   return distance < st->state0_size;
michael@0: }
michael@0: 
michael@0: // Two-byte-entry counterpart of InStateZero: tell whether Tbl points into
michael@0: // the state0 portion of st's state table. The distance is counted in
michael@0: // table entries (16-bit words), not in bytes.
michael@0: static inline bool InStateZero_2(const UTF8ReplaceObj_2* st,
michael@0:                                  const unsigned short int* Tbl) {
michael@0:   const unsigned short int* const state0_begin =
michael@0:       st->state_table + st->state0;
michael@0:   const uint32 distance = static_cast<uint32>(Tbl - state0_begin);
michael@0:   return distance < st->state0_size;
michael@0: }
michael@0: 
michael@0: // UTF8PropObj, UTF8ScanObj, UTF8ReplaceObj are all typedefs of
michael@0: // UTF8StateMachineObj.
michael@0: 
michael@0: // A property table is recognized by having neither a fast_state table nor
michael@0: // any output expansion (max_expand == 0).
michael@0: static bool IsPropObj(const UTF8StateMachineObj& obj) {
michael@0:   const bool has_fast_state = (obj.fast_state != NULL);
michael@0:   const bool expands = (obj.max_expand != 0);
michael@0:   return !(has_fast_state || expands);
michael@0: }
michael@0: 
michael@0: // Two-byte-entry counterpart of IsPropObj: true when the object has neither
michael@0: // a fast_state table nor any output expansion (max_expand == 0).
michael@0: static bool IsPropObj_2(const UTF8StateMachineObj_2& obj) {
michael@0:   const bool has_fast_state = (obj.fast_state != NULL);
michael@0:   const bool expands = (obj.max_expand != 0);
michael@0:   return !(has_fast_state || expands);
michael@0: }
michael@0: 
michael@0: // A scanning table is recognized by having a fast_state table while doing
michael@0: // no output expansion (max_expand == 0).
michael@0: static bool IsScanObj(const UTF8StateMachineObj& obj) {
michael@0:   if (obj.fast_state == NULL) {
michael@0:     return false;
michael@0:   }
michael@0:   return obj.max_expand == 0;
michael@0: }
michael@0: 
michael@0: // A replacement table is recognized solely by a positive max_expand.
michael@0: // fast_state is deliberately not examined: it is normally non-NULL, but the
michael@0: // handwritten tables in utf8statetable_unittest don't handle fast_states.
michael@0: static bool IsReplaceObj(const UTF8StateMachineObj& obj) {
michael@0:   const bool expands = (obj.max_expand > 0);
michael@0:   return expands;
michael@0: }
michael@0: 
michael@0: // Two-byte-entry counterpart of IsReplaceObj: true when max_expand is
michael@0: // positive.
michael@0: static bool IsReplaceObj_2(const UTF8StateMachineObj_2& obj) {
michael@0:   const bool expands = (obj.max_expand > 0);
michael@0:   return expands;
michael@0: }
michael@0: 
michael@0: // Look up the property byte of the UTF-8 character at *src and step over it.
michael@0: //   st      property state table to follow
michael@0: //   src     in/out: advanced by the number of bytes stepped over
michael@0: //   srclen  in/out: remaining byte count, reduced by the same amount
michael@0: // Returns 0 and leaves *src / *srclen untouched when *srclen <= 0.
michael@0: // Returns 0 and steps over exactly one byte when the first byte is not a
michael@0: // UTF-8 lead byte, or when the character would extend past *srclen.
michael@0: // Continuation bytes are used as table indexes without being validated.
michael@0: uint8 UTF8GenericProperty(const UTF8PropObj* st,
michael@0:                           const uint8** src,
michael@0:                           int* srclen) {
michael@0:   const int avail = *srclen;
michael@0:   if (avail <= 0) {
michael@0:     return 0;
michael@0:   }
michael@0: 
michael@0:   const uint8* const text = *src;
michael@0:   const uint8* const table = &st->state_table[st->state0];
michael@0:   const int shift = st->entry_shift;
michael@0:   const unsigned char lead = text[0];
michael@0: 
michael@0:   // Classify the lead byte. The bit patterns are mutually exclusive, so the
michael@0:   // order of the tests only matters for speed (7-bit ASCII first).
michael@0:   int char_len = 0;                       // 0 means not a lead byte
michael@0:   if (lead < 0x80) {
michael@0:     char_len = 1;
michael@0:   } else if ((lead & 0xe0) == 0xc0) {
michael@0:     char_len = 2;
michael@0:   } else if ((lead & 0xf0) == 0xe0) {
michael@0:     char_len = 3;
michael@0:   } else if ((lead & 0xf8) == 0xf0) {
michael@0:     char_len = 4;
michael@0:   }
michael@0: 
michael@0:   // Ill-formed or truncated: skip a single byte and report property 0.
michael@0:   if ((char_len == 0) || (char_len > avail)) {
michael@0:     *src = text + 1;
michael@0:     *srclen = avail - 1;
michael@0:     return 0;
michael@0:   }
michael@0: 
michael@0:   // The lead byte indexes state0; each following byte indexes the state
michael@0:   // named by the previous entry. The last entry read is the property.
michael@0:   int entry = table[lead];
michael@0:   for (int i = 1; i < char_len; ++i) {
michael@0:     entry = table[(entry << shift) + text[i]];
michael@0:   }
michael@0: 
michael@0:   *src = text + char_len;
michael@0:   *srclen = avail - char_len;
michael@0:   return static_cast<uint8>(entry);
michael@0: }
michael@0: 
michael@0: // Tell whether the UTF-8 character starting at src has a non-zero property
michael@0: // in table st.
michael@0: // No length is passed in and nothing is validated: up to four bytes are
michael@0: // read starting at src, and any first byte that does not match the one-,
michael@0: // two- or three-byte patterns is handled as the start of a four-byte
michael@0: // character.
michael@0: bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src) {
michael@0:   const uint8* const text = reinterpret_cast<const uint8*>(src);
michael@0:   const uint8* const table = &st.state_table[st.state0];
michael@0:   const int shift = st.entry_shift;
michael@0:   const unsigned char lead = text[0];
michael@0: 
michael@0:   // Number of bytes to run through the table, 7-bit ASCII tested first.
michael@0:   int char_len = 4;                       // default: everything else
michael@0:   if (lead < 0x80) {
michael@0:     char_len = 1;
michael@0:   } else if ((lead & 0xe0) == 0xc0) {
michael@0:     char_len = 2;
michael@0:   } else if ((lead & 0xf0) == 0xe0) {
michael@0:     char_len = 3;
michael@0:   }
michael@0: 
michael@0:   // The lead byte indexes state0; each following byte indexes the state
michael@0:   // named by the previous entry. The last entry read is the property.
michael@0:   int entry = table[lead];
michael@0:   for (int i = 1; i < char_len; ++i) {
michael@0:     entry = table[(entry << shift) + text[i]];
michael@0:   }
michael@0:   return entry != 0;
michael@0: }
michael@0: 
michael@0: 
michael@0: // BigOneByte versions are needed for tables > 240 states, but most
michael@0: // won't need the TwoByte versions.
michael@0: // Internally, to next-to-last offset is multiplied by 16 and the last
michael@0: // offset is relative instead of absolute.
michael@0: // Look up property of one UTF-8 character and advance over it
michael@0: // Return 0 if input length is zero
michael@0: // Return 0 and advance one byte if input is ill-formed
michael@0: uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
michael@0:                           const uint8** src,
michael@0:                           int* srclen) {
michael@0:   if (*srclen <= 0) {
michael@0:     return 0;
michael@0:   }
michael@0: 
michael@0:   const uint8* lsrc = *src;
michael@0:   const uint8* Tbl_0 = &st->state_table[st->state0];
michael@0:   const uint8* Tbl = Tbl_0;
michael@0:   int e;
michael@0:   int eshift = st->entry_shift;
michael@0: 
michael@0:   // Short series of tests faster than switch, optimizes 7-bit ASCII
michael@0:   unsigned char c = lsrc[0];
michael@0:   if (static_cast<signed char>(c) >= 0) {           // one byte
michael@0:     e = Tbl[c];
michael@0:     *src += 1;
michael@0:     *srclen -= 1;
michael@0:   } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
michael@0:     e = Tbl[c];
michael@0:     Tbl = &Tbl_0[e << eshift];
michael@0:     e = Tbl[lsrc[1]];
michael@0:     *src += 2;
michael@0:     *srclen -= 2;
michael@0:   } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
michael@0:     e = Tbl[c];
michael@0:     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
michael@0:     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
michael@0:     Tbl = &Tbl[e << eshift];          // Relative +/-
michael@0:     e = Tbl[lsrc[2]];
michael@0:     *src += 3;
michael@0:     *srclen -= 3;
michael@0:   }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
michael@0:     e = Tbl[c];
michael@0:     Tbl = &Tbl_0[e << eshift];
michael@0:     e = Tbl[lsrc[1]];
michael@0:     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
michael@0:     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
michael@0:     Tbl = &Tbl[e << eshift];          // Relative +/-
michael@0:     e = Tbl[lsrc[3]];
michael@0:     *src += 4;
michael@0:     *srclen -= 4;
michael@0:   } else {                                                // Ill-formed
michael@0:     e = 0;
michael@0:     *src += 1;
michael@0:     *srclen -= 1;
michael@0:   }
michael@0:   return e;
michael@0: }
michael@0: 
michael@0: // BigOneByte versions are needed for tables > 240 states, but most
michael@0: // won't need the TwoByte versions.
michael@0: bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src) {
michael@0:   const uint8* lsrc = reinterpret_cast<const uint8*>(src);
michael@0:   const uint8* Tbl_0 = &st.state_table[st.state0];
michael@0:   const uint8* Tbl = Tbl_0;
michael@0:   int e;
michael@0:   int eshift = st.entry_shift;
michael@0: 
michael@0:   // Short series of tests faster than switch, optimizes 7-bit ASCII
michael@0:   unsigned char c = lsrc[0];
michael@0:   if (static_cast<signed char>(c) >= 0) {           // one byte
michael@0:     e = Tbl[c];
michael@0:   } else if ((c & 0xe0) == 0xc0) {    // two bytes
michael@0:     e = Tbl[c];
michael@0:     Tbl = &Tbl_0[e << eshift];
michael@0:     e = Tbl[lsrc[1]];
michael@0:   } else if ((c & 0xf0) == 0xe0) {    // three bytes
michael@0:     e = Tbl[c];
michael@0:     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
michael@0:     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
michael@0:     Tbl = &Tbl[e << eshift];          // Relative +/-
michael@0:     e = Tbl[lsrc[2]];
michael@0:   } else {                            // four bytes
michael@0:     e = Tbl[c];
michael@0:     Tbl = &Tbl_0[e << eshift];
michael@0:     e = Tbl[lsrc[1]];
michael@0:     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
michael@0:     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
michael@0:     Tbl = &Tbl[e << eshift];          // Relative +/-
michael@0:     e = Tbl[lsrc[3]];
michael@0:   }
michael@0:   return e;
michael@0: }
michael@0: 
michael@0: 
michael@0: // TwoByte versions are needed for tables > 240 states
michael@0: //
michael@0: // Look up the property of the UTF-8 character at *src in a table whose
michael@0: // entries are 16 bits wide, and step over the character.
michael@0: //   st      property state table to follow
michael@0: //   src     in/out: advanced by the number of bytes stepped over
michael@0: //   srclen  in/out: remaining byte count, reduced by the same amount
michael@0: // Returns 0 and leaves *src / *srclen untouched when *srclen <= 0.
michael@0: // Returns 0 and steps over exactly one byte when the first byte is not a
michael@0: // UTF-8 lead byte, or when the character would extend past *srclen.
michael@0: // The final table entry is narrowed to uint8 for the return value.
michael@0: uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
michael@0:                           const uint8** src,
michael@0:                           int* srclen) {
michael@0:   const int avail = *srclen;
michael@0:   if (avail <= 0) {
michael@0:     return 0;
michael@0:   }
michael@0: 
michael@0:   const uint8* const text = *src;
michael@0:   const unsigned short* const table = &st->state_table[st->state0];
michael@0:   const int shift = st->entry_shift;
michael@0:   const unsigned char lead = text[0];
michael@0: 
michael@0:   // Classify the lead byte. The bit patterns are mutually exclusive, so the
michael@0:   // order of the tests only matters for speed (7-bit ASCII first).
michael@0:   int char_len = 0;                       // 0 means not a lead byte
michael@0:   if (lead < 0x80) {
michael@0:     char_len = 1;
michael@0:   } else if ((lead & 0xe0) == 0xc0) {
michael@0:     char_len = 2;
michael@0:   } else if ((lead & 0xf0) == 0xe0) {
michael@0:     char_len = 3;
michael@0:   } else if ((lead & 0xf8) == 0xf0) {
michael@0:     char_len = 4;
michael@0:   }
michael@0: 
michael@0:   // Ill-formed or truncated: skip a single byte and report property 0.
michael@0:   if ((char_len == 0) || (char_len > avail)) {
michael@0:     *src = text + 1;
michael@0:     *srclen = avail - 1;
michael@0:     return 0;
michael@0:   }
michael@0: 
michael@0:   // The lead byte indexes state0; each following byte indexes the state
michael@0:   // named by the previous entry. The last entry read is the property.
michael@0:   int entry = table[lead];
michael@0:   for (int i = 1; i < char_len; ++i) {
michael@0:     entry = table[(entry << shift) + text[i]];
michael@0:   }
michael@0: 
michael@0:   *src = text + char_len;
michael@0:   *srclen = avail - char_len;
michael@0:   return static_cast<uint8>(entry);
michael@0: }
michael@0: 
michael@0: // TwoByte versions are needed for tables > 240 states
michael@0: //
michael@0: // Tell whether the UTF-8 character starting at src has a non-zero property
michael@0: // in table st, whose entries are 16 bits wide.
michael@0: // No length is passed in and nothing is validated: up to four bytes are
michael@0: // read starting at src, and any first byte that does not match the one-,
michael@0: // two- or three-byte patterns is handled as the start of a four-byte
michael@0: // character.
michael@0: bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src) {
michael@0:   const uint8* const text = reinterpret_cast<const uint8*>(src);
michael@0:   const unsigned short* const table = &st.state_table[st.state0];
michael@0:   const int shift = st.entry_shift;
michael@0:   const unsigned char lead = text[0];
michael@0: 
michael@0:   // Number of bytes to run through the table, 7-bit ASCII tested first.
michael@0:   int char_len = 4;                       // default: everything else
michael@0:   if (lead < 0x80) {
michael@0:     char_len = 1;
michael@0:   } else if ((lead & 0xe0) == 0xc0) {
michael@0:     char_len = 2;
michael@0:   } else if ((lead & 0xf0) == 0xe0) {
michael@0:     char_len = 3;
michael@0:   }
michael@0: 
michael@0:   // The lead byte indexes state0; each following byte indexes the state
michael@0:   // named by the previous entry. The last entry read is the property.
michael@0:   int entry = table[lead];
michael@0:   for (int i = 1; i < char_len; ++i) {
michael@0:     entry = table[(entry << shift) + text[i]];
michael@0:   }
michael@0:   return entry != 0;
michael@0: }
michael@0: 
michael@0: 
michael@0: // Approximate speeds on 2.8 GHz Pentium 4:
michael@0: //   GenericScan 1-byte loop           300 MB/sec *
michael@0: //   GenericScan 4-byte loop          1200 MB/sec
michael@0: //   GenericScan 8-byte loop          2400 MB/sec *
michael@0: //   GenericScanFastAscii 4-byte loop 3000 MB/sec
michael@0: //   GenericScanFastAscii 8-byte loop 3200 MB/sec *
michael@0: //
michael@0: // * Implemented below. FastAscii loop is memory-bandwidth constrained.
michael@0: 
michael@0: // Scan a UTF-8 stringpiece based on state table.
michael@0: // Always scan complete UTF-8 characters
michael@0: // Set number of bytes scanned. Return reason for exiting
michael@0: //   st              scanning state table, including its fast_state byte
michael@0: //                   table and the losub/hiadd range-check constants
michael@0: //   str             text to scan
michael@0: //   bytes_consumed  out: number of bytes before the exit point
michael@0: // Returns kExitOK when all of str was accepted (also for an empty str),
michael@0: // kExitIllegalStructure when str ends in the middle of a character, and
michael@0: // otherwise the exit code found in the table. kExitDoAgain is handled here
michael@0: // by restarting the fast scan and is never returned.
michael@0: int UTF8GenericScan(const UTF8ScanObj* st,
michael@0:                     const StringPiece& str,
michael@0:                     int* bytes_consumed) {
michael@0:   int eshift = st->entry_shift;       // 6 (space optimized) or 8
michael@0: 
michael@0:   const uint8* isrc =
michael@0:     reinterpret_cast<const uint8*>(str.data());
michael@0:   const uint8* src = isrc;
michael@0:   const int len = str.length();
michael@0:   const uint8* srclimit = isrc + len;
michael@0:   *bytes_consumed = 0;
michael@0:   if (len == 0) return kExitOK;
michael@0: 
michael@0:   const uint8* Tbl_0 = &st->state_table[st->state0];
michael@0: 
michael@0: DoAgain:
michael@0:   // Do state-table scan
michael@0:   int e = 0;
michael@0:   uint8 c;
michael@0: 
michael@0:   // Do fast for groups of 8 identity bytes.
michael@0:   // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
michael@0:   // including slowing slightly on cr/lf/ht
michael@0:   //----------------------------
michael@0:   const uint8* Tbl2 = &st->fast_state[0];
michael@0:   uint32 losub = st->losub;
michael@0:   uint32 hiadd = st->hiadd;
michael@0:   // Test the remaining length instead of comparing against srclimit - 7,
michael@0:   // which would form a pointer before the buffer for short inputs.
michael@0:   while ((srclimit - src) >= 8) {
michael@0:     // src has no particular alignment, so load the two words with memcpy
michael@0:     // rather than through a cast uint32 pointer.
michael@0:     uint32 s0123;
michael@0:     uint32 s4567;
michael@0:     memcpy(&s0123, src, sizeof(s0123));
michael@0:     memcpy(&s4567, src + sizeof(s0123), sizeof(s4567));
michael@0:     src += 8;
michael@0:     // This is a fast range check for all bytes in [losub..0x80-hiadd)
michael@0:     uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
michael@0:                   (s4567 - losub) | (s4567 + hiadd);
michael@0:     if ((temp & 0x80808080) != 0) {
michael@0:       // We typically end up here on cr/lf/ht; src was incremented
michael@0:       int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
michael@0:                   (Tbl2[src[-6]] | Tbl2[src[-5]]);
michael@0:       if (e0123 != 0) {src -= 8; break;}    // Exit on Non-interchange
michael@0:       e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
michael@0:               (Tbl2[src[-2]] | Tbl2[src[-1]]);
michael@0:       if (e0123 != 0) {src -= 4; break;}    // Exit on Non-interchange
michael@0:       // Else OK, go around again
michael@0:     }
michael@0:   }
michael@0:   //----------------------------
michael@0: 
michael@0:   // Byte-at-a-time scan
michael@0:   //----------------------------
michael@0:   const uint8* Tbl = Tbl_0;
michael@0:   while (src < srclimit) {
michael@0:     c = *src;
michael@0:     e = Tbl[c];
michael@0:     src++;
michael@0:     if (e >= kExitIllegalStructure) {break;}
michael@0:     Tbl = &Tbl_0[e << eshift];
michael@0:   }
michael@0:   //----------------------------
michael@0: 
michael@0: 
michael@0:   // Exit possibilities:
michael@0:   //  Some exit code, !state0, back up over last char
michael@0:   //  Some exit code, state0, back up one byte exactly
michael@0:   //  source consumed, !state0, back up over partial char
michael@0:   //  source consumed, state0, exit OK
michael@0:   // For illegal byte in state0, avoid backing up over PREVIOUS char
michael@0:   // For truncated last char, back up to beginning of it
michael@0: 
michael@0:   if (e >= kExitIllegalStructure) {
michael@0:     // Back up over exactly one byte of rejected/illegal UTF-8 character
michael@0:     src--;
michael@0:     // Back up more if needed
michael@0:     if (!InStateZero(st, Tbl)) {
michael@0:       do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
michael@0:     }
michael@0:   } else if (!InStateZero(st, Tbl)) {
michael@0:     // Back up over truncated UTF-8 character
michael@0:     e = kExitIllegalStructure;
michael@0:     do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
michael@0:   } else {
michael@0:     // Normal termination, source fully consumed
michael@0:     e = kExitOK;
michael@0:   }
michael@0: 
michael@0:   if (e == kExitDoAgain) {
michael@0:     // Loop back up to the fast scan
michael@0:     goto DoAgain;
michael@0:   }
michael@0: 
michael@0:   *bytes_consumed = src - isrc;
michael@0:   return e;
michael@0: }
michael@0: 
michael@0: // Scan a UTF-8 stringpiece based on state table.
michael@0: // Always scan complete UTF-8 characters
michael@0: // Set number of bytes scanned. Return reason for exiting
michael@0: // OPTIMIZED for case of 7-bit ASCII 0000..007f all valid
michael@0: //   st              scanning state table, passed on to UTF8GenericScan
michael@0: //   str             text to scan
michael@0: //   bytes_consumed  out: number of bytes before the exit point
michael@0: // Groups of 8 bytes that all have their high bit clear are skipped without
michael@0: // consulting the table; the remainder is handed to UTF8GenericScan, whose
michael@0: // exit reason is returned.
michael@0: int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
michael@0:                     const StringPiece& str,
michael@0:                     int* bytes_consumed) {
michael@0:   const uint8* isrc =
michael@0:     reinterpret_cast<const uint8*>(str.data());
michael@0:   const uint8* src = isrc;
michael@0:   const int len = str.length();
michael@0:   const uint8* srclimit = isrc + len;
michael@0:   *bytes_consumed = 0;
michael@0:   if (len == 0) return kExitOK;
michael@0: 
michael@0:   int n;
michael@0:   int rest_consumed;
michael@0:   int exit_reason;
michael@0:   do {
michael@0:     // Skip 8 bytes of ASCII at a whack; no endianness issue.
michael@0:     // Test the remaining length instead of comparing against srclimit - 7,
michael@0:     // which would form a pointer before the buffer for short inputs.
michael@0:     while ((srclimit - src) >= 8) {
michael@0:       // src has no particular alignment, so load the two words with memcpy
michael@0:       // rather than through a cast uint32 pointer.
michael@0:       uint32 s0123;
michael@0:       uint32 s4567;
michael@0:       memcpy(&s0123, src, sizeof(s0123));
michael@0:       memcpy(&s4567, src + sizeof(s0123), sizeof(s4567));
michael@0:       if (((s0123 | s4567) & 0x80808080) != 0) {break;}
michael@0:       src += 8;
michael@0:     }
michael@0:     // Run state table on the rest
michael@0:     n = src - isrc;
michael@0:     StringPiece str2(str.data() + n, str.length() - n);
michael@0:     exit_reason = UTF8GenericScan(st, str2, &rest_consumed);
michael@0:     src += rest_consumed;
michael@0:   } while ( exit_reason == kExitDoAgain );
michael@0: 
michael@0:   *bytes_consumed = src - isrc;
michael@0:   return exit_reason;
michael@0: }
michael@0: 
michael@0: // Hack to change halfwidth katakana to match an old UTF8CharToLower()
michael@0: //
michael@0: // Returns the number of src bytes skipped. This version performs no fixup:
michael@0: // it ignores every argument, leaves *srcp and *dstp untouched, and always
michael@0: // reports zero bytes skipped.
michael@0: static int DoSpecialFixup(const unsigned char c,
michael@0:                     const unsigned char** srcp, const unsigned char* srclimit,
michael@0:                     unsigned char** dstp, unsigned char* dstlimit) {
michael@0:   // Mark the parameters as intentionally unused.
michael@0:   static_cast<void>(c);
michael@0:   static_cast<void>(srcp);
michael@0:   static_cast<void>(srclimit);
michael@0:   static_cast<void>(dstp);
michael@0:   static_cast<void>(dstlimit);
michael@0: 
michael@0:   const int src_bytes_skipped = 0;
michael@0:   return src_bytes_skipped;
michael@0: }
michael@0: 
michael@0: 
michael@0: // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
michael@0: //   and doing text replacements.
michael@0: // DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below
michael@0: //   Needs caller to loop on kExitDoAgain
michael@0: static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st,
michael@0:                     const StringPiece& istr,
michael@0:                     StringPiece& ostr,
michael@0:                     bool is_plain_text,
michael@0:                     int* bytes_consumed,
michael@0:                     int* bytes_filled,
michael@0:                     int* chars_changed,
michael@0:                     OffsetMap* offsetmap) {
michael@0:   int eshift = st->entry_shift;
michael@0:   int nEntries = (1 << eshift);       // 64 or 256 entries per state
michael@0:   const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());
michael@0:   const int ilen = istr.length();
michael@0:   const uint8* copystart = isrc;
michael@0:   const uint8* src = isrc;
michael@0:   const uint8* srclimit = src + ilen;
michael@0:   *bytes_consumed = 0;
michael@0:   *bytes_filled = 0;
michael@0:   *chars_changed = 0;
michael@0: 
michael@0:   const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());
michael@0:   const int olen = ostr.length();
michael@0:   uint8* dst = const_cast<uint8*>(odst);
michael@0:   uint8* dstlimit = dst + olen;
michael@0: 
michael@0:   int total_changed = 0;
michael@0: 
michael@0:   // Invariant condition during replacements:
michael@0:   //  remaining dst size >= remaining src size
michael@0:   if ((dstlimit - dst) < (srclimit - src)) {
michael@0:     if (offsetmap != NULL) {
michael@0:       offsetmap->Copy(src - copystart);
michael@0:       copystart = src;
michael@0:     }
michael@0:     return kExitDstSpaceFull;
michael@0:   }
michael@0:   const uint8* Tbl_0 = &st->state_table[st->state0];
michael@0: 
michael@0:  Do_state_table:
michael@0:   // Do state-table scan, copying as we go
michael@0:   const uint8* Tbl = Tbl_0;
michael@0:   int e = 0;
michael@0:   uint8 c = 0;
michael@0: 
michael@0:  Do_state_table_newe:
michael@0: 
michael@0:   //----------------------------
michael@0:   while (src < srclimit) {
michael@0:     c = *src;
michael@0:     e = Tbl[c];
michael@0:     *dst = c;
michael@0:     src++;
michael@0:     dst++;
michael@0:     if (e >= kExitIllegalStructure) {break;}
michael@0:     Tbl = &Tbl_0[e << eshift];
michael@0:   }
michael@0:   //----------------------------
michael@0: 
michael@0:   // Exit possibilities:
michael@0:   //  Replacement code, do the replacement and loop
michael@0:   //  Some other exit code, state0, back up one byte exactly
michael@0:   //  Some other exit code, !state0, back up over last char
michael@0:   //  source consumed, state0, exit OK
michael@0:   //  source consumed, !state0, back up over partial char
michael@0:   // For illegal byte in state0, avoid backing up over PREVIOUS char
michael@0:   // For truncated last char, back up to beginning of it
michael@0: 
michael@0:   if (e >= kExitIllegalStructure) {
michael@0:     // Switch on exit code; most loop back to top
michael@0:     int offset = 0;
michael@0:     switch (e) {
michael@0:     // These all make the output string the same size or shorter
michael@0:     // No checking needed
michael@0:     case kExitReplace31:    // del 2, add 1 bytes to change
michael@0:       dst -= 2;
michael@0:       if (offsetmap != NULL) {
michael@0:         offsetmap->Copy(src - copystart - 2);
michael@0:         offsetmap->Delete(2);
michael@0:         copystart = src;
michael@0:       }
michael@0:       dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
michael@0:       total_changed++;
michael@0:       goto Do_state_table;
michael@0:     case kExitReplace32:    // del 3, add 2 bytes to change
michael@0:       dst--;
michael@0:       if (offsetmap != NULL) {
michael@0:         offsetmap->Copy(src - copystart - 1);
michael@0:         offsetmap->Delete(1);
michael@0:         copystart = src;
michael@0:       }
michael@0:       dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)];
michael@0:       dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
michael@0:       total_changed++;
michael@0:       goto Do_state_table;
michael@0:     case kExitReplace21:    // del 2, add 1 bytes to change
michael@0:       dst--;
michael@0:       if (offsetmap != NULL) {
michael@0:         offsetmap->Copy(src - copystart - 1);
michael@0:         offsetmap->Delete(1);
michael@0:         copystart = src;
michael@0:       }
michael@0:       dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
michael@0:       total_changed++;
michael@0:       goto Do_state_table;
michael@0:     case kExitReplace3:    // update 3 bytes to change
michael@0:       dst[-3] = (unsigned char)Tbl[c + (nEntries * 3)];
michael@0:       // Fall into next case
michael@0:     case kExitReplace2:    // update 2 bytes to change
michael@0:       dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)];
michael@0:       // Fall into next case
michael@0:     case kExitReplace1:    // update 1 byte to change
michael@0:       dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
michael@0:       total_changed++;
michael@0:       goto Do_state_table;
michael@0:     case kExitReplace1S0:     // update 1 byte to change, 256-entry state
michael@0:       dst[-1] = (unsigned char)Tbl[c + (256 * 1)];
michael@0:       total_changed++;
michael@0:       goto Do_state_table;
michael@0:     // These can make the output string longer than the input
michael@0:     case kExitReplaceOffset2:
michael@0:       if ((nEntries != 256) && InStateZero(st, Tbl)) {
michael@0:         // For space-optimized table, we need multiples of 256 bytes
michael@0:         // in state0 and multiples of nEntries in other states
michael@0:         offset += ((unsigned char)Tbl[c + (256 * 2)] << 8);
michael@0:       } else {
michael@0:         offset += ((unsigned char)Tbl[c + (nEntries * 2)] << 8);
michael@0:       }
michael@0:       // Fall into next case
michael@0:     case kExitSpecial:      // Apply special fixups [read: hacks]
michael@0:     case kExitReplaceOffset1:
michael@0:       if ((nEntries != 256) && InStateZero(st, Tbl)) {
michael@0:         // For space-optimized table, we need multiples of 256 bytes
michael@0:         // in state0 and multiples of nEntries in other states
michael@0:         offset += (unsigned char)Tbl[c + (256 * 1)];
michael@0:       } else {
michael@0:         offset += (unsigned char)Tbl[c + (nEntries * 1)];
michael@0:       }
michael@0:       {
michael@0:         const RemapEntry* re = &st->remap_base[offset];
michael@0:         int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;
michael@0:         int add_len = re->add_bytes & ~kHtmlPlaintextFlag;
michael@0: 
michael@0:         // Special-case non-HTML replacement of five sensitive entities
michael@0:         //   &quot; &amp; &apos; &lt; &gt;
michael@0:         //   0022   0026  0027   003c 003e
michael@0:         // A replacement creating one of these is expressed as a pair of
michael@0:         // entries, one for HTML output and one for plaintext output.
michael@0:         // The first of the pair has the high bit of add_bytes set.
michael@0:         if (re->add_bytes & kHtmlPlaintextFlag) {
michael@0:           // Use this entry for plain text
michael@0:           if (!is_plain_text) {
michael@0:             // Use very next entry for HTML text (same back/delete length)
michael@0:             re = &st->remap_base[offset + 1];
michael@0:             add_len = re->add_bytes & ~kHtmlPlaintextFlag;
michael@0:           }
michael@0:         }
michael@0: 
michael@0:         int string_offset = re->bytes_offset;
michael@0:         // After the replacement, need (dstlimit - newdst) >= (srclimit - src)
michael@0:         uint8* newdst = dst - del_len + add_len;
michael@0:         if ((dstlimit - newdst) < (srclimit - src)) {
michael@0:           // Won't fit; don't do the replacement. Caller may realloc and retry
michael@0:           e = kExitDstSpaceFull;
michael@0:           break;    // exit, backing up over this char for later retry
michael@0:         }
michael@0:         dst -= del_len;
michael@0:         memcpy(dst, &st->remap_string[string_offset], add_len);
michael@0:         dst += add_len;
michael@0:         total_changed++;
michael@0:         if (offsetmap != NULL) {
michael@0:           if (add_len > del_len) {
michael@0:             offsetmap->Copy(src - copystart);
michael@0:             offsetmap->Insert(add_len - del_len);
michael@0:             copystart = src;
michael@0:           } else if (add_len < del_len) {
michael@0:             offsetmap->Copy(src - copystart + add_len - del_len);
michael@0:             offsetmap->Delete(del_len - add_len);
michael@0:             copystart = src;
michael@0:           }
michael@0:         }
michael@0:         if (re->delete_bytes & kReplaceAndResumeFlag) {
michael@0:           // There is a non-zero  target state at the end of the
michael@0:           // replacement string
michael@0:           e = st->remap_string[string_offset + add_len];
michael@0:           Tbl = &Tbl_0[e << eshift];
michael@0:           goto Do_state_table_newe;
michael@0:         }
michael@0:       }
michael@0:       if (e == kExitRejectAlt) {break;}
michael@0:       if (e != kExitSpecial) {goto Do_state_table;}
michael@0: 
michael@0:     // case kExitSpecial:      // Apply special fixups [read: hacks]
michael@0:       // In this routine, do either UTF8CharToLower()
michael@0:       //   fullwidth/halfwidth mapping or
michael@0:       //   voiced mapping or
michael@0:       //   semi-voiced mapping
michael@0: 
michael@0:       // First, do EXIT_REPLACE_OFFSET1 action (above)
michael@0:       // Second: do additional code fixup
michael@0:       {
michael@0:         int srcdel = DoSpecialFixup(c, &src, srclimit, &dst, dstlimit);
michael@0:         if (offsetmap != NULL) {
michael@0:           if (srcdel != 0) {
michael@0:             offsetmap->Copy(src - copystart - srcdel);
michael@0:             offsetmap->Delete(srcdel);
michael@0:             copystart = src;
michael@0:           }
michael@0:         }
michael@0:       }
michael@0:       goto Do_state_table;
michael@0: 
michael@0:     case kExitIllegalStructure:   // structurally illegal byte; quit
michael@0:     case kExitReject:             // NUL or illegal code encountered; quit
michael@0:     case kExitRejectAlt:          // Apply replacement, then exit
michael@0:     default:                      // and all other exits
michael@0:       break;
michael@0:     }   // End switch (e)
michael@0: 
michael@0:     // Exit possibilities:
michael@0:     //  Some other exit code, state0, back up one byte exactly
michael@0:     //  Some other exit code, !state0, back up over last char
michael@0: 
michael@0:     // Back up over exactly one byte of rejected/illegal UTF-8 character
michael@0:     src--;
michael@0:     dst--;
michael@0:     // Back up more if needed
michael@0:     if (!InStateZero(st, Tbl)) {
michael@0:       do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
michael@0:     }
michael@0:   } else if (!InStateZero(st, Tbl)) {
michael@0:     // src >= srclimit, !state0
michael@0:     // Back up over truncated UTF-8 character
michael@0:     e = kExitIllegalStructure;
michael@0:     do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
michael@0:   } else {
michael@0:     // src >= srclimit, state0
michael@0:     // Normal termination, source fully consumed
michael@0:     e = kExitOK;
michael@0:   }
michael@0: 
michael@0:   if (offsetmap != NULL) {
michael@0:     if (src > copystart) {
michael@0:       offsetmap->Copy(src - copystart);
michael@0:       copystart = src;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   // Possible return values here:
michael@0:   //  kExitDstSpaceFull         caller may realloc and retry from middle
michael@0:   //  kExitIllegalStructure     caller may overwrite/truncate
michael@0:   //  kExitOK                   all done and happy
michael@0:   //  kExitReject               caller may overwrite/truncate
michael@0:   //  kExitDoAgain              LOOP NOT DONE; caller must retry from middle
michael@0:   //                            (may do fast ASCII loop first)
michael@0:   //  kExitPlaceholder          -unused-
michael@0:   //  kExitNone                 -unused-
michael@0:   *bytes_consumed = src - isrc;
michael@0:   *bytes_filled = dst - odst;
michael@0:   *chars_changed = total_changed;
michael@0:   return e;
michael@0: }
michael@0: 
michael@0: // TwoByte versions are needed for tables > 240 states, such
michael@0: // as the table for full Unicode 4.1 canonical + compatibility mapping
michael@0: 
michael@0: // Scan a UTF-8 stringpiece based on state table with two-byte entries,
michael@0: //   copying to output stringpiece
michael@0: //   and doing text replacements.
michael@0: // DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below
michael@0: //   Needs caller to loop on kExitDoAgain
michael@0: static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st,
michael@0:                     const StringPiece& istr,
michael@0:                     StringPiece& ostr,
michael@0:                     bool is_plain_text,
michael@0:                     int* bytes_consumed,
michael@0:                     int* bytes_filled,
michael@0:                     int* chars_changed,
michael@0:                     OffsetMap* offsetmap) {
michael@0:   int eshift = st->entry_shift;
michael@0:   int nEntries = (1 << eshift);       // 64 or 256 entries per state
michael@0:   const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());
michael@0:   const int ilen = istr.length();
michael@0:   const uint8* copystart = isrc;
michael@0:   const uint8* src = isrc;
michael@0:   const uint8* srclimit = src + ilen;
michael@0:   *bytes_consumed = 0;
michael@0:   *bytes_filled = 0;
michael@0:   *chars_changed = 0;
michael@0: 
michael@0:   const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());
michael@0:   const int olen = ostr.length();
michael@0:   uint8* dst = const_cast<uint8*>(odst);
michael@0:   uint8* dstlimit = dst + olen;
michael@0: 
michael@0:   *chars_changed = 0;
michael@0: 
michael@0:   int total_changed = 0;
michael@0: 
michael@0:   int src_lll = srclimit - src;
michael@0:   int dst_lll = dstlimit - dst;
michael@0: 
michael@0: 
michael@0:   // Invariant condition during replacements:
michael@0:   //  remaining dst size >= remaining src size
michael@0:   if ((dstlimit - dst) < (srclimit - src)) {
michael@0:     if (offsetmap != NULL) {
michael@0:       offsetmap->Copy(src - copystart);
michael@0:       copystart = src;
michael@0:     }
michael@0:     return kExitDstSpaceFull_2;
michael@0:   }
michael@0:   const unsigned short* Tbl_0 = &st->state_table[st->state0];
michael@0: 
michael@0:  Do_state_table_2:
michael@0:   // Do state-table scan, copying as we go
michael@0:   const unsigned short* Tbl = Tbl_0;
michael@0:   int e = 0;
michael@0:   uint8 c = 0;
michael@0: 
michael@0:  Do_state_table_newe_2:
michael@0: 
michael@0:   //----------------------------
michael@0:   while (src < srclimit) {
michael@0:     c = *src;
michael@0:     e = Tbl[c];
michael@0:     *dst = c;
michael@0:     src++;
michael@0:     dst++;
michael@0:     if (e >= kExitIllegalStructure_2) {break;}
michael@0:     Tbl = &Tbl_0[e << eshift];
michael@0:   }
michael@0:   //----------------------------
michael@0:   src_lll = src - isrc;
michael@0:   dst_lll = dst - odst;
michael@0: 
michael@0:   // Exit possibilities:
michael@0:   //  Replacement code, do the replacement and loop
michael@0:   //  Some other exit code, state0, back up one byte exactly
michael@0:   //  Some other exit code, !state0, back up over last char
michael@0:   //  source consumed, state0, exit OK
michael@0:   //  source consumed, !state0, back up over partial char
michael@0:   // For illegal byte in state0, avoid backup up over PREVIOUS char
michael@0:   // For truncated last char, back up to beginning of it
michael@0: 
michael@0:   if (e >= kExitIllegalStructure_2) {
michael@0:     // Switch on exit code; most loop back to top
michael@0:     int offset = 0;
michael@0:     switch (e) {
michael@0:     // These all make the output string the same size or shorter
michael@0:     // No checking needed
michael@0:     case kExitReplace31_2:    // del 2, add 1 bytes to change
michael@0:       dst -= 2;
michael@0:       if (offsetmap != NULL) {
michael@0:         offsetmap->Copy(src - copystart - 2);
michael@0:         offsetmap->Delete(2);
michael@0:         copystart = src;
michael@0:       }
michael@0:       dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
michael@0:       total_changed++;
michael@0:       goto Do_state_table_2;
michael@0:     case kExitReplace32_2:    // del 3, add 2 bytes to change
michael@0:       dst--;
michael@0:       if (offsetmap != NULL) {
michael@0:         offsetmap->Copy(src - copystart - 1);
michael@0:         offsetmap->Delete(1);
michael@0:         copystart = src;
michael@0:       }
michael@0:       dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);
michael@0:       dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
michael@0:       total_changed++;
michael@0:       goto Do_state_table_2;
michael@0:     case kExitReplace21_2:    // del 2, add 1 bytes to change
michael@0:       dst--;
michael@0:       if (offsetmap != NULL) {
michael@0:         offsetmap->Copy(src - copystart - 1);
michael@0:         offsetmap->Delete(1);
michael@0:         copystart = src;
michael@0:       }
michael@0:       dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
michael@0:       total_changed++;
michael@0:       goto Do_state_table_2;
michael@0:     case kExitReplace3_2:    // update 3 bytes to change
michael@0:       dst[-3] = (unsigned char)(Tbl[c + (nEntries * 2)] & 0xff);
michael@0:       // Fall into next case
michael@0:     case kExitReplace2_2:    // update 2 bytes to change
michael@0:       dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);
michael@0:       // Fall into next case
michael@0:     case kExitReplace1_2:    // update 1 byte to change
michael@0:       dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
michael@0:       total_changed++;
michael@0:       goto Do_state_table_2;
michael@0:     case kExitReplace1S0_2:     // update 1 byte to change, 256-entry state
michael@0:       dst[-1] = (unsigned char)(Tbl[c + (256 * 1)] & 0xff);
michael@0:       total_changed++;
michael@0:       goto Do_state_table_2;
michael@0:     // These can make the output string longer than the input
michael@0:     case kExitReplaceOffset2_2:
michael@0:       if ((nEntries != 256) && InStateZero_2(st, Tbl)) {
michael@0:         // For space-optimized table, we need multiples of 256 bytes
michael@0:         // in state0 and multiples of nEntries in other states
michael@0:         offset += ((unsigned char)(Tbl[c + (256 * 1)] >> 8 & 0xff) << 8);
michael@0:       } else {
michael@0:         offset += ((unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff) << 8);
michael@0:       }
michael@0:       // Fall into next case
michael@0:     case kExitReplaceOffset1_2:
michael@0:       if ((nEntries != 256) && InStateZero_2(st, Tbl)) {
michael@0:         // For space-optimized table, we need multiples of 256 bytes
michael@0:         // in state0 and multiples of nEntries in other states
michael@0:         offset += (unsigned char)(Tbl[c + (256 * 1)] & 0xff);
michael@0:       } else {
michael@0:         offset += (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
michael@0:       }
michael@0:       {
michael@0:         const RemapEntry* re = &st->remap_base[offset];
michael@0:         int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;
michael@0:         int add_len = re->add_bytes & ~kHtmlPlaintextFlag;
michael@0:         // Special-case non-HTML replacement of five sensitive entities
michael@0:         //   &quot; &amp; &apos; &lt; &gt;
michael@0:         //   0022   0026  0027   003c 003e
michael@0:         // A replacement creating one of these is expressed as a pair of
michael@0:         // entries, one for HTML output and one for plaintext output.
michael@0:         // The first of the pair has the high bit of add_bytes set.
michael@0:         if (re->add_bytes & kHtmlPlaintextFlag) {
michael@0:           // Use this entry for plain text
michael@0:           if (!is_plain_text) {
michael@0:             // Use very next entry for HTML text (same back/delete length)
michael@0:             re = &st->remap_base[offset + 1];
michael@0:             add_len = re->add_bytes & ~kHtmlPlaintextFlag;
michael@0:           }
michael@0:         }
michael@0: 
michael@0:         // After the replacement, need (dstlimit - dst) >= (srclimit - src)
michael@0:         int string_offset = re->bytes_offset;
michael@0:         // After the replacement, need (dstlimit - newdst) >= (srclimit - src)
michael@0:         uint8* newdst = dst - del_len + add_len;
michael@0:         if ((dstlimit - newdst) < (srclimit - src)) {
michael@0:           // Won't fit; don't do the replacement. Caller may realloc and retry
michael@0:           e = kExitDstSpaceFull_2;
michael@0:           break;    // exit, backing up over this char for later retry
michael@0:         }
michael@0:         dst -= del_len;
michael@0:         memcpy(dst, &st->remap_string[string_offset], add_len);
michael@0:         dst += add_len;
michael@0:         if (offsetmap != NULL) {
michael@0:           if (add_len > del_len) {
michael@0:             offsetmap->Copy(src - copystart);
michael@0:             offsetmap->Insert(add_len - del_len);
michael@0:             copystart = src;
michael@0:           } else if (add_len < del_len) {
michael@0:             offsetmap->Copy(src - copystart + add_len - del_len);
michael@0:             offsetmap->Delete(del_len - add_len);
michael@0:             copystart = src;
michael@0:           }
michael@0:         }
michael@0:         if (re->delete_bytes & kReplaceAndResumeFlag) {
michael@0:           // There is a two-byte non-zero target state at the end of the
michael@0:           // replacement string
michael@0:           uint8 c1 = st->remap_string[string_offset + add_len];
michael@0:           uint8 c2 = st->remap_string[string_offset + add_len + 1];
michael@0:           e = (c1 << 8) | c2;
michael@0:           Tbl = &Tbl_0[e << eshift];
michael@0:           total_changed++;
michael@0:           goto Do_state_table_newe_2;
michael@0:         }
michael@0:       }
michael@0:       total_changed++;
michael@0:       if (e == kExitRejectAlt_2) {break;}
michael@0:       goto Do_state_table_2;
michael@0: 
michael@0:     case kExitSpecial_2:           // NO special fixups [read: hacks]
michael@0:     case kExitIllegalStructure_2:  // structurally illegal byte; quit
michael@0:     case kExitReject_2:            // NUL or illegal code encountered; quit
michael@0:                                    // and all other exits
michael@0:     default:
michael@0:       break;
michael@0:     }   // End switch (e)
michael@0: 
michael@0:     // Exit possibilities:
michael@0:     //  Some other exit code, state0, back up one byte exactly
michael@0:     //  Some other exit code, !state0, back up over last char
michael@0: 
michael@0:     // Back up over exactly one byte of rejected/illegal UTF-8 character
michael@0:     src--;
michael@0:     dst--;
michael@0:     // Back up more if needed
michael@0:     if (!InStateZero_2(st, Tbl)) {
michael@0:       do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
michael@0:     }
michael@0:   } else if (!InStateZero_2(st, Tbl)) {
michael@0:     // src >= srclimit, !state0
michael@0:     // Back up over truncated UTF-8 character
michael@0:     e = kExitIllegalStructure_2;
michael@0: 
michael@0:     do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
michael@0:   } else {
michael@0:     // src >= srclimit, state0
michael@0:     // Normal termination, source fully consumed
michael@0:     e = kExitOK_2;
michael@0:   }
michael@0: 
michael@0:   if (offsetmap != NULL) {
michael@0:     if (src > copystart) {
michael@0:       offsetmap->Copy(src - copystart);
michael@0:       copystart = src;
michael@0:     }
michael@0:   }
michael@0: 
michael@0: 
michael@0:   // Possible return values here:
michael@0:   //  kExitDstSpaceFull_2         caller may realloc and retry from middle
michael@0:   //  kExitIllegalStructure_2     caller my overwrite/truncate
michael@0:   //  kExitOK_2                   all done and happy
michael@0:   //  kExitReject_2               caller may overwrite/truncate
michael@0:   //  kExitDoAgain_2              LOOP NOT DONE; caller must retry from middle
michael@0:   //                            (may do fast ASCII loop first)
michael@0:   //  kExitPlaceholder_2          -unused-
michael@0:   //  kExitNone_2                 -unused-
michael@0:   *bytes_consumed = src - isrc;
michael@0:   *bytes_filled = dst - odst;
michael@0:   *chars_changed = total_changed;
michael@0:   return e;
michael@0: }
michael@0: 
michael@0: 
michael@0: // Copy a UTF-8 stringpiece to an output stringpiece under control of a
michael@0: //   state table, applying the table's text replacements along the way.
michael@0: // An OffsetMap is optionally written; pass NULL to skip it.
michael@0: // Re-invokes the internal scanner on the unprocessed remainder for as long
michael@0: //   as it reports kExitDoAgain, accumulating the per-pass counts.
michael@0: // On return *bytes_consumed, *bytes_filled and *chars_changed hold the
michael@0: //   totals over all passes.
michael@0: // Returns the exit code of the final pass.
michael@0: int UTF8GenericReplace(const UTF8ReplaceObj* st,
michael@0:                     const StringPiece& istr,
michael@0:                     StringPiece& ostr,
michael@0:                     bool is_plain_text,
michael@0:                     int* bytes_consumed,
michael@0:                     int* bytes_filled,
michael@0:                     int* chars_changed,
michael@0:                     OffsetMap* offsetmap) {
michael@0:   // Working views that shrink from the front after every pass
michael@0:   StringPiece remaining_in(istr.data(), istr.length());
michael@0:   StringPiece remaining_out(ostr.data(), ostr.length());
michael@0:   int consumed_sum = 0;
michael@0:   int filled_sum = 0;
michael@0:   int changed_sum = 0;
michael@0:   int exit_reason;
michael@0:   for (;;) {
michael@0:     int pass_consumed, pass_filled, pass_changed;
michael@0:     exit_reason = UTF8GenericReplaceInternal(st,
michael@0:                     remaining_in, remaining_out, is_plain_text,
michael@0:                     &pass_consumed, &pass_filled, &pass_changed,
michael@0:                     offsetmap);
michael@0:     consumed_sum += pass_consumed;
michael@0:     filled_sum += pass_filled;
michael@0:     changed_sum += pass_changed;
michael@0:     remaining_in.remove_prefix(pass_consumed);
michael@0:     remaining_out.remove_prefix(pass_filled);
michael@0:     if (exit_reason != kExitDoAgain) {
michael@0:       break;
michael@0:     }
michael@0:   }
michael@0:   *bytes_consumed = consumed_sum;
michael@0:   *bytes_filled = filled_sum;
michael@0:   *chars_changed = changed_sum;
michael@0:   return exit_reason;
michael@0: }
michael@0: 
michael@0: // Older entry point that takes no OffsetMap.
michael@0: // Forwards every argument to the full version and passes NULL for the map.
michael@0: int UTF8GenericReplace(const UTF8ReplaceObj* st,
michael@0:                     const StringPiece& istr,
michael@0:                     StringPiece& ostr,
michael@0:                     bool is_plain_text,
michael@0:                     int* bytes_consumed,
michael@0:                     int* bytes_filled,
michael@0:                     int* chars_changed) {
michael@0:   OffsetMap* no_offsetmap = NULL;
michael@0:   return UTF8GenericReplace(st, istr, ostr, is_plain_text,
michael@0:                     bytes_consumed, bytes_filled, chars_changed,
michael@0:                     no_offsetmap);
michael@0: }
michael@0: 
michael@0: // Oldest entry point: takes neither is_plain_text nor an OffsetMap.
michael@0: // Forwards to the full version with is_plain_text = false and a NULL map.
michael@0: int UTF8GenericReplace(const UTF8ReplaceObj* st,
michael@0:                     const StringPiece& istr,
michael@0:                     StringPiece& ostr,
michael@0:                     int* bytes_consumed,
michael@0:                     int* bytes_filled,
michael@0:                     int* chars_changed) {
michael@0:   const bool plain_text_off = false;
michael@0:   OffsetMap* no_offsetmap = NULL;
michael@0:   return UTF8GenericReplace(st, istr, ostr, plain_text_off,
michael@0:                     bytes_consumed, bytes_filled, chars_changed,
michael@0:                     no_offsetmap);
michael@0: }
michael@0: 
michael@0: // Copy a UTF-8 stringpiece to an output stringpiece under control of a
michael@0: //   state table with two-byte entries, applying the table's text
michael@0: //   replacements along the way.
michael@0: // An OffsetMap is optionally written; pass NULL to skip it.
michael@0: // Re-invokes the internal two-byte scanner on the unprocessed remainder for
michael@0: //   as long as it reports kExitDoAgain_2, accumulating the per-pass counts.
michael@0: // On return *bytes_consumed, *bytes_filled and *chars_changed hold the
michael@0: //   totals over all passes.
michael@0: // Returns the final exit code rebased from the kExitOK_2 numbering onto
michael@0: //   the kExitOK numbering.
michael@0: int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
michael@0:                     const StringPiece& istr,
michael@0:                     StringPiece& ostr,
michael@0:                     bool is_plain_text,
michael@0:                     int* bytes_consumed,
michael@0:                     int* bytes_filled,
michael@0:                     int* chars_changed,
michael@0:                     OffsetMap* offsetmap) {
michael@0:   // Working views that shrink from the front after every pass
michael@0:   StringPiece remaining_in(istr.data(), istr.length());
michael@0:   StringPiece remaining_out(ostr.data(), ostr.length());
michael@0:   int consumed_sum = 0;
michael@0:   int filled_sum = 0;
michael@0:   int changed_sum = 0;
michael@0:   int exit_reason;
michael@0:   for (;;) {
michael@0:     int pass_consumed, pass_filled, pass_changed;
michael@0:     exit_reason = UTF8GenericReplaceInternalTwoByte(st,
michael@0:                     remaining_in, remaining_out, is_plain_text,
michael@0:                     &pass_consumed, &pass_filled, &pass_changed,
michael@0:                     offsetmap);
michael@0:     consumed_sum += pass_consumed;
michael@0:     filled_sum += pass_filled;
michael@0:     changed_sum += pass_changed;
michael@0:     remaining_in.remove_prefix(pass_consumed);
michael@0:     remaining_out.remove_prefix(pass_filled);
michael@0:     if (exit_reason != kExitDoAgain_2) {
michael@0:       break;
michael@0:     }
michael@0:   }
michael@0:   *bytes_consumed = consumed_sum;
michael@0:   *bytes_filled = filled_sum;
michael@0:   *chars_changed = changed_sum;
michael@0: 
michael@0:   // Translate the two-byte exit code into the one-byte code space
michael@0:   const int code_delta = exit_reason - kExitOK_2;
michael@0:   return code_delta + kExitOK;
michael@0: }
michael@0: 
michael@0: // Older two-byte entry point that takes no OffsetMap.
michael@0: // Forwards every argument to the full version and passes NULL for the map.
michael@0: int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
michael@0:                     const StringPiece& istr,
michael@0:                     StringPiece& ostr,
michael@0:                     bool is_plain_text,
michael@0:                     int* bytes_consumed,
michael@0:                     int* bytes_filled,
michael@0:                     int* chars_changed) {
michael@0:   OffsetMap* no_offsetmap = NULL;
michael@0:   return UTF8GenericReplaceTwoByte(st, istr, ostr, is_plain_text,
michael@0:                     bytes_consumed, bytes_filled, chars_changed,
michael@0:                     no_offsetmap);
michael@0: }
michael@0: 
michael@0: // Oldest two-byte entry point: takes neither is_plain_text nor an OffsetMap.
michael@0: // Forwards to the full version with is_plain_text = false and a NULL map.
michael@0: int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
michael@0:                     const StringPiece& istr,
michael@0:                     StringPiece& ostr,
michael@0:                     int* bytes_consumed,
michael@0:                     int* bytes_filled,
michael@0:                     int* chars_changed) {
michael@0:   const bool plain_text_off = false;
michael@0:   OffsetMap* no_offsetmap = NULL;
michael@0:   return UTF8GenericReplaceTwoByte(st, istr, ostr, plain_text_off,
michael@0:                     bytes_consumed, bytes_filled, chars_changed,
michael@0:                     no_offsetmap);
michael@0: }
michael@0: 
michael@0: 
michael@0: 
michael@0: // Adjust a stringpiece to encompass complete UTF-8 characters.
michael@0: // Back end: trailing bytes past the last complete character are removed,
michael@0: // i.e. a truncated final character or surplus continuation bytes.
michael@0: // Front end: leading continuation bytes (10xxxxxx) are then removed so the
michael@0: // piece starts at a character boundary.
michael@0: // A piece consisting only of continuation bytes becomes empty.
michael@0: // An empty piece is returned unchanged.
michael@0: void UTF8TrimToChars(StringPiece* istr) {
michael@0:   const char* src = istr->data();
michael@0:   int len = istr->length();
michael@0:   // Exit if empty string
michael@0:   if (len == 0) {
michael@0:     return;
michael@0:   }
michael@0: 
michael@0:   // Exit on simple, common case
michael@0:   if ( ((src[0] & 0xc0) != 0x80) &&
michael@0:        (static_cast<signed char>(src[len - 1]) >= 0) ) {
michael@0:     // First byte is not a continuation and last byte is 7-bit ASCII -- done
michael@0:     return;
michael@0:   }
michael@0: 
michael@0:   // Adjust the back end, len > 0
michael@0:   const char* srclimit = src + len;
michael@0:   // Backscan over any ending continuation bytes to find last char start.
michael@0:   // s stays within [src, srclimit]: it ends one past the byte that starts
michael@0:   // the last character, or at src when every byte is a continuation byte
michael@0:   // (previously s could step to src - 1 and len + 1 bytes were removed).
michael@0:   const char* s = srclimit;
michael@0:   while ((src < s) && ((s[-1] & 0xc0) == 0x80)) {
michael@0:     s--;
michael@0:   }
michael@0:   if (src < s) {
michael@0:     const char* last_char = s - 1;      // First byte of the last character
michael@0:     int last_char_len = UTF8OneCharLen(last_char);
michael@0:     if (last_char_len <= (srclimit - last_char)) {
michael@0:       // Last char fits, so include it
michael@0:       s = last_char + last_char_len;
michael@0:     } else {
michael@0:       // Last char is truncated, so exclude it
michael@0:       s = last_char;
michael@0:     }
michael@0:   }
michael@0:   if (s != srclimit) {
michael@0:     // s is one byte beyond the last full character, if any
michael@0:     istr->remove_suffix(srclimit - s);
michael@0:     // Exit if now empty string
michael@0:     if (istr->length() == 0) {
michael@0:       return;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   // Adjust the front end, len > 0
michael@0:   len = istr->length();
michael@0:   srclimit = src + len;
michael@0:   s = src;                            // First byte of the string
michael@0:   // Scan over any beginning continuation bytes to find first char start
michael@0:   while ((s < srclimit) && ((*s & 0xc0) == 0x80)) {
michael@0:     s++;
michael@0:   }
michael@0:   if (s != src) {
michael@0:     // s is at the first full character, if any
michael@0:     istr->remove_prefix(s - src);
michael@0:   }
michael@0: }
michael@0: 
michael@0: }       // End namespace CLD2