michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // State Table follower for scanning UTF-8 strings without converting to michael@0: // 32- or 16-bit Unicode values. michael@0: // michael@0: michael@0: #ifdef COMPILER_MSVC michael@0: // MSVC warns: warning C4309: 'initializing' : truncation of constant value michael@0: // But the value is in fact not truncated. 0xFF still comes out 0xFF at michael@0: // runtime. michael@0: #pragma warning ( disable : 4309 ) michael@0: #endif michael@0: michael@0: #include "utf8statetable.h" michael@0: michael@0: #include // for uintptr_t michael@0: #include // for NULL, memcpy, memmove michael@0: michael@0: #include "integral_types.h" // for uint8, uint32, int8 michael@0: #include "stringpiece.h" michael@0: #include "offsetmap.h" michael@0: michael@0: michael@0: namespace CLD2 { michael@0: michael@0: static const int kReplaceAndResumeFlag = 0x80; // Bit in del byte to distinguish michael@0: // optional next-state field michael@0: // after replacement text michael@0: static const int kHtmlPlaintextFlag = 0x80; // Bit in add byte to distinguish michael@0: // HTML replacement vs. 
plaintext michael@0: michael@0: michael@0: /** michael@0: * This code implements a little interpreter for UTF8 state michael@0: * tables. There are three kinds of quite-similar state tables, michael@0: * property, scanning, and replacement. Each state in one of michael@0: * these tables consists of an array of 256 or 64 one-byte michael@0: * entries. The state is subscripted by an incoming source byte, michael@0: * and the entry either specifies the next state or specifies an michael@0: * action. Space-optimized tables have full 256-entry states for michael@0: * the first byte of a UTF-8 character, but only 64-entry states michael@0: * for continuation bytes. Space-optimized tables may only be michael@0: * used with source input that has been checked to be michael@0: * structurally- (or stronger interchange-) valid. michael@0: * michael@0: * A property state table has an unsigned one-byte property for michael@0: * each possible UTF-8 character. One-byte character properties michael@0: * are in the state[0] array, while for other lengths the michael@0: * state[0] array gives the next state, which contains the michael@0: * property value for two-byte characters or yet another state michael@0: * for longer ones. The code simply loads the right number of michael@0: * next-state values, then returns the final byte as property michael@0: * value. There are no actions specified in property tables. michael@0: * States are typically shared for multi-byte UTF-8 characters michael@0: * that all have the same property value. michael@0: * michael@0: * A scanning state table has entries that are either a michael@0: * next-state specifier for bytes that are accepted by the michael@0: * scanner, or an exit action for the last byte of each michael@0: * character that is rejected by the scanner. 
michael@0: * michael@0: * Scanning long strings involves a tight loop that picks up one michael@0: * byte at a time and follows next-state value back to state[0] michael@0: * for each accepted UTF-8 character. Scanning stops at the end michael@0: * of the string or at the first character encountered that has michael@0: * an exit action such as "reject". Timing information is given michael@0: * below. michael@0: * michael@0: * Since so much of Google's text is 7-bit-ASCII values michael@0: * (approximately 94% of the bytes of web documents), the michael@0: * scanning interpreter has two speed optimizations. One checks michael@0: * 8 bytes at a time to see if they are all in the range lo..hi, michael@0: * as specified in constants in the overall statetable object. michael@0: * The check involves ORing together four 4-byte values that michael@0: * overflow into the high bit of some byte when a byte is out of michael@0: * range. For seven-bit-ASCII, lo is 0x20 and hi is 0x7E. This michael@0: * loop is about 8x faster than the one-byte-at-a-time loop. michael@0: * michael@0: * If checking for exit bytes in the 0x00-0x1F and 7F range is michael@0: * unneeded, an even faster loop just looks at the high bits of michael@0: * 8 bytes at once, and is about 1.33x faster than the lo..hi michael@0: * loop. michael@0: * michael@0: * Exit from the scanning routines backs up to the first byte of michael@0: * the rejected character, so the text spanned is always a michael@0: * complete number of UTF-8 characters. The normal scanning exit michael@0: * is at the first rejected character, or at the end of the michael@0: * input text. Scanning also exits on any detected ill-formed michael@0: * character or at a special do-again action built into some michael@0: * exit-optimized tables. The do-again action gets back to the michael@0: * top of the scanning loop to retry eight-byte ASCII scans. 
 * It is typically put into state tables after four seven-bit-ASCII
 * characters in a row are seen, to allow restarting the fast
 * scan after some slower processing of multi-byte characters.
 *
 * A replacement state table is similar to a scanning state
 * table but has more extensive actions. The default
 * byte-at-a-time loop copies one byte from source to
 * destination and goes to the next state. The replacement
 * actions overwrite 1-3 bytes of the destination with different
 * bytes, possibly shortening the output by 1 or 2 bytes. The
 * replacement bytes come from within the state table, from
 * dummy states inserted just after any state that contains a
 * replacement action. This gives a quick address calculation for
 * the replacement byte(s) and gives some cache locality.
 *
 * Additional replacement actions use one or two bytes from
 * within dummy states to index a side table of more-extensive
 * replacements. The side table specifies a length of 0..15
 * destination bytes to overwrite and a length of 0..127 bytes
 * to overwrite them with, plus the actual replacement bytes.
 *
 * This side table uses one extra bit to specify a pair of
 * replacements, the first to be used in a plaintext context and
 * the second (the very next entry) to be used in an HTML
 * context. This allows replacements that are spelled with "<"
 * in the plaintext context and "&lt;" in the HTML context.
 *
 * The side table also uses an extra bit to specify a non-zero
 * next state after a replacement. This allows a combination
 * replacement and state change, used to implement a limited
 * version of the Boyer-Moore algorithm for multi-character
 * replacement without backtracking.
 * This is useful when there
 * are overlapping replacements, such as ch => x and also c =>
 * y, the latter to be used only if the character after c is not
 * h. In this case, the state[0] table's entry for c would
 * change c to y and also have a next-state of, say, n, and the
 * state[n] entry for h would specify a replacement of the two
 * bytes yh by x. No backtracking is needed.
 *
 * A replacement table may also include the exit actions of a
 * scanning state table, so some character sequences can
 * terminate early.
 *
 * During replacement, an optional data structure called an
 * offset map can be updated to reflect each change in length
 * between source and destination. This offset map can later be
 * used to map destination-string offsets to corresponding
 * source-string offsets or vice versa.
 *
 * The routines below also have variants in which state-table
 * entries are all two bytes instead of one byte. This allows
 * tables with more than 240 total states, but takes up twice as
 * much space per state.
michael@0: * michael@0: **/ michael@0: michael@0: // Return true if current Tbl pointer is within state0 range michael@0: // Note that unsigned compare checks both ends of range simultaneously michael@0: static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) { michael@0: const uint8* Tbl0 = &st->state_table[st->state0]; michael@0: return (static_cast(Tbl - Tbl0) < st->state0_size); michael@0: } michael@0: michael@0: static inline bool InStateZero_2(const UTF8ReplaceObj_2* st, michael@0: const unsigned short int* Tbl) { michael@0: const unsigned short int* Tbl0 = &st->state_table[st->state0]; michael@0: // Word difference, not byte difference michael@0: return (static_cast(Tbl - Tbl0) < st->state0_size); michael@0: } michael@0: michael@0: // UTF8PropObj, UTF8ScanObj, UTF8ReplaceObj are all typedefs of michael@0: // UTF8MachineObj. michael@0: michael@0: static bool IsPropObj(const UTF8StateMachineObj& obj) { michael@0: return obj.fast_state == NULL michael@0: && obj.max_expand == 0; michael@0: } michael@0: michael@0: static bool IsPropObj_2(const UTF8StateMachineObj_2& obj) { michael@0: return obj.fast_state == NULL michael@0: && obj.max_expand == 0; michael@0: } michael@0: michael@0: static bool IsScanObj(const UTF8StateMachineObj& obj) { michael@0: return obj.fast_state != NULL michael@0: && obj.max_expand == 0; michael@0: } michael@0: michael@0: static bool IsReplaceObj(const UTF8StateMachineObj& obj) { michael@0: // Normally, obj.fast_state != NULL, but the handwritten tables michael@0: // in utf8statetable_unittest don't handle fast_states. 
michael@0: return obj.max_expand > 0; michael@0: } michael@0: michael@0: static bool IsReplaceObj_2(const UTF8StateMachineObj_2& obj) { michael@0: return obj.max_expand > 0; michael@0: } michael@0: michael@0: // Look up property of one UTF-8 character and advance over it michael@0: // Return 0 if input length is zero michael@0: // Return 0 and advance one byte if input is ill-formed michael@0: uint8 UTF8GenericProperty(const UTF8PropObj* st, michael@0: const uint8** src, michael@0: int* srclen) { michael@0: if (*srclen <= 0) { michael@0: return 0; michael@0: } michael@0: michael@0: const uint8* lsrc = *src; michael@0: const uint8* Tbl_0 = &st->state_table[st->state0]; michael@0: const uint8* Tbl = Tbl_0; michael@0: int e; michael@0: int eshift = st->entry_shift; michael@0: michael@0: // Short series of tests faster than switch, optimizes 7-bit ASCII michael@0: unsigned char c = lsrc[0]; michael@0: if (static_cast(c) >= 0) { // one byte michael@0: e = Tbl[c]; michael@0: *src += 1; michael@0: *srclen -= 1; michael@0: } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: *src += 2; michael@0: *srclen -= 2; michael@0: } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[2]]; michael@0: *src += 3; michael@0: *srclen -= 3; michael@0: }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[2]]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[3]]; michael@0: *src += 4; michael@0: *srclen -= 4; michael@0: } else { // Ill-formed michael@0: e = 0; michael@0: *src += 1; michael@0: *srclen -= 1; michael@0: } michael@0: return e; 
michael@0: } michael@0: michael@0: bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src) { michael@0: const uint8* lsrc = reinterpret_cast(src); michael@0: const uint8* Tbl_0 = &st.state_table[st.state0]; michael@0: const uint8* Tbl = Tbl_0; michael@0: int e; michael@0: int eshift = st.entry_shift; michael@0: michael@0: // Short series of tests faster than switch, optimizes 7-bit ASCII michael@0: unsigned char c = lsrc[0]; michael@0: if (static_cast(c) >= 0) { // one byte michael@0: e = Tbl[c]; michael@0: } else if ((c & 0xe0) == 0xc0) { // two bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: } else if ((c & 0xf0) == 0xe0) { // three bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[2]]; michael@0: } else { // four bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[2]]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[3]]; michael@0: } michael@0: return e; michael@0: } michael@0: michael@0: michael@0: // BigOneByte versions are needed for tables > 240 states, but most michael@0: // won't need the TwoByte versions. michael@0: // Internally, to next-to-last offset is multiplied by 16 and the last michael@0: // offset is relative instead of absolute. 
michael@0: // Look up property of one UTF-8 character and advance over it michael@0: // Return 0 if input length is zero michael@0: // Return 0 and advance one byte if input is ill-formed michael@0: uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, michael@0: const uint8** src, michael@0: int* srclen) { michael@0: if (*srclen <= 0) { michael@0: return 0; michael@0: } michael@0: michael@0: const uint8* lsrc = *src; michael@0: const uint8* Tbl_0 = &st->state_table[st->state0]; michael@0: const uint8* Tbl = Tbl_0; michael@0: int e; michael@0: int eshift = st->entry_shift; michael@0: michael@0: // Short series of tests faster than switch, optimizes 7-bit ASCII michael@0: unsigned char c = lsrc[0]; michael@0: if (static_cast(c) >= 0) { // one byte michael@0: e = Tbl[c]; michael@0: *src += 1; michael@0: *srclen -= 1; michael@0: } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: *src += 2; michael@0: *srclen -= 2; michael@0: } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range michael@0: e = (reinterpret_cast(Tbl))[lsrc[1]]; michael@0: Tbl = &Tbl[e << eshift]; // Relative +/- michael@0: e = Tbl[lsrc[2]]; michael@0: *src += 3; michael@0: *srclen -= 3; michael@0: }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range michael@0: e = (reinterpret_cast(Tbl))[lsrc[2]]; michael@0: Tbl = &Tbl[e << eshift]; // Relative +/- michael@0: e = Tbl[lsrc[3]]; michael@0: *src += 4; michael@0: *srclen -= 4; michael@0: } else { // Ill-formed michael@0: e = 0; michael@0: *src += 1; michael@0: *srclen -= 1; michael@0: } michael@0: return e; michael@0: } michael@0: michael@0: // BigOneByte versions are needed for 
tables > 240 states, but most michael@0: // won't need the TwoByte versions. michael@0: bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src) { michael@0: const uint8* lsrc = reinterpret_cast(src); michael@0: const uint8* Tbl_0 = &st.state_table[st.state0]; michael@0: const uint8* Tbl = Tbl_0; michael@0: int e; michael@0: int eshift = st.entry_shift; michael@0: michael@0: // Short series of tests faster than switch, optimizes 7-bit ASCII michael@0: unsigned char c = lsrc[0]; michael@0: if (static_cast(c) >= 0) { // one byte michael@0: e = Tbl[c]; michael@0: } else if ((c & 0xe0) == 0xc0) { // two bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: } else if ((c & 0xf0) == 0xe0) { // three bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range michael@0: e = (reinterpret_cast(Tbl))[lsrc[1]]; michael@0: Tbl = &Tbl[e << eshift]; // Relative +/- michael@0: e = Tbl[lsrc[2]]; michael@0: } else { // four bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range michael@0: e = (reinterpret_cast(Tbl))[lsrc[2]]; michael@0: Tbl = &Tbl[e << eshift]; // Relative +/- michael@0: e = Tbl[lsrc[3]]; michael@0: } michael@0: return e; michael@0: } michael@0: michael@0: michael@0: // TwoByte versions are needed for tables > 240 states michael@0: // Look up property of one UTF-8 character and advance over it michael@0: // Return 0 if input length is zero michael@0: // Return 0 and advance one byte if input is ill-formed michael@0: uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st, michael@0: const uint8** src, michael@0: int* srclen) { michael@0: if (*srclen <= 0) { michael@0: return 0; michael@0: } michael@0: michael@0: const uint8* lsrc = *src; michael@0: const unsigned short* Tbl_0 = &st->state_table[st->state0]; michael@0: const unsigned short* Tbl = Tbl_0; 
michael@0: int e; michael@0: int eshift = st->entry_shift; michael@0: michael@0: // Short series of tests faster than switch, optimizes 7-bit ASCII michael@0: unsigned char c = lsrc[0]; michael@0: if (static_cast(c) >= 0) { // one byte michael@0: e = Tbl[c]; michael@0: *src += 1; michael@0: *srclen -= 1; michael@0: } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: *src += 2; michael@0: *srclen -= 2; michael@0: } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[2]]; michael@0: *src += 3; michael@0: *srclen -= 3; michael@0: }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[2]]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[3]]; michael@0: *src += 4; michael@0: *srclen -= 4; michael@0: } else { // Ill-formed michael@0: e = 0; michael@0: *src += 1; michael@0: *srclen -= 1; michael@0: } michael@0: return e; michael@0: } michael@0: michael@0: // TwoByte versions are needed for tables > 240 states michael@0: bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src) { michael@0: const uint8* lsrc = reinterpret_cast(src); michael@0: const unsigned short* Tbl_0 = &st.state_table[st.state0]; michael@0: const unsigned short* Tbl = Tbl_0; michael@0: int e; michael@0: int eshift = st.entry_shift; michael@0: michael@0: // Short series of tests faster than switch, optimizes 7-bit ASCII michael@0: unsigned char c = lsrc[0]; michael@0: if (static_cast(c) >= 0) { // one byte michael@0: e = Tbl[c]; michael@0: } else if ((c & 0xe0) == 0xc0) { // two bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << 
eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: } else if ((c & 0xf0) == 0xe0) { // three bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[2]]; michael@0: } else { // four bytes michael@0: e = Tbl[c]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[1]]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[2]]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: e = Tbl[lsrc[3]]; michael@0: } michael@0: return e; michael@0: } michael@0: michael@0: michael@0: // Approximate speeds on 2.8 GHz Pentium 4: michael@0: // GenericScan 1-byte loop 300 MB/sec * michael@0: // GenericScan 4-byte loop 1200 MB/sec michael@0: // GenericScan 8-byte loop 2400 MB/sec * michael@0: // GenericScanFastAscii 4-byte loop 3000 MB/sec michael@0: // GenericScanFastAscii 8-byte loop 3200 MB/sec * michael@0: // michael@0: // * Implemented below. FastAscii loop is memory-bandwidth constrained. michael@0: michael@0: // Scan a UTF-8 stringpiece based on state table. michael@0: // Always scan complete UTF-8 characters michael@0: // Set number of bytes scanned. 
Return reason for exiting michael@0: int UTF8GenericScan(const UTF8ScanObj* st, michael@0: const StringPiece& str, michael@0: int* bytes_consumed) { michael@0: int eshift = st->entry_shift; // 6 (space optimized) or 8 michael@0: // int nEntries = (1 << eshift); // 64 or 256 entries per state michael@0: michael@0: const uint8* isrc = michael@0: reinterpret_cast(str.data()); michael@0: const uint8* src = isrc; michael@0: const int len = str.length(); michael@0: const uint8* srclimit = isrc + len; michael@0: const uint8* srclimit8 = srclimit - 7; michael@0: *bytes_consumed = 0; michael@0: if (len == 0) return kExitOK; michael@0: michael@0: const uint8* Tbl_0 = &st->state_table[st->state0]; michael@0: michael@0: DoAgain: michael@0: // Do state-table scan michael@0: int e = 0; michael@0: uint8 c; michael@0: michael@0: // Do fast for groups of 8 identity bytes. michael@0: // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop, michael@0: // including slowing slightly on cr/lf/ht michael@0: //---------------------------- michael@0: const uint8* Tbl2 = &st->fast_state[0]; michael@0: uint32 losub = st->losub; michael@0: uint32 hiadd = st->hiadd; michael@0: while (src < srclimit8) { michael@0: uint32 s0123 = (reinterpret_cast(src))[0]; michael@0: uint32 s4567 = (reinterpret_cast(src))[1]; michael@0: src += 8; michael@0: // This is a fast range check for all bytes in [lowsub..0x80-hiadd) michael@0: uint32 temp = (s0123 - losub) | (s0123 + hiadd) | michael@0: (s4567 - losub) | (s4567 + hiadd); michael@0: if ((temp & 0x80808080) != 0) { michael@0: // We typically end up here on cr/lf/ht; src was incremented michael@0: int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) | michael@0: (Tbl2[src[-6]] | Tbl2[src[-5]]); michael@0: if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange michael@0: e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) | michael@0: (Tbl2[src[-2]] | Tbl2[src[-1]]); michael@0: if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange michael@0: // Else 
OK, go around again michael@0: } michael@0: } michael@0: //---------------------------- michael@0: michael@0: // Byte-at-a-time scan michael@0: //---------------------------- michael@0: const uint8* Tbl = Tbl_0; michael@0: while (src < srclimit) { michael@0: c = *src; michael@0: e = Tbl[c]; michael@0: src++; michael@0: if (e >= kExitIllegalStructure) {break;} michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: } michael@0: //---------------------------- michael@0: michael@0: michael@0: // Exit possibilities: michael@0: // Some exit code, !state0, back up over last char michael@0: // Some exit code, state0, back up one byte exactly michael@0: // source consumed, !state0, back up over partial char michael@0: // source consumed, state0, exit OK michael@0: // For illegal byte in state0, avoid backup up over PREVIOUS char michael@0: // For truncated last char, back up to beginning of it michael@0: michael@0: if (e >= kExitIllegalStructure) { michael@0: // Back up over exactly one byte of rejected/illegal UTF-8 character michael@0: src--; michael@0: // Back up more if needed michael@0: if (!InStateZero(st, Tbl)) { michael@0: do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); michael@0: } michael@0: } else if (!InStateZero(st, Tbl)) { michael@0: // Back up over truncated UTF-8 character michael@0: e = kExitIllegalStructure; michael@0: do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); michael@0: } else { michael@0: // Normal termination, source fully consumed michael@0: e = kExitOK; michael@0: } michael@0: michael@0: if (e == kExitDoAgain) { michael@0: // Loop back up to the fast scan michael@0: goto DoAgain; michael@0: } michael@0: michael@0: *bytes_consumed = src - isrc; michael@0: return e; michael@0: } michael@0: michael@0: // Scan a UTF-8 stringpiece based on state table. michael@0: // Always scan complete UTF-8 characters michael@0: // Set number of bytes scanned. 
Return reason for exiting michael@0: // OPTIMIZED for case of 7-bit ASCII 0000..007f all valid michael@0: int UTF8GenericScanFastAscii(const UTF8ScanObj* st, michael@0: const StringPiece& str, michael@0: int* bytes_consumed) { michael@0: const uint8* isrc = michael@0: reinterpret_cast(str.data()); michael@0: const uint8* src = isrc; michael@0: const int len = str.length(); michael@0: const uint8* srclimit = isrc + len; michael@0: const uint8* srclimit8 = srclimit - 7; michael@0: *bytes_consumed = 0; michael@0: if (len == 0) return kExitOK; michael@0: michael@0: int n; michael@0: int rest_consumed; michael@0: int exit_reason; michael@0: do { michael@0: // Skip 8 bytes of ASCII at a whack; no endianness issue michael@0: while ((src < srclimit8) && michael@0: (((reinterpret_cast(src)[0] | michael@0: reinterpret_cast(src)[1]) & 0x80808080) == 0)) { michael@0: src += 8; michael@0: } michael@0: // Run state table on the rest michael@0: n = src - isrc; michael@0: StringPiece str2(str.data() + n, str.length() - n); michael@0: exit_reason = UTF8GenericScan(st, str2, &rest_consumed); michael@0: src += rest_consumed; michael@0: } while ( exit_reason == kExitDoAgain ); michael@0: michael@0: *bytes_consumed = src - isrc; michael@0: return exit_reason; michael@0: } michael@0: michael@0: // Hack to change halfwidth katakana to match an old UTF8CharToLower() michael@0: michael@0: // Return number of src bytes skipped michael@0: static int DoSpecialFixup(const unsigned char c, michael@0: const unsigned char** srcp, const unsigned char* srclimit, michael@0: unsigned char** dstp, unsigned char* dstlimit) { michael@0: return 0; michael@0: } michael@0: michael@0: michael@0: // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece michael@0: // and doing text replacements. michael@0: // DO NOT CALL DIRECTLY. 
Use UTF8GenericReplace() below michael@0: // Needs caller to loop on kExitDoAgain michael@0: static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st, michael@0: const StringPiece& istr, michael@0: StringPiece& ostr, michael@0: bool is_plain_text, michael@0: int* bytes_consumed, michael@0: int* bytes_filled, michael@0: int* chars_changed, michael@0: OffsetMap* offsetmap) { michael@0: int eshift = st->entry_shift; michael@0: int nEntries = (1 << eshift); // 64 or 256 entries per state michael@0: const uint8* isrc = reinterpret_cast(istr.data()); michael@0: const int ilen = istr.length(); michael@0: const uint8* copystart = isrc; michael@0: const uint8* src = isrc; michael@0: const uint8* srclimit = src + ilen; michael@0: *bytes_consumed = 0; michael@0: *bytes_filled = 0; michael@0: *chars_changed = 0; michael@0: michael@0: const uint8* odst = reinterpret_cast(ostr.data()); michael@0: const int olen = ostr.length(); michael@0: uint8* dst = const_cast(odst); michael@0: uint8* dstlimit = dst + olen; michael@0: michael@0: int total_changed = 0; michael@0: michael@0: // Invariant condition during replacements: michael@0: // remaining dst size >= remaining src size michael@0: if ((dstlimit - dst) < (srclimit - src)) { michael@0: if (offsetmap != NULL) { michael@0: offsetmap->Copy(src - copystart); michael@0: copystart = src; michael@0: } michael@0: return kExitDstSpaceFull; michael@0: } michael@0: const uint8* Tbl_0 = &st->state_table[st->state0]; michael@0: michael@0: Do_state_table: michael@0: // Do state-table scan, copying as we go michael@0: const uint8* Tbl = Tbl_0; michael@0: int e = 0; michael@0: uint8 c = 0; michael@0: michael@0: Do_state_table_newe: michael@0: michael@0: //---------------------------- michael@0: while (src < srclimit) { michael@0: c = *src; michael@0: e = Tbl[c]; michael@0: *dst = c; michael@0: src++; michael@0: dst++; michael@0: if (e >= kExitIllegalStructure) {break;} michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: } michael@0: 
//---------------------------- michael@0: michael@0: // Exit possibilities: michael@0: // Replacement code, do the replacement and loop michael@0: // Some other exit code, state0, back up one byte exactly michael@0: // Some other exit code, !state0, back up over last char michael@0: // source consumed, state0, exit OK michael@0: // source consumed, !state0, back up over partial char michael@0: // For illegal byte in state0, avoid backup up over PREVIOUS char michael@0: // For truncated last char, back up to beginning of it michael@0: michael@0: if (e >= kExitIllegalStructure) { michael@0: // Switch on exit code; most loop back to top michael@0: int offset = 0; michael@0: switch (e) { michael@0: // These all make the output string the same size or shorter michael@0: // No checking needed michael@0: case kExitReplace31: // del 2, add 1 bytes to change michael@0: dst -= 2; michael@0: if (offsetmap != NULL) { michael@0: offsetmap->Copy(src - copystart - 2); michael@0: offsetmap->Delete(2); michael@0: copystart = src; michael@0: } michael@0: dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; michael@0: total_changed++; michael@0: goto Do_state_table; michael@0: case kExitReplace32: // del 3, add 2 bytes to change michael@0: dst--; michael@0: if (offsetmap != NULL) { michael@0: offsetmap->Copy(src - copystart - 1); michael@0: offsetmap->Delete(1); michael@0: copystart = src; michael@0: } michael@0: dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)]; michael@0: dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; michael@0: total_changed++; michael@0: goto Do_state_table; michael@0: case kExitReplace21: // del 2, add 1 bytes to change michael@0: dst--; michael@0: if (offsetmap != NULL) { michael@0: offsetmap->Copy(src - copystart - 1); michael@0: offsetmap->Delete(1); michael@0: copystart = src; michael@0: } michael@0: dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; michael@0: total_changed++; michael@0: goto Do_state_table; michael@0: case kExitReplace3: // update 3 bytes to 
change michael@0: dst[-3] = (unsigned char)Tbl[c + (nEntries * 3)]; michael@0: // Fall into next case michael@0: case kExitReplace2: // update 2 bytes to change michael@0: dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)]; michael@0: // Fall into next case michael@0: case kExitReplace1: // update 1 byte to change michael@0: dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; michael@0: total_changed++; michael@0: goto Do_state_table; michael@0: case kExitReplace1S0: // update 1 byte to change, 256-entry state michael@0: dst[-1] = (unsigned char)Tbl[c + (256 * 1)]; michael@0: total_changed++; michael@0: goto Do_state_table; michael@0: // These can make the output string longer than the input michael@0: case kExitReplaceOffset2: michael@0: if ((nEntries != 256) && InStateZero(st, Tbl)) { michael@0: // For space-optimized table, we need multiples of 256 bytes michael@0: // in state0 and multiples of nEntries in other states michael@0: offset += ((unsigned char)Tbl[c + (256 * 2)] << 8); michael@0: } else { michael@0: offset += ((unsigned char)Tbl[c + (nEntries * 2)] << 8); michael@0: } michael@0: // Fall into next case michael@0: case kExitSpecial: // Apply special fixups [read: hacks] michael@0: case kExitReplaceOffset1: michael@0: if ((nEntries != 256) && InStateZero(st, Tbl)) { michael@0: // For space-optimized table, we need multiples of 256 bytes michael@0: // in state0 and multiples of nEntries in other states michael@0: offset += (unsigned char)Tbl[c + (256 * 1)]; michael@0: } else { michael@0: offset += (unsigned char)Tbl[c + (nEntries * 1)]; michael@0: } michael@0: { michael@0: const RemapEntry* re = &st->remap_base[offset]; michael@0: int del_len = re->delete_bytes & ~kReplaceAndResumeFlag; michael@0: int add_len = re->add_bytes & ~kHtmlPlaintextFlag; michael@0: michael@0: // Special-case non-HTML replacement of five sensitive entities michael@0: // " & ' < > michael@0: // 0022 0026 0027 003c 003e michael@0: // A replacement creating one of these is expressed as 
a pair of michael@0: // entries, one for HTML output and one for plaintext output. michael@0: // The first of the pair has the high bit of add_bytes set. michael@0: if (re->add_bytes & kHtmlPlaintextFlag) { michael@0: // Use this entry for plain text michael@0: if (!is_plain_text) { michael@0: // Use very next entry for HTML text (same back/delete length) michael@0: re = &st->remap_base[offset + 1]; michael@0: add_len = re->add_bytes & ~kHtmlPlaintextFlag; michael@0: } michael@0: } michael@0: michael@0: int string_offset = re->bytes_offset; michael@0: // After the replacement, need (dstlimit - newdst) >= (srclimit - src) michael@0: uint8* newdst = dst - del_len + add_len; michael@0: if ((dstlimit - newdst) < (srclimit - src)) { michael@0: // Won't fit; don't do the replacement. Caller may realloc and retry michael@0: e = kExitDstSpaceFull; michael@0: break; // exit, backing up over this char for later retry michael@0: } michael@0: dst -= del_len; michael@0: memcpy(dst, &st->remap_string[string_offset], add_len); michael@0: dst += add_len; michael@0: total_changed++; michael@0: if (offsetmap != NULL) { michael@0: if (add_len > del_len) { michael@0: offsetmap->Copy(src - copystart); michael@0: offsetmap->Insert(add_len - del_len); michael@0: copystart = src; michael@0: } else if (add_len < del_len) { michael@0: offsetmap->Copy(src - copystart + add_len - del_len); michael@0: offsetmap->Delete(del_len - add_len); michael@0: copystart = src; michael@0: } michael@0: } michael@0: if (re->delete_bytes & kReplaceAndResumeFlag) { michael@0: // There is a non-zero target state at the end of the michael@0: // replacement string michael@0: e = st->remap_string[string_offset + add_len]; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: goto Do_state_table_newe; michael@0: } michael@0: } michael@0: if (e == kExitRejectAlt) {break;} michael@0: if (e != kExitSpecial) {goto Do_state_table;} michael@0: michael@0: // case kExitSpecial: // Apply special fixups [read: hacks] michael@0: 
// In this routine, do either UTF8CharToLower() michael@0: // fullwidth/halfwidth mapping or michael@0: // voiced mapping or michael@0: // semi-voiced mapping michael@0: michael@0: // First, do EXIT_REPLACE_OFFSET1 action (above) michael@0: // Second: do additional code fixup michael@0: { michael@0: int srcdel = DoSpecialFixup(c, &src, srclimit, &dst, dstlimit); michael@0: if (offsetmap != NULL) { michael@0: if (srcdel != 0) { michael@0: offsetmap->Copy(src - copystart - srcdel); michael@0: offsetmap->Delete(srcdel); michael@0: copystart = src; michael@0: } michael@0: } michael@0: } michael@0: goto Do_state_table; michael@0: michael@0: case kExitIllegalStructure: // structurally illegal byte; quit michael@0: case kExitReject: // NUL or illegal code encountered; quit michael@0: case kExitRejectAlt: // Apply replacement, then exit michael@0: default: // and all other exits michael@0: break; michael@0: } // End switch (e) michael@0: michael@0: // Exit possibilities: michael@0: // Some other exit code, state0, back up one byte exactly michael@0: // Some other exit code, !state0, back up over last char michael@0: michael@0: // Back up over exactly one byte of rejected/illegal UTF-8 character michael@0: src--; michael@0: dst--; michael@0: // Back up more if needed michael@0: if (!InStateZero(st, Tbl)) { michael@0: do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); michael@0: } michael@0: } else if (!InStateZero(st, Tbl)) { michael@0: // src >= srclimit, !state0 michael@0: // Back up over truncated UTF-8 character michael@0: e = kExitIllegalStructure; michael@0: do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); michael@0: } else { michael@0: // src >= srclimit, state0 michael@0: // Normal termination, source fully consumed michael@0: e = kExitOK; michael@0: } michael@0: michael@0: if (offsetmap != NULL) { michael@0: if (src > copystart) { michael@0: offsetmap->Copy(src - copystart); michael@0: copystart = src; michael@0: } michael@0: 
} michael@0: michael@0: // Possible return values here: michael@0: // kExitDstSpaceFull caller may realloc and retry from middle michael@0: // kExitIllegalStructure caller my overwrite/truncate michael@0: // kExitOK all done and happy michael@0: // kExitReject caller may overwrite/truncate michael@0: // kExitDoAgain LOOP NOT DONE; caller must retry from middle michael@0: // (may do fast ASCII loop first) michael@0: // kExitPlaceholder -unused- michael@0: // kExitNone -unused- michael@0: *bytes_consumed = src - isrc; michael@0: *bytes_filled = dst - odst; michael@0: *chars_changed = total_changed; michael@0: return e; michael@0: } michael@0: michael@0: // TwoByte versions are needed for tables > 240 states, such michael@0: // as the table for full Unicode 4.1 canonical + compatibility mapping michael@0: michael@0: // Scan a UTF-8 stringpiece based on state table with two-byte entries, michael@0: // copying to output stringpiece michael@0: // and doing text replacements. michael@0: // DO NOT CALL DIRECTLY. 
Use UTF8GenericReplace() below michael@0: // Needs caller to loop on kExitDoAgain michael@0: static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st, michael@0: const StringPiece& istr, michael@0: StringPiece& ostr, michael@0: bool is_plain_text, michael@0: int* bytes_consumed, michael@0: int* bytes_filled, michael@0: int* chars_changed, michael@0: OffsetMap* offsetmap) { michael@0: int eshift = st->entry_shift; michael@0: int nEntries = (1 << eshift); // 64 or 256 entries per state michael@0: const uint8* isrc = reinterpret_cast(istr.data()); michael@0: const int ilen = istr.length(); michael@0: const uint8* copystart = isrc; michael@0: const uint8* src = isrc; michael@0: const uint8* srclimit = src + ilen; michael@0: *bytes_consumed = 0; michael@0: *bytes_filled = 0; michael@0: *chars_changed = 0; michael@0: michael@0: const uint8* odst = reinterpret_cast(ostr.data()); michael@0: const int olen = ostr.length(); michael@0: uint8* dst = const_cast(odst); michael@0: uint8* dstlimit = dst + olen; michael@0: michael@0: *chars_changed = 0; michael@0: michael@0: int total_changed = 0; michael@0: michael@0: int src_lll = srclimit - src; michael@0: int dst_lll = dstlimit - dst; michael@0: michael@0: michael@0: // Invariant condition during replacements: michael@0: // remaining dst size >= remaining src size michael@0: if ((dstlimit - dst) < (srclimit - src)) { michael@0: if (offsetmap != NULL) { michael@0: offsetmap->Copy(src - copystart); michael@0: copystart = src; michael@0: } michael@0: return kExitDstSpaceFull_2; michael@0: } michael@0: const unsigned short* Tbl_0 = &st->state_table[st->state0]; michael@0: michael@0: Do_state_table_2: michael@0: // Do state-table scan, copying as we go michael@0: const unsigned short* Tbl = Tbl_0; michael@0: int e = 0; michael@0: uint8 c = 0; michael@0: michael@0: Do_state_table_newe_2: michael@0: michael@0: //---------------------------- michael@0: while (src < srclimit) { michael@0: c = *src; michael@0: e = Tbl[c]; 
michael@0: *dst = c; michael@0: src++; michael@0: dst++; michael@0: if (e >= kExitIllegalStructure_2) {break;} michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: } michael@0: //---------------------------- michael@0: src_lll = src - isrc; michael@0: dst_lll = dst - odst; michael@0: michael@0: // Exit possibilities: michael@0: // Replacement code, do the replacement and loop michael@0: // Some other exit code, state0, back up one byte exactly michael@0: // Some other exit code, !state0, back up over last char michael@0: // source consumed, state0, exit OK michael@0: // source consumed, !state0, back up over partial char michael@0: // For illegal byte in state0, avoid backup up over PREVIOUS char michael@0: // For truncated last char, back up to beginning of it michael@0: michael@0: if (e >= kExitIllegalStructure_2) { michael@0: // Switch on exit code; most loop back to top michael@0: int offset = 0; michael@0: switch (e) { michael@0: // These all make the output string the same size or shorter michael@0: // No checking needed michael@0: case kExitReplace31_2: // del 2, add 1 bytes to change michael@0: dst -= 2; michael@0: if (offsetmap != NULL) { michael@0: offsetmap->Copy(src - copystart - 2); michael@0: offsetmap->Delete(2); michael@0: copystart = src; michael@0: } michael@0: dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); michael@0: total_changed++; michael@0: goto Do_state_table_2; michael@0: case kExitReplace32_2: // del 3, add 2 bytes to change michael@0: dst--; michael@0: if (offsetmap != NULL) { michael@0: offsetmap->Copy(src - copystart - 1); michael@0: offsetmap->Delete(1); michael@0: copystart = src; michael@0: } michael@0: dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff); michael@0: dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); michael@0: total_changed++; michael@0: goto Do_state_table_2; michael@0: case kExitReplace21_2: // del 2, add 1 bytes to change michael@0: dst--; michael@0: if (offsetmap != NULL) { michael@0: 
offsetmap->Copy(src - copystart - 1); michael@0: offsetmap->Delete(1); michael@0: copystart = src; michael@0: } michael@0: dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); michael@0: total_changed++; michael@0: goto Do_state_table_2; michael@0: case kExitReplace3_2: // update 3 bytes to change michael@0: dst[-3] = (unsigned char)(Tbl[c + (nEntries * 2)] & 0xff); michael@0: // Fall into next case michael@0: case kExitReplace2_2: // update 2 bytes to change michael@0: dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff); michael@0: // Fall into next case michael@0: case kExitReplace1_2: // update 1 byte to change michael@0: dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); michael@0: total_changed++; michael@0: goto Do_state_table_2; michael@0: case kExitReplace1S0_2: // update 1 byte to change, 256-entry state michael@0: dst[-1] = (unsigned char)(Tbl[c + (256 * 1)] & 0xff); michael@0: total_changed++; michael@0: goto Do_state_table_2; michael@0: // These can make the output string longer than the input michael@0: case kExitReplaceOffset2_2: michael@0: if ((nEntries != 256) && InStateZero_2(st, Tbl)) { michael@0: // For space-optimized table, we need multiples of 256 bytes michael@0: // in state0 and multiples of nEntries in other states michael@0: offset += ((unsigned char)(Tbl[c + (256 * 1)] >> 8 & 0xff) << 8); michael@0: } else { michael@0: offset += ((unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff) << 8); michael@0: } michael@0: // Fall into next case michael@0: case kExitReplaceOffset1_2: michael@0: if ((nEntries != 256) && InStateZero_2(st, Tbl)) { michael@0: // For space-optimized table, we need multiples of 256 bytes michael@0: // in state0 and multiples of nEntries in other states michael@0: offset += (unsigned char)(Tbl[c + (256 * 1)] & 0xff); michael@0: } else { michael@0: offset += (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); michael@0: } michael@0: { michael@0: const RemapEntry* re = &st->remap_base[offset]; 
michael@0: int del_len = re->delete_bytes & ~kReplaceAndResumeFlag; michael@0: int add_len = re->add_bytes & ~kHtmlPlaintextFlag; michael@0: // Special-case non-HTML replacement of five sensitive entities michael@0: // " & ' < > michael@0: // 0022 0026 0027 003c 003e michael@0: // A replacement creating one of these is expressed as a pair of michael@0: // entries, one for HTML output and one for plaintext output. michael@0: // The first of the pair has the high bit of add_bytes set. michael@0: if (re->add_bytes & kHtmlPlaintextFlag) { michael@0: // Use this entry for plain text michael@0: if (!is_plain_text) { michael@0: // Use very next entry for HTML text (same back/delete length) michael@0: re = &st->remap_base[offset + 1]; michael@0: add_len = re->add_bytes & ~kHtmlPlaintextFlag; michael@0: } michael@0: } michael@0: michael@0: // After the replacement, need (dstlimit - dst) >= (srclimit - src) michael@0: int string_offset = re->bytes_offset; michael@0: // After the replacement, need (dstlimit - newdst) >= (srclimit - src) michael@0: uint8* newdst = dst - del_len + add_len; michael@0: if ((dstlimit - newdst) < (srclimit - src)) { michael@0: // Won't fit; don't do the replacement. 
Caller may realloc and retry michael@0: e = kExitDstSpaceFull_2; michael@0: break; // exit, backing up over this char for later retry michael@0: } michael@0: dst -= del_len; michael@0: memcpy(dst, &st->remap_string[string_offset], add_len); michael@0: dst += add_len; michael@0: if (offsetmap != NULL) { michael@0: if (add_len > del_len) { michael@0: offsetmap->Copy(src - copystart); michael@0: offsetmap->Insert(add_len - del_len); michael@0: copystart = src; michael@0: } else if (add_len < del_len) { michael@0: offsetmap->Copy(src - copystart + add_len - del_len); michael@0: offsetmap->Delete(del_len - add_len); michael@0: copystart = src; michael@0: } michael@0: } michael@0: if (re->delete_bytes & kReplaceAndResumeFlag) { michael@0: // There is a two-byte non-zero target state at the end of the michael@0: // replacement string michael@0: uint8 c1 = st->remap_string[string_offset + add_len]; michael@0: uint8 c2 = st->remap_string[string_offset + add_len + 1]; michael@0: e = (c1 << 8) | c2; michael@0: Tbl = &Tbl_0[e << eshift]; michael@0: total_changed++; michael@0: goto Do_state_table_newe_2; michael@0: } michael@0: } michael@0: total_changed++; michael@0: if (e == kExitRejectAlt_2) {break;} michael@0: goto Do_state_table_2; michael@0: michael@0: case kExitSpecial_2: // NO special fixups [read: hacks] michael@0: case kExitIllegalStructure_2: // structurally illegal byte; quit michael@0: case kExitReject_2: // NUL or illegal code encountered; quit michael@0: // and all other exits michael@0: default: michael@0: break; michael@0: } // End switch (e) michael@0: michael@0: // Exit possibilities: michael@0: // Some other exit code, state0, back up one byte exactly michael@0: // Some other exit code, !state0, back up over last char michael@0: michael@0: // Back up over exactly one byte of rejected/illegal UTF-8 character michael@0: src--; michael@0: dst--; michael@0: // Back up more if needed michael@0: if (!InStateZero_2(st, Tbl)) { michael@0: do {src--;dst--;} while 
((src > isrc) && ((src[0] & 0xc0) == 0x80)); michael@0: } michael@0: } else if (!InStateZero_2(st, Tbl)) { michael@0: // src >= srclimit, !state0 michael@0: // Back up over truncated UTF-8 character michael@0: e = kExitIllegalStructure_2; michael@0: michael@0: do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); michael@0: } else { michael@0: // src >= srclimit, state0 michael@0: // Normal termination, source fully consumed michael@0: e = kExitOK_2; michael@0: } michael@0: michael@0: if (offsetmap != NULL) { michael@0: if (src > copystart) { michael@0: offsetmap->Copy(src - copystart); michael@0: copystart = src; michael@0: } michael@0: } michael@0: michael@0: michael@0: // Possible return values here: michael@0: // kExitDstSpaceFull_2 caller may realloc and retry from middle michael@0: // kExitIllegalStructure_2 caller my overwrite/truncate michael@0: // kExitOK_2 all done and happy michael@0: // kExitReject_2 caller may overwrite/truncate michael@0: // kExitDoAgain_2 LOOP NOT DONE; caller must retry from middle michael@0: // (may do fast ASCII loop first) michael@0: // kExitPlaceholder_2 -unused- michael@0: // kExitNone_2 -unused- michael@0: *bytes_consumed = src - isrc; michael@0: *bytes_filled = dst - odst; michael@0: *chars_changed = total_changed; michael@0: return e; michael@0: } michael@0: michael@0: michael@0: // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece michael@0: // and doing text replacements. michael@0: // Also writes an optional OffsetMap. Pass NULL to skip writing one. michael@0: // Always scan complete UTF-8 characters michael@0: // Set number of bytes consumed from input, number filled to output. 
michael@0: // Return reason for exiting michael@0: int UTF8GenericReplace(const UTF8ReplaceObj* st, michael@0: const StringPiece& istr, michael@0: StringPiece& ostr, michael@0: bool is_plain_text, michael@0: int* bytes_consumed, michael@0: int* bytes_filled, michael@0: int* chars_changed, michael@0: OffsetMap* offsetmap) { michael@0: StringPiece local_istr(istr.data(), istr.length()); michael@0: StringPiece local_ostr(ostr.data(), ostr.length()); michael@0: int total_consumed = 0; michael@0: int total_filled = 0; michael@0: int total_changed = 0; michael@0: int local_bytes_consumed, local_bytes_filled, local_chars_changed; michael@0: int e; michael@0: do { michael@0: e = UTF8GenericReplaceInternal(st, michael@0: local_istr, local_ostr, is_plain_text, michael@0: &local_bytes_consumed, &local_bytes_filled, michael@0: &local_chars_changed, michael@0: offsetmap); michael@0: local_istr.remove_prefix(local_bytes_consumed); michael@0: local_ostr.remove_prefix(local_bytes_filled); michael@0: total_consumed += local_bytes_consumed; michael@0: total_filled += local_bytes_filled; michael@0: total_changed += local_chars_changed; michael@0: } while ( e == kExitDoAgain ); michael@0: *bytes_consumed = total_consumed; michael@0: *bytes_filled = total_filled; michael@0: *chars_changed = total_changed; michael@0: return e; michael@0: } michael@0: michael@0: // Older version without offsetmap michael@0: int UTF8GenericReplace(const UTF8ReplaceObj* st, michael@0: const StringPiece& istr, michael@0: StringPiece& ostr, michael@0: bool is_plain_text, michael@0: int* bytes_consumed, michael@0: int* bytes_filled, michael@0: int* chars_changed) { michael@0: return UTF8GenericReplace(st, michael@0: istr, michael@0: ostr, michael@0: is_plain_text, michael@0: bytes_consumed, michael@0: bytes_filled, michael@0: chars_changed, michael@0: NULL); michael@0: } michael@0: michael@0: // Older version without is_plain_text or offsetmap michael@0: int UTF8GenericReplace(const UTF8ReplaceObj* st, 
michael@0: const StringPiece& istr, michael@0: StringPiece& ostr, michael@0: int* bytes_consumed, michael@0: int* bytes_filled, michael@0: int* chars_changed) { michael@0: bool is_plain_text = false; michael@0: return UTF8GenericReplace(st, michael@0: istr, michael@0: ostr, michael@0: is_plain_text, michael@0: bytes_consumed, michael@0: bytes_filled, michael@0: chars_changed, michael@0: NULL); michael@0: } michael@0: michael@0: // Scan a UTF-8 stringpiece based on state table with two-byte entries, michael@0: // copying to output stringpiece michael@0: // and doing text replacements. michael@0: // Also writes an optional OffsetMap. Pass NULL to skip writing one. michael@0: // Always scan complete UTF-8 characters michael@0: // Set number of bytes consumed from input, number filled to output. michael@0: // Return reason for exiting michael@0: int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, michael@0: const StringPiece& istr, michael@0: StringPiece& ostr, michael@0: bool is_plain_text, michael@0: int* bytes_consumed, michael@0: int* bytes_filled, michael@0: int* chars_changed, michael@0: OffsetMap* offsetmap) { michael@0: StringPiece local_istr(istr.data(), istr.length()); michael@0: StringPiece local_ostr(ostr.data(), ostr.length()); michael@0: int total_consumed = 0; michael@0: int total_filled = 0; michael@0: int total_changed = 0; michael@0: int local_bytes_consumed, local_bytes_filled, local_chars_changed; michael@0: int e; michael@0: do { michael@0: e = UTF8GenericReplaceInternalTwoByte(st, michael@0: local_istr, local_ostr, is_plain_text, michael@0: &local_bytes_consumed, michael@0: &local_bytes_filled, michael@0: &local_chars_changed, michael@0: offsetmap); michael@0: local_istr.remove_prefix(local_bytes_consumed); michael@0: local_ostr.remove_prefix(local_bytes_filled); michael@0: total_consumed += local_bytes_consumed; michael@0: total_filled += local_bytes_filled; michael@0: total_changed += local_chars_changed; michael@0: } while ( e == 
kExitDoAgain_2 ); michael@0: *bytes_consumed = total_consumed; michael@0: *bytes_filled = total_filled; michael@0: *chars_changed = total_changed; michael@0: michael@0: return e - kExitOK_2 + kExitOK; michael@0: } michael@0: michael@0: // Older version without offsetmap michael@0: int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, michael@0: const StringPiece& istr, michael@0: StringPiece& ostr, michael@0: bool is_plain_text, michael@0: int* bytes_consumed, michael@0: int* bytes_filled, michael@0: int* chars_changed) { michael@0: return UTF8GenericReplaceTwoByte(st, michael@0: istr, michael@0: ostr, michael@0: is_plain_text, michael@0: bytes_consumed, michael@0: bytes_filled, michael@0: chars_changed, michael@0: NULL); michael@0: } michael@0: michael@0: // Older version without is_plain_text or offsetmap michael@0: int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, michael@0: const StringPiece& istr, michael@0: StringPiece& ostr, michael@0: int* bytes_consumed, michael@0: int* bytes_filled, michael@0: int* chars_changed) { michael@0: bool is_plain_text = false; michael@0: return UTF8GenericReplaceTwoByte(st, michael@0: istr, michael@0: ostr, michael@0: is_plain_text, michael@0: bytes_consumed, michael@0: bytes_filled, michael@0: chars_changed, michael@0: NULL); michael@0: } michael@0: michael@0: michael@0: michael@0: // Adjust a stringpiece to encompass complete UTF-8 characters. michael@0: // The data pointer will be increased by 0..3 bytes to get to a character michael@0: // boundary, and the length will then be decreased by 0..3 bytes michael@0: // to encompass the last complete character. 
michael@0: void UTF8TrimToChars(StringPiece* istr) { michael@0: const char* src = istr->data(); michael@0: int len = istr->length(); michael@0: // Exit if empty string michael@0: if (len == 0) { michael@0: return; michael@0: } michael@0: michael@0: // Exit on simple, common case michael@0: if ( ((src[0] & 0xc0) != 0x80) && michael@0: (static_cast(src[len - 1]) >= 0) ) { michael@0: // First byte is not a continuation and last byte is 7-bit ASCII -- done michael@0: return; michael@0: } michael@0: michael@0: // Adjust the back end, len > 0 michael@0: const char* srclimit = src + len; michael@0: // Backscan over any ending continuation bytes to find last char start michael@0: const char* s = srclimit - 1; // Last byte of the string michael@0: while ((src <= s) && ((*s & 0xc0) == 0x80)) { michael@0: s--; michael@0: } michael@0: // Include entire last char if it fits michael@0: if (src <= s) { michael@0: int last_char_len = UTF8OneCharLen(s); michael@0: if (s + last_char_len <= srclimit) { michael@0: // Last char fits, so include it, else exclude it michael@0: s += last_char_len; michael@0: } michael@0: } michael@0: if (s != srclimit) { michael@0: // s is one byte beyond the last full character, if any michael@0: istr->remove_suffix(srclimit - s); michael@0: // Exit if now empty string michael@0: if (istr->length() == 0) { michael@0: return; michael@0: } michael@0: } michael@0: michael@0: // Adjust the front end, len > 0 michael@0: len = istr->length(); michael@0: srclimit = src + len; michael@0: s = src; // First byte of the string michael@0: // Scan over any beginning continuation bytes to find first char start michael@0: while ((s < srclimit) && ((*s & 0xc0) == 0x80)) { michael@0: s++; michael@0: } michael@0: if (s != src) { michael@0: // s is at the first full character, if any michael@0: istr->remove_prefix(s - src); michael@0: } michael@0: } michael@0: michael@0: } // End namespace CLD2