browser/components/translation/cld2/internal/utf8statetable.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.
     2 //
     3 // Licensed under the Apache License, Version 2.0 (the "License");
     4 // you may not use this file except in compliance with the License.
     5 // You may obtain a copy of the License at
     6 //
     7 //     http://www.apache.org/licenses/LICENSE-2.0
     8 //
     9 // Unless required by applicable law or agreed to in writing, software
    10 // distributed under the License is distributed on an "AS IS" BASIS,
    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12 // See the License for the specific language governing permissions and
    13 // limitations under the License.
    15 //
    16 // State Table follower for scanning UTF-8 strings without converting to
    17 // 32- or 16-bit Unicode values.
    18 //
    20 #ifdef COMPILER_MSVC
    21 // MSVC warns: warning C4309: 'initializing' : truncation of constant value
    22 // But the value is in fact not truncated.  0xFF still comes out 0xFF at
    23 // runtime.
    24 #pragma warning ( disable : 4309 )
    25 #endif
    27 #include "utf8statetable.h"
    29 #include <stdint.h>                     // for uintptr_t
    30 #include <string.h>                     // for NULL, memcpy, memmove
    32 #include "integral_types.h"        // for uint8, uint32, int8
    33 #include "stringpiece.h"
    34 #include "offsetmap.h"
    37 namespace CLD2 {
    39 static const int kReplaceAndResumeFlag = 0x80; // Bit in del byte to distinguish
    40                                                // optional next-state field
    41                                                // after replacement text
    42 static const int kHtmlPlaintextFlag = 0x80;    // Bit in add byte to distinguish
    43                                                // HTML replacement vs. plaintext
    46 /**
    47  * This code implements a little interpreter for UTF8 state
    48  * tables. There are three kinds of quite-similar state tables,
    49  * property, scanning, and replacement. Each state in one of
    50  * these tables consists of an array of 256 or 64 one-byte
    51  * entries. The state is subscripted by an incoming source byte,
    52  * and the entry either specifies the next state or specifies an
    53  * action. Space-optimized tables have full 256-entry states for
    54  * the first byte of a UTF-8 character, but only 64-entry states
    55  * for continuation bytes. Space-optimized tables may only be
    56  * used with source input that has been checked to be
    57  * structurally- (or stronger interchange-) valid.
    58  *
    59  * A property state table has an unsigned one-byte property for
    60  * each possible UTF-8 character. One-byte character properties
    61  * are in the state[0] array, while for other lengths the
    62  * state[0] array gives the next state, which contains the
    63  * property value for two-byte characters or yet another state
    64  * for longer ones. The code simply loads the right number of
    65  * next-state values, then returns the final byte as property
    66  * value. There are no actions specified in property tables.
    67  * States are typically shared for multi-byte UTF-8 characters
    68  * that all have the same property value.
    69  *
    70  * A scanning state table has entries that are either a
    71  * next-state specifier for bytes that are accepted by the
    72  * scanner, or an exit action for the last byte of each
    73  * character that is rejected by the scanner.
    74  *
    75  * Scanning long strings involves a tight loop that picks up one
    76  * byte at a time and follows next-state value back to state[0]
    77  * for each accepted UTF-8 character. Scanning stops at the end
    78  * of the string or at the first character encountered that has
    79  * an exit action such as "reject". Timing information is given
    80  * below.
    81  *
    82  * Since so much of Google's text is 7-bit-ASCII values
    83  * (approximately 94% of the bytes of web documents), the
    84  * scanning interpreter has two speed optimizations. One checks
    85  * 8 bytes at a time to see if they are all in the range lo..hi,
    86  * as specified in constants in the overall statetable object.
    87  * The check involves ORing together four 4-byte values that
    88  * overflow into the high bit of some byte when a byte is out of
    89  * range. For seven-bit-ASCII, lo is 0x20 and hi is 0x7E. This
    90  * loop is about 8x faster than the one-byte-at-a-time loop.
    91  *
    92  * If checking for exit bytes in the 0x00-0x1F and 7F range is
    93  * unneeded, an even faster loop just looks at the high bits of
    94  * 8 bytes at once, and is about 1.33x faster than the lo..hi
    95  * loop.
    96  *
    97  * Exit from the scanning routines backs up to the first byte of
    98  * the rejected character, so the text spanned is always a
    99  * complete number of UTF-8 characters. The normal scanning exit
   100  * is at the first rejected character, or at the end of the
   101  * input text. Scanning also exits on any detected ill-formed
   102  * character or at a special do-again action built into some
   103  * exit-optimized tables. The do-again action gets back to the
   104  * top of the scanning loop to retry eight-byte ASCII scans. It
   105  * is typically put into state tables after four seven-bit-ASCII
   106  * characters in a row are seen, to allow restarting the fast
   107  * scan after some slower processing of multi-byte characters.
   108  *
   109  * A replacement state table is similar to a scanning state
   110  * table but has more extensive actions. The default
   111  * byte-at-a-time loop copies one byte from source to
   112  * destination and goes to the next state. The replacement
   113  * actions overwrite 1-3 bytes of the destination with different
   114  * bytes, possibly shortening the output by 1 or 2 bytes. The
   115  * replacement bytes come from within the state table, from
   116  * dummy states inserted just after any state that contains a
   117  * replacement action. This gives a quick address calculation for
   118  * the replacement byte(s) and gives some cache locality.
   119  *
   120  * Additional replacement actions use one or two bytes from
   121  * within dummy states to index a side table of more-extensive
   122  * replacements. The side table specifies a length of 0..15
   123  * destination bytes to overwrite and a length of 0..127 bytes
   124  * to overwrite them with, plus the actual replacement bytes.
   125  *
   126  * This side table uses one extra bit to specify a pair of
   127  * replacements, the first to be used in an HTML context and the
   128  * second to be used in a plaintext context. This allows
   129  * replacements that are spelled with "&lt;" in the former
   130  * context and "<" in the latter.
   131  *
   132  * The side table also uses an extra bit to specify a non-zero
   133  * next state after a replacement. This allows a combination
   134  * replacement and state change, used to implement a limited
   135  * version of the Boyer-Moore algorithm for multi-character
   136  * replacement without backtracking. This is useful when there
   137  * are overlapping replacements, such as ch => x and also c =>
   138  * y, the latter to be used only if the character after c is not
   139  * h. In this case, the state[0] table's entry for c would
   140  * change c to y and also have a next-state of say n, and the
   141  * state[n] entry for h would specify a replacement of the two
   142  * bytes yh by x. No backtracking is needed.
   143  *
   144  * A replacement table may also include the exit actions of a
   145  * scanning state table, so some character sequences can
   146  * terminate early.
   147  *
   148  * During replacement, an optional data structure called an
   149  * offset map can be updated to reflect each change in length
   150  * between source and destination. This offset map can later be
   151  * used to map destination-string offsets to corresponding
   152  * source-string offsets or vice versa.
   153  *
   154  * The routines below also have variants in which state-table
   155  * entries are all two bytes instead of one byte. This allows
   156  * tables with more than 240 total states, but takes up twice as
   157  * much space per state.
   158  *
   159 **/
   161 // Return true if current Tbl pointer is within state0 range
   162 // Note that unsigned compare checks both ends of range simultaneously
   163 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
   164   const uint8* Tbl0 = &st->state_table[st->state0];
   165   return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
   166 }
   168 static inline bool InStateZero_2(const UTF8ReplaceObj_2* st,
   169                                  const unsigned short int* Tbl) {
   170   const unsigned short int* Tbl0 =  &st->state_table[st->state0];
   171   // Word difference, not byte difference
   172   return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
   173 }
   175 // UTF8PropObj, UTF8ScanObj, UTF8ReplaceObj are all typedefs of
   176 // UTF8MachineObj.
   178 static bool IsPropObj(const UTF8StateMachineObj& obj) {
   179   return obj.fast_state == NULL
   180       && obj.max_expand == 0;
   181 }
   183 static bool IsPropObj_2(const UTF8StateMachineObj_2& obj) {
   184   return obj.fast_state == NULL
   185       && obj.max_expand == 0;
   186 }
   188 static bool IsScanObj(const UTF8StateMachineObj& obj) {
   189   return obj.fast_state != NULL
   190       && obj.max_expand == 0;
   191 }
   193 static bool IsReplaceObj(const UTF8StateMachineObj& obj) {
   194   // Normally, obj.fast_state != NULL, but the handwritten tables
   195   // in utf8statetable_unittest don't handle fast_states.
   196   return obj.max_expand > 0;
   197 }
   199 static bool IsReplaceObj_2(const UTF8StateMachineObj_2& obj) {
   200   return obj.max_expand > 0;
   201 }
   203 // Look up property of one UTF-8 character and advance over it
   204 // Return 0 if input length is zero
   205 // Return 0 and advance one byte if input is ill-formed
   206 uint8 UTF8GenericProperty(const UTF8PropObj* st,
   207                           const uint8** src,
   208                           int* srclen) {
   209   if (*srclen <= 0) {
   210     return 0;
   211   }
   213   const uint8* lsrc = *src;
   214   const uint8* Tbl_0 = &st->state_table[st->state0];
   215   const uint8* Tbl = Tbl_0;
   216   int e;
   217   int eshift = st->entry_shift;
   219   // Short series of tests faster than switch, optimizes 7-bit ASCII
   220   unsigned char c = lsrc[0];
   221   if (static_cast<signed char>(c) >= 0) {           // one byte
   222     e = Tbl[c];
   223     *src += 1;
   224     *srclen -= 1;
   225   } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
   226     e = Tbl[c];
   227     Tbl = &Tbl_0[e << eshift];
   228     e = Tbl[lsrc[1]];
   229     *src += 2;
   230     *srclen -= 2;
   231   } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
   232     e = Tbl[c];
   233     Tbl = &Tbl_0[e << eshift];
   234     e = Tbl[lsrc[1]];
   235     Tbl = &Tbl_0[e << eshift];
   236     e = Tbl[lsrc[2]];
   237     *src += 3;
   238     *srclen -= 3;
   239   }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
   240     e = Tbl[c];
   241     Tbl = &Tbl_0[e << eshift];
   242     e = Tbl[lsrc[1]];
   243     Tbl = &Tbl_0[e << eshift];
   244     e = Tbl[lsrc[2]];
   245     Tbl = &Tbl_0[e << eshift];
   246     e = Tbl[lsrc[3]];
   247     *src += 4;
   248     *srclen -= 4;
   249   } else {                                                // Ill-formed
   250     e = 0;
   251     *src += 1;
   252     *srclen -= 1;
   253   }
   254   return e;
   255 }
   257 bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src) {
   258   const uint8* lsrc = reinterpret_cast<const uint8*>(src);
   259   const uint8* Tbl_0 = &st.state_table[st.state0];
   260   const uint8* Tbl = Tbl_0;
   261   int e;
   262   int eshift = st.entry_shift;
   264   // Short series of tests faster than switch, optimizes 7-bit ASCII
   265   unsigned char c = lsrc[0];
   266   if (static_cast<signed char>(c) >= 0) {           // one byte
   267     e = Tbl[c];
   268   } else if ((c & 0xe0) == 0xc0) {     // two bytes
   269     e = Tbl[c];
   270     Tbl = &Tbl_0[e << eshift];
   271     e = Tbl[lsrc[1]];
   272   } else if ((c & 0xf0) == 0xe0) {     // three bytes
   273     e = Tbl[c];
   274     Tbl = &Tbl_0[e << eshift];
   275     e = Tbl[lsrc[1]];
   276     Tbl = &Tbl_0[e << eshift];
   277     e = Tbl[lsrc[2]];
   278   } else {                             // four bytes
   279     e = Tbl[c];
   280     Tbl = &Tbl_0[e << eshift];
   281     e = Tbl[lsrc[1]];
   282     Tbl = &Tbl_0[e << eshift];
   283     e = Tbl[lsrc[2]];
   284     Tbl = &Tbl_0[e << eshift];
   285     e = Tbl[lsrc[3]];
   286   }
   287   return e;
   288 }
   291 // BigOneByte versions are needed for tables > 240 states, but most
   292 // won't need the TwoByte versions.
   293 // Internally, to next-to-last offset is multiplied by 16 and the last
   294 // offset is relative instead of absolute.
   295 // Look up property of one UTF-8 character and advance over it
   296 // Return 0 if input length is zero
   297 // Return 0 and advance one byte if input is ill-formed
   298 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
   299                           const uint8** src,
   300                           int* srclen) {
   301   if (*srclen <= 0) {
   302     return 0;
   303   }
   305   const uint8* lsrc = *src;
   306   const uint8* Tbl_0 = &st->state_table[st->state0];
   307   const uint8* Tbl = Tbl_0;
   308   int e;
   309   int eshift = st->entry_shift;
   311   // Short series of tests faster than switch, optimizes 7-bit ASCII
   312   unsigned char c = lsrc[0];
   313   if (static_cast<signed char>(c) >= 0) {           // one byte
   314     e = Tbl[c];
   315     *src += 1;
   316     *srclen -= 1;
   317   } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
   318     e = Tbl[c];
   319     Tbl = &Tbl_0[e << eshift];
   320     e = Tbl[lsrc[1]];
   321     *src += 2;
   322     *srclen -= 2;
   323   } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
   324     e = Tbl[c];
   325     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
   326     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
   327     Tbl = &Tbl[e << eshift];          // Relative +/-
   328     e = Tbl[lsrc[2]];
   329     *src += 3;
   330     *srclen -= 3;
   331   }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
   332     e = Tbl[c];
   333     Tbl = &Tbl_0[e << eshift];
   334     e = Tbl[lsrc[1]];
   335     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
   336     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
   337     Tbl = &Tbl[e << eshift];          // Relative +/-
   338     e = Tbl[lsrc[3]];
   339     *src += 4;
   340     *srclen -= 4;
   341   } else {                                                // Ill-formed
   342     e = 0;
   343     *src += 1;
   344     *srclen -= 1;
   345   }
   346   return e;
   347 }
   349 // BigOneByte versions are needed for tables > 240 states, but most
   350 // won't need the TwoByte versions.
   351 bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src) {
   352   const uint8* lsrc = reinterpret_cast<const uint8*>(src);
   353   const uint8* Tbl_0 = &st.state_table[st.state0];
   354   const uint8* Tbl = Tbl_0;
   355   int e;
   356   int eshift = st.entry_shift;
   358   // Short series of tests faster than switch, optimizes 7-bit ASCII
   359   unsigned char c = lsrc[0];
   360   if (static_cast<signed char>(c) >= 0) {           // one byte
   361     e = Tbl[c];
   362   } else if ((c & 0xe0) == 0xc0) {    // two bytes
   363     e = Tbl[c];
   364     Tbl = &Tbl_0[e << eshift];
   365     e = Tbl[lsrc[1]];
   366   } else if ((c & 0xf0) == 0xe0) {    // three bytes
   367     e = Tbl[c];
   368     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
   369     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
   370     Tbl = &Tbl[e << eshift];          // Relative +/-
   371     e = Tbl[lsrc[2]];
   372   } else {                            // four bytes
   373     e = Tbl[c];
   374     Tbl = &Tbl_0[e << eshift];
   375     e = Tbl[lsrc[1]];
   376     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
   377     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
   378     Tbl = &Tbl[e << eshift];          // Relative +/-
   379     e = Tbl[lsrc[3]];
   380   }
   381   return e;
   382 }
   385 // TwoByte versions are needed for tables > 240 states
   386 // Look up property of one UTF-8 character and advance over it
   387 // Return 0 if input length is zero
   388 // Return 0 and advance one byte if input is ill-formed
   389 uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
   390                           const uint8** src,
   391                           int* srclen) {
   392   if (*srclen <= 0) {
   393     return 0;
   394   }
   396   const uint8* lsrc = *src;
   397   const unsigned short* Tbl_0 = &st->state_table[st->state0];
   398   const unsigned short* Tbl = Tbl_0;
   399   int e;
   400   int eshift = st->entry_shift;
   402   // Short series of tests faster than switch, optimizes 7-bit ASCII
   403   unsigned char c = lsrc[0];
   404   if (static_cast<signed char>(c) >= 0) {           // one byte
   405     e = Tbl[c];
   406     *src += 1;
   407     *srclen -= 1;
   408   } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
   409     e = Tbl[c];
   410     Tbl = &Tbl_0[e << eshift];
   411     e = Tbl[lsrc[1]];
   412     *src += 2;
   413     *srclen -= 2;
   414   } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
   415     e = Tbl[c];
   416     Tbl = &Tbl_0[e << eshift];
   417     e = Tbl[lsrc[1]];
   418     Tbl = &Tbl_0[e << eshift];
   419     e = Tbl[lsrc[2]];
   420     *src += 3;
   421     *srclen -= 3;
   422   }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
   423     e = Tbl[c];
   424     Tbl = &Tbl_0[e << eshift];
   425     e = Tbl[lsrc[1]];
   426     Tbl = &Tbl_0[e << eshift];
   427     e = Tbl[lsrc[2]];
   428     Tbl = &Tbl_0[e << eshift];
   429     e = Tbl[lsrc[3]];
   430     *src += 4;
   431     *srclen -= 4;
   432   } else {                                                // Ill-formed
   433     e = 0;
   434     *src += 1;
   435     *srclen -= 1;
   436   }
   437   return e;
   438 }
   440 // TwoByte versions are needed for tables > 240 states
   441 bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src) {
   442   const uint8* lsrc = reinterpret_cast<const uint8*>(src);
   443   const unsigned short* Tbl_0 = &st.state_table[st.state0];
   444   const unsigned short* Tbl = Tbl_0;
   445   int e;
   446   int eshift = st.entry_shift;
   448   // Short series of tests faster than switch, optimizes 7-bit ASCII
   449   unsigned char c = lsrc[0];
   450   if (static_cast<signed char>(c) >= 0) {           // one byte
   451     e = Tbl[c];
   452   } else if ((c & 0xe0) == 0xc0) {     // two bytes
   453     e = Tbl[c];
   454     Tbl = &Tbl_0[e << eshift];
   455     e = Tbl[lsrc[1]];
   456   } else if ((c & 0xf0) == 0xe0) {     // three bytes
   457     e = Tbl[c];
   458     Tbl = &Tbl_0[e << eshift];
   459     e = Tbl[lsrc[1]];
   460     Tbl = &Tbl_0[e << eshift];
   461     e = Tbl[lsrc[2]];
   462   } else {                             // four bytes
   463     e = Tbl[c];
   464     Tbl = &Tbl_0[e << eshift];
   465     e = Tbl[lsrc[1]];
   466     Tbl = &Tbl_0[e << eshift];
   467     e = Tbl[lsrc[2]];
   468     Tbl = &Tbl_0[e << eshift];
   469     e = Tbl[lsrc[3]];
   470   }
   471   return e;
   472 }
   475 // Approximate speeds on 2.8 GHz Pentium 4:
   476 //   GenericScan 1-byte loop           300 MB/sec *
   477 //   GenericScan 4-byte loop          1200 MB/sec
   478 //   GenericScan 8-byte loop          2400 MB/sec *
   479 //   GenericScanFastAscii 4-byte loop 3000 MB/sec
   480 //   GenericScanFastAscii 8-byte loop 3200 MB/sec *
   481 //
   482 // * Implemented below. FastAscii loop is memory-bandwidth constrained.
   484 // Scan a UTF-8 stringpiece based on state table.
   485 // Always scan complete UTF-8 characters
   486 // Set number of bytes scanned. Return reason for exiting
   487 int UTF8GenericScan(const UTF8ScanObj* st,
   488                     const StringPiece& str,
   489                     int* bytes_consumed) {
   490   int eshift = st->entry_shift;       // 6 (space optimized) or 8
   491   // int nEntries = (1 << eshift);       // 64 or 256 entries per state
   493   const uint8* isrc =
   494     reinterpret_cast<const uint8*>(str.data());
   495   const uint8* src = isrc;
   496   const int len = str.length();
   497   const uint8* srclimit = isrc + len;
   498   const uint8* srclimit8 = srclimit - 7;
   499   *bytes_consumed = 0;
   500   if (len == 0) return kExitOK;
   502   const uint8* Tbl_0 = &st->state_table[st->state0];
   504 DoAgain:
   505   // Do state-table scan
   506   int e = 0;
   507   uint8 c;
   509   // Do fast for groups of 8 identity bytes.
   510   // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
   511   // including slowing slightly on cr/lf/ht
   512   //----------------------------
   513   const uint8* Tbl2 = &st->fast_state[0];
   514   uint32 losub = st->losub;
   515   uint32 hiadd = st->hiadd;
   516   while (src < srclimit8) {
   517     uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
   518     uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
   519     src += 8;
   520     // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
   521     uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
   522                   (s4567 - losub) | (s4567 + hiadd);
   523     if ((temp & 0x80808080) != 0) {
   524       // We typically end up here on cr/lf/ht; src was incremented
   525       int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
   526                   (Tbl2[src[-6]] | Tbl2[src[-5]]);
   527       if (e0123 != 0) {src -= 8; break;}    // Exit on Non-interchange
   528       e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
   529               (Tbl2[src[-2]] | Tbl2[src[-1]]);
   530       if (e0123 != 0) {src -= 4; break;}    // Exit on Non-interchange
   531       // Else OK, go around again
   532     }
   533   }
   534   //----------------------------
   536   // Byte-at-a-time scan
   537   //----------------------------
   538   const uint8* Tbl = Tbl_0;
   539   while (src < srclimit) {
   540     c = *src;
   541     e = Tbl[c];
   542     src++;
   543     if (e >= kExitIllegalStructure) {break;}
   544     Tbl = &Tbl_0[e << eshift];
   545   }
   546   //----------------------------
   549   // Exit possibilities:
   550   //  Some exit code, !state0, back up over last char
   551   //  Some exit code, state0, back up one byte exactly
   552   //  source consumed, !state0, back up over partial char
   553   //  source consumed, state0, exit OK
   554   // For illegal byte in state0, avoid backup up over PREVIOUS char
   555   // For truncated last char, back up to beginning of it
   557   if (e >= kExitIllegalStructure) {
   558     // Back up over exactly one byte of rejected/illegal UTF-8 character
   559     src--;
   560     // Back up more if needed
   561     if (!InStateZero(st, Tbl)) {
   562       do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
   563     }
   564   } else if (!InStateZero(st, Tbl)) {
   565     // Back up over truncated UTF-8 character
   566     e = kExitIllegalStructure;
   567     do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
   568   } else {
   569     // Normal termination, source fully consumed
   570     e = kExitOK;
   571   }
   573   if (e == kExitDoAgain) {
   574     // Loop back up to the fast scan
   575     goto DoAgain;
   576   }
   578   *bytes_consumed = src - isrc;
   579   return e;
   580 }
   582 // Scan a UTF-8 stringpiece based on state table.
   583 // Always scan complete UTF-8 characters
   584 // Set number of bytes scanned. Return reason for exiting
   585 // OPTIMIZED for case of 7-bit ASCII 0000..007f all valid
   586 int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
   587                     const StringPiece& str,
   588                     int* bytes_consumed) {
   589   const uint8* isrc =
   590     reinterpret_cast<const uint8*>(str.data());
   591   const uint8* src = isrc;
   592   const int len = str.length();
   593   const uint8* srclimit = isrc + len;
   594   const uint8* srclimit8 = srclimit - 7;
   595   *bytes_consumed = 0;
   596   if (len == 0) return kExitOK;
   598   int n;
   599   int rest_consumed;
   600   int exit_reason;
   601   do {
   602     // Skip 8 bytes of ASCII at a whack; no endianness issue
   603     while ((src < srclimit8) &&
   604            (((reinterpret_cast<const uint32*>(src)[0] |
   605               reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
   606       src += 8;
   607     }
   608     // Run state table on the rest
   609     n = src - isrc;
   610     StringPiece str2(str.data() + n, str.length() - n);
   611     exit_reason = UTF8GenericScan(st, str2, &rest_consumed);
   612     src += rest_consumed;
   613   } while ( exit_reason == kExitDoAgain );
   615   *bytes_consumed = src - isrc;
   616   return exit_reason;
   617 }
   619 // Hack to change halfwidth katakana to match an old UTF8CharToLower()
   621 // Return number of src bytes skipped
   622 static int DoSpecialFixup(const unsigned char c,
   623                     const unsigned char** srcp, const unsigned char* srclimit,
   624                     unsigned char** dstp, unsigned char* dstlimit) {
   625   return 0;
   626 }
   629 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
   630 //   and doing text replacements.
   631 // DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below
   632 //   Needs caller to loop on kExitDoAgain
   633 static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st,
   634                     const StringPiece& istr,
   635                     StringPiece& ostr,
   636                     bool is_plain_text,
   637                     int* bytes_consumed,
   638                     int* bytes_filled,
   639                     int* chars_changed,
   640                     OffsetMap* offsetmap) {
   641   int eshift = st->entry_shift;
   642   int nEntries = (1 << eshift);       // 64 or 256 entries per state
   643   const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());
   644   const int ilen = istr.length();
   645   const uint8* copystart = isrc;
   646   const uint8* src = isrc;
   647   const uint8* srclimit = src + ilen;
   648   *bytes_consumed = 0;
   649   *bytes_filled = 0;
   650   *chars_changed = 0;
   652   const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());
   653   const int olen = ostr.length();
   654   uint8* dst = const_cast<uint8*>(odst);
   655   uint8* dstlimit = dst + olen;
   657   int total_changed = 0;
   659   // Invariant condition during replacements:
   660   //  remaining dst size >= remaining src size
   661   if ((dstlimit - dst) < (srclimit - src)) {
   662     if (offsetmap != NULL) {
   663       offsetmap->Copy(src - copystart);
   664       copystart = src;
   665     }
   666     return kExitDstSpaceFull;
   667   }
   668   const uint8* Tbl_0 = &st->state_table[st->state0];
   670  Do_state_table:
   671   // Do state-table scan, copying as we go
   672   const uint8* Tbl = Tbl_0;
   673   int e = 0;
   674   uint8 c = 0;
   676  Do_state_table_newe:
   678   //----------------------------
   679   while (src < srclimit) {
   680     c = *src;
   681     e = Tbl[c];
   682     *dst = c;
   683     src++;
   684     dst++;
   685     if (e >= kExitIllegalStructure) {break;}
   686     Tbl = &Tbl_0[e << eshift];
   687   }
   688   //----------------------------
   690   // Exit possibilities:
   691   //  Replacement code, do the replacement and loop
   692   //  Some other exit code, state0, back up one byte exactly
   693   //  Some other exit code, !state0, back up over last char
   694   //  source consumed, state0, exit OK
   695   //  source consumed, !state0, back up over partial char
   696   // For illegal byte in state0, avoid backup up over PREVIOUS char
   697   // For truncated last char, back up to beginning of it
   699   if (e >= kExitIllegalStructure) {
   700     // Switch on exit code; most loop back to top
   701     int offset = 0;
   702     switch (e) {
   703     // These all make the output string the same size or shorter
   704     // No checking needed
   705     case kExitReplace31:    // del 2, add 1 bytes to change
   706       dst -= 2;
   707       if (offsetmap != NULL) {
   708         offsetmap->Copy(src - copystart - 2);
   709         offsetmap->Delete(2);
   710         copystart = src;
   711       }
   712       dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
   713       total_changed++;
   714       goto Do_state_table;
   715     case kExitReplace32:    // del 3, add 2 bytes to change
   716       dst--;
   717       if (offsetmap != NULL) {
   718         offsetmap->Copy(src - copystart - 1);
   719         offsetmap->Delete(1);
   720         copystart = src;
   721       }
   722       dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)];
   723       dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
   724       total_changed++;
   725       goto Do_state_table;
   726     case kExitReplace21:    // del 2, add 1 bytes to change
   727       dst--;
   728       if (offsetmap != NULL) {
   729         offsetmap->Copy(src - copystart - 1);
   730         offsetmap->Delete(1);
   731         copystart = src;
   732       }
   733       dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
   734       total_changed++;
   735       goto Do_state_table;
   736     case kExitReplace3:    // update 3 bytes to change
   737       dst[-3] = (unsigned char)Tbl[c + (nEntries * 3)];
   738       // Fall into next case
   739     case kExitReplace2:    // update 2 bytes to change
   740       dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)];
   741       // Fall into next case
   742     case kExitReplace1:    // update 1 byte to change
   743       dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];
   744       total_changed++;
   745       goto Do_state_table;
   746     case kExitReplace1S0:     // update 1 byte to change, 256-entry state
   747       dst[-1] = (unsigned char)Tbl[c + (256 * 1)];
   748       total_changed++;
   749       goto Do_state_table;
   750     // These can make the output string longer than the input
   751     case kExitReplaceOffset2:
   752       if ((nEntries != 256) && InStateZero(st, Tbl)) {
   753         // For space-optimized table, we need multiples of 256 bytes
   754         // in state0 and multiples of nEntries in other states
   755         offset += ((unsigned char)Tbl[c + (256 * 2)] << 8);
   756       } else {
   757         offset += ((unsigned char)Tbl[c + (nEntries * 2)] << 8);
   758       }
   759       // Fall into next case
   760     case kExitSpecial:      // Apply special fixups [read: hacks]
   761     case kExitReplaceOffset1:
   762       if ((nEntries != 256) && InStateZero(st, Tbl)) {
   763         // For space-optimized table, we need multiples of 256 bytes
   764         // in state0 and multiples of nEntries in other states
   765         offset += (unsigned char)Tbl[c + (256 * 1)];
   766       } else {
   767         offset += (unsigned char)Tbl[c + (nEntries * 1)];
   768       }
   769       {
   770         const RemapEntry* re = &st->remap_base[offset];
   771         int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;
   772         int add_len = re->add_bytes & ~kHtmlPlaintextFlag;
   774         // Special-case non-HTML replacement of five sensitive entities
   775         //   &quot; &amp; &apos; &lt; &gt;
   776         //   0022   0026  0027   003c 003e
   777         // A replacement creating one of these is expressed as a pair of
   778         // entries, one for HTML output and one for plaintext output.
   779         // The first of the pair has the high bit of add_bytes set.
   780         if (re->add_bytes & kHtmlPlaintextFlag) {
   781           // Use this entry for plain text
   782           if (!is_plain_text) {
   783             // Use very next entry for HTML text (same back/delete length)
   784             re = &st->remap_base[offset + 1];
   785             add_len = re->add_bytes & ~kHtmlPlaintextFlag;
   786           }
   787         }
   789         int string_offset = re->bytes_offset;
   790         // After the replacement, need (dstlimit - newdst) >= (srclimit - src)
   791         uint8* newdst = dst - del_len + add_len;
   792         if ((dstlimit - newdst) < (srclimit - src)) {
   793           // Won't fit; don't do the replacement. Caller may realloc and retry
   794           e = kExitDstSpaceFull;
   795           break;    // exit, backing up over this char for later retry
   796         }
   797         dst -= del_len;
   798         memcpy(dst, &st->remap_string[string_offset], add_len);
   799         dst += add_len;
   800         total_changed++;
   801         if (offsetmap != NULL) {
   802           if (add_len > del_len) {
   803             offsetmap->Copy(src - copystart);
   804             offsetmap->Insert(add_len - del_len);
   805             copystart = src;
   806           } else if (add_len < del_len) {
   807             offsetmap->Copy(src - copystart + add_len - del_len);
   808             offsetmap->Delete(del_len - add_len);
   809             copystart = src;
   810           }
   811         }
   812         if (re->delete_bytes & kReplaceAndResumeFlag) {
   813           // There is a non-zero  target state at the end of the
   814           // replacement string
   815           e = st->remap_string[string_offset + add_len];
   816           Tbl = &Tbl_0[e << eshift];
   817           goto Do_state_table_newe;
   818         }
   819       }
   820       if (e == kExitRejectAlt) {break;}
   821       if (e != kExitSpecial) {goto Do_state_table;}
   823     // case kExitSpecial:      // Apply special fixups [read: hacks]
   824       // In this routine, do either UTF8CharToLower()
   825       //   fullwidth/halfwidth mapping or
   826       //   voiced mapping or
   827       //   semi-voiced mapping
   829       // First, do EXIT_REPLACE_OFFSET1 action (above)
   830       // Second: do additional code fixup
   831       {
   832         int srcdel = DoSpecialFixup(c, &src, srclimit, &dst, dstlimit);
   833         if (offsetmap != NULL) {
   834           if (srcdel != 0) {
   835             offsetmap->Copy(src - copystart - srcdel);
   836             offsetmap->Delete(srcdel);
   837             copystart = src;
   838           }
   839         }
   840       }
   841       goto Do_state_table;
   843     case kExitIllegalStructure:   // structurally illegal byte; quit
   844     case kExitReject:             // NUL or illegal code encountered; quit
   845     case kExitRejectAlt:          // Apply replacement, then exit
   846     default:                      // and all other exits
   847       break;
   848     }   // End switch (e)
   850     // Exit possibilities:
   851     //  Some other exit code, state0, back up one byte exactly
   852     //  Some other exit code, !state0, back up over last char
   854     // Back up over exactly one byte of rejected/illegal UTF-8 character
   855     src--;
   856     dst--;
   857     // Back up more if needed
   858     if (!InStateZero(st, Tbl)) {
   859       do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
   860     }
   861   } else if (!InStateZero(st, Tbl)) {
   862     // src >= srclimit, !state0
   863     // Back up over truncated UTF-8 character
   864     e = kExitIllegalStructure;
   865     do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
   866   } else {
   867     // src >= srclimit, state0
   868     // Normal termination, source fully consumed
   869     e = kExitOK;
   870   }
   872   if (offsetmap != NULL) {
   873     if (src > copystart) {
   874       offsetmap->Copy(src - copystart);
   875       copystart = src;
   876     }
   877   }
   879   // Possible return values here:
   880   //  kExitDstSpaceFull         caller may realloc and retry from middle
   881   //  kExitIllegalStructure     caller my overwrite/truncate
   882   //  kExitOK                   all done and happy
   883   //  kExitReject               caller may overwrite/truncate
   884   //  kExitDoAgain              LOOP NOT DONE; caller must retry from middle
   885   //                            (may do fast ASCII loop first)
   886   //  kExitPlaceholder          -unused-
   887   //  kExitNone                 -unused-
   888   *bytes_consumed = src - isrc;
   889   *bytes_filled = dst - odst;
   890   *chars_changed = total_changed;
   891   return e;
   892 }
   894 // TwoByte versions are needed for tables > 240 states, such
   895 // as the table for full Unicode 4.1 canonical + compatibility mapping
   897 // Scan a UTF-8 stringpiece based on state table with two-byte entries,
   898 //   copying to output stringpiece
   899 //   and doing text replacements.
   900 // DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below
   901 //   Needs caller to loop on kExitDoAgain
   902 static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st,
   903                     const StringPiece& istr,
   904                     StringPiece& ostr,
   905                     bool is_plain_text,
   906                     int* bytes_consumed,
   907                     int* bytes_filled,
   908                     int* chars_changed,
   909                     OffsetMap* offsetmap) {
   910   int eshift = st->entry_shift;
   911   int nEntries = (1 << eshift);       // 64 or 256 entries per state
   912   const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());
   913   const int ilen = istr.length();
   914   const uint8* copystart = isrc;
   915   const uint8* src = isrc;
   916   const uint8* srclimit = src + ilen;
   917   *bytes_consumed = 0;
   918   *bytes_filled = 0;
   919   *chars_changed = 0;
   921   const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());
   922   const int olen = ostr.length();
   923   uint8* dst = const_cast<uint8*>(odst);
   924   uint8* dstlimit = dst + olen;
   926   *chars_changed = 0;
   928   int total_changed = 0;
   930   int src_lll = srclimit - src;
   931   int dst_lll = dstlimit - dst;
   934   // Invariant condition during replacements:
   935   //  remaining dst size >= remaining src size
   936   if ((dstlimit - dst) < (srclimit - src)) {
   937     if (offsetmap != NULL) {
   938       offsetmap->Copy(src - copystart);
   939       copystart = src;
   940     }
   941     return kExitDstSpaceFull_2;
   942   }
   943   const unsigned short* Tbl_0 = &st->state_table[st->state0];
   945  Do_state_table_2:
   946   // Do state-table scan, copying as we go
   947   const unsigned short* Tbl = Tbl_0;
   948   int e = 0;
   949   uint8 c = 0;
   951  Do_state_table_newe_2:
   953   //----------------------------
   954   while (src < srclimit) {
   955     c = *src;
   956     e = Tbl[c];
   957     *dst = c;
   958     src++;
   959     dst++;
   960     if (e >= kExitIllegalStructure_2) {break;}
   961     Tbl = &Tbl_0[e << eshift];
   962   }
   963   //----------------------------
   964   src_lll = src - isrc;
   965   dst_lll = dst - odst;
   967   // Exit possibilities:
   968   //  Replacement code, do the replacement and loop
   969   //  Some other exit code, state0, back up one byte exactly
   970   //  Some other exit code, !state0, back up over last char
   971   //  source consumed, state0, exit OK
   972   //  source consumed, !state0, back up over partial char
   973   // For illegal byte in state0, avoid backup up over PREVIOUS char
   974   // For truncated last char, back up to beginning of it
   976   if (e >= kExitIllegalStructure_2) {
   977     // Switch on exit code; most loop back to top
   978     int offset = 0;
   979     switch (e) {
   980     // These all make the output string the same size or shorter
   981     // No checking needed
   982     case kExitReplace31_2:    // del 2, add 1 bytes to change
   983       dst -= 2;
   984       if (offsetmap != NULL) {
   985         offsetmap->Copy(src - copystart - 2);
   986         offsetmap->Delete(2);
   987         copystart = src;
   988       }
   989       dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
   990       total_changed++;
   991       goto Do_state_table_2;
   992     case kExitReplace32_2:    // del 3, add 2 bytes to change
   993       dst--;
   994       if (offsetmap != NULL) {
   995         offsetmap->Copy(src - copystart - 1);
   996         offsetmap->Delete(1);
   997         copystart = src;
   998       }
   999       dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);
  1000       dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
  1001       total_changed++;
  1002       goto Do_state_table_2;
  1003     case kExitReplace21_2:    // del 2, add 1 bytes to change
  1004       dst--;
  1005       if (offsetmap != NULL) {
  1006         offsetmap->Copy(src - copystart - 1);
  1007         offsetmap->Delete(1);
  1008         copystart = src;
  1010       dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
  1011       total_changed++;
  1012       goto Do_state_table_2;
  1013     case kExitReplace3_2:    // update 3 bytes to change
  1014       dst[-3] = (unsigned char)(Tbl[c + (nEntries * 2)] & 0xff);
  1015       // Fall into next case
  1016     case kExitReplace2_2:    // update 2 bytes to change
  1017       dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);
  1018       // Fall into next case
  1019     case kExitReplace1_2:    // update 1 byte to change
  1020       dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
  1021       total_changed++;
  1022       goto Do_state_table_2;
  1023     case kExitReplace1S0_2:     // update 1 byte to change, 256-entry state
  1024       dst[-1] = (unsigned char)(Tbl[c + (256 * 1)] & 0xff);
  1025       total_changed++;
  1026       goto Do_state_table_2;
  1027     // These can make the output string longer than the input
  1028     case kExitReplaceOffset2_2:
  1029       if ((nEntries != 256) && InStateZero_2(st, Tbl)) {
  1030         // For space-optimized table, we need multiples of 256 bytes
  1031         // in state0 and multiples of nEntries in other states
  1032         offset += ((unsigned char)(Tbl[c + (256 * 1)] >> 8 & 0xff) << 8);
  1033       } else {
  1034         offset += ((unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff) << 8);
  1036       // Fall into next case
  1037     case kExitReplaceOffset1_2:
  1038       if ((nEntries != 256) && InStateZero_2(st, Tbl)) {
  1039         // For space-optimized table, we need multiples of 256 bytes
  1040         // in state0 and multiples of nEntries in other states
  1041         offset += (unsigned char)(Tbl[c + (256 * 1)] & 0xff);
  1042       } else {
  1043         offset += (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
  1046         const RemapEntry* re = &st->remap_base[offset];
  1047         int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;
  1048         int add_len = re->add_bytes & ~kHtmlPlaintextFlag;
  1049         // Special-case non-HTML replacement of five sensitive entities
  1050         //   &quot; &amp; &apos; &lt; &gt;
  1051         //   0022   0026  0027   003c 003e
  1052         // A replacement creating one of these is expressed as a pair of
  1053         // entries, one for HTML output and one for plaintext output.
  1054         // The first of the pair has the high bit of add_bytes set.
  1055         if (re->add_bytes & kHtmlPlaintextFlag) {
  1056           // Use this entry for plain text
  1057           if (!is_plain_text) {
  1058             // Use very next entry for HTML text (same back/delete length)
  1059             re = &st->remap_base[offset + 1];
  1060             add_len = re->add_bytes & ~kHtmlPlaintextFlag;
  1064         // After the replacement, need (dstlimit - dst) >= (srclimit - src)
  1065         int string_offset = re->bytes_offset;
  1066         // After the replacement, need (dstlimit - newdst) >= (srclimit - src)
  1067         uint8* newdst = dst - del_len + add_len;
  1068         if ((dstlimit - newdst) < (srclimit - src)) {
  1069           // Won't fit; don't do the replacement. Caller may realloc and retry
  1070           e = kExitDstSpaceFull_2;
  1071           break;    // exit, backing up over this char for later retry
  1073         dst -= del_len;
  1074         memcpy(dst, &st->remap_string[string_offset], add_len);
  1075         dst += add_len;
  1076         if (offsetmap != NULL) {
  1077           if (add_len > del_len) {
  1078             offsetmap->Copy(src - copystart);
  1079             offsetmap->Insert(add_len - del_len);
  1080             copystart = src;
  1081           } else if (add_len < del_len) {
  1082             offsetmap->Copy(src - copystart + add_len - del_len);
  1083             offsetmap->Delete(del_len - add_len);
  1084             copystart = src;
  1087         if (re->delete_bytes & kReplaceAndResumeFlag) {
  1088           // There is a two-byte non-zero target state at the end of the
  1089           // replacement string
  1090           uint8 c1 = st->remap_string[string_offset + add_len];
  1091           uint8 c2 = st->remap_string[string_offset + add_len + 1];
  1092           e = (c1 << 8) | c2;
  1093           Tbl = &Tbl_0[e << eshift];
  1094           total_changed++;
  1095           goto Do_state_table_newe_2;
  1098       total_changed++;
  1099       if (e == kExitRejectAlt_2) {break;}
  1100       goto Do_state_table_2;
  1102     case kExitSpecial_2:           // NO special fixups [read: hacks]
  1103     case kExitIllegalStructure_2:  // structurally illegal byte; quit
  1104     case kExitReject_2:            // NUL or illegal code encountered; quit
  1105                                    // and all other exits
  1106     default:
  1107       break;
  1108     }   // End switch (e)
  1110     // Exit possibilities:
  1111     //  Some other exit code, state0, back up one byte exactly
  1112     //  Some other exit code, !state0, back up over last char
  1114     // Back up over exactly one byte of rejected/illegal UTF-8 character
  1115     src--;
  1116     dst--;
  1117     // Back up more if needed
  1118     if (!InStateZero_2(st, Tbl)) {
  1119       do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
  1121   } else if (!InStateZero_2(st, Tbl)) {
  1122     // src >= srclimit, !state0
  1123     // Back up over truncated UTF-8 character
  1124     e = kExitIllegalStructure_2;
  1126     do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
  1127   } else {
  1128     // src >= srclimit, state0
  1129     // Normal termination, source fully consumed
  1130     e = kExitOK_2;
  1133   if (offsetmap != NULL) {
  1134     if (src > copystart) {
  1135       offsetmap->Copy(src - copystart);
  1136       copystart = src;
  1141   // Possible return values here:
  1142   //  kExitDstSpaceFull_2         caller may realloc and retry from middle
  1143   //  kExitIllegalStructure_2     caller my overwrite/truncate
  1144   //  kExitOK_2                   all done and happy
  1145   //  kExitReject_2               caller may overwrite/truncate
  1146   //  kExitDoAgain_2              LOOP NOT DONE; caller must retry from middle
  1147   //                            (may do fast ASCII loop first)
  1148   //  kExitPlaceholder_2          -unused-
  1149   //  kExitNone_2                 -unused-
  1150   *bytes_consumed = src - isrc;
  1151   *bytes_filled = dst - odst;
  1152   *chars_changed = total_changed;
  1153   return e;
  1157 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
  1158 //   and doing text replacements.
  1159 // Also writes an optional OffsetMap. Pass NULL to skip writing one.
  1160 // Always scan complete UTF-8 characters
  1161 // Set number of bytes consumed from input, number filled to output.
  1162 // Return reason for exiting
  1163 int UTF8GenericReplace(const UTF8ReplaceObj* st,
  1164                     const StringPiece& istr,
  1165                     StringPiece& ostr,
  1166                     bool is_plain_text,
  1167                     int* bytes_consumed,
  1168                     int* bytes_filled,
  1169                     int* chars_changed,
  1170                     OffsetMap* offsetmap) {
  1171   StringPiece local_istr(istr.data(), istr.length());
  1172   StringPiece local_ostr(ostr.data(), ostr.length());
  1173   int total_consumed = 0;
  1174   int total_filled = 0;
  1175   int total_changed = 0;
  1176   int local_bytes_consumed, local_bytes_filled, local_chars_changed;
  1177   int e;
  1178   do {
  1179     e = UTF8GenericReplaceInternal(st,
  1180                     local_istr, local_ostr, is_plain_text,
  1181                     &local_bytes_consumed, &local_bytes_filled,
  1182                     &local_chars_changed,
  1183                     offsetmap);
  1184     local_istr.remove_prefix(local_bytes_consumed);
  1185     local_ostr.remove_prefix(local_bytes_filled);
  1186     total_consumed += local_bytes_consumed;
  1187     total_filled += local_bytes_filled;
  1188     total_changed += local_chars_changed;
  1189   } while ( e == kExitDoAgain );
  1190   *bytes_consumed = total_consumed;
  1191   *bytes_filled = total_filled;
  1192   *chars_changed = total_changed;
  1193   return e;
  1196 // Older version without offsetmap
  1197 int UTF8GenericReplace(const UTF8ReplaceObj* st,
  1198                     const StringPiece& istr,
  1199                     StringPiece& ostr,
  1200                     bool is_plain_text,
  1201                     int* bytes_consumed,
  1202                     int* bytes_filled,
  1203                     int* chars_changed) {
  1204   return UTF8GenericReplace(st,
  1205                     istr,
  1206                     ostr,
  1207                     is_plain_text,
  1208                     bytes_consumed,
  1209                     bytes_filled,
  1210                     chars_changed,
  1211                     NULL);
  1214 // Older version without is_plain_text or offsetmap
  1215 int UTF8GenericReplace(const UTF8ReplaceObj* st,
  1216                     const StringPiece& istr,
  1217                     StringPiece& ostr,
  1218                     int* bytes_consumed,
  1219                     int* bytes_filled,
  1220                     int* chars_changed) {
  1221   bool is_plain_text = false;
  1222   return UTF8GenericReplace(st,
  1223                     istr,
  1224                     ostr,
  1225                     is_plain_text,
  1226                     bytes_consumed,
  1227                     bytes_filled,
  1228                     chars_changed,
  1229                     NULL);
  1232 // Scan a UTF-8 stringpiece based on state table with two-byte entries,
  1233 //   copying to output stringpiece
  1234 //   and doing text replacements.
  1235 // Also writes an optional OffsetMap. Pass NULL to skip writing one.
  1236 // Always scan complete UTF-8 characters
  1237 // Set number of bytes consumed from input, number filled to output.
  1238 // Return reason for exiting
  1239 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
  1240                     const StringPiece& istr,
  1241                     StringPiece& ostr,
  1242                     bool is_plain_text,
  1243                     int* bytes_consumed,
  1244                     int* bytes_filled,
  1245                     int* chars_changed,
  1246                     OffsetMap* offsetmap) {
  1247   StringPiece local_istr(istr.data(), istr.length());
  1248   StringPiece local_ostr(ostr.data(), ostr.length());
  1249   int total_consumed = 0;
  1250   int total_filled = 0;
  1251   int total_changed = 0;
  1252   int local_bytes_consumed, local_bytes_filled, local_chars_changed;
  1253   int e;
  1254   do {
  1255     e = UTF8GenericReplaceInternalTwoByte(st,
  1256                     local_istr, local_ostr, is_plain_text,
  1257                     &local_bytes_consumed,
  1258                     &local_bytes_filled,
  1259                     &local_chars_changed,
  1260                     offsetmap);
  1261     local_istr.remove_prefix(local_bytes_consumed);
  1262     local_ostr.remove_prefix(local_bytes_filled);
  1263     total_consumed += local_bytes_consumed;
  1264     total_filled += local_bytes_filled;
  1265     total_changed += local_chars_changed;
  1266   } while ( e == kExitDoAgain_2 );
  1267   *bytes_consumed = total_consumed;
  1268   *bytes_filled = total_filled;
  1269   *chars_changed = total_changed;
  1271   return e - kExitOK_2 + kExitOK;
  1274 // Older version without offsetmap
  1275 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
  1276                     const StringPiece& istr,
  1277                     StringPiece& ostr,
  1278                     bool is_plain_text,
  1279                     int* bytes_consumed,
  1280                     int* bytes_filled,
  1281                     int* chars_changed) {
  1282   return UTF8GenericReplaceTwoByte(st,
  1283                     istr,
  1284                     ostr,
  1285                     is_plain_text,
  1286                     bytes_consumed,
  1287                     bytes_filled,
  1288                     chars_changed,
  1289                     NULL);
  1292 // Older version without is_plain_text or offsetmap
  1293 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
  1294                     const StringPiece& istr,
  1295                     StringPiece& ostr,
  1296                     int* bytes_consumed,
  1297                     int* bytes_filled,
  1298                     int* chars_changed) {
  1299   bool is_plain_text = false;
  1300   return UTF8GenericReplaceTwoByte(st,
  1301                     istr,
  1302                     ostr,
  1303                     is_plain_text,
  1304                     bytes_consumed,
  1305                     bytes_filled,
  1306                     chars_changed,
  1307                     NULL);
  1312 // Adjust a stringpiece to encompass complete UTF-8 characters.
  1313 // The data pointer will be increased by 0..3 bytes to get to a character
  1314 // boundary, and the length will then be decreased by 0..3 bytes
  1315 // to encompass the last complete character.
  1316 void UTF8TrimToChars(StringPiece* istr) {
  1317   const char* src = istr->data();
  1318   int len = istr->length();
  1319   // Exit if empty string
  1320   if (len == 0) {
  1321     return;
  1324   // Exit on simple, common case
  1325   if ( ((src[0] & 0xc0) != 0x80) &&
  1326        (static_cast<signed char>(src[len - 1]) >= 0) ) {
  1327     // First byte is not a continuation and last byte is 7-bit ASCII -- done
  1328     return;
  1331   // Adjust the back end, len > 0
  1332   const char* srclimit = src + len;
  1333   // Backscan over any ending continuation bytes to find last char start
  1334   const char* s = srclimit - 1;         // Last byte of the string
  1335   while ((src <= s) && ((*s & 0xc0) == 0x80)) {
  1336     s--;
  1338   // Include entire last char if it fits
  1339   if (src <= s) {
  1340     int last_char_len = UTF8OneCharLen(s);
  1341     if (s + last_char_len <= srclimit) {
  1342       // Last char fits, so include it, else exclude it
  1343       s += last_char_len;
  1346   if (s != srclimit) {
  1347     // s is one byte beyond the last full character, if any
  1348     istr->remove_suffix(srclimit - s);
  1349     // Exit if now empty string
  1350     if (istr->length() == 0) {
  1351       return;
  1355   // Adjust the front end, len > 0
  1356   len = istr->length();
  1357   srclimit = src + len;
  1358   s = src;                            // First byte of the string
  1359   // Scan over any beginning continuation bytes to find first char start
  1360   while ((s < srclimit) && ((*s & 0xc0) == 0x80)) {
  1361     s++;
  1363   if (s != src) {
  1364     // s is at the first full character, if any
  1365     istr->remove_prefix(s - src);
  1369 }       // End namespace CLD2

mercurial