The Tor Browser: browser/components/translation/cld2/internal/utf8statetable.cc@6474c204b198

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.

     2 //

     3 // Licensed under the Apache License, Version 2.0 (the "License");

     4 // you may not use this file except in compliance with the License.

     5 // You may obtain a copy of the License at

     6 //

     7 //     http://www.apache.org/licenses/LICENSE-2.0

     8 //

     9 // Unless required by applicable law or agreed to in writing, software

    10 // distributed under the License is distributed on an "AS IS" BASIS,

    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

    12 // See the License for the specific language governing permissions and

    13 // limitations under the License.

    15 //

    16 // State Table follower for scanning UTF-8 strings without converting to

    17 // 32- or 16-bit Unicode values.

    18 //

    20 #ifdef COMPILER_MSVC

    21 // MSVC warns: warning C4309: 'initializing' : truncation of constant value

    22 // But the value is in fact not truncated.  0xFF still comes out 0xFF at

    23 // runtime.

    24 #pragma warning ( disable : 4309 )

    25 #endif

    27 #include "utf8statetable.h"

    29 #include <stdint.h>                     // for uintptr_t

    30 #include <string.h>                     // for NULL, memcpy, memmove

    32 #include "integral_types.h"        // for uint8, uint32, int8

    33 #include "stringpiece.h"

    34 #include "offsetmap.h"

    37 namespace CLD2 {

    39 static const int kReplaceAndResumeFlag = 0x80; // Bit in del byte to distinguish

    40                                                // optional next-state field

    41                                                // after replacement text

    42 static const int kHtmlPlaintextFlag = 0x80;    // Bit in add byte to distinguish

    43                                                // HTML replacement vs. plaintext

    46 /**

    47  * This code implements a little interpreter for UTF8 state

    48  * tables. There are three kinds of quite-similar state tables,

    49  * property, scanning, and replacement. Each state in one of

    50  * these tables consists of an array of 256 or 64 one-byte

    51  * entries. The state is subscripted by an incoming source byte,

    52  * and the entry either specifies the next state or specifies an

    53  * action. Space-optimized tables have full 256-entry states for

    54  * the first byte of a UTF-8 character, but only 64-entry states

    55  * for continuation bytes. Space-optimized tables may only be

    56  * used with source input that has been checked to be

    57  * structurally- (or stronger interchange-) valid.

    58  *

    59  * A property state table has an unsigned one-byte property for

    60  * each possible UTF-8 character. One-byte character properties

    61  * are in the state[0] array, while for other lengths the

    62  * state[0] array gives the next state, which contains the

    63  * property value for two-byte characters or yet another state

    64  * for longer ones. The code simply loads the right number of

    65  * next-state values, then returns the final byte as property

    66  * value. There are no actions specified in property tables.

    67  * States are typically shared for multi-byte UTF-8 characters

    68  * that all have the same property value.

    69  *

    70  * A scanning state table has entries that are either a

    71  * next-state specifier for bytes that are accepted by the

    72  * scanner, or an exit action for the last byte of each

    73  * character that is rejected by the scanner.

    74  *

    75  * Scanning long strings involves a tight loop that picks up one

    76  * byte at a time and follows next-state value back to state[0]

    77  * for each accepted UTF-8 character. Scanning stops at the end

    78  * of the string or at the first character encountered that has

    79  * an exit action such as "reject". Timing information is given

    80  * below.

    81  *

    82  * Since so much of Google's text is 7-bit-ASCII values

    83  * (approximately 94% of the bytes of web documents), the

    84  * scanning interpreter has two speed optimizations. One checks

    85  * 8 bytes at a time to see if they are all in the range lo..hi,

    86  * as specified in constants in the overall statetable object.

    87  * The check involves ORing together four 4-byte values that

    88  * overflow into the high bit of some byte when a byte is out of

    89  * range. For seven-bit-ASCII, lo is 0x20 and hi is 0x7E. This

    90  * loop is about 8x faster than the one-byte-at-a-time loop.

    91  *

    92  * If checking for exit bytes in the 0x00-0x1F and 7F range is

    93  * unneeded, an even faster loop just looks at the high bits of

    94  * 8 bytes at once, and is about 1.33x faster than the lo..hi

    95  * loop.

    96  *

    97  * Exit from the scanning routines backs up to the first byte of

    98  * the rejected character, so the text spanned is always a

    99  * complete number of UTF-8 characters. The normal scanning exit

   100  * is at the first rejected character, or at the end of the

   101  * input text. Scanning also exits on any detected ill-formed

   102  * character or at a special do-again action built into some

   103  * exit-optimized tables. The do-again action gets back to the

   104  * top of the scanning loop to retry eight-byte ASCII scans. It

   105  * is typically put into state tables after four seven-bit-ASCII

   106  * characters in a row are seen, to allow restarting the fast

   107  * scan after some slower processing of multi-byte characters.

   108  *

   109  * A replacement state table is similar to a scanning state

   110  * table but has more extensive actions. The default

   111  * byte-at-a-time loop copies one byte from source to

   112  * destination and goes to the next state. The replacement

   113  * actions overwrite 1-3 bytes of the destination with different

   114  * bytes, possibly shortening the output by 1 or 2 bytes. The

   115  * replacement bytes come from within the state table, from

   116  * dummy states inserted just after any state that contains a

   117  * replacement action. This gives a quick address calculation for

   118  * the replacement byte(s) and gives some cache locality.

   119  *

   120  * Additional replacement actions use one or two bytes from

   121  * within dummy states to index a side table of more-extensive

   122  * replacements. The side table specifies a length of 0..15

   123  * destination bytes to overwrite and a length of 0..127 bytes

   124  * to overwrite them with, plus the actual replacement bytes.

   125  *

   126  * This side table uses one extra bit to specify a pair of

   127  * replacements, the first to be used in an HTML context and the

   128  * second to be used in a plaintext context. This allows

   129  * replacements that are spelled with "&lt;" in the former

   130  * context and "<" in the latter.

   131  *

   132  * The side table also uses an extra bit to specify a non-zero

   133  * next state after a replacement. This allows a combination

   134  * replacement and state change, used to implement a limited

   135  * version of the Boyer-Moore algorithm for multi-character

   136  * replacement without backtracking. This is useful when there

   137  * are overlapping replacements, such as ch => x and also c =>

   138  * y, the latter to be used only if the character after c is not

   139  * h. In this case, the state[0] table's entry for c would

   140  * change c to y and also have a next-state of say n, and the

   141  * state[n] entry for h would specify a replacement of the two

   142  * bytes yh by x. No backtracking is needed.

   143  *

   144  * A replacement table may also include the exit actions of a

   145  * scanning state table, so some character sequences can

   146  * terminate early.

   147  *

   148  * During replacement, an optional data structure called an

   149  * offset map can be updated to reflect each change in length

   150  * between source and destination. This offset map can later be

   151  * used to map destination-string offsets to corresponding

   152  * source-string offsets or vice versa.

   153  *

   154  * The routines below also have variants in which state-table

   155  * entries are all two bytes instead of one byte. This allows

   156  * tables with more than 240 total states, but takes up twice as

   157  * much space per state.

   158  *

   159 **/

   161 // Return true if current Tbl pointer is within state0 range

   162 // Note that unsigned compare checks both ends of range simultaneously

   163 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {

   164   const uint8* Tbl0 = &st->state_table[st->state0];

   165   return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);

   166 }

   168 static inline bool InStateZero_2(const UTF8ReplaceObj_2* st,

   169                                  const unsigned short int* Tbl) {

   170   const unsigned short int* Tbl0 =  &st->state_table[st->state0];

   171   // Word difference, not byte difference

   172   return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);

   173 }

   175 // UTF8PropObj, UTF8ScanObj, UTF8ReplaceObj are all typedefs of

   176 // UTF8MachineObj.

   178 static bool IsPropObj(const UTF8StateMachineObj& obj) {

   179   return obj.fast_state == NULL

   180       && obj.max_expand == 0;

   181 }

   183 static bool IsPropObj_2(const UTF8StateMachineObj_2& obj) {

   184   return obj.fast_state == NULL

   185       && obj.max_expand == 0;

   186 }

   188 static bool IsScanObj(const UTF8StateMachineObj& obj) {

   189   return obj.fast_state != NULL

   190       && obj.max_expand == 0;

   191 }

   193 static bool IsReplaceObj(const UTF8StateMachineObj& obj) {

   194   // Normally, obj.fast_state != NULL, but the handwritten tables

   195   // in utf8statetable_unittest don't handle fast_states.

   196   return obj.max_expand > 0;

   197 }

   199 static bool IsReplaceObj_2(const UTF8StateMachineObj_2& obj) {

   200   return obj.max_expand > 0;

   201 }

   203 // Look up property of one UTF-8 character and advance over it

   204 // Return 0 if input length is zero

   205 // Return 0 and advance one byte if input is ill-formed

   206 uint8 UTF8GenericProperty(const UTF8PropObj* st,

   207                           const uint8** src,

   208                           int* srclen) {

   209   if (*srclen <= 0) {

   210     return 0;

   211   }

   213   const uint8* lsrc = *src;

   214   const uint8* Tbl_0 = &st->state_table[st->state0];

   215   const uint8* Tbl = Tbl_0;

   216   int e;

   217   int eshift = st->entry_shift;

   219   // Short series of tests faster than switch, optimizes 7-bit ASCII

   220   unsigned char c = lsrc[0];

   221   if (static_cast<signed char>(c) >= 0) {           // one byte

   222     e = Tbl[c];

   223     *src += 1;

   224     *srclen -= 1;

   225   } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes

   226     e = Tbl[c];

   227     Tbl = &Tbl_0[e << eshift];

   228     e = Tbl[lsrc[1]];

   229     *src += 2;

   230     *srclen -= 2;

   231   } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes

   232     e = Tbl[c];

   233     Tbl = &Tbl_0[e << eshift];

   234     e = Tbl[lsrc[1]];

   235     Tbl = &Tbl_0[e << eshift];

   236     e = Tbl[lsrc[2]];

   237     *src += 3;

   238     *srclen -= 3;

   239   }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes

   240     e = Tbl[c];

   241     Tbl = &Tbl_0[e << eshift];

   242     e = Tbl[lsrc[1]];

   243     Tbl = &Tbl_0[e << eshift];

   244     e = Tbl[lsrc[2]];

   245     Tbl = &Tbl_0[e << eshift];

   246     e = Tbl[lsrc[3]];

   247     *src += 4;

   248     *srclen -= 4;

   249   } else {                                                // Ill-formed

   250     e = 0;

   251     *src += 1;

   252     *srclen -= 1;

   253   }

   254   return e;

   255 }

   257 bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src) {

   258   const uint8* lsrc = reinterpret_cast<const uint8*>(src);

   259   const uint8* Tbl_0 = &st.state_table[st.state0];

   260   const uint8* Tbl = Tbl_0;

   261   int e;

   262   int eshift = st.entry_shift;

   264   // Short series of tests faster than switch, optimizes 7-bit ASCII

   265   unsigned char c = lsrc[0];

   266   if (static_cast<signed char>(c) >= 0) {           // one byte

   267     e = Tbl[c];

   268   } else if ((c & 0xe0) == 0xc0) {     // two bytes

   269     e = Tbl[c];

   270     Tbl = &Tbl_0[e << eshift];

   271     e = Tbl[lsrc[1]];

   272   } else if ((c & 0xf0) == 0xe0) {     // three bytes

   273     e = Tbl[c];

   274     Tbl = &Tbl_0[e << eshift];

   275     e = Tbl[lsrc[1]];

   276     Tbl = &Tbl_0[e << eshift];

   277     e = Tbl[lsrc[2]];

   278   } else {                             // four bytes

   279     e = Tbl[c];

   280     Tbl = &Tbl_0[e << eshift];

   281     e = Tbl[lsrc[1]];

   282     Tbl = &Tbl_0[e << eshift];

   283     e = Tbl[lsrc[2]];

   284     Tbl = &Tbl_0[e << eshift];

   285     e = Tbl[lsrc[3]];

   286   }

   287   return e;

   288 }

   291 // BigOneByte versions are needed for tables > 240 states, but most

   292 // won't need the TwoByte versions.

   293 // Internally, to next-to-last offset is multiplied by 16 and the last

   294 // offset is relative instead of absolute.

   295 // Look up property of one UTF-8 character and advance over it

   296 // Return 0 if input length is zero

   297 // Return 0 and advance one byte if input is ill-formed

   298 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,

   299                           const uint8** src,

   300                           int* srclen) {

   301   if (*srclen <= 0) {

   302     return 0;

   303   }

   305   const uint8* lsrc = *src;

   306   const uint8* Tbl_0 = &st->state_table[st->state0];

   307   const uint8* Tbl = Tbl_0;

   308   int e;

   309   int eshift = st->entry_shift;

   311   // Short series of tests faster than switch, optimizes 7-bit ASCII

   312   unsigned char c = lsrc[0];

   313   if (static_cast<signed char>(c) >= 0) {           // one byte

   314     e = Tbl[c];

   315     *src += 1;

   316     *srclen -= 1;

   317   } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes

   318     e = Tbl[c];

   319     Tbl = &Tbl_0[e << eshift];

   320     e = Tbl[lsrc[1]];

   321     *src += 2;

   322     *srclen -= 2;

   323   } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes

   324     e = Tbl[c];

   325     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range

   326     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];

   327     Tbl = &Tbl[e << eshift];          // Relative +/-

   328     e = Tbl[lsrc[2]];

   329     *src += 3;

   330     *srclen -= 3;

   331   }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes

   332     e = Tbl[c];

   333     Tbl = &Tbl_0[e << eshift];

   334     e = Tbl[lsrc[1]];

   335     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range

   336     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];

   337     Tbl = &Tbl[e << eshift];          // Relative +/-

   338     e = Tbl[lsrc[3]];

   339     *src += 4;

   340     *srclen -= 4;

   341   } else {                                                // Ill-formed

   342     e = 0;

   343     *src += 1;

   344     *srclen -= 1;

   345   }

   346   return e;

   347 }

   349 // BigOneByte versions are needed for tables > 240 states, but most

   350 // won't need the TwoByte versions.

   351 bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src) {

   352   const uint8* lsrc = reinterpret_cast<const uint8*>(src);

   353   const uint8* Tbl_0 = &st.state_table[st.state0];

   354   const uint8* Tbl = Tbl_0;

   355   int e;

   356   int eshift = st.entry_shift;

   358   // Short series of tests faster than switch, optimizes 7-bit ASCII

   359   unsigned char c = lsrc[0];

   360   if (static_cast<signed char>(c) >= 0) {           // one byte

   361     e = Tbl[c];

   362   } else if ((c & 0xe0) == 0xc0) {    // two bytes

   363     e = Tbl[c];

   364     Tbl = &Tbl_0[e << eshift];

   365     e = Tbl[lsrc[1]];

   366   } else if ((c & 0xf0) == 0xe0) {    // three bytes

   367     e = Tbl[c];

   368     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range

   369     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];

   370     Tbl = &Tbl[e << eshift];          // Relative +/-

   371     e = Tbl[lsrc[2]];

   372   } else {                            // four bytes

   373     e = Tbl[c];

   374     Tbl = &Tbl_0[e << eshift];

   375     e = Tbl[lsrc[1]];

   376     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range

   377     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];

   378     Tbl = &Tbl[e << eshift];          // Relative +/-

   379     e = Tbl[lsrc[3]];

   380   }

   381   return e;

   382 }

   385 // TwoByte versions are needed for tables > 240 states

   386 // Look up property of one UTF-8 character and advance over it

   387 // Return 0 if input length is zero

   388 // Return 0 and advance one byte if input is ill-formed

   389 uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,

   390                           const uint8** src,

   391                           int* srclen) {

   392   if (*srclen <= 0) {

   393     return 0;

   394   }

   396   const uint8* lsrc = *src;

   397   const unsigned short* Tbl_0 = &st->state_table[st->state0];

   398   const unsigned short* Tbl = Tbl_0;

   399   int e;

   400   int eshift = st->entry_shift;

   402   // Short series of tests faster than switch, optimizes 7-bit ASCII

   403   unsigned char c = lsrc[0];

   404   if (static_cast<signed char>(c) >= 0) {           // one byte

   405     e = Tbl[c];

   406     *src += 1;

   407     *srclen -= 1;

   408   } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes

   409     e = Tbl[c];

   410     Tbl = &Tbl_0[e << eshift];

   411     e = Tbl[lsrc[1]];

   412     *src += 2;

   413     *srclen -= 2;

   414   } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes

   415     e = Tbl[c];

   416     Tbl = &Tbl_0[e << eshift];

   417     e = Tbl[lsrc[1]];

   418     Tbl = &Tbl_0[e << eshift];

   419     e = Tbl[lsrc[2]];

   420     *src += 3;

   421     *srclen -= 3;

   422   }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes

   423     e = Tbl[c];

   424     Tbl = &Tbl_0[e << eshift];

   425     e = Tbl[lsrc[1]];

   426     Tbl = &Tbl_0[e << eshift];

   427     e = Tbl[lsrc[2]];

   428     Tbl = &Tbl_0[e << eshift];

   429     e = Tbl[lsrc[3]];

   430     *src += 4;

   431     *srclen -= 4;

   432   } else {                                                // Ill-formed

   433     e = 0;

   434     *src += 1;

   435     *srclen -= 1;

   436   }

   437   return e;

   438 }

   440 // TwoByte versions are needed for tables > 240 states

   441 bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src) {

   442   const uint8* lsrc = reinterpret_cast<const uint8*>(src);

   443   const unsigned short* Tbl_0 = &st.state_table[st.state0];

   444   const unsigned short* Tbl = Tbl_0;

   445   int e;

   446   int eshift = st.entry_shift;

   448   // Short series of tests faster than switch, optimizes 7-bit ASCII

   449   unsigned char c = lsrc[0];

   450   if (static_cast<signed char>(c) >= 0) {           // one byte

   451     e = Tbl[c];

   452   } else if ((c & 0xe0) == 0xc0) {     // two bytes

   453     e = Tbl[c];

   454     Tbl = &Tbl_0[e << eshift];

   455     e = Tbl[lsrc[1]];

   456   } else if ((c & 0xf0) == 0xe0) {     // three bytes

   457     e = Tbl[c];

   458     Tbl = &Tbl_0[e << eshift];

   459     e = Tbl[lsrc[1]];

   460     Tbl = &Tbl_0[e << eshift];

   461     e = Tbl[lsrc[2]];

   462   } else {                             // four bytes

   463     e = Tbl[c];

   464     Tbl = &Tbl_0[e << eshift];

   465     e = Tbl[lsrc[1]];

   466     Tbl = &Tbl_0[e << eshift];

   467     e = Tbl[lsrc[2]];

   468     Tbl = &Tbl_0[e << eshift];

   469     e = Tbl[lsrc[3]];

   470   }

   471   return e;

   472 }

   475 // Approximate speeds on 2.8 GHz Pentium 4:

   476 //   GenericScan 1-byte loop           300 MB/sec *

   477 //   GenericScan 4-byte loop          1200 MB/sec

   478 //   GenericScan 8-byte loop          2400 MB/sec *

   479 //   GenericScanFastAscii 4-byte loop 3000 MB/sec

   480 //   GenericScanFastAscii 8-byte loop 3200 MB/sec *

   481 //

   482 // * Implemented below. FastAscii loop is memory-bandwidth constrained.

   484 // Scan a UTF-8 stringpiece based on state table.

   485 // Always scan complete UTF-8 characters

   486 // Set number of bytes scanned. Return reason for exiting

   487 int UTF8GenericScan(const UTF8ScanObj* st,

   488                     const StringPiece& str,

   489                     int* bytes_consumed) {

   490   int eshift = st->entry_shift;       // 6 (space optimized) or 8

   491   // int nEntries = (1 << eshift);       // 64 or 256 entries per state

   493   const uint8* isrc =

   494     reinterpret_cast<const uint8*>(str.data());

   495   const uint8* src = isrc;

   496   const int len = str.length();

   497   const uint8* srclimit = isrc + len;

   498   const uint8* srclimit8 = srclimit - 7;

   499   *bytes_consumed = 0;

   500   if (len == 0) return kExitOK;

   502   const uint8* Tbl_0 = &st->state_table[st->state0];

   504 DoAgain:

   505   // Do state-table scan

   506   int e = 0;

   507   uint8 c;

   509   // Do fast for groups of 8 identity bytes.

   510   // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,

   511   // including slowing slightly on cr/lf/ht

   512   //----------------------------

   513   const uint8* Tbl2 = &st->fast_state[0];

   514   uint32 losub = st->losub;

   515   uint32 hiadd = st->hiadd;

   516   while (src < srclimit8) {

   517     uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];

   518     uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];

   519     src += 8;

   520     // This is a fast range check for all bytes in [lowsub..0x80-hiadd)

   521     uint32 temp = (s0123 - losub) | (s0123 + hiadd) |

   522                   (s4567 - losub) | (s4567 + hiadd);

   523     if ((temp & 0x80808080) != 0) {

   524       // We typically end up here on cr/lf/ht; src was incremented

   525       int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |

   526                   (Tbl2[src[-6]] | Tbl2[src[-5]]);

   527       if (e0123 != 0) {src -= 8; break;}    // Exit on Non-interchange

   528       e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |

   529               (Tbl2[src[-2]] | Tbl2[src[-1]]);

   530       if (e0123 != 0) {src -= 4; break;}    // Exit on Non-interchange

   531       // Else OK, go around again

   532     }

   533   }

   534   //----------------------------

   536   // Byte-at-a-time scan

   537   //----------------------------

   538   const uint8* Tbl = Tbl_0;

   539   while (src < srclimit) {

   540     c = *src;

   541     e = Tbl[c];

   542     src++;

   543     if (e >= kExitIllegalStructure) {break;}

   544     Tbl = &Tbl_0[e << eshift];

   545   }

   546   //----------------------------

   549   // Exit possibilities:

   550   //  Some exit code, !state0, back up over last char

   551   //  Some exit code, state0, back up one byte exactly

   552   //  source consumed, !state0, back up over partial char

   553   //  source consumed, state0, exit OK

   554   // For illegal byte in state0, avoid backup up over PREVIOUS char

   555   // For truncated last char, back up to beginning of it

   557   if (e >= kExitIllegalStructure) {

   558     // Back up over exactly one byte of rejected/illegal UTF-8 character

   559     src--;

   560     // Back up more if needed

   561     if (!InStateZero(st, Tbl)) {

   562       do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));

   563     }

   564   } else if (!InStateZero(st, Tbl)) {

   565     // Back up over truncated UTF-8 character

   566     e = kExitIllegalStructure;

   567     do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));

   568   } else {

   569     // Normal termination, source fully consumed

   570     e = kExitOK;

   571   }

   573   if (e == kExitDoAgain) {

   574     // Loop back up to the fast scan

   575     goto DoAgain;

   576   }

   578   *bytes_consumed = src - isrc;

   579   return e;

   580 }

   582 // Scan a UTF-8 stringpiece based on state table.

   583 // Always scan complete UTF-8 characters

   584 // Set number of bytes scanned. Return reason for exiting

   585 // OPTIMIZED for case of 7-bit ASCII 0000..007f all valid

   586 int UTF8GenericScanFastAscii(const UTF8ScanObj* st,

   587                     const StringPiece& str,

   588                     int* bytes_consumed) {

   589   const uint8* isrc =

   590     reinterpret_cast<const uint8*>(str.data());

   591   const uint8* src = isrc;

   592   const int len = str.length();

   593   const uint8* srclimit = isrc + len;

   594   const uint8* srclimit8 = srclimit - 7;

   595   *bytes_consumed = 0;

   596   if (len == 0) return kExitOK;

   598   int n;

   599   int rest_consumed;

   600   int exit_reason;

   601   do {

   602     // Skip 8 bytes of ASCII at a whack; no endianness issue

   603     while ((src < srclimit8) &&

   604            (((reinterpret_cast<const uint32*>(src)[0] |

   605               reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {

   606       src += 8;

   607     }

   608     // Run state table on the rest

   609     n = src - isrc;

   610     StringPiece str2(str.data() + n, str.length() - n);

   611     exit_reason = UTF8GenericScan(st, str2, &rest_consumed);

   612     src += rest_consumed;

   613   } while ( exit_reason == kExitDoAgain );

   615   *bytes_consumed = src - isrc;

   616   return exit_reason;

   617 }

   619 // Hack to change halfwidth katakana to match an old UTF8CharToLower()

   621 // Return number of src bytes skipped

   622 static int DoSpecialFixup(const unsigned char c,

   623                     const unsigned char** srcp, const unsigned char* srclimit,

   624                     unsigned char** dstp, unsigned char* dstlimit) {

   625   return 0;

   626 }

   629 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece

   630 //   and doing text replacements.

   631 // DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below

   632 //   Needs caller to loop on kExitDoAgain

   633 static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st,

   634                     const StringPiece& istr,

   635                     StringPiece& ostr,

   636                     bool is_plain_text,

   637                     int* bytes_consumed,

   638                     int* bytes_filled,

   639                     int* chars_changed,

   640                     OffsetMap* offsetmap) {

   641   int eshift = st->entry_shift;

   642   int nEntries = (1 << eshift);       // 64 or 256 entries per state

   643   const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());

   644   const int ilen = istr.length();

   645   const uint8* copystart = isrc;

   646   const uint8* src = isrc;

   647   const uint8* srclimit = src + ilen;

   648   *bytes_consumed = 0;

   649   *bytes_filled = 0;

   650   *chars_changed = 0;

   652   const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());

   653   const int olen = ostr.length();

   654   uint8* dst = const_cast<uint8*>(odst);

   655   uint8* dstlimit = dst + olen;

   657   int total_changed = 0;

   659   // Invariant condition during replacements:

   660   //  remaining dst size >= remaining src size

   661   if ((dstlimit - dst) < (srclimit - src)) {

   662     if (offsetmap != NULL) {

   663       offsetmap->Copy(src - copystart);

   664       copystart = src;

   665     }

   666     return kExitDstSpaceFull;

   667   }

   668   const uint8* Tbl_0 = &st->state_table[st->state0];

   670  Do_state_table:

   671   // Do state-table scan, copying as we go

   672   const uint8* Tbl = Tbl_0;

   673   int e = 0;

   674   uint8 c = 0;

   676  Do_state_table_newe:

   678   //----------------------------

   679   while (src < srclimit) {

   680     c = *src;

   681     e = Tbl[c];

   682     *dst = c;

   683     src++;

   684     dst++;

   685     if (e >= kExitIllegalStructure) {break;}

   686     Tbl = &Tbl_0[e << eshift];

   687   }

   688   //----------------------------

   690   // Exit possibilities:

   691   //  Replacement code, do the replacement and loop

   692   //  Some other exit code, state0, back up one byte exactly

   693   //  Some other exit code, !state0, back up over last char

   694   //  source consumed, state0, exit OK

   695   //  source consumed, !state0, back up over partial char

   696   // For illegal byte in state0, avoid backing up over PREVIOUS char

   697   // For truncated last char, back up to beginning of it

   699   if (e >= kExitIllegalStructure) {

   700     // Switch on exit code; most loop back to top

   701     int offset = 0;

   702     switch (e) {

   703     // These all make the output string the same size or shorter

   704     // No checking needed

   705     case kExitReplace31:    // del 2, add 1 bytes to change

   706       dst -= 2;

   707       if (offsetmap != NULL) {

   708         offsetmap->Copy(src - copystart - 2);

   709         offsetmap->Delete(2);

   710         copystart = src;

   711       }

   712       dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];

   713       total_changed++;

   714       goto Do_state_table;

   715     case kExitReplace32:    // del 3, add 2 bytes to change

   716       dst--;

   717       if (offsetmap != NULL) {

   718         offsetmap->Copy(src - copystart - 1);

   719         offsetmap->Delete(1);

   720         copystart = src;

   721       }

   722       dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)];

   723       dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];

   724       total_changed++;

   725       goto Do_state_table;

   726     case kExitReplace21:    // del 2, add 1 bytes to change

   727       dst--;

   728       if (offsetmap != NULL) {

   729         offsetmap->Copy(src - copystart - 1);

   730         offsetmap->Delete(1);

   731         copystart = src;

   732       }

   733       dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];

   734       total_changed++;

   735       goto Do_state_table;

   736     case kExitReplace3:    // update 3 bytes to change

   737       dst[-3] = (unsigned char)Tbl[c + (nEntries * 3)];

   738       // Fall into next case

   739     case kExitReplace2:    // update 2 bytes to change

   740       dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)];

   741       // Fall into next case

   742     case kExitReplace1:    // update 1 byte to change

   743       dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)];

   744       total_changed++;

   745       goto Do_state_table;

   746     case kExitReplace1S0:     // update 1 byte to change, 256-entry state

   747       dst[-1] = (unsigned char)Tbl[c + (256 * 1)];

   748       total_changed++;

   749       goto Do_state_table;

   750     // These can make the output string longer than the input

   751     case kExitReplaceOffset2:

   752       if ((nEntries != 256) && InStateZero(st, Tbl)) {

   753         // For space-optimized table, we need multiples of 256 bytes

   754         // in state0 and multiples of nEntries in other states

   755         offset += ((unsigned char)Tbl[c + (256 * 2)] << 8);

   756       } else {

   757         offset += ((unsigned char)Tbl[c + (nEntries * 2)] << 8);

   758       }

   759       // Fall into next case

   760     case kExitSpecial:      // Apply special fixups [read: hacks]

   761     case kExitReplaceOffset1:

   762       if ((nEntries != 256) && InStateZero(st, Tbl)) {

   763         // For space-optimized table, we need multiples of 256 bytes

   764         // in state0 and multiples of nEntries in other states

   765         offset += (unsigned char)Tbl[c + (256 * 1)];

   766       } else {

   767         offset += (unsigned char)Tbl[c + (nEntries * 1)];

   768       }

   769       {

   770         const RemapEntry* re = &st->remap_base[offset];

   771         int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;

   772         int add_len = re->add_bytes & ~kHtmlPlaintextFlag;

   774         // Special-case non-HTML replacement of five sensitive entities

   775         //   &quot; &amp; &apos; &lt; &gt;

   776         //   0022   0026  0027   003c 003e

   777         // A replacement creating one of these is expressed as a pair of

   778         // entries, one for HTML output and one for plaintext output.

   779         // The first of the pair has the high bit of add_bytes set.

   780         if (re->add_bytes & kHtmlPlaintextFlag) {

   781           // Use this entry for plain text

   782           if (!is_plain_text) {

   783             // Use very next entry for HTML text (same back/delete length)

   784             re = &st->remap_base[offset + 1];

   785             add_len = re->add_bytes & ~kHtmlPlaintextFlag;

   786           }

   787         }

   789         int string_offset = re->bytes_offset;

   790         // After the replacement, need (dstlimit - newdst) >= (srclimit - src)

   791         uint8* newdst = dst - del_len + add_len;

   792         if ((dstlimit - newdst) < (srclimit - src)) {

   793           // Won't fit; don't do the replacement. Caller may realloc and retry

   794           e = kExitDstSpaceFull;

   795           break;    // exit, backing up over this char for later retry

   796         }

   797         dst -= del_len;

   798         memcpy(dst, &st->remap_string[string_offset], add_len);

   799         dst += add_len;

   800         total_changed++;

   801         if (offsetmap != NULL) {

   802           if (add_len > del_len) {

   803             offsetmap->Copy(src - copystart);

   804             offsetmap->Insert(add_len - del_len);

   805             copystart = src;

   806           } else if (add_len < del_len) {

   807             offsetmap->Copy(src - copystart + add_len - del_len);

   808             offsetmap->Delete(del_len - add_len);

   809             copystart = src;

   810           }

   811         }

   812         if (re->delete_bytes & kReplaceAndResumeFlag) {

   813           // There is a non-zero  target state at the end of the

   814           // replacement string

   815           e = st->remap_string[string_offset + add_len];

   816           Tbl = &Tbl_0[e << eshift];

   817           goto Do_state_table_newe;

   818         }

   819       }

   820       if (e == kExitRejectAlt) {break;}

   821       if (e != kExitSpecial) {goto Do_state_table;}

   823     // case kExitSpecial:      // Apply special fixups [read: hacks]

   824       // In this routine, do either UTF8CharToLower()

   825       //   fullwidth/halfwidth mapping or

   826       //   voiced mapping or

   827       //   semi-voiced mapping

   829       // First, do EXIT_REPLACE_OFFSET1 action (above)

   830       // Second: do additional code fixup

   831       {

   832         int srcdel = DoSpecialFixup(c, &src, srclimit, &dst, dstlimit);

   833         if (offsetmap != NULL) {

   834           if (srcdel != 0) {

   835             offsetmap->Copy(src - copystart - srcdel);

   836             offsetmap->Delete(srcdel);

   837             copystart = src;

   838           }

   839         }

   840       }

   841       goto Do_state_table;

   843     case kExitIllegalStructure:   // structurally illegal byte; quit

   844     case kExitReject:             // NUL or illegal code encountered; quit

   845     case kExitRejectAlt:          // Apply replacement, then exit

   846     default:                      // and all other exits

   847       break;

   848     }   // End switch (e)

   850     // Exit possibilities:

   851     //  Some other exit code, state0, back up one byte exactly

   852     //  Some other exit code, !state0, back up over last char

   854     // Back up over exactly one byte of rejected/illegal UTF-8 character

   855     src--;

   856     dst--;

   857     // Back up more if needed

   858     if (!InStateZero(st, Tbl)) {

   859       do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));

   860     }

   861   } else if (!InStateZero(st, Tbl)) {

   862     // src >= srclimit, !state0

   863     // Back up over truncated UTF-8 character

   864     e = kExitIllegalStructure;

   865     do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));

   866   } else {

   867     // src >= srclimit, state0

   868     // Normal termination, source fully consumed

   869     e = kExitOK;

   870   }

   872   if (offsetmap != NULL) {

   873     if (src > copystart) {

   874       offsetmap->Copy(src - copystart);

   875       copystart = src;

   876     }

   877   }

   879   // Possible return values here:

   880   //  kExitDstSpaceFull         caller may realloc and retry from middle

   881   //  kExitIllegalStructure     caller may overwrite/truncate

   882   //  kExitOK                   all done and happy

   883   //  kExitReject               caller may overwrite/truncate

   884   //  kExitDoAgain              LOOP NOT DONE; caller must retry from middle

   885   //                            (may do fast ASCII loop first)

   886   //  kExitPlaceholder          -unused-

   887   //  kExitNone                 -unused-

   888   *bytes_consumed = src - isrc;

   889   *bytes_filled = dst - odst;

   890   *chars_changed = total_changed;

   891   return e;

   892 }

   894 // TwoByte versions are needed for tables > 240 states, such

   895 // as the table for full Unicode 4.1 canonical + compatibility mapping

   897 // Scan a UTF-8 stringpiece based on state table with two-byte entries,

   898 //   copying to output stringpiece

   899 //   and doing text replacements.

   900 // DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below

   901 //   Needs caller to loop on kExitDoAgain

   902 static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st,

   903                     const StringPiece& istr,

   904                     StringPiece& ostr,

   905                     bool is_plain_text,

   906                     int* bytes_consumed,

   907                     int* bytes_filled,

   908                     int* chars_changed,

   909                     OffsetMap* offsetmap) {

   910   int eshift = st->entry_shift;

   911   int nEntries = (1 << eshift);       // 64 or 256 entries per state

   912   const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());

   913   const int ilen = istr.length();

   914   const uint8* copystart = isrc;

   915   const uint8* src = isrc;

   916   const uint8* srclimit = src + ilen;

   917   *bytes_consumed = 0;

   918   *bytes_filled = 0;

   919   *chars_changed = 0;

   921   const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());

   922   const int olen = ostr.length();

   923   uint8* dst = const_cast<uint8*>(odst);

   924   uint8* dstlimit = dst + olen;

   926   *chars_changed = 0;

   928   int total_changed = 0;

   930   int src_lll = srclimit - src;

   931   int dst_lll = dstlimit - dst;

   934   // Invariant condition during replacements:

   935   //  remaining dst size >= remaining src size

   936   if ((dstlimit - dst) < (srclimit - src)) {

   937     if (offsetmap != NULL) {

   938       offsetmap->Copy(src - copystart);

   939       copystart = src;

   940     }

   941     return kExitDstSpaceFull_2;

   942   }

   943   const unsigned short* Tbl_0 = &st->state_table[st->state0];

   945  Do_state_table_2:

   946   // Do state-table scan, copying as we go

   947   const unsigned short* Tbl = Tbl_0;

   948   int e = 0;

   949   uint8 c = 0;

   951  Do_state_table_newe_2:

   953   //----------------------------

   954   while (src < srclimit) {

   955     c = *src;

   956     e = Tbl[c];

   957     *dst = c;

   958     src++;

   959     dst++;

   960     if (e >= kExitIllegalStructure_2) {break;}

   961     Tbl = &Tbl_0[e << eshift];

   962   }

   963   //----------------------------

   964   src_lll = src - isrc;

   965   dst_lll = dst - odst;

   967   // Exit possibilities:

   968   //  Replacement code, do the replacement and loop

   969   //  Some other exit code, state0, back up one byte exactly

   970   //  Some other exit code, !state0, back up over last char

   971   //  source consumed, state0, exit OK

   972   //  source consumed, !state0, back up over partial char

   973   // For illegal byte in state0, avoid backup up over PREVIOUS char

   974   // For truncated last char, back up to beginning of it

   976   if (e >= kExitIllegalStructure_2) {

   977     // Switch on exit code; most loop back to top

   978     int offset = 0;

   979     switch (e) {

   980     // These all make the output string the same size or shorter

   981     // No checking needed

   982     case kExitReplace31_2:    // del 2, add 1 bytes to change

   983       dst -= 2;

   984       if (offsetmap != NULL) {

   985         offsetmap->Copy(src - copystart - 2);

   986         offsetmap->Delete(2);

   987         copystart = src;

   988       }

   989       dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);

   990       total_changed++;

   991       goto Do_state_table_2;

   992     case kExitReplace32_2:    // del 3, add 2 bytes to change

   993       dst--;

   994       if (offsetmap != NULL) {

   995         offsetmap->Copy(src - copystart - 1);

   996         offsetmap->Delete(1);

   997         copystart = src;

   998       }

   999       dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);

  1000       dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);

  1001       total_changed++;

  1002       goto Do_state_table_2;

  1003     case kExitReplace21_2:    // del 2, add 1 bytes to change

  1004       dst--;

  1005       if (offsetmap != NULL) {

  1006         offsetmap->Copy(src - copystart - 1);

  1007         offsetmap->Delete(1);

  1008         copystart = src;

  1009       }

  1010       dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);

  1011       total_changed++;

  1012       goto Do_state_table_2;

  1013     case kExitReplace3_2:    // update 3 bytes to change

  1014       dst[-3] = (unsigned char)(Tbl[c + (nEntries * 2)] & 0xff);

  1015       // Fall into next case

  1016     case kExitReplace2_2:    // update 2 bytes to change

  1017       dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);

  1018       // Fall into next case

  1019     case kExitReplace1_2:    // update 1 byte to change

  1020       dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);

  1021       total_changed++;

  1022       goto Do_state_table_2;

  1023     case kExitReplace1S0_2:     // update 1 byte to change, 256-entry state

  1024       dst[-1] = (unsigned char)(Tbl[c + (256 * 1)] & 0xff);

  1025       total_changed++;

  1026       goto Do_state_table_2;

  1027     // These can make the output string longer than the input

  1028     case kExitReplaceOffset2_2:

  1029       if ((nEntries != 256) && InStateZero_2(st, Tbl)) {

  1030         // For space-optimized table, we need multiples of 256 bytes

  1031         // in state0 and multiples of nEntries in other states

  1032         offset += ((unsigned char)(Tbl[c + (256 * 1)] >> 8 & 0xff) << 8);

  1033       } else {

  1034         offset += ((unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff) << 8);

  1035       }

  1036       // Fall into next case

  1037     case kExitReplaceOffset1_2:

  1038       if ((nEntries != 256) && InStateZero_2(st, Tbl)) {

  1039         // For space-optimized table, we need multiples of 256 bytes

  1040         // in state0 and multiples of nEntries in other states

  1041         offset += (unsigned char)(Tbl[c + (256 * 1)] & 0xff);

  1042       } else {

  1043         offset += (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);

  1044       }

  1045       {

  1046         const RemapEntry* re = &st->remap_base[offset];

  1047         int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;

  1048         int add_len = re->add_bytes & ~kHtmlPlaintextFlag;

  1049         // Special-case non-HTML replacement of five sensitive entities

  1050         //   &quot; &amp; &apos; &lt; &gt;

  1051         //   0022   0026  0027   003c 003e

  1052         // A replacement creating one of these is expressed as a pair of

  1053         // entries, one for HTML output and one for plaintext output.

  1054         // The first of the pair has the high bit of add_bytes set.

  1055         if (re->add_bytes & kHtmlPlaintextFlag) {

  1056           // Use this entry for plain text

  1057           if (!is_plain_text) {

  1058             // Use very next entry for HTML text (same back/delete length)

  1059             re = &st->remap_base[offset + 1];

  1060             add_len = re->add_bytes & ~kHtmlPlaintextFlag;

  1061           }

  1062         }

  1064         // After the replacement, need (dstlimit - dst) >= (srclimit - src)

  1065         int string_offset = re->bytes_offset;

  1066         // After the replacement, need (dstlimit - newdst) >= (srclimit - src)

  1067         uint8* newdst = dst - del_len + add_len;

  1068         if ((dstlimit - newdst) < (srclimit - src)) {

  1069           // Won't fit; don't do the replacement. Caller may realloc and retry

  1070           e = kExitDstSpaceFull_2;

  1071           break;    // exit, backing up over this char for later retry

  1072         }

  1073         dst -= del_len;

  1074         memcpy(dst, &st->remap_string[string_offset], add_len);

  1075         dst += add_len;

  1076         if (offsetmap != NULL) {

  1077           if (add_len > del_len) {

  1078             offsetmap->Copy(src - copystart);

  1079             offsetmap->Insert(add_len - del_len);

  1080             copystart = src;

  1081           } else if (add_len < del_len) {

  1082             offsetmap->Copy(src - copystart + add_len - del_len);

  1083             offsetmap->Delete(del_len - add_len);

  1084             copystart = src;

  1085           }

  1086         }

  1087         if (re->delete_bytes & kReplaceAndResumeFlag) {

  1088           // There is a two-byte non-zero target state at the end of the

  1089           // replacement string

  1090           uint8 c1 = st->remap_string[string_offset + add_len];

  1091           uint8 c2 = st->remap_string[string_offset + add_len + 1];

  1092           e = (c1 << 8) | c2;

  1093           Tbl = &Tbl_0[e << eshift];

  1094           total_changed++;

  1095           goto Do_state_table_newe_2;

  1096         }

  1097       }

  1098       total_changed++;

  1099       if (e == kExitRejectAlt_2) {break;}

  1100       goto Do_state_table_2;

  1102     case kExitSpecial_2:           // NO special fixups [read: hacks]

  1103     case kExitIllegalStructure_2:  // structurally illegal byte; quit

  1104     case kExitReject_2:            // NUL or illegal code encountered; quit

  1105                                    // and all other exits

  1106     default:

  1107       break;

  1108     }   // End switch (e)

  1110     // Exit possibilities:

  1111     //  Some other exit code, state0, back up one byte exactly

  1112     //  Some other exit code, !state0, back up over last char

  1114     // Back up over exactly one byte of rejected/illegal UTF-8 character

  1115     src--;

  1116     dst--;

  1117     // Back up more if needed

  1118     if (!InStateZero_2(st, Tbl)) {

  1119       do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));

  1120     }

  1121   } else if (!InStateZero_2(st, Tbl)) {

  1122     // src >= srclimit, !state0

  1123     // Back up over truncated UTF-8 character

  1124     e = kExitIllegalStructure_2;

  1126     do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));

  1127   } else {

  1128     // src >= srclimit, state0

  1129     // Normal termination, source fully consumed

  1130     e = kExitOK_2;

  1131   }

  1133   if (offsetmap != NULL) {

  1134     if (src > copystart) {

  1135       offsetmap->Copy(src - copystart);

  1136       copystart = src;

  1137     }

  1138   }

  1141   // Possible return values here:

  1142   //  kExitDstSpaceFull_2         caller may realloc and retry from middle

  1143   //  kExitIllegalStructure_2     caller my overwrite/truncate

  1144   //  kExitOK_2                   all done and happy

  1145   //  kExitReject_2               caller may overwrite/truncate

  1146   //  kExitDoAgain_2              LOOP NOT DONE; caller must retry from middle

  1147   //                            (may do fast ASCII loop first)

  1148   //  kExitPlaceholder_2          -unused-

  1149   //  kExitNone_2                 -unused-

  1150   *bytes_consumed = src - isrc;

  1151   *bytes_filled = dst - odst;

  1152   *chars_changed = total_changed;

  1153   return e;

  1154 }

  1157 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece

  1158 //   and doing text replacements.

  1159 // Also writes an optional OffsetMap. Pass NULL to skip writing one.

  1160 // Always scan complete UTF-8 characters

  1161 // Set number of bytes consumed from input, number filled to output.

  1162 // Return reason for exiting

  1163 int UTF8GenericReplace(const UTF8ReplaceObj* st,

  1164                     const StringPiece& istr,

  1165                     StringPiece& ostr,

  1166                     bool is_plain_text,

  1167                     int* bytes_consumed,

  1168                     int* bytes_filled,

  1169                     int* chars_changed,

  1170                     OffsetMap* offsetmap) {

  1171   StringPiece local_istr(istr.data(), istr.length());

  1172   StringPiece local_ostr(ostr.data(), ostr.length());

  1173   int total_consumed = 0;

  1174   int total_filled = 0;

  1175   int total_changed = 0;

  1176   int local_bytes_consumed, local_bytes_filled, local_chars_changed;

  1177   int e;

  1178   do {

  1179     e = UTF8GenericReplaceInternal(st,

  1180                     local_istr, local_ostr, is_plain_text,

  1181                     &local_bytes_consumed, &local_bytes_filled,

  1182                     &local_chars_changed,

  1183                     offsetmap);

  1184     local_istr.remove_prefix(local_bytes_consumed);

  1185     local_ostr.remove_prefix(local_bytes_filled);

  1186     total_consumed += local_bytes_consumed;

  1187     total_filled += local_bytes_filled;

  1188     total_changed += local_chars_changed;

  1189   } while ( e == kExitDoAgain );

  1190   *bytes_consumed = total_consumed;

  1191   *bytes_filled = total_filled;

  1192   *chars_changed = total_changed;

  1193   return e;

  1194 }

  1196 // Older version without offsetmap

  1197 int UTF8GenericReplace(const UTF8ReplaceObj* st,

  1198                     const StringPiece& istr,

  1199                     StringPiece& ostr,

  1200                     bool is_plain_text,

  1201                     int* bytes_consumed,

  1202                     int* bytes_filled,

  1203                     int* chars_changed) {

  1204   return UTF8GenericReplace(st,

  1205                     istr,

  1206                     ostr,

  1207                     is_plain_text,

  1208                     bytes_consumed,

  1209                     bytes_filled,

  1210                     chars_changed,

  1211                     NULL);

  1212 }

  1214 // Older version without is_plain_text or offsetmap

  1215 int UTF8GenericReplace(const UTF8ReplaceObj* st,

  1216                     const StringPiece& istr,

  1217                     StringPiece& ostr,

  1218                     int* bytes_consumed,

  1219                     int* bytes_filled,

  1220                     int* chars_changed) {

  1221   bool is_plain_text = false;

  1222   return UTF8GenericReplace(st,

  1223                     istr,

  1224                     ostr,

  1225                     is_plain_text,

  1226                     bytes_consumed,

  1227                     bytes_filled,

  1228                     chars_changed,

  1229                     NULL);

  1230 }

  1232 // Scan a UTF-8 stringpiece based on state table with two-byte entries,

  1233 //   copying to output stringpiece

  1234 //   and doing text replacements.

  1235 // Also writes an optional OffsetMap. Pass NULL to skip writing one.

  1236 // Always scan complete UTF-8 characters

  1237 // Set number of bytes consumed from input, number filled to output.

  1238 // Return reason for exiting

  1239 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,

  1240                     const StringPiece& istr,

  1241                     StringPiece& ostr,

  1242                     bool is_plain_text,

  1243                     int* bytes_consumed,

  1244                     int* bytes_filled,

  1245                     int* chars_changed,

  1246                     OffsetMap* offsetmap) {

  1247   StringPiece local_istr(istr.data(), istr.length());

  1248   StringPiece local_ostr(ostr.data(), ostr.length());

  1249   int total_consumed = 0;

  1250   int total_filled = 0;

  1251   int total_changed = 0;

  1252   int local_bytes_consumed, local_bytes_filled, local_chars_changed;

  1253   int e;

  1254   do {

  1255     e = UTF8GenericReplaceInternalTwoByte(st,

  1256                     local_istr, local_ostr, is_plain_text,

  1257                     &local_bytes_consumed,

  1258                     &local_bytes_filled,

  1259                     &local_chars_changed,

  1260                     offsetmap);

  1261     local_istr.remove_prefix(local_bytes_consumed);

  1262     local_ostr.remove_prefix(local_bytes_filled);

  1263     total_consumed += local_bytes_consumed;

  1264     total_filled += local_bytes_filled;

  1265     total_changed += local_chars_changed;

  1266   } while ( e == kExitDoAgain_2 );

  1267   *bytes_consumed = total_consumed;

  1268   *bytes_filled = total_filled;

  1269   *chars_changed = total_changed;

  1271   return e - kExitOK_2 + kExitOK;

  1272 }

  1274 // Older version without offsetmap

  1275 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,

  1276                     const StringPiece& istr,

  1277                     StringPiece& ostr,

  1278                     bool is_plain_text,

  1279                     int* bytes_consumed,

  1280                     int* bytes_filled,

  1281                     int* chars_changed) {

  1282   return UTF8GenericReplaceTwoByte(st,

  1283                     istr,

  1284                     ostr,

  1285                     is_plain_text,

  1286                     bytes_consumed,

  1287                     bytes_filled,

  1288                     chars_changed,

  1289                     NULL);

  1290 }

  1292 // Older version without is_plain_text or offsetmap

  1293 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,

  1294                     const StringPiece& istr,

  1295                     StringPiece& ostr,

  1296                     int* bytes_consumed,

  1297                     int* bytes_filled,

  1298                     int* chars_changed) {

  1299   bool is_plain_text = false;

  1300   return UTF8GenericReplaceTwoByte(st,

  1301                     istr,

  1302                     ostr,

  1303                     is_plain_text,

  1304                     bytes_consumed,

  1305                     bytes_filled,

  1306                     chars_changed,

  1307                     NULL);

  1308 }

  1312 // Adjust a stringpiece to encompass complete UTF-8 characters.

  1313 // The data pointer will be increased by 0..3 bytes to get to a character

  1314 // boundary, and the length will then be decreased by 0..3 bytes

  1315 // to encompass the last complete character.

  1316 void UTF8TrimToChars(StringPiece* istr) {

  1317   const char* src = istr->data();

  1318   int len = istr->length();

  1319   // Exit if empty string

  1320   if (len == 0) {

  1321     return;

  1322   }

  1324   // Exit on simple, common case

  1325   if ( ((src[0] & 0xc0) != 0x80) &&

  1326        (static_cast<signed char>(src[len - 1]) >= 0) ) {

  1327     // First byte is not a continuation and last byte is 7-bit ASCII -- done

  1328     return;

  1329   }

  1331   // Adjust the back end, len > 0

  1332   const char* srclimit = src + len;

  1333   // Backscan over any ending continuation bytes to find last char start

  1334   const char* s = srclimit - 1;         // Last byte of the string

  1335   while ((src <= s) && ((*s & 0xc0) == 0x80)) {

  1336     s--;

  1337   }

  1338   // Include entire last char if it fits

  1339   if (src <= s) {

  1340     int last_char_len = UTF8OneCharLen(s);

  1341     if (s + last_char_len <= srclimit) {

  1342       // Last char fits, so include it, else exclude it

  1343       s += last_char_len;

  1344     }

  1345   }

  1346   if (s != srclimit) {

  1347     // s is one byte beyond the last full character, if any

  1348     istr->remove_suffix(srclimit - s);

  1349     // Exit if now empty string

  1350     if (istr->length() == 0) {

  1351       return;

  1352     }

  1353   }

  1355   // Adjust the front end, len > 0

  1356   len = istr->length();

  1357   srclimit = src + len;

  1358   s = src;                            // First byte of the string

  1359   // Scan over any beginning continuation bytes to find first char start

  1360   while ((s < srclimit) && ((*s & 0xc0) == 0x80)) {

  1361     s++;

  1362   }

  1363   if (s != src) {

  1364     // s is at the first full character, if any

  1365     istr->remove_prefix(s - src);

  1366   }

  1367 }

  1369 }       // End namespace CLD2

The Tor Browser / file revision

browser/components/translation/cld2/internal/utf8statetable.cc@6474c204b198

browser/components/translation/cld2/internal/utf8statetable.cc