The Tor Browser: browser/components/translation/cld2/internal/utf8statetable.h@6474c204b198

Cloned from the upstream tor-browser origin at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 // Copyright 2013 Google Inc. All Rights Reserved.

     2 //

     3 // Licensed under the Apache License, Version 2.0 (the "License");

     4 // you may not use this file except in compliance with the License.

     5 // You may obtain a copy of the License at

     6 //

     7 //     http://www.apache.org/licenses/LICENSE-2.0

     8 //

     9 // Unless required by applicable law or agreed to in writing, software

    10 // distributed under the License is distributed on an "AS IS" BASIS,

    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

    12 // See the License for the specific language governing permissions and

    13 // limitations under the License.

    15 //

    16 // State Table follower for scanning UTF-8 strings without converting to

    17 // 32- or 16-bit Unicode values.

    18 //

    19 // Author: dsites@google.com (Dick Sites)

    20 //

    22 #ifndef UTIL_UTF8_UTF8STATETABLE_H_

    23 #define UTIL_UTF8_UTF8STATETABLE_H_

    25 #include <string>

    26 #include "integral_types.h"             // for uint8, uint32, uint16

    27 #include "stringpiece.h"

    30 namespace CLD2 {

    32 class OffsetMap;

    35 // These four-byte entries compactly encode how many bytes 0..255 to delete

    36 // in making a string replacement, how many bytes to add 0..255, and the offset

    37 // 0..64k-1 of the replacement string in remap_string.

    38 struct RemapEntry {

    39   uint8 delete_bytes;

    40   uint8 add_bytes;

    41   uint16 bytes_offset;

    42 };

    44 // Exit type codes for state tables. All but the first get stuffed into

    45 // signed one-byte entries. The first is only generated by executable code.

    46 // To distinguish from next-state entries, these must be contiguous and

    47 // all <= kExitNone

    48 typedef enum {

    49   kExitDstSpaceFull = 239,

    50   kExitIllegalStructure,  // 240

    51   kExitOK,                // 241

    52   kExitReject,            // ...

    53   kExitReplace1,

    54   kExitReplace2,

    55   kExitReplace3,

    56   kExitReplace21,

    57   kExitReplace31,

    58   kExitReplace32,

    59   kExitReplaceOffset1,

    60   kExitReplaceOffset2,

    61   kExitReplace1S0,

    62   kExitSpecial,

    63   kExitDoAgain,

    64   kExitRejectAlt,

    65   kExitNone               // 255

    66 } ExitReason;

    68 typedef enum {

    69   kExitDstSpaceFull_2 = 32767,       // 0x7fff

    70   kExitIllegalStructure_2,  // 32768    0x8000

    71   kExitOK_2,                // 32769    0x8001

    72   kExitReject_2,            // ...

    73   kExitReplace1_2,

    74   kExitReplace2_2,

    75   kExitReplace3_2,

    76   kExitReplace21_2,

    77   kExitReplace31_2,

    78   kExitReplace32_2,

    79   kExitReplaceOffset1_2,

    80   kExitReplaceOffset2_2,

    81   kExitReplace1S0_2,

    82   kExitSpecial_2,

    83   kExitDoAgain_2,

    84   kExitRejectAlt_2,

    85   kExitNone_2               // 32783    0x800f

    86 } ExitReason_2;

    89 // This struct represents one entire state table. The three initialized byte

    90 // areas are state_table, remap_base, and remap_string. state0 and state0_size

    91 // give the byte offset and length within state_table of the initial state --

    92 // table lookups are expected to start and end in this state, but for

    93 // truncated UTF-8 strings, may end in a different state. These allow a quick

    94 // test for that condition. entry_shift is 8 for tables subscripted by a full

    95 // byte value and 6 for space-optimized tables subscripted by only six

    96 // significant bits in UTF-8 continuation bytes.

    97 typedef struct {

    98   const uint32 state0;

    99   const uint32 state0_size;

   100   const uint32 total_size;

   101   const int max_expand;

   102   const int entry_shift;

   103   const int bytes_per_entry;

   104   const uint32 losub;

   105   const uint32 hiadd;

   106   const uint8* state_table;

   107   const RemapEntry* remap_base;

   108   const uint8* remap_string;

   109   const uint8* fast_state;

   110 } UTF8StateMachineObj;

   112 // Near-duplicate declaration for tables with two-byte entries

   113 typedef struct {

   114   const uint32 state0;

   115   const uint32 state0_size;

   116   const uint32 total_size;

   117   const int max_expand;

   118   const int entry_shift;

   119   const int bytes_per_entry;

   120   const uint32 losub;

   121   const uint32 hiadd;

   122   const unsigned short* state_table;

   123   const RemapEntry* remap_base;

   124   const uint8* remap_string;

   125   const uint8* fast_state;

   126 } UTF8StateMachineObj_2;

   129 typedef UTF8StateMachineObj UTF8PropObj;

   130 typedef UTF8StateMachineObj UTF8ScanObj;

   131 typedef UTF8StateMachineObj UTF8ReplaceObj;

   132 typedef UTF8StateMachineObj_2 UTF8PropObj_2;

   133 typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;

   134 // NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;

   137 // Look up property of one UTF-8 character and advance over it

   138 // Return 0 if input length is zero

   139 // Return 0 and advance one byte if input is ill-formed

   140 uint8 UTF8GenericProperty(const UTF8PropObj* st,

   141                           const uint8** src,

   142                           int* srclen);

   144 // Look up property of one UTF-8 character (assumed to be valid).

   145 // (This is a faster version of UTF8GenericProperty.)

   146 bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);

   149 // BigOneByte versions are needed for tables > 240 states, but most

   150 // won't need the TwoByte versions.

   152 // Look up property of one UTF-8 character and advance over it

   153 // Return 0 if input length is zero

   154 // Return 0 and advance one byte if input is ill-formed

   155 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,

   156                           const uint8** src,

   157                           int* srclen);

   160 // TwoByte versions are needed for tables > 240 states that don't fit onto

   161 // BigOneByte -- rare ultimate fallback

   163 // Look up property of one UTF-8 character (assumed to be valid).

   164 // (This is a faster version of UTF8GenericProperty.)

   165 bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);

   167 // Look up property of one UTF-8 character and advance over it

   168 // Return 0 if input length is zero

   169 // Return 0 and advance one byte if input is ill-formed

   170 uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,

   171                           const uint8** src,

   172                           int* srclen);

   174 // Look up property of one UTF-8 character (assumed to be valid).

   175 // (This is a faster version of UTF8GenericProperty.)

   176 bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);

   178 // Scan a UTF-8 stringpiece based on a state table.

   179 // Always scan complete UTF-8 characters

   180 // Set number of bytes scanned. Return reason for exiting

   181 int UTF8GenericScan(const UTF8ScanObj* st,

   182                     const StringPiece& str,

   183                     int* bytes_consumed);

   187 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece

   188 //   and doing text replacements.

   189 // Always scan complete UTF-8 characters

   190 // Set number of bytes consumed from input, number filled to output.

   191 // Return reason for exiting

   192 // Also writes an optional OffsetMap. Pass NULL to skip writing one.

   193 int UTF8GenericReplace(const UTF8ReplaceObj* st,

   194                     const StringPiece& istr,

   195                     StringPiece& ostr,

   196                     bool is_plain_text,

   197                     int* bytes_consumed,

   198                     int* bytes_filled,

   199                     int* chars_changed,

   200                     OffsetMap* offsetmap);

   202 // Older version without offsetmap

   203 int UTF8GenericReplace(const UTF8ReplaceObj* st,

   204                     const StringPiece& istr,

   205                     StringPiece& ostr,

   206                     bool is_plain_text,

   207                     int* bytes_consumed,

   208                     int* bytes_filled,

   209                     int* chars_changed);

   211 // Older version without is_plain_text or offsetmap

   212 int UTF8GenericReplace(const UTF8ReplaceObj* st,

   213                     const StringPiece& istr,

   214                     StringPiece& ostr,

   215                     int* bytes_consumed,

   216                     int* bytes_filled,

   217                     int* chars_changed);

   220 // TwoByte version is needed for tables > about 256 states, such

   221 // as the table for full Unicode 4.1 canonical + compatibility mapping

   223 // Scan a UTF-8 stringpiece based on state table with two-byte entries,

   224 //   copying to output stringpiece

   225 //   and doing text replacements.

   226 // Always scan complete UTF-8 characters

   227 // Set number of bytes consumed from input, number filled to output.

   228 // Return reason for exiting

   229 // Also writes an optional OffsetMap. Pass NULL to skip writing one.

   230 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,

   231                     const StringPiece& istr,

   232                     StringPiece& ostr,

   233                     bool is_plain_text,

   234                     int* bytes_consumed,

   235                     int* bytes_filled,

   236                     int* chars_changed,

   237                     OffsetMap* offsetmap);

   239 // Older version without offsetmap

   240 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,

   241                     const StringPiece& istr,

   242                     StringPiece& ostr,

   243                     bool is_plain_text,

   244                     int* bytes_consumed,

   245                     int* bytes_filled,

   246                     int* chars_changed);

   248 // Older version without is_plain_text or offsetmap

   249 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,

   250                     const StringPiece& istr,

   251                     StringPiece& ostr,

   252                     int* bytes_consumed,

   253                     int* bytes_filled,

   254                     int* chars_changed);

   257 static const unsigned char kUTF8LenTbl[256] = {

   258   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

   259   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

   260   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

   261   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

   263   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

   264   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

   265   2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,

   266   3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4

   267 };

   269 inline int UTF8OneCharLen(const char* in) {

   270   return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)];

   271 }

   273 // Adjust a stringpiece to encompass complete UTF-8 characters.

   274 // The data pointer will be increased by 0..3 bytes to get to a character

   275 // boundary, and the length will then be decreased by 0..3 bytes

   276 // to encompass the last complete character.

   277 // This is useful especially when a UTF-8 string must be put into a fixed-

   278 // maximum-size buffer cleanly, such as a MySQL buffer.

   279 void UTF8TrimToChars(StringPiece* istr);

   281 }       // End namespace CLD2

   283 #endif  // UTIL_UTF8_UTF8STATETABLE_H_

The Tor Browser / file revision

browser/components/translation/cld2/internal/utf8statetable.h@6474c204b198

browser/components/translation/cld2/internal/utf8statetable.h