browser/components/translation/cld2/internal/utf8statetable.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.
     2 //
     3 // Licensed under the Apache License, Version 2.0 (the "License");
     4 // you may not use this file except in compliance with the License.
     5 // You may obtain a copy of the License at
     6 //
     7 //     http://www.apache.org/licenses/LICENSE-2.0
     8 //
     9 // Unless required by applicable law or agreed to in writing, software
    10 // distributed under the License is distributed on an "AS IS" BASIS,
    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12 // See the License for the specific language governing permissions and
    13 // limitations under the License.
    15 //
    16 // State Table follower for scanning UTF-8 strings without converting to
    17 // 32- or 16-bit Unicode values.
    18 //
    19 // Author: dsites@google.com (Dick Sites)
    20 //
    22 #ifndef UTIL_UTF8_UTF8STATETABLE_H_
    23 #define UTIL_UTF8_UTF8STATETABLE_H_
    25 #include <string>
    26 #include "integral_types.h"             // for uint8, uint32, uint16
    27 #include "stringpiece.h"
    30 namespace CLD2 {
    32 class OffsetMap;
// One four-byte replacement record. It compactly encodes how many bytes
// (0..255) to delete when making a string replacement, how many bytes
// (0..255) to add, and the offset (0..64k-1) of the replacement string in
// remap_string.
struct RemapEntry {
  uint8 delete_bytes;    // number of bytes to delete, 0..255
  uint8 add_bytes;       // number of bytes to add, 0..255
  uint16 bytes_offset;   // offset of the replacement string in remap_string
};
    44 // Exit type codes for state tables. All but the first get stuffed into
    45 // signed one-byte entries. The first is only generated by executable code.
    46 // To distinguish from next-state entries, these must be contiguous and
    47 // all <= kExitNone
    48 typedef enum {
    49   kExitDstSpaceFull = 239,
    50   kExitIllegalStructure,  // 240
    51   kExitOK,                // 241
    52   kExitReject,            // ...
    53   kExitReplace1,
    54   kExitReplace2,
    55   kExitReplace3,
    56   kExitReplace21,
    57   kExitReplace31,
    58   kExitReplace32,
    59   kExitReplaceOffset1,
    60   kExitReplaceOffset2,
    61   kExitReplace1S0,
    62   kExitSpecial,
    63   kExitDoAgain,
    64   kExitRejectAlt,
    65   kExitNone               // 255
    66 } ExitReason;
    68 typedef enum {
    69   kExitDstSpaceFull_2 = 32767,       // 0x7fff
    70   kExitIllegalStructure_2,  // 32768    0x8000
    71   kExitOK_2,                // 32769    0x8001
    72   kExitReject_2,            // ...
    73   kExitReplace1_2,
    74   kExitReplace2_2,
    75   kExitReplace3_2,
    76   kExitReplace21_2,
    77   kExitReplace31_2,
    78   kExitReplace32_2,
    79   kExitReplaceOffset1_2,
    80   kExitReplaceOffset2_2,
    81   kExitReplace1S0_2,
    82   kExitSpecial_2,
    83   kExitDoAgain_2,
    84   kExitRejectAlt_2,
    85   kExitNone_2               // 32783    0x800f
    86 } ExitReason_2;
// This struct represents one entire state table. The three initialized byte
// areas are state_table, remap_base, and remap_string.
//
// state0 and state0_size give the byte offset and length within state_table
// of the initial state. Table lookups are expected to start and end in this
// state, but for truncated UTF-8 strings they may end in a different state;
// these two fields allow a quick test for that condition.
//
// entry_shift is 8 for tables subscripted by a full byte value and 6 for
// space-optimized tables subscripted by only the six significant bits of
// UTF-8 continuation bytes.
typedef struct {
  const uint32 state0;        // byte offset of the initial state in state_table
  const uint32 state0_size;   // byte length of the initial state
  const uint32 total_size;    // NOTE(review): presumably total size of state_table -- confirm
  const int max_expand;       // NOTE(review): presumably max growth from one replacement -- confirm
  const int entry_shift;      // 8 = full-byte subscripts, 6 = six-bit subscripts
  const int bytes_per_entry;  // NOTE(review): presumably 1 for this one-byte-entry table -- confirm
  const uint32 losub;         // NOTE(review): meaning not visible in this header
  const uint32 hiadd;         // NOTE(review): meaning not visible in this header
  const uint8* state_table;   // state table proper, one-byte entries
  const RemapEntry* remap_base;  // replacement records (see RemapEntry)
  const uint8* remap_string;  // replacement text addressed by RemapEntry::bytes_offset
  const uint8* fast_state;    // NOTE(review): meaning not visible in this header
} UTF8StateMachineObj;
// Near-duplicate of UTF8StateMachineObj for tables with two-byte entries.
// The field list is the same except that state_table points at
// unsigned short entries instead of uint8 entries.
typedef struct {
  const uint32 state0;        // byte offset of the initial state in state_table
  const uint32 state0_size;   // byte length of the initial state
  const uint32 total_size;    // NOTE(review): presumably total size of state_table -- confirm
  const int max_expand;       // NOTE(review): presumably max growth from one replacement -- confirm
  const int entry_shift;      // 8 = full-byte subscripts, 6 = six-bit subscripts
  const int bytes_per_entry;  // NOTE(review): presumably 2 for this two-byte-entry table -- confirm
  const uint32 losub;         // NOTE(review): meaning not visible in this header
  const uint32 hiadd;         // NOTE(review): meaning not visible in this header
  const unsigned short* state_table;  // state table proper, two-byte entries
  const RemapEntry* remap_base;  // replacement records (see RemapEntry)
  const uint8* remap_string;  // replacement text addressed by RemapEntry::bytes_offset
  const uint8* fast_state;    // NOTE(review): meaning not visible in this header
} UTF8StateMachineObj_2;
// Role-specific names for the state-machine structs. The property, scan and
// replace roles all share the UTF8StateMachineObj layout; the _2 names refer
// to the two-byte-entry layout UTF8StateMachineObj_2.
typedef UTF8StateMachineObj UTF8PropObj;
typedef UTF8StateMachineObj UTF8ScanObj;
typedef UTF8StateMachineObj UTF8ReplaceObj;
typedef UTF8StateMachineObj_2 UTF8PropObj_2;
typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;
// A two-byte scan object is NOT IMPLEMENTED:
//   typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
// Look up the property of one UTF-8 character and advance over it.
//   st      property state table to follow
//   src     in/out: *src points at the character and is advanced past it
//   srclen  in/out: *srclen is the input length
//           NOTE(review): presumably reduced by the bytes consumed -- confirm
// Returns the property value.
// Returns 0 if the input length is zero.
// Returns 0 and advances one byte if the input is ill-formed.
uint8 UTF8GenericProperty(const UTF8PropObj* st,
                          const uint8** src,
                          int* srclen);
// Look up the property of one UTF-8 character, which is assumed to be valid.
// This is a faster version of UTF8GenericProperty: it takes no length and
// does not advance src.
//   st   property state table to follow
//   src  points at the first byte of the character
bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
// BigOneByte versions are needed for tables > 240 states, but most
// won't need the TwoByte versions.

// Look up the property of one UTF-8 character and advance over it.
// Returns 0 if the input length is zero.
// Returns 0 and advances one byte if the input is ill-formed.
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
                          const uint8** src,
                          int* srclen);

// Look up the property of one UTF-8 character (assumed to be valid).
// (This is presumably the faster version of UTF8GenericPropertyBigOneByte;
// the earlier comment named UTF8GenericProperty -- confirm.)
bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);

// TwoByte versions are needed for tables > 240 states that don't fit onto
// BigOneByte -- rare ultimate fallback.

// Look up the property of one UTF-8 character and advance over it.
// Returns 0 if the input length is zero.
// Returns 0 and advances one byte if the input is ill-formed.
uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
                          const uint8** src,
                          int* srclen);

// Look up the property of one UTF-8 character (assumed to be valid).
// (This is presumably the faster version of UTF8GenericPropertyTwoByte;
// the earlier comment named UTF8GenericProperty -- confirm.)
bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
// Scan a UTF-8 stringpiece based on a state table.
// Only complete UTF-8 characters are scanned.
//   st              scan state table to follow
//   str             input text
//   bytes_consumed  out: set to the number of bytes scanned
// Returns the reason for exiting.
// NOTE(review): presumably one of the ExitReason values -- confirm.
int UTF8GenericScan(const UTF8ScanObj* st,
                    const StringPiece& str,
                    int* bytes_consumed);
// Scan a UTF-8 stringpiece based on a state table, copying to the output
// stringpiece and doing text replacements.
// Only complete UTF-8 characters are scanned.
//   st              replacement state table to follow
//   istr            input text
//   ostr            output stringpiece that receives the copied/replaced text
//   is_plain_text   NOTE(review): effect not visible in this header
//   bytes_consumed  out: number of bytes consumed from the input
//   bytes_filled    out: number of bytes filled into the output
//   chars_changed   out: NOTE(review): presumably a count of replaced
//                   characters -- confirm
//   offsetmap       optional OffsetMap to write; pass NULL to skip writing one
// Returns the reason for exiting.
int UTF8GenericReplace(const UTF8ReplaceObj* st,
                    const StringPiece& istr,
                    StringPiece& ostr,
                    bool is_plain_text,
                    int* bytes_consumed,
                    int* bytes_filled,
                    int* chars_changed,
                    OffsetMap* offsetmap);

// Older version without offsetmap; the remaining parameters are the same.
int UTF8GenericReplace(const UTF8ReplaceObj* st,
                    const StringPiece& istr,
                    StringPiece& ostr,
                    bool is_plain_text,
                    int* bytes_consumed,
                    int* bytes_filled,
                    int* chars_changed);

// Older version without is_plain_text or offsetmap; the remaining parameters
// are the same.
int UTF8GenericReplace(const UTF8ReplaceObj* st,
                    const StringPiece& istr,
                    StringPiece& ostr,
                    int* bytes_consumed,
                    int* bytes_filled,
                    int* chars_changed);
// The TwoByte version is needed for tables > about 256 states, such
// as the table for full Unicode 4.1 canonical + compatibility mapping.

// Scan a UTF-8 stringpiece based on a state table with two-byte entries,
// copying to the output stringpiece and doing text replacements.
// Only complete UTF-8 characters are scanned.
//   st              two-byte-entry replacement state table to follow
//   istr            input text
//   ostr            output stringpiece that receives the copied/replaced text
//   is_plain_text   NOTE(review): effect not visible in this header
//   bytes_consumed  out: number of bytes consumed from the input
//   bytes_filled    out: number of bytes filled into the output
//   chars_changed   out: NOTE(review): presumably a count of replaced
//                   characters -- confirm
//   offsetmap       optional OffsetMap to write; pass NULL to skip writing one
// Returns the reason for exiting.
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
                    const StringPiece& istr,
                    StringPiece& ostr,
                    bool is_plain_text,
                    int* bytes_consumed,
                    int* bytes_filled,
                    int* chars_changed,
                    OffsetMap* offsetmap);

// Older version without offsetmap; the remaining parameters are the same.
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
                    const StringPiece& istr,
                    StringPiece& ostr,
                    bool is_plain_text,
                    int* bytes_consumed,
                    int* bytes_filled,
                    int* chars_changed);

// Older version without is_plain_text or offsetmap; the remaining parameters
// are the same.
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
                    const StringPiece& istr,
                    StringPiece& ostr,
                    int* bytes_consumed,
                    int* bytes_filled,
                    int* chars_changed);
   257 static const unsigned char kUTF8LenTbl[256] = {
   258   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
   259   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
   260   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
   261   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
   263   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
   264   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
   265   2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
   266   3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
   267 };
   269 inline int UTF8OneCharLen(const char* in) {
   270   return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)];
   271 }
// Adjust a stringpiece, in place, to encompass complete UTF-8 characters.
// The data pointer is increased by 0..3 bytes to reach a character
// boundary, and the length is then decreased by 0..3 bytes so that the
// piece ends with the last complete character.
// This is especially useful when a UTF-8 string must be put cleanly into a
// buffer with a fixed maximum size, such as a MySQL buffer.
//   istr  in/out: the stringpiece to trim
void UTF8TrimToChars(StringPiece* istr);
   281 }       // End namespace CLD2
   283 #endif  // UTIL_UTF8_UTF8STATETABLE_H_

mercurial