browser/components/translation/cld2/internal/utf8statetable.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/utf8statetable.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,283 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// State Table follower for scanning UTF-8 strings without converting to
    1.20 +// 32- or 16-bit Unicode values.
    1.21 +//
    1.22 +// Author: dsites@google.com (Dick Sites)
    1.23 +//
    1.24 +
    1.25 +#ifndef UTIL_UTF8_UTF8STATETABLE_H_
    1.26 +#define UTIL_UTF8_UTF8STATETABLE_H_
    1.27 +
    1.28 +#include <string>
    1.29 +#include "integral_types.h"             // for uint8, uint32, uint16
    1.30 +#include "stringpiece.h"
    1.31 +
    1.32 +
    1.33 +namespace CLD2 {
    1.34 +
    1.35 +class OffsetMap;
    1.36 +
    1.37 +
    1.38 +// These four-byte entries compactly encode how many bytes 0..255 to delete
    1.39 +// in making a string replacement, how many bytes to add 0..255, and the offset
    1.40 +// 0..64k-1 of the replacement string in remap_string.
    1.41 +struct RemapEntry {
    1.42 +  uint8 delete_bytes;   // number of bytes to delete, 0..255
    1.43 +  uint8 add_bytes;      // number of bytes to add, 0..255
    1.44 +  uint16 bytes_offset;  // offset of the replacement string in remap_string
    1.45 +};
    1.46 +
    1.47 +// Exit type codes for state tables with one-byte entries. All but the first
    1.48 +// (239) get stuffed into one-byte table entries (state_table is uint8); the
    1.49 +// first is only generated by executable code. To distinguish from next-state
    1.50 +// entries, these must be contiguous and all <= kExitNone (255).
    1.51 +typedef enum {
    1.52 +  kExitDstSpaceFull = 239,
    1.53 +  kExitIllegalStructure,  // 240
    1.54 +  kExitOK,                // 241
    1.55 +  kExitReject,            // 242
    1.56 +  kExitReplace1,          // 243
    1.57 +  kExitReplace2,          // 244
    1.58 +  kExitReplace3,          // 245
    1.59 +  kExitReplace21,         // 246
    1.60 +  kExitReplace31,         // 247
    1.61 +  kExitReplace32,         // 248
    1.62 +  kExitReplaceOffset1,    // 249
    1.63 +  kExitReplaceOffset2,    // 250
    1.64 +  kExitReplace1S0,        // 251
    1.65 +  kExitSpecial,           // 252
    1.66 +  kExitDoAgain,           // 253
    1.67 +  kExitRejectAlt,         // 254
    1.68 +  kExitNone               // 255
    1.69 +} ExitReason;
    1.70 +
    1.71 +typedef enum {  // same exit codes as ExitReason, renumbered from 0x7fff
    1.72 +  kExitDstSpaceFull_2 = 32767,       // 0x7fff
    1.73 +  kExitIllegalStructure_2,  // 32768    0x8000
    1.74 +  kExitOK_2,                // 32769    0x8001
    1.75 +  kExitReject_2,            // 32770    0x8002
    1.76 +  kExitReplace1_2,          // 32771    0x8003
    1.77 +  kExitReplace2_2,          // 32772    0x8004
    1.78 +  kExitReplace3_2,          // 32773    0x8005
    1.79 +  kExitReplace21_2,         // 32774    0x8006
    1.80 +  kExitReplace31_2,         // 32775    0x8007
    1.81 +  kExitReplace32_2,         // 32776    0x8008
    1.82 +  kExitReplaceOffset1_2,    // 32777    0x8009
    1.83 +  kExitReplaceOffset2_2,    // 32778    0x800a
    1.84 +  kExitReplace1S0_2,        // 32779    0x800b
    1.85 +  kExitSpecial_2,           // 32780    0x800c
    1.86 +  kExitDoAgain_2,           // 32781    0x800d
    1.87 +  kExitRejectAlt_2,         // 32782    0x800e
    1.88 +  kExitNone_2               // 32783    0x800f
    1.89 +} ExitReason_2;
    1.90 +
    1.91 +
    1.92 +// This struct represents one entire state table. The three initialized byte
    1.93 +// areas are state_table, remap_base, and remap_string. state0 and state0_size
    1.94 +// give the byte offset and length within state_table of the initial state --
    1.95 +// table lookups are expected to start and end in this state, but for
    1.96 +// truncated UTF-8 strings, may end in a different state. These allow a quick
    1.97 +// test for that condition. entry_shift is 8 for tables subscripted by a full
    1.98 +// byte value and 6 for space-optimized tables subscripted by only six
    1.99 +// significant bits in UTF-8 continuation bytes.
   1.100 +typedef struct {
   1.101 +  const uint32 state0;         // byte offset of the initial state in state_table
   1.102 +  const uint32 state0_size;    // byte length of the initial state
   1.103 +  const uint32 total_size;     // presumably total size of state_table -- confirm
   1.104 +  const int max_expand;
   1.105 +  const int entry_shift;       // 8: full-byte subscript; 6: six-bit subscript
   1.106 +  const int bytes_per_entry;   // presumably 1 for this one-byte variant -- confirm
   1.107 +  const uint32 losub;
   1.108 +  const uint32 hiadd;
   1.109 +  const uint8* state_table;    // one-byte table entries
   1.110 +  const RemapEntry* remap_base;  // replacement descriptors, see RemapEntry
   1.111 +  const uint8* remap_string;   // replacement bytes located via RemapEntry.bytes_offset
   1.112 +  const uint8* fast_state;
   1.113 +} UTF8StateMachineObj;
   1.114 +
   1.115 +// Near-duplicate declaration for tables with two-byte entries
   1.116 +typedef struct {
   1.117 +  const uint32 state0;         // NOTE(review): presumably as in UTF8StateMachineObj
   1.118 +  const uint32 state0_size;
   1.119 +  const uint32 total_size;
   1.120 +  const int max_expand;
   1.121 +  const int entry_shift;
   1.122 +  const int bytes_per_entry;   // presumably 2 for this two-byte variant -- confirm
   1.123 +  const uint32 losub;
   1.124 +  const uint32 hiadd;
   1.125 +  const unsigned short* state_table;  // two-byte table entries
   1.126 +  const RemapEntry* remap_base;  // replacement descriptors, see RemapEntry
   1.127 +  const uint8* remap_string;   // replacement bytes located via RemapEntry.bytes_offset
   1.128 +  const uint8* fast_state;
   1.129 +} UTF8StateMachineObj_2;
   1.130 +
   1.131 +
   1.132 +typedef UTF8StateMachineObj UTF8PropObj;       // taken by UTF8GenericProperty*
   1.133 +typedef UTF8StateMachineObj UTF8ScanObj;       // taken by UTF8GenericScan
   1.134 +typedef UTF8StateMachineObj UTF8ReplaceObj;    // taken by UTF8GenericReplace
   1.135 +typedef UTF8StateMachineObj_2 UTF8PropObj_2;     // taken by UTF8*GenericPropertyTwoByte
   1.136 +typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;  // taken by UTF8GenericReplaceTwoByte
   1.137 +// NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
   1.138 +
   1.139 +
   1.140 +// Look up property of one UTF-8 character and advance over it
   1.141 +// Return 0 if input length is zero
   1.142 +// Return 0 and advance one byte if input is ill-formed
   1.143 +uint8 UTF8GenericProperty(const UTF8PropObj* st,  // property state table
   1.144 +                          const uint8** src,  // in/out: advanced over the character
   1.145 +                          int* srclen);  // in/out: input length (presumably reduced to match -- confirm)
   1.146 +
   1.147 +// Look up property of one UTF-8 character at src (assumed to be valid); src is
   1.148 +// passed by value and not advanced. (A faster version of UTF8GenericProperty.)
   1.149 +bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
   1.150 +
   1.151 +
   1.152 +// BigOneByte versions are needed for tables > 240 states, but most
   1.153 +// won't need the TwoByte versions.
   1.154 +
   1.155 +// Look up property of one UTF-8 character and advance over it
   1.156 +// Return 0 if input length is zero
   1.157 +// Return 0 and advance one byte if input is ill-formed
   1.158 +uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,  // property state table
   1.159 +                          const uint8** src,  // in/out: advanced over the character
   1.160 +                          int* srclen);  // in/out: input length (presumably reduced to match -- confirm)
   1.161 +
   1.162 +
   1.163 +// TwoByte versions (declared below, after the last BigOneByte function) are
   1.164 +// needed for tables > 240 states that don't fit onto BigOneByte -- rare ultimate fallback
   1.165 +
   1.166 +// Look up property of one UTF-8 character at src (assumed valid; not advanced).
   1.167 +// (Faster version of UTF8GenericProperty; presumably its BigOneByte form -- confirm.)
   1.168 +bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);
   1.169 +
   1.170 +// Look up property of one UTF-8 character and advance over it
   1.171 +// Return 0 if input length is zero
   1.172 +// Return 0 and advance one byte if input is ill-formed
   1.173 +uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,  // two-byte-entry property table
   1.174 +                          const uint8** src,  // in/out: advanced over the character
   1.175 +                          int* srclen);  // in/out: input length (presumably reduced to match -- confirm)
   1.176 +
   1.177 +// Look up property of one UTF-8 character at src (assumed valid; not advanced).
   1.178 +// (Faster version of UTF8GenericProperty; presumably its TwoByte form -- confirm.)
   1.179 +bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
   1.180 +
   1.181 +// Scan a UTF-8 stringpiece based on a state table.
   1.182 +// Always scan complete UTF-8 characters
   1.183 +// Set number of bytes scanned. Return reason for exiting
   1.184 +int UTF8GenericScan(const UTF8ScanObj* st,  // scan state table
   1.185 +                    const StringPiece& str,  // input text to scan
   1.186 +                    int* bytes_consumed);  // out: number of bytes scanned
   1.187 +
   1.188 +
   1.189 +
   1.190 +// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
   1.191 +//   and doing text replacements.
   1.192 +// Always scan complete UTF-8 characters
   1.193 +// Set number of bytes consumed from input, number filled to output.
   1.194 +// Return reason for exiting
   1.195 +// Also writes an optional OffsetMap. Pass NULL to skip writing one.
   1.196 +int UTF8GenericReplace(const UTF8ReplaceObj* st,  // replacement state table
   1.197 +                    const StringPiece& istr,  // input text
   1.198 +                    StringPiece& ostr,  // output stringpiece that receives the copy
   1.199 +                    bool is_plain_text,
   1.200 +                    int* bytes_consumed,  // out: bytes consumed from input
   1.201 +                    int* bytes_filled,  // out: bytes filled to output
   1.202 +                    int* chars_changed,  // out: presumably count of characters replaced -- confirm
   1.203 +                    OffsetMap* offsetmap);  // optional; NULL skips writing one
   1.204 +
   1.205 +// Older version without offsetmap
   1.206 +int UTF8GenericReplace(const UTF8ReplaceObj* st,  // replacement state table
   1.207 +                    const StringPiece& istr,  // input text
   1.208 +                    StringPiece& ostr,  // output stringpiece
   1.209 +                    bool is_plain_text,
   1.210 +                    int* bytes_consumed,  // out parameter (non-const pointer)
   1.211 +                    int* bytes_filled,  // out parameter (non-const pointer)
   1.212 +                    int* chars_changed);  // out parameter (non-const pointer)
   1.213 +
   1.214 +// Older version without is_plain_text or offsetmap
   1.215 +int UTF8GenericReplace(const UTF8ReplaceObj* st,  // replacement state table
   1.216 +                    const StringPiece& istr,  // input text
   1.217 +                    StringPiece& ostr,  // output stringpiece
   1.218 +                    int* bytes_consumed,  // out parameter (non-const pointer)
   1.219 +                    int* bytes_filled,  // out parameter (non-const pointer)
   1.220 +                    int* chars_changed);  // out parameter (non-const pointer)
   1.221 +
   1.222 +
   1.223 +// TwoByte version is needed for tables > about 256 states, such
   1.224 +// as the table for full Unicode 4.1 canonical + compatibility mapping
   1.225 +
   1.226 +// Scan a UTF-8 stringpiece based on state table with two-byte entries,
   1.227 +//   copying to output stringpiece
   1.228 +//   and doing text replacements.
   1.229 +// Always scan complete UTF-8 characters
   1.230 +// Set number of bytes consumed from input, number filled to output.
   1.231 +// Return reason for exiting
   1.232 +// Also writes an optional OffsetMap. Pass NULL to skip writing one.
   1.233 +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,  // two-byte-entry state table
   1.234 +                    const StringPiece& istr,  // input text
   1.235 +                    StringPiece& ostr,  // output stringpiece that receives the copy
   1.236 +                    bool is_plain_text,
   1.237 +                    int* bytes_consumed,  // out: bytes consumed from input
   1.238 +                    int* bytes_filled,  // out: bytes filled to output
   1.239 +                    int* chars_changed,  // out: presumably count of characters replaced -- confirm
   1.240 +                    OffsetMap* offsetmap);  // optional; NULL skips writing one
   1.241 +
   1.242 +// Older version without offsetmap
   1.243 +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,  // two-byte-entry state table
   1.244 +                    const StringPiece& istr,  // input text
   1.245 +                    StringPiece& ostr,  // output stringpiece
   1.246 +                    bool is_plain_text,
   1.247 +                    int* bytes_consumed,  // out parameter (non-const pointer)
   1.248 +                    int* bytes_filled,  // out parameter (non-const pointer)
   1.249 +                    int* chars_changed);  // out parameter (non-const pointer)
   1.250 +
   1.251 +// Older version without is_plain_text or offsetmap
   1.252 +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,  // two-byte-entry state table
   1.253 +                    const StringPiece& istr,  // input text
   1.254 +                    StringPiece& ostr,  // output stringpiece
   1.255 +                    int* bytes_consumed,  // out parameter (non-const pointer)
   1.256 +                    int* bytes_filled,  // out parameter (non-const pointer)
   1.257 +                    int* chars_changed);  // out parameter (non-const pointer)
   1.258 +
   1.259 +
   1.260 +static const unsigned char kUTF8LenTbl[256] = {  // indexed by byte value; used by UTF8OneCharLen
   1.261 +  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x00..0x1F
   1.262 +  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x20..0x3F
   1.263 +  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x40..0x5F
   1.264 +  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x60..0x7F
   1.265 +
   1.266 +  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x80..0x9F
   1.267 +  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xA0..0xBF
   1.268 +  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xC0..0xDF
   1.269 +  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4  // 0xE0..0xEF: 3, 0xF0..0xFF: 4
   1.270 +};
   1.271 +
   1.272 +inline int UTF8OneCharLen(const char* in) {  // kUTF8LenTbl entry for the byte at *in
   1.273 +  return kUTF8LenTbl[static_cast<uint8>(*in)];
   1.274 +}
   1.275 +
   1.276 +// Adjust a stringpiece to encompass complete UTF-8 characters.
   1.277 +// The data pointer will be increased by 0..3 bytes to get to a character
   1.278 +// boundary, and the length will then be decreased by 0..3 bytes
   1.279 +// to encompass the last complete character.
   1.280 +// This is useful especially when a UTF-8 string must be put into a fixed-
   1.281 +// maximum-size buffer cleanly, such as a MySQL buffer.
   1.282 +void UTF8TrimToChars(StringPiece* istr);  // istr: in/out, adjusted in place
   1.283 +
   1.284 +}       // End namespace CLD2
   1.285 +
   1.286 +#endif  // UTIL_UTF8_UTF8STATETABLE_H_

mercurial