1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/utf8statetable.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,283 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// State Table follower for scanning UTF-8 strings without converting to 1.20 +// 32- or 16-bit Unicode values. 1.21 +// 1.22 +// Author: dsites@google.com (Dick Sites) 1.23 +// 1.24 + 1.25 +#ifndef UTIL_UTF8_UTF8STATETABLE_H_ 1.26 +#define UTIL_UTF8_UTF8STATETABLE_H_ 1.27 + 1.28 +#include <string> 1.29 +#include "integral_types.h" // for uint8, uint32, uint16 1.30 +#include "stringpiece.h" 1.31 + 1.32 + 1.33 +namespace CLD2 { 1.34 + 1.35 +class OffsetMap; 1.36 + 1.37 + 1.38 +// These four-byte entries compactly encode how many bytes 0..255 to delete 1.39 +// in making a string replacement, how many bytes to add 0..255, and the offset 1.40 +// 0..64k-1 of the replacement string in remap_string. 1.41 +struct RemapEntry { 1.42 + uint8 delete_bytes; 1.43 + uint8 add_bytes; 1.44 + uint16 bytes_offset; 1.45 +}; 1.46 + 1.47 +// Exit type codes for state tables. All but the first get stuffed into 1.48 +// signed one-byte entries. The first is only generated by executable code. 
// To distinguish from next-state entries, these must be contiguous and
// all <= kExitNone
//
// Every enumerator carries an explicit value so that inserting or reordering
// a name cannot silently renumber the codes that follow it.
typedef enum {
  kExitDstSpaceFull = 239,
  kExitIllegalStructure = 240,
  kExitOK = 241,
  kExitReject = 242,
  kExitReplace1 = 243,
  kExitReplace2 = 244,
  kExitReplace3 = 245,
  kExitReplace21 = 246,
  kExitReplace31 = 247,
  kExitReplace32 = 248,
  kExitReplaceOffset1 = 249,
  kExitReplaceOffset2 = 250,
  kExitReplace1S0 = 251,
  kExitSpecial = 252,
  kExitDoAgain = 253,
  kExitRejectAlt = 254,
  kExitNone = 255
} ExitReason;

// The same seventeen codes in the same order, renumbered to run from
// 0x7fff through 0x800f.
// NOTE(review): presumably these are the exit codes used with the two-byte
// state tables (UTF8StateMachineObj_2) -- confirm against the implementation.
typedef enum {
  kExitDstSpaceFull_2 = 32767,        // 0x7fff
  kExitIllegalStructure_2 = 32768,    // 0x8000
  kExitOK_2 = 32769,                  // 0x8001
  kExitReject_2 = 32770,              // 0x8002
  kExitReplace1_2 = 32771,            // 0x8003
  kExitReplace2_2 = 32772,            // 0x8004
  kExitReplace3_2 = 32773,            // 0x8005
  kExitReplace21_2 = 32774,           // 0x8006
  kExitReplace31_2 = 32775,           // 0x8007
  kExitReplace32_2 = 32776,           // 0x8008
  kExitReplaceOffset1_2 = 32777,      // 0x8009
  kExitReplaceOffset2_2 = 32778,      // 0x800a
  kExitReplace1S0_2 = 32779,          // 0x800b
  kExitSpecial_2 = 32780,             // 0x800c
  kExitDoAgain_2 = 32781,             // 0x800d
  kExitRejectAlt_2 = 32782,           // 0x800e
  kExitNone_2 = 32783                 // 0x800f
} ExitReason_2;


// This struct represents one entire state table. The three initialized byte
// areas are state_table, remap_base, and remap_string. state0 and state0_size
// give the byte offset and length within state_table of the initial state --
// table lookups are expected to start and end in this state, but for
// truncated UTF-8 strings, may end in a different state. These allow a quick
// test for that condition. entry_shift is 8 for tables subscripted by a full
// byte value and 6 for space-optimized tables subscripted by only six
// significant bits in UTF-8 continuation bytes.
1.100 +typedef struct { 1.101 + const uint32 state0; 1.102 + const uint32 state0_size; 1.103 + const uint32 total_size; 1.104 + const int max_expand; 1.105 + const int entry_shift; 1.106 + const int bytes_per_entry; 1.107 + const uint32 losub; 1.108 + const uint32 hiadd; 1.109 + const uint8* state_table; 1.110 + const RemapEntry* remap_base; 1.111 + const uint8* remap_string; 1.112 + const uint8* fast_state; 1.113 +} UTF8StateMachineObj; 1.114 + 1.115 +// Near-duplicate declaration for tables with two-byte entries 1.116 +typedef struct { 1.117 + const uint32 state0; 1.118 + const uint32 state0_size; 1.119 + const uint32 total_size; 1.120 + const int max_expand; 1.121 + const int entry_shift; 1.122 + const int bytes_per_entry; 1.123 + const uint32 losub; 1.124 + const uint32 hiadd; 1.125 + const unsigned short* state_table; 1.126 + const RemapEntry* remap_base; 1.127 + const uint8* remap_string; 1.128 + const uint8* fast_state; 1.129 +} UTF8StateMachineObj_2; 1.130 + 1.131 + 1.132 +typedef UTF8StateMachineObj UTF8PropObj; 1.133 +typedef UTF8StateMachineObj UTF8ScanObj; 1.134 +typedef UTF8StateMachineObj UTF8ReplaceObj; 1.135 +typedef UTF8StateMachineObj_2 UTF8PropObj_2; 1.136 +typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2; 1.137 +// NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2; 1.138 + 1.139 + 1.140 +// Look up property of one UTF-8 character and advance over it 1.141 +// Return 0 if input length is zero 1.142 +// Return 0 and advance one byte if input is ill-formed 1.143 +uint8 UTF8GenericProperty(const UTF8PropObj* st, 1.144 + const uint8** src, 1.145 + int* srclen); 1.146 + 1.147 +// Look up property of one UTF-8 character (assumed to be valid). 1.148 +// (This is a faster version of UTF8GenericProperty.) 1.149 +bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src); 1.150 + 1.151 + 1.152 +// BigOneByte versions are needed for tables > 240 states, but most 1.153 +// won't need the TwoByte versions. 
1.154 + 1.155 +// Look up property of one UTF-8 character and advance over it 1.156 +// Return 0 if input length is zero 1.157 +// Return 0 and advance one byte if input is ill-formed 1.158 +uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, 1.159 + const uint8** src, 1.160 + int* srclen); 1.161 + 1.162 + 1.163 +// TwoByte versions are needed for tables > 240 states that don't fit onto 1.164 +// BigOneByte -- rare ultimate fallback 1.165 + 1.166 +// Look up property of one UTF-8 character (assumed to be valid). 1.167 +// (This is a faster version of UTF8GenericProperty.) 1.168 +bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src); 1.169 + 1.170 +// Look up property of one UTF-8 character and advance over it 1.171 +// Return 0 if input length is zero 1.172 +// Return 0 and advance one byte if input is ill-formed 1.173 +uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st, 1.174 + const uint8** src, 1.175 + int* srclen); 1.176 + 1.177 +// Look up property of one UTF-8 character (assumed to be valid). 1.178 +// (This is a faster version of UTF8GenericProperty.) 1.179 +bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src); 1.180 + 1.181 +// Scan a UTF-8 stringpiece based on a state table. 1.182 +// Always scan complete UTF-8 characters 1.183 +// Set number of bytes scanned. Return reason for exiting 1.184 +int UTF8GenericScan(const UTF8ScanObj* st, 1.185 + const StringPiece& str, 1.186 + int* bytes_consumed); 1.187 + 1.188 + 1.189 + 1.190 +// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece 1.191 +// and doing text replacements. 1.192 +// Always scan complete UTF-8 characters 1.193 +// Set number of bytes consumed from input, number filled to output. 1.194 +// Return reason for exiting 1.195 +// Also writes an optional OffsetMap. Pass NULL to skip writing one. 
1.196 +int UTF8GenericReplace(const UTF8ReplaceObj* st, 1.197 + const StringPiece& istr, 1.198 + StringPiece& ostr, 1.199 + bool is_plain_text, 1.200 + int* bytes_consumed, 1.201 + int* bytes_filled, 1.202 + int* chars_changed, 1.203 + OffsetMap* offsetmap); 1.204 + 1.205 +// Older version without offsetmap 1.206 +int UTF8GenericReplace(const UTF8ReplaceObj* st, 1.207 + const StringPiece& istr, 1.208 + StringPiece& ostr, 1.209 + bool is_plain_text, 1.210 + int* bytes_consumed, 1.211 + int* bytes_filled, 1.212 + int* chars_changed); 1.213 + 1.214 +// Older version without is_plain_text or offsetmap 1.215 +int UTF8GenericReplace(const UTF8ReplaceObj* st, 1.216 + const StringPiece& istr, 1.217 + StringPiece& ostr, 1.218 + int* bytes_consumed, 1.219 + int* bytes_filled, 1.220 + int* chars_changed); 1.221 + 1.222 + 1.223 +// TwoByte version is needed for tables > about 256 states, such 1.224 +// as the table for full Unicode 4.1 canonical + compatibility mapping 1.225 + 1.226 +// Scan a UTF-8 stringpiece based on state table with two-byte entries, 1.227 +// copying to output stringpiece 1.228 +// and doing text replacements. 1.229 +// Always scan complete UTF-8 characters 1.230 +// Set number of bytes consumed from input, number filled to output. 1.231 +// Return reason for exiting 1.232 +// Also writes an optional OffsetMap. Pass NULL to skip writing one. 
1.233 +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, 1.234 + const StringPiece& istr, 1.235 + StringPiece& ostr, 1.236 + bool is_plain_text, 1.237 + int* bytes_consumed, 1.238 + int* bytes_filled, 1.239 + int* chars_changed, 1.240 + OffsetMap* offsetmap); 1.241 + 1.242 +// Older version without offsetmap 1.243 +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, 1.244 + const StringPiece& istr, 1.245 + StringPiece& ostr, 1.246 + bool is_plain_text, 1.247 + int* bytes_consumed, 1.248 + int* bytes_filled, 1.249 + int* chars_changed); 1.250 + 1.251 +// Older version without is_plain_text or offsetmap 1.252 +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, 1.253 + const StringPiece& istr, 1.254 + StringPiece& ostr, 1.255 + int* bytes_consumed, 1.256 + int* bytes_filled, 1.257 + int* chars_changed); 1.258 + 1.259 + 1.260 +static const unsigned char kUTF8LenTbl[256] = { 1.261 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.262 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.263 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.264 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.265 + 1.266 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.267 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.268 + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 1.269 + 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4 1.270 +}; 1.271 + 1.272 +inline int UTF8OneCharLen(const char* in) { 1.273 + return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)]; 1.274 +} 1.275 + 1.276 +// Adjust a stringpiece to encompass complete UTF-8 characters. 1.277 +// The data pointer will be increased by 0..3 bytes to get to a character 1.278 +// boundary, and the length will then be decreased by 0..3 bytes 1.279 +// to encompass the last complete character. 
1.280 +// This is useful especially when a UTF-8 string must be put into a fixed- 1.281 +// maximum-size buffer cleanly, such as a MySQL buffer. 1.282 +void UTF8TrimToChars(StringPiece* istr); 1.283 + 1.284 +} // End namespace CLD2 1.285 + 1.286 +#endif // UTIL_UTF8_UTF8STATETABLE_H_