browser/components/translation/cld2/internal/utf8statetable.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // State Table follower for scanning UTF-8 strings without converting to
michael@0 17 // 32- or 16-bit Unicode values.
michael@0 18 //
michael@0 19 // Author: dsites@google.com (Dick Sites)
michael@0 20 //
michael@0 21
michael@0 22 #ifndef UTIL_UTF8_UTF8STATETABLE_H_
michael@0 23 #define UTIL_UTF8_UTF8STATETABLE_H_
michael@0 24
michael@0 25 #include <string>
michael@0 26 #include "integral_types.h" // for uint8, uint32, uint16
michael@0 27 #include "stringpiece.h"
michael@0 28
michael@0 29
michael@0 30 namespace CLD2 {
michael@0 31
michael@0 32 class OffsetMap;
michael@0 33
michael@0 34
michael@0 35 // These four-byte entries compactly encode how many bytes 0..255 to delete
michael@0 36 // in making a string replacement, how many bytes to add 0..255, and the offset
michael@0 37 // 0..64k-1 of the replacement string in remap_string.
michael@0 38 struct RemapEntry {
michael@0 39 uint8 delete_bytes; // bytes to delete when making the replacement, 0..255
michael@0 40 uint8 add_bytes; // bytes to add when making the replacement, 0..255
michael@0 41 uint16 bytes_offset; // offset of the replacement string in remap_string, 0..64k-1
michael@0 42 };
michael@0 43
michael@0 44 // Exit type codes for state tables. All but the first get stuffed into
michael@0 45 // signed one-byte entries. The first is only generated by executable code.
michael@0 46 // To distinguish from next-state entries, these must be contiguous and
michael@0 47 // all <= kExitNone
michael@0 48 typedef enum { // NOTE(review): 240..255 do not fit a signed byte; state_table is declared const uint8* -- "signed" above looks stale, confirm
michael@0 49 kExitDstSpaceFull = 239, // the one code that is never stored in a table entry
michael@0 50 kExitIllegalStructure, // 240
michael@0 51 kExitOK, // 241
michael@0 52 kExitReject, // 242
michael@0 53 kExitReplace1, // 243
michael@0 54 kExitReplace2, // 244
michael@0 55 kExitReplace3, // 245
michael@0 56 kExitReplace21, // 246
michael@0 57 kExitReplace31, // 247
michael@0 58 kExitReplace32, // 248
michael@0 59 kExitReplaceOffset1, // 249
michael@0 60 kExitReplaceOffset2, // 250
michael@0 61 kExitReplace1S0, // 251
michael@0 62 kExitSpecial, // 252
michael@0 63 kExitDoAgain, // 253
michael@0 64 kExitRejectAlt, // 254
michael@0 65 kExitNone // 255
michael@0 66 } ExitReason;
michael@0 67
michael@0 68 typedef enum { // same code names as ExitReason with a _2 suffix, numbered from 0x7fff; presumably for two-byte table entries -- confirm
michael@0 69 kExitDstSpaceFull_2 = 32767, // 0x7fff
michael@0 70 kExitIllegalStructure_2, // 32768 0x8000
michael@0 71 kExitOK_2, // 32769 0x8001
michael@0 72 kExitReject_2, // 32770 0x8002
michael@0 73 kExitReplace1_2, // 32771 0x8003
michael@0 74 kExitReplace2_2, // 32772 0x8004
michael@0 75 kExitReplace3_2, // 32773 0x8005
michael@0 76 kExitReplace21_2, // 32774 0x8006
michael@0 77 kExitReplace31_2, // 32775 0x8007
michael@0 78 kExitReplace32_2, // 32776 0x8008
michael@0 79 kExitReplaceOffset1_2, // 32777 0x8009
michael@0 80 kExitReplaceOffset2_2, // 32778 0x800a
michael@0 81 kExitReplace1S0_2, // 32779 0x800b
michael@0 82 kExitSpecial_2, // 32780 0x800c
michael@0 83 kExitDoAgain_2, // 32781 0x800d
michael@0 84 kExitRejectAlt_2, // 32782 0x800e
michael@0 85 kExitNone_2 // 32783 0x800f
michael@0 86 } ExitReason_2;
michael@0 87
michael@0 88
michael@0 89 // This struct represents one entire state table. The three initialized byte
michael@0 90 // areas are state_table, remap_base, and remap_string. state0 and state0_size
michael@0 91 // give the byte offset and length within state_table of the initial state --
michael@0 92 // table lookups are expected to start and end in this state, but for
michael@0 93 // truncated UTF-8 strings, may end in a different state. These allow a quick
michael@0 94 // test for that condition. entry_shift is 8 for tables subscripted by a full
michael@0 95 // byte value and 6 for space-optimized tables subscripted by only six
michael@0 96 // significant bits in UTF-8 continuation bytes.
michael@0 97 typedef struct {
michael@0 98 const uint32 state0; // byte offset of the initial state within state_table
michael@0 99 const uint32 state0_size; // byte length of the initial state within state_table
michael@0 100 const uint32 total_size; // NOTE(review): presumably the total size of state_table -- confirm
michael@0 101 const int max_expand;
michael@0 102 const int entry_shift; // 8 = subscript by full byte value, 6 = subscript by six significant bits
michael@0 103 const int bytes_per_entry;
michael@0 104 const uint32 losub;
michael@0 105 const uint32 hiadd;
michael@0 106 const uint8* state_table; // initialized byte area: the state table itself, one-byte entries
michael@0 107 const RemapEntry* remap_base; // initialized byte area: RemapEntry records
michael@0 108 const uint8* remap_string; // initialized byte area: replacement strings located by RemapEntry::bytes_offset
michael@0 109 const uint8* fast_state;
michael@0 110 } UTF8StateMachineObj;
michael@0 111
michael@0 112 // Near-duplicate declaration for tables with two-byte entries
michael@0 113 typedef struct {
michael@0 114 const uint32 state0; // same member as in UTF8StateMachineObj
michael@0 115 const uint32 state0_size; // same member as in UTF8StateMachineObj
michael@0 116 const uint32 total_size;
michael@0 117 const int max_expand;
michael@0 118 const int entry_shift;
michael@0 119 const int bytes_per_entry;
michael@0 120 const uint32 losub;
michael@0 121 const uint32 hiadd;
michael@0 122 const unsigned short* state_table; // the only member whose type differs: two-byte entries
michael@0 123 const RemapEntry* remap_base;
michael@0 124 const uint8* remap_string;
michael@0 125 const uint8* fast_state;
michael@0 126 } UTF8StateMachineObj_2;
michael@0 127
michael@0 128
michael@0 129 typedef UTF8StateMachineObj UTF8PropObj; // taken by the UTF8GenericProperty / UTF8HasGenericProperty lookups (incl. BigOneByte)
michael@0 130 typedef UTF8StateMachineObj UTF8ScanObj; // taken by UTF8GenericScan
michael@0 131 typedef UTF8StateMachineObj UTF8ReplaceObj; // taken by UTF8GenericReplace
michael@0 132 typedef UTF8StateMachineObj_2 UTF8PropObj_2; // taken by the ...PropertyTwoByte lookups
michael@0 133 typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2; // taken by UTF8GenericReplaceTwoByte
michael@0 134 // NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
michael@0 135
michael@0 136
michael@0 137 // Look up property of one UTF-8 character and advance over it
michael@0 138 // Return 0 if input length is zero
michael@0 139 // Return 0 and advance one byte if input is ill-formed
michael@0 140 uint8 UTF8GenericProperty(const UTF8PropObj* st, // property state table to follow
michael@0 141 const uint8** src, // in/out: input position, advanced over the character
michael@0 142 int* srclen); // in/out: input length; presumably reduced by the bytes advanced -- confirm
michael@0 143
michael@0 144 // Look up property of one UTF-8 character (assumed to be valid).
michael@0 145 // (This is a faster version of UTF8GenericProperty.)
michael@0 146 bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src); // src is passed by value (caller's pointer is not advanced); no length argument is taken
michael@0 147
michael@0 148
michael@0 149 // BigOneByte versions are needed for tables > 240 states, but most
michael@0 150 // won't need the TwoByte versions.
michael@0 151
michael@0 152 // Look up property of one UTF-8 character and advance over it
michael@0 153 // Return 0 if input length is zero
michael@0 154 // Return 0 and advance one byte if input is ill-formed
michael@0 155 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, // property state table (same struct type as the plain version)
michael@0 156 const uint8** src, // in/out: input position, advanced over the character
michael@0 157 int* srclen); // in/out: input length; presumably reduced by the bytes advanced -- confirm
michael@0 158
michael@0 159
michael@0 160 // TwoByte versions (declared after the BigOneByte lookup just below) are
michael@0 161 // needed for tables > 240 states that don't fit onto BigOneByte -- rare ultimate fallback
michael@0 162
michael@0 163 // Look up property of one UTF-8 character (assumed to be valid).
michael@0 164 // (This is a faster version of UTF8GenericPropertyBigOneByte.)
michael@0 165 bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src); // src is passed by value (caller's pointer is not advanced); no length argument is taken
michael@0 166
michael@0 167 // Look up property of one UTF-8 character and advance over it
michael@0 168 // Return 0 if input length is zero
michael@0 169 // Return 0 and advance one byte if input is ill-formed
michael@0 170 uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st, // property state table with two-byte entries
michael@0 171 const uint8** src, // in/out: input position, advanced over the character
michael@0 172 int* srclen); // in/out: input length; presumably reduced by the bytes advanced -- confirm
michael@0 173
michael@0 174 // Look up property of one UTF-8 character (assumed to be valid).
michael@0 175 // (This is a faster version of UTF8GenericPropertyTwoByte.)
michael@0 176 bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src); // src is passed by value (caller's pointer is not advanced); no length argument is taken
michael@0 177
michael@0 178 // Scan a UTF-8 stringpiece based on a state table.
michael@0 179 // Always scan complete UTF-8 characters
michael@0 180 // Set number of bytes scanned. Return reason for exiting
michael@0 181 int UTF8GenericScan(const UTF8ScanObj* st, // scan state table to follow
michael@0 182 const StringPiece& str, // input text; not modified (const reference)
michael@0 183 int* bytes_consumed); // out: number of bytes scanned; return value is presumably an ExitReason code -- confirm
michael@0 184
michael@0 185
michael@0 186
michael@0 187 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
michael@0 188 // and doing text replacements.
michael@0 189 // Always scan complete UTF-8 characters
michael@0 190 // Set number of bytes consumed from input, number filled to output.
michael@0 191 // Return reason for exiting
michael@0 192 // Also writes an optional OffsetMap. Pass NULL to skip writing one.
michael@0 193 int UTF8GenericReplace(const UTF8ReplaceObj* st, // replacement state table to follow
michael@0 194 const StringPiece& istr, // input text; not modified (const reference)
michael@0 195 StringPiece& ostr, // output stringpiece that the text is copied to
michael@0 196 bool is_plain_text, // meaning not described in this header
michael@0 197 int* bytes_consumed, // out: number of bytes consumed from input
michael@0 198 int* bytes_filled, // out: number of bytes filled to output
michael@0 199 int* chars_changed, // out: presumably the count of characters replaced -- confirm
michael@0 200 OffsetMap* offsetmap); // optional; NULL skips writing the OffsetMap
michael@0 201
michael@0 202 // Older version without offsetmap
michael@0 203 int UTF8GenericReplace(const UTF8ReplaceObj* st,
michael@0 204 const StringPiece& istr,
michael@0 205 StringPiece& ostr,
michael@0 206 bool is_plain_text,
michael@0 207 int* bytes_consumed,
michael@0 208 int* bytes_filled,
michael@0 209 int* chars_changed); // same parameters as the full overload, minus offsetmap
michael@0 210
michael@0 211 // Older version without is_plain_text or offsetmap
michael@0 212 int UTF8GenericReplace(const UTF8ReplaceObj* st,
michael@0 213 const StringPiece& istr,
michael@0 214 StringPiece& ostr,
michael@0 215 int* bytes_consumed,
michael@0 216 int* bytes_filled,
michael@0 217 int* chars_changed); // same parameters as the full overload, minus is_plain_text and offsetmap
michael@0 218
michael@0 219
michael@0 220 // TwoByte version is needed for tables > about 256 states, such
michael@0 221 // as the table for full Unicode 4.1 canonical + compatibility mapping
michael@0 222
michael@0 223 // Scan a UTF-8 stringpiece based on state table with two-byte entries,
michael@0 224 // copying to output stringpiece
michael@0 225 // and doing text replacements.
michael@0 226 // Always scan complete UTF-8 characters
michael@0 227 // Set number of bytes consumed from input, number filled to output.
michael@0 228 // Return reason for exiting
michael@0 229 // Also writes an optional OffsetMap. Pass NULL to skip writing one.
michael@0 230 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, // replacement state table with two-byte entries
michael@0 231 const StringPiece& istr, // input text; not modified (const reference)
michael@0 232 StringPiece& ostr, // output stringpiece that the text is copied to
michael@0 233 bool is_plain_text, // meaning not described in this header
michael@0 234 int* bytes_consumed, // out: number of bytes consumed from input
michael@0 235 int* bytes_filled, // out: number of bytes filled to output
michael@0 236 int* chars_changed, // out: presumably the count of characters replaced -- confirm
michael@0 237 OffsetMap* offsetmap); // optional; NULL skips writing the OffsetMap
michael@0 238
michael@0 239 // Older version without offsetmap
michael@0 240 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
michael@0 241 const StringPiece& istr,
michael@0 242 StringPiece& ostr,
michael@0 243 bool is_plain_text,
michael@0 244 int* bytes_consumed,
michael@0 245 int* bytes_filled,
michael@0 246 int* chars_changed); // same parameters as the full overload, minus offsetmap
michael@0 247
michael@0 248 // Older version without is_plain_text or offsetmap
michael@0 249 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
michael@0 250 const StringPiece& istr,
michael@0 251 StringPiece& ostr,
michael@0 252 int* bytes_consumed,
michael@0 253 int* bytes_filled,
michael@0 254 int* chars_changed); // same parameters as the full overload, minus is_plain_text and offsetmap
michael@0 255
michael@0 256
michael@0 257 static const unsigned char kUTF8LenTbl[256] = { // length value for each possible byte value, indexed by that byte (see UTF8OneCharLen)
michael@0 258 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x00..0x1F
michael@0 259 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x20..0x3F
michael@0 260 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x40..0x5F
michael@0 261 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x60..0x7F
michael@0 262
michael@0 263 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x80..0x9F: also 1
michael@0 264 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0xA0..0xBF: also 1
michael@0 265 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, // 0xC0..0xDF
michael@0 266 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4 // 0xE0..0xEF -> 3, 0xF0..0xFF -> 4
michael@0 267 };
michael@0 268
michael@0 269 inline int UTF8OneCharLen(const char* in) { // returns the kUTF8LenTbl value (1..4) for the single byte at *in
michael@0 270 return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)]; // read the byte as unsigned so values >= 0x80 index 128..255; only this one byte is read
michael@0 271 }
michael@0 272
michael@0 273 // Adjust a stringpiece to encompass complete UTF-8 characters.
michael@0 274 // The data pointer will be increased by 0..3 bytes to get to a character
michael@0 275 // boundary, and the length will then be decreased by 0..3 bytes
michael@0 276 // to encompass the last complete character.
michael@0 277 // This is useful especially when a UTF-8 string must be put into a fixed-
michael@0 278 // maximum-size buffer cleanly, such as a MySQL buffer.
michael@0 279 void UTF8TrimToChars(StringPiece* istr); // in/out: istr is adjusted in place; nothing is returned
michael@0 280
michael@0 281 } // End namespace CLD2
michael@0 282
michael@0 283 #endif // UTIL_UTF8_UTF8STATETABLE_H_

mercurial