Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 //
16 // State Table follower for scanning UTF-8 strings without converting to
17 // 32- or 16-bit Unicode values.
18 //
19 // Author: dsites@google.com (Dick Sites)
20 //
22 #ifndef UTIL_UTF8_UTF8STATETABLE_H_
23 #define UTIL_UTF8_UTF8STATETABLE_H_
25 #include <string>
26 #include "integral_types.h" // for uint8, uint32, uint16
27 #include "stringpiece.h"
30 namespace CLD2 {
32 class OffsetMap;
35 // These four-byte entries compactly encode how many bytes 0..255 to delete
36 // in making a string replacement, how many bytes to add 0..255, and the offset
37 // 0..64k-1 of the replacement string in remap_string.
38 struct RemapEntry {
39 uint8 delete_bytes;
40 uint8 add_bytes;
41 uint16 bytes_offset;
42 };
// Exit codes for state tables.
// kExitDstSpaceFull is only generated by executable code; all the other
// codes are stored in one-byte table entries. So that they can be told apart
// from next-state entries, the codes must be contiguous and all <= kExitNone.
typedef enum {
  kExitDstSpaceFull     = 239,
  kExitIllegalStructure = 240,
  kExitOK               = 241,
  kExitReject           = 242,
  kExitReplace1         = 243,
  kExitReplace2         = 244,
  kExitReplace3         = 245,
  kExitReplace21        = 246,
  kExitReplace31        = 247,
  kExitReplace32        = 248,
  kExitReplaceOffset1   = 249,
  kExitReplaceOffset2   = 250,
  kExitReplace1S0       = 251,
  kExitSpecial          = 252,
  kExitDoAgain          = 253,
  kExitRejectAlt        = 254,
  kExitNone             = 255
} ExitReason;
// Exit codes numbered from 0x7fff through 0x800f, in the same order as the
// ExitReason codes.
// NOTE(review): presumably these are the codes used with state tables whose
// entries are two bytes wide (UTF8StateMachineObj_2) -- confirm.
typedef enum {
  kExitDstSpaceFull_2     = 32767,  // 0x7fff
  kExitIllegalStructure_2 = 32768,  // 0x8000
  kExitOK_2               = 32769,  // 0x8001
  kExitReject_2           = 32770,  // 0x8002
  kExitReplace1_2         = 32771,  // 0x8003
  kExitReplace2_2         = 32772,  // 0x8004
  kExitReplace3_2         = 32773,  // 0x8005
  kExitReplace21_2        = 32774,  // 0x8006
  kExitReplace31_2        = 32775,  // 0x8007
  kExitReplace32_2        = 32776,  // 0x8008
  kExitReplaceOffset1_2   = 32777,  // 0x8009
  kExitReplaceOffset2_2   = 32778,  // 0x800a
  kExitReplace1S0_2       = 32779,  // 0x800b
  kExitSpecial_2          = 32780,  // 0x800c
  kExitDoAgain_2          = 32781,  // 0x800d
  kExitRejectAlt_2        = 32782,  // 0x800e
  kExitNone_2             = 32783   // 0x800f
} ExitReason_2;
89 // This struct represents one entire state table. The three initialized byte
90 // areas are state_table, remap_base, and remap_string. state0 and state0_size
91 // give the byte offset and length within state_table of the initial state --
92 // table lookups are expected to start and end in this state, but for
93 // truncated UTF-8 strings, may end in a different state. These allow a quick
94 // test for that condition. entry_shift is 8 for tables subscripted by a full
95 // byte value and 6 for space-optimized tables subscripted by only six
96 // significant bits in UTF-8 continuation bytes.
97 typedef struct {
98 const uint32 state0;
99 const uint32 state0_size;
100 const uint32 total_size;
101 const int max_expand;
102 const int entry_shift;
103 const int bytes_per_entry;
104 const uint32 losub;
105 const uint32 hiadd;
106 const uint8* state_table;
107 const RemapEntry* remap_base;
108 const uint8* remap_string;
109 const uint8* fast_state;
110 } UTF8StateMachineObj;
112 // Near-duplicate declaration for tables with two-byte entries
113 typedef struct {
114 const uint32 state0;
115 const uint32 state0_size;
116 const uint32 total_size;
117 const int max_expand;
118 const int entry_shift;
119 const int bytes_per_entry;
120 const uint32 losub;
121 const uint32 hiadd;
122 const unsigned short* state_table;
123 const RemapEntry* remap_base;
124 const uint8* remap_string;
125 const uint8* fast_state;
126 } UTF8StateMachineObj_2;
129 typedef UTF8StateMachineObj UTF8PropObj;
130 typedef UTF8StateMachineObj UTF8ScanObj;
131 typedef UTF8StateMachineObj UTF8ReplaceObj;
132 typedef UTF8StateMachineObj_2 UTF8PropObj_2;
133 typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;
134 // NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
137 // Look up property of one UTF-8 character and advance over it
138 // Return 0 if input length is zero
139 // Return 0 and advance one byte if input is ill-formed
140 uint8 UTF8GenericProperty(const UTF8PropObj* st,
141 const uint8** src,
142 int* srclen);
144 // Look up property of one UTF-8 character (assumed to be valid).
145 // (This is a faster version of UTF8GenericProperty.)
146 bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
149 // BigOneByte versions are needed for tables > 240 states, but most
150 // won't need the TwoByte versions.
152 // Look up property of one UTF-8 character and advance over it
153 // Return 0 if input length is zero
154 // Return 0 and advance one byte if input is ill-formed
155 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
156 const uint8** src,
157 int* srclen);
// TwoByte versions (declared further down, after
// UTF8HasGenericPropertyBigOneByte) are needed for tables > 240 states that
// don't fit onto BigOneByte -- rare ultimate fallback
163 // Look up property of one UTF-8 character (assumed to be valid).
164 // (This is a faster version of UTF8GenericProperty.)
165 bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);
167 // Look up property of one UTF-8 character and advance over it
168 // Return 0 if input length is zero
169 // Return 0 and advance one byte if input is ill-formed
170 uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
171 const uint8** src,
172 int* srclen);
174 // Look up property of one UTF-8 character (assumed to be valid).
175 // (This is a faster version of UTF8GenericProperty.)
176 bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
178 // Scan a UTF-8 stringpiece based on a state table.
179 // Always scan complete UTF-8 characters
180 // Set number of bytes scanned. Return reason for exiting
181 int UTF8GenericScan(const UTF8ScanObj* st,
182 const StringPiece& str,
183 int* bytes_consumed);
187 // Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
188 // and doing text replacements.
189 // Always scan complete UTF-8 characters
190 // Set number of bytes consumed from input, number filled to output.
191 // Return reason for exiting
192 // Also writes an optional OffsetMap. Pass NULL to skip writing one.
193 int UTF8GenericReplace(const UTF8ReplaceObj* st,
194 const StringPiece& istr,
195 StringPiece& ostr,
196 bool is_plain_text,
197 int* bytes_consumed,
198 int* bytes_filled,
199 int* chars_changed,
200 OffsetMap* offsetmap);
202 // Older version without offsetmap
203 int UTF8GenericReplace(const UTF8ReplaceObj* st,
204 const StringPiece& istr,
205 StringPiece& ostr,
206 bool is_plain_text,
207 int* bytes_consumed,
208 int* bytes_filled,
209 int* chars_changed);
211 // Older version without is_plain_text or offsetmap
212 int UTF8GenericReplace(const UTF8ReplaceObj* st,
213 const StringPiece& istr,
214 StringPiece& ostr,
215 int* bytes_consumed,
216 int* bytes_filled,
217 int* chars_changed);
220 // TwoByte version is needed for tables > about 256 states, such
221 // as the table for full Unicode 4.1 canonical + compatibility mapping
223 // Scan a UTF-8 stringpiece based on state table with two-byte entries,
224 // copying to output stringpiece
225 // and doing text replacements.
226 // Always scan complete UTF-8 characters
227 // Set number of bytes consumed from input, number filled to output.
228 // Return reason for exiting
229 // Also writes an optional OffsetMap. Pass NULL to skip writing one.
230 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
231 const StringPiece& istr,
232 StringPiece& ostr,
233 bool is_plain_text,
234 int* bytes_consumed,
235 int* bytes_filled,
236 int* chars_changed,
237 OffsetMap* offsetmap);
239 // Older version without offsetmap
240 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
241 const StringPiece& istr,
242 StringPiece& ostr,
243 bool is_plain_text,
244 int* bytes_consumed,
245 int* bytes_filled,
246 int* chars_changed);
248 // Older version without is_plain_text or offsetmap
249 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
250 const StringPiece& istr,
251 StringPiece& ostr,
252 int* bytes_consumed,
253 int* bytes_filled,
254 int* chars_changed);
// Table indexed by a byte value 0..255. Entries are 1 for 0x00..0xBF,
// 2 for 0xC0..0xDF, 3 for 0xE0..0xEF and 4 for 0xF0..0xFF.
// NOTE(review): presumably the byte length of the UTF-8 character that starts
// with the indexing byte -- confirm against callers.
static const unsigned char kUTF8LenTbl[256] = {
  // 0x00..0x7F
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  // 0x80..0xBF
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  // 0xC0..0xDF
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
  // 0xE0..0xEF
  3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
  // 0xF0..0xFF
  4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
};

// Returns the kUTF8LenTbl entry selected by the first byte at `in`.
// The byte is read as an unsigned value so that bytes >= 0x80 index the
// upper half of the table.
inline int UTF8OneCharLen(const char* in) {
  const unsigned char* first = reinterpret_cast<const unsigned char*>(in);
  return kUTF8LenTbl[*first];
}
273 // Adjust a stringpiece to encompass complete UTF-8 characters.
274 // The data pointer will be increased by 0..3 bytes to get to a character
275 // boundary, and the length will then be decreased by 0..3 bytes
276 // to encompass the last complete character.
277 // This is useful especially when a UTF-8 string must be put into a fixed-
278 // maximum-size buffer cleanly, such as a MySQL buffer.
279 void UTF8TrimToChars(StringPiece* istr);
281 } // End namespace CLD2
283 #endif // UTIL_UTF8_UTF8STATETABLE_H_