browser/components/translation/cld2/internal/utf8statetable.h

branch
TOR_BUG_9701
changeset 15
b8a032363ba2
equal deleted inserted replaced
-1:000000000000 0:95d9d5ea8c94
1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 //
16 // State Table follower for scanning UTF-8 strings without converting to
17 // 32- or 16-bit Unicode values.
18 //
19 // Author: dsites@google.com (Dick Sites)
20 //
21
22 #ifndef UTIL_UTF8_UTF8STATETABLE_H_
23 #define UTIL_UTF8_UTF8STATETABLE_H_
24
25 #include <string>
26 #include "integral_types.h" // for uint8, uint32, uint16
27 #include "stringpiece.h"
28
29
30 namespace CLD2 {
31
32 class OffsetMap;
33
34
35 // These four-byte entries compactly encode how many bytes 0..255 to delete
36 // in making a string replacement, how many bytes to add 0..255, and the offset
37 // 0..64k-1 of the replacement string in remap_string.
38 struct RemapEntry {  // one four-byte replacement descriptor
39 uint8 delete_bytes;  // how many bytes (0..255) to delete when making the string replacement
40 uint8 add_bytes;  // how many bytes (0..255) to add when making the string replacement
41 uint16 bytes_offset;  // offset (0..64k-1) of the replacement string in remap_string
42 };
43
44 // Exit type codes for state tables. All but the first get stuffed into
45 // signed one-byte entries. The first is only generated by executable code.
46 // To distinguish from next-state entries, these must be contiguous and
47 // all <= kExitNone
48 typedef enum {  // only the first enumerator has an explicit value; each later one is the previous value + 1
49 kExitDstSpaceFull = 239,  // 239 -- per the comment above, the one code only generated by executable code
50 kExitIllegalStructure, // 240
51 kExitOK, // 241
52 kExitReject, // 242
53 kExitReplace1, // 243
54 kExitReplace2, // 244
55 kExitReplace3, // 245
56 kExitReplace21, // 246
57 kExitReplace31, // 247
58 kExitReplace32, // 248
59 kExitReplaceOffset1, // 249
60 kExitReplaceOffset2, // 250
61 kExitReplace1S0, // 251
62 kExitSpecial, // 252
63 kExitDoAgain, // 253
64 kExitRejectAlt, // 254
65 kExitNone // 255
66 } ExitReason;
67
68 typedef enum {  // same enumerators, in the same order, as ExitReason with a _2 suffix; NOTE(review): presumably for the two-byte-entry tables -- confirm
69 kExitDstSpaceFull_2 = 32767, // 32767 0x7fff (only explicit value; each later one is the previous value + 1)
70 kExitIllegalStructure_2, // 32768 0x8000
71 kExitOK_2, // 32769 0x8001
72 kExitReject_2, // 32770 0x8002
73 kExitReplace1_2, // 32771 0x8003
74 kExitReplace2_2, // 32772 0x8004
75 kExitReplace3_2, // 32773 0x8005
76 kExitReplace21_2, // 32774 0x8006
77 kExitReplace31_2, // 32775 0x8007
78 kExitReplace32_2, // 32776 0x8008
79 kExitReplaceOffset1_2, // 32777 0x8009
80 kExitReplaceOffset2_2, // 32778 0x800a
81 kExitReplace1S0_2, // 32779 0x800b
82 kExitSpecial_2, // 32780 0x800c
83 kExitDoAgain_2, // 32781 0x800d
84 kExitRejectAlt_2, // 32782 0x800e
85 kExitNone_2 // 32783 0x800f
86 } ExitReason_2;
87
88
89 // This struct represents one entire state table. The three initialized byte
90 // areas are state_table, remap_base, and remap_string. state0 and state0_size
91 // give the byte offset and length within state_table of the initial state --
92 // table lookups are expected to start and end in this state, but for
93 // truncated UTF-8 strings, may end in a different state. These allow a quick
94 // test for that condition. entry_shift is 8 for tables subscripted by a full
95 // byte value and 6 for space-optimized tables subscripted by only six
96 // significant bits in UTF-8 continuation bytes.
97 typedef struct {  // one entire state table whose state_table entries are one byte (uint8) wide
98 const uint32 state0;  // byte offset within state_table of the initial state
99 const uint32 state0_size;  // length within state_table of the initial state
100 const uint32 total_size;  // NOTE(review): not described in this header; presumably the total size of state_table -- confirm
101 const int max_expand;  // NOTE(review): not described in this header; presumably worst-case output growth from replacements -- confirm
102 const int entry_shift;  // 8 for tables subscripted by a full byte value; 6 for space-optimized tables subscripted by six significant bits
103 const int bytes_per_entry;  // NOTE(review): not described in this header; presumably the width of one state_table entry -- confirm
104 const uint32 losub;  // NOTE(review): meaning not documented in this header -- confirm against the implementation
105 const uint32 hiadd;  // NOTE(review): meaning not documented in this header -- confirm against the implementation
106 const uint8* state_table;  // initialized byte area: the state table itself
107 const RemapEntry* remap_base;  // initialized area of RemapEntry replacement descriptors
108 const uint8* remap_string;  // initialized byte area holding replacement strings (located via RemapEntry::bytes_offset)
109 const uint8* fast_state;  // NOTE(review): meaning not documented in this header -- confirm against the implementation
110 } UTF8StateMachineObj;
111
112 // Near-duplicate declaration for tables with two-byte entries
113 typedef struct {  // same field list as UTF8StateMachineObj, except state_table points at two-byte (unsigned short) entries
114 const uint32 state0;  // offset within state_table of the initial state
115 const uint32 state0_size;  // length within state_table of the initial state
116 const uint32 total_size;  // NOTE(review): not described in this header; presumably the total size of state_table -- confirm
117 const int max_expand;  // NOTE(review): not described in this header; presumably worst-case output growth from replacements -- confirm
118 const int entry_shift;  // 8 for tables subscripted by a full byte value; 6 for space-optimized tables subscripted by six significant bits
119 const int bytes_per_entry;  // NOTE(review): not described in this header; presumably the width of one state_table entry -- confirm
120 const uint32 losub;  // NOTE(review): meaning not documented in this header -- confirm against the implementation
121 const uint32 hiadd;  // NOTE(review): meaning not documented in this header -- confirm against the implementation
122 const unsigned short* state_table;  // the state table itself, with two-byte entries
123 const RemapEntry* remap_base;  // initialized area of RemapEntry replacement descriptors
124 const uint8* remap_string;  // initialized byte area holding replacement strings (located via RemapEntry::bytes_offset)
125 const uint8* fast_state;  // NOTE(review): meaning not documented in this header -- confirm against the implementation
126 } UTF8StateMachineObj_2;
127
128
129 typedef UTF8StateMachineObj UTF8PropObj;  // table type taken by UTF8GenericProperty, UTF8HasGenericProperty and their BigOneByte variants
130 typedef UTF8StateMachineObj UTF8ScanObj;  // table type taken by UTF8GenericScan
131 typedef UTF8StateMachineObj UTF8ReplaceObj;  // table type taken by the UTF8GenericReplace overloads
132 typedef UTF8StateMachineObj_2 UTF8PropObj_2;  // table type taken by UTF8GenericPropertyTwoByte and UTF8HasGenericPropertyTwoByte
133 typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;  // table type taken by the UTF8GenericReplaceTwoByte overloads
134 // NOT IMPLEMENTED: typedef UTF8StateMachineObj_2 UTF8ScanObj_2;  (this header declares no two-byte scan function)
135
136
137 // Look up the property of one UTF-8 character using state table st, and advance over it.
138 // Returns 0 if the input length is zero.
139 // Returns 0 and advances one byte if the input is ill-formed.
140 uint8 UTF8GenericProperty(const UTF8PropObj* st,  // state table to follow
141 const uint8** src,  // in/out: pointer to the current input position
142 int* srclen);  // in/out: input length; NOTE(review): presumably reduced as *src advances -- confirm
143
144 // Look up the property of the one UTF-8 character at src (assumed to be valid).
145 // (This is a faster version of UTF8GenericProperty.) It takes no length and no in/out position, so the caller's pointer is not advanced.
146 bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
147
148
149 // BigOneByte versions are needed for tables > 240 states, but most
150 // won't need the TwoByte versions.
151
152 // BigOneByte variant: look up the property of one UTF-8 character using state table st, and advance over it.
153 // Returns 0 if the input length is zero.
154 // Returns 0 and advances one byte if the input is ill-formed.
155 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,  // state table to follow
156 const uint8** src,  // in/out: pointer to the current input position
157 int* srclen);  // in/out: input length; NOTE(review): presumably reduced as *src advances -- confirm
158
159
160 // TwoByte versions (UTF8GenericPropertyTwoByte and UTF8HasGenericPropertyTwoByte, declared
161 // further below) are needed for tables > 240 states that don't fit onto BigOneByte -- rare ultimate fallback
162
163 // BigOneByte variant: look up the property of the one UTF-8 character at src (assumed to be valid).
164 // (This is a faster version of UTF8GenericProperty.) NOTE(review): presumably the faster counterpart of UTF8GenericPropertyBigOneByte specifically -- confirm.
165 bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);
166
167 // TwoByte variant: look up the property of one UTF-8 character using two-byte-entry table st, and advance over it.
168 // Returns 0 if the input length is zero.
169 // Returns 0 and advances one byte if the input is ill-formed.
170 uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,  // state table (two-byte entries) to follow
171 const uint8** src,  // in/out: pointer to the current input position
172 int* srclen);  // in/out: input length; NOTE(review): presumably reduced as *src advances -- confirm
173
174 // TwoByte variant: look up the property of the one UTF-8 character at src (assumed to be valid).
175 // (This is a faster version of UTF8GenericProperty.) NOTE(review): presumably the faster counterpart of UTF8GenericPropertyTwoByte specifically -- confirm.
176 bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
177
178 // Scan a UTF-8 stringpiece based on a state table.
179 // Always scans complete UTF-8 characters.
180 // Sets the number of bytes scanned. Returns the reason for exiting (NOTE(review): presumably an ExitReason value carried in an int -- confirm).
181 int UTF8GenericScan(const UTF8ScanObj* st,  // state table to follow
182 const StringPiece& str,  // input text to scan
183 int* bytes_consumed);  // out: number of bytes scanned
184
185
186
187 // Scan a UTF-8 stringpiece based on a state table, copying to an output stringpiece
188 // and doing text replacements.
189 // Always scans complete UTF-8 characters.
190 // Sets the number of bytes consumed from input and the number filled to output.
191 // Returns the reason for exiting (NOTE(review): presumably an ExitReason value carried in an int -- confirm).
192 // Also writes an optional OffsetMap. Pass NULL to skip writing one.
193 int UTF8GenericReplace(const UTF8ReplaceObj* st,  // state table to follow
194 const StringPiece& istr,  // input text
195 StringPiece& ostr,  // output stringpiece that text is copied into
196 bool is_plain_text,  // NOTE(review): effect not documented in this header -- confirm against the implementation
197 int* bytes_consumed,  // out: number of bytes consumed from input
198 int* bytes_filled,  // out: number of bytes filled to output
199 int* chars_changed,  // out: NOTE(review): presumably the count of characters replaced -- confirm
200 OffsetMap* offsetmap);  // optional OffsetMap to write; NULL to skip
201
202 // Older version without offsetmap; the remaining parameters match the full overload.
203 int UTF8GenericReplace(const UTF8ReplaceObj* st,
204 const StringPiece& istr,
205 StringPiece& ostr,
206 bool is_plain_text,
207 int* bytes_consumed,
208 int* bytes_filled,
209 int* chars_changed);
210
211 // Older version without is_plain_text or offsetmap; the remaining parameters match the full overload.
212 int UTF8GenericReplace(const UTF8ReplaceObj* st,
213 const StringPiece& istr,
214 StringPiece& ostr,
215 int* bytes_consumed,
216 int* bytes_filled,
217 int* chars_changed);
218
219
220 // TwoByte version is needed for tables > about 256 states, such
221 // as the table for full Unicode 4.1 canonical + compatibility mapping
222
223 // Scan a UTF-8 stringpiece based on a state table with two-byte entries,
224 // copying to an output stringpiece
225 // and doing text replacements.
226 // Always scans complete UTF-8 characters.
227 // Sets the number of bytes consumed from input and the number filled to output.
228 // Returns the reason for exiting (NOTE(review): presumably an ExitReason_2 or ExitReason value carried in an int -- confirm which).
229 // Also writes an optional OffsetMap. Pass NULL to skip writing one.
230 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,  // state table (two-byte entries) to follow
231 const StringPiece& istr,  // input text
232 StringPiece& ostr,  // output stringpiece that text is copied into
233 bool is_plain_text,  // NOTE(review): effect not documented in this header -- confirm against the implementation
234 int* bytes_consumed,  // out: number of bytes consumed from input
235 int* bytes_filled,  // out: number of bytes filled to output
236 int* chars_changed,  // out: NOTE(review): presumably the count of characters replaced -- confirm
237 OffsetMap* offsetmap);  // optional OffsetMap to write; NULL to skip
238
239 // Older version without offsetmap; the remaining parameters match the full overload.
240 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
241 const StringPiece& istr,
242 StringPiece& ostr,
243 bool is_plain_text,
244 int* bytes_consumed,
245 int* bytes_filled,
246 int* chars_changed);
247
248 // Older version without is_plain_text or offsetmap; the remaining parameters match the full overload.
249 int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
250 const StringPiece& istr,
251 StringPiece& ostr,
252 int* bytes_consumed,
253 int* bytes_filled,
254 int* chars_changed);
255
256
257 static const unsigned char kUTF8LenTbl[256] = {  // length value (1..4) per byte value 0x00..0xFF; each row below covers 32 consecutive byte values
258 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x00..0x1F -> 1
259 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x20..0x3F -> 1
260 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x40..0x5F -> 1
261 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x60..0x7F -> 1
262
263 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x80..0x9F -> 1 (UTF-8 continuation-byte values also map to 1)
264 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xA0..0xBF -> 1 (UTF-8 continuation-byte values also map to 1)
265 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xC0..0xDF -> 2
266 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4  // 0xE0..0xEF -> 3, 0xF0..0xFF -> 4 (including 0xF8..0xFF)
267 };
268
269 inline int UTF8OneCharLen(const char* in) {  // returns the kUTF8LenTbl entry (1..4) for the first byte at in
270 return kUTF8LenTbl[reinterpret_cast<const uint8*>(in)[0]];  // read the byte as uint8 so the table index is never a negative char value
271 }
272
273 // Adjust a stringpiece, in place, so that it encompasses only complete UTF-8 characters.
274 // The data pointer will be increased by 0..3 bytes to get to a character
275 // boundary, and the length will then be decreased by 0..3 bytes
276 // to encompass the last complete character.
277 // This is especially useful when a UTF-8 string must be put cleanly into a
278 // fixed-maximum-size buffer, such as a MySQL buffer.
279 void UTF8TrimToChars(StringPiece* istr);  // istr: in/out stringpiece whose data pointer and length are adjusted
280
281 } // End namespace CLD2
282
283 #endif // UTIL_UTF8_UTF8STATETABLE_H_

mercurial