|
1 // Copyright 2013 Google Inc. All Rights Reserved. |
|
2 // |
|
3 // Licensed under the Apache License, Version 2.0 (the "License"); |
|
4 // you may not use this file except in compliance with the License. |
|
5 // You may obtain a copy of the License at |
|
6 // |
|
7 // http://www.apache.org/licenses/LICENSE-2.0 |
|
8 // |
|
9 // Unless required by applicable law or agreed to in writing, software |
|
10 // distributed under the License is distributed on an "AS IS" BASIS, |
|
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
12 // See the License for the specific language governing permissions and |
|
13 // limitations under the License. |
|
14 |
|
15 // |
|
16 // Author: dsites@google.com (Dick Sites) |
|
17 // |
|
18 |
|
19 |
|
20 #include "getonescriptspan.h" |
|
21 #include <string.h> |
|
22 |
|
23 #include "fixunicodevalue.h" |
|
24 #include "lang_script.h" |
|
25 #include "port.h" |
|
26 #include "utf8statetable.h" |
|
27 |
|
28 #include "utf8prop_lettermarkscriptnum.h" |
|
29 #include "utf8repl_lettermarklower.h" |
|
30 #include "utf8scannot_lettermarkspecial.h" |
|
31 |
|
32 |
|
33 namespace CLD2 { |
|
34 |
|
35 // Alphabetical order for binary search, from |
|
36 // generated_entities.cc |
|
37 extern const int kNameToEntitySize; |
|
38 extern const CharIntPair kNameToEntity[]; |
|
39 |
|
// Spans shorter than this many bytes are made longer to reach a word
// boundary; spans at or above it are made shorter instead.
static const int kMaxUpToWordBoundary = 50;

// Maximum number of bytes to move, in the direction chosen above, when
// rounding a span end to a word boundary.
static const int kMaxAdvanceToWordBoundary = 10;
|
45 |
|
// Byte-indexed lookup table: nonzero only for the three HTML-special ASCII
// characters '&' (0x26), '<' (0x3C) and '>' (0x3E).
static const char kSpecialSymbol[256] = {
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x00
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x10
  0,0,0,0, 0,0,1,0, 0,0,0,0, 0,0,0,0,   // 0x20  '&'
  0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,1,0,   // 0x30  '<' '>'
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x40
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x50
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x60
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x70

  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x80
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x90
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xA0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xB0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xC0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xD0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xE0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xF0
};
|
57 |
|
58 |
|
59 |
|
60 #define LT 0 // < |
|
61 #define GT 1 // > |
|
62 #define EX 2 // ! |
|
63 #define HY 3 // - |
|
64 #define QU 4 // " |
|
65 #define AP 5 // ' |
|
66 #define SL 6 // / |
|
67 #define S_ 7 |
|
68 #define C_ 8 |
|
69 #define R_ 9 |
|
70 #define I_ 10 |
|
71 #define P_ 11 |
|
72 #define T_ 12 |
|
73 #define Y_ 13 |
|
74 #define L_ 14 |
|
75 #define E_ 15 |
|
76 #define CR 16 // <cr> or <lf> |
|
77 #define NL 17 // non-letter: ASCII whitespace, digit, punctuation |
|
78 #define PL 18 // possible letter, incl. & |
|
79 #define xx 19 // <unused> |
|
80 |
|
81 // Map byte to one of ~20 interesting categories for cheap tag parsing |
|
82 static const uint8 kCharToSub[256] = { |
|
83 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL, |
|
84 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, |
|
85 NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL, |
|
86 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL, |
|
87 |
|
88 PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, |
|
89 P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, |
|
90 PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, |
|
91 P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, |
|
92 |
|
93 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, |
|
94 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, |
|
95 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, |
|
96 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, |
|
97 |
|
98 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, |
|
99 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, |
|
100 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, |
|
101 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, |
|
102 }; |
|
103 |
|
104 #undef LT |
|
105 #undef GT |
|
106 #undef EX |
|
107 #undef HY |
|
108 #undef QU |
|
109 #undef AP |
|
110 #undef SL |
|
111 #undef S_ |
|
112 #undef C_ |
|
113 #undef R_ |
|
114 #undef I_ |
|
115 #undef P_ |
|
116 #undef T_ |
|
117 #undef Y_ |
|
118 #undef L_ |
|
119 #undef E_ |
|
120 #undef CR |
|
121 #undef NL |
|
122 #undef PL |
|
123 #undef xx |
|
124 |
|
125 |
|
126 #define OK 0 |
|
127 #define X_ 1 |
|
128 |
|
129 |
|
130 static const int kMaxExitStateLettersMarksOnly = 1; |
|
131 static const int kMaxExitStateAllText = 2; |
|
132 |
|
133 |
|
134 // State machine to do cheap parse of non-letter strings incl. tags |
|
135 // advances <tag> |
|
136 // | | |
|
137 // advances <tag> ... </tag> for <script> <style> |
|
138 // | | |
|
139 // advances <!-- ... <tag> ... --> |
|
140 // | | |
|
141 // advances <tag |
|
142 // || (0) |
|
143 // advances <tag <tag2> |
|
144 // || (0) |
|
145 // |
|
146 // We start in state [0] at a non-letter and make at least one transition |
|
147 // When scanning for just letters, arriving back at state [0] or [1] exits |
|
148 // the state machine. |
|
149 // When scanning for any non-tag text, arriving at state [2] also exits |
|
150 static const uint8 kTagParseTbl_0[] = { |
|
151 // < > ! - " ' / S C R I P T Y L E CR NL PL xx |
|
152 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK exit state |
|
153 X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state |
|
154 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL* [exit state] |
|
155 X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] < |
|
156 X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <! |
|
157 X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!- |
|
158 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.* |
|
159 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*- |
|
160 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*-- |
|
161 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.* |
|
162 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*" |
|
163 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*' |
|
164 X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " ' |
|
165 |
|
166 // < > ! - " ' / S C R I P T Y L E CR NL PL xx |
|
167 X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S |
|
168 X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC |
|
169 X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR |
|
170 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI |
|
171 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP |
|
172 X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT |
|
173 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .* |
|
174 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*< |
|
175 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF |
|
176 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S |
|
177 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC |
|
178 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR |
|
179 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI |
|
180 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP |
|
181 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT |
|
182 |
|
183 // < > ! - " ' / S C R I P T Y L E CR NL PL xx |
|
184 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST |
|
185 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY |
|
186 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL |
|
187 X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE |
|
188 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .* |
|
189 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*< |
|
190 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF |
|
191 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S |
|
192 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST |
|
193 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY |
|
194 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL |
|
195 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE |
|
196 }; |
|
197 |
|
198 #undef OK |
|
199 #undef X_ |
|
200 |
|
// Constants describing the UTF-8 encoding of runes.
enum {
  UTFmax    = 4,         // maximum bytes per rune
  Runesync  = 0x80,      // bytes below this cannot be part of a UTF sequence
  Runeself  = 0x80,      // below this, rune and UTF sequence are the same
  Runeerror = 0xFFFD,    // value used for a decoding error in UTF
  Runemax   = 0x10FFFF   // maximum rune value
};
|
209 |
|
210 // Debugging. Not thread safe. |
|
211 static char gDisplayPiece[32]; |
|
212 const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4}; |
|
213 char* DisplayPiece(const char* next_byte_, int byte_length_) { |
|
214 // Copy up to 8 UTF-8 chars to buffer |
|
215 int k = 0; // byte count |
|
216 int n = 0; // character count |
|
217 for (int i = 0; i < byte_length_; ++i) { |
|
218 char c = next_byte_[i]; |
|
219 if ((c & 0xc0) != 0x80) { |
|
220 // Beginning of a UTF-8 character |
|
221 int charlen = gCharlen[static_cast<uint8>(c) >> 4]; |
|
222 if (i + charlen > byte_length_) {break;} // Not enough room for full char |
|
223 if (k >= (32 - 7)) {break;} // Not necessarily enough room |
|
224 if (n >= 8) {break;} // Enough characters already |
|
225 ++n; |
|
226 } |
|
227 if (c == '<') { |
|
228 memcpy(&gDisplayPiece[k], "<", 4); k += 4; |
|
229 } else if (c == '>') { |
|
230 memcpy(&gDisplayPiece[k], ">", 4); k += 4; |
|
231 } else if (c == '&') { |
|
232 memcpy(&gDisplayPiece[k], "&", 5); k += 5; |
|
233 } else if (c == '\'') { |
|
234 memcpy(&gDisplayPiece[k], "'", 6); k += 6; |
|
235 } else if (c == '"') { |
|
236 memcpy(&gDisplayPiece[k], """, 6); k += 6; |
|
237 } else { |
|
238 gDisplayPiece[k++] = c; |
|
239 } |
|
240 } |
|
241 gDisplayPiece[k++] = '\0'; |
|
242 return gDisplayPiece; |
|
243 } |
|
244 |
|
245 |
|
246 |
|
247 // runetochar copies (encodes) one rune, pointed to by r, to at most |
|
248 // UTFmax bytes starting at s and returns the number of bytes generated. |
|
249 int runetochar(char *str, const char32 *rune) { |
|
250 // Convert to unsigned for range check. |
|
251 unsigned long c; |
|
252 |
|
253 // 1 char 00-7F |
|
254 c = *rune; |
|
255 if(c <= 0x7F) { |
|
256 str[0] = c; |
|
257 return 1; |
|
258 } |
|
259 |
|
260 // 2 char 0080-07FF |
|
261 if(c <= 0x07FF) { |
|
262 str[0] = 0xC0 | (c >> 1*6); |
|
263 str[1] = 0x80 | (c & 0x3F); |
|
264 return 2; |
|
265 } |
|
266 |
|
267 // Range check |
|
268 if (c > Runemax) { |
|
269 c = Runeerror; |
|
270 } |
|
271 |
|
272 // 3 char 0800-FFFF |
|
273 if (c <= 0xFFFF) { |
|
274 str[0] = 0xE0 | (c >> 2*6); |
|
275 str[1] = 0x80 | ((c >> 1*6) & 0x3F); |
|
276 str[2] = 0x80 | (c & 0x3F); |
|
277 return 3; |
|
278 } |
|
279 |
|
280 // 4 char 10000-1FFFFF |
|
281 str[0] = 0xF0 | (c >> 3*6); |
|
282 str[1] = 0x80 | ((c >> 2*6) & 0x3F); |
|
283 str[2] = 0x80 | ((c >> 1*6) & 0x3F); |
|
284 str[3] = 0x80 | (c & 0x3F); |
|
285 return 4; |
|
286 } |
|
287 |
|
288 |
|
289 |
|
290 // Useful for converting an entity to an ascii value. |
|
291 // RETURNS unicode value, or -1 if entity isn't valid. Don't include & or ; |
|
292 int LookupEntity(const char* entity_name, int entity_len) { |
|
293 // Make a C string |
|
294 if (entity_len >= 16) {return -1;} // All real entities are shorter |
|
295 char temp[16]; |
|
296 memcpy(temp, entity_name, entity_len); |
|
297 temp[entity_len] = '\0'; |
|
298 int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity); |
|
299 if (match >= 0) {return kNameToEntity[match].i;} |
|
300 return -1; |
|
301 } |
|
302 |
|
// True for the ASCII decimal digits '0'..'9' only.
bool ascii_isdigit(char c) {
  // One unsigned comparison covers both ends of the range.
  return static_cast<unsigned>(c - '0') < 10u;
}
|
// True for ASCII hexadecimal digits: 0-9, a-f, A-F.
bool ascii_isxdigit(char c) {
  const bool is_decimal = (c >= '0') && (c <= '9');
  const bool is_lower_hex = (c >= 'a') && (c <= 'f');
  const bool is_upper_hex = (c >= 'A') && (c <= 'F');
  return is_decimal || is_lower_hex || is_upper_hex;
}
|
// True for ASCII letters and decimal digits: 0-9, a-z, A-Z.
bool ascii_isalnum(char c) {
  if ((c >= '0') && (c <= '9')) {
    return true;
  }
  // Setting bit 0x20 maps A-Z onto a-z and leaves a-z unchanged; no other
  // byte value lands in the a-z range.
  const int folded = c | 0x20;
  return (folded >= 'a') && (folded <= 'z');
}
|
// RETURNS the value 0..15 of an ASCII hex digit, or 0 for any other byte.
int hex_digit_to_int(char c) {
  if ((c >= '0') && (c <= '9')) {
    return c - '0';
  }
  // Setting bit 0x20 folds A-F onto a-f
  const int folded = c | 0x20;
  if ((folded >= 'a') && (folded <= 'f')) {
    return folded - 'a' + 10;
  }
  return 0;
}
|
324 |
|
325 static int32 strto32_base10(const char* nptr, const char* limit, |
|
326 const char **endptr) { |
|
327 *endptr = nptr; |
|
328 while (nptr < limit && *nptr == '0') { |
|
329 ++nptr; |
|
330 } |
|
331 if (nptr == limit || !ascii_isdigit(*nptr)) |
|
332 return -1; |
|
333 const char* end_digits_run = nptr; |
|
334 while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) { |
|
335 ++end_digits_run; |
|
336 } |
|
337 *endptr = end_digits_run; |
|
338 const int num_digits = end_digits_run - nptr; |
|
339 // kint32max == 2147483647. |
|
340 if (num_digits < 9 || |
|
341 (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) { |
|
342 int value = 0; |
|
343 for (; nptr < end_digits_run; ++nptr) { |
|
344 value *= 10; |
|
345 value += *nptr - '0'; |
|
346 } |
|
347 // Overflow past the last valid unicode codepoint |
|
348 // (0x10ffff) is converted to U+FFFD by FixUnicodeValue(). |
|
349 return FixUnicodeValue(value); |
|
350 } else { |
|
351 // Overflow: can't fit in an int32; |
|
352 // returns the replacement character 0xFFFD. |
|
353 return 0xFFFD; |
|
354 } |
|
355 } |
|
356 |
|
357 static int32 strto32_base16(const char* nptr, const char* limit, |
|
358 const char **endptr) { |
|
359 *endptr = nptr; |
|
360 while (nptr < limit && *nptr == '0') { |
|
361 ++nptr; |
|
362 } |
|
363 if (nptr == limit || !ascii_isxdigit(*nptr)) { |
|
364 return -1; |
|
365 } |
|
366 const char* end_xdigits_run = nptr; |
|
367 while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) { |
|
368 ++end_xdigits_run; |
|
369 } |
|
370 *endptr = end_xdigits_run; |
|
371 const int num_xdigits = end_xdigits_run - nptr; |
|
372 // kint32max == 0x7FFFFFFF. |
|
373 if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) { |
|
374 int value = 0; |
|
375 for (; nptr < end_xdigits_run; ++nptr) { |
|
376 value <<= 4; |
|
377 value += hex_digit_to_int(*nptr); |
|
378 } |
|
379 // Overflow past the last valid unicode codepoint |
|
380 // (0x10ffff) is converted to U+FFFD by FixUnicodeValue(). |
|
381 return FixUnicodeValue(value); |
|
382 } else { |
|
383 // Overflow: can't fit in an int32; |
|
384 // returns the replacement character 0xFFFD. |
|
385 return 0xFFFD; |
|
386 } |
|
387 } |
|
388 |
|
389 // Unescape the current character pointed to by src. SETS the number |
|
390 // of chars read for the conversion (in UTF8). If src isn't a valid entity, |
|
391 // just consume the & and RETURN -1. If src doesn't point to & -- which it |
|
392 // should -- set src_consumed to 0 and RETURN -1. |
|
393 int ReadEntity(const char* src, int srcn, int* src_consumed) { |
|
394 const char* const srcend = src + srcn; |
|
395 |
|
396 if (srcn == 0 || *src != '&') { // input should start with an ampersand |
|
397 *src_consumed = 0; |
|
398 return -1; |
|
399 } |
|
400 *src_consumed = 1; // we'll get the & at least |
|
401 |
|
402 // The standards are a bit unclear on when an entity ends. Certainly a ";" |
|
403 // ends one, but spaces probably do too. We follow the lead of both IE and |
|
404 // Netscape, which as far as we can tell end numeric entities (1st case below) |
|
405 // at any non-digit, and end character entities (2nd case) at any non-alnum. |
|
406 const char* entstart, *entend; // where the entity starts and ends |
|
407 entstart = src + 1; // read past the & |
|
408 int entval; // UCS2 value of the entity |
|
409 if ( *entstart == '#' ) { // -- 1st case: numeric entity |
|
410 if ( entstart + 2 >= srcend ) { |
|
411 return -1; // no way a legitimate number could fit |
|
412 } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) { // hex numeric |
|
413 entval = strto32_base16(entstart + 2, srcend, &entend); |
|
414 } else { // decimal numeric entity |
|
415 entval = strto32_base10(entstart+1, srcend, &entend); |
|
416 } |
|
417 if (entval == -1 || entend > srcend) { |
|
418 return -1; // not entirely correct, but close enough |
|
419 } |
|
420 } else { // -- 2nd case: character entity |
|
421 for (entend = entstart; |
|
422 entend < srcend && ascii_isalnum(*entend); |
|
423 ++entend ) { |
|
424 // entity consists of alphanumeric chars |
|
425 } |
|
426 entval = LookupEntity(entstart, entend - entstart); |
|
427 if (entval < 0) { |
|
428 return -1; // not a legal entity name |
|
429 } |
|
430 // Now we do a strange-seeming IE6-compatibility check: if entval is |
|
431 // >= 256, it *must* be followed by a semicolon or it's not considered |
|
432 // an entity. The problem is lots of the newfangled entity names, like |
|
433 // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en". |
|
434 // When these links are written in HTML, it would be really bad if the |
|
435 // "&lang" were treated as an entity, which is what the spec says |
|
436 // *should* happen (even when the HTML is inside an "A HREF" tag!) |
|
437 // IE ignores the spec for these new, high-value entities, so we do too. |
|
438 if ( entval >= 256 && !(entend < srcend && *entend == ';') ) { |
|
439 return -1; // make non-;-terminated entity illegal |
|
440 } |
|
441 } |
|
442 |
|
443 // Finally, figure out how much src was consumed |
|
444 if ( entend < srcend && *entend == ';' ) { |
|
445 entend++; // standard says ; terminator is special |
|
446 } |
|
447 *src_consumed = entend - src; |
|
448 return entval; |
|
449 } |
|
450 |
|
451 |
|
452 // Src points to '&' |
|
453 // Writes entity value to dst. Returns take(src), put(dst) byte counts |
|
454 void EntityToBuffer(const char* src, int len, char* dst, |
|
455 int* tlen, int* plen) { |
|
456 char32 entval = ReadEntity(src, len, tlen); |
|
457 |
|
458 // ReadEntity does this already: entval = FixUnicodeValue(entval); |
|
459 |
|
460 // Convert UTF-32 to UTF-8 |
|
461 if (entval > 0) { |
|
462 *plen = runetochar(dst, &entval); |
|
463 } else { |
|
464 // Illegal entity; ignore the '&' |
|
465 *tlen = 1; |
|
466 *plen = 0; |
|
467 } |
|
468 } |
|
469 |
|
470 // Returns true if character is < > or &, none of which are letters |
|
471 bool inline IsSpecial(char c) { |
|
472 if ((c & 0xe0) == 0x20) { |
|
473 return kSpecialSymbol[static_cast<uint8>(c)]; |
|
474 } |
|
475 return false; |
|
476 } |
|
477 |
|
478 // Quick Skip to next letter or < > & or to end of string (eos) |
|
479 // Always return is_letter for eos |
|
480 int ScanToLetterOrSpecial(const char* src, int len) { |
|
481 int bytes_consumed; |
|
482 StringPiece str(src, len); |
|
483 UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed); |
|
484 return bytes_consumed; |
|
485 } |
|
486 |
|
487 |
|
488 |
|
489 |
|
490 // src points to non-letter, such as tag-opening '<' |
|
491 // Return length from here to next possible letter |
|
492 // On another < before >, return 1 |
|
493 // advances <tag> |
|
494 // | | |
|
495 // advances <tag> ... </tag> for <script> <style> |
|
496 // | | |
|
497 // advances <!-- ... <tag> ... --> |
|
498 // | | |
|
499 // advances <tag |
|
500 // | | end of string |
|
501 // advances <tag <tag2> |
|
502 // || |
|
503 int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) { |
|
504 const uint8* src = reinterpret_cast<const uint8*>(isrc); |
|
505 const uint8* srclimit = src + len; |
|
506 const uint8* tagParseTbl = kTagParseTbl_0; |
|
507 int e = 0; |
|
508 while (src < srclimit) { |
|
509 e = tagParseTbl[kCharToSub[*src++]]; |
|
510 if (e <= max_exit_state) { |
|
511 // We overshot by one byte |
|
512 --src; |
|
513 break; |
|
514 } |
|
515 tagParseTbl = &kTagParseTbl_0[e * 20]; |
|
516 } |
|
517 |
|
518 if (src >= srclimit) { |
|
519 // We fell off the end of the text. |
|
520 // It looks like the most common case for this is a truncated file, not |
|
521 // mismatched angle brackets. So we pretend that the last char was '>' |
|
522 return len; |
|
523 } |
|
524 |
|
525 // OK to be in state 0 or state 2 at exit |
|
526 if ((e != 0) && (e != 2)) { |
|
527 // Error, '<' followed by '<' |
|
528 // We want to back up to first <, then advance by one byte past it |
|
529 int offset = src - reinterpret_cast<const uint8*>(isrc); |
|
530 |
|
531 // Backscan to first '<' and return enough length to just get past it |
|
532 --offset; // back up over the second '<', which caused us to stop |
|
533 while ((0 < offset) && (isrc[offset] != '<')) { |
|
534 // Find the first '<', which is unmatched |
|
535 --offset; |
|
536 } |
|
537 // skip to just beyond first '<' |
|
538 return offset + 1; |
|
539 } |
|
540 |
|
541 return src - reinterpret_cast<const uint8*>(isrc); |
|
542 } |
|
543 |
|
544 |
|
545 ScriptScanner::ScriptScanner(const char* buffer, |
|
546 int buffer_length, |
|
547 bool is_plain_text) |
|
548 : start_byte_(buffer), |
|
549 next_byte_(buffer), |
|
550 next_byte_limit_(buffer + buffer_length), |
|
551 byte_length_(buffer_length), |
|
552 is_plain_text_(is_plain_text), |
|
553 letters_marks_only_(true), |
|
554 one_script_only_(true), |
|
555 exit_state_(kMaxExitStateLettersMarksOnly) { |
|
556 script_buffer_ = new char[kMaxScriptBuffer]; |
|
557 script_buffer_lower_ = new char[kMaxScriptLowerBuffer]; |
|
558 map2original_.Clear(); // map from script_buffer_ to buffer |
|
559 map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_ |
|
560 } |
|
561 |
|
562 // Extended version to allow spans of any non-tag text and spans of mixed script |
|
563 ScriptScanner::ScriptScanner(const char* buffer, |
|
564 int buffer_length, |
|
565 bool is_plain_text, |
|
566 bool any_text, |
|
567 bool any_script) |
|
568 : start_byte_(buffer), |
|
569 next_byte_(buffer), |
|
570 next_byte_limit_(buffer + buffer_length), |
|
571 byte_length_(buffer_length), |
|
572 is_plain_text_(is_plain_text), |
|
573 letters_marks_only_(!any_text), |
|
574 one_script_only_(!any_script), |
|
575 exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) { |
|
576 script_buffer_ = new char[kMaxScriptBuffer]; |
|
577 script_buffer_lower_ = new char[kMaxScriptLowerBuffer]; |
|
578 map2original_.Clear(); // map from script_buffer_ to buffer |
|
579 map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_ |
|
580 } |
|
581 |
|
582 |
|
583 ScriptScanner::~ScriptScanner() { |
|
584 delete[] script_buffer_; |
|
585 delete[] script_buffer_lower_; |
|
586 } |
|
587 |
|
588 |
|
589 |
|
590 |
|
591 // Get to the first real non-tag letter or entity that is a letter |
|
592 // Sets script of that letter |
|
593 // Return len if no more letters |
|
594 int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) { |
|
595 int sc = UNKNOWN_ULSCRIPT; |
|
596 int skip = 0; |
|
597 int tlen, plen; |
|
598 |
|
599 // Do run of non-letters (tag | &NL | NL)* |
|
600 tlen = 0; |
|
601 while (skip < len) { |
|
602 // Do fast scan to next interesting byte |
|
603 // int oldskip = skip; |
|
604 skip += ScanToLetterOrSpecial(src + skip, len - skip); |
|
605 |
|
606 // Check for no more letters/specials |
|
607 if (skip >= len) { |
|
608 // All done |
|
609 *script = sc; |
|
610 return len; |
|
611 } |
|
612 |
|
613 // We are at a letter, nonletter, tag, or entity |
|
614 if (IsSpecial(src[skip]) && !is_plain_text_) { |
|
615 if (src[skip] == '<') { |
|
616 // Begining of tag; skip to end and go around again |
|
617 tlen = ScanToPossibleLetter(src + skip, len - skip, |
|
618 exit_state_); |
|
619 sc = 0; |
|
620 } else if (src[skip] == '>') { |
|
621 // Unexpected end of tag; skip it and go around again |
|
622 tlen = 1; // Over the > |
|
623 sc = 0; |
|
624 } else if (src[skip] == '&') { |
|
625 // Expand entity, no advance |
|
626 char temp[4]; |
|
627 EntityToBuffer(src + skip, len - skip, |
|
628 temp, &tlen, &plen); |
|
629 sc = GetUTF8LetterScriptNum(temp); |
|
630 } |
|
631 } else { |
|
632 // Update 1..4 bytes |
|
633 tlen = UTF8OneCharLen(src + skip); |
|
634 sc = GetUTF8LetterScriptNum(src + skip); |
|
635 } |
|
636 if (sc != 0) {break;} // Letter found |
|
637 skip += tlen; // Else advance |
|
638 } |
|
639 |
|
640 *script = sc; |
|
641 return skip; |
|
642 } |
|
643 |
|
644 |
|
// For ASCII-only tag names.
// Compares the letter uplow, of either case, against c, which is expected
// to be lower case.
inline bool EqCase(char uplow, char c) {
  const int folded = uplow | 0x20;    // setting 0x20 lower-cases ASCII letters
  return folded == c;
}
|
650 |
|
// For ASCII-only tag names.
// True for space / < > etc., all of which are below 0x40; false for bytes at
// or above 0x40.
inline bool NeqLetter(char c) {
  return !(c >= 0x40);
}
|
656 |
|
// For ASCII-only tag names.
// True for space and \n only; \r gives false.
inline bool WS(char c) {
  return (c == '\n') || (c == ' ');
}

// Canonical line break; CR and LF are both mapped to this.
static const char LF = '\n';
|
665 |
|
666 |
|
667 // The naive loop scans from next_byte_ to script_buffer_ until full. |
|
668 // But this can leave an awkward hard-to-identify short fragment at the |
|
669 // end of the input. We would prefer to make the next-to-last fragment |
|
670 // shorter and the last fragment longer. |
|
671 |
|
672 // Copy next run of non-tag characters to buffer [NUL terminated] |
|
673 // This just replaces tags with space or \n and removes entities. |
|
674 // Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences |
|
675 // including \r or \n are replaced by \n. All other tags and skipped text |
|
676 // are replaced with ASCII space. |
|
677 // |
|
678 // Buffer ALWAYS has leading space and trailing space space space NUL |
|
679 bool ScriptScanner::GetOneTextSpan(LangSpan* span) { |
|
680 span->text = script_buffer_; |
|
681 span->text_bytes = 0; |
|
682 span->offset = next_byte_ - start_byte_; |
|
683 span->ulscript = UNKNOWN_ULSCRIPT; |
|
684 span->lang = UNKNOWN_LANGUAGE; |
|
685 span->truncated = false; |
|
686 |
|
687 int put_soft_limit = kMaxScriptBytes - kWithinScriptTail; |
|
688 if ((kMaxScriptBytes <= byte_length_) && |
|
689 (byte_length_ < (2 * kMaxScriptBytes))) { |
|
690 // Try to split the last two fragments in half |
|
691 put_soft_limit = byte_length_ / 2; |
|
692 } |
|
693 |
|
694 script_buffer_[0] = ' '; // Always a space at front of output |
|
695 script_buffer_[1] = '\0'; |
|
696 int take = 0; |
|
697 int put = 1; // Start after the initial space |
|
698 int tlen, plen; |
|
699 |
|
700 if (byte_length_ <= 0) { |
|
701 return false; // No more text to be found |
|
702 } |
|
703 |
|
704 // Go over alternating spans of text and tags, |
|
705 // copying letters to buffer with single spaces for each run of non-letters |
|
706 bool last_byte_was_space = false; |
|
707 while (take < byte_length_) { |
|
708 char c = next_byte_[take]; |
|
709 if (c == '\r') {c = LF;} // Canonical CR or LF |
|
710 if (c == '\n') {c = LF;} // Canonical CR or LF |
|
711 |
|
712 if (IsSpecial(c) && !is_plain_text_) { |
|
713 if (c == '<') { |
|
714 // Replace tag with space |
|
715 c = ' '; // for almost-full test below |
|
716 // or if <p> <br> <tr>, replace with \n |
|
717 if (take < (byte_length_ - 3)) { |
|
718 if (EqCase(next_byte_[take + 1], 'p') && |
|
719 NeqLetter(next_byte_[take + 2])) { |
|
720 c = LF; |
|
721 } |
|
722 if (EqCase(next_byte_[take + 1], 'b') && |
|
723 EqCase(next_byte_[take + 2], 'r') && |
|
724 NeqLetter(next_byte_[take + 3])) { |
|
725 c = LF; |
|
726 } |
|
727 if (EqCase(next_byte_[take + 1], 't') && |
|
728 EqCase(next_byte_[take + 2], 'r') && |
|
729 NeqLetter(next_byte_[take + 3])) { |
|
730 c = LF; |
|
731 } |
|
732 } |
|
733 // Begining of tag; skip to end and go around again |
|
734 tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take, |
|
735 exit_state_); |
|
736 // Copy one byte, compressing spaces |
|
737 if (!last_byte_was_space || !WS(c)) { |
|
738 script_buffer_[put++] = c; // Advance dest |
|
739 last_byte_was_space = WS(c); |
|
740 } |
|
741 } else if (c == '>') { |
|
742 // Unexpected end of tag; copy it and go around again |
|
743 tlen = 1; // Over the > |
|
744 script_buffer_[put++] = c; // Advance dest |
|
745 } else if (c == '&') { |
|
746 // Expand entity, no advance |
|
747 EntityToBuffer(next_byte_ + take, byte_length_ - take, |
|
748 script_buffer_ + put, &tlen, &plen); |
|
749 put += plen; // Advance dest |
|
750 } |
|
751 take += tlen; // Advance source |
|
752 } else { |
|
753 // Copy one byte, compressing spaces |
|
754 if (!last_byte_was_space || !WS(c)) { |
|
755 script_buffer_[put++] = c; // Advance dest |
|
756 last_byte_was_space = WS(c); |
|
757 } |
|
758 ++take; // Advance source |
|
759 } |
|
760 |
|
761 if (WS(c) && |
|
762 (put >= put_soft_limit)) { |
|
763 // Buffer is almost full |
|
764 span->truncated = true; |
|
765 break; |
|
766 } |
|
767 if (put >= kMaxScriptBytes) { |
|
768 // Buffer is completely full |
|
769 span->truncated = true; |
|
770 break; |
|
771 } |
|
772 } |
|
773 |
|
774 // Almost done. Back up to a character boundary if needed |
|
775 while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) { |
|
776 // Back up over continuation byte |
|
777 --take; |
|
778 --put; |
|
779 } |
|
780 |
|
781 // Update input position |
|
782 next_byte_ += take; |
|
783 byte_length_ -= take; |
|
784 |
|
785 // Put four more spaces/NUL. Worst case is abcd _ _ _ \0 |
|
786 // kMaxScriptBytes | | put |
|
787 script_buffer_[put + 0] = ' '; |
|
788 script_buffer_[put + 1] = ' '; |
|
789 script_buffer_[put + 2] = ' '; |
|
790 script_buffer_[put + 3] = '\0'; |
|
791 |
|
792 span->text_bytes = put; // Does not include the last four chars above |
|
793 return true; |
|
794 } |
|
795 |
|
796 |
|
797 // Copy next run of same-script non-tag letters to buffer [NUL terminated] |
|
798 // Buffer ALWAYS has leading space and trailing space space space NUL |
|
799 bool ScriptScanner::GetOneScriptSpan(LangSpan* span) { |
|
800 if (!letters_marks_only_) { |
|
801 // Return non-tag text, including punctuation and digits |
|
802 return GetOneTextSpan(span); |
|
803 } |
|
804 |
|
805 span->text = script_buffer_; |
|
806 span->text_bytes = 0; |
|
807 span->offset = next_byte_ - start_byte_; |
|
808 span->ulscript = UNKNOWN_ULSCRIPT; |
|
809 span->lang = UNKNOWN_LANGUAGE; |
|
810 span->truncated = false; |
|
811 |
|
812 // struct timeval script_start, script_mid, script_end; |
|
813 |
|
814 int put_soft_limit = kMaxScriptBytes - kWithinScriptTail; |
|
815 if ((kMaxScriptBytes <= byte_length_) && |
|
816 (byte_length_ < (2 * kMaxScriptBytes))) { |
|
817 // Try to split the last two fragments in half |
|
818 put_soft_limit = byte_length_ / 2; |
|
819 } |
|
820 |
|
821 |
|
822 int spanscript; // The script of this span |
|
823 int sc = UNKNOWN_ULSCRIPT; // The script of next character |
|
824 int tlen = 0; |
|
825 int plen = 0; |
|
826 |
|
827 script_buffer_[0] = ' '; // Always a space at front of output |
|
828 script_buffer_[1] = '\0'; |
|
829 int take = 0; |
|
830 int put = 1; // Start after the initial space |
|
831 |
|
832 // Build offsets from span->text back to start_byte_ + span->offset |
|
833 // This mapping reflects deletion of non-letters, expansion of |
|
834 // entities, etc. |
|
835 map2original_.Clear(); |
|
836 map2original_.Delete(span->offset); // So that MapBack(0) gives offset |
|
837 |
|
838 // Get to the first real non-tag letter or entity that is a letter |
|
839 int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript); |
|
840 next_byte_ += skip; |
|
841 byte_length_ -= skip; |
|
842 |
|
843 if (skip != 1) { |
|
844 map2original_.Delete(skip); |
|
845 map2original_.Insert(1); |
|
846 } else { |
|
847 map2original_.Copy(1); |
|
848 } |
|
849 if (byte_length_ <= 0) { |
|
850 map2original_.Reset(); |
|
851 return false; // No more letters to be found |
|
852 } |
|
853 |
|
854 // There is at least one letter, so we know the script for this span |
|
855 span->ulscript = (ULScript)spanscript; |
|
856 |
|
857 |
|
858 // Go over alternating spans of same-script letters and non-letters, |
|
859 // copying letters to buffer with single spaces for each run of non-letters |
|
860 while (take < byte_length_) { |
|
861 // Copy run of letters in same script (&LS | LS)* |
|
862 int letter_count = 0; // Keep track of word length |
|
863 bool need_break = false; |
|
864 |
|
865 while (take < byte_length_) { |
|
866 // We are at a letter, nonletter, tag, or entity |
|
867 if (IsSpecial(next_byte_[take]) && !is_plain_text_) { |
|
868 if (next_byte_[take] == '<') { |
|
869 // Begining of tag |
|
870 sc = 0; |
|
871 break; |
|
872 } else if (next_byte_[take] == '>') { |
|
873 // Unexpected end of tag |
|
874 sc = 0; |
|
875 break; |
|
876 } else if (next_byte_[take] == '&') { |
|
877 // Copy entity, no advance |
|
878 EntityToBuffer(next_byte_ + take, byte_length_ - take, |
|
879 script_buffer_ + put, &tlen, &plen); |
|
880 sc = GetUTF8LetterScriptNum(script_buffer_ + put); |
|
881 } |
|
882 } else { |
|
883 // Real letter, safely copy up to 4 bytes, increment by 1..4 |
|
884 // Will update by 1..4 bytes at Advance, below |
|
885 tlen = plen = UTF8OneCharLen(next_byte_ + take); |
|
886 if (take < (byte_length_ - 3)) { |
|
887 // X86 fast case, does unaligned load/store |
|
888 UNALIGNED_STORE32(script_buffer_ + put, |
|
889 UNALIGNED_LOAD32(next_byte_ + take)); |
|
890 |
|
891 } else { |
|
892 // Slow case, happens 1-3 times per input document |
|
893 memcpy(script_buffer_ + put, next_byte_ + take, plen); |
|
894 } |
|
895 sc = GetUTF8LetterScriptNum(next_byte_ + take); |
|
896 } |
|
897 |
|
898 // Allow continue across a single letter in a different script: |
|
899 // A B D = three scripts, c = common script, i = inherited script, |
|
900 // - = don't care, ( = take position before the += below |
|
901 // AAA(A- continue |
|
902 // |
|
903 // AAA(BA continue |
|
904 // AAA(BB break |
|
905 // AAA(Bc continue (breaks after B) |
|
906 // AAA(BD break |
|
907 // AAA(Bi break |
|
908 // |
|
909 // AAA(c- break |
|
910 // |
|
911 // AAA(i- continue |
|
912 // |
|
913 |
|
914 if ((sc != spanscript) && (sc != ULScript_Inherited)) { |
|
915 // Might need to break this script span |
|
916 if (sc == ULScript_Common) { |
|
917 need_break = true; |
|
918 } else { |
|
919 // Look at next following character, ignoring entity as Common |
|
920 int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen); |
|
921 if ((sc2 != ULScript_Common) && (sc2 != spanscript)) { |
|
922 // We found a non-trivial change of script |
|
923 if (one_script_only_) { |
|
924 need_break = true; |
|
925 } |
|
926 } |
|
927 } |
|
928 } |
|
929 if (need_break) {break;} // Non-letter or letter in wrong script |
|
930 |
|
931 take += tlen; // Advance |
|
932 put += plen; // Advance |
|
933 |
|
934 // Update the offset map to reflect take/put lengths |
|
935 if (tlen == plen) { |
|
936 map2original_.Copy(tlen); |
|
937 } else if (tlen < plen) { |
|
938 map2original_.Copy(tlen); |
|
939 map2original_.Insert(plen - tlen); |
|
940 } else { // plen < tlen |
|
941 map2original_.Copy(plen); |
|
942 map2original_.Delete(tlen - plen); |
|
943 } |
|
944 |
|
945 ++letter_count; |
|
946 if (put >= kMaxScriptBytes) { |
|
947 // Buffer is full |
|
948 span->truncated = true; |
|
949 break; |
|
950 } |
|
951 } // End while letters |
|
952 |
|
953 // Do run of non-letters (tag | &NL | NL)* |
|
954 while (take < byte_length_) { |
|
955 // Do fast scan to next interesting byte |
|
956 tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take); |
|
957 take += tlen; |
|
958 map2original_.Delete(tlen); |
|
959 if (take >= byte_length_) {break;} // Might have scanned to end |
|
960 |
|
961 // We are at a letter, nonletter, tag, or entity |
|
962 if (IsSpecial(next_byte_[take]) && !is_plain_text_) { |
|
963 if (next_byte_[take] == '<') { |
|
964 // Begining of tag; skip to end and go around again |
|
965 tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take, |
|
966 exit_state_); |
|
967 sc = 0; |
|
968 } else if (next_byte_[take] == '>') { |
|
969 // Unexpected end of tag; skip it and go around again |
|
970 tlen = 1; // Over the > |
|
971 sc = 0; |
|
972 } else if (next_byte_[take] == '&') { |
|
973 // Expand entity, no advance |
|
974 EntityToBuffer(next_byte_ + take, byte_length_ - take, |
|
975 script_buffer_ + put, &tlen, &plen); |
|
976 sc = GetUTF8LetterScriptNum(script_buffer_ + put); |
|
977 } |
|
978 } else { |
|
979 // Update 1..4 |
|
980 tlen = UTF8OneCharLen(next_byte_ + take); |
|
981 sc = GetUTF8LetterScriptNum(next_byte_ + take); |
|
982 } |
|
983 if (sc != 0) {break;} // Letter found |
|
984 take += tlen; // Else advance |
|
985 map2original_.Delete(tlen); |
|
986 } // End while not-letters |
|
987 |
|
988 script_buffer_[put++] = ' '; |
|
989 map2original_.Insert(1); |
|
990 |
|
991 // Letter in wrong script ? |
|
992 if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;} |
|
993 if (put >= put_soft_limit) { |
|
994 // Buffer is almost full |
|
995 span->truncated = true; |
|
996 break; |
|
997 } |
|
998 } |
|
999 |
|
1000 // Almost done. Back up to a character boundary if needed |
|
1001 while ((0 < take) && (take < byte_length_) && |
|
1002 ((next_byte_[take] & 0xc0) == 0x80)) { |
|
1003 // Back up over continuation byte |
|
1004 --take; |
|
1005 --put; |
|
1006 } |
|
1007 |
|
1008 // Update input position |
|
1009 next_byte_ += take; |
|
1010 byte_length_ -= take; |
|
1011 |
|
1012 // Put four more spaces/NUL. Worst case is abcd _ _ _ \0 |
|
1013 // kMaxScriptBytes | | put |
|
1014 script_buffer_[put + 0] = ' '; |
|
1015 script_buffer_[put + 1] = ' '; |
|
1016 script_buffer_[put + 2] = ' '; |
|
1017 script_buffer_[put + 3] = '\0'; |
|
1018 map2original_.Insert(4); |
|
1019 map2original_.Reset(); |
|
1020 |
|
1021 span->text_bytes = put; // Does not include the last four chars above |
|
1022 return true; |
|
1023 } |
|
1024 |
|
1025 // Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase |
|
1026 // List changes with each version of Unicode, so just always lowercase |
|
1027 // Unicode 6.2.0: |
|
1028 // ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN |
|
1029 void ScriptScanner::LowerScriptSpan(LangSpan* span) { |
|
1030 // If needed, lowercase all the text. If we do it sooner, might miss |
|
1031 // lowercasing an entity such as Á |
|
1032 // We only need to do this for Latn and Cyrl scripts |
|
1033 map2uplow_.Clear(); |
|
1034 // Full Unicode lowercase of the entire buffer, including |
|
1035 // four pad bytes off the end. |
|
1036 // Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad |
|
1037 // bytes and put the 0x00 in explicitly. |
|
1038 // Build an offset map from script_buffer_lower_ back to script_buffer_ |
|
1039 int consumed, filled, changed; |
|
1040 StringPiece istr(span->text, span->text_bytes + 3); |
|
1041 StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer); |
|
1042 |
|
1043 UTF8GenericReplace(&utf8repl_lettermarklower_obj, |
|
1044 istr, ostr, is_plain_text_, |
|
1045 &consumed, &filled, &changed, &map2uplow_); |
|
1046 script_buffer_lower_[filled] = '\0'; |
|
1047 span->text = script_buffer_lower_; |
|
1048 span->text_bytes = filled - 3; |
|
1049 map2uplow_.Reset(); |
|
1050 } |
|
1051 |
|
1052 // Copy next run of same-script non-tag letters to buffer [NUL terminated] |
|
1053 // Force Latin, Cyrillic, Greek scripts to be lowercase |
|
1054 // Buffer ALWAYS has leading space and trailing space space space NUL |
|
1055 bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) { |
|
1056 bool ok = GetOneScriptSpan(span); |
|
1057 LowerScriptSpan(span); |
|
1058 return ok; |
|
1059 } |
|
1060 |
|
1061 |
|
1062 // Maps byte offset in most recent GetOneScriptSpan/Lower |
|
1063 // span->text [0..text_bytes] into an additional byte offset from |
|
1064 // span->offset, to get back to corresponding text in the original |
|
1065 // input buffer. |
|
1066 // text_offset must be the first byte |
|
1067 // of a UTF-8 character, or just beyond the last character. Normally this |
|
1068 // routine is called with the first byte of an interesting range and |
|
1069 // again with the first byte of the following range. |
|
1070 int ScriptScanner::MapBack(int text_offset) { |
|
1071 return map2original_.MapBack(map2uplow_.MapBack(text_offset)); |
|
1072 } |
|
1073 |
|
1074 |
|
1075 // Gets lscript number for letters; always returns |
|
1076 // 0 (common script) for non-letters |
|
1077 int GetUTF8LetterScriptNum(const char* src) { |
|
1078 int srclen = UTF8OneCharLen(src); |
|
1079 const uint8* usrc = reinterpret_cast<const uint8*>(src); |
|
1080 return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj, |
|
1081 &usrc, &srclen); |
|
1082 } |
|
1083 |
|
1084 } // namespace CLD2 |
|
1085 |
|
1086 |