1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/getonescriptspan.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1086 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// Author: dsites@google.com (Dick Sites) 1.20 +// 1.21 + 1.22 + 1.23 +#include "getonescriptspan.h" 1.24 +#include <string.h> 1.25 + 1.26 +#include "fixunicodevalue.h" 1.27 +#include "lang_script.h" 1.28 +#include "port.h" 1.29 +#include "utf8statetable.h" 1.30 + 1.31 +#include "utf8prop_lettermarkscriptnum.h" 1.32 +#include "utf8repl_lettermarklower.h" 1.33 +#include "utf8scannot_lettermarkspecial.h" 1.34 + 1.35 + 1.36 +namespace CLD2 { 1.37 + 1.38 +// Alphabetical order for binary search, from 1.39 +// generated_entities.cc 1.40 +extern const int kNameToEntitySize; 1.41 +extern const CharIntPair kNameToEntity[]; 1.42 + 1.43 +static const int kMaxUpToWordBoundary = 50; // span < this make longer, 1.44 + // else make shorter 1.45 +static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes 1.46 + // to round to word boundary, 1.47 + // direction above 1.48 + 1.49 +static const char kSpecialSymbol[256] = { // true for < > & 1.50 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.51 + 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0, 1.52 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.53 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.54 + 1.55 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.56 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.57 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.58 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.59 +}; 1.60 + 1.61 + 1.62 + 1.63 +#define LT 0 // < 1.64 +#define GT 1 // > 1.65 +#define EX 2 // ! 1.66 +#define HY 3 // - 1.67 +#define QU 4 // " 1.68 +#define AP 5 // ' 1.69 +#define SL 6 // / 1.70 +#define S_ 7 1.71 +#define C_ 8 1.72 +#define R_ 9 1.73 +#define I_ 10 1.74 +#define P_ 11 1.75 +#define T_ 12 1.76 +#define Y_ 13 1.77 +#define L_ 14 1.78 +#define E_ 15 1.79 +#define CR 16 // <cr> or <lf> 1.80 +#define NL 17 // non-letter: ASCII whitespace, digit, punctuation 1.81 +#define PL 18 // possible letter, incl. & 1.82 +#define xx 19 // <unused> 1.83 + 1.84 +// Map byte to one of ~20 interesting categories for cheap tag parsing 1.85 +static const uint8 kCharToSub[256] = { 1.86 + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL, 1.87 + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 1.88 + NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL, 1.89 + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL, 1.90 + 1.91 + PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, 1.92 + P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, 1.93 + PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, 1.94 + P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, 1.95 + 1.96 + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 1.97 + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 1.98 + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 1.99 + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 1.100 + 1.101 + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 1.102 + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 1.103 + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 1.104 + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 1.105 +}; 1.106 + 1.107 +#undef LT 1.108 +#undef GT 1.109 +#undef EX 1.110 +#undef HY 1.111 +#undef QU 1.112 +#undef AP 1.113 +#undef SL 1.114 +#undef S_ 1.115 +#undef C_ 1.116 +#undef R_ 1.117 +#undef I_ 1.118 +#undef P_ 1.119 +#undef T_ 1.120 +#undef Y_ 1.121 +#undef L_ 1.122 +#undef E_ 1.123 +#undef CR 1.124 +#undef NL 1.125 +#undef PL 1.126 +#undef xx 1.127 + 1.128 + 1.129 +#define OK 0 1.130 +#define X_ 1 1.131 + 1.132 + 1.133 +static const int kMaxExitStateLettersMarksOnly = 1; 1.134 +static const int kMaxExitStateAllText = 2; 1.135 + 1.136 + 1.137 +// State machine to do cheap parse of non-letter strings incl. tags 1.138 +// advances <tag> 1.139 +// | | 1.140 +// advances <tag> ... </tag> for <script> <style> 1.141 +// | | 1.142 +// advances <!-- ... <tag> ... --> 1.143 +// | | 1.144 +// advances <tag 1.145 +// || (0) 1.146 +// advances <tag <tag2> 1.147 +// || (0) 1.148 +// 1.149 +// We start in state [0] at a non-letter and make at least one transition 1.150 +// When scanning for just letters, arriving back at state [0] or [1] exits 1.151 +// the state machine. 1.152 +// When scanning for any non-tag text, arriving at state [2] also exits 1.153 +static const uint8 kTagParseTbl_0[] = { 1.154 +// < > ! - " ' / S C R I P T Y L E CR NL PL xx 1.155 + 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK exit state 1.156 + X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state 1.157 + 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL* [exit state] 1.158 + X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] < 1.159 + X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <! 1.160 + X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!- 1.161 + 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.* 1.162 + 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*- 1.163 + 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*-- 1.164 + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.* 1.165 + 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*" 1.166 + 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*' 1.167 + X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " ' 1.168 + 1.169 +// < > ! - " ' / S C R I P T Y L E CR NL PL xx 1.170 + X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S 1.171 + X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC 1.172 + X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR 1.173 + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI 1.174 + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP 1.175 + X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT 1.176 + 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .* 1.177 + 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*< 1.178 + 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF 1.179 + 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S 1.180 + 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC 1.181 + 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR 1.182 + 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI 1.183 + 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP 1.184 + 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT 1.185 + 1.186 +// < > ! - " ' / S C R I P T Y L E CR NL PL xx 1.187 + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST 1.188 + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY 1.189 + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL 1.190 + X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE 1.191 + 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .* 1.192 + 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*< 1.193 + 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF 1.194 + 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S 1.195 + 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST 1.196 + 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY 1.197 + 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL 1.198 + 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE 1.199 +}; 1.200 + 1.201 +#undef OK 1.202 +#undef X_ 1.203 + 1.204 +enum 1.205 +{ 1.206 + UTFmax = 4, // maximum bytes per rune 1.207 + Runesync = 0x80, // cannot represent part of a UTF sequence (<) 1.208 + Runeself = 0x80, // rune and UTF sequences are the same (<) 1.209 + Runeerror = 0xFFFD, // decoding error in UTF 1.210 + Runemax = 0x10FFFF, // maximum rune value 1.211 +}; 1.212 + 1.213 +// Debugging. Not thread safe. 1.214 +static char gDisplayPiece[32]; 1.215 +const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4}; 1.216 +char* DisplayPiece(const char* next_byte_, int byte_length_) { 1.217 + // Copy up to 8 UTF-8 chars to buffer 1.218 + int k = 0; // byte count 1.219 + int n = 0; // character count 1.220 + for (int i = 0; i < byte_length_; ++i) { 1.221 + char c = next_byte_[i]; 1.222 + if ((c & 0xc0) != 0x80) { 1.223 + // Beginning of a UTF-8 character 1.224 + int charlen = gCharlen[static_cast<uint8>(c) >> 4]; 1.225 + if (i + charlen > byte_length_) {break;} // Not enough room for full char 1.226 + if (k >= (32 - 7)) {break;} // Not necessarily enough room 1.227 + if (n >= 8) {break;} // Enough characters already 1.228 + ++n; 1.229 + } 1.230 + if (c == '<') { 1.231 + memcpy(&gDisplayPiece[k], "<", 4); k += 4; 1.232 + } else if (c == '>') { 1.233 + memcpy(&gDisplayPiece[k], ">", 4); k += 4; 1.234 + } else if (c == '&') { 1.235 + memcpy(&gDisplayPiece[k], "&", 5); k += 5; 1.236 + } else if (c == '\'') { 1.237 + memcpy(&gDisplayPiece[k], "'", 6); k += 6; 1.238 + } else if (c == '"') { 1.239 + memcpy(&gDisplayPiece[k], """, 6); k += 6; 1.240 + } else { 1.241 + gDisplayPiece[k++] = c; 1.242 + } 1.243 + } 1.244 + gDisplayPiece[k++] = '\0'; 1.245 + return gDisplayPiece; 1.246 +} 1.247 + 1.248 + 1.249 + 1.250 +// runetochar copies (encodes) one rune, pointed to by r, to at most 1.251 +// UTFmax bytes starting at s and returns the number of bytes generated. 1.252 +int runetochar(char *str, const char32 *rune) { 1.253 + // Convert to unsigned for range check. 1.254 + unsigned long c; 1.255 + 1.256 + // 1 char 00-7F 1.257 + c = *rune; 1.258 + if(c <= 0x7F) { 1.259 + str[0] = c; 1.260 + return 1; 1.261 + } 1.262 + 1.263 + // 2 char 0080-07FF 1.264 + if(c <= 0x07FF) { 1.265 + str[0] = 0xC0 | (c >> 1*6); 1.266 + str[1] = 0x80 | (c & 0x3F); 1.267 + return 2; 1.268 + } 1.269 + 1.270 + // Range check 1.271 + if (c > Runemax) { 1.272 + c = Runeerror; 1.273 + } 1.274 + 1.275 + // 3 char 0800-FFFF 1.276 + if (c <= 0xFFFF) { 1.277 + str[0] = 0xE0 | (c >> 2*6); 1.278 + str[1] = 0x80 | ((c >> 1*6) & 0x3F); 1.279 + str[2] = 0x80 | (c & 0x3F); 1.280 + return 3; 1.281 + } 1.282 + 1.283 + // 4 char 10000-1FFFFF 1.284 + str[0] = 0xF0 | (c >> 3*6); 1.285 + str[1] = 0x80 | ((c >> 2*6) & 0x3F); 1.286 + str[2] = 0x80 | ((c >> 1*6) & 0x3F); 1.287 + str[3] = 0x80 | (c & 0x3F); 1.288 + return 4; 1.289 +} 1.290 + 1.291 + 1.292 + 1.293 +// Useful for converting an entity to an ascii value. 1.294 +// RETURNS unicode value, or -1 if entity isn't valid. Don't include & or ; 1.295 +int LookupEntity(const char* entity_name, int entity_len) { 1.296 + // Make a C string 1.297 + if (entity_len >= 16) {return -1;} // All real entities are shorter 1.298 + char temp[16]; 1.299 + memcpy(temp, entity_name, entity_len); 1.300 + temp[entity_len] = '\0'; 1.301 + int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity); 1.302 + if (match >= 0) {return kNameToEntity[match].i;} 1.303 + return -1; 1.304 +} 1.305 + 1.306 +bool ascii_isdigit(char c) { 1.307 + return ('0' <= c) && (c <= '9'); 1.308 +} 1.309 +bool ascii_isxdigit(char c) { 1.310 + if (('0' <= c) && (c <= '9')) {return true;} 1.311 + if (('a' <= c) && (c <= 'f')) {return true;} 1.312 + if (('A' <= c) && (c <= 'F')) {return true;} 1.313 + return false; 1.314 +} 1.315 +bool ascii_isalnum(char c) { 1.316 + if (('0' <= c) && (c <= '9')) {return true;} 1.317 + if (('a' <= c) && (c <= 'z')) {return true;} 1.318 + if (('A' <= c) && (c <= 'Z')) {return true;} 1.319 + return false; 1.320 +} 1.321 +int hex_digit_to_int(char c) { 1.322 + if (('0' <= c) && (c <= '9')) {return c - '0';} 1.323 + if (('a' <= c) && (c <= 'f')) {return c - 'a' + 10;} 1.324 + if (('A' <= c) && (c <= 'F')) {return c - 'A' + 10;} 1.325 + return 0; 1.326 +} 1.327 + 1.328 +static int32 strto32_base10(const char* nptr, const char* limit, 1.329 + const char **endptr) { 1.330 + *endptr = nptr; 1.331 + while (nptr < limit && *nptr == '0') { 1.332 + ++nptr; 1.333 + } 1.334 + if (nptr == limit || !ascii_isdigit(*nptr)) 1.335 + return -1; 1.336 + const char* end_digits_run = nptr; 1.337 + while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) { 1.338 + ++end_digits_run; 1.339 + } 1.340 + *endptr = end_digits_run; 1.341 + const int num_digits = end_digits_run - nptr; 1.342 + // kint32max == 2147483647. 1.343 + if (num_digits < 9 || 1.344 + (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) { 1.345 + int value = 0; 1.346 + for (; nptr < end_digits_run; ++nptr) { 1.347 + value *= 10; 1.348 + value += *nptr - '0'; 1.349 + } 1.350 + // Overflow past the last valid unicode codepoint 1.351 + // (0x10ffff) is converted to U+FFFD by FixUnicodeValue(). 1.352 + return FixUnicodeValue(value); 1.353 + } else { 1.354 + // Overflow: can't fit in an int32; 1.355 + // returns the replacement character 0xFFFD. 1.356 + return 0xFFFD; 1.357 + } 1.358 +} 1.359 + 1.360 +static int32 strto32_base16(const char* nptr, const char* limit, 1.361 + const char **endptr) { 1.362 + *endptr = nptr; 1.363 + while (nptr < limit && *nptr == '0') { 1.364 + ++nptr; 1.365 + } 1.366 + if (nptr == limit || !ascii_isxdigit(*nptr)) { 1.367 + return -1; 1.368 + } 1.369 + const char* end_xdigits_run = nptr; 1.370 + while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) { 1.371 + ++end_xdigits_run; 1.372 + } 1.373 + *endptr = end_xdigits_run; 1.374 + const int num_xdigits = end_xdigits_run - nptr; 1.375 + // kint32max == 0x7FFFFFFF. 1.376 + if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) { 1.377 + int value = 0; 1.378 + for (; nptr < end_xdigits_run; ++nptr) { 1.379 + value <<= 4; 1.380 + value += hex_digit_to_int(*nptr); 1.381 + } 1.382 + // Overflow past the last valid unicode codepoint 1.383 + // (0x10ffff) is converted to U+FFFD by FixUnicodeValue(). 1.384 + return FixUnicodeValue(value); 1.385 + } else { 1.386 + // Overflow: can't fit in an int32; 1.387 + // returns the replacement character 0xFFFD. 1.388 + return 0xFFFD; 1.389 + } 1.390 +} 1.391 + 1.392 +// Unescape the current character pointed to by src. SETS the number 1.393 +// of chars read for the conversion (in UTF8). If src isn't a valid entity, 1.394 +// just consume the & and RETURN -1. If src doesn't point to & -- which it 1.395 +// should -- set src_consumed to 0 and RETURN -1. 1.396 +int ReadEntity(const char* src, int srcn, int* src_consumed) { 1.397 + const char* const srcend = src + srcn; 1.398 + 1.399 + if (srcn == 0 || *src != '&') { // input should start with an ampersand 1.400 + *src_consumed = 0; 1.401 + return -1; 1.402 + } 1.403 + *src_consumed = 1; // we'll get the & at least 1.404 + 1.405 + // The standards are a bit unclear on when an entity ends. Certainly a ";" 1.406 + // ends one, but spaces probably do too. We follow the lead of both IE and 1.407 + // Netscape, which as far as we can tell end numeric entities (1st case below) 1.408 + // at any non-digit, and end character entities (2nd case) at any non-alnum. 1.409 + const char* entstart, *entend; // where the entity starts and ends 1.410 + entstart = src + 1; // read past the & 1.411 + int entval; // UCS2 value of the entity 1.412 + if ( *entstart == '#' ) { // -- 1st case: numeric entity 1.413 + if ( entstart + 2 >= srcend ) { 1.414 + return -1; // no way a legitimate number could fit 1.415 + } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) { // hex numeric 1.416 + entval = strto32_base16(entstart + 2, srcend, &entend); 1.417 + } else { // decimal numeric entity 1.418 + entval = strto32_base10(entstart+1, srcend, &entend); 1.419 + } 1.420 + if (entval == -1 || entend > srcend) { 1.421 + return -1; // not entirely correct, but close enough 1.422 + } 1.423 + } else { // -- 2nd case: character entity 1.424 + for (entend = entstart; 1.425 + entend < srcend && ascii_isalnum(*entend); 1.426 + ++entend ) { 1.427 + // entity consists of alphanumeric chars 1.428 + } 1.429 + entval = LookupEntity(entstart, entend - entstart); 1.430 + if (entval < 0) { 1.431 + return -1; // not a legal entity name 1.432 + } 1.433 + // Now we do a strange-seeming IE6-compatibility check: if entval is 1.434 + // >= 256, it *must* be followed by a semicolon or it's not considered 1.435 + // an entity. The problem is lots of the newfangled entity names, like 1.436 + // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en". 1.437 + // When these links are written in HTML, it would be really bad if the 1.438 + // "&lang" were treated as an entity, which is what the spec says 1.439 + // *should* happen (even when the HTML is inside an "A HREF" tag!) 1.440 + // IE ignores the spec for these new, high-value entities, so we do too. 1.441 + if ( entval >= 256 && !(entend < srcend && *entend == ';') ) { 1.442 + return -1; // make non-;-terminated entity illegal 1.443 + } 1.444 + } 1.445 + 1.446 + // Finally, figure out how much src was consumed 1.447 + if ( entend < srcend && *entend == ';' ) { 1.448 + entend++; // standard says ; terminator is special 1.449 + } 1.450 + *src_consumed = entend - src; 1.451 + return entval; 1.452 +} 1.453 + 1.454 + 1.455 +// Src points to '&' 1.456 +// Writes entity value to dst. Returns take(src), put(dst) byte counts 1.457 +void EntityToBuffer(const char* src, int len, char* dst, 1.458 + int* tlen, int* plen) { 1.459 + char32 entval = ReadEntity(src, len, tlen); 1.460 + 1.461 + // ReadEntity does this already: entval = FixUnicodeValue(entval); 1.462 + 1.463 + // Convert UTF-32 to UTF-8 1.464 + if (entval > 0) { 1.465 + *plen = runetochar(dst, &entval); 1.466 + } else { 1.467 + // Illegal entity; ignore the '&' 1.468 + *tlen = 1; 1.469 + *plen = 0; 1.470 + } 1.471 +} 1.472 + 1.473 +// Returns true if character is < > or &, none of which are letters 1.474 +bool inline IsSpecial(char c) { 1.475 + if ((c & 0xe0) == 0x20) { 1.476 + return kSpecialSymbol[static_cast<uint8>(c)]; 1.477 + } 1.478 + return false; 1.479 +} 1.480 + 1.481 +// Quick Skip to next letter or < > & or to end of string (eos) 1.482 +// Always return is_letter for eos 1.483 +int ScanToLetterOrSpecial(const char* src, int len) { 1.484 + int bytes_consumed; 1.485 + StringPiece str(src, len); 1.486 + UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed); 1.487 + return bytes_consumed; 1.488 +} 1.489 + 1.490 + 1.491 + 1.492 + 1.493 +// src points to non-letter, such as tag-opening '<' 1.494 +// Return length from here to next possible letter 1.495 +// On another < before >, return 1 1.496 +// advances <tag> 1.497 +// | | 1.498 +// advances <tag> ... </tag> for <script> <style> 1.499 +// | | 1.500 +// advances <!-- ... <tag> ... --> 1.501 +// | | 1.502 +// advances <tag 1.503 +// | | end of string 1.504 +// advances <tag <tag2> 1.505 +// || 1.506 +int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) { 1.507 + const uint8* src = reinterpret_cast<const uint8*>(isrc); 1.508 + const uint8* srclimit = src + len; 1.509 + const uint8* tagParseTbl = kTagParseTbl_0; 1.510 + int e = 0; 1.511 + while (src < srclimit) { 1.512 + e = tagParseTbl[kCharToSub[*src++]]; 1.513 + if (e <= max_exit_state) { 1.514 + // We overshot by one byte 1.515 + --src; 1.516 + break; 1.517 + } 1.518 + tagParseTbl = &kTagParseTbl_0[e * 20]; 1.519 + } 1.520 + 1.521 + if (src >= srclimit) { 1.522 + // We fell off the end of the text. 1.523 + // It looks like the most common case for this is a truncated file, not 1.524 + // mismatched angle brackets. So we pretend that the last char was '>' 1.525 + return len; 1.526 + } 1.527 + 1.528 + // OK to be in state 0 or state 2 at exit 1.529 + if ((e != 0) && (e != 2)) { 1.530 + // Error, '<' followed by '<' 1.531 + // We want to back up to first <, then advance by one byte past it 1.532 + int offset = src - reinterpret_cast<const uint8*>(isrc); 1.533 + 1.534 + // Backscan to first '<' and return enough length to just get past it 1.535 + --offset; // back up over the second '<', which caused us to stop 1.536 + while ((0 < offset) && (isrc[offset] != '<')) { 1.537 + // Find the first '<', which is unmatched 1.538 + --offset; 1.539 + } 1.540 + // skip to just beyond first '<' 1.541 + return offset + 1; 1.542 + } 1.543 + 1.544 + return src - reinterpret_cast<const uint8*>(isrc); 1.545 +} 1.546 + 1.547 + 1.548 +ScriptScanner::ScriptScanner(const char* buffer, 1.549 + int buffer_length, 1.550 + bool is_plain_text) 1.551 + : start_byte_(buffer), 1.552 + next_byte_(buffer), 1.553 + next_byte_limit_(buffer + buffer_length), 1.554 + byte_length_(buffer_length), 1.555 + is_plain_text_(is_plain_text), 1.556 + letters_marks_only_(true), 1.557 + one_script_only_(true), 1.558 + exit_state_(kMaxExitStateLettersMarksOnly) { 1.559 + script_buffer_ = new char[kMaxScriptBuffer]; 1.560 + script_buffer_lower_ = new char[kMaxScriptLowerBuffer]; 1.561 + map2original_.Clear(); // map from script_buffer_ to buffer 1.562 + map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_ 1.563 +} 1.564 + 1.565 +// Extended version to allow spans of any non-tag text and spans of mixed script 1.566 +ScriptScanner::ScriptScanner(const char* buffer, 1.567 + int buffer_length, 1.568 + bool is_plain_text, 1.569 + bool any_text, 1.570 + bool any_script) 1.571 + : start_byte_(buffer), 1.572 + next_byte_(buffer), 1.573 + next_byte_limit_(buffer + buffer_length), 1.574 + byte_length_(buffer_length), 1.575 + is_plain_text_(is_plain_text), 1.576 + letters_marks_only_(!any_text), 1.577 + one_script_only_(!any_script), 1.578 + exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) { 1.579 + script_buffer_ = new char[kMaxScriptBuffer]; 1.580 + script_buffer_lower_ = new char[kMaxScriptLowerBuffer]; 1.581 + map2original_.Clear(); // map from script_buffer_ to buffer 1.582 + map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_ 1.583 +} 1.584 + 1.585 + 1.586 +ScriptScanner::~ScriptScanner() { 1.587 + delete[] script_buffer_; 1.588 + delete[] script_buffer_lower_; 1.589 +} 1.590 + 1.591 + 1.592 + 1.593 + 1.594 +// Get to the first real non-tag letter or entity that is a letter 1.595 +// Sets script of that letter 1.596 +// Return len if no more letters 1.597 +int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) { 1.598 + int sc = UNKNOWN_ULSCRIPT; 1.599 + int skip = 0; 1.600 + int tlen, plen; 1.601 + 1.602 + // Do run of non-letters (tag | &NL | NL)* 1.603 + tlen = 0; 1.604 + while (skip < len) { 1.605 + // Do fast scan to next interesting byte 1.606 + // int oldskip = skip; 1.607 + skip += ScanToLetterOrSpecial(src + skip, len - skip); 1.608 + 1.609 + // Check for no more letters/specials 1.610 + if (skip >= len) { 1.611 + // All done 1.612 + *script = sc; 1.613 + return len; 1.614 + } 1.615 + 1.616 + // We are at a letter, nonletter, tag, or entity 1.617 + if (IsSpecial(src[skip]) && !is_plain_text_) { 1.618 + if (src[skip] == '<') { 1.619 + // Begining of tag; skip to end and go around again 1.620 + tlen = ScanToPossibleLetter(src + skip, len - skip, 1.621 + exit_state_); 1.622 + sc = 0; 1.623 + } else if (src[skip] == '>') { 1.624 + // Unexpected end of tag; skip it and go around again 1.625 + tlen = 1; // Over the > 1.626 + sc = 0; 1.627 + } else if (src[skip] == '&') { 1.628 + // Expand entity, no advance 1.629 + char temp[4]; 1.630 + EntityToBuffer(src + skip, len - skip, 1.631 + temp, &tlen, &plen); 1.632 + sc = GetUTF8LetterScriptNum(temp); 1.633 + } 1.634 + } else { 1.635 + // Update 1..4 bytes 1.636 + tlen = UTF8OneCharLen(src + skip); 1.637 + sc = GetUTF8LetterScriptNum(src + skip); 1.638 + } 1.639 + if (sc != 0) {break;} // Letter found 1.640 + skip += tlen; // Else advance 1.641 + } 1.642 + 1.643 + *script = sc; 1.644 + return skip; 1.645 +} 1.646 + 1.647 + 1.648 +// These are for ASCII-only tag names 1.649 +// Compare one letter uplow to c, ignoring case of uplowp 1.650 +inline bool EqCase(char uplow, char c) { 1.651 + return (uplow | 0x20) == c; 1.652 +} 1.653 + 1.654 +// These are for ASCII-only tag names 1.655 +// Return true for space / < > etc. all less than 0x40 1.656 +inline bool NeqLetter(char c) { 1.657 + return c < 0x40; 1.658 +} 1.659 + 1.660 +// These are for ASCII-only tag names 1.661 +// Return true for space \n false for \r 1.662 +inline bool WS(char c) { 1.663 + return (c == ' ') || (c == '\n'); 1.664 +} 1.665 + 1.666 +// Canonical CR or LF 1.667 +static const char LF = '\n'; 1.668 + 1.669 + 1.670 +// The naive loop scans from next_byte_ to script_buffer_ until full. 1.671 +// But this can leave an awkward hard-to-identify short fragment at the 1.672 +// end of the input. We would prefer to make the next-to-last fragment 1.673 +// shorter and the last fragment longer. 1.674 + 1.675 +// Copy next run of non-tag characters to buffer [NUL terminated] 1.676 +// This just replaces tags with space or \n and removes entities. 1.677 +// Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences 1.678 +// including \r or \n are replaced by \n. All other tags and skipped text 1.679 +// are replaced with ASCII space. 1.680 +// 1.681 +// Buffer ALWAYS has leading space and trailing space space space NUL 1.682 +bool ScriptScanner::GetOneTextSpan(LangSpan* span) { 1.683 + span->text = script_buffer_; 1.684 + span->text_bytes = 0; 1.685 + span->offset = next_byte_ - start_byte_; 1.686 + span->ulscript = UNKNOWN_ULSCRIPT; 1.687 + span->lang = UNKNOWN_LANGUAGE; 1.688 + span->truncated = false; 1.689 + 1.690 + int put_soft_limit = kMaxScriptBytes - kWithinScriptTail; 1.691 + if ((kMaxScriptBytes <= byte_length_) && 1.692 + (byte_length_ < (2 * kMaxScriptBytes))) { 1.693 + // Try to split the last two fragments in half 1.694 + put_soft_limit = byte_length_ / 2; 1.695 + } 1.696 + 1.697 + script_buffer_[0] = ' '; // Always a space at front of output 1.698 + script_buffer_[1] = '\0'; 1.699 + int take = 0; 1.700 + int put = 1; // Start after the initial space 1.701 + int tlen, plen; 1.702 + 1.703 + if (byte_length_ <= 0) { 1.704 + return false; // No more text to be found 1.705 + } 1.706 + 1.707 + // Go over alternating spans of text and tags, 1.708 + // copying letters to buffer with single spaces for each run of non-letters 1.709 + bool last_byte_was_space = false; 1.710 + while (take < byte_length_) { 1.711 + char c = next_byte_[take]; 1.712 + if (c == '\r') {c = LF;} // Canonical CR or LF 1.713 + if (c == '\n') {c = LF;} // Canonical CR or LF 1.714 + 1.715 + if (IsSpecial(c) && !is_plain_text_) { 1.716 + if (c == '<') { 1.717 + // Replace tag with space 1.718 + c = ' '; // for almost-full test below 1.719 + // or if <p> <br> <tr>, replace with \n 1.720 + if (take < (byte_length_ - 3)) { 1.721 + if (EqCase(next_byte_[take + 1], 'p') && 1.722 + NeqLetter(next_byte_[take + 2])) { 1.723 + c = LF; 1.724 + } 1.725 + if (EqCase(next_byte_[take + 1], 'b') && 1.726 + EqCase(next_byte_[take + 2], 'r') && 1.727 + NeqLetter(next_byte_[take + 3])) { 1.728 + c = LF; 1.729 + } 1.730 + if (EqCase(next_byte_[take + 1], 't') && 1.731 + EqCase(next_byte_[take + 2], 'r') && 1.732 + NeqLetter(next_byte_[take + 3])) { 1.733 + c = LF; 1.734 + } 1.735 + } 1.736 + // Begining of tag; skip to end and go around again 1.737 + tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take, 1.738 + exit_state_); 1.739 + // Copy one byte, compressing spaces 1.740 + if (!last_byte_was_space || !WS(c)) { 1.741 + script_buffer_[put++] = c; // Advance dest 1.742 + last_byte_was_space = WS(c); 1.743 + } 1.744 + } else if (c == '>') { 1.745 + // Unexpected end of tag; copy it and go around again 1.746 + tlen = 1; // Over the > 1.747 + script_buffer_[put++] = c; // Advance dest 1.748 + } else if (c == '&') { 1.749 + // Expand entity, no advance 1.750 + EntityToBuffer(next_byte_ + take, byte_length_ - take, 1.751 + script_buffer_ + put, &tlen, &plen); 1.752 + put += plen; // Advance dest 1.753 + } 1.754 + take += tlen; // Advance source 1.755 + } else { 1.756 + // Copy one byte, compressing spaces 1.757 + if (!last_byte_was_space || !WS(c)) { 1.758 + script_buffer_[put++] = c; // Advance dest 1.759 + last_byte_was_space = WS(c); 1.760 + } 1.761 + ++take; // Advance source 1.762 + } 1.763 + 1.764 + if (WS(c) && 1.765 + (put >= put_soft_limit)) { 1.766 + // Buffer is almost full 1.767 + span->truncated = true; 1.768 + break; 1.769 + } 1.770 + if (put >= kMaxScriptBytes) { 1.771 + // Buffer is completely full 1.772 + span->truncated = true; 1.773 + break; 1.774 + } 1.775 + } 1.776 + 1.777 + // Almost done. Back up to a character boundary if needed 1.778 + while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) { 1.779 + // Back up over continuation byte 1.780 + --take; 1.781 + --put; 1.782 + } 1.783 + 1.784 + // Update input position 1.785 + next_byte_ += take; 1.786 + byte_length_ -= take; 1.787 + 1.788 + // Put four more spaces/NUL. Worst case is abcd _ _ _ \0 1.789 + // kMaxScriptBytes | | put 1.790 + script_buffer_[put + 0] = ' '; 1.791 + script_buffer_[put + 1] = ' '; 1.792 + script_buffer_[put + 2] = ' '; 1.793 + script_buffer_[put + 3] = '\0'; 1.794 + 1.795 + span->text_bytes = put; // Does not include the last four chars above 1.796 + return true; 1.797 +} 1.798 + 1.799 + 1.800 +// Copy next run of same-script non-tag letters to buffer [NUL terminated] 1.801 +// Buffer ALWAYS has leading space and trailing space space space NUL 1.802 +bool ScriptScanner::GetOneScriptSpan(LangSpan* span) { 1.803 + if (!letters_marks_only_) { 1.804 + // Return non-tag text, including punctuation and digits 1.805 + return GetOneTextSpan(span); 1.806 + } 1.807 + 1.808 + span->text = script_buffer_; 1.809 + span->text_bytes = 0; 1.810 + span->offset = next_byte_ - start_byte_; 1.811 + span->ulscript = UNKNOWN_ULSCRIPT; 1.812 + span->lang = UNKNOWN_LANGUAGE; 1.813 + span->truncated = false; 1.814 + 1.815 + // struct timeval script_start, script_mid, script_end; 1.816 + 1.817 + int put_soft_limit = kMaxScriptBytes - kWithinScriptTail; 1.818 + if ((kMaxScriptBytes <= byte_length_) && 1.819 + (byte_length_ < (2 * kMaxScriptBytes))) { 1.820 + // Try to split the last two fragments in half 1.821 + put_soft_limit = byte_length_ / 2; 1.822 + } 1.823 + 1.824 + 1.825 + int spanscript; // The script of this span 1.826 + int sc = UNKNOWN_ULSCRIPT; // The script of next character 1.827 + int tlen = 0; 1.828 + int plen = 0; 1.829 + 1.830 + script_buffer_[0] = ' '; // Always a space at front of output 1.831 + script_buffer_[1] = '\0'; 1.832 + int take = 0; 1.833 + int put = 1; // Start after the initial space 1.834 + 1.835 + // Build offsets from span->text back to start_byte_ + span->offset 1.836 + // This mapping reflects deletion of non-letters, expansion of 1.837 + // entities, etc. 1.838 + map2original_.Clear(); 1.839 + map2original_.Delete(span->offset); // So that MapBack(0) gives offset 1.840 + 1.841 + // Get to the first real non-tag letter or entity that is a letter 1.842 + int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript); 1.843 + next_byte_ += skip; 1.844 + byte_length_ -= skip; 1.845 + 1.846 + if (skip != 1) { 1.847 + map2original_.Delete(skip); 1.848 + map2original_.Insert(1); 1.849 + } else { 1.850 + map2original_.Copy(1); 1.851 + } 1.852 + if (byte_length_ <= 0) { 1.853 + map2original_.Reset(); 1.854 + return false; // No more letters to be found 1.855 + } 1.856 + 1.857 + // There is at least one letter, so we know the script for this span 1.858 + span->ulscript = (ULScript)spanscript; 1.859 + 1.860 + 1.861 + // Go over alternating spans of same-script letters and non-letters, 1.862 + // copying letters to buffer with single spaces for each run of non-letters 1.863 + while (take < byte_length_) { 1.864 + // Copy run of letters in same script (&LS | LS)* 1.865 + int letter_count = 0; // Keep track of word length 1.866 + bool need_break = false; 1.867 + 1.868 + while (take < byte_length_) { 1.869 + // We are at a letter, nonletter, tag, or entity 1.870 + if (IsSpecial(next_byte_[take]) && !is_plain_text_) { 1.871 + if (next_byte_[take] == '<') { 1.872 + // Begining of tag 1.873 + sc = 0; 1.874 + break; 1.875 + } else if (next_byte_[take] == '>') { 1.876 + // Unexpected end of tag 1.877 + sc = 0; 1.878 + break; 1.879 + } else if (next_byte_[take] == '&') { 1.880 + // Copy entity, no advance 1.881 + EntityToBuffer(next_byte_ + take, byte_length_ - take, 1.882 + script_buffer_ + put, &tlen, &plen); 1.883 + sc = GetUTF8LetterScriptNum(script_buffer_ + put); 1.884 + } 1.885 + } else { 1.886 + // Real letter, safely copy up to 4 bytes, increment by 1..4 1.887 + // Will update by 1..4 bytes at Advance, below 1.888 + tlen = plen = UTF8OneCharLen(next_byte_ + take); 1.889 + if (take < (byte_length_ - 3)) { 1.890 + // X86 fast case, does unaligned load/store 1.891 + UNALIGNED_STORE32(script_buffer_ + put, 1.892 + UNALIGNED_LOAD32(next_byte_ + take)); 1.893 + 1.894 + } else { 1.895 + // Slow case, happens 1-3 times per input document 1.896 + memcpy(script_buffer_ + put, next_byte_ + take, plen); 1.897 + } 1.898 + sc = GetUTF8LetterScriptNum(next_byte_ + take); 1.899 + } 1.900 + 1.901 + // Allow continue across a single letter in a different script: 1.902 + // A B D = three scripts, c = common script, i = inherited script, 1.903 + // - = don't care, ( = take position before the += below 1.904 + // AAA(A- continue 1.905 + // 1.906 + // AAA(BA continue 1.907 + // AAA(BB break 1.908 + // AAA(Bc continue (breaks after B) 1.909 + // AAA(BD break 1.910 + // AAA(Bi break 1.911 + // 1.912 + // AAA(c- break 1.913 + // 1.914 + // AAA(i- continue 1.915 + // 1.916 + 1.917 + if ((sc != spanscript) && (sc != ULScript_Inherited)) { 1.918 + // Might need to break this script span 1.919 + if (sc == ULScript_Common) { 1.920 + need_break = true; 1.921 + } else { 1.922 + // Look at next following character, ignoring entity as Common 1.923 + int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen); 1.924 + if ((sc2 != ULScript_Common) && (sc2 != spanscript)) { 1.925 + // We found a non-trivial change of script 1.926 + if (one_script_only_) { 1.927 + need_break = true; 1.928 + } 1.929 + } 1.930 + } 1.931 + } 1.932 + if (need_break) {break;} // Non-letter or letter in wrong script 1.933 + 1.934 + take += tlen; // Advance 1.935 + put += plen; // Advance 1.936 + 1.937 + // Update the offset map to reflect take/put lengths 1.938 + if (tlen == plen) { 1.939 + map2original_.Copy(tlen); 1.940 + } else if (tlen < plen) { 1.941 + map2original_.Copy(tlen); 1.942 + map2original_.Insert(plen - tlen); 1.943 + } else { // plen < tlen 1.944 + map2original_.Copy(plen); 1.945 + map2original_.Delete(tlen - plen); 1.946 + } 1.947 + 1.948 + ++letter_count; 1.949 + if (put >= kMaxScriptBytes) { 1.950 + // Buffer is full 1.951 + span->truncated = true; 1.952 + break; 1.953 + } 1.954 + } // End while letters 1.955 + 1.956 + // Do run of non-letters (tag | &NL | NL)* 1.957 + while (take < byte_length_) { 1.958 + // Do fast scan to next interesting byte 1.959 + tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take); 1.960 + take += tlen; 1.961 + map2original_.Delete(tlen); 1.962 + if (take >= byte_length_) {break;} // Might have scanned to end 1.963 + 1.964 + // We are at a letter, nonletter, tag, or entity 1.965 + if (IsSpecial(next_byte_[take]) && !is_plain_text_) { 1.966 + if (next_byte_[take] == '<') { 1.967 + // Begining of tag; skip to end and go around again 1.968 + tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take, 1.969 + exit_state_); 1.970 + sc = 0; 1.971 + } else if (next_byte_[take] == '>') { 1.972 + // Unexpected end of tag; skip it and go around again 1.973 + tlen = 1; // Over the > 1.974 + sc = 0; 1.975 + } else if (next_byte_[take] == '&') { 1.976 + // Expand entity, no advance 1.977 + EntityToBuffer(next_byte_ + take, byte_length_ - take, 1.978 + script_buffer_ + put, &tlen, &plen); 1.979 + sc = GetUTF8LetterScriptNum(script_buffer_ + put); 1.980 + } 1.981 + } else { 1.982 + // Update 1..4 1.983 + tlen = UTF8OneCharLen(next_byte_ + take); 1.984 + sc = GetUTF8LetterScriptNum(next_byte_ + take); 1.985 + } 1.986 + if (sc != 0) {break;} // Letter found 1.987 + take += tlen; // Else advance 1.988 + map2original_.Delete(tlen); 1.989 + } // End while not-letters 1.990 + 1.991 + script_buffer_[put++] = ' '; 1.992 + map2original_.Insert(1); 1.993 + 1.994 + // Letter in wrong script ? 1.995 + if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;} 1.996 + if (put >= put_soft_limit) { 1.997 + // Buffer is almost full 1.998 + span->truncated = true; 1.999 + break; 1.1000 + } 1.1001 + } 1.1002 + 1.1003 + // Almost done. Back up to a character boundary if needed 1.1004 + while ((0 < take) && (take < byte_length_) && 1.1005 + ((next_byte_[take] & 0xc0) == 0x80)) { 1.1006 + // Back up over continuation byte 1.1007 + --take; 1.1008 + --put; 1.1009 + } 1.1010 + 1.1011 + // Update input position 1.1012 + next_byte_ += take; 1.1013 + byte_length_ -= take; 1.1014 + 1.1015 + // Put four more spaces/NUL. Worst case is abcd _ _ _ \0 1.1016 + // kMaxScriptBytes | | put 1.1017 + script_buffer_[put + 0] = ' '; 1.1018 + script_buffer_[put + 1] = ' '; 1.1019 + script_buffer_[put + 2] = ' '; 1.1020 + script_buffer_[put + 3] = '\0'; 1.1021 + map2original_.Insert(4); 1.1022 + map2original_.Reset(); 1.1023 + 1.1024 + span->text_bytes = put; // Does not include the last four chars above 1.1025 + return true; 1.1026 +} 1.1027 + 1.1028 +// Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase 1.1029 +// List changes with each version of Unicode, so just always lowercase 1.1030 +// Unicode 6.2.0: 1.1031 +// ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN 1.1032 +void ScriptScanner::LowerScriptSpan(LangSpan* span) { 1.1033 + // If needed, lowercase all the text. If we do it sooner, might miss 1.1034 + // lowercasing an entity such as Á 1.1035 + // We only need to do this for Latn and Cyrl scripts 1.1036 + map2uplow_.Clear(); 1.1037 + // Full Unicode lowercase of the entire buffer, including 1.1038 + // four pad bytes off the end. 1.1039 + // Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad 1.1040 + // bytes and put the 0x00 in explicitly. 1.1041 + // Build an offset map from script_buffer_lower_ back to script_buffer_ 1.1042 + int consumed, filled, changed; 1.1043 + StringPiece istr(span->text, span->text_bytes + 3); 1.1044 + StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer); 1.1045 + 1.1046 + UTF8GenericReplace(&utf8repl_lettermarklower_obj, 1.1047 + istr, ostr, is_plain_text_, 1.1048 + &consumed, &filled, &changed, &map2uplow_); 1.1049 + script_buffer_lower_[filled] = '\0'; 1.1050 + span->text = script_buffer_lower_; 1.1051 + span->text_bytes = filled - 3; 1.1052 + map2uplow_.Reset(); 1.1053 +} 1.1054 + 1.1055 +// Copy next run of same-script non-tag letters to buffer [NUL terminated] 1.1056 +// Force Latin, Cyrillic, Greek scripts to be lowercase 1.1057 +// Buffer ALWAYS has leading space and trailing space space space NUL 1.1058 +bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) { 1.1059 + bool ok = GetOneScriptSpan(span); 1.1060 + LowerScriptSpan(span); 1.1061 + return ok; 1.1062 +} 1.1063 + 1.1064 + 1.1065 +// Maps byte offset in most recent GetOneScriptSpan/Lower 1.1066 +// span->text [0..text_bytes] into an additional byte offset from 1.1067 +// span->offset, to get back to corresponding text in the original 1.1068 +// input buffer. 1.1069 +// text_offset must be the first byte 1.1070 +// of a UTF-8 character, or just beyond the last character. Normally this 1.1071 +// routine is called with the first byte of an interesting range and 1.1072 +// again with the first byte of the following range. 1.1073 +int ScriptScanner::MapBack(int text_offset) { 1.1074 + return map2original_.MapBack(map2uplow_.MapBack(text_offset)); 1.1075 +} 1.1076 + 1.1077 + 1.1078 +// Gets lscript number for letters; always returns 1.1079 +// 0 (common script) for non-letters 1.1080 +int GetUTF8LetterScriptNum(const char* src) { 1.1081 + int srclen = UTF8OneCharLen(src); 1.1082 + const uint8* usrc = reinterpret_cast<const uint8*>(src); 1.1083 + return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj, 1.1084 + &usrc, &srclen); 1.1085 +} 1.1086 + 1.1087 +} // namespace CLD2 1.1088 + 1.1089 +