browser/components/translation/cld2/internal/getonescriptspan.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // Author: dsites@google.com (Dick Sites)
michael@0 17 //
michael@0 18
michael@0 19
michael@0 20 #include "getonescriptspan.h"
michael@0 21 #include <string.h>
michael@0 22
michael@0 23 #include "fixunicodevalue.h"
michael@0 24 #include "lang_script.h"
michael@0 25 #include "port.h"
michael@0 26 #include "utf8statetable.h"
michael@0 27
michael@0 28 #include "utf8prop_lettermarkscriptnum.h"
michael@0 29 #include "utf8repl_lettermarklower.h"
michael@0 30 #include "utf8scannot_lettermarkspecial.h"
michael@0 31
michael@0 32
michael@0 33 namespace CLD2 {
michael@0 34
michael@0 35 // Alphabetical order for binary search, from
michael@0 36 // generated_entities.cc
michael@0 37 extern const int kNameToEntitySize;
michael@0 38 extern const CharIntPair kNameToEntity[];
michael@0 39
michael@0 40 static const int kMaxUpToWordBoundary = 50; // A span shorter than this is made longer;
michael@0 41 // otherwise it is made shorter
michael@0 42 static const int kMaxAdvanceToWordBoundary = 10; // Move at most this many bytes (+/-)
michael@0 43 // when rounding to a word boundary;
michael@0 44 // the direction is chosen as described above
michael@0 45
michael@0 46 static const char kSpecialSymbol[256] = { // Indexed by byte value; nonzero only for '&' (0x26), '<' (0x3C) and '>' (0x3E)
michael@0 47 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 48 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
michael@0 49 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 50 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 51
michael@0 52 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 53 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 54 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 55 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 56 };
michael@0 57
michael@0 58
michael@0 59
michael@0 60 #define LT 0 // <
michael@0 61 #define GT 1 // >
michael@0 62 #define EX 2 // !
michael@0 63 #define HY 3 // -
michael@0 64 #define QU 4 // "
michael@0 65 #define AP 5 // '
michael@0 66 #define SL 6 // /
michael@0 67 #define S_ 7 // S or s
michael@0 68 #define C_ 8 // C or c
michael@0 69 #define R_ 9 // R or r
michael@0 70 #define I_ 10 // I or i
michael@0 71 #define P_ 11 // P or p
michael@0 72 #define T_ 12 // T or t
michael@0 73 #define Y_ 13 // Y or y
michael@0 74 #define L_ 14 // L or l
michael@0 75 #define E_ 15 // E or e
michael@0 76 #define CR 16 // <cr> or <lf> (bytes 0x0D and 0x0A)
michael@0 77 #define NL 17 // non-letter: ASCII whitespace, digit, punctuation; also bytes 0x80-0xBF
michael@0 78 #define PL 18 // possible letter: remaining ASCII letters, '&', '@', '`', and bytes 0xC0-0xFF
michael@0 79 #define xx 19 // <unused> category; never produced by the table below
michael@0 80
michael@0 81 // Maps each byte value to one of the 20 categories above for cheap tag parsing.
michael@0 82 static const uint8 kCharToSub[256] = {
michael@0 83 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
michael@0 84 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
michael@0 85 NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
michael@0 86 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
michael@0 87
michael@0 88 PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
michael@0 89 P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
michael@0 90 PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
michael@0 91 P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
michael@0 92
michael@0 93 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
michael@0 94 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
michael@0 95 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
michael@0 96 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
michael@0 97
michael@0 98 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
michael@0 99 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
michael@0 100 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
michael@0 101 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
michael@0 102 };
michael@0 103
michael@0 104 #undef LT
michael@0 105 #undef GT
michael@0 106 #undef EX
michael@0 107 #undef HY
michael@0 108 #undef QU
michael@0 109 #undef AP
michael@0 110 #undef SL
michael@0 111 #undef S_
michael@0 112 #undef C_
michael@0 113 #undef R_
michael@0 114 #undef I_
michael@0 115 #undef P_
michael@0 116 #undef T_
michael@0 117 #undef Y_
michael@0 118 #undef L_
michael@0 119 #undef E_
michael@0 120 #undef CR
michael@0 121 #undef NL
michael@0 122 #undef PL
michael@0 123 #undef xx
michael@0 124
michael@0 125
michael@0 126 #define OK 0 // state [0]: normal exit
michael@0 127 #define X_ 1 // state [1]: error exit
michael@0 128
michael@0 129
michael@0 130 static const int kMaxExitStateLettersMarksOnly = 1; // scanner stops on reaching state 0 or 1
michael@0 131 static const int kMaxExitStateAllText = 2; // scanner also stops on reaching state 2
michael@0 132
michael@0 133
michael@0 134 // State machine that cheaply parses non-letter strings, including tags.
michael@0 135 // advances <tag>
michael@0 136 // | |
michael@0 137 // advances <tag> ... </tag> for <script> <style>
michael@0 138 // | |
michael@0 139 // advances <!-- ... <tag> ... -->
michael@0 140 // | |
michael@0 141 // advances <tag
michael@0 142 // || (0)
michael@0 143 // advances <tag <tag2>
michael@0 144 // || (0)
michael@0 145 //
michael@0 146 // We start in state [0] at a non-letter and make at least one transition.
michael@0 147 // When scanning for just letters, arriving back at state [0] or [1] exits
michael@0 148 // the state machine.
michael@0 149 // When scanning for any non-tag text, arriving at state [2] also exits.
michael@0 150 static const uint8 kTagParseTbl_0[] = { // 40 states x 20 byte categories; the row for state s starts at index s * 20
michael@0 151 // < > ! - " ' / S C R I P T Y L E CR NL PL xx
michael@0 152 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK exit state
michael@0 153 X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state
michael@0 154 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL* [exit state]
michael@0 155 X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] <
michael@0 156 X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <!
michael@0 157 X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!-
michael@0 158 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.*
michael@0 159 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*-
michael@0 160 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*--
michael@0 161 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.*
michael@0 162 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
michael@0 163 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
michael@0 164 X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
michael@0 165
michael@0 166 // < > ! - " ' / S C R I P T Y L E CR NL PL xx
michael@0 167 X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S
michael@0 168 X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC
michael@0 169 X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR
michael@0 170 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI
michael@0 171 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP
michael@0 172 X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
michael@0 173 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
michael@0 174 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
michael@0 175 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF
michael@0 176 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
michael@0 177 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
michael@0 178 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
michael@0 179 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
michael@0 180 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
michael@0 181 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
michael@0 182
michael@0 183 // < > ! - " ' / S C R I P T Y L E CR NL PL xx
michael@0 184 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST
michael@0 185 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY
michael@0 186 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL
michael@0 187 X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
michael@0 188 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
michael@0 189 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
michael@0 190 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF
michael@0 191 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
michael@0 192 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
michael@0 193 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
michael@0 194 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
michael@0 195 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
michael@0 196 };
michael@0 197
michael@0 198 #undef OK
michael@0 199 #undef X_
michael@0 200
michael@0 201 enum
michael@0 202 {
michael@0 203 UTFmax = 4, // maximum bytes per rune
michael@0 204 Runesync = 0x80, // byte values below this cannot be part of a multi-byte UTF sequence
michael@0 205 Runeself = 0x80, // rune values below this are encoded as themselves (one byte)
michael@0 206 Runeerror = 0xFFFD, // substituted by runetochar for values above Runemax
michael@0 207 Runemax = 0x10FFFF, // maximum rune value
michael@0 208 };
michael@0 209
michael@0 210 // Debugging. Not thread safe.
michael@0 211 static char gDisplayPiece[32];
michael@0 212 const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4};
michael@0 213 char* DisplayPiece(const char* next_byte_, int byte_length_) {
michael@0 214 // Copy up to 8 UTF-8 chars to buffer
michael@0 215 int k = 0; // byte count
michael@0 216 int n = 0; // character count
michael@0 217 for (int i = 0; i < byte_length_; ++i) {
michael@0 218 char c = next_byte_[i];
michael@0 219 if ((c & 0xc0) != 0x80) {
michael@0 220 // Beginning of a UTF-8 character
michael@0 221 int charlen = gCharlen[static_cast<uint8>(c) >> 4];
michael@0 222 if (i + charlen > byte_length_) {break;} // Not enough room for full char
michael@0 223 if (k >= (32 - 7)) {break;} // Not necessarily enough room
michael@0 224 if (n >= 8) {break;} // Enough characters already
michael@0 225 ++n;
michael@0 226 }
michael@0 227 if (c == '<') {
michael@0 228 memcpy(&gDisplayPiece[k], "&lt;", 4); k += 4;
michael@0 229 } else if (c == '>') {
michael@0 230 memcpy(&gDisplayPiece[k], "&gt;", 4); k += 4;
michael@0 231 } else if (c == '&') {
michael@0 232 memcpy(&gDisplayPiece[k], "&amp;", 5); k += 5;
michael@0 233 } else if (c == '\'') {
michael@0 234 memcpy(&gDisplayPiece[k], "&apos;", 6); k += 6;
michael@0 235 } else if (c == '"') {
michael@0 236 memcpy(&gDisplayPiece[k], "&quot;", 6); k += 6;
michael@0 237 } else {
michael@0 238 gDisplayPiece[k++] = c;
michael@0 239 }
michael@0 240 }
michael@0 241 gDisplayPiece[k++] = '\0';
michael@0 242 return gDisplayPiece;
michael@0 243 }
michael@0 244
michael@0 245
michael@0 246
michael@0 247 // runetochar copies (encodes) one rune, pointed to by r, to at most
michael@0 248 // UTFmax bytes starting at s and returns the number of bytes generated.
michael@0 249 int runetochar(char *str, const char32 *rune) {
michael@0 250 // Convert to unsigned for range check.
michael@0 251 unsigned long c;
michael@0 252
michael@0 253 // 1 char 00-7F
michael@0 254 c = *rune;
michael@0 255 if(c <= 0x7F) {
michael@0 256 str[0] = c;
michael@0 257 return 1;
michael@0 258 }
michael@0 259
michael@0 260 // 2 char 0080-07FF
michael@0 261 if(c <= 0x07FF) {
michael@0 262 str[0] = 0xC0 | (c >> 1*6);
michael@0 263 str[1] = 0x80 | (c & 0x3F);
michael@0 264 return 2;
michael@0 265 }
michael@0 266
michael@0 267 // Range check
michael@0 268 if (c > Runemax) {
michael@0 269 c = Runeerror;
michael@0 270 }
michael@0 271
michael@0 272 // 3 char 0800-FFFF
michael@0 273 if (c <= 0xFFFF) {
michael@0 274 str[0] = 0xE0 | (c >> 2*6);
michael@0 275 str[1] = 0x80 | ((c >> 1*6) & 0x3F);
michael@0 276 str[2] = 0x80 | (c & 0x3F);
michael@0 277 return 3;
michael@0 278 }
michael@0 279
michael@0 280 // 4 char 10000-1FFFFF
michael@0 281 str[0] = 0xF0 | (c >> 3*6);
michael@0 282 str[1] = 0x80 | ((c >> 2*6) & 0x3F);
michael@0 283 str[2] = 0x80 | ((c >> 1*6) & 0x3F);
michael@0 284 str[3] = 0x80 | (c & 0x3F);
michael@0 285 return 4;
michael@0 286 }
michael@0 287
michael@0 288
michael@0 289
michael@0 290 // Useful for converting an entity to an ascii value.
michael@0 291 // RETURNS unicode value, or -1 if entity isn't valid. Don't include & or ;
michael@0 292 int LookupEntity(const char* entity_name, int entity_len) {
michael@0 293 // Make a C string
michael@0 294 if (entity_len >= 16) {return -1;} // All real entities are shorter
michael@0 295 char temp[16];
michael@0 296 memcpy(temp, entity_name, entity_len);
michael@0 297 temp[entity_len] = '\0';
michael@0 298 int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity);
michael@0 299 if (match >= 0) {return kNameToEntity[match].i;}
michael@0 300 return -1;
michael@0 301 }
michael@0 302
// True only for the ASCII decimal digits '0'..'9'; locale independent.
bool ascii_isdigit(char c) {
  // Unsigned subtraction wraps for anything below '0', so one compare suffices.
  const unsigned offset = static_cast<unsigned>(c) - static_cast<unsigned>('0');
  return offset < 10u;
}
// True only for ASCII hexadecimal digits: 0-9, a-f, A-F.
bool ascii_isxdigit(char c) {
  const bool is_decimal = (c >= '0') && (c <= '9');
  // Setting bit 0x20 maps 'A'..'F' onto 'a'..'f' and nothing else into it.
  const int folded = c | 0x20;
  const bool is_hex_letter = (folded >= 'a') && (folded <= 'f');
  return is_decimal || is_hex_letter;
}
// True only for ASCII letters and decimal digits.
bool ascii_isalnum(char c) {
  if ((c >= '0') && (c <= '9')) {
    return true;
  }
  // Setting bit 0x20 maps 'A'..'Z' onto 'a'..'z' and nothing else into it.
  const int folded = c | 0x20;
  return (folded >= 'a') && (folded <= 'z');
}
// Returns the numeric value (0..15) of an ASCII hex digit.
// Any character that is not a hex digit yields 0.
int hex_digit_to_int(char c) {
  if ((c >= '0') && (c <= '9')) {
    return c - '0';
  }
  // Fold upper case onto lower case, then handle both with one range test.
  const int folded = c | 0x20;
  if ((folded >= 'a') && (folded <= 'f')) {
    return folded - 'a' + 10;
  }
  return 0;
}
michael@0 324
michael@0 325 static int32 strto32_base10(const char* nptr, const char* limit,
michael@0 326 const char **endptr) {
michael@0 327 *endptr = nptr;
michael@0 328 while (nptr < limit && *nptr == '0') {
michael@0 329 ++nptr;
michael@0 330 }
michael@0 331 if (nptr == limit || !ascii_isdigit(*nptr))
michael@0 332 return -1;
michael@0 333 const char* end_digits_run = nptr;
michael@0 334 while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) {
michael@0 335 ++end_digits_run;
michael@0 336 }
michael@0 337 *endptr = end_digits_run;
michael@0 338 const int num_digits = end_digits_run - nptr;
michael@0 339 // kint32max == 2147483647.
michael@0 340 if (num_digits < 9 ||
michael@0 341 (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) {
michael@0 342 int value = 0;
michael@0 343 for (; nptr < end_digits_run; ++nptr) {
michael@0 344 value *= 10;
michael@0 345 value += *nptr - '0';
michael@0 346 }
michael@0 347 // Overflow past the last valid unicode codepoint
michael@0 348 // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
michael@0 349 return FixUnicodeValue(value);
michael@0 350 } else {
michael@0 351 // Overflow: can't fit in an int32;
michael@0 352 // returns the replacement character 0xFFFD.
michael@0 353 return 0xFFFD;
michael@0 354 }
michael@0 355 }
michael@0 356
michael@0 357 static int32 strto32_base16(const char* nptr, const char* limit,
michael@0 358 const char **endptr) {
michael@0 359 *endptr = nptr;
michael@0 360 while (nptr < limit && *nptr == '0') {
michael@0 361 ++nptr;
michael@0 362 }
michael@0 363 if (nptr == limit || !ascii_isxdigit(*nptr)) {
michael@0 364 return -1;
michael@0 365 }
michael@0 366 const char* end_xdigits_run = nptr;
michael@0 367 while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) {
michael@0 368 ++end_xdigits_run;
michael@0 369 }
michael@0 370 *endptr = end_xdigits_run;
michael@0 371 const int num_xdigits = end_xdigits_run - nptr;
michael@0 372 // kint32max == 0x7FFFFFFF.
michael@0 373 if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) {
michael@0 374 int value = 0;
michael@0 375 for (; nptr < end_xdigits_run; ++nptr) {
michael@0 376 value <<= 4;
michael@0 377 value += hex_digit_to_int(*nptr);
michael@0 378 }
michael@0 379 // Overflow past the last valid unicode codepoint
michael@0 380 // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
michael@0 381 return FixUnicodeValue(value);
michael@0 382 } else {
michael@0 383 // Overflow: can't fit in an int32;
michael@0 384 // returns the replacement character 0xFFFD.
michael@0 385 return 0xFFFD;
michael@0 386 }
michael@0 387 }
michael@0 388
michael@0 389 // Unescape the current character pointed to by src. SETS the number
michael@0 390 // of chars read for the conversion (in UTF8). If src isn't a valid entity,
michael@0 391 // just consume the & and RETURN -1. If src doesn't point to & -- which it
michael@0 392 // should -- set src_consumed to 0 and RETURN -1.
michael@0 393 int ReadEntity(const char* src, int srcn, int* src_consumed) {
michael@0 394 const char* const srcend = src + srcn;
michael@0 395
michael@0 396 if (srcn == 0 || *src != '&') { // input should start with an ampersand
michael@0 397 *src_consumed = 0;
michael@0 398 return -1;
michael@0 399 }
michael@0 400 *src_consumed = 1; // we'll get the & at least
michael@0 401
michael@0 402 // The standards are a bit unclear on when an entity ends. Certainly a ";"
michael@0 403 // ends one, but spaces probably do too. We follow the lead of both IE and
michael@0 404 // Netscape, which as far as we can tell end numeric entities (1st case below)
michael@0 405 // at any non-digit, and end character entities (2nd case) at any non-alnum.
michael@0 406 const char* entstart, *entend; // where the entity starts and ends
michael@0 407 entstart = src + 1; // read past the &
michael@0 408 int entval; // UCS2 value of the entity
michael@0 409 if ( *entstart == '#' ) { // -- 1st case: numeric entity
michael@0 410 if ( entstart + 2 >= srcend ) {
michael@0 411 return -1; // no way a legitimate number could fit
michael@0 412 } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) { // hex numeric
michael@0 413 entval = strto32_base16(entstart + 2, srcend, &entend);
michael@0 414 } else { // decimal numeric entity
michael@0 415 entval = strto32_base10(entstart+1, srcend, &entend);
michael@0 416 }
michael@0 417 if (entval == -1 || entend > srcend) {
michael@0 418 return -1; // not entirely correct, but close enough
michael@0 419 }
michael@0 420 } else { // -- 2nd case: character entity
michael@0 421 for (entend = entstart;
michael@0 422 entend < srcend && ascii_isalnum(*entend);
michael@0 423 ++entend ) {
michael@0 424 // entity consists of alphanumeric chars
michael@0 425 }
michael@0 426 entval = LookupEntity(entstart, entend - entstart);
michael@0 427 if (entval < 0) {
michael@0 428 return -1; // not a legal entity name
michael@0 429 }
michael@0 430 // Now we do a strange-seeming IE6-compatibility check: if entval is
michael@0 431 // >= 256, it *must* be followed by a semicolon or it's not considered
michael@0 432 // an entity. The problem is lots of the newfangled entity names, like
michael@0 433 // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en".
michael@0 434 // When these links are written in HTML, it would be really bad if the
michael@0 435 // "&lang" were treated as an entity, which is what the spec says
michael@0 436 // *should* happen (even when the HTML is inside an "A HREF" tag!)
michael@0 437 // IE ignores the spec for these new, high-value entities, so we do too.
michael@0 438 if ( entval >= 256 && !(entend < srcend && *entend == ';') ) {
michael@0 439 return -1; // make non-;-terminated entity illegal
michael@0 440 }
michael@0 441 }
michael@0 442
michael@0 443 // Finally, figure out how much src was consumed
michael@0 444 if ( entend < srcend && *entend == ';' ) {
michael@0 445 entend++; // standard says ; terminator is special
michael@0 446 }
michael@0 447 *src_consumed = entend - src;
michael@0 448 return entval;
michael@0 449 }
michael@0 450
michael@0 451
michael@0 452 // Src points to '&'
michael@0 453 // Writes entity value to dst. Returns take(src), put(dst) byte counts
michael@0 454 void EntityToBuffer(const char* src, int len, char* dst,
michael@0 455 int* tlen, int* plen) {
michael@0 456 char32 entval = ReadEntity(src, len, tlen);
michael@0 457
michael@0 458 // ReadEntity does this already: entval = FixUnicodeValue(entval);
michael@0 459
michael@0 460 // Convert UTF-32 to UTF-8
michael@0 461 if (entval > 0) {
michael@0 462 *plen = runetochar(dst, &entval);
michael@0 463 } else {
michael@0 464 // Illegal entity; ignore the '&'
michael@0 465 *tlen = 1;
michael@0 466 *plen = 0;
michael@0 467 }
michael@0 468 }
michael@0 469
michael@0 470 // Returns true if character is < > or &, none of which are letters
michael@0 471 bool inline IsSpecial(char c) {
michael@0 472 if ((c & 0xe0) == 0x20) {
michael@0 473 return kSpecialSymbol[static_cast<uint8>(c)];
michael@0 474 }
michael@0 475 return false;
michael@0 476 }
michael@0 477
michael@0 478 // Quick Skip to next letter or < > & or to end of string (eos)
michael@0 479 // Always return is_letter for eos
michael@0 480 int ScanToLetterOrSpecial(const char* src, int len) {
michael@0 481 int bytes_consumed;
michael@0 482 StringPiece str(src, len);
michael@0 483 UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed);
michael@0 484 return bytes_consumed;
michael@0 485 }
michael@0 486
michael@0 487
michael@0 488
michael@0 489
michael@0 490 // src points to non-letter, such as tag-opening '<'
michael@0 491 // Return length from here to next possible letter
michael@0 492 // On another < before >, return 1
michael@0 493 // advances <tag>
michael@0 494 // | |
michael@0 495 // advances <tag> ... </tag> for <script> <style>
michael@0 496 // | |
michael@0 497 // advances <!-- ... <tag> ... -->
michael@0 498 // | |
michael@0 499 // advances <tag
michael@0 500 // | | end of string
michael@0 501 // advances <tag <tag2>
michael@0 502 // ||
michael@0 503 int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) {
michael@0 504 const uint8* src = reinterpret_cast<const uint8*>(isrc);
michael@0 505 const uint8* srclimit = src + len;
michael@0 506 const uint8* tagParseTbl = kTagParseTbl_0;
michael@0 507 int e = 0;
michael@0 508 while (src < srclimit) {
michael@0 509 e = tagParseTbl[kCharToSub[*src++]];
michael@0 510 if (e <= max_exit_state) {
michael@0 511 // We overshot by one byte
michael@0 512 --src;
michael@0 513 break;
michael@0 514 }
michael@0 515 tagParseTbl = &kTagParseTbl_0[e * 20];
michael@0 516 }
michael@0 517
michael@0 518 if (src >= srclimit) {
michael@0 519 // We fell off the end of the text.
michael@0 520 // It looks like the most common case for this is a truncated file, not
michael@0 521 // mismatched angle brackets. So we pretend that the last char was '>'
michael@0 522 return len;
michael@0 523 }
michael@0 524
michael@0 525 // OK to be in state 0 or state 2 at exit
michael@0 526 if ((e != 0) && (e != 2)) {
michael@0 527 // Error, '<' followed by '<'
michael@0 528 // We want to back up to first <, then advance by one byte past it
michael@0 529 int offset = src - reinterpret_cast<const uint8*>(isrc);
michael@0 530
michael@0 531 // Backscan to first '<' and return enough length to just get past it
michael@0 532 --offset; // back up over the second '<', which caused us to stop
michael@0 533 while ((0 < offset) && (isrc[offset] != '<')) {
michael@0 534 // Find the first '<', which is unmatched
michael@0 535 --offset;
michael@0 536 }
michael@0 537 // skip to just beyond first '<'
michael@0 538 return offset + 1;
michael@0 539 }
michael@0 540
michael@0 541 return src - reinterpret_cast<const uint8*>(isrc);
michael@0 542 }
michael@0 543
michael@0 544
michael@0 545 ScriptScanner::ScriptScanner(const char* buffer,
michael@0 546 int buffer_length,
michael@0 547 bool is_plain_text)
michael@0 548 : start_byte_(buffer),
michael@0 549 next_byte_(buffer),
michael@0 550 next_byte_limit_(buffer + buffer_length),
michael@0 551 byte_length_(buffer_length),
michael@0 552 is_plain_text_(is_plain_text),
michael@0 553 letters_marks_only_(true),
michael@0 554 one_script_only_(true),
michael@0 555 exit_state_(kMaxExitStateLettersMarksOnly) {
michael@0 556 script_buffer_ = new char[kMaxScriptBuffer];
michael@0 557 script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
michael@0 558 map2original_.Clear(); // map from script_buffer_ to buffer
michael@0 559 map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_
michael@0 560 }
michael@0 561
michael@0 562 // Extended version to allow spans of any non-tag text and spans of mixed script
michael@0 563 ScriptScanner::ScriptScanner(const char* buffer,
michael@0 564 int buffer_length,
michael@0 565 bool is_plain_text,
michael@0 566 bool any_text,
michael@0 567 bool any_script)
michael@0 568 : start_byte_(buffer),
michael@0 569 next_byte_(buffer),
michael@0 570 next_byte_limit_(buffer + buffer_length),
michael@0 571 byte_length_(buffer_length),
michael@0 572 is_plain_text_(is_plain_text),
michael@0 573 letters_marks_only_(!any_text),
michael@0 574 one_script_only_(!any_script),
michael@0 575 exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) {
michael@0 576 script_buffer_ = new char[kMaxScriptBuffer];
michael@0 577 script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
michael@0 578 map2original_.Clear(); // map from script_buffer_ to buffer
michael@0 579 map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_
michael@0 580 }
michael@0 581
michael@0 582
michael@0 583 ScriptScanner::~ScriptScanner() {
michael@0 584 delete[] script_buffer_;
michael@0 585 delete[] script_buffer_lower_;
michael@0 586 }
michael@0 587
michael@0 588
michael@0 589
michael@0 590
michael@0 591 // Get to the first real non-tag letter or entity that is a letter
michael@0 592 // Sets script of that letter
michael@0 593 // Return len if no more letters
michael@0 594 int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
michael@0 595 int sc = UNKNOWN_ULSCRIPT;
michael@0 596 int skip = 0;
michael@0 597 int tlen, plen;
michael@0 598
michael@0 599 // Do run of non-letters (tag | &NL | NL)*
michael@0 600 tlen = 0;
michael@0 601 while (skip < len) {
michael@0 602 // Do fast scan to next interesting byte
michael@0 603 // int oldskip = skip;
michael@0 604 skip += ScanToLetterOrSpecial(src + skip, len - skip);
michael@0 605
michael@0 606 // Check for no more letters/specials
michael@0 607 if (skip >= len) {
michael@0 608 // All done
michael@0 609 *script = sc;
michael@0 610 return len;
michael@0 611 }
michael@0 612
michael@0 613 // We are at a letter, nonletter, tag, or entity
michael@0 614 if (IsSpecial(src[skip]) && !is_plain_text_) {
michael@0 615 if (src[skip] == '<') {
michael@0 616 // Begining of tag; skip to end and go around again
michael@0 617 tlen = ScanToPossibleLetter(src + skip, len - skip,
michael@0 618 exit_state_);
michael@0 619 sc = 0;
michael@0 620 } else if (src[skip] == '>') {
michael@0 621 // Unexpected end of tag; skip it and go around again
michael@0 622 tlen = 1; // Over the >
michael@0 623 sc = 0;
michael@0 624 } else if (src[skip] == '&') {
michael@0 625 // Expand entity, no advance
michael@0 626 char temp[4];
michael@0 627 EntityToBuffer(src + skip, len - skip,
michael@0 628 temp, &tlen, &plen);
michael@0 629 sc = GetUTF8LetterScriptNum(temp);
michael@0 630 }
michael@0 631 } else {
michael@0 632 // Update 1..4 bytes
michael@0 633 tlen = UTF8OneCharLen(src + skip);
michael@0 634 sc = GetUTF8LetterScriptNum(src + skip);
michael@0 635 }
michael@0 636 if (sc != 0) {break;} // Letter found
michael@0 637 skip += tlen; // Else advance
michael@0 638 }
michael@0 639
michael@0 640 *script = sc;
michael@0 641 return skip;
michael@0 642 }
michael@0 643
michael@0 644
// For ASCII-only tag names.
// Compares one letter uplow against c, ignoring the case of uplow.
// c itself is compared as given, so it must be the lower-case letter.
inline bool EqCase(char uplow, char c) {
  const int lowered = uplow | 0x20;   // sets the ASCII lower-case bit
  return lowered == c;
}
michael@0 650
// For ASCII-only tag names.
// Returns true for space / < > etc., i.e. every char value below 0x40.
inline bool NeqLetter(char c) {
  return !(c >= 0x40);
}
michael@0 656
// For ASCII-only tag names.
// Returns true for space and \n only; \r and everything else is false.
inline bool WS(char c) {
  switch (c) {
    case ' ':
    case '\n':
      return true;
    default:
      return false;
  }
}
michael@0 662
michael@0 663 // Canonical line break: GetOneTextSpan maps both CR and LF input bytes to this value
michael@0 664 static const char LF = '\n';
michael@0 665
michael@0 666
michael@0 667 // The naive loop scans from next_byte_ to script_buffer_ until full.
michael@0 668 // But this can leave an awkward hard-to-identify short fragment at the
michael@0 669 // end of the input. We would prefer to make the next-to-last fragment
michael@0 670 // shorter and the last fragment longer.
michael@0 671
michael@0 672 // Copy next run of non-tag characters to buffer [NUL terminated]
michael@0 673 // This just replaces tags with space or \n and removes entities.
michael@0 674 // Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences
michael@0 675 // including \r or \n are replaced by \n. All other tags and skipped text
michael@0 676 // are replaced with ASCII space.
michael@0 677 //
michael@0 678 // Buffer ALWAYS has leading space and trailing space space space NUL
michael@0 679 bool ScriptScanner::GetOneTextSpan(LangSpan* span) {
michael@0 680 span->text = script_buffer_;
michael@0 681 span->text_bytes = 0;
michael@0 682 span->offset = next_byte_ - start_byte_;
michael@0 683 span->ulscript = UNKNOWN_ULSCRIPT;
michael@0 684 span->lang = UNKNOWN_LANGUAGE;
michael@0 685 span->truncated = false;
michael@0 686
michael@0 687 int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
michael@0 688 if ((kMaxScriptBytes <= byte_length_) &&
michael@0 689 (byte_length_ < (2 * kMaxScriptBytes))) {
michael@0 690 // Try to split the last two fragments in half
michael@0 691 put_soft_limit = byte_length_ / 2;
michael@0 692 }
michael@0 693
michael@0 694 script_buffer_[0] = ' '; // Always a space at front of output
michael@0 695 script_buffer_[1] = '\0';
michael@0 696 int take = 0;
michael@0 697 int put = 1; // Start after the initial space
michael@0 698 int tlen, plen;
michael@0 699
michael@0 700 if (byte_length_ <= 0) {
michael@0 701 return false; // No more text to be found
michael@0 702 }
michael@0 703
michael@0 704 // Go over alternating spans of text and tags,
michael@0 705 // copying letters to buffer with single spaces for each run of non-letters
michael@0 706 bool last_byte_was_space = false;
michael@0 707 while (take < byte_length_) {
michael@0 708 char c = next_byte_[take];
michael@0 709 if (c == '\r') {c = LF;} // Canonical CR or LF
michael@0 710 if (c == '\n') {c = LF;} // Canonical CR or LF
michael@0 711
michael@0 712 if (IsSpecial(c) && !is_plain_text_) {
michael@0 713 if (c == '<') {
michael@0 714 // Replace tag with space
michael@0 715 c = ' '; // for almost-full test below
michael@0 716 // or if <p> <br> <tr>, replace with \n
michael@0 717 if (take < (byte_length_ - 3)) {
michael@0 718 if (EqCase(next_byte_[take + 1], 'p') &&
michael@0 719 NeqLetter(next_byte_[take + 2])) {
michael@0 720 c = LF;
michael@0 721 }
michael@0 722 if (EqCase(next_byte_[take + 1], 'b') &&
michael@0 723 EqCase(next_byte_[take + 2], 'r') &&
michael@0 724 NeqLetter(next_byte_[take + 3])) {
michael@0 725 c = LF;
michael@0 726 }
michael@0 727 if (EqCase(next_byte_[take + 1], 't') &&
michael@0 728 EqCase(next_byte_[take + 2], 'r') &&
michael@0 729 NeqLetter(next_byte_[take + 3])) {
michael@0 730 c = LF;
michael@0 731 }
michael@0 732 }
michael@0 733 // Begining of tag; skip to end and go around again
michael@0 734 tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
michael@0 735 exit_state_);
michael@0 736 // Copy one byte, compressing spaces
michael@0 737 if (!last_byte_was_space || !WS(c)) {
michael@0 738 script_buffer_[put++] = c; // Advance dest
michael@0 739 last_byte_was_space = WS(c);
michael@0 740 }
michael@0 741 } else if (c == '>') {
michael@0 742 // Unexpected end of tag; copy it and go around again
michael@0 743 tlen = 1; // Over the >
michael@0 744 script_buffer_[put++] = c; // Advance dest
michael@0 745 } else if (c == '&') {
michael@0 746 // Expand entity, no advance
michael@0 747 EntityToBuffer(next_byte_ + take, byte_length_ - take,
michael@0 748 script_buffer_ + put, &tlen, &plen);
michael@0 749 put += plen; // Advance dest
michael@0 750 }
michael@0 751 take += tlen; // Advance source
michael@0 752 } else {
michael@0 753 // Copy one byte, compressing spaces
michael@0 754 if (!last_byte_was_space || !WS(c)) {
michael@0 755 script_buffer_[put++] = c; // Advance dest
michael@0 756 last_byte_was_space = WS(c);
michael@0 757 }
michael@0 758 ++take; // Advance source
michael@0 759 }
michael@0 760
michael@0 761 if (WS(c) &&
michael@0 762 (put >= put_soft_limit)) {
michael@0 763 // Buffer is almost full
michael@0 764 span->truncated = true;
michael@0 765 break;
michael@0 766 }
michael@0 767 if (put >= kMaxScriptBytes) {
michael@0 768 // Buffer is completely full
michael@0 769 span->truncated = true;
michael@0 770 break;
michael@0 771 }
michael@0 772 }
michael@0 773
michael@0 774 // Almost done. Back up to a character boundary if needed
michael@0 775 while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) {
michael@0 776 // Back up over continuation byte
michael@0 777 --take;
michael@0 778 --put;
michael@0 779 }
michael@0 780
michael@0 781 // Update input position
michael@0 782 next_byte_ += take;
michael@0 783 byte_length_ -= take;
michael@0 784
michael@0 785 // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
michael@0 786 // kMaxScriptBytes | | put
michael@0 787 script_buffer_[put + 0] = ' ';
michael@0 788 script_buffer_[put + 1] = ' ';
michael@0 789 script_buffer_[put + 2] = ' ';
michael@0 790 script_buffer_[put + 3] = '\0';
michael@0 791
michael@0 792 span->text_bytes = put; // Does not include the last four chars above
michael@0 793 return true;
michael@0 794 }
michael@0 795
michael@0 796
michael@0 797 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
michael@0 798 // Buffer ALWAYS has leading space and trailing space space space NUL
michael@0 799 bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
michael@0 800 if (!letters_marks_only_) {
michael@0 801 // Return non-tag text, including punctuation and digits
michael@0 802 return GetOneTextSpan(span);
michael@0 803 }
michael@0 804
michael@0 805 span->text = script_buffer_;
michael@0 806 span->text_bytes = 0;
michael@0 807 span->offset = next_byte_ - start_byte_;
michael@0 808 span->ulscript = UNKNOWN_ULSCRIPT;
michael@0 809 span->lang = UNKNOWN_LANGUAGE;
michael@0 810 span->truncated = false;
michael@0 811
michael@0 812 // struct timeval script_start, script_mid, script_end;
michael@0 813
michael@0 814 int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
michael@0 815 if ((kMaxScriptBytes <= byte_length_) &&
michael@0 816 (byte_length_ < (2 * kMaxScriptBytes))) {
michael@0 817 // Try to split the last two fragments in half
michael@0 818 put_soft_limit = byte_length_ / 2;
michael@0 819 }
michael@0 820
michael@0 821
michael@0 822 int spanscript; // The script of this span
michael@0 823 int sc = UNKNOWN_ULSCRIPT; // The script of next character
michael@0 824 int tlen = 0;
michael@0 825 int plen = 0;
michael@0 826
michael@0 827 script_buffer_[0] = ' '; // Always a space at front of output
michael@0 828 script_buffer_[1] = '\0';
michael@0 829 int take = 0;
michael@0 830 int put = 1; // Start after the initial space
michael@0 831
michael@0 832 // Build offsets from span->text back to start_byte_ + span->offset
michael@0 833 // This mapping reflects deletion of non-letters, expansion of
michael@0 834 // entities, etc.
michael@0 835 map2original_.Clear();
michael@0 836 map2original_.Delete(span->offset); // So that MapBack(0) gives offset
michael@0 837
michael@0 838 // Get to the first real non-tag letter or entity that is a letter
michael@0 839 int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
michael@0 840 next_byte_ += skip;
michael@0 841 byte_length_ -= skip;
michael@0 842
michael@0 843 if (skip != 1) {
michael@0 844 map2original_.Delete(skip);
michael@0 845 map2original_.Insert(1);
michael@0 846 } else {
michael@0 847 map2original_.Copy(1);
michael@0 848 }
michael@0 849 if (byte_length_ <= 0) {
michael@0 850 map2original_.Reset();
michael@0 851 return false; // No more letters to be found
michael@0 852 }
michael@0 853
michael@0 854 // There is at least one letter, so we know the script for this span
michael@0 855 span->ulscript = (ULScript)spanscript;
michael@0 856
michael@0 857
michael@0 858 // Go over alternating spans of same-script letters and non-letters,
michael@0 859 // copying letters to buffer with single spaces for each run of non-letters
michael@0 860 while (take < byte_length_) {
michael@0 861 // Copy run of letters in same script (&LS | LS)*
michael@0 862 int letter_count = 0; // Keep track of word length
michael@0 863 bool need_break = false;
michael@0 864
michael@0 865 while (take < byte_length_) {
michael@0 866 // We are at a letter, nonletter, tag, or entity
michael@0 867 if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
michael@0 868 if (next_byte_[take] == '<') {
michael@0 869 // Begining of tag
michael@0 870 sc = 0;
michael@0 871 break;
michael@0 872 } else if (next_byte_[take] == '>') {
michael@0 873 // Unexpected end of tag
michael@0 874 sc = 0;
michael@0 875 break;
michael@0 876 } else if (next_byte_[take] == '&') {
michael@0 877 // Copy entity, no advance
michael@0 878 EntityToBuffer(next_byte_ + take, byte_length_ - take,
michael@0 879 script_buffer_ + put, &tlen, &plen);
michael@0 880 sc = GetUTF8LetterScriptNum(script_buffer_ + put);
michael@0 881 }
michael@0 882 } else {
michael@0 883 // Real letter, safely copy up to 4 bytes, increment by 1..4
michael@0 884 // Will update by 1..4 bytes at Advance, below
michael@0 885 tlen = plen = UTF8OneCharLen(next_byte_ + take);
michael@0 886 if (take < (byte_length_ - 3)) {
michael@0 887 // X86 fast case, does unaligned load/store
michael@0 888 UNALIGNED_STORE32(script_buffer_ + put,
michael@0 889 UNALIGNED_LOAD32(next_byte_ + take));
michael@0 890
michael@0 891 } else {
michael@0 892 // Slow case, happens 1-3 times per input document
michael@0 893 memcpy(script_buffer_ + put, next_byte_ + take, plen);
michael@0 894 }
michael@0 895 sc = GetUTF8LetterScriptNum(next_byte_ + take);
michael@0 896 }
michael@0 897
michael@0 898 // Allow continue across a single letter in a different script:
michael@0 899 // A B D = three scripts, c = common script, i = inherited script,
michael@0 900 // - = don't care, ( = take position before the += below
michael@0 901 // AAA(A- continue
michael@0 902 //
michael@0 903 // AAA(BA continue
michael@0 904 // AAA(BB break
michael@0 905 // AAA(Bc continue (breaks after B)
michael@0 906 // AAA(BD break
michael@0 907 // AAA(Bi break
michael@0 908 //
michael@0 909 // AAA(c- break
michael@0 910 //
michael@0 911 // AAA(i- continue
michael@0 912 //
michael@0 913
michael@0 914 if ((sc != spanscript) && (sc != ULScript_Inherited)) {
michael@0 915 // Might need to break this script span
michael@0 916 if (sc == ULScript_Common) {
michael@0 917 need_break = true;
michael@0 918 } else {
michael@0 919 // Look at next following character, ignoring entity as Common
michael@0 920 int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen);
michael@0 921 if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
michael@0 922 // We found a non-trivial change of script
michael@0 923 if (one_script_only_) {
michael@0 924 need_break = true;
michael@0 925 }
michael@0 926 }
michael@0 927 }
michael@0 928 }
michael@0 929 if (need_break) {break;} // Non-letter or letter in wrong script
michael@0 930
michael@0 931 take += tlen; // Advance
michael@0 932 put += plen; // Advance
michael@0 933
michael@0 934 // Update the offset map to reflect take/put lengths
michael@0 935 if (tlen == plen) {
michael@0 936 map2original_.Copy(tlen);
michael@0 937 } else if (tlen < plen) {
michael@0 938 map2original_.Copy(tlen);
michael@0 939 map2original_.Insert(plen - tlen);
michael@0 940 } else { // plen < tlen
michael@0 941 map2original_.Copy(plen);
michael@0 942 map2original_.Delete(tlen - plen);
michael@0 943 }
michael@0 944
michael@0 945 ++letter_count;
michael@0 946 if (put >= kMaxScriptBytes) {
michael@0 947 // Buffer is full
michael@0 948 span->truncated = true;
michael@0 949 break;
michael@0 950 }
michael@0 951 } // End while letters
michael@0 952
michael@0 953 // Do run of non-letters (tag | &NL | NL)*
michael@0 954 while (take < byte_length_) {
michael@0 955 // Do fast scan to next interesting byte
michael@0 956 tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
michael@0 957 take += tlen;
michael@0 958 map2original_.Delete(tlen);
michael@0 959 if (take >= byte_length_) {break;} // Might have scanned to end
michael@0 960
michael@0 961 // We are at a letter, nonletter, tag, or entity
michael@0 962 if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
michael@0 963 if (next_byte_[take] == '<') {
michael@0 964 // Begining of tag; skip to end and go around again
michael@0 965 tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
michael@0 966 exit_state_);
michael@0 967 sc = 0;
michael@0 968 } else if (next_byte_[take] == '>') {
michael@0 969 // Unexpected end of tag; skip it and go around again
michael@0 970 tlen = 1; // Over the >
michael@0 971 sc = 0;
michael@0 972 } else if (next_byte_[take] == '&') {
michael@0 973 // Expand entity, no advance
michael@0 974 EntityToBuffer(next_byte_ + take, byte_length_ - take,
michael@0 975 script_buffer_ + put, &tlen, &plen);
michael@0 976 sc = GetUTF8LetterScriptNum(script_buffer_ + put);
michael@0 977 }
michael@0 978 } else {
michael@0 979 // Update 1..4
michael@0 980 tlen = UTF8OneCharLen(next_byte_ + take);
michael@0 981 sc = GetUTF8LetterScriptNum(next_byte_ + take);
michael@0 982 }
michael@0 983 if (sc != 0) {break;} // Letter found
michael@0 984 take += tlen; // Else advance
michael@0 985 map2original_.Delete(tlen);
michael@0 986 } // End while not-letters
michael@0 987
michael@0 988 script_buffer_[put++] = ' ';
michael@0 989 map2original_.Insert(1);
michael@0 990
michael@0 991 // Letter in wrong script ?
michael@0 992 if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;}
michael@0 993 if (put >= put_soft_limit) {
michael@0 994 // Buffer is almost full
michael@0 995 span->truncated = true;
michael@0 996 break;
michael@0 997 }
michael@0 998 }
michael@0 999
michael@0 1000 // Almost done. Back up to a character boundary if needed
michael@0 1001 while ((0 < take) && (take < byte_length_) &&
michael@0 1002 ((next_byte_[take] & 0xc0) == 0x80)) {
michael@0 1003 // Back up over continuation byte
michael@0 1004 --take;
michael@0 1005 --put;
michael@0 1006 }
michael@0 1007
michael@0 1008 // Update input position
michael@0 1009 next_byte_ += take;
michael@0 1010 byte_length_ -= take;
michael@0 1011
michael@0 1012 // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
michael@0 1013 // kMaxScriptBytes | | put
michael@0 1014 script_buffer_[put + 0] = ' ';
michael@0 1015 script_buffer_[put + 1] = ' ';
michael@0 1016 script_buffer_[put + 2] = ' ';
michael@0 1017 script_buffer_[put + 3] = '\0';
michael@0 1018 map2original_.Insert(4);
michael@0 1019 map2original_.Reset();
michael@0 1020
michael@0 1021 span->text_bytes = put; // Does not include the last four chars above
michael@0 1022 return true;
michael@0 1023 }
michael@0 1024
michael@0 1025 // Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase
michael@0 1026 // List changes with each version of Unicode, so just always lowercase
michael@0 1027 // Unicode 6.2.0:
michael@0 1028 // ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN
michael@0 1029 void ScriptScanner::LowerScriptSpan(LangSpan* span) {
michael@0 1030 // If needed, lowercase all the text. If we do it sooner, might miss
michael@0 1031 // lowercasing an entity such as &Aacute;
michael@0 1032 // We only need to do this for Latn and Cyrl scripts
michael@0 1033 map2uplow_.Clear();
michael@0 1034 // Full Unicode lowercase of the entire buffer, including
michael@0 1035 // four pad bytes off the end.
michael@0 1036 // Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad
michael@0 1037 // bytes and put the 0x00 in explicitly.
michael@0 1038 // Build an offset map from script_buffer_lower_ back to script_buffer_
michael@0 1039 int consumed, filled, changed;
michael@0 1040 StringPiece istr(span->text, span->text_bytes + 3);
michael@0 1041 StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer);
michael@0 1042
michael@0 1043 UTF8GenericReplace(&utf8repl_lettermarklower_obj,
michael@0 1044 istr, ostr, is_plain_text_,
michael@0 1045 &consumed, &filled, &changed, &map2uplow_);
michael@0 1046 script_buffer_lower_[filled] = '\0';
michael@0 1047 span->text = script_buffer_lower_;
michael@0 1048 span->text_bytes = filled - 3;
michael@0 1049 map2uplow_.Reset();
michael@0 1050 }
michael@0 1051
michael@0 1052 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
michael@0 1053 // Force Latin, Cyrillic, Greek scripts to be lowercase
michael@0 1054 // Buffer ALWAYS has leading space and trailing space space space NUL
michael@0 1055 bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) {
michael@0 1056 bool ok = GetOneScriptSpan(span);
michael@0 1057 LowerScriptSpan(span);
michael@0 1058 return ok;
michael@0 1059 }
michael@0 1060
michael@0 1061
michael@0 1062 // Maps byte offset in most recent GetOneScriptSpan/Lower
michael@0 1063 // span->text [0..text_bytes] into an additional byte offset from
michael@0 1064 // span->offset, to get back to corresponding text in the original
michael@0 1065 // input buffer.
michael@0 1066 // text_offset must be the first byte
michael@0 1067 // of a UTF-8 character, or just beyond the last character. Normally this
michael@0 1068 // routine is called with the first byte of an interesting range and
michael@0 1069 // again with the first byte of the following range.
michael@0 1070 int ScriptScanner::MapBack(int text_offset) {
michael@0 1071 return map2original_.MapBack(map2uplow_.MapBack(text_offset));
michael@0 1072 }
michael@0 1073
michael@0 1074
michael@0 1075 // Gets lscript number for letters; always returns
michael@0 1076 // 0 (common script) for non-letters
michael@0 1077 int GetUTF8LetterScriptNum(const char* src) {
michael@0 1078 int srclen = UTF8OneCharLen(src);
michael@0 1079 const uint8* usrc = reinterpret_cast<const uint8*>(src);
michael@0 1080 return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj,
michael@0 1081 &usrc, &srclen);
michael@0 1082 }
michael@0 1083
michael@0 1084 } // namespace CLD2
michael@0 1085
michael@0 1086

mercurial