Wed, 31 Dec 2014 06:09:35 +0100
Cloned from the upstream tor-browser origin at tag tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 17 | // |
michael@0 | 18 | |
michael@0 | 19 | |
michael@0 | 20 | #include "getonescriptspan.h" |
michael@0 | 21 | #include <string.h> |
michael@0 | 22 | |
michael@0 | 23 | #include "fixunicodevalue.h" |
michael@0 | 24 | #include "lang_script.h" |
michael@0 | 25 | #include "port.h" |
michael@0 | 26 | #include "utf8statetable.h" |
michael@0 | 27 | |
michael@0 | 28 | #include "utf8prop_lettermarkscriptnum.h" |
michael@0 | 29 | #include "utf8repl_lettermarklower.h" |
michael@0 | 30 | #include "utf8scannot_lettermarkspecial.h" |
michael@0 | 31 | |
michael@0 | 32 | |
michael@0 | 33 | namespace CLD2 { |
michael@0 | 34 | |
// Entity-name table, in alphabetical order for binary search, from
// generated_entities.cc
// kNameToEntitySize is passed as the upper search bound in LookupEntity().
extern const int kNameToEntitySize;
extern const CharIntPair kNameToEntity[];

// NOTE(review): the two constants below are not referenced in this part of
// the file; presumably they tune how a span end is rounded to a word
// boundary -- confirm against the code that uses them.
static const int kMaxUpToWordBoundary = 50;       // span < this make longer,
                                                  //  else make shorter
static const int kMaxAdvanceToWordBoundary = 10;  // +/- this many bytes
                                                  //  to round to word boundary,
                                                  //  direction above
michael@0 | 45 | |
// Lookup table indexed by byte value. The only nonzero entries are
// '&' (0x26), '<' (0x3c) and '>' (0x3e). Consulted by IsSpecial().
static const char kSpecialSymbol[256] = {       // true for < > &
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
michael@0 | 57 | |
michael@0 | 58 | |
michael@0 | 59 | |
// Byte categories used as column indexes into kTagParseTbl_0.
// The macros exist only while kCharToSub is being defined and are
// #undef'd immediately afterwards.
#define LT 0      // <
#define GT 1      // >
#define EX 2      // !
#define HY 3      // -
#define QU 4      // "
#define AP 5      // '
#define SL 6      // /
#define S_ 7      // S or s
#define C_ 8      // C or c
#define R_ 9      // R or r
#define I_ 10     // I or i
#define P_ 11     // P or p
#define T_ 12     // T or t
#define Y_ 13     // Y or y
#define L_ 14     // L or l
#define E_ 15     // E or e
#define CR 16     // <cr> or <lf>
#define NL 17     // non-letter: ASCII whitespace, digit, punctuation
#define PL 18     // possible letter, incl. &
#define xx 19     // <unused>

// Map byte to one of ~20 interesting categories for cheap tag parsing
// Bytes 0x80-0xbf map to NL and bytes 0xc0-0xff map to PL.
static const uint8 kCharToSub[256] = {
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,   // 0x00-0x0f
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0x10-0x1f
  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,   // 0x20-0x2f
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,   // 0x30-0x3f

  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,   // 0x40-0x4f
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,   // 0x50-0x5f
  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,   // 0x60-0x6f
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,   // 0x70-0x7f

  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0x80-0x8f
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0x90-0x9f
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0xa0-0xaf
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0xb0-0xbf

  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,   // 0xc0-0xcf
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,   // 0xd0-0xdf
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,   // 0xe0-0xef
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,   // 0xf0-0xff
};

#undef LT
#undef GT
#undef EX
#undef HY
#undef QU
#undef AP
#undef SL
#undef S_
#undef C_
#undef R_
#undef I_
#undef P_
#undef T_
#undef Y_
#undef L_
#undef E_
#undef CR
#undef NL
#undef PL
#undef xx
michael@0 | 124 | |
michael@0 | 125 | |
// State numbers with special meaning inside kTagParseTbl_0; #undef'd after
// the table.
#define OK 0      // [0] OK exit state
#define X_ 1      // [1] error exit state


// Largest state number that ends the scan in ScanToPossibleLetter(): the
// scan loop stops as soon as the next state is <= the chosen value.
static const int kMaxExitStateLettersMarksOnly = 1;
static const int kMaxExitStateAllText = 2;


// State machine to do cheap parse of non-letter strings incl. tags
// advances <tag>
//          |    |
// advances <tag> ... </tag>  for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          ||  (0)
// advances <tag <tag2>
//          ||  (0)
//
// Each row holds 20 next-state entries, one per kCharToSub category, so the
// row for state e starts at kTagParseTbl_0[e * 20].
//
// We start in state [0] at a non-letter and make at least one transition
// When scanning for just letters, arriving back at state [0] or [1] exits
// the state machine.
// When scanning for any non-tag text, arriving at state [2] also exits
static const uint8 kTagParseTbl_0[] = {
// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK    exit state
  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*   [exit state]
  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF
  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF
  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
};

#undef OK
#undef X_
michael@0 | 200 | |
// UTF-8 / rune constants. In this part of the file only Runemax and
// Runeerror are referenced (by runetochar()).
enum
{
  UTFmax = 4,           // maximum bytes per rune
  Runesync = 0x80,      // cannot represent part of a UTF sequence (<)
  Runeself = 0x80,      // rune and UTF sequences are the same (<)
  Runeerror = 0xFFFD,   // decoding error in UTF
  Runemax = 0x10FFFF,   // maximum rune value
};
michael@0 | 209 | |
michael@0 | 210 | // Debugging. Not thread safe. |
michael@0 | 211 | static char gDisplayPiece[32]; |
michael@0 | 212 | const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4}; |
michael@0 | 213 | char* DisplayPiece(const char* next_byte_, int byte_length_) { |
michael@0 | 214 | // Copy up to 8 UTF-8 chars to buffer |
michael@0 | 215 | int k = 0; // byte count |
michael@0 | 216 | int n = 0; // character count |
michael@0 | 217 | for (int i = 0; i < byte_length_; ++i) { |
michael@0 | 218 | char c = next_byte_[i]; |
michael@0 | 219 | if ((c & 0xc0) != 0x80) { |
michael@0 | 220 | // Beginning of a UTF-8 character |
michael@0 | 221 | int charlen = gCharlen[static_cast<uint8>(c) >> 4]; |
michael@0 | 222 | if (i + charlen > byte_length_) {break;} // Not enough room for full char |
michael@0 | 223 | if (k >= (32 - 7)) {break;} // Not necessarily enough room |
michael@0 | 224 | if (n >= 8) {break;} // Enough characters already |
michael@0 | 225 | ++n; |
michael@0 | 226 | } |
michael@0 | 227 | if (c == '<') { |
michael@0 | 228 | memcpy(&gDisplayPiece[k], "<", 4); k += 4; |
michael@0 | 229 | } else if (c == '>') { |
michael@0 | 230 | memcpy(&gDisplayPiece[k], ">", 4); k += 4; |
michael@0 | 231 | } else if (c == '&') { |
michael@0 | 232 | memcpy(&gDisplayPiece[k], "&", 5); k += 5; |
michael@0 | 233 | } else if (c == '\'') { |
michael@0 | 234 | memcpy(&gDisplayPiece[k], "'", 6); k += 6; |
michael@0 | 235 | } else if (c == '"') { |
michael@0 | 236 | memcpy(&gDisplayPiece[k], """, 6); k += 6; |
michael@0 | 237 | } else { |
michael@0 | 238 | gDisplayPiece[k++] = c; |
michael@0 | 239 | } |
michael@0 | 240 | } |
michael@0 | 241 | gDisplayPiece[k++] = '\0'; |
michael@0 | 242 | return gDisplayPiece; |
michael@0 | 243 | } |
michael@0 | 244 | |
michael@0 | 245 | |
michael@0 | 246 | |
michael@0 | 247 | // runetochar copies (encodes) one rune, pointed to by r, to at most |
michael@0 | 248 | // UTFmax bytes starting at s and returns the number of bytes generated. |
michael@0 | 249 | int runetochar(char *str, const char32 *rune) { |
michael@0 | 250 | // Convert to unsigned for range check. |
michael@0 | 251 | unsigned long c; |
michael@0 | 252 | |
michael@0 | 253 | // 1 char 00-7F |
michael@0 | 254 | c = *rune; |
michael@0 | 255 | if(c <= 0x7F) { |
michael@0 | 256 | str[0] = c; |
michael@0 | 257 | return 1; |
michael@0 | 258 | } |
michael@0 | 259 | |
michael@0 | 260 | // 2 char 0080-07FF |
michael@0 | 261 | if(c <= 0x07FF) { |
michael@0 | 262 | str[0] = 0xC0 | (c >> 1*6); |
michael@0 | 263 | str[1] = 0x80 | (c & 0x3F); |
michael@0 | 264 | return 2; |
michael@0 | 265 | } |
michael@0 | 266 | |
michael@0 | 267 | // Range check |
michael@0 | 268 | if (c > Runemax) { |
michael@0 | 269 | c = Runeerror; |
michael@0 | 270 | } |
michael@0 | 271 | |
michael@0 | 272 | // 3 char 0800-FFFF |
michael@0 | 273 | if (c <= 0xFFFF) { |
michael@0 | 274 | str[0] = 0xE0 | (c >> 2*6); |
michael@0 | 275 | str[1] = 0x80 | ((c >> 1*6) & 0x3F); |
michael@0 | 276 | str[2] = 0x80 | (c & 0x3F); |
michael@0 | 277 | return 3; |
michael@0 | 278 | } |
michael@0 | 279 | |
michael@0 | 280 | // 4 char 10000-1FFFFF |
michael@0 | 281 | str[0] = 0xF0 | (c >> 3*6); |
michael@0 | 282 | str[1] = 0x80 | ((c >> 2*6) & 0x3F); |
michael@0 | 283 | str[2] = 0x80 | ((c >> 1*6) & 0x3F); |
michael@0 | 284 | str[3] = 0x80 | (c & 0x3F); |
michael@0 | 285 | return 4; |
michael@0 | 286 | } |
michael@0 | 287 | |
michael@0 | 288 | |
michael@0 | 289 | |
michael@0 | 290 | // Useful for converting an entity to an ascii value. |
michael@0 | 291 | // RETURNS unicode value, or -1 if entity isn't valid. Don't include & or ; |
michael@0 | 292 | int LookupEntity(const char* entity_name, int entity_len) { |
michael@0 | 293 | // Make a C string |
michael@0 | 294 | if (entity_len >= 16) {return -1;} // All real entities are shorter |
michael@0 | 295 | char temp[16]; |
michael@0 | 296 | memcpy(temp, entity_name, entity_len); |
michael@0 | 297 | temp[entity_len] = '\0'; |
michael@0 | 298 | int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity); |
michael@0 | 299 | if (match >= 0) {return kNameToEntity[match].i;} |
michael@0 | 300 | return -1; |
michael@0 | 301 | } |
michael@0 | 302 | |
// Returns true iff c is one of the ASCII decimal digits '0'..'9'.
bool ascii_isdigit(char c) {
  if (c < '0') {return false;}
  return !(c > '9');
}
// Returns true iff c is an ASCII hexadecimal digit: 0-9, a-f or A-F.
bool ascii_isxdigit(char c) {
  const bool is_decimal = (c >= '0') && (c <= '9');
  const bool is_lower_hex = (c >= 'a') && (c <= 'f');
  const bool is_upper_hex = (c >= 'A') && (c <= 'F');
  return is_decimal || is_lower_hex || is_upper_hex;
}
// Returns true iff c is an ASCII letter (a-z, A-Z) or decimal digit (0-9).
bool ascii_isalnum(char c) {
  const bool is_decimal = (c >= '0') && (c <= '9');
  const bool is_lower = (c >= 'a') && (c <= 'z');
  const bool is_upper = (c >= 'A') && (c <= 'Z');
  return is_decimal || is_lower || is_upper;
}
// Returns the value 0..15 of the ASCII hexadecimal digit c, or 0 when c is
// not a hexadecimal digit.
int hex_digit_to_int(char c) {
  if ((c >= '0') && (c <= '9')) {
    return c - '0';
  }
  // Setting bit 0x20 folds 'A'..'F' onto 'a'..'f'; no other byte lands in
  // that range.
  const int folded = c | 0x20;
  if ((folded >= 'a') && (folded <= 'f')) {
    return folded - 'a' + 10;
  }
  return 0;
}
michael@0 | 324 | |
michael@0 | 325 | static int32 strto32_base10(const char* nptr, const char* limit, |
michael@0 | 326 | const char **endptr) { |
michael@0 | 327 | *endptr = nptr; |
michael@0 | 328 | while (nptr < limit && *nptr == '0') { |
michael@0 | 329 | ++nptr; |
michael@0 | 330 | } |
michael@0 | 331 | if (nptr == limit || !ascii_isdigit(*nptr)) |
michael@0 | 332 | return -1; |
michael@0 | 333 | const char* end_digits_run = nptr; |
michael@0 | 334 | while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) { |
michael@0 | 335 | ++end_digits_run; |
michael@0 | 336 | } |
michael@0 | 337 | *endptr = end_digits_run; |
michael@0 | 338 | const int num_digits = end_digits_run - nptr; |
michael@0 | 339 | // kint32max == 2147483647. |
michael@0 | 340 | if (num_digits < 9 || |
michael@0 | 341 | (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) { |
michael@0 | 342 | int value = 0; |
michael@0 | 343 | for (; nptr < end_digits_run; ++nptr) { |
michael@0 | 344 | value *= 10; |
michael@0 | 345 | value += *nptr - '0'; |
michael@0 | 346 | } |
michael@0 | 347 | // Overflow past the last valid unicode codepoint |
michael@0 | 348 | // (0x10ffff) is converted to U+FFFD by FixUnicodeValue(). |
michael@0 | 349 | return FixUnicodeValue(value); |
michael@0 | 350 | } else { |
michael@0 | 351 | // Overflow: can't fit in an int32; |
michael@0 | 352 | // returns the replacement character 0xFFFD. |
michael@0 | 353 | return 0xFFFD; |
michael@0 | 354 | } |
michael@0 | 355 | } |
michael@0 | 356 | |
michael@0 | 357 | static int32 strto32_base16(const char* nptr, const char* limit, |
michael@0 | 358 | const char **endptr) { |
michael@0 | 359 | *endptr = nptr; |
michael@0 | 360 | while (nptr < limit && *nptr == '0') { |
michael@0 | 361 | ++nptr; |
michael@0 | 362 | } |
michael@0 | 363 | if (nptr == limit || !ascii_isxdigit(*nptr)) { |
michael@0 | 364 | return -1; |
michael@0 | 365 | } |
michael@0 | 366 | const char* end_xdigits_run = nptr; |
michael@0 | 367 | while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) { |
michael@0 | 368 | ++end_xdigits_run; |
michael@0 | 369 | } |
michael@0 | 370 | *endptr = end_xdigits_run; |
michael@0 | 371 | const int num_xdigits = end_xdigits_run - nptr; |
michael@0 | 372 | // kint32max == 0x7FFFFFFF. |
michael@0 | 373 | if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) { |
michael@0 | 374 | int value = 0; |
michael@0 | 375 | for (; nptr < end_xdigits_run; ++nptr) { |
michael@0 | 376 | value <<= 4; |
michael@0 | 377 | value += hex_digit_to_int(*nptr); |
michael@0 | 378 | } |
michael@0 | 379 | // Overflow past the last valid unicode codepoint |
michael@0 | 380 | // (0x10ffff) is converted to U+FFFD by FixUnicodeValue(). |
michael@0 | 381 | return FixUnicodeValue(value); |
michael@0 | 382 | } else { |
michael@0 | 383 | // Overflow: can't fit in an int32; |
michael@0 | 384 | // returns the replacement character 0xFFFD. |
michael@0 | 385 | return 0xFFFD; |
michael@0 | 386 | } |
michael@0 | 387 | } |
michael@0 | 388 | |
michael@0 | 389 | // Unescape the current character pointed to by src. SETS the number |
michael@0 | 390 | // of chars read for the conversion (in UTF8). If src isn't a valid entity, |
michael@0 | 391 | // just consume the & and RETURN -1. If src doesn't point to & -- which it |
michael@0 | 392 | // should -- set src_consumed to 0 and RETURN -1. |
michael@0 | 393 | int ReadEntity(const char* src, int srcn, int* src_consumed) { |
michael@0 | 394 | const char* const srcend = src + srcn; |
michael@0 | 395 | |
michael@0 | 396 | if (srcn == 0 || *src != '&') { // input should start with an ampersand |
michael@0 | 397 | *src_consumed = 0; |
michael@0 | 398 | return -1; |
michael@0 | 399 | } |
michael@0 | 400 | *src_consumed = 1; // we'll get the & at least |
michael@0 | 401 | |
michael@0 | 402 | // The standards are a bit unclear on when an entity ends. Certainly a ";" |
michael@0 | 403 | // ends one, but spaces probably do too. We follow the lead of both IE and |
michael@0 | 404 | // Netscape, which as far as we can tell end numeric entities (1st case below) |
michael@0 | 405 | // at any non-digit, and end character entities (2nd case) at any non-alnum. |
michael@0 | 406 | const char* entstart, *entend; // where the entity starts and ends |
michael@0 | 407 | entstart = src + 1; // read past the & |
michael@0 | 408 | int entval; // UCS2 value of the entity |
michael@0 | 409 | if ( *entstart == '#' ) { // -- 1st case: numeric entity |
michael@0 | 410 | if ( entstart + 2 >= srcend ) { |
michael@0 | 411 | return -1; // no way a legitimate number could fit |
michael@0 | 412 | } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) { // hex numeric |
michael@0 | 413 | entval = strto32_base16(entstart + 2, srcend, &entend); |
michael@0 | 414 | } else { // decimal numeric entity |
michael@0 | 415 | entval = strto32_base10(entstart+1, srcend, &entend); |
michael@0 | 416 | } |
michael@0 | 417 | if (entval == -1 || entend > srcend) { |
michael@0 | 418 | return -1; // not entirely correct, but close enough |
michael@0 | 419 | } |
michael@0 | 420 | } else { // -- 2nd case: character entity |
michael@0 | 421 | for (entend = entstart; |
michael@0 | 422 | entend < srcend && ascii_isalnum(*entend); |
michael@0 | 423 | ++entend ) { |
michael@0 | 424 | // entity consists of alphanumeric chars |
michael@0 | 425 | } |
michael@0 | 426 | entval = LookupEntity(entstart, entend - entstart); |
michael@0 | 427 | if (entval < 0) { |
michael@0 | 428 | return -1; // not a legal entity name |
michael@0 | 429 | } |
michael@0 | 430 | // Now we do a strange-seeming IE6-compatibility check: if entval is |
michael@0 | 431 | // >= 256, it *must* be followed by a semicolon or it's not considered |
michael@0 | 432 | // an entity. The problem is lots of the newfangled entity names, like |
michael@0 | 433 | // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en". |
michael@0 | 434 | // When these links are written in HTML, it would be really bad if the |
michael@0 | 435 | // "&lang" were treated as an entity, which is what the spec says |
michael@0 | 436 | // *should* happen (even when the HTML is inside an "A HREF" tag!) |
michael@0 | 437 | // IE ignores the spec for these new, high-value entities, so we do too. |
michael@0 | 438 | if ( entval >= 256 && !(entend < srcend && *entend == ';') ) { |
michael@0 | 439 | return -1; // make non-;-terminated entity illegal |
michael@0 | 440 | } |
michael@0 | 441 | } |
michael@0 | 442 | |
michael@0 | 443 | // Finally, figure out how much src was consumed |
michael@0 | 444 | if ( entend < srcend && *entend == ';' ) { |
michael@0 | 445 | entend++; // standard says ; terminator is special |
michael@0 | 446 | } |
michael@0 | 447 | *src_consumed = entend - src; |
michael@0 | 448 | return entval; |
michael@0 | 449 | } |
michael@0 | 450 | |
michael@0 | 451 | |
michael@0 | 452 | // Src points to '&' |
michael@0 | 453 | // Writes entity value to dst. Returns take(src), put(dst) byte counts |
michael@0 | 454 | void EntityToBuffer(const char* src, int len, char* dst, |
michael@0 | 455 | int* tlen, int* plen) { |
michael@0 | 456 | char32 entval = ReadEntity(src, len, tlen); |
michael@0 | 457 | |
michael@0 | 458 | // ReadEntity does this already: entval = FixUnicodeValue(entval); |
michael@0 | 459 | |
michael@0 | 460 | // Convert UTF-32 to UTF-8 |
michael@0 | 461 | if (entval > 0) { |
michael@0 | 462 | *plen = runetochar(dst, &entval); |
michael@0 | 463 | } else { |
michael@0 | 464 | // Illegal entity; ignore the '&' |
michael@0 | 465 | *tlen = 1; |
michael@0 | 466 | *plen = 0; |
michael@0 | 467 | } |
michael@0 | 468 | } |
michael@0 | 469 | |
michael@0 | 470 | // Returns true if character is < > or &, none of which are letters |
michael@0 | 471 | bool inline IsSpecial(char c) { |
michael@0 | 472 | if ((c & 0xe0) == 0x20) { |
michael@0 | 473 | return kSpecialSymbol[static_cast<uint8>(c)]; |
michael@0 | 474 | } |
michael@0 | 475 | return false; |
michael@0 | 476 | } |
michael@0 | 477 | |
michael@0 | 478 | // Quick Skip to next letter or < > & or to end of string (eos) |
michael@0 | 479 | // Always return is_letter for eos |
michael@0 | 480 | int ScanToLetterOrSpecial(const char* src, int len) { |
michael@0 | 481 | int bytes_consumed; |
michael@0 | 482 | StringPiece str(src, len); |
michael@0 | 483 | UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed); |
michael@0 | 484 | return bytes_consumed; |
michael@0 | 485 | } |
michael@0 | 486 | |
michael@0 | 487 | |
michael@0 | 488 | |
michael@0 | 489 | |
michael@0 | 490 | // src points to non-letter, such as tag-opening '<' |
michael@0 | 491 | // Return length from here to next possible letter |
michael@0 | 492 | // On another < before >, return 1 |
michael@0 | 493 | // advances <tag> |
michael@0 | 494 | // | | |
michael@0 | 495 | // advances <tag> ... </tag> for <script> <style> |
michael@0 | 496 | // | | |
michael@0 | 497 | // advances <!-- ... <tag> ... --> |
michael@0 | 498 | // | | |
michael@0 | 499 | // advances <tag |
michael@0 | 500 | // | | end of string |
michael@0 | 501 | // advances <tag <tag2> |
michael@0 | 502 | // || |
michael@0 | 503 | int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) { |
michael@0 | 504 | const uint8* src = reinterpret_cast<const uint8*>(isrc); |
michael@0 | 505 | const uint8* srclimit = src + len; |
michael@0 | 506 | const uint8* tagParseTbl = kTagParseTbl_0; |
michael@0 | 507 | int e = 0; |
michael@0 | 508 | while (src < srclimit) { |
michael@0 | 509 | e = tagParseTbl[kCharToSub[*src++]]; |
michael@0 | 510 | if (e <= max_exit_state) { |
michael@0 | 511 | // We overshot by one byte |
michael@0 | 512 | --src; |
michael@0 | 513 | break; |
michael@0 | 514 | } |
michael@0 | 515 | tagParseTbl = &kTagParseTbl_0[e * 20]; |
michael@0 | 516 | } |
michael@0 | 517 | |
michael@0 | 518 | if (src >= srclimit) { |
michael@0 | 519 | // We fell off the end of the text. |
michael@0 | 520 | // It looks like the most common case for this is a truncated file, not |
michael@0 | 521 | // mismatched angle brackets. So we pretend that the last char was '>' |
michael@0 | 522 | return len; |
michael@0 | 523 | } |
michael@0 | 524 | |
michael@0 | 525 | // OK to be in state 0 or state 2 at exit |
michael@0 | 526 | if ((e != 0) && (e != 2)) { |
michael@0 | 527 | // Error, '<' followed by '<' |
michael@0 | 528 | // We want to back up to first <, then advance by one byte past it |
michael@0 | 529 | int offset = src - reinterpret_cast<const uint8*>(isrc); |
michael@0 | 530 | |
michael@0 | 531 | // Backscan to first '<' and return enough length to just get past it |
michael@0 | 532 | --offset; // back up over the second '<', which caused us to stop |
michael@0 | 533 | while ((0 < offset) && (isrc[offset] != '<')) { |
michael@0 | 534 | // Find the first '<', which is unmatched |
michael@0 | 535 | --offset; |
michael@0 | 536 | } |
michael@0 | 537 | // skip to just beyond first '<' |
michael@0 | 538 | return offset + 1; |
michael@0 | 539 | } |
michael@0 | 540 | |
michael@0 | 541 | return src - reinterpret_cast<const uint8*>(isrc); |
michael@0 | 542 | } |
michael@0 | 543 | |
michael@0 | 544 | |
// Sets up a scanner over buffer[0..buffer_length) with letters_marks_only_
// and one_script_only_ both true and the letters/marks-only exit state.
// Allocates the two script buffers, which the destructor releases.
// The input buffer pointer is stored, not copied.
ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text)
  : start_byte_(buffer),
    next_byte_(buffer),
    next_byte_limit_(buffer + buffer_length),
    byte_length_(buffer_length),
    is_plain_text_(is_plain_text),
    letters_marks_only_(true),
    one_script_only_(true),
    exit_state_(kMaxExitStateLettersMarksOnly) {
  script_buffer_ = new char[kMaxScriptBuffer];
  script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
  map2original_.Clear();    // map from script_buffer_ to buffer
  map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
}
michael@0 | 561 | |
// Extended version to allow spans of any non-tag text and spans of mixed script
// any_text clears letters_marks_only_ and selects kMaxExitStateAllText as the
// tag-scanner exit state; any_script clears one_script_only_.
// Allocates the two script buffers, which the destructor releases.
ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text,
                             bool any_text,
                             bool any_script)
  : start_byte_(buffer),
    next_byte_(buffer),
    next_byte_limit_(buffer + buffer_length),
    byte_length_(buffer_length),
    is_plain_text_(is_plain_text),
    letters_marks_only_(!any_text),
    one_script_only_(!any_script),
    exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) {
  script_buffer_ = new char[kMaxScriptBuffer];
  script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
  map2original_.Clear();    // map from script_buffer_ to buffer
  map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
}
michael@0 | 581 | |
michael@0 | 582 | |
// Releases the two buffers allocated by the constructors. The input buffer
// handed to the constructor is not freed here.
ScriptScanner::~ScriptScanner() {
  delete[] script_buffer_;
  delete[] script_buffer_lower_;
}
michael@0 | 587 | |
michael@0 | 588 | |
michael@0 | 589 | |
michael@0 | 590 | |
michael@0 | 591 | // Get to the first real non-tag letter or entity that is a letter |
michael@0 | 592 | // Sets script of that letter |
michael@0 | 593 | // Return len if no more letters |
michael@0 | 594 | int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) { |
michael@0 | 595 | int sc = UNKNOWN_ULSCRIPT; |
michael@0 | 596 | int skip = 0; |
michael@0 | 597 | int tlen, plen; |
michael@0 | 598 | |
michael@0 | 599 | // Do run of non-letters (tag | &NL | NL)* |
michael@0 | 600 | tlen = 0; |
michael@0 | 601 | while (skip < len) { |
michael@0 | 602 | // Do fast scan to next interesting byte |
michael@0 | 603 | // int oldskip = skip; |
michael@0 | 604 | skip += ScanToLetterOrSpecial(src + skip, len - skip); |
michael@0 | 605 | |
michael@0 | 606 | // Check for no more letters/specials |
michael@0 | 607 | if (skip >= len) { |
michael@0 | 608 | // All done |
michael@0 | 609 | *script = sc; |
michael@0 | 610 | return len; |
michael@0 | 611 | } |
michael@0 | 612 | |
michael@0 | 613 | // We are at a letter, nonletter, tag, or entity |
michael@0 | 614 | if (IsSpecial(src[skip]) && !is_plain_text_) { |
michael@0 | 615 | if (src[skip] == '<') { |
michael@0 | 616 | // Begining of tag; skip to end and go around again |
michael@0 | 617 | tlen = ScanToPossibleLetter(src + skip, len - skip, |
michael@0 | 618 | exit_state_); |
michael@0 | 619 | sc = 0; |
michael@0 | 620 | } else if (src[skip] == '>') { |
michael@0 | 621 | // Unexpected end of tag; skip it and go around again |
michael@0 | 622 | tlen = 1; // Over the > |
michael@0 | 623 | sc = 0; |
michael@0 | 624 | } else if (src[skip] == '&') { |
michael@0 | 625 | // Expand entity, no advance |
michael@0 | 626 | char temp[4]; |
michael@0 | 627 | EntityToBuffer(src + skip, len - skip, |
michael@0 | 628 | temp, &tlen, &plen); |
michael@0 | 629 | sc = GetUTF8LetterScriptNum(temp); |
michael@0 | 630 | } |
michael@0 | 631 | } else { |
michael@0 | 632 | // Update 1..4 bytes |
michael@0 | 633 | tlen = UTF8OneCharLen(src + skip); |
michael@0 | 634 | sc = GetUTF8LetterScriptNum(src + skip); |
michael@0 | 635 | } |
michael@0 | 636 | if (sc != 0) {break;} // Letter found |
michael@0 | 637 | skip += tlen; // Else advance |
michael@0 | 638 | } |
michael@0 | 639 | |
michael@0 | 640 | *script = sc; |
michael@0 | 641 | return skip; |
michael@0 | 642 | } |
michael@0 | 643 | |
michael@0 | 644 | |
// Helper for ASCII-only tag names.
// Compares uplow to c after setting bit 0x20 in uplow, which maps the ASCII
// letters 'A'..'Z' onto 'a'..'z'. c itself is compared as given, so callers
// pass a lowercase letter.
inline bool EqCase(char uplow, char c) {
  const int folded = uplow | 0x20;
  return folded == c;
}
michael@0 | 650 | |
// Helper for ASCII-only tag names.
// True for any byte that compares below 0x40: space, digits and punctuation
// such as < > /. False for '@', the ASCII letters and everything above.
inline bool NeqLetter(char c) {
  if (c >= 0x40) {
    return false;
  }
  return true;
}
michael@0 | 656 | |
// Helper for ASCII-only tag names.
// True for the two whitespace bytes space and \n; false for every other
// byte, including \r
inline bool WS(char c) {
  switch (c) {
    case ' ':
    case '\n':
      return true;
    default:
      return false;
  }
}

// Canonical line break; GetOneTextSpan maps CR onto this as well
static const char LF = '\n';
michael@0 | 665 | |
michael@0 | 666 | |
michael@0 | 667 | // The naive loop scans from next_byte_ to script_buffer_ until full. |
michael@0 | 668 | // But this can leave an awkward hard-to-identify short fragment at the |
michael@0 | 669 | // end of the input. We would prefer to make the next-to-last fragment |
michael@0 | 670 | // shorter and the last fragment longer. |
michael@0 | 671 | |
michael@0 | 672 | // Copy next run of non-tag characters to buffer [NUL terminated] |
michael@0 | 673 | // This just replaces tags with space or \n and removes entities. |
michael@0 | 674 | // Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences |
michael@0 | 675 | // including \r or \n are replaced by \n. All other tags and skipped text |
michael@0 | 676 | // are replaced with ASCII space. |
michael@0 | 677 | // |
michael@0 | 678 | // Buffer ALWAYS has leading space and trailing space space space NUL |
michael@0 | 679 | bool ScriptScanner::GetOneTextSpan(LangSpan* span) { |
michael@0 | 680 | span->text = script_buffer_; |
michael@0 | 681 | span->text_bytes = 0; |
michael@0 | 682 | span->offset = next_byte_ - start_byte_; |
michael@0 | 683 | span->ulscript = UNKNOWN_ULSCRIPT; |
michael@0 | 684 | span->lang = UNKNOWN_LANGUAGE; |
michael@0 | 685 | span->truncated = false; |
michael@0 | 686 | |
michael@0 | 687 | int put_soft_limit = kMaxScriptBytes - kWithinScriptTail; |
michael@0 | 688 | if ((kMaxScriptBytes <= byte_length_) && |
michael@0 | 689 | (byte_length_ < (2 * kMaxScriptBytes))) { |
michael@0 | 690 | // Try to split the last two fragments in half |
michael@0 | 691 | put_soft_limit = byte_length_ / 2; |
michael@0 | 692 | } |
michael@0 | 693 | |
michael@0 | 694 | script_buffer_[0] = ' '; // Always a space at front of output |
michael@0 | 695 | script_buffer_[1] = '\0'; |
michael@0 | 696 | int take = 0; |
michael@0 | 697 | int put = 1; // Start after the initial space |
michael@0 | 698 | int tlen, plen; |
michael@0 | 699 | |
michael@0 | 700 | if (byte_length_ <= 0) { |
michael@0 | 701 | return false; // No more text to be found |
michael@0 | 702 | } |
michael@0 | 703 | |
michael@0 | 704 | // Go over alternating spans of text and tags, |
michael@0 | 705 | // copying letters to buffer with single spaces for each run of non-letters |
michael@0 | 706 | bool last_byte_was_space = false; |
michael@0 | 707 | while (take < byte_length_) { |
michael@0 | 708 | char c = next_byte_[take]; |
michael@0 | 709 | if (c == '\r') {c = LF;} // Canonical CR or LF |
michael@0 | 710 | if (c == '\n') {c = LF;} // Canonical CR or LF |
michael@0 | 711 | |
michael@0 | 712 | if (IsSpecial(c) && !is_plain_text_) { |
michael@0 | 713 | if (c == '<') { |
michael@0 | 714 | // Replace tag with space |
michael@0 | 715 | c = ' '; // for almost-full test below |
michael@0 | 716 | // or if <p> <br> <tr>, replace with \n |
michael@0 | 717 | if (take < (byte_length_ - 3)) { |
michael@0 | 718 | if (EqCase(next_byte_[take + 1], 'p') && |
michael@0 | 719 | NeqLetter(next_byte_[take + 2])) { |
michael@0 | 720 | c = LF; |
michael@0 | 721 | } |
michael@0 | 722 | if (EqCase(next_byte_[take + 1], 'b') && |
michael@0 | 723 | EqCase(next_byte_[take + 2], 'r') && |
michael@0 | 724 | NeqLetter(next_byte_[take + 3])) { |
michael@0 | 725 | c = LF; |
michael@0 | 726 | } |
michael@0 | 727 | if (EqCase(next_byte_[take + 1], 't') && |
michael@0 | 728 | EqCase(next_byte_[take + 2], 'r') && |
michael@0 | 729 | NeqLetter(next_byte_[take + 3])) { |
michael@0 | 730 | c = LF; |
michael@0 | 731 | } |
michael@0 | 732 | } |
michael@0 | 733 | // Begining of tag; skip to end and go around again |
michael@0 | 734 | tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take, |
michael@0 | 735 | exit_state_); |
michael@0 | 736 | // Copy one byte, compressing spaces |
michael@0 | 737 | if (!last_byte_was_space || !WS(c)) { |
michael@0 | 738 | script_buffer_[put++] = c; // Advance dest |
michael@0 | 739 | last_byte_was_space = WS(c); |
michael@0 | 740 | } |
michael@0 | 741 | } else if (c == '>') { |
michael@0 | 742 | // Unexpected end of tag; copy it and go around again |
michael@0 | 743 | tlen = 1; // Over the > |
michael@0 | 744 | script_buffer_[put++] = c; // Advance dest |
michael@0 | 745 | } else if (c == '&') { |
michael@0 | 746 | // Expand entity, no advance |
michael@0 | 747 | EntityToBuffer(next_byte_ + take, byte_length_ - take, |
michael@0 | 748 | script_buffer_ + put, &tlen, &plen); |
michael@0 | 749 | put += plen; // Advance dest |
michael@0 | 750 | } |
michael@0 | 751 | take += tlen; // Advance source |
michael@0 | 752 | } else { |
michael@0 | 753 | // Copy one byte, compressing spaces |
michael@0 | 754 | if (!last_byte_was_space || !WS(c)) { |
michael@0 | 755 | script_buffer_[put++] = c; // Advance dest |
michael@0 | 756 | last_byte_was_space = WS(c); |
michael@0 | 757 | } |
michael@0 | 758 | ++take; // Advance source |
michael@0 | 759 | } |
michael@0 | 760 | |
michael@0 | 761 | if (WS(c) && |
michael@0 | 762 | (put >= put_soft_limit)) { |
michael@0 | 763 | // Buffer is almost full |
michael@0 | 764 | span->truncated = true; |
michael@0 | 765 | break; |
michael@0 | 766 | } |
michael@0 | 767 | if (put >= kMaxScriptBytes) { |
michael@0 | 768 | // Buffer is completely full |
michael@0 | 769 | span->truncated = true; |
michael@0 | 770 | break; |
michael@0 | 771 | } |
michael@0 | 772 | } |
michael@0 | 773 | |
michael@0 | 774 | // Almost done. Back up to a character boundary if needed |
michael@0 | 775 | while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) { |
michael@0 | 776 | // Back up over continuation byte |
michael@0 | 777 | --take; |
michael@0 | 778 | --put; |
michael@0 | 779 | } |
michael@0 | 780 | |
michael@0 | 781 | // Update input position |
michael@0 | 782 | next_byte_ += take; |
michael@0 | 783 | byte_length_ -= take; |
michael@0 | 784 | |
michael@0 | 785 | // Put four more spaces/NUL. Worst case is abcd _ _ _ \0 |
michael@0 | 786 | // kMaxScriptBytes | | put |
michael@0 | 787 | script_buffer_[put + 0] = ' '; |
michael@0 | 788 | script_buffer_[put + 1] = ' '; |
michael@0 | 789 | script_buffer_[put + 2] = ' '; |
michael@0 | 790 | script_buffer_[put + 3] = '\0'; |
michael@0 | 791 | |
michael@0 | 792 | span->text_bytes = put; // Does not include the last four chars above |
michael@0 | 793 | return true; |
michael@0 | 794 | } |
michael@0 | 795 | |
michael@0 | 796 | |
michael@0 | 797 | // Copy next run of same-script non-tag letters to buffer [NUL terminated] |
michael@0 | 798 | // Buffer ALWAYS has leading space and trailing space space space NUL |
michael@0 | 799 | bool ScriptScanner::GetOneScriptSpan(LangSpan* span) { |
michael@0 | 800 | if (!letters_marks_only_) { |
michael@0 | 801 | // Return non-tag text, including punctuation and digits |
michael@0 | 802 | return GetOneTextSpan(span); |
michael@0 | 803 | } |
michael@0 | 804 | |
michael@0 | 805 | span->text = script_buffer_; |
michael@0 | 806 | span->text_bytes = 0; |
michael@0 | 807 | span->offset = next_byte_ - start_byte_; |
michael@0 | 808 | span->ulscript = UNKNOWN_ULSCRIPT; |
michael@0 | 809 | span->lang = UNKNOWN_LANGUAGE; |
michael@0 | 810 | span->truncated = false; |
michael@0 | 811 | |
michael@0 | 812 | // struct timeval script_start, script_mid, script_end; |
michael@0 | 813 | |
michael@0 | 814 | int put_soft_limit = kMaxScriptBytes - kWithinScriptTail; |
michael@0 | 815 | if ((kMaxScriptBytes <= byte_length_) && |
michael@0 | 816 | (byte_length_ < (2 * kMaxScriptBytes))) { |
michael@0 | 817 | // Try to split the last two fragments in half |
michael@0 | 818 | put_soft_limit = byte_length_ / 2; |
michael@0 | 819 | } |
michael@0 | 820 | |
michael@0 | 821 | |
michael@0 | 822 | int spanscript; // The script of this span |
michael@0 | 823 | int sc = UNKNOWN_ULSCRIPT; // The script of next character |
michael@0 | 824 | int tlen = 0; |
michael@0 | 825 | int plen = 0; |
michael@0 | 826 | |
michael@0 | 827 | script_buffer_[0] = ' '; // Always a space at front of output |
michael@0 | 828 | script_buffer_[1] = '\0'; |
michael@0 | 829 | int take = 0; |
michael@0 | 830 | int put = 1; // Start after the initial space |
michael@0 | 831 | |
michael@0 | 832 | // Build offsets from span->text back to start_byte_ + span->offset |
michael@0 | 833 | // This mapping reflects deletion of non-letters, expansion of |
michael@0 | 834 | // entities, etc. |
michael@0 | 835 | map2original_.Clear(); |
michael@0 | 836 | map2original_.Delete(span->offset); // So that MapBack(0) gives offset |
michael@0 | 837 | |
michael@0 | 838 | // Get to the first real non-tag letter or entity that is a letter |
michael@0 | 839 | int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript); |
michael@0 | 840 | next_byte_ += skip; |
michael@0 | 841 | byte_length_ -= skip; |
michael@0 | 842 | |
michael@0 | 843 | if (skip != 1) { |
michael@0 | 844 | map2original_.Delete(skip); |
michael@0 | 845 | map2original_.Insert(1); |
michael@0 | 846 | } else { |
michael@0 | 847 | map2original_.Copy(1); |
michael@0 | 848 | } |
michael@0 | 849 | if (byte_length_ <= 0) { |
michael@0 | 850 | map2original_.Reset(); |
michael@0 | 851 | return false; // No more letters to be found |
michael@0 | 852 | } |
michael@0 | 853 | |
michael@0 | 854 | // There is at least one letter, so we know the script for this span |
michael@0 | 855 | span->ulscript = (ULScript)spanscript; |
michael@0 | 856 | |
michael@0 | 857 | |
michael@0 | 858 | // Go over alternating spans of same-script letters and non-letters, |
michael@0 | 859 | // copying letters to buffer with single spaces for each run of non-letters |
michael@0 | 860 | while (take < byte_length_) { |
michael@0 | 861 | // Copy run of letters in same script (&LS | LS)* |
michael@0 | 862 | int letter_count = 0; // Keep track of word length |
michael@0 | 863 | bool need_break = false; |
michael@0 | 864 | |
michael@0 | 865 | while (take < byte_length_) { |
michael@0 | 866 | // We are at a letter, nonletter, tag, or entity |
michael@0 | 867 | if (IsSpecial(next_byte_[take]) && !is_plain_text_) { |
michael@0 | 868 | if (next_byte_[take] == '<') { |
michael@0 | 869 | // Begining of tag |
michael@0 | 870 | sc = 0; |
michael@0 | 871 | break; |
michael@0 | 872 | } else if (next_byte_[take] == '>') { |
michael@0 | 873 | // Unexpected end of tag |
michael@0 | 874 | sc = 0; |
michael@0 | 875 | break; |
michael@0 | 876 | } else if (next_byte_[take] == '&') { |
michael@0 | 877 | // Copy entity, no advance |
michael@0 | 878 | EntityToBuffer(next_byte_ + take, byte_length_ - take, |
michael@0 | 879 | script_buffer_ + put, &tlen, &plen); |
michael@0 | 880 | sc = GetUTF8LetterScriptNum(script_buffer_ + put); |
michael@0 | 881 | } |
michael@0 | 882 | } else { |
michael@0 | 883 | // Real letter, safely copy up to 4 bytes, increment by 1..4 |
michael@0 | 884 | // Will update by 1..4 bytes at Advance, below |
michael@0 | 885 | tlen = plen = UTF8OneCharLen(next_byte_ + take); |
michael@0 | 886 | if (take < (byte_length_ - 3)) { |
michael@0 | 887 | // X86 fast case, does unaligned load/store |
michael@0 | 888 | UNALIGNED_STORE32(script_buffer_ + put, |
michael@0 | 889 | UNALIGNED_LOAD32(next_byte_ + take)); |
michael@0 | 890 | |
michael@0 | 891 | } else { |
michael@0 | 892 | // Slow case, happens 1-3 times per input document |
michael@0 | 893 | memcpy(script_buffer_ + put, next_byte_ + take, plen); |
michael@0 | 894 | } |
michael@0 | 895 | sc = GetUTF8LetterScriptNum(next_byte_ + take); |
michael@0 | 896 | } |
michael@0 | 897 | |
michael@0 | 898 | // Allow continue across a single letter in a different script: |
michael@0 | 899 | // A B D = three scripts, c = common script, i = inherited script, |
michael@0 | 900 | // - = don't care, ( = take position before the += below |
michael@0 | 901 | // AAA(A- continue |
michael@0 | 902 | // |
michael@0 | 903 | // AAA(BA continue |
michael@0 | 904 | // AAA(BB break |
michael@0 | 905 | // AAA(Bc continue (breaks after B) |
michael@0 | 906 | // AAA(BD break |
michael@0 | 907 | // AAA(Bi break |
michael@0 | 908 | // |
michael@0 | 909 | // AAA(c- break |
michael@0 | 910 | // |
michael@0 | 911 | // AAA(i- continue |
michael@0 | 912 | // |
michael@0 | 913 | |
michael@0 | 914 | if ((sc != spanscript) && (sc != ULScript_Inherited)) { |
michael@0 | 915 | // Might need to break this script span |
michael@0 | 916 | if (sc == ULScript_Common) { |
michael@0 | 917 | need_break = true; |
michael@0 | 918 | } else { |
michael@0 | 919 | // Look at next following character, ignoring entity as Common |
michael@0 | 920 | int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen); |
michael@0 | 921 | if ((sc2 != ULScript_Common) && (sc2 != spanscript)) { |
michael@0 | 922 | // We found a non-trivial change of script |
michael@0 | 923 | if (one_script_only_) { |
michael@0 | 924 | need_break = true; |
michael@0 | 925 | } |
michael@0 | 926 | } |
michael@0 | 927 | } |
michael@0 | 928 | } |
michael@0 | 929 | if (need_break) {break;} // Non-letter or letter in wrong script |
michael@0 | 930 | |
michael@0 | 931 | take += tlen; // Advance |
michael@0 | 932 | put += plen; // Advance |
michael@0 | 933 | |
michael@0 | 934 | // Update the offset map to reflect take/put lengths |
michael@0 | 935 | if (tlen == plen) { |
michael@0 | 936 | map2original_.Copy(tlen); |
michael@0 | 937 | } else if (tlen < plen) { |
michael@0 | 938 | map2original_.Copy(tlen); |
michael@0 | 939 | map2original_.Insert(plen - tlen); |
michael@0 | 940 | } else { // plen < tlen |
michael@0 | 941 | map2original_.Copy(plen); |
michael@0 | 942 | map2original_.Delete(tlen - plen); |
michael@0 | 943 | } |
michael@0 | 944 | |
michael@0 | 945 | ++letter_count; |
michael@0 | 946 | if (put >= kMaxScriptBytes) { |
michael@0 | 947 | // Buffer is full |
michael@0 | 948 | span->truncated = true; |
michael@0 | 949 | break; |
michael@0 | 950 | } |
michael@0 | 951 | } // End while letters |
michael@0 | 952 | |
michael@0 | 953 | // Do run of non-letters (tag | &NL | NL)* |
michael@0 | 954 | while (take < byte_length_) { |
michael@0 | 955 | // Do fast scan to next interesting byte |
michael@0 | 956 | tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take); |
michael@0 | 957 | take += tlen; |
michael@0 | 958 | map2original_.Delete(tlen); |
michael@0 | 959 | if (take >= byte_length_) {break;} // Might have scanned to end |
michael@0 | 960 | |
michael@0 | 961 | // We are at a letter, nonletter, tag, or entity |
michael@0 | 962 | if (IsSpecial(next_byte_[take]) && !is_plain_text_) { |
michael@0 | 963 | if (next_byte_[take] == '<') { |
michael@0 | 964 | // Begining of tag; skip to end and go around again |
michael@0 | 965 | tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take, |
michael@0 | 966 | exit_state_); |
michael@0 | 967 | sc = 0; |
michael@0 | 968 | } else if (next_byte_[take] == '>') { |
michael@0 | 969 | // Unexpected end of tag; skip it and go around again |
michael@0 | 970 | tlen = 1; // Over the > |
michael@0 | 971 | sc = 0; |
michael@0 | 972 | } else if (next_byte_[take] == '&') { |
michael@0 | 973 | // Expand entity, no advance |
michael@0 | 974 | EntityToBuffer(next_byte_ + take, byte_length_ - take, |
michael@0 | 975 | script_buffer_ + put, &tlen, &plen); |
michael@0 | 976 | sc = GetUTF8LetterScriptNum(script_buffer_ + put); |
michael@0 | 977 | } |
michael@0 | 978 | } else { |
michael@0 | 979 | // Update 1..4 |
michael@0 | 980 | tlen = UTF8OneCharLen(next_byte_ + take); |
michael@0 | 981 | sc = GetUTF8LetterScriptNum(next_byte_ + take); |
michael@0 | 982 | } |
michael@0 | 983 | if (sc != 0) {break;} // Letter found |
michael@0 | 984 | take += tlen; // Else advance |
michael@0 | 985 | map2original_.Delete(tlen); |
michael@0 | 986 | } // End while not-letters |
michael@0 | 987 | |
michael@0 | 988 | script_buffer_[put++] = ' '; |
michael@0 | 989 | map2original_.Insert(1); |
michael@0 | 990 | |
michael@0 | 991 | // Letter in wrong script ? |
michael@0 | 992 | if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;} |
michael@0 | 993 | if (put >= put_soft_limit) { |
michael@0 | 994 | // Buffer is almost full |
michael@0 | 995 | span->truncated = true; |
michael@0 | 996 | break; |
michael@0 | 997 | } |
michael@0 | 998 | } |
michael@0 | 999 | |
michael@0 | 1000 | // Almost done. Back up to a character boundary if needed |
michael@0 | 1001 | while ((0 < take) && (take < byte_length_) && |
michael@0 | 1002 | ((next_byte_[take] & 0xc0) == 0x80)) { |
michael@0 | 1003 | // Back up over continuation byte |
michael@0 | 1004 | --take; |
michael@0 | 1005 | --put; |
michael@0 | 1006 | } |
michael@0 | 1007 | |
michael@0 | 1008 | // Update input position |
michael@0 | 1009 | next_byte_ += take; |
michael@0 | 1010 | byte_length_ -= take; |
michael@0 | 1011 | |
michael@0 | 1012 | // Put four more spaces/NUL. Worst case is abcd _ _ _ \0 |
michael@0 | 1013 | // kMaxScriptBytes | | put |
michael@0 | 1014 | script_buffer_[put + 0] = ' '; |
michael@0 | 1015 | script_buffer_[put + 1] = ' '; |
michael@0 | 1016 | script_buffer_[put + 2] = ' '; |
michael@0 | 1017 | script_buffer_[put + 3] = '\0'; |
michael@0 | 1018 | map2original_.Insert(4); |
michael@0 | 1019 | map2original_.Reset(); |
michael@0 | 1020 | |
michael@0 | 1021 | span->text_bytes = put; // Does not include the last four chars above |
michael@0 | 1022 | return true; |
michael@0 | 1023 | } |
michael@0 | 1024 | |
michael@0 | 1025 | // Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase |
michael@0 | 1026 | // List changes with each version of Unicode, so just always lowercase |
michael@0 | 1027 | // Unicode 6.2.0: |
michael@0 | 1028 | // ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN |
michael@0 | 1029 | void ScriptScanner::LowerScriptSpan(LangSpan* span) { |
michael@0 | 1030 | // If needed, lowercase all the text. If we do it sooner, might miss |
michael@0 | 1031 | // lowercasing an entity such as Á |
michael@0 | 1032 | // We only need to do this for Latn and Cyrl scripts |
michael@0 | 1033 | map2uplow_.Clear(); |
michael@0 | 1034 | // Full Unicode lowercase of the entire buffer, including |
michael@0 | 1035 | // four pad bytes off the end. |
michael@0 | 1036 | // Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad |
michael@0 | 1037 | // bytes and put the 0x00 in explicitly. |
michael@0 | 1038 | // Build an offset map from script_buffer_lower_ back to script_buffer_ |
michael@0 | 1039 | int consumed, filled, changed; |
michael@0 | 1040 | StringPiece istr(span->text, span->text_bytes + 3); |
michael@0 | 1041 | StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer); |
michael@0 | 1042 | |
michael@0 | 1043 | UTF8GenericReplace(&utf8repl_lettermarklower_obj, |
michael@0 | 1044 | istr, ostr, is_plain_text_, |
michael@0 | 1045 | &consumed, &filled, &changed, &map2uplow_); |
michael@0 | 1046 | script_buffer_lower_[filled] = '\0'; |
michael@0 | 1047 | span->text = script_buffer_lower_; |
michael@0 | 1048 | span->text_bytes = filled - 3; |
michael@0 | 1049 | map2uplow_.Reset(); |
michael@0 | 1050 | } |
michael@0 | 1051 | |
michael@0 | 1052 | // Copy next run of same-script non-tag letters to buffer [NUL terminated] |
michael@0 | 1053 | // Force Latin, Cyrillic, Greek scripts to be lowercase |
michael@0 | 1054 | // Buffer ALWAYS has leading space and trailing space space space NUL |
michael@0 | 1055 | bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) { |
michael@0 | 1056 | bool ok = GetOneScriptSpan(span); |
michael@0 | 1057 | LowerScriptSpan(span); |
michael@0 | 1058 | return ok; |
michael@0 | 1059 | } |
michael@0 | 1060 | |
michael@0 | 1061 | |
michael@0 | 1062 | // Maps byte offset in most recent GetOneScriptSpan/Lower |
michael@0 | 1063 | // span->text [0..text_bytes] into an additional byte offset from |
michael@0 | 1064 | // span->offset, to get back to corresponding text in the original |
michael@0 | 1065 | // input buffer. |
michael@0 | 1066 | // text_offset must be the first byte |
michael@0 | 1067 | // of a UTF-8 character, or just beyond the last character. Normally this |
michael@0 | 1068 | // routine is called with the first byte of an interesting range and |
michael@0 | 1069 | // again with the first byte of the following range. |
michael@0 | 1070 | int ScriptScanner::MapBack(int text_offset) { |
michael@0 | 1071 | return map2original_.MapBack(map2uplow_.MapBack(text_offset)); |
michael@0 | 1072 | } |
michael@0 | 1073 | |
michael@0 | 1074 | |
michael@0 | 1075 | // Gets lscript number for letters; always returns |
michael@0 | 1076 | // 0 (common script) for non-letters |
michael@0 | 1077 | int GetUTF8LetterScriptNum(const char* src) { |
michael@0 | 1078 | int srclen = UTF8OneCharLen(src); |
michael@0 | 1079 | const uint8* usrc = reinterpret_cast<const uint8*>(src); |
michael@0 | 1080 | return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj, |
michael@0 | 1081 | &usrc, &srclen); |
michael@0 | 1082 | } |
michael@0 | 1083 | |
michael@0 | 1084 | } // namespace CLD2 |
michael@0 | 1085 | |
michael@0 | 1086 |