browser/components/translation/cld2/internal/getonescriptspan.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/getonescriptspan.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1086 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// Author: dsites@google.com (Dick Sites)
    1.20 +//
    1.21 +
    1.22 +
    1.23 +#include "getonescriptspan.h"
    1.24 +#include <string.h>
    1.25 +
    1.26 +#include "fixunicodevalue.h"
    1.27 +#include "lang_script.h"
    1.28 +#include "port.h"
    1.29 +#include "utf8statetable.h"
    1.30 +
    1.31 +#include "utf8prop_lettermarkscriptnum.h"
    1.32 +#include "utf8repl_lettermarklower.h"
    1.33 +#include "utf8scannot_lettermarkspecial.h"
    1.34 +
    1.35 +
    1.36 +namespace CLD2 {
    1.37 +
    1.38 +// Alphabetical order for binary search, from
    1.39 +// generated_entities.cc
    1.40 +extern const int kNameToEntitySize;
    1.41 +extern const CharIntPair kNameToEntity[];
    1.42 +
    1.43 +static const int kMaxUpToWordBoundary = 50;       // span < this make longer,
    1.44 +                                                  // else make shorter
    1.45 +static const int kMaxAdvanceToWordBoundary = 10;  // +/- this many bytes
    1.46 +                                                  // to round to word boundary,
    1.47 +                                                  // direction above
    1.48 +
    1.49 +static const char kSpecialSymbol[256] = {       // true for < > &
    1.50 +  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.51 +  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
    1.52 +  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.53 +  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.54 +
    1.55 +  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.56 +  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.57 +  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.58 +  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.59 +};
    1.60 +
    1.61 +
    1.62 +
    1.63 +#define LT 0      // <
    1.64 +#define GT 1      // >
    1.65 +#define EX 2      // !
    1.66 +#define HY 3      // -
    1.67 +#define QU 4      // "
    1.68 +#define AP 5      // '
    1.69 +#define SL 6      // /
    1.70 +#define S_ 7
    1.71 +#define C_ 8
    1.72 +#define R_ 9
    1.73 +#define I_ 10
    1.74 +#define P_ 11
    1.75 +#define T_ 12
    1.76 +#define Y_ 13
    1.77 +#define L_ 14
    1.78 +#define E_ 15
    1.79 +#define CR 16     // <cr> or <lf>
    1.80 +#define NL 17     // non-letter: ASCII whitespace, digit, punctuation
    1.81 +#define PL 18     // possible letter, incl. &
    1.82 +#define xx 19     // <unused>
    1.83 +
    1.84 +// Map byte to one of ~20 interesting categories for cheap tag parsing
    1.85 +static const uint8 kCharToSub[256] = {
    1.86 +  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
    1.87 +  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
    1.88 +  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
    1.89 +  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
    1.90 +
    1.91 +  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
    1.92 +  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
    1.93 +  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
    1.94 +  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
    1.95 +
    1.96 +  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
    1.97 +  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
    1.98 +  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
    1.99 +  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
   1.100 +
   1.101 +  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
   1.102 +  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
   1.103 +  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
   1.104 +  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
   1.105 +};
   1.106 +
   1.107 +#undef LT
   1.108 +#undef GT
   1.109 +#undef EX
   1.110 +#undef HY
   1.111 +#undef QU
   1.112 +#undef AP
   1.113 +#undef SL
   1.114 +#undef S_
   1.115 +#undef C_
   1.116 +#undef R_
   1.117 +#undef I_
   1.118 +#undef P_
   1.119 +#undef T_
   1.120 +#undef Y_
   1.121 +#undef L_
   1.122 +#undef E_
   1.123 +#undef CR
   1.124 +#undef NL
   1.125 +#undef PL
   1.126 +#undef xx
   1.127 +
   1.128 +
   1.129 +#define OK 0
   1.130 +#define X_ 1
   1.131 +
   1.132 +
   1.133 +static const int kMaxExitStateLettersMarksOnly = 1;
   1.134 +static const int kMaxExitStateAllText = 2;
   1.135 +
   1.136 +
   1.137 +// State machine to do cheap parse of non-letter strings incl. tags
   1.138 +// advances <tag>
   1.139 +//          |    |
   1.140 +// advances <tag> ... </tag>  for <script> <style>
   1.141 +//          |               |
   1.142 +// advances <!-- ... <tag> ... -->
   1.143 +//          |                     |
   1.144 +// advances <tag
   1.145 +//          ||  (0)
   1.146 +// advances <tag <tag2>
   1.147 +//          ||  (0)
   1.148 +//
   1.149 +// We start in state [0] at a non-letter and make at least one transition
   1.150 +// When scanning for just letters, arriving back at state [0] or [1] exits
   1.151 +//   the state machine.
   1.152 +// When scanning for any non-tag text, arriving at state [2] also exits
   1.153 +static const uint8 kTagParseTbl_0[] = {
   1.154 +// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   1.155 +   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK    exit state
   1.156 +  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state
   1.157 +   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*   [exit state]
   1.158 +  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
   1.159 +  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
   1.160 +  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
   1.161 +   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
   1.162 +   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
   1.163 +   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
   1.164 +  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
   1.165 +  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
   1.166 +  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
   1.167 +  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
   1.168 +
   1.169 +// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   1.170 +  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
   1.171 +  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
   1.172 +  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
   1.173 +  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
   1.174 +  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
   1.175 +  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
   1.176 +  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
   1.177 +  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
   1.178 +  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF
   1.179 +  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
   1.180 +  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
   1.181 +  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
   1.182 +  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
   1.183 +  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
   1.184 +  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
   1.185 +
   1.186 +// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   1.187 +  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
   1.188 +  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
   1.189 +  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
   1.190 +  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
   1.191 +  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
   1.192 +  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
   1.193 +  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF
   1.194 +  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
   1.195 +  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
   1.196 +  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
   1.197 +  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
   1.198 +  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
   1.199 +};
   1.200 +
   1.201 +#undef OK
   1.202 +#undef X_
   1.203 +
   1.204 +enum
   1.205 +{
   1.206 +  UTFmax        = 4,            // maximum bytes per rune
   1.207 +  Runesync      = 0x80,         // cannot represent part of a UTF sequence (<)
   1.208 +  Runeself      = 0x80,         // rune and UTF sequences are the same (<)
   1.209 +  Runeerror     = 0xFFFD,       // decoding error in UTF
   1.210 +  Runemax       = 0x10FFFF,     // maximum rune value
   1.211 +};
   1.212 +
   1.213 +// Debugging. Not thread safe.
   1.214 +static char gDisplayPiece[32];
   1.215 +const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4};
   1.216 +char* DisplayPiece(const char* next_byte_, int byte_length_) {
   1.217 +  // Copy up to 8 UTF-8 chars to buffer
   1.218 +  int k = 0;    // byte count
   1.219 +  int n = 0;    // character count
   1.220 +  for (int i = 0; i < byte_length_; ++i) {
   1.221 +    char c = next_byte_[i];
   1.222 +    if ((c & 0xc0) != 0x80) {
   1.223 +      // Beginning of a UTF-8 character
   1.224 +      int charlen = gCharlen[static_cast<uint8>(c) >> 4];
   1.225 +      if (i + charlen > byte_length_) {break;} // Not enough room for full char
   1.226 +      if (k >= (32 - 7)) {break;}   // Not necessarily enough room
   1.227 +      if (n >= 8) {break;}          // Enough characters already
   1.228 +      ++n;
   1.229 +    }
   1.230 +    if (c == '<') {
   1.231 +      memcpy(&gDisplayPiece[k], "&lt;", 4); k += 4;
   1.232 +    } else if (c == '>') {
   1.233 +      memcpy(&gDisplayPiece[k], "&gt;", 4); k += 4;
   1.234 +    } else if (c == '&') {
   1.235 +      memcpy(&gDisplayPiece[k], "&amp;", 5); k += 5;
   1.236 +    } else if (c == '\'') {
   1.237 +      memcpy(&gDisplayPiece[k], "&apos;", 6); k += 6;
   1.238 +    } else if (c == '"') {
   1.239 +      memcpy(&gDisplayPiece[k], "&quot;", 6); k += 6;
   1.240 +    } else {
   1.241 +      gDisplayPiece[k++] = c;
   1.242 +    }
   1.243 +  }
   1.244 +  gDisplayPiece[k++] = '\0';
   1.245 +  return gDisplayPiece;
   1.246 +}
   1.247 +
   1.248 +
   1.249 +
   1.250 +// runetochar copies (encodes) one rune, pointed to by r, to at most
   1.251 +// UTFmax bytes starting at s and returns the number of bytes generated.
   1.252 +int runetochar(char *str, const char32 *rune) {
   1.253 +  // Convert to unsigned for range check.
   1.254 +  unsigned long c;
   1.255 +
   1.256 +  // 1 char 00-7F
   1.257 +  c = *rune;
   1.258 +  if(c <= 0x7F) {
   1.259 +    str[0] = c;
   1.260 +    return 1;
   1.261 +  }
   1.262 +
   1.263 +  // 2 char 0080-07FF
   1.264 +  if(c <= 0x07FF) {
   1.265 +    str[0] = 0xC0 | (c >> 1*6);
   1.266 +    str[1] = 0x80 | (c & 0x3F);
   1.267 +    return 2;
   1.268 +  }
   1.269 +
   1.270 +  // Range check
   1.271 +  if (c > Runemax) {
   1.272 +    c = Runeerror;
   1.273 +  }
   1.274 +
   1.275 +  // 3 char 0800-FFFF
   1.276 +  if (c <= 0xFFFF) {
   1.277 +    str[0] = 0xE0 |  (c >> 2*6);
   1.278 +    str[1] = 0x80 | ((c >> 1*6) & 0x3F);
   1.279 +    str[2] = 0x80 |  (c & 0x3F);
   1.280 +    return 3;
   1.281 +  }
   1.282 +
   1.283 +  // 4 char 10000-1FFFFF
   1.284 +  str[0] = 0xF0 | (c >> 3*6);
   1.285 +  str[1] = 0x80 | ((c >> 2*6) & 0x3F);
   1.286 +  str[2] = 0x80 | ((c >> 1*6) & 0x3F);
   1.287 +  str[3] = 0x80 | (c & 0x3F);
   1.288 +  return 4;
   1.289 +}
   1.290 +
   1.291 +
   1.292 +
   1.293 +// Useful for converting an entity to an ascii value.
   1.294 +// RETURNS unicode value, or -1 if entity isn't valid.  Don't include & or ;
   1.295 +int LookupEntity(const char* entity_name, int entity_len) {
   1.296 +  // Make a C string
   1.297 +  if (entity_len >= 16) {return -1;}    // All real entities are shorter
   1.298 +  char temp[16];
   1.299 +  memcpy(temp, entity_name, entity_len);
   1.300 +  temp[entity_len] = '\0';
   1.301 +  int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity);
   1.302 +  if (match >= 0) {return kNameToEntity[match].i;}
   1.303 +  return -1;
   1.304 +}
   1.305 +
   1.306 +bool ascii_isdigit(char c) {
   1.307 +  return ('0' <= c) && (c <= '9');
   1.308 +}
   1.309 +bool ascii_isxdigit(char c) {
   1.310 +  if (('0' <= c) && (c <= '9')) {return true;}
   1.311 +  if (('a' <= c) && (c <= 'f')) {return true;}
   1.312 +  if (('A' <= c) && (c <= 'F')) {return true;}
   1.313 +  return false;
   1.314 +}
   1.315 +bool ascii_isalnum(char c) {
   1.316 +  if (('0' <= c) && (c <= '9')) {return true;}
   1.317 +  if (('a' <= c) && (c <= 'z')) {return true;}
   1.318 +  if (('A' <= c) && (c <= 'Z')) {return true;}
   1.319 +  return false;
   1.320 +}
   1.321 +int hex_digit_to_int(char c) {
   1.322 +  if (('0' <= c) && (c <= '9')) {return c - '0';}
   1.323 +  if (('a' <= c) && (c <= 'f')) {return c - 'a' + 10;}
   1.324 +  if (('A' <= c) && (c <= 'F')) {return c - 'A' + 10;}
   1.325 +  return 0;
   1.326 +}
   1.327 +
   1.328 +static int32 strto32_base10(const char* nptr, const char* limit,
   1.329 +                            const char **endptr) {
   1.330 +  *endptr = nptr;
   1.331 +  while (nptr < limit && *nptr == '0') {
   1.332 +    ++nptr;
   1.333 +  }
   1.334 +  if (nptr == limit || !ascii_isdigit(*nptr))
   1.335 +    return -1;
   1.336 +  const char* end_digits_run = nptr;
   1.337 +  while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) {
   1.338 +    ++end_digits_run;
   1.339 +  }
   1.340 +  *endptr = end_digits_run;
   1.341 +  const int num_digits = end_digits_run - nptr;
   1.342 +  // kint32max == 2147483647.
   1.343 +  if (num_digits < 9 ||
   1.344 +      (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) {
   1.345 +    int value = 0;
   1.346 +    for (; nptr < end_digits_run; ++nptr) {
   1.347 +      value *= 10;
   1.348 +      value += *nptr - '0';
   1.349 +    }
   1.350 +    // Overflow past the last valid unicode codepoint
   1.351 +    // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
   1.352 +    return FixUnicodeValue(value);
   1.353 +  } else {
   1.354 +    // Overflow: can't fit in an int32;
   1.355 +    // returns the replacement character 0xFFFD.
   1.356 +    return 0xFFFD;
   1.357 +  }
   1.358 +}
   1.359 +
   1.360 +static int32 strto32_base16(const char* nptr, const char* limit,
   1.361 +                            const char **endptr) {
   1.362 +  *endptr = nptr;
   1.363 +  while (nptr < limit && *nptr == '0') {
   1.364 +    ++nptr;
   1.365 +  }
   1.366 +  if (nptr == limit || !ascii_isxdigit(*nptr)) {
   1.367 +    return -1;
   1.368 +  }
   1.369 +  const char* end_xdigits_run = nptr;
   1.370 +  while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) {
   1.371 +    ++end_xdigits_run;
   1.372 +  }
   1.373 +  *endptr = end_xdigits_run;
   1.374 +  const int num_xdigits = end_xdigits_run - nptr;
   1.375 +  // kint32max == 0x7FFFFFFF.
   1.376 +  if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) {
   1.377 +    int value = 0;
   1.378 +    for (; nptr < end_xdigits_run; ++nptr) {
   1.379 +      value <<= 4;
   1.380 +      value += hex_digit_to_int(*nptr);
   1.381 +    }
   1.382 +    // Overflow past the last valid unicode codepoint
   1.383 +    // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
   1.384 +    return FixUnicodeValue(value);
   1.385 +  } else {
   1.386 +    // Overflow: can't fit in an int32;
   1.387 +    // returns the replacement character 0xFFFD.
   1.388 +    return 0xFFFD;
   1.389 +  }
   1.390 +}
   1.391 +
   1.392 +// Unescape the current character pointed to by src.  SETS the number
   1.393 +// of chars read for the conversion (in UTF8).  If src isn't a valid entity,
   1.394 +// just consume the & and RETURN -1.  If src doesn't point to & -- which it
   1.395 +// should -- set src_consumed to 0 and RETURN -1.
   1.396 +int ReadEntity(const char* src, int srcn, int* src_consumed) {
   1.397 +  const char* const srcend = src + srcn;
   1.398 +
   1.399 +  if (srcn == 0 || *src != '&') {      // input should start with an ampersand
   1.400 +    *src_consumed = 0;
   1.401 +    return -1;
   1.402 +  }
   1.403 +  *src_consumed = 1;                   // we'll get the & at least
   1.404 +
   1.405 +  // The standards are a bit unclear on when an entity ends.  Certainly a ";"
   1.406 +  // ends one, but spaces probably do too.  We follow the lead of both IE and
   1.407 +  // Netscape, which as far as we can tell end numeric entities (1st case below)
   1.408 +  // at any non-digit, and end character entities (2nd case) at any non-alnum.
   1.409 +  const char* entstart, *entend;  // where the entity starts and ends
   1.410 +  entstart = src + 1;             // read past the &
   1.411 +  int entval;                     // UCS2 value of the entity
   1.412 +  if ( *entstart == '#' ) {       // -- 1st case: numeric entity
   1.413 +    if ( entstart + 2 >= srcend ) {
   1.414 +      return -1;                  // no way a legitimate number could fit
   1.415 +    } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) {   // hex numeric
   1.416 +      entval = strto32_base16(entstart + 2, srcend, &entend);
   1.417 +    } else {                                  // decimal numeric entity
   1.418 +      entval = strto32_base10(entstart+1, srcend, &entend);
   1.419 +    }
   1.420 +    if (entval == -1 || entend > srcend) {
   1.421 +      return -1;                 // not entirely correct, but close enough
   1.422 +    }
   1.423 +  } else {                       // -- 2nd case: character entity
   1.424 +    for (entend = entstart;
   1.425 +         entend < srcend && ascii_isalnum(*entend);
   1.426 +         ++entend ) {
   1.427 +      // entity consists of alphanumeric chars
   1.428 +    }
   1.429 +    entval = LookupEntity(entstart, entend - entstart);
   1.430 +    if (entval < 0) {
   1.431 +      return -1;  // not a legal entity name
   1.432 +    }
   1.433 +    // Now we do a strange-seeming IE6-compatibility check: if entval is
   1.434 +    // >= 256, it *must* be followed by a semicolon or it's not considered
   1.435 +    // an entity.  The problem is lots of the newfangled entity names, like
   1.436 +    // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en".
   1.437 +    // When these links are written in HTML, it would be really bad if the
   1.438 +    // "&lang" were treated as an entity, which is what the spec says
   1.439 +    // *should* happen (even when the HTML is inside an "A HREF" tag!)
   1.440 +    // IE ignores the spec for these new, high-value entities, so we do too.
   1.441 +    if ( entval >= 256 && !(entend < srcend && *entend == ';') ) {
   1.442 +      return -1;                 // make non-;-terminated entity illegal
   1.443 +    }
   1.444 +  }
   1.445 +
   1.446 +  // Finally, figure out how much src was consumed
   1.447 +  if ( entend < srcend && *entend == ';' ) {
   1.448 +    entend++;                    // standard says ; terminator is special
   1.449 +  }
   1.450 +  *src_consumed = entend - src;
   1.451 +  return entval;
   1.452 +}
   1.453 +
   1.454 +
   1.455 +// Src points to '&'
   1.456 +// Writes entity value to dst. Returns take(src), put(dst) byte counts
   1.457 +void EntityToBuffer(const char* src, int len, char* dst,
   1.458 +                    int* tlen, int* plen) {
   1.459 +  char32 entval = ReadEntity(src, len, tlen);
   1.460 +
   1.461 +  // ReadEntity does this already: entval = FixUnicodeValue(entval);
   1.462 +
   1.463 +  // Convert UTF-32 to UTF-8
   1.464 +  if (entval > 0) {
   1.465 +    *plen = runetochar(dst, &entval);
   1.466 +  } else {
   1.467 +    // Illegal entity; ignore the '&'
   1.468 +    *tlen = 1;
   1.469 +    *plen = 0;
   1.470 +  }
   1.471 +}
   1.472 +
   1.473 +// Returns true if character is < > or &, none of which are letters
   1.474 +bool inline IsSpecial(char c) {
   1.475 +  if ((c & 0xe0) == 0x20) {
   1.476 +    return kSpecialSymbol[static_cast<uint8>(c)];
   1.477 +  }
   1.478 +  return false;
   1.479 +}
   1.480 +
   1.481 +// Quick Skip to next letter or < > & or to end of string (eos)
   1.482 +// Always return is_letter for eos
   1.483 +int ScanToLetterOrSpecial(const char* src, int len) {
   1.484 +  int bytes_consumed;
   1.485 +  StringPiece str(src, len);
   1.486 +  UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed);
   1.487 +  return bytes_consumed;
   1.488 +}
   1.489 +
   1.490 +
   1.491 +
   1.492 +
   1.493 +// src points to non-letter, such as tag-opening '<'
   1.494 +// Return length from here to next possible letter
   1.495 +// On another < before >, return 1
   1.496 +// advances <tag>
   1.497 +//          |    |
   1.498 +// advances <tag> ... </tag>  for <script> <style>
   1.499 +//          |               |
   1.500 +// advances <!-- ... <tag> ... -->
   1.501 +//          |                     |
   1.502 +// advances <tag
   1.503 +//          |    | end of string
   1.504 +// advances <tag <tag2>
   1.505 +//          ||
   1.506 +int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) {
   1.507 +  const uint8* src = reinterpret_cast<const uint8*>(isrc);
   1.508 +  const uint8* srclimit = src + len;
   1.509 +  const uint8* tagParseTbl = kTagParseTbl_0;
   1.510 +  int e = 0;
   1.511 +  while (src < srclimit) {
   1.512 +    e = tagParseTbl[kCharToSub[*src++]];
   1.513 +    if (e <= max_exit_state) {
   1.514 +      // We overshot by one byte
   1.515 +      --src;
   1.516 +      break;
   1.517 +    }
   1.518 +    tagParseTbl = &kTagParseTbl_0[e * 20];
   1.519 +  }
   1.520 +
   1.521 +  if (src >= srclimit) {
   1.522 +    // We fell off the end of the text.
   1.523 +    // It looks like the most common case for this is a truncated file, not
   1.524 +    // mismatched angle brackets. So we pretend that the last char was '>'
   1.525 +    return len;
   1.526 +  }
   1.527 +
   1.528 +  // OK to be in state 0 or state 2 at exit
   1.529 +  if ((e != 0) && (e != 2)) {
   1.530 +    // Error, '<' followed by '<'
   1.531 +    // We want to back up to first <, then advance by one byte past it
   1.532 +    int offset = src - reinterpret_cast<const uint8*>(isrc);
   1.533 +
   1.534 +    // Backscan to first '<' and return enough length to just get past it
   1.535 +    --offset;   // back up over the second '<', which caused us to stop
   1.536 +    while ((0 < offset) && (isrc[offset] != '<')) {
   1.537 +      // Find the first '<', which is unmatched
   1.538 +      --offset;
   1.539 +    }
   1.540 +    // skip to just beyond first '<'
   1.541 +    return offset + 1;
   1.542 +  }
   1.543 +
   1.544 +  return src - reinterpret_cast<const uint8*>(isrc);
   1.545 +}
   1.546 +
   1.547 +
   1.548 +ScriptScanner::ScriptScanner(const char* buffer,
   1.549 +                             int buffer_length,
   1.550 +                             bool is_plain_text)
   1.551 +  : start_byte_(buffer),
   1.552 +  next_byte_(buffer),
   1.553 +  next_byte_limit_(buffer + buffer_length),
   1.554 +  byte_length_(buffer_length),
   1.555 +  is_plain_text_(is_plain_text),
   1.556 +  letters_marks_only_(true),
   1.557 +  one_script_only_(true),
   1.558 +  exit_state_(kMaxExitStateLettersMarksOnly) {
   1.559 +    script_buffer_ = new char[kMaxScriptBuffer];
   1.560 +    script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
   1.561 +    map2original_.Clear();    // map from script_buffer_ to buffer
   1.562 +    map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
   1.563 +}
   1.564 +
   1.565 +// Extended version to allow spans of any non-tag text and spans of mixed script
   1.566 +ScriptScanner::ScriptScanner(const char* buffer,
   1.567 +                             int buffer_length,
   1.568 +                             bool is_plain_text,
   1.569 +                             bool any_text,
   1.570 +                             bool any_script)
   1.571 +  : start_byte_(buffer),
   1.572 +  next_byte_(buffer),
   1.573 +  next_byte_limit_(buffer + buffer_length),
   1.574 +  byte_length_(buffer_length),
   1.575 +  is_plain_text_(is_plain_text),
   1.576 +  letters_marks_only_(!any_text),
   1.577 +  one_script_only_(!any_script),
   1.578 +  exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) {
   1.579 +    script_buffer_ = new char[kMaxScriptBuffer];
   1.580 +    script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
   1.581 +    map2original_.Clear();    // map from script_buffer_ to buffer
   1.582 +    map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
   1.583 +}
   1.584 +
   1.585 +
   1.586 +ScriptScanner::~ScriptScanner() {
   1.587 +  delete[] script_buffer_;
   1.588 +  delete[] script_buffer_lower_;
   1.589 +}
   1.590 +
   1.591 +
   1.592 +
   1.593 +
   1.594 +// Get to the first real non-tag letter or entity that is a letter
   1.595 +// Sets script of that letter
   1.596 +// Return len if no more letters
   1.597 +int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
   1.598 +  int sc = UNKNOWN_ULSCRIPT;
   1.599 +  int skip = 0;
   1.600 +  int tlen, plen;
   1.601 +
   1.602 +  // Do run of non-letters (tag | &NL | NL)*
   1.603 +  tlen = 0;
   1.604 +  while (skip < len) {
   1.605 +    // Do fast scan to next interesting byte
   1.606 +    // int oldskip = skip;
   1.607 +    skip += ScanToLetterOrSpecial(src + skip, len - skip);
   1.608 +
   1.609 +    // Check for no more letters/specials
   1.610 +    if (skip >= len) {
   1.611 +      // All done
   1.612 +      *script = sc;
   1.613 +      return len;
   1.614 +    }
   1.615 +
   1.616 +    // We are at a letter, nonletter, tag, or entity
   1.617 +    if (IsSpecial(src[skip]) && !is_plain_text_) {
   1.618 +      if (src[skip] == '<') {
   1.619 +        // Begining of tag; skip to end and go around again
   1.620 +        tlen = ScanToPossibleLetter(src + skip, len - skip,
   1.621 +                                    exit_state_);
   1.622 +        sc = 0;
   1.623 +      } else if (src[skip] == '>') {
   1.624 +        // Unexpected end of tag; skip it and go around again
   1.625 +        tlen = 1;         // Over the >
   1.626 +        sc = 0;
   1.627 +      } else if (src[skip] == '&') {
   1.628 +        // Expand entity, no advance
   1.629 +        char temp[4];
   1.630 +        EntityToBuffer(src + skip, len - skip,
   1.631 +                       temp, &tlen, &plen);
   1.632 +        sc = GetUTF8LetterScriptNum(temp);
   1.633 +      }
   1.634 +    } else {
   1.635 +      // Update 1..4 bytes
   1.636 +      tlen = UTF8OneCharLen(src + skip);
   1.637 +      sc = GetUTF8LetterScriptNum(src + skip);
   1.638 +    }
   1.639 +    if (sc != 0) {break;}           // Letter found
   1.640 +    skip += tlen;                   // Else advance
   1.641 +  }
   1.642 +
   1.643 +  *script = sc;
   1.644 +  return skip;
   1.645 +}
   1.646 +
   1.647 +
   1.648 +// These are for ASCII-only tag names
   1.649 +// Compare one letter uplow to c, ignoring case of uplowp
   1.650 +inline bool EqCase(char uplow, char c) {
   1.651 +  return (uplow | 0x20) == c;
   1.652 +}
   1.653 +
   1.654 +// These are for ASCII-only tag names
   1.655 +// Return true for space / < > etc. all less than 0x40
   1.656 +inline bool NeqLetter(char c) {
   1.657 +  return c < 0x40;
   1.658 +}
   1.659 +
   1.660 +// These are for ASCII-only tag names
   1.661 +// Return true for space \n false for \r
   1.662 +inline bool WS(char c) {
   1.663 +  return (c == ' ') || (c == '\n');
   1.664 +}
   1.665 +
   1.666 +// Canonical CR or LF
   1.667 +static const char LF = '\n';
   1.668 +
   1.669 +
   1.670 +// The naive loop scans from next_byte_ to script_buffer_ until full.
   1.671 +// But this can leave an awkward hard-to-identify short fragment at the
   1.672 +// end of the input. We would prefer to make the next-to-last fragment
   1.673 +// shorter and the last fragment longer.
   1.674 +
   1.675 +// Copy next run of non-tag characters to buffer [NUL terminated]
   1.676 +// This just replaces tags with space or \n and removes entities.
   1.677 +// Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences
   1.678 +// including \r or \n are replaced by \n. All other tags and skipped text
   1.679 +// are replaced with ASCII space.
   1.680 +//
   1.681 +// Buffer ALWAYS has leading space and trailing space space space NUL
   1.682 +bool ScriptScanner::GetOneTextSpan(LangSpan* span) {
   1.683 +  span->text = script_buffer_;
   1.684 +  span->text_bytes = 0;
   1.685 +  span->offset = next_byte_ - start_byte_;
   1.686 +  span->ulscript = UNKNOWN_ULSCRIPT;
   1.687 +  span->lang = UNKNOWN_LANGUAGE;
   1.688 +  span->truncated = false;
   1.689 +
   1.690 +  int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
   1.691 +  if ((kMaxScriptBytes <= byte_length_) &&
   1.692 +      (byte_length_ < (2 * kMaxScriptBytes))) {
   1.693 +    // Try to split the last two fragments in half
   1.694 +    put_soft_limit = byte_length_ / 2;
   1.695 +  }
   1.696 +
   1.697 +  script_buffer_[0] = ' ';  // Always a space at front of output
   1.698 +  script_buffer_[1] = '\0';
   1.699 +  int take = 0;
   1.700 +  int put = 1;              // Start after the initial space
   1.701 +  int tlen, plen;
   1.702 +
   1.703 +  if (byte_length_ <= 0) {
   1.704 +    return false;          // No more text to be found
   1.705 +  }
   1.706 +
   1.707 +  // Go over alternating spans of text and tags,
   1.708 +  // copying letters to buffer with single spaces for each run of non-letters
   1.709 +  bool last_byte_was_space = false;
   1.710 +  while (take < byte_length_) {
   1.711 +    char c = next_byte_[take];
   1.712 +    if (c == '\r') {c = LF;}      // Canonical CR or LF
   1.713 +    if (c == '\n') {c = LF;}      // Canonical CR or LF
   1.714 +
   1.715 +    if (IsSpecial(c) && !is_plain_text_) {
   1.716 +      if (c == '<') {
   1.717 +        // Replace tag with space
   1.718 +        c = ' ';                      // for almost-full test below
   1.719 +        // or if <p> <br> <tr>, replace with \n
   1.720 +        if (take < (byte_length_ - 3)) {
   1.721 +          if (EqCase(next_byte_[take + 1], 'p') &&
   1.722 +              NeqLetter(next_byte_[take + 2])) {
   1.723 +            c = LF;
   1.724 +          }
   1.725 +          if (EqCase(next_byte_[take + 1], 'b') &&
   1.726 +              EqCase(next_byte_[take + 2], 'r') &&
   1.727 +              NeqLetter(next_byte_[take + 3])) {
   1.728 +            c = LF;
   1.729 +          }
   1.730 +          if (EqCase(next_byte_[take + 1], 't') &&
   1.731 +              EqCase(next_byte_[take + 2], 'r') &&
   1.732 +              NeqLetter(next_byte_[take + 3])) {
   1.733 +            c = LF;
   1.734 +          }
   1.735 +        }
   1.736 +        // Begining of tag; skip to end and go around again
   1.737 +        tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
   1.738 +                                    exit_state_);
   1.739 +        // Copy one byte, compressing spaces
   1.740 +        if (!last_byte_was_space || !WS(c)) {
   1.741 +          script_buffer_[put++] = c;      // Advance dest
   1.742 +          last_byte_was_space = WS(c);
   1.743 +        }
   1.744 +      } else if (c == '>') {
   1.745 +        // Unexpected end of tag; copy it and go around again
   1.746 +        tlen = 1;         // Over the >
   1.747 +        script_buffer_[put++] = c;    // Advance dest
   1.748 +      } else if (c == '&') {
   1.749 +        // Expand entity, no advance
   1.750 +        EntityToBuffer(next_byte_ + take, byte_length_ - take,
   1.751 +                       script_buffer_ + put, &tlen, &plen);
   1.752 +        put += plen;                  // Advance dest
   1.753 +      }
   1.754 +      take += tlen;                   // Advance source
   1.755 +    } else {
   1.756 +      // Copy one byte, compressing spaces
   1.757 +      if (!last_byte_was_space || !WS(c)) {
   1.758 +        script_buffer_[put++] = c;      // Advance dest
   1.759 +        last_byte_was_space = WS(c);
   1.760 +      }
   1.761 +      ++take;                         // Advance source
   1.762 +    }
   1.763 +
   1.764 +    if (WS(c) &&
   1.765 +        (put >= put_soft_limit)) {
   1.766 +      // Buffer is almost full
   1.767 +      span->truncated = true;
   1.768 +      break;
   1.769 +    }
   1.770 +    if (put >= kMaxScriptBytes) {
   1.771 +      // Buffer is completely full
   1.772 +      span->truncated = true;
   1.773 +      break;
   1.774 +    }
   1.775 +  }
   1.776 +
   1.777 +  // Almost done. Back up to a character boundary if needed
   1.778 +  while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) {
   1.779 +    // Back up over continuation byte
   1.780 +    --take;
   1.781 +    --put;
   1.782 +  }
   1.783 +
   1.784 +  // Update input position
   1.785 +  next_byte_ += take;
   1.786 +  byte_length_ -= take;
   1.787 +
   1.788 +  // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
   1.789 +  //                          kMaxScriptBytes |   | put
   1.790 +  script_buffer_[put + 0] = ' ';
   1.791 +  script_buffer_[put + 1] = ' ';
   1.792 +  script_buffer_[put + 2] = ' ';
   1.793 +  script_buffer_[put + 3] = '\0';
   1.794 +
   1.795 +  span->text_bytes = put;       // Does not include the last four chars above
   1.796 +  return true;
   1.797 +}
   1.798 +
   1.799 +
   1.800 +// Copy next run of same-script non-tag letters to buffer [NUL terminated]
   1.801 +// Buffer ALWAYS has leading space and trailing space space space NUL
   1.802 +bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
   1.803 +  if (!letters_marks_only_) {
   1.804 +    // Return non-tag text, including punctuation and digits
   1.805 +    return GetOneTextSpan(span);
   1.806 +  }
   1.807 +
   1.808 +  span->text = script_buffer_;
   1.809 +  span->text_bytes = 0;
   1.810 +  span->offset = next_byte_ - start_byte_;
   1.811 +  span->ulscript = UNKNOWN_ULSCRIPT;
   1.812 +  span->lang = UNKNOWN_LANGUAGE;
   1.813 +  span->truncated = false;
   1.814 +
   1.815 +  // struct timeval script_start, script_mid, script_end;
   1.816 +
   1.817 +  int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
   1.818 +  if ((kMaxScriptBytes <= byte_length_) &&
   1.819 +      (byte_length_ < (2 * kMaxScriptBytes))) {
   1.820 +    // Try to split the last two fragments in half
   1.821 +    put_soft_limit = byte_length_ / 2;
   1.822 +  }
   1.823 +
   1.824 +
   1.825 +  int spanscript;           // The script of this span
   1.826 +  int sc = UNKNOWN_ULSCRIPT;  // The script of next character
   1.827 +  int tlen = 0;
   1.828 +  int plen = 0;
   1.829 +
   1.830 +  script_buffer_[0] = ' ';  // Always a space at front of output
   1.831 +  script_buffer_[1] = '\0';
   1.832 +  int take = 0;
   1.833 +  int put = 1;              // Start after the initial space
   1.834 +
   1.835 +  // Build offsets from span->text back to start_byte_ + span->offset
   1.836 +  // This mapping reflects deletion of non-letters, expansion of
   1.837 +  // entities, etc.
   1.838 +  map2original_.Clear();
   1.839 +  map2original_.Delete(span->offset);   // So that MapBack(0) gives offset
   1.840 +
   1.841 +  // Get to the first real non-tag letter or entity that is a letter
   1.842 +  int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
   1.843 +  next_byte_ += skip;
   1.844 +  byte_length_ -= skip;
   1.845 +
   1.846 +  if (skip != 1) {
   1.847 +    map2original_.Delete(skip);
   1.848 +    map2original_.Insert(1);
   1.849 +  } else {
   1.850 +    map2original_.Copy(1);
   1.851 +  }
   1.852 +  if (byte_length_ <= 0) {
   1.853 +    map2original_.Reset();
   1.854 +    return false;               // No more letters to be found
   1.855 +  }
   1.856 +
   1.857 +  // There is at least one letter, so we know the script for this span
   1.858 +  span->ulscript = (ULScript)spanscript;
   1.859 +
   1.860 +
   1.861 +  // Go over alternating spans of same-script letters and non-letters,
   1.862 +  // copying letters to buffer with single spaces for each run of non-letters
   1.863 +  while (take < byte_length_) {
   1.864 +    // Copy run of letters in same script (&LS | LS)*
   1.865 +    int letter_count = 0;              // Keep track of word length
   1.866 +    bool need_break = false;
   1.867 +
   1.868 +    while (take < byte_length_) {
   1.869 +      // We are at a letter, nonletter, tag, or entity
   1.870 +      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
   1.871 +        if (next_byte_[take] == '<') {
   1.872 +          // Begining of tag
   1.873 +          sc = 0;
   1.874 +          break;
   1.875 +        } else if (next_byte_[take] == '>') {
   1.876 +          // Unexpected end of tag
   1.877 +          sc = 0;
   1.878 +          break;
   1.879 +        } else if (next_byte_[take] == '&') {
   1.880 +          // Copy entity, no advance
   1.881 +          EntityToBuffer(next_byte_ + take, byte_length_ - take,
   1.882 +                         script_buffer_ + put, &tlen, &plen);
   1.883 +          sc = GetUTF8LetterScriptNum(script_buffer_ + put);
   1.884 +        }
   1.885 +      } else {
   1.886 +        // Real letter, safely copy up to 4 bytes, increment by 1..4
   1.887 +        // Will update by 1..4 bytes at Advance, below
   1.888 +        tlen = plen = UTF8OneCharLen(next_byte_ + take);
   1.889 +        if (take < (byte_length_ - 3)) {
   1.890 +          // X86 fast case, does unaligned load/store
   1.891 +          UNALIGNED_STORE32(script_buffer_ + put,
   1.892 +                            UNALIGNED_LOAD32(next_byte_ + take));
   1.893 +
   1.894 +        } else {
   1.895 +          // Slow case, happens 1-3 times per input document
   1.896 +          memcpy(script_buffer_ + put, next_byte_ + take, plen);
   1.897 +        }
   1.898 +        sc = GetUTF8LetterScriptNum(next_byte_ + take);
   1.899 +      }
   1.900 +
   1.901 +      // Allow continue across a single letter in a different script:
   1.902 +      // A B D = three scripts, c = common script, i = inherited script,
   1.903 +      // - = don't care, ( = take position before the += below
   1.904 +      //  AAA(A-    continue
   1.905 +      //
   1.906 +      //  AAA(BA    continue
   1.907 +      //  AAA(BB    break
   1.908 +      //  AAA(Bc    continue (breaks after B)
   1.909 +      //  AAA(BD    break
   1.910 +      //  AAA(Bi    break
   1.911 +      //
   1.912 +      //  AAA(c-    break
   1.913 +      //
   1.914 +      //  AAA(i-    continue
   1.915 +      //
   1.916 +
   1.917 +      if ((sc != spanscript) && (sc != ULScript_Inherited)) {
   1.918 +        // Might need to break this script span
   1.919 +        if (sc == ULScript_Common) {
   1.920 +          need_break = true;
   1.921 +        } else {
   1.922 +          // Look at next following character, ignoring entity as Common
   1.923 +          int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen);
   1.924 +          if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
   1.925 +            // We found a non-trivial change of script
   1.926 +            if (one_script_only_) {
   1.927 +              need_break = true;
   1.928 +            }
   1.929 +          }
   1.930 +        }
   1.931 +      }
   1.932 +      if (need_break) {break;}  // Non-letter or letter in wrong script
   1.933 +
   1.934 +      take += tlen;                   // Advance
   1.935 +      put += plen;                    // Advance
   1.936 +
   1.937 +      // Update the offset map to reflect take/put lengths
   1.938 +      if (tlen == plen) {
   1.939 +        map2original_.Copy(tlen);
   1.940 +      } else if (tlen < plen) {
   1.941 +        map2original_.Copy(tlen);
   1.942 +        map2original_.Insert(plen - tlen);
   1.943 +      } else {    // plen < tlen
   1.944 +        map2original_.Copy(plen);
   1.945 +        map2original_.Delete(tlen - plen);
   1.946 +      }
   1.947 +
   1.948 +      ++letter_count;
   1.949 +      if (put >= kMaxScriptBytes) {
   1.950 +        // Buffer is full
   1.951 +        span->truncated = true;
   1.952 +        break;
   1.953 +      }
   1.954 +    }     // End while letters
   1.955 +
   1.956 +    // Do run of non-letters (tag | &NL | NL)*
   1.957 +    while (take < byte_length_) {
   1.958 +      // Do fast scan to next interesting byte
   1.959 +      tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
   1.960 +      take += tlen;
   1.961 +      map2original_.Delete(tlen);
   1.962 +      if (take >= byte_length_) {break;}    // Might have scanned to end
   1.963 +
   1.964 +      // We are at a letter, nonletter, tag, or entity
   1.965 +      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
   1.966 +        if (next_byte_[take] == '<') {
   1.967 +          // Begining of tag; skip to end and go around again
   1.968 +          tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
   1.969 +                                      exit_state_);
   1.970 +          sc = 0;
   1.971 +        } else if (next_byte_[take] == '>') {
   1.972 +          // Unexpected end of tag; skip it and go around again
   1.973 +          tlen = 1;         // Over the >
   1.974 +          sc = 0;
   1.975 +        } else if (next_byte_[take] == '&') {
   1.976 +          // Expand entity, no advance
   1.977 +          EntityToBuffer(next_byte_ + take, byte_length_ - take,
   1.978 +                         script_buffer_ + put, &tlen, &plen);
   1.979 +          sc = GetUTF8LetterScriptNum(script_buffer_ + put);
   1.980 +        }
   1.981 +      } else {
   1.982 +        // Update 1..4
   1.983 +        tlen = UTF8OneCharLen(next_byte_ + take);
   1.984 +        sc = GetUTF8LetterScriptNum(next_byte_ + take);
   1.985 +      }
   1.986 +      if (sc != 0) {break;}           // Letter found
   1.987 +      take += tlen;                   // Else advance
   1.988 +      map2original_.Delete(tlen);
   1.989 +    }     // End while not-letters
   1.990 +
   1.991 +    script_buffer_[put++] = ' ';
   1.992 +    map2original_.Insert(1);
   1.993 +
   1.994 +    // Letter in wrong script ?
   1.995 +    if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;}
   1.996 +    if (put >= put_soft_limit) {
   1.997 +      // Buffer is almost full
   1.998 +      span->truncated = true;
   1.999 +      break;
  1.1000 +    }
  1.1001 +  }
  1.1002 +
  1.1003 +  // Almost done. Back up to a character boundary if needed
  1.1004 +  while ((0 < take) && (take < byte_length_) &&
  1.1005 +         ((next_byte_[take] & 0xc0) == 0x80)) {
  1.1006 +    // Back up over continuation byte
  1.1007 +    --take;
  1.1008 +    --put;
  1.1009 +  }
  1.1010 +
  1.1011 +  // Update input position
  1.1012 +  next_byte_ += take;
  1.1013 +  byte_length_ -= take;
  1.1014 +
  1.1015 +  // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
  1.1016 +  //                          kMaxScriptBytes |   | put
  1.1017 +  script_buffer_[put + 0] = ' ';
  1.1018 +  script_buffer_[put + 1] = ' ';
  1.1019 +  script_buffer_[put + 2] = ' ';
  1.1020 +  script_buffer_[put + 3] = '\0';
  1.1021 +  map2original_.Insert(4);
  1.1022 +  map2original_.Reset();
  1.1023 +
  1.1024 +  span->text_bytes = put;       // Does not include the last four chars above
  1.1025 +  return true;
  1.1026 +}
  1.1027 +
  1.1028 +// Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase
  1.1029 +// List changes with each version of Unicode, so just always lowercase
  1.1030 +// Unicode 6.2.0:
  1.1031 +//   ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN
  1.1032 +void ScriptScanner::LowerScriptSpan(LangSpan* span) {
  1.1033 +  // If needed, lowercase all the text. If we do it sooner, might miss
  1.1034 +  // lowercasing an entity such as &Aacute;
  1.1035 +  // We only need to do this for Latn and Cyrl scripts
  1.1036 +  map2uplow_.Clear();
  1.1037 +  // Full Unicode lowercase of the entire buffer, including
  1.1038 +  // four pad bytes off the end.
  1.1039 +  // Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad
  1.1040 +  // bytes and put the 0x00 in explicitly.
  1.1041 +  // Build an offset map from script_buffer_lower_ back to script_buffer_
  1.1042 +  int consumed, filled, changed;
  1.1043 +  StringPiece istr(span->text, span->text_bytes + 3);
  1.1044 +  StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer);
  1.1045 +
  1.1046 +  UTF8GenericReplace(&utf8repl_lettermarklower_obj,
  1.1047 +                            istr, ostr, is_plain_text_,
  1.1048 +                            &consumed, &filled, &changed, &map2uplow_);
  1.1049 +  script_buffer_lower_[filled] = '\0';
  1.1050 +  span->text = script_buffer_lower_;
  1.1051 +  span->text_bytes = filled - 3;
  1.1052 +  map2uplow_.Reset();
  1.1053 +}
  1.1054 +
  1.1055 +// Copy next run of same-script non-tag letters to buffer [NUL terminated]
  1.1056 +// Force Latin, Cyrillic, Greek scripts to be lowercase
  1.1057 +// Buffer ALWAYS has leading space and trailing space space space NUL
  1.1058 +bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) {
  1.1059 +  bool ok = GetOneScriptSpan(span);
  1.1060 +  LowerScriptSpan(span);
  1.1061 +  return ok;
  1.1062 +}
  1.1063 +
  1.1064 +
  1.1065 +// Maps byte offset in most recent GetOneScriptSpan/Lower
  1.1066 +// span->text [0..text_bytes] into an additional byte offset from
  1.1067 +// span->offset, to get back to corresponding text in the original
  1.1068 +// input buffer.
  1.1069 +// text_offset must be the first byte
  1.1070 +// of a UTF-8 character, or just beyond the last character. Normally this
  1.1071 +// routine is called with the first byte of an interesting range and
  1.1072 +// again with the first byte of the following range.
  1.1073 +int ScriptScanner::MapBack(int text_offset) {
  1.1074 +  return map2original_.MapBack(map2uplow_.MapBack(text_offset));
  1.1075 +}
  1.1076 +
  1.1077 +
  1.1078 +// Gets lscript number for letters; always returns
  1.1079 +//   0 (common script) for non-letters
  1.1080 +int GetUTF8LetterScriptNum(const char* src) {
  1.1081 +  int srclen = UTF8OneCharLen(src);
  1.1082 +  const uint8* usrc = reinterpret_cast<const uint8*>(src);
  1.1083 +  return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj,
  1.1084 +                                    &usrc, &srclen);
  1.1085 +}
  1.1086 +
  1.1087 +}  // namespace CLD2
  1.1088 +
  1.1089 +

mercurial