browser/components/translation/cld2/internal/getonescriptspan.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.
     2 //
     3 // Licensed under the Apache License, Version 2.0 (the "License");
     4 // you may not use this file except in compliance with the License.
     5 // You may obtain a copy of the License at
     6 //
     7 //     http://www.apache.org/licenses/LICENSE-2.0
     8 //
     9 // Unless required by applicable law or agreed to in writing, software
    10 // distributed under the License is distributed on an "AS IS" BASIS,
    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12 // See the License for the specific language governing permissions and
    13 // limitations under the License.
    15 //
    16 // Author: dsites@google.com (Dick Sites)
    17 //
    20 #include "getonescriptspan.h"
    21 #include <string.h>
    23 #include "fixunicodevalue.h"
    24 #include "lang_script.h"
    25 #include "port.h"
    26 #include "utf8statetable.h"
    28 #include "utf8prop_lettermarkscriptnum.h"
    29 #include "utf8repl_lettermarklower.h"
    30 #include "utf8scannot_lettermarkspecial.h"
    33 namespace CLD2 {
    35 // Alphabetical order for binary search, from
    36 // generated_entities.cc
       // Both symbols are only declared here. LookupEntity() below hands them to
       // BinarySearch() and reads the matched entry's .i field as the entity value.
    37 extern const int kNameToEntitySize;
    38 extern const CharIntPair kNameToEntity[];
       // Tuning constants for rounding a span to a word boundary.
       // NOTE(review): neither constant is referenced in the part of the file
       // visible here -- confirm they are still used further down.
    40 static const int kMaxUpToWordBoundary = 50;       // span < this make longer,
    41                                                   // else make shorter
    42 static const int kMaxAdvanceToWordBoundary = 10;  // +/- this many bytes
    43                                                   // to round to word boundary,
    44                                                   // direction above
    46 static const char kSpecialSymbol[256] = {       // true for < > &
       // Indexed by unsigned byte value, 32 entries per row. The only nonzero
       // entries are 0x26 '&', 0x3C '<' and 0x3E '>', all in the second row.
       // IsSpecial() only consults the table for bytes in 0x20..0x3F.
    47   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    48   0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
    49   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    50   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    52   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    53   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    54   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    55   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    56 };
    60 #define LT 0      // <
       // These short macros name the 20 character categories (0..19). Each value
       // is a column index into one 20-entry row of the tag-parse table that
       // follows this block. They are #undef'd right after kCharToSub so the
       // short names do not leak into the rest of the file.
    61 #define GT 1      // >
    62 #define EX 2      // !
    63 #define HY 3      // -
    64 #define QU 4      // "
    65 #define AP 5      // '
    66 #define SL 6      // /
    67 #define S_ 7
    68 #define C_ 8
    69 #define R_ 9
    70 #define I_ 10
    71 #define P_ 11
    72 #define T_ 12
    73 #define Y_ 13
    74 #define L_ 14
    75 #define E_ 15
    76 #define CR 16     // <cr> or <lf>
    77 #define NL 17     // non-letter: ASCII whitespace, digit, punctuation
    78 #define PL 18     // possible letter, incl. &
    79 #define xx 19     // <unused>
    81 // Map byte to one of ~20 interesting categories for cheap tag parsing
       // Indexed by unsigned byte value, 16 entries per row. The letters
       // S C R I P T Y L E get their own category in both upper and lower case
       // so the parser can recognise <script> and <style>. Bytes 0x80..0xBF map
       // to NL and bytes 0xC0..0xFF map to PL. No entry uses xx.
    82 static const uint8 kCharToSub[256] = {
    83   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
    84   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
    85   NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
    86   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
    88   PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
    89   P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
    90   PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
    91   P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
    93   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
    94   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
    95   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
    96   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
    98   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
    99   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
   100   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
   101   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
   102 };
   104 #undef LT
   105 #undef GT
   106 #undef EX
   107 #undef HY
   108 #undef QU
   109 #undef AP
   110 #undef SL
   111 #undef S_
   112 #undef C_
   113 #undef R_
   114 #undef I_
   115 #undef P_
   116 #undef T_
   117 #undef Y_
   118 #undef L_
   119 #undef E_
   120 #undef CR
   121 #undef NL
   122 #undef PL
   123 #undef xx
   126 #define OK 0
       // OK (state 0) and X_ (state 1) are the two exit states used inside the
       // table below; both macros are #undef'd right after the table.
   127 #define X_ 1
       // Largest state number that ends a scan. ScanToPossibleLetter() stops as
       // soon as the next state is <= the limit it was given, so the "all text"
       // limit additionally stops on state 2.
   130 static const int kMaxExitStateLettersMarksOnly = 1;
   131 static const int kMaxExitStateAllText = 2;
   134 // State machine to do cheap parse of non-letter strings incl. tags
   135 // advances <tag>
   136 //          |    |
   137 // advances <tag> ... </tag>  for <script> <style>
   138 //          |               |
   139 // advances <!-- ... <tag> ... -->
   140 //          |                     |
   141 // advances <tag
   142 //          ||  (0)
   143 // advances <tag <tag2>
   144 //          ||  (0)
   145 //
   146 // We start in state [0] at a non-letter and make at least one transition
   147 // When scanning for just letters, arriving back at state [0] or [1] exits
   148 //   the state machine.
   149 // When scanning for any non-tag text, arriving at state [2] also exits
       //
       // Layout: a flattened table of 40 states with 20 columns each. The row for
       // state s starts at kTagParseTbl_0[s * 20]; the column is the category
       // that kCharToSub gives for the current byte; the entry is the next state.
   150 static const uint8 kTagParseTbl_0[] = {
   151 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   152    3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK    exit state
   153   X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state
   154    3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*   [exit state]
   155   X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
   156   X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
   157   X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
   158    6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
   159    6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
   160    6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
   161   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
   162   10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
   163   11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
   164   X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
   166 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   167   X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
   168   X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
   169   X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
   170   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
   171   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
   172   X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
   173   20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
   174   19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
   175   19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF
   176   19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
   177   19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
   178   19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
   179   19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
   180   19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
   181   19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
   183 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   184   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
   185   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
   186   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
   187   X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
   188   33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
   189   32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
   190   32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF
   191   32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
   192   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
   193   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
   194   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
   195   32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
   196 };
   198 #undef OK
   199 #undef X_
   201 enum
       // Rune/UTF-8 constants for runetochar() below, which uses Runemax as its
       // range limit and Runeerror as the substitute for out-of-range values.
       // NOTE(review): UTFmax, Runesync and Runeself are not referenced in the
       // part of the file visible here.
   202 {
   203   UTFmax        = 4,            // maximum bytes per rune
   204   Runesync      = 0x80,         // cannot represent part of a UTF sequence (<)
   205   Runeself      = 0x80,         // rune and UTF sequences are the same (<)
   206   Runeerror     = 0xFFFD,       // decoding error in UTF
   207   Runemax       = 0x10FFFF,     // maximum rune value
   208 };
   210 // Debugging. Not thread safe.
   211 static char gDisplayPiece[32];
   212 const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4};
   213 char* DisplayPiece(const char* next_byte_, int byte_length_) {
   214   // Copy up to 8 UTF-8 chars to buffer
   215   int k = 0;    // byte count
   216   int n = 0;    // character count
   217   for (int i = 0; i < byte_length_; ++i) {
   218     char c = next_byte_[i];
   219     if ((c & 0xc0) != 0x80) {
   220       // Beginning of a UTF-8 character
   221       int charlen = gCharlen[static_cast<uint8>(c) >> 4];
   222       if (i + charlen > byte_length_) {break;} // Not enough room for full char
   223       if (k >= (32 - 7)) {break;}   // Not necessarily enough room
   224       if (n >= 8) {break;}          // Enough characters already
   225       ++n;
   226     }
   227     if (c == '<') {
   228       memcpy(&gDisplayPiece[k], "&lt;", 4); k += 4;
   229     } else if (c == '>') {
   230       memcpy(&gDisplayPiece[k], "&gt;", 4); k += 4;
   231     } else if (c == '&') {
   232       memcpy(&gDisplayPiece[k], "&amp;", 5); k += 5;
   233     } else if (c == '\'') {
   234       memcpy(&gDisplayPiece[k], "&apos;", 6); k += 6;
   235     } else if (c == '"') {
   236       memcpy(&gDisplayPiece[k], "&quot;", 6); k += 6;
   237     } else {
   238       gDisplayPiece[k++] = c;
   239     }
   240   }
   241   gDisplayPiece[k++] = '\0';
   242   return gDisplayPiece;
   243 }
   247 // runetochar copies (encodes) one rune, pointed to by r, to at most
   248 // UTFmax bytes starting at s and returns the number of bytes generated.
   249 int runetochar(char *str, const char32 *rune) {
   250   // Convert to unsigned for range check.
   251   unsigned long c;
   253   // 1 char 00-7F
   254   c = *rune;
   255   if(c <= 0x7F) {
   256     str[0] = c;
   257     return 1;
   258   }
   260   // 2 char 0080-07FF
   261   if(c <= 0x07FF) {
   262     str[0] = 0xC0 | (c >> 1*6);
   263     str[1] = 0x80 | (c & 0x3F);
   264     return 2;
   265   }
   267   // Range check
   268   if (c > Runemax) {
   269     c = Runeerror;
   270   }
   272   // 3 char 0800-FFFF
   273   if (c <= 0xFFFF) {
   274     str[0] = 0xE0 |  (c >> 2*6);
   275     str[1] = 0x80 | ((c >> 1*6) & 0x3F);
   276     str[2] = 0x80 |  (c & 0x3F);
   277     return 3;
   278   }
   280   // 4 char 10000-1FFFFF
   281   str[0] = 0xF0 | (c >> 3*6);
   282   str[1] = 0x80 | ((c >> 2*6) & 0x3F);
   283   str[2] = 0x80 | ((c >> 1*6) & 0x3F);
   284   str[3] = 0x80 | (c & 0x3F);
   285   return 4;
   286 }
   290 // Useful for converting an entity to an ascii value.
   291 // RETURNS unicode value, or -1 if entity isn't valid.  Don't include & or ;
   292 int LookupEntity(const char* entity_name, int entity_len) {
   293   // Make a C string
   294   if (entity_len >= 16) {return -1;}    // All real entities are shorter
   295   char temp[16];
   296   memcpy(temp, entity_name, entity_len);
   297   temp[entity_len] = '\0';
   298   int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity);
   299   if (match >= 0) {return kNameToEntity[match].i;}
   300   return -1;
   301 }
   303 bool ascii_isdigit(char c) {
   304   return ('0' <= c) && (c <= '9');
   305 }
   306 bool ascii_isxdigit(char c) {
   307   if (('0' <= c) && (c <= '9')) {return true;}
   308   if (('a' <= c) && (c <= 'f')) {return true;}
   309   if (('A' <= c) && (c <= 'F')) {return true;}
   310   return false;
   311 }
   312 bool ascii_isalnum(char c) {
   313   if (('0' <= c) && (c <= '9')) {return true;}
   314   if (('a' <= c) && (c <= 'z')) {return true;}
   315   if (('A' <= c) && (c <= 'Z')) {return true;}
   316   return false;
   317 }
   318 int hex_digit_to_int(char c) {
   319   if (('0' <= c) && (c <= '9')) {return c - '0';}
   320   if (('a' <= c) && (c <= 'f')) {return c - 'a' + 10;}
   321   if (('A' <= c) && (c <= 'F')) {return c - 'A' + 10;}
   322   return 0;
   323 }
   325 static int32 strto32_base10(const char* nptr, const char* limit,
   326                             const char **endptr) {
   327   *endptr = nptr;
   328   while (nptr < limit && *nptr == '0') {
   329     ++nptr;
   330   }
   331   if (nptr == limit || !ascii_isdigit(*nptr))
   332     return -1;
   333   const char* end_digits_run = nptr;
   334   while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) {
   335     ++end_digits_run;
   336   }
   337   *endptr = end_digits_run;
   338   const int num_digits = end_digits_run - nptr;
   339   // kint32max == 2147483647.
   340   if (num_digits < 9 ||
   341       (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) {
   342     int value = 0;
   343     for (; nptr < end_digits_run; ++nptr) {
   344       value *= 10;
   345       value += *nptr - '0';
   346     }
   347     // Overflow past the last valid unicode codepoint
   348     // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
   349     return FixUnicodeValue(value);
   350   } else {
   351     // Overflow: can't fit in an int32;
   352     // returns the replacement character 0xFFFD.
   353     return 0xFFFD;
   354   }
   355 }
   357 static int32 strto32_base16(const char* nptr, const char* limit,
   358                             const char **endptr) {
   359   *endptr = nptr;
   360   while (nptr < limit && *nptr == '0') {
   361     ++nptr;
   362   }
   363   if (nptr == limit || !ascii_isxdigit(*nptr)) {
   364     return -1;
   365   }
   366   const char* end_xdigits_run = nptr;
   367   while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) {
   368     ++end_xdigits_run;
   369   }
   370   *endptr = end_xdigits_run;
   371   const int num_xdigits = end_xdigits_run - nptr;
   372   // kint32max == 0x7FFFFFFF.
   373   if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) {
   374     int value = 0;
   375     for (; nptr < end_xdigits_run; ++nptr) {
   376       value <<= 4;
   377       value += hex_digit_to_int(*nptr);
   378     }
   379     // Overflow past the last valid unicode codepoint
   380     // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
   381     return FixUnicodeValue(value);
   382   } else {
   383     // Overflow: can't fit in an int32;
   384     // returns the replacement character 0xFFFD.
   385     return 0xFFFD;
   386   }
   387 }
   389 // Unescape the current character pointed to by src.  SETS the number
   390 // of chars read for the conversion (in UTF8).  If src isn't a valid entity,
   391 // just consume the & and RETURN -1.  If src doesn't point to & -- which it
   392 // should -- set src_consumed to 0 and RETURN -1.
   393 int ReadEntity(const char* src, int srcn, int* src_consumed) {
   394   const char* const srcend = src + srcn;
   396   if (srcn == 0 || *src != '&') {      // input should start with an ampersand
   397     *src_consumed = 0;
   398     return -1;
   399   }
   400   *src_consumed = 1;                   // we'll get the & at least
   402   // The standards are a bit unclear on when an entity ends.  Certainly a ";"
   403   // ends one, but spaces probably do too.  We follow the lead of both IE and
   404   // Netscape, which as far as we can tell end numeric entities (1st case below)
   405   // at any non-digit, and end character entities (2nd case) at any non-alnum.
   406   const char* entstart, *entend;  // where the entity starts and ends
   407   entstart = src + 1;             // read past the &
   408   int entval;                     // UCS2 value of the entity
   409   if ( *entstart == '#' ) {       // -- 1st case: numeric entity
   410     if ( entstart + 2 >= srcend ) {
   411       return -1;                  // no way a legitimate number could fit
   412     } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) {   // hex numeric
   413       entval = strto32_base16(entstart + 2, srcend, &entend);
   414     } else {                                  // decimal numeric entity
   415       entval = strto32_base10(entstart+1, srcend, &entend);
   416     }
   417     if (entval == -1 || entend > srcend) {
   418       return -1;                 // not entirely correct, but close enough
   419     }
   420   } else {                       // -- 2nd case: character entity
   421     for (entend = entstart;
   422          entend < srcend && ascii_isalnum(*entend);
   423          ++entend ) {
   424       // entity consists of alphanumeric chars
   425     }
   426     entval = LookupEntity(entstart, entend - entstart);
   427     if (entval < 0) {
   428       return -1;  // not a legal entity name
   429     }
   430     // Now we do a strange-seeming IE6-compatibility check: if entval is
   431     // >= 256, it *must* be followed by a semicolon or it's not considered
   432     // an entity.  The problem is lots of the newfangled entity names, like
   433     // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en".
   434     // When these links are written in HTML, it would be really bad if the
   435     // "&lang" were treated as an entity, which is what the spec says
   436     // *should* happen (even when the HTML is inside an "A HREF" tag!)
   437     // IE ignores the spec for these new, high-value entities, so we do too.
   438     if ( entval >= 256 && !(entend < srcend && *entend == ';') ) {
   439       return -1;                 // make non-;-terminated entity illegal
   440     }
   441   }
   443   // Finally, figure out how much src was consumed
   444   if ( entend < srcend && *entend == ';' ) {
   445     entend++;                    // standard says ; terminator is special
   446   }
   447   *src_consumed = entend - src;
   448   return entval;
   449 }
   452 // Src points to '&'
   453 // Writes entity value to dst. Returns take(src), put(dst) byte counts
   454 void EntityToBuffer(const char* src, int len, char* dst,
   455                     int* tlen, int* plen) {
   456   char32 entval = ReadEntity(src, len, tlen);
   458   // ReadEntity does this already: entval = FixUnicodeValue(entval);
   460   // Convert UTF-32 to UTF-8
   461   if (entval > 0) {
   462     *plen = runetochar(dst, &entval);
   463   } else {
   464     // Illegal entity; ignore the '&'
   465     *tlen = 1;
   466     *plen = 0;
   467   }
   468 }
   470 // Returns true if character is < > or &, none of which are letters
   471 bool inline IsSpecial(char c) {
   472   if ((c & 0xe0) == 0x20) {
   473     return kSpecialSymbol[static_cast<uint8>(c)];
   474   }
   475   return false;
   476 }
   478 // Quick Skip to next letter or < > & or to end of string (eos)
   479 // Always return is_letter for eos
   480 int ScanToLetterOrSpecial(const char* src, int len) {
   481   int bytes_consumed;
   482   StringPiece str(src, len);
   483   UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed);
   484   return bytes_consumed;
   485 }
   490 // src points to non-letter, such as tag-opening '<'
   491 // Return length from here to next possible letter
   492 // On another < before >, return 1
   493 // advances <tag>
   494 //          |    |
   495 // advances <tag> ... </tag>  for <script> <style>
   496 //          |               |
   497 // advances <!-- ... <tag> ... -->
   498 //          |                     |
   499 // advances <tag
   500 //          |    | end of string
   501 // advances <tag <tag2>
   502 //          ||
   503 int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) {
   504   const uint8* src = reinterpret_cast<const uint8*>(isrc);
   505   const uint8* srclimit = src + len;
   506   const uint8* tagParseTbl = kTagParseTbl_0;
   507   int e = 0;
   508   while (src < srclimit) {
   509     e = tagParseTbl[kCharToSub[*src++]];
   510     if (e <= max_exit_state) {
   511       // We overshot by one byte
   512       --src;
   513       break;
   514     }
   515     tagParseTbl = &kTagParseTbl_0[e * 20];
   516   }
   518   if (src >= srclimit) {
   519     // We fell off the end of the text.
   520     // It looks like the most common case for this is a truncated file, not
   521     // mismatched angle brackets. So we pretend that the last char was '>'
   522     return len;
   523   }
   525   // OK to be in state 0 or state 2 at exit
   526   if ((e != 0) && (e != 2)) {
   527     // Error, '<' followed by '<'
   528     // We want to back up to first <, then advance by one byte past it
   529     int offset = src - reinterpret_cast<const uint8*>(isrc);
   531     // Backscan to first '<' and return enough length to just get past it
   532     --offset;   // back up over the second '<', which caused us to stop
   533     while ((0 < offset) && (isrc[offset] != '<')) {
   534       // Find the first '<', which is unmatched
   535       --offset;
   536     }
   537     // skip to just beyond first '<'
   538     return offset + 1;
   539   }
   541   return src - reinterpret_cast<const uint8*>(isrc);
   542 }
   545 ScriptScanner::ScriptScanner(const char* buffer,
   546                              int buffer_length,
   547                              bool is_plain_text)
   548   : start_byte_(buffer),
   549   next_byte_(buffer),
   550   next_byte_limit_(buffer + buffer_length),
   551   byte_length_(buffer_length),
   552   is_plain_text_(is_plain_text),
   553   letters_marks_only_(true),
   554   one_script_only_(true),
   555   exit_state_(kMaxExitStateLettersMarksOnly) {
   556     script_buffer_ = new char[kMaxScriptBuffer];
   557     script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
   558     map2original_.Clear();    // map from script_buffer_ to buffer
   559     map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
   560 }
   562 // Extended version to allow spans of any non-tag text and spans of mixed script
   563 ScriptScanner::ScriptScanner(const char* buffer,
   564                              int buffer_length,
   565                              bool is_plain_text,
   566                              bool any_text,
   567                              bool any_script)
   568   : start_byte_(buffer),
   569   next_byte_(buffer),
   570   next_byte_limit_(buffer + buffer_length),
   571   byte_length_(buffer_length),
   572   is_plain_text_(is_plain_text),
   573   letters_marks_only_(!any_text),
   574   one_script_only_(!any_script),
   575   exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) {
   576     script_buffer_ = new char[kMaxScriptBuffer];
   577     script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
   578     map2original_.Clear();    // map from script_buffer_ to buffer
   579     map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
   580 }
   583 ScriptScanner::~ScriptScanner() {
   584   delete[] script_buffer_;
   585   delete[] script_buffer_lower_;
   586 }
   591 // Get to the first real non-tag letter or entity that is a letter
   592 // Sets script of that letter
   593 // Return len if no more letters
   594 int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
   595   int sc = UNKNOWN_ULSCRIPT;
   596   int skip = 0;
   597   int tlen, plen;
   599   // Do run of non-letters (tag | &NL | NL)*
   600   tlen = 0;
   601   while (skip < len) {
   602     // Do fast scan to next interesting byte
   603     // int oldskip = skip;
   604     skip += ScanToLetterOrSpecial(src + skip, len - skip);
   606     // Check for no more letters/specials
   607     if (skip >= len) {
   608       // All done
   609       *script = sc;
   610       return len;
   611     }
   613     // We are at a letter, nonletter, tag, or entity
   614     if (IsSpecial(src[skip]) && !is_plain_text_) {
   615       if (src[skip] == '<') {
   616         // Begining of tag; skip to end and go around again
   617         tlen = ScanToPossibleLetter(src + skip, len - skip,
   618                                     exit_state_);
   619         sc = 0;
   620       } else if (src[skip] == '>') {
   621         // Unexpected end of tag; skip it and go around again
   622         tlen = 1;         // Over the >
   623         sc = 0;
   624       } else if (src[skip] == '&') {
   625         // Expand entity, no advance
   626         char temp[4];
   627         EntityToBuffer(src + skip, len - skip,
   628                        temp, &tlen, &plen);
   629         sc = GetUTF8LetterScriptNum(temp);
   630       }
   631     } else {
   632       // Update 1..4 bytes
   633       tlen = UTF8OneCharLen(src + skip);
   634       sc = GetUTF8LetterScriptNum(src + skip);
   635     }
   636     if (sc != 0) {break;}           // Letter found
   637     skip += tlen;                   // Else advance
   638   }
   640   *script = sc;
   641   return skip;
   642 }
   645 // These are for ASCII-only tag names
   646 // Compare one letter uplow to c, ignoring case of uplowp
   647 inline bool EqCase(char uplow, char c) {
   648   return (uplow | 0x20) == c;
   649 }
   651 // These are for ASCII-only tag names
   652 // Return true for space / < > etc. all less than 0x40
   653 inline bool NeqLetter(char c) {
   654   return c < 0x40;
   655 }
   657 // These are for ASCII-only tag names
   658 // Return true for space \n false for \r
   659 inline bool WS(char c) {
   660   return (c == ' ') || (c == '\n');
   661 }
   663 // Canonical CR or LF
   664 static const char LF = '\n';
   667 // The naive loop scans from next_byte_ to script_buffer_ until full.
   668 // But this can leave an awkward hard-to-identify short fragment at the
   669 // end of the input. We would prefer to make the next-to-last fragment
   670 // shorter and the last fragment longer.
   672 // Copy next run of non-tag characters to buffer [NUL terminated]
   673 // This just replaces tags with space or \n and removes entities.
   674 // Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences
   675 // including \r or \n are replaced by \n. All other tags and skipped text
   676 // are replaced with ASCII space.
   677 //
   678 // Buffer ALWAYS has leading space and trailing space space space NUL
   679 bool ScriptScanner::GetOneTextSpan(LangSpan* span) {
   680   span->text = script_buffer_;
   681   span->text_bytes = 0;
   682   span->offset = next_byte_ - start_byte_;
   683   span->ulscript = UNKNOWN_ULSCRIPT;
   684   span->lang = UNKNOWN_LANGUAGE;
   685   span->truncated = false;
   687   int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
   688   if ((kMaxScriptBytes <= byte_length_) &&
   689       (byte_length_ < (2 * kMaxScriptBytes))) {
   690     // Try to split the last two fragments in half
   691     put_soft_limit = byte_length_ / 2;
   692   }
   694   script_buffer_[0] = ' ';  // Always a space at front of output
   695   script_buffer_[1] = '\0';
   696   int take = 0;
   697   int put = 1;              // Start after the initial space
   698   int tlen, plen;
   700   if (byte_length_ <= 0) {
   701     return false;          // No more text to be found
   702   }
   704   // Go over alternating spans of text and tags,
   705   // copying letters to buffer with single spaces for each run of non-letters
   706   bool last_byte_was_space = false;
   707   while (take < byte_length_) {
   708     char c = next_byte_[take];
   709     if (c == '\r') {c = LF;}      // Canonical CR or LF
   710     if (c == '\n') {c = LF;}      // Canonical CR or LF
   712     if (IsSpecial(c) && !is_plain_text_) {
   713       if (c == '<') {
   714         // Replace tag with space
   715         c = ' ';                      // for almost-full test below
   716         // or if <p> <br> <tr>, replace with \n
   717         if (take < (byte_length_ - 3)) {
   718           if (EqCase(next_byte_[take + 1], 'p') &&
   719               NeqLetter(next_byte_[take + 2])) {
   720             c = LF;
   721           }
   722           if (EqCase(next_byte_[take + 1], 'b') &&
   723               EqCase(next_byte_[take + 2], 'r') &&
   724               NeqLetter(next_byte_[take + 3])) {
   725             c = LF;
   726           }
   727           if (EqCase(next_byte_[take + 1], 't') &&
   728               EqCase(next_byte_[take + 2], 'r') &&
   729               NeqLetter(next_byte_[take + 3])) {
   730             c = LF;
   731           }
   732         }
   733         // Begining of tag; skip to end and go around again
   734         tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
   735                                     exit_state_);
   736         // Copy one byte, compressing spaces
   737         if (!last_byte_was_space || !WS(c)) {
   738           script_buffer_[put++] = c;      // Advance dest
   739           last_byte_was_space = WS(c);
   740         }
   741       } else if (c == '>') {
   742         // Unexpected end of tag; copy it and go around again
   743         tlen = 1;         // Over the >
   744         script_buffer_[put++] = c;    // Advance dest
   745       } else if (c == '&') {
   746         // Expand entity, no advance
   747         EntityToBuffer(next_byte_ + take, byte_length_ - take,
   748                        script_buffer_ + put, &tlen, &plen);
   749         put += plen;                  // Advance dest
   750       }
   751       take += tlen;                   // Advance source
   752     } else {
   753       // Copy one byte, compressing spaces
   754       if (!last_byte_was_space || !WS(c)) {
   755         script_buffer_[put++] = c;      // Advance dest
   756         last_byte_was_space = WS(c);
   757       }
   758       ++take;                         // Advance source
   759     }
   761     if (WS(c) &&
   762         (put >= put_soft_limit)) {
   763       // Buffer is almost full
   764       span->truncated = true;
   765       break;
   766     }
   767     if (put >= kMaxScriptBytes) {
   768       // Buffer is completely full
   769       span->truncated = true;
   770       break;
   771     }
   772   }
   774   // Almost done. Back up to a character boundary if needed
   775   while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) {
   776     // Back up over continuation byte
   777     --take;
   778     --put;
   779   }
   781   // Update input position
   782   next_byte_ += take;
   783   byte_length_ -= take;
   785   // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
   786   //                          kMaxScriptBytes |   | put
   787   script_buffer_[put + 0] = ' ';
   788   script_buffer_[put + 1] = ' ';
   789   script_buffer_[put + 2] = ' ';
   790   script_buffer_[put + 3] = '\0';
   792   span->text_bytes = put;       // Does not include the last four chars above
   793   return true;
   794 }
   797 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
   798 // Buffer ALWAYS has leading space and trailing space space space NUL
       //
       // When letters_marks_only_ is false this simply forwards to GetOneTextSpan.
       // Otherwise it fills script_buffer_ with letters of one script, writing a
       // single space for each run of non-letters, and describes the result in
       // *span (text, text_bytes, offset, ulscript, lang, truncated).
       // Returns false when no input is left after SkipToFrontOfSpan
       // (byte_length_ <= 0); returns true otherwise.
       // Side effects: advances next_byte_ / decreases byte_length_ by the number
       // of input bytes consumed, and rebuilds map2original_ for this span.
   799 bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
   800   if (!letters_marks_only_) {
   801     // Return non-tag text, including punctuation and digits
   802     return GetOneTextSpan(span);
   803   }
   805   span->text = script_buffer_;
   806   span->text_bytes = 0;
   807   span->offset = next_byte_ - start_byte_;
   808   span->ulscript = UNKNOWN_ULSCRIPT;
   809   span->lang = UNKNOWN_LANGUAGE;
   810   span->truncated = false;
   812   // struct timeval script_start, script_mid, script_end;
         // Soft limit on output bytes: once a separating space has been written
         // and put has reached this value, the span is ended and marked truncated.
   814   int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
   815   if ((kMaxScriptBytes <= byte_length_) &&
   816       (byte_length_ < (2 * kMaxScriptBytes))) {
   817     // Try to split the last two fragments in half
   818     put_soft_limit = byte_length_ / 2;
   819   }
   822   int spanscript;           // The script of this span
   823   int sc = UNKNOWN_ULSCRIPT;  // The script of next character
         // tlen = input bytes taken by the current item, plen = output bytes put
   824   int tlen = 0;
   825   int plen = 0;
   827   script_buffer_[0] = ' ';  // Always a space at front of output
   828   script_buffer_[1] = '\0';
   829   int take = 0;
   830   int put = 1;              // Start after the initial space
   832   // Build offsets from span->text back to start_byte_ + span->offset
   833   // This mapping reflects deletion of non-letters, expansion of
   834   // entities, etc.
   835   map2original_.Clear();
   836   map2original_.Delete(span->offset);   // So that MapBack(0) gives offset
   838   // Get to the first real non-tag letter or entity that is a letter
   839   int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
   840   next_byte_ += skip;
   841   byte_length_ -= skip;
         // Map the leading output space: it stands for the skipped input bytes,
         // or is a plain one-to-one copy when exactly one byte was skipped.
   843   if (skip != 1) {
   844     map2original_.Delete(skip);
   845     map2original_.Insert(1);
   846   } else {
   847     map2original_.Copy(1);
   848   }
   849   if (byte_length_ <= 0) {
   850     map2original_.Reset();
   851     return false;               // No more letters to be found
   852   }
   854   // There is at least one letter, so we know the script for this span
   855   span->ulscript = (ULScript)spanscript;
   858   // Go over alternating spans of same-script letters and non-letters,
   859   // copying letters to buffer with single spaces for each run of non-letters
   860   while (take < byte_length_) {
   861     // Copy run of letters in same script (&LS | LS)*
   862     int letter_count = 0;              // Keep track of word length
   863     bool need_break = false;
   865     while (take < byte_length_) {
   866       // We are at a letter, nonletter, tag, or entity
   867       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
   868         if (next_byte_[take] == '<') {
   869           // Beginning of tag
   870           sc = 0;
   871           break;
   872         } else if (next_byte_[take] == '>') {
   873           // Unexpected end of tag
   874           sc = 0;
   875           break;
   876         } else if (next_byte_[take] == '&') {
   877           // Copy entity, no advance
   878           EntityToBuffer(next_byte_ + take, byte_length_ - take,
   879                          script_buffer_ + put, &tlen, &plen);
   880           sc = GetUTF8LetterScriptNum(script_buffer_ + put);
   881         }
                 // NOTE(review): if IsSpecial() can be true for a byte other than
                 // '<', '>' or '&', then tlen/plen/sc keep their previous values
                 // here -- confirm IsSpecial() covers only those three bytes.
   882       } else {
   883         // Real letter, safely copy up to 4 bytes, increment by 1..4
   884         // Will update by 1..4 bytes at Advance, below
   885         tlen = plen = UTF8OneCharLen(next_byte_ + take);
   886         if (take < (byte_length_ - 3)) {
   887           // X86 fast case, does unaligned load/store
                   // Presumably moves 4 bytes at once; only plen of them are kept
                   // because put advances by plen below.
   888           UNALIGNED_STORE32(script_buffer_ + put,
   889                             UNALIGNED_LOAD32(next_byte_ + take));
   891         } else {
   892           // Slow case, happens 1-3 times per input document
   893           memcpy(script_buffer_ + put, next_byte_ + take, plen);
   894         }
   895         sc = GetUTF8LetterScriptNum(next_byte_ + take);
   896       }
   898       // Allow continue across a single letter in a different script:
   899       // A B D = three scripts, c = common script, i = inherited script,
   900       // - = don't care, ( = take position before the += below
   901       //  AAA(A-    continue
   902       //
   903       //  AAA(BA    continue
   904       //  AAA(BB    break
   905       //  AAA(Bc    continue (breaks after B)
   906       //  AAA(BD    break
   907       //  AAA(Bi    break
   908       //
   909       //  AAA(c-    break
   910       //
   911       //  AAA(i-    continue
   912       //
             // In the code below, the "break" rows for a letter in another script
             // (B) take effect only when one_script_only_ is set; otherwise the
             // letter is kept in this span.
   914       if ((sc != spanscript) && (sc != ULScript_Inherited)) {
   915         // Might need to break this script span
   916         if (sc == ULScript_Common) {
   917           need_break = true;
   918         } else {
   919           // Look at next following character, ignoring entity as Common
                   // NOTE(review): take + tlen may equal byte_length_ here, so this
                   // lookahead reads at/after the end of the counted input; it
                   // presumably relies on readable bytes following the input --
                   // confirm against the callers' buffer contract.
   920           int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen);
   921           if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
   922             // We found a non-trivial change of script
   923             if (one_script_only_) {
   924               need_break = true;
   925             }
   926           }
   927         }
   928       }
   929       if (need_break) {break;}  // Non-letter or letter in wrong script
   931       take += tlen;                   // Advance
   932       put += plen;                    // Advance
   934       // Update the offset map to reflect take/put lengths
   935       if (tlen == plen) {
   936         map2original_.Copy(tlen);
   937       } else if (tlen < plen) {
   938         map2original_.Copy(tlen);
   939         map2original_.Insert(plen - tlen);
   940       } else {    // plen < tlen
   941         map2original_.Copy(plen);
   942         map2original_.Delete(tlen - plen);
   943       }
   945       ++letter_count;
   946       if (put >= kMaxScriptBytes) {
   947         // Buffer is full
   948         span->truncated = true;
   949         break;
   950       }
   951     }     // End while letters
   953     // Do run of non-letters (tag | &NL | NL)*
   954     while (take < byte_length_) {
   955       // Do fast scan to next interesting byte
   956       tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
   957       take += tlen;
   958       map2original_.Delete(tlen);
   959       if (take >= byte_length_) {break;}    // Might have scanned to end
   961       // We are at a letter, nonletter, tag, or entity
   962       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
   963         if (next_byte_[take] == '<') {
   964           // Beginning of tag; skip to end and go around again
   965           tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
   966                                       exit_state_);
   967           sc = 0;
   968         } else if (next_byte_[take] == '>') {
   969           // Unexpected end of tag; skip it and go around again
   970           tlen = 1;         // Over the >
   971           sc = 0;
   972         } else if (next_byte_[take] == '&') {
   973           // Expand entity, no advance
                   // The expansion is written at script_buffer_ + put only to look
                   // up its script; put is not advanced in this loop.
   974           EntityToBuffer(next_byte_ + take, byte_length_ - take,
   975                          script_buffer_ + put, &tlen, &plen);
   976           sc = GetUTF8LetterScriptNum(script_buffer_ + put);
   977         }
   978       } else {
   979         // Update 1..4
   980         tlen = UTF8OneCharLen(next_byte_ + take);
   981         sc = GetUTF8LetterScriptNum(next_byte_ + take);
   982       }
   983       if (sc != 0) {break;}           // Letter found
   984       take += tlen;                   // Else advance
   985       map2original_.Delete(tlen);
   986     }     // End while not-letters
           // One output space stands for the whole run of non-letters just skipped
   988     script_buffer_[put++] = ' ';
   989     map2original_.Insert(1);
   991     // Letter in wrong script ?
   992     if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;}
   993     if (put >= put_soft_limit) {
   994       // Buffer is almost full
   995       span->truncated = true;
   996       break;
   997     }
   998   }
  1000   // Almost done. Back up to a character boundary if needed
         // (a byte of the form 10xxxxxx is a UTF-8 continuation byte)
  1001   while ((0 < take) && (take < byte_length_) &&
  1002          ((next_byte_[take] & 0xc0) == 0x80)) {
  1003     // Back up over continuation byte
  1004     --take;
  1005     --put;
  1008   // Update input position
  1009   next_byte_ += take;
  1010   byte_length_ -= take;
  1012   // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
  1013   //                          kMaxScriptBytes |   | put
  1014   script_buffer_[put + 0] = ' ';
  1015   script_buffer_[put + 1] = ' ';
  1016   script_buffer_[put + 2] = ' ';
  1017   script_buffer_[put + 3] = '\0';
         // Record the four pad bytes in the offset map, then finish the map
  1018   map2original_.Insert(4);
  1019   map2original_.Reset();
  1021   span->text_bytes = put;       // Does not include the last four chars above
  1022   return true;
  1025 // Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase
  1026 // List changes with each version of Unicode, so just always lowercase
  1027 // Unicode 6.2.0:
  1028 //   ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN
  1029 void ScriptScanner::LowerScriptSpan(LangSpan* span) {
  1030   // If needed, lowercase all the text. If we do it sooner, might miss
  1031   // lowercasing an entity such as &Aacute;
  1032   // We only need to do this for Latn and Cyrl scripts
  1033   map2uplow_.Clear();
  1034   // Full Unicode lowercase of the entire buffer, including
  1035   // four pad bytes off the end.
  1036   // Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad
  1037   // bytes and put the 0x00 in explicitly.
  1038   // Build an offset map from script_buffer_lower_ back to script_buffer_
  1039   int consumed, filled, changed;
  1040   StringPiece istr(span->text, span->text_bytes + 3);
  1041   StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer);
  1043   UTF8GenericReplace(&utf8repl_lettermarklower_obj,
  1044                             istr, ostr, is_plain_text_,
  1045                             &consumed, &filled, &changed, &map2uplow_);
  1046   script_buffer_lower_[filled] = '\0';
  1047   span->text = script_buffer_lower_;
  1048   span->text_bytes = filled - 3;
  1049   map2uplow_.Reset();
  1052 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
  1053 // Force Latin, Cyrillic, Greek scripts to be lowercase
  1054 // Buffer ALWAYS has leading space and trailing space space space NUL
  1055 bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) {
  1056   bool ok = GetOneScriptSpan(span);
  1057   LowerScriptSpan(span);
  1058   return ok;
  1062 // Maps byte offset in most recent GetOneScriptSpan/Lower
  1063 // span->text [0..text_bytes] into an additional byte offset from
  1064 // span->offset, to get back to corresponding text in the original
  1065 // input buffer.
  1066 // text_offset must be the first byte
  1067 // of a UTF-8 character, or just beyond the last character. Normally this
  1068 // routine is called with the first byte of an interesting range and
  1069 // again with the first byte of the following range.
  1070 int ScriptScanner::MapBack(int text_offset) {
  1071   return map2original_.MapBack(map2uplow_.MapBack(text_offset));
  1075 // Gets lscript number for letters; always returns
  1076 //   0 (common script) for non-letters
  1077 int GetUTF8LetterScriptNum(const char* src) {
  1078   int srclen = UTF8OneCharLen(src);
  1079   const uint8* usrc = reinterpret_cast<const uint8*>(src);
  1080   return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj,
  1081                                     &usrc, &srclen);
  1084 }  // namespace CLD2

mercurial