The Tor Browser: browser/components/translation/cld2/internal/getonescriptspan.cc@6474c204b198

Cloned from the upstream tor-browser origin at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 // Copyright 2013 Google Inc. All Rights Reserved.

     2 //

     3 // Licensed under the Apache License, Version 2.0 (the "License");

     4 // you may not use this file except in compliance with the License.

     5 // You may obtain a copy of the License at

     6 //

     7 //     http://www.apache.org/licenses/LICENSE-2.0

     8 //

     9 // Unless required by applicable law or agreed to in writing, software

    10 // distributed under the License is distributed on an "AS IS" BASIS,

    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

    12 // See the License for the specific language governing permissions and

    13 // limitations under the License.

    15 //

    16 // Author: dsites@google.com (Dick Sites)

    17 //

    20 #include "getonescriptspan.h"

    21 #include <string.h>

    23 #include "fixunicodevalue.h"

    24 #include "lang_script.h"

    25 #include "port.h"

    26 #include "utf8statetable.h"

    28 #include "utf8prop_lettermarkscriptnum.h"

    29 #include "utf8repl_lettermarklower.h"

    30 #include "utf8scannot_lettermarkspecial.h"

    33 namespace CLD2 {

    35 // Alphabetical order for binary search, from

    36 // generated_entities.cc

    37 extern const int kNameToEntitySize;

    38 extern const CharIntPair kNameToEntity[];

    40 static const int kMaxUpToWordBoundary = 50;       // span < this make longer,

    41                                                   // else make shorter

    42 static const int kMaxAdvanceToWordBoundary = 10;  // +/- this many bytes

    43                                                   // to round to word boundary,

    44                                                   // direction above

    46 static const char kSpecialSymbol[256] = {       // true for < > &

    47   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    48   0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,

    49   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    50   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    52   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    53   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    54   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    55   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    56 };

// Byte categories used as column indexes into the tag-parsing state table.
// The macros are local to the kCharToSub initializer and are #undef'd
// immediately after it.
    60 #define LT 0      // <

    61 #define GT 1      // >

    62 #define EX 2      // !

    63 #define HY 3      // -

    64 #define QU 4      // "

    65 #define AP 5      // '

    66 #define SL 6      // /

    67 #define S_ 7      // S or s

    68 #define C_ 8      // C or c

    69 #define R_ 9      // R or r

    70 #define I_ 10     // I or i

    71 #define P_ 11     // P or p

    72 #define T_ 12     // T or t

    73 #define Y_ 13     // Y or y

    74 #define L_ 14     // L or l

    75 #define E_ 15     // E or e

    76 #define CR 16     // <cr> or <lf>

    77 #define NL 17     // non-letter: ASCII whitespace, digit, punctuation

    78 #define PL 18     // possible letter, incl. &

    79 #define xx 19     // <unused>

    81 // Map byte to one of ~20 interesting categories for cheap tag parsing
// Indexed by the unsigned byte value, 16 entries per row. The letters that
// spell SCRIPT and STYLE get their own categories (same for upper and lower
// case); bytes 0x80-0xBF map to NL and bytes 0xC0-0xFF map to PL.

    82 static const uint8 kCharToSub[256] = {

    83   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,  // 0x00-0x0F: 0x0A and 0x0D are CR

    84   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0x10-0x1F

    85   NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,  // 0x20-0x2F: ! " & ' - /

    86   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,  // 0x30-0x3F: digits, < >

    88   PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,  // 0x40-0x4F: @ A-O

    89   P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,  // 0x50-0x5F: P-Z, then punctuation

    90   PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,  // 0x60-0x6F: ` a-o

    91   P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,  // 0x70-0x7F: p-z, then punctuation

    93   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0x80-0x8F

    94   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0x90-0x9F

    95   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0xA0-0xAF

    96   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,  // 0xB0-0xBF

    98   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,  // 0xC0-0xCF

    99   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,  // 0xD0-0xDF

   100   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,  // 0xE0-0xEF

   101   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,  // 0xF0-0xFF

   102 };

// The category names are only needed by the initializer above.
   104 #undef LT

   105 #undef GT

   106 #undef EX

   107 #undef HY

   108 #undef QU

   109 #undef AP

   110 #undef SL

   111 #undef S_

   112 #undef C_

   113 #undef R_

   114 #undef I_

   115 #undef P_

   116 #undef T_

   117 #undef Y_

   118 #undef L_

   119 #undef E_

   120 #undef CR

   121 #undef NL

   122 #undef PL

   123 #undef xx

// Names for the two dedicated exit states of the tag-parsing table below;
// both are #undef'd right after the table.
   126 #define OK 0      // state [0]: normal exit

   127 #define X_ 1      // state [1]: error exit

// Largest state number that terminates a scan: states 0..1 when only letters
// and marks are wanted, states 0..2 when any non-tag text is wanted.
   130 static const int kMaxExitStateLettersMarksOnly = 1;

   131 static const int kMaxExitStateAllText = 2;

   134 // State machine to do cheap parse of non-letter strings incl. tags

   135 // advances <tag>

   136 //          |    |

   137 // advances <tag> ... </tag>  for <script> <style>

   138 //          |               |

   139 // advances <!-- ... <tag> ... -->

   140 //          |                     |

   141 // advances <tag

   142 //          ||  (0)

   143 // advances <tag <tag2>

   144 //          ||  (0)

   145 //

   146 // We start in state [0] at a non-letter and make at least one transition

   147 // When scanning for just letters, arriving back at state [0] or [1] exits

   148 //   the state machine.

   149 // When scanning for any non-tag text, arriving at state [2] also exits
//
// Layout: one row of 20 next-state entries per state, 40 states in all. The
// column is the kCharToSub category of the input byte, so the transition for
// state s on category c is kTagParseTbl_0[s * 20 + c].
// States [19]-[27] skip everything up to </SCRIPT and states [32]-[39] do
// the same for </STYLE. States [10] and [11] are inside a double- or
// single-quoted string within a tag; a CR/LF there moves to state [12].

   150 static const uint8 kTagParseTbl_0[] = {

   151 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx

   152    3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK    exit state

   153   X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state

   154    3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*   [exit state]

   155   X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <

   156   X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!

   157   X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-

   158    6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*

   159    6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-

   160    6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--

   161   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*

   162   10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"

   163   11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'

   164   X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '

   166 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx

   167   X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S

   168   X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC

   169   X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR

   170   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI

   171   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP

   172   X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT

   173   20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*

   174   19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<

   175   19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF

   176   19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S

   177   19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC

   178   19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR

   179   19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI

   180   19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP

   181   19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT

   183 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx

   184   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST

   185   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY

   186   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL

   187   X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE

   188   33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*

   189   32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<

   190   32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF

   191   32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S

   192   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST

   193   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY

   194   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL

   195   32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE

   196 };

   198 #undef OK

   199 #undef X_

   201 enum

   202 {

   203   UTFmax        = 4,            // maximum bytes per rune

   204   Runesync      = 0x80,         // cannot represent part of a UTF sequence (<)

   205   Runeself      = 0x80,         // rune and UTF sequences are the same (<)

   206   Runeerror     = 0xFFFD,       // decoding error in UTF

   207   Runemax       = 0x10FFFF,     // maximum rune value

   208 };

   210 // Debugging. Not thread safe.

   211 static char gDisplayPiece[32];

   212 const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4};

   213 char* DisplayPiece(const char* next_byte_, int byte_length_) {

   214   // Copy up to 8 UTF-8 chars to buffer

   215   int k = 0;    // byte count

   216   int n = 0;    // character count

   217   for (int i = 0; i < byte_length_; ++i) {

   218     char c = next_byte_[i];

   219     if ((c & 0xc0) != 0x80) {

   220       // Beginning of a UTF-8 character

   221       int charlen = gCharlen[static_cast<uint8>(c) >> 4];

   222       if (i + charlen > byte_length_) {break;} // Not enough room for full char

   223       if (k >= (32 - 7)) {break;}   // Not necessarily enough room

   224       if (n >= 8) {break;}          // Enough characters already

   225       ++n;

   226     }

   227     if (c == '<') {

   228       memcpy(&gDisplayPiece[k], "&lt;", 4); k += 4;

   229     } else if (c == '>') {

   230       memcpy(&gDisplayPiece[k], "&gt;", 4); k += 4;

   231     } else if (c == '&') {

   232       memcpy(&gDisplayPiece[k], "&amp;", 5); k += 5;

   233     } else if (c == '\'') {

   234       memcpy(&gDisplayPiece[k], "&apos;", 6); k += 6;

   235     } else if (c == '"') {

   236       memcpy(&gDisplayPiece[k], "&quot;", 6); k += 6;

   237     } else {

   238       gDisplayPiece[k++] = c;

   239     }

   240   }

   241   gDisplayPiece[k++] = '\0';

   242   return gDisplayPiece;

   243 }

   247 // runetochar copies (encodes) one rune, pointed to by r, to at most

   248 // UTFmax bytes starting at s and returns the number of bytes generated.

   249 int runetochar(char *str, const char32 *rune) {

   250   // Convert to unsigned for range check.

   251   unsigned long c;

   253   // 1 char 00-7F

   254   c = *rune;

   255   if(c <= 0x7F) {

   256     str[0] = c;

   257     return 1;

   258   }

   260   // 2 char 0080-07FF

   261   if(c <= 0x07FF) {

   262     str[0] = 0xC0 | (c >> 1*6);

   263     str[1] = 0x80 | (c & 0x3F);

   264     return 2;

   265   }

   267   // Range check

   268   if (c > Runemax) {

   269     c = Runeerror;

   270   }

   272   // 3 char 0800-FFFF

   273   if (c <= 0xFFFF) {

   274     str[0] = 0xE0 |  (c >> 2*6);

   275     str[1] = 0x80 | ((c >> 1*6) & 0x3F);

   276     str[2] = 0x80 |  (c & 0x3F);

   277     return 3;

   278   }

   280   // 4 char 10000-1FFFFF

   281   str[0] = 0xF0 | (c >> 3*6);

   282   str[1] = 0x80 | ((c >> 2*6) & 0x3F);

   283   str[2] = 0x80 | ((c >> 1*6) & 0x3F);

   284   str[3] = 0x80 | (c & 0x3F);

   285   return 4;

   286 }

   290 // Useful for converting an entity to an ascii value.

   291 // RETURNS unicode value, or -1 if entity isn't valid.  Don't include & or ;

   292 int LookupEntity(const char* entity_name, int entity_len) {

   293   // Make a C string

   294   if (entity_len >= 16) {return -1;}    // All real entities are shorter

   295   char temp[16];

   296   memcpy(temp, entity_name, entity_len);

   297   temp[entity_len] = '\0';

   298   int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity);

   299   if (match >= 0) {return kNameToEntity[match].i;}

   300   return -1;

   301 }

   303 bool ascii_isdigit(char c) {

   304   return ('0' <= c) && (c <= '9');

   305 }

   306 bool ascii_isxdigit(char c) {

   307   if (('0' <= c) && (c <= '9')) {return true;}

   308   if (('a' <= c) && (c <= 'f')) {return true;}

   309   if (('A' <= c) && (c <= 'F')) {return true;}

   310   return false;

   311 }

   312 bool ascii_isalnum(char c) {

   313   if (('0' <= c) && (c <= '9')) {return true;}

   314   if (('a' <= c) && (c <= 'z')) {return true;}

   315   if (('A' <= c) && (c <= 'Z')) {return true;}

   316   return false;

   317 }

   318 int hex_digit_to_int(char c) {

   319   if (('0' <= c) && (c <= '9')) {return c - '0';}

   320   if (('a' <= c) && (c <= 'f')) {return c - 'a' + 10;}

   321   if (('A' <= c) && (c <= 'F')) {return c - 'A' + 10;}

   322   return 0;

   323 }

   325 static int32 strto32_base10(const char* nptr, const char* limit,

   326                             const char **endptr) {

   327   *endptr = nptr;

   328   while (nptr < limit && *nptr == '0') {

   329     ++nptr;

   330   }

   331   if (nptr == limit || !ascii_isdigit(*nptr))

   332     return -1;

   333   const char* end_digits_run = nptr;

   334   while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) {

   335     ++end_digits_run;

   336   }

   337   *endptr = end_digits_run;

   338   const int num_digits = end_digits_run - nptr;

   339   // kint32max == 2147483647.

   340   if (num_digits < 9 ||

   341       (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) {

   342     int value = 0;

   343     for (; nptr < end_digits_run; ++nptr) {

   344       value *= 10;

   345       value += *nptr - '0';

   346     }

   347     // Overflow past the last valid unicode codepoint

   348     // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().

   349     return FixUnicodeValue(value);

   350   } else {

   351     // Overflow: can't fit in an int32;

   352     // returns the replacement character 0xFFFD.

   353     return 0xFFFD;

   354   }

   355 }

   357 static int32 strto32_base16(const char* nptr, const char* limit,

   358                             const char **endptr) {

   359   *endptr = nptr;

   360   while (nptr < limit && *nptr == '0') {

   361     ++nptr;

   362   }

   363   if (nptr == limit || !ascii_isxdigit(*nptr)) {

   364     return -1;

   365   }

   366   const char* end_xdigits_run = nptr;

   367   while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) {

   368     ++end_xdigits_run;

   369   }

   370   *endptr = end_xdigits_run;

   371   const int num_xdigits = end_xdigits_run - nptr;

   372   // kint32max == 0x7FFFFFFF.

   373   if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) {

   374     int value = 0;

   375     for (; nptr < end_xdigits_run; ++nptr) {

   376       value <<= 4;

   377       value += hex_digit_to_int(*nptr);

   378     }

   379     // Overflow past the last valid unicode codepoint

   380     // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().

   381     return FixUnicodeValue(value);

   382   } else {

   383     // Overflow: can't fit in an int32;

   384     // returns the replacement character 0xFFFD.

   385     return 0xFFFD;

   386   }

   387 }

   389 // Unescape the current character pointed to by src.  SETS the number

   390 // of chars read for the conversion (in UTF8).  If src isn't a valid entity,

   391 // just consume the & and RETURN -1.  If src doesn't point to & -- which it

   392 // should -- set src_consumed to 0 and RETURN -1.

   393 int ReadEntity(const char* src, int srcn, int* src_consumed) {

   394   const char* const srcend = src + srcn;

   396   if (srcn == 0 || *src != '&') {      // input should start with an ampersand

   397     *src_consumed = 0;

   398     return -1;

   399   }

   400   *src_consumed = 1;                   // we'll get the & at least

   402   // The standards are a bit unclear on when an entity ends.  Certainly a ";"

   403   // ends one, but spaces probably do too.  We follow the lead of both IE and

   404   // Netscape, which as far as we can tell end numeric entities (1st case below)

   405   // at any non-digit, and end character entities (2nd case) at any non-alnum.

   406   const char* entstart, *entend;  // where the entity starts and ends

   407   entstart = src + 1;             // read past the &

   408   int entval;                     // UCS2 value of the entity

   409   if ( *entstart == '#' ) {       // -- 1st case: numeric entity

   410     if ( entstart + 2 >= srcend ) {

   411       return -1;                  // no way a legitimate number could fit

   412     } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) {   // hex numeric

   413       entval = strto32_base16(entstart + 2, srcend, &entend);

   414     } else {                                  // decimal numeric entity

   415       entval = strto32_base10(entstart+1, srcend, &entend);

   416     }

   417     if (entval == -1 || entend > srcend) {

   418       return -1;                 // not entirely correct, but close enough

   419     }

   420   } else {                       // -- 2nd case: character entity

   421     for (entend = entstart;

   422          entend < srcend && ascii_isalnum(*entend);

   423          ++entend ) {

   424       // entity consists of alphanumeric chars

   425     }

   426     entval = LookupEntity(entstart, entend - entstart);

   427     if (entval < 0) {

   428       return -1;  // not a legal entity name

   429     }

   430     // Now we do a strange-seeming IE6-compatibility check: if entval is

   431     // >= 256, it *must* be followed by a semicolon or it's not considered

   432     // an entity.  The problem is lots of the newfangled entity names, like

   433     // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en".

   434     // When these links are written in HTML, it would be really bad if the

   435     // "&lang" were treated as an entity, which is what the spec says

   436     // *should* happen (even when the HTML is inside an "A HREF" tag!)

   437     // IE ignores the spec for these new, high-value entities, so we do too.

   438     if ( entval >= 256 && !(entend < srcend && *entend == ';') ) {

   439       return -1;                 // make non-;-terminated entity illegal

   440     }

   441   }

   443   // Finally, figure out how much src was consumed

   444   if ( entend < srcend && *entend == ';' ) {

   445     entend++;                    // standard says ; terminator is special

   446   }

   447   *src_consumed = entend - src;

   448   return entval;

   449 }

   452 // Src points to '&'

   453 // Writes entity value to dst. Returns take(src), put(dst) byte counts

   454 void EntityToBuffer(const char* src, int len, char* dst,

   455                     int* tlen, int* plen) {

   456   char32 entval = ReadEntity(src, len, tlen);

   458   // ReadEntity does this already: entval = FixUnicodeValue(entval);

   460   // Convert UTF-32 to UTF-8

   461   if (entval > 0) {

   462     *plen = runetochar(dst, &entval);

   463   } else {

   464     // Illegal entity; ignore the '&'

   465     *tlen = 1;

   466     *plen = 0;

   467   }

   468 }

   470 // Returns true if character is < > or &, none of which are letters

   471 bool inline IsSpecial(char c) {

   472   if ((c & 0xe0) == 0x20) {

   473     return kSpecialSymbol[static_cast<uint8>(c)];

   474   }

   475   return false;

   476 }

   478 // Quick Skip to next letter or < > & or to end of string (eos)

   479 // Always return is_letter for eos

   480 int ScanToLetterOrSpecial(const char* src, int len) {

   481   int bytes_consumed;

   482   StringPiece str(src, len);

   483   UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed);

   484   return bytes_consumed;

   485 }

   490 // src points to non-letter, such as tag-opening '<'

   491 // Return length from here to next possible letter

   492 // On another < before >, return 1

   493 // advances <tag>

   494 //          |    |

   495 // advances <tag> ... </tag>  for <script> <style>

   496 //          |               |

   497 // advances <!-- ... <tag> ... -->

   498 //          |                     |

   499 // advances <tag

   500 //          |    | end of string

   501 // advances <tag <tag2>

   502 //          ||

   503 int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) {

   504   const uint8* src = reinterpret_cast<const uint8*>(isrc);

   505   const uint8* srclimit = src + len;

   506   const uint8* tagParseTbl = kTagParseTbl_0;

   507   int e = 0;

   508   while (src < srclimit) {

   509     e = tagParseTbl[kCharToSub[*src++]];

   510     if (e <= max_exit_state) {

   511       // We overshot by one byte

   512       --src;

   513       break;

   514     }

   515     tagParseTbl = &kTagParseTbl_0[e * 20];

   516   }

   518   if (src >= srclimit) {

   519     // We fell off the end of the text.

   520     // It looks like the most common case for this is a truncated file, not

   521     // mismatched angle brackets. So we pretend that the last char was '>'

   522     return len;

   523   }

   525   // OK to be in state 0 or state 2 at exit

   526   if ((e != 0) && (e != 2)) {

   527     // Error, '<' followed by '<'

   528     // We want to back up to first <, then advance by one byte past it

   529     int offset = src - reinterpret_cast<const uint8*>(isrc);

   531     // Backscan to first '<' and return enough length to just get past it

   532     --offset;   // back up over the second '<', which caused us to stop

   533     while ((0 < offset) && (isrc[offset] != '<')) {

   534       // Find the first '<', which is unmatched

   535       --offset;

   536     }

   537     // skip to just beyond first '<'

   538     return offset + 1;

   539   }

   541   return src - reinterpret_cast<const uint8*>(isrc);

   542 }

   545 ScriptScanner::ScriptScanner(const char* buffer,

   546                              int buffer_length,

   547                              bool is_plain_text)

   548   : start_byte_(buffer),

   549   next_byte_(buffer),

   550   next_byte_limit_(buffer + buffer_length),

   551   byte_length_(buffer_length),

   552   is_plain_text_(is_plain_text),

   553   letters_marks_only_(true),

   554   one_script_only_(true),

   555   exit_state_(kMaxExitStateLettersMarksOnly) {

   556     script_buffer_ = new char[kMaxScriptBuffer];

   557     script_buffer_lower_ = new char[kMaxScriptLowerBuffer];

   558     map2original_.Clear();    // map from script_buffer_ to buffer

   559     map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_

   560 }

   562 // Extended version to allow spans of any non-tag text and spans of mixed script

   563 ScriptScanner::ScriptScanner(const char* buffer,

   564                              int buffer_length,

   565                              bool is_plain_text,

   566                              bool any_text,

   567                              bool any_script)

   568   : start_byte_(buffer),

   569   next_byte_(buffer),

   570   next_byte_limit_(buffer + buffer_length),

   571   byte_length_(buffer_length),

   572   is_plain_text_(is_plain_text),

   573   letters_marks_only_(!any_text),

   574   one_script_only_(!any_script),

   575   exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) {

   576     script_buffer_ = new char[kMaxScriptBuffer];

   577     script_buffer_lower_ = new char[kMaxScriptLowerBuffer];

   578     map2original_.Clear();    // map from script_buffer_ to buffer

   579     map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_

   580 }

   583 ScriptScanner::~ScriptScanner() {

   584   delete[] script_buffer_;

   585   delete[] script_buffer_lower_;

   586 }

   591 // Get to the first real non-tag letter or entity that is a letter

   592 // Sets script of that letter

   593 // Return len if no more letters

   594 int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {

   595   int sc = UNKNOWN_ULSCRIPT;

   596   int skip = 0;

   597   int tlen, plen;

   599   // Do run of non-letters (tag | &NL | NL)*

   600   tlen = 0;

   601   while (skip < len) {

   602     // Do fast scan to next interesting byte

   603     // int oldskip = skip;

   604     skip += ScanToLetterOrSpecial(src + skip, len - skip);

   606     // Check for no more letters/specials

   607     if (skip >= len) {

   608       // All done

   609       *script = sc;

   610       return len;

   611     }

   613     // We are at a letter, nonletter, tag, or entity

   614     if (IsSpecial(src[skip]) && !is_plain_text_) {

   615       if (src[skip] == '<') {

   616         // Begining of tag; skip to end and go around again

   617         tlen = ScanToPossibleLetter(src + skip, len - skip,

   618                                     exit_state_);

   619         sc = 0;

   620       } else if (src[skip] == '>') {

   621         // Unexpected end of tag; skip it and go around again

   622         tlen = 1;         // Over the >

   623         sc = 0;

   624       } else if (src[skip] == '&') {

   625         // Expand entity, no advance

   626         char temp[4];

   627         EntityToBuffer(src + skip, len - skip,

   628                        temp, &tlen, &plen);

   629         sc = GetUTF8LetterScriptNum(temp);

   630       }

   631     } else {

   632       // Update 1..4 bytes

   633       tlen = UTF8OneCharLen(src + skip);

   634       sc = GetUTF8LetterScriptNum(src + skip);

   635     }

   636     if (sc != 0) {break;}           // Letter found

   637     skip += tlen;                   // Else advance

   638   }

   640   *script = sc;

   641   return skip;

   642 }

   645 // These are for ASCII-only tag names

   646 // Compare one letter uplow to c, ignoring case of uplowp

   647 inline bool EqCase(char uplow, char c) {

   648   return (uplow | 0x20) == c;

   649 }

   651 // These are for ASCII-only tag names

   652 // Return true for space / < > etc. all less than 0x40

   653 inline bool NeqLetter(char c) {

   654   return c < 0x40;

   655 }

   657 // These are for ASCII-only tag names

   658 // Return true for space \n false for \r

   659 inline bool WS(char c) {

   660   return (c == ' ') || (c == '\n');

   661 }

   663 // Canonical CR or LF

   664 static const char LF = '\n';

   667 // The naive loop scans from next_byte_ to script_buffer_ until full.

   668 // But this can leave an awkward hard-to-identify short fragment at the

   669 // end of the input. We would prefer to make the next-to-last fragment

   670 // shorter and the last fragment longer.

   672 // Copy next run of non-tag characters to buffer [NUL terminated]

   673 // This just replaces tags with space or \n and removes entities.

   674 // Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences

   675 // including \r or \n are replaced by \n. All other tags and skipped text

   676 // are replaced with ASCII space.

   677 //

   678 // Buffer ALWAYS has leading space and trailing space space space NUL

   679 bool ScriptScanner::GetOneTextSpan(LangSpan* span) {

   680   span->text = script_buffer_;

   681   span->text_bytes = 0;

   682   span->offset = next_byte_ - start_byte_;

   683   span->ulscript = UNKNOWN_ULSCRIPT;

   684   span->lang = UNKNOWN_LANGUAGE;

   685   span->truncated = false;

   687   int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;

   688   if ((kMaxScriptBytes <= byte_length_) &&

   689       (byte_length_ < (2 * kMaxScriptBytes))) {

   690     // Try to split the last two fragments in half

   691     put_soft_limit = byte_length_ / 2;

   692   }

   694   script_buffer_[0] = ' ';  // Always a space at front of output

   695   script_buffer_[1] = '\0';

   696   int take = 0;

   697   int put = 1;              // Start after the initial space

   698   int tlen, plen;

   700   if (byte_length_ <= 0) {

   701     return false;          // No more text to be found

   702   }

   704   // Go over alternating spans of text and tags,

   705   // copying letters to buffer with single spaces for each run of non-letters

   706   bool last_byte_was_space = false;

   707   while (take < byte_length_) {

   708     char c = next_byte_[take];

   709     if (c == '\r') {c = LF;}      // Canonical CR or LF

   710     if (c == '\n') {c = LF;}      // Canonical CR or LF

   712     if (IsSpecial(c) && !is_plain_text_) {

   713       if (c == '<') {

   714         // Replace tag with space

   715         c = ' ';                      // for almost-full test below

   716         // or if <p> <br> <tr>, replace with \n

   717         if (take < (byte_length_ - 3)) {

   718           if (EqCase(next_byte_[take + 1], 'p') &&

   719               NeqLetter(next_byte_[take + 2])) {

   720             c = LF;

   721           }

   722           if (EqCase(next_byte_[take + 1], 'b') &&

   723               EqCase(next_byte_[take + 2], 'r') &&

   724               NeqLetter(next_byte_[take + 3])) {

   725             c = LF;

   726           }

   727           if (EqCase(next_byte_[take + 1], 't') &&

   728               EqCase(next_byte_[take + 2], 'r') &&

   729               NeqLetter(next_byte_[take + 3])) {

   730             c = LF;

   731           }

   732         }

   733         // Begining of tag; skip to end and go around again

   734         tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,

   735                                     exit_state_);

   736         // Copy one byte, compressing spaces

   737         if (!last_byte_was_space || !WS(c)) {

   738           script_buffer_[put++] = c;      // Advance dest

   739           last_byte_was_space = WS(c);

   740         }

   741       } else if (c == '>') {

   742         // Unexpected end of tag; copy it and go around again

   743         tlen = 1;         // Over the >

   744         script_buffer_[put++] = c;    // Advance dest

   745       } else if (c == '&') {

   746         // Expand entity, no advance

   747         EntityToBuffer(next_byte_ + take, byte_length_ - take,

   748                        script_buffer_ + put, &tlen, &plen);

   749         put += plen;                  // Advance dest

   750       }

   751       take += tlen;                   // Advance source

   752     } else {

   753       // Copy one byte, compressing spaces

   754       if (!last_byte_was_space || !WS(c)) {

   755         script_buffer_[put++] = c;      // Advance dest

   756         last_byte_was_space = WS(c);

   757       }

   758       ++take;                         // Advance source

   759     }

   761     if (WS(c) &&

   762         (put >= put_soft_limit)) {

   763       // Buffer is almost full

   764       span->truncated = true;

   765       break;

   766     }

   767     if (put >= kMaxScriptBytes) {

   768       // Buffer is completely full

   769       span->truncated = true;

   770       break;

   771     }

   772   }

   774   // Almost done. Back up to a character boundary if needed

   775   while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) {

   776     // Back up over continuation byte

   777     --take;

   778     --put;

   779   }

   781   // Update input position

   782   next_byte_ += take;

   783   byte_length_ -= take;

   785   // Put four more spaces/NUL. Worst case is abcd _ _ _ \0

   786   //                          kMaxScriptBytes |   | put

   787   script_buffer_[put + 0] = ' ';

   788   script_buffer_[put + 1] = ' ';

   789   script_buffer_[put + 2] = ' ';

   790   script_buffer_[put + 3] = '\0';

   792   span->text_bytes = put;       // Does not include the last four chars above

   793   return true;

   794 }

   797 // Copy next run of same-script non-tag letters to buffer [NUL terminated]

   798 // Buffer ALWAYS has leading space and trailing space space space NUL

   799 bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {

   800   if (!letters_marks_only_) {

   801     // Return non-tag text, including punctuation and digits

   802     return GetOneTextSpan(span);

   803   }

   805   span->text = script_buffer_;

   806   span->text_bytes = 0;

   807   span->offset = next_byte_ - start_byte_;

   808   span->ulscript = UNKNOWN_ULSCRIPT;

   809   span->lang = UNKNOWN_LANGUAGE;

   810   span->truncated = false;

   812   // struct timeval script_start, script_mid, script_end;

   814   int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;

   815   if ((kMaxScriptBytes <= byte_length_) &&

   816       (byte_length_ < (2 * kMaxScriptBytes))) {

   817     // Try to split the last two fragments in half

   818     put_soft_limit = byte_length_ / 2;

   819   }

   822   int spanscript;           // The script of this span

   823   int sc = UNKNOWN_ULSCRIPT;  // The script of next character

   824   int tlen = 0;

   825   int plen = 0;

   827   script_buffer_[0] = ' ';  // Always a space at front of output

   828   script_buffer_[1] = '\0';

   829   int take = 0;

   830   int put = 1;              // Start after the initial space

   832   // Build offsets from span->text back to start_byte_ + span->offset

   833   // This mapping reflects deletion of non-letters, expansion of

   834   // entities, etc.

   835   map2original_.Clear();

   836   map2original_.Delete(span->offset);   // So that MapBack(0) gives offset

   838   // Get to the first real non-tag letter or entity that is a letter

   839   int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);

   840   next_byte_ += skip;

   841   byte_length_ -= skip;

   843   if (skip != 1) {

   844     map2original_.Delete(skip);

   845     map2original_.Insert(1);

   846   } else {

   847     map2original_.Copy(1);

   848   }

   849   if (byte_length_ <= 0) {

   850     map2original_.Reset();

   851     return false;               // No more letters to be found

   852   }

   854   // There is at least one letter, so we know the script for this span

   855   span->ulscript = (ULScript)spanscript;

   858   // Go over alternating spans of same-script letters and non-letters,

   859   // copying letters to buffer with single spaces for each run of non-letters

   860   while (take < byte_length_) {

   861     // Copy run of letters in same script (&LS | LS)*

   862     int letter_count = 0;              // Keep track of word length

   863     bool need_break = false;

   865     while (take < byte_length_) {

   866       // We are at a letter, nonletter, tag, or entity

   867       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {

   868         if (next_byte_[take] == '<') {

   869           // Begining of tag

   870           sc = 0;

   871           break;

   872         } else if (next_byte_[take] == '>') {

   873           // Unexpected end of tag

   874           sc = 0;

   875           break;

   876         } else if (next_byte_[take] == '&') {

   877           // Copy entity, no advance

   878           EntityToBuffer(next_byte_ + take, byte_length_ - take,

   879                          script_buffer_ + put, &tlen, &plen);

   880           sc = GetUTF8LetterScriptNum(script_buffer_ + put);

   881         }

   882       } else {

   883         // Real letter, safely copy up to 4 bytes, increment by 1..4

   884         // Will update by 1..4 bytes at Advance, below

   885         tlen = plen = UTF8OneCharLen(next_byte_ + take);

   886         if (take < (byte_length_ - 3)) {

   887           // X86 fast case, does unaligned load/store

   888           UNALIGNED_STORE32(script_buffer_ + put,

   889                             UNALIGNED_LOAD32(next_byte_ + take));

   891         } else {

   892           // Slow case, happens 1-3 times per input document

   893           memcpy(script_buffer_ + put, next_byte_ + take, plen);

   894         }

   895         sc = GetUTF8LetterScriptNum(next_byte_ + take);

   896       }

   898       // Allow continue across a single letter in a different script:

   899       // A B D = three scripts, c = common script, i = inherited script,

   900       // - = don't care, ( = take position before the += below

   901       //  AAA(A-    continue

   902       //

   903       //  AAA(BA    continue

   904       //  AAA(BB    break

   905       //  AAA(Bc    continue (breaks after B)

   906       //  AAA(BD    break

   907       //  AAA(Bi    break

   908       //

   909       //  AAA(c-    break

   910       //

   911       //  AAA(i-    continue

   912       //

   914       if ((sc != spanscript) && (sc != ULScript_Inherited)) {

   915         // Might need to break this script span

   916         if (sc == ULScript_Common) {

   917           need_break = true;

   918         } else {

   919           // Look at next following character, ignoring entity as Common

   920           int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen);

   921           if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {

   922             // We found a non-trivial change of script

   923             if (one_script_only_) {

   924               need_break = true;

   925             }

   926           }

   927         }

   928       }

   929       if (need_break) {break;}  // Non-letter or letter in wrong script

   931       take += tlen;                   // Advance

   932       put += plen;                    // Advance

   934       // Update the offset map to reflect take/put lengths

   935       if (tlen == plen) {

   936         map2original_.Copy(tlen);

   937       } else if (tlen < plen) {

   938         map2original_.Copy(tlen);

   939         map2original_.Insert(plen - tlen);

   940       } else {    // plen < tlen

   941         map2original_.Copy(plen);

   942         map2original_.Delete(tlen - plen);

   943       }

   945       ++letter_count;

   946       if (put >= kMaxScriptBytes) {

   947         // Buffer is full

   948         span->truncated = true;

   949         break;

   950       }

   951     }     // End while letters

   953     // Do run of non-letters (tag | &NL | NL)*

   954     while (take < byte_length_) {

   955       // Do fast scan to next interesting byte

   956       tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);

   957       take += tlen;

   958       map2original_.Delete(tlen);

   959       if (take >= byte_length_) {break;}    // Might have scanned to end

   961       // We are at a letter, nonletter, tag, or entity

   962       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {

   963         if (next_byte_[take] == '<') {

   964           // Begining of tag; skip to end and go around again

   965           tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,

   966                                       exit_state_);

   967           sc = 0;

   968         } else if (next_byte_[take] == '>') {

   969           // Unexpected end of tag; skip it and go around again

   970           tlen = 1;         // Over the >

   971           sc = 0;

   972         } else if (next_byte_[take] == '&') {

   973           // Expand entity, no advance

   974           EntityToBuffer(next_byte_ + take, byte_length_ - take,

   975                          script_buffer_ + put, &tlen, &plen);

   976           sc = GetUTF8LetterScriptNum(script_buffer_ + put);

   977         }

   978       } else {

   979         // Update 1..4

   980         tlen = UTF8OneCharLen(next_byte_ + take);

   981         sc = GetUTF8LetterScriptNum(next_byte_ + take);

   982       }

   983       if (sc != 0) {break;}           // Letter found

   984       take += tlen;                   // Else advance

   985       map2original_.Delete(tlen);

   986     }     // End while not-letters

   988     script_buffer_[put++] = ' ';

   989     map2original_.Insert(1);

   991     // Letter in wrong script ?

   992     if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;}

   993     if (put >= put_soft_limit) {

   994       // Buffer is almost full

   995       span->truncated = true;

   996       break;

   997     }

   998   }

  1000   // Almost done. Back up to a character boundary if needed

  1001   while ((0 < take) && (take < byte_length_) &&

  1002          ((next_byte_[take] & 0xc0) == 0x80)) {

  1003     // Back up over continuation byte

  1004     --take;

  1005     --put;

  1006   }

  1008   // Update input position

  1009   next_byte_ += take;

  1010   byte_length_ -= take;

  1012   // Put four more spaces/NUL. Worst case is abcd _ _ _ \0

  1013   //                          kMaxScriptBytes |   | put

  1014   script_buffer_[put + 0] = ' ';

  1015   script_buffer_[put + 1] = ' ';

  1016   script_buffer_[put + 2] = ' ';

  1017   script_buffer_[put + 3] = '\0';

  1018   map2original_.Insert(4);

  1019   map2original_.Reset();

  1021   span->text_bytes = put;       // Does not include the last four chars above

  1022   return true;

  1023 }

  1025 // Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase

  1026 // List changes with each version of Unicode, so just always lowercase

  1027 // Unicode 6.2.0:

  1028 //   ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN

  1029 void ScriptScanner::LowerScriptSpan(LangSpan* span) {

  1030   // If needed, lowercase all the text. If we do it sooner, might miss

  1031   // lowercasing an entity such as &Aacute;

  1032   // We only need to do this for Latn and Cyrl scripts

  1033   map2uplow_.Clear();

  1034   // Full Unicode lowercase of the entire buffer, including

  1035   // four pad bytes off the end.

  1036   // Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad

  1037   // bytes and put the 0x00 in explicitly.

  1038   // Build an offset map from script_buffer_lower_ back to script_buffer_

  1039   int consumed, filled, changed;

  1040   StringPiece istr(span->text, span->text_bytes + 3);

  1041   StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer);

  1043   UTF8GenericReplace(&utf8repl_lettermarklower_obj,

  1044                             istr, ostr, is_plain_text_,

  1045                             &consumed, &filled, &changed, &map2uplow_);

  1046   script_buffer_lower_[filled] = '\0';

  1047   span->text = script_buffer_lower_;

  1048   span->text_bytes = filled - 3;

  1049   map2uplow_.Reset();

  1050 }

  1052 // Copy next run of same-script non-tag letters to buffer [NUL terminated]

  1053 // Force Latin, Cyrillic, Greek scripts to be lowercase

  1054 // Buffer ALWAYS has leading space and trailing space space space NUL

  1055 bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) {

  1056   bool ok = GetOneScriptSpan(span);

  1057   LowerScriptSpan(span);

  1058   return ok;

  1059 }

  1062 // Maps byte offset in most recent GetOneScriptSpan/Lower

  1063 // span->text [0..text_bytes] into an additional byte offset from

  1064 // span->offset, to get back to corresponding text in the original

  1065 // input buffer.

  1066 // text_offset must be the first byte

  1067 // of a UTF-8 character, or just beyond the last character. Normally this

  1068 // routine is called with the first byte of an interesting range and

  1069 // again with the first byte of the following range.

  1070 int ScriptScanner::MapBack(int text_offset) {

  1071   return map2original_.MapBack(map2uplow_.MapBack(text_offset));

  1072 }

  1075 // Gets lscript number for letters; always returns

  1076 //   0 (common script) for non-letters

  1077 int GetUTF8LetterScriptNum(const char* src) {

  1078   int srclen = UTF8OneCharLen(src);

  1079   const uint8* usrc = reinterpret_cast<const uint8*>(src);

  1080   return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj,

  1081                                     &usrc, &srclen);

  1082 }

  1084 }  // namespace CLD2

The Tor Browser / file revision

browser/components/translation/cld2/internal/getonescriptspan.cc@6474c204b198

browser/components/translation/cld2/internal/getonescriptspan.cc