The Tor Browser: parser/expat/lib/xmltok.c@6474c204b198

Cloned from the upstream tor-browser origin at tag tor-browser-31.3.0esr-4.5-1-build1
(revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f), for hacking purposes.

     1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd

     2    See the file COPYING for copying permission.

     3 */

     5 #include <stddef.h>

     7 #ifdef COMPILED_FROM_DSP

     8 #include "winconfig.h"

     9 #elif defined(MACOS_CLASSIC)

    10 #include "macconfig.h"

    11 #elif defined(__amigaos4__)

    12 #include "amigaconfig.h"

    13 #else

    14 #ifdef HAVE_EXPAT_CONFIG_H

    15 #include <expat_config.h>

    16 #endif

    17 #endif /* ndef COMPILED_FROM_DSP */

    19 #include "expat_external.h"

    20 #include "internal.h"

    21 #include "xmltok.h"

    22 #include "nametab.h"

    /* Initializer fragments naming the PREFIX()-qualified tokenizer
       functions.  With XML_DTD defined the first brace group of VTABLE1
       gains a fourth entry, PREFIX(ignoreSectionTok); otherwise
       IGNORE_SECTION_TOK_VTABLE expands to nothing. */
    24 #ifdef XML_DTD

    25 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)

    26 #else

    27 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */

    28 #endif

       /* VTABLE1 is expanded inside the `enc` initializer of the
          struct normal_encoding tables further down in this file, after
          PREFIX has been defined for the tokenizer being instantiated. */
    30 #define VTABLE1 \

    31   { PREFIX(prologTok), PREFIX(contentTok), \

    32     PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \

    33   { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \

    34   PREFIX(sameName), \

    35   PREFIX(nameMatchesAscii), \

    36   PREFIX(nameLength), \

    37   PREFIX(skipS), \

    38   PREFIX(getAtts), \

    39   PREFIX(charRefNumber), \

    40   PREFIX(predefinedEntityName), \

    41   PREFIX(updatePosition), \

    42   PREFIX(isPublicId)

       /* VTABLE appends the two PREFIX()-named converters to VTABLE1. */
    44 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)

    /* Look up one bit of the naming bitmap for the character whose high
       byte is hi and low byte is lo.  pages[hi] selects a page, the page
       number is scaled by 8 words, (lo >> 5) picks the word inside the
       page and (lo & 0x1F) the bit inside the word.  The result is
       non-zero when the bit is set.

       The mask is built from 1u rather than 1: bit position 31 is
       reachable, and shifting a signed 1 into the sign bit is undefined
       behaviour in C.
    */
    #define UCS2_GET_NAMING(pages, hi, lo) \
       (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

    /* A 2 byte UTF-8 representation splits the characters 11 bits between
       the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
       pages, 3 bits to add to that index and 5 bits to generate the mask.
       byte must point to unsigned char.
    */
    #define UTF8_GET_NAMING2(pages, byte) \
        (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                          + ((((byte)[0]) & 3) << 1) \
                          + ((((byte)[1]) >> 5) & 1)] \
             & (1u << (((byte)[1]) & 0x1F)))

    /* A 3 byte UTF-8 representation splits the characters 16 bits between
       the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
       into pages, 3 bits to add to that index and 5 bits to generate the
       mask.  byte must point to unsigned char.
    */
    #define UTF8_GET_NAMING3(pages, byte) \
      (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                                 + ((((byte)[1]) >> 2) & 0xF)] \
                           << 3) \
                          + ((((byte)[1]) & 3) << 1) \
                          + ((((byte)[2]) >> 5) & 1)] \
             & (1u << (((byte)[2]) & 0x1F)))

    /* Dispatch on the sequence length n: 2 and 3 byte sequences are looked
       up in the bitmap, any other length yields 0. */
    #define UTF8_GET_NAMING(pages, p, n) \
      ((n) == 2 \
      ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
      : ((n) == 3 \
         ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
         : 0))

    /* Detection of invalid UTF-8 sequences is based on Table 3.1B
       of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
       with the additional restriction of not allowing the Unicode
       code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
       Implementation details:
         (A & 0x80) == 0     means A < 0x80
       and
         (A & 0xC0) == 0xC0  means A > 0xBF

       Each macro takes a pointer to unsigned char addressing the lead
       byte of an N byte sequence and yields non-zero when the sequence
       is invalid.  The argument is parenthesized at every use, so an
       expression such as buf + 1 may be passed directly.
    */

    #define UTF8_INVALID2(p) \
      ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

    #define UTF8_INVALID3(p) \
      (((p)[2] & 0x80) == 0 \
      || \
      ((*(p)) == 0xEF && (p)[1] == 0xBF \
        ? \
        (p)[2] > 0xBD \
        : \
        ((p)[2] & 0xC0) == 0xC0) \
      || \
      ((*(p)) == 0xE0 \
        ? \
        (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
        : \
        ((p)[1] & 0x80) == 0 \
        || \
        ((*(p)) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))

    #define UTF8_INVALID4(p) \
      (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
      || \
      ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
      || \
      ((*(p)) == 0xF0 \
        ? \
        (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
        : \
        ((p)[1] & 0x80) == 0 \
        || \
        ((*(p)) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))

   122 static int PTRFASTCALL

   123 isNever(const ENCODING *enc, const char *p)

   124 {

   125   return 0;

   126 }

   128 static int PTRFASTCALL

   129 utf8_isName2(const ENCODING *enc, const char *p)

   130 {

   131   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);

   132 }

   134 static int PTRFASTCALL

   135 utf8_isName3(const ENCODING *enc, const char *p)

   136 {

   137   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);

   138 }

   140 #define utf8_isName4 isNever

   142 static int PTRFASTCALL

   143 utf8_isNmstrt2(const ENCODING *enc, const char *p)

   144 {

   145   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);

   146 }

   148 static int PTRFASTCALL

   149 utf8_isNmstrt3(const ENCODING *enc, const char *p)

   150 {

   151   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);

   152 }

   154 #define utf8_isNmstrt4 isNever

   156 static int PTRFASTCALL

   157 utf8_isInvalid2(const ENCODING *enc, const char *p)

   158 {

   159   return UTF8_INVALID2((const unsigned char *)p);

   160 }

   162 static int PTRFASTCALL

   163 utf8_isInvalid3(const ENCODING *enc, const char *p)

   164 {

   165   return UTF8_INVALID3((const unsigned char *)p);

   166 }

   168 static int PTRFASTCALL

   169 utf8_isInvalid4(const ENCODING *enc, const char *p)

   170 {

   171   return UTF8_INVALID4((const unsigned char *)p);

   172 }

   /* Encoding descriptor used by every table in this file: the generic
      ENCODING, a 256-entry table giving the type of each byte value, and
      per-encoding predicate hooks.  AS_NORMAL_ENCODING (below) casts an
      ENCODING pointer to this struct, so `enc` has to remain the first
      member.  The tables are initialized positionally, so member order
      matters. */
   174 struct normal_encoding {

   175   ENCODING enc;

   176   unsigned char type[256];

       /* Extra hooks that exist only in XML_MIN_SIZE builds; they are
          called through the BYTE_TYPE, BYTE_TO_ASCII, CHAR_MATCHES and
          IS_*_CHAR_MINBPC macros defined later in this file. */
   177 #ifdef XML_MIN_SIZE

   178   int (PTRFASTCALL *byteType)(const ENCODING *, const char *);

   179   int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);

   180   int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);

   181   int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);

   182   int (PTRCALL *charMatches)(const ENCODING *, const char *, int);

   183 #endif /* XML_MIN_SIZE */

       /* Hooks selected by sequence length: IS_NAME_CHAR, IS_NMSTRT_CHAR
          and IS_INVALID_CHAR paste their n argument (2, 3 or 4) onto the
          member name. */
   184   int (PTRFASTCALL *isName2)(const ENCODING *, const char *);

   185   int (PTRFASTCALL *isName3)(const ENCODING *, const char *);

   186   int (PTRFASTCALL *isName4)(const ENCODING *, const char *);

   187   int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);

   188   int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);

   189   int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);

   190   int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);

   191   int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);

   192   int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);

   193 };

       /* View an ENCODING pointer as the enclosing struct normal_encoding. */
   195 #define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))

   /* Initializer fragments for the hook members of struct normal_encoding.
      E is an identifier prefix (for example sb_ or utf8_) that is pasted
      onto each hook name.  STANDARD_VTABLE fills the XML_MIN_SIZE-only
      members and ends with a comma so another fragment may follow; in
      other builds it expands to nothing. */
   197 #ifdef XML_MIN_SIZE

   199 #define STANDARD_VTABLE(E) \

   200  E ## byteType, \

   201  E ## isNameMin, \

   202  E ## isNmstrtMin, \

   203  E ## byteToAscii, \

   204  E ## charMatches,

   206 #else

   208 #define STANDARD_VTABLE(E) /* as nothing */

   210 #endif

       /* NORMAL_VTABLE fills the nine isName / isNmstrt / isInvalid
          members, in declaration order. */
   212 #define NORMAL_VTABLE(E) \

   213  E ## isName2, \

   214  E ## isName3, \

   215  E ## isName4, \

   216  E ## isNmstrt2, \

   217  E ## isNmstrt3, \

   218  E ## isNmstrt4, \

   219  E ## isInvalid2, \

   220  E ## isInvalid3, \

   221  E ## isInvalid4

       /* Forward declaration; the definition is not in this part of the
          file. */
   223 static int FASTCALL checkCharRefNumber(int);

   225 #include "xmltok_impl.h"

   226 #include "ascii.h"

   /* Configuration of the tokenizer instantiated with the normal_ prefix.
      The macros defined from here to the #include of xmltok_impl.c
      parameterize that file; they are #undef'd again afterwards so the
      little2_ and big2_ instantiations can redefine them. */
   228 #ifdef XML_MIN_SIZE

   229 #define sb_isNameMin isNever

   230 #define sb_isNmstrtMin isNever

   231 #endif

   233 #ifdef XML_MIN_SIZE

   234 #define MINBPC(enc) ((enc)->minBytesPerChar)

   235 #else

   236 /* minimum bytes per character */

   237 #define MINBPC(enc) 1

   238 #endif

       /* Type of the byte at p, read from the encoding's 256-entry table. */
   240 #define SB_BYTE_TYPE(enc, p) \

   241   (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

       /* In XML_MIN_SIZE builds BYTE_TYPE goes through the byteType hook;
          otherwise it is the direct table lookup. */
   243 #ifdef XML_MIN_SIZE

   244 static int PTRFASTCALL

   245 sb_byteType(const ENCODING *enc, const char *p)

   246 {

   247   return SB_BYTE_TYPE(enc, p);

   248 }

   249 #define BYTE_TYPE(enc, p) \

   250  (AS_NORMAL_ENCODING(enc)->byteType(enc, p))

   251 #else

   252 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)

   253 #endif

       /* BYTE_TO_ASCII yields the byte at p as a plain char value. */
   255 #ifdef XML_MIN_SIZE

   256 #define BYTE_TO_ASCII(enc, p) \

   257  (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))

   258 static int PTRFASTCALL

   259 sb_byteToAscii(const ENCODING *enc, const char *p)

   260 {

   261   return *p;

   262 }

   263 #else

   264 #define BYTE_TO_ASCII(enc, p) (*(p))

   265 #endif

       /* n is pasted onto the hook name, selecting the isName2/3/4,
          isNmstrt2/3/4 or isInvalid2/3/4 member. */
   267 #define IS_NAME_CHAR(enc, p, n) \

   268  (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))

   269 #define IS_NMSTRT_CHAR(enc, p, n) \

   270  (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))

   271 #define IS_INVALID_CHAR(enc, p, n) \

   272  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))

   274 #ifdef XML_MIN_SIZE

   275 #define IS_NAME_CHAR_MINBPC(enc, p) \

   276  (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))

   277 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \

   278  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))

   279 #else

   280 #define IS_NAME_CHAR_MINBPC(enc, p) (0)

   281 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)

   282 #endif

   284 #ifdef XML_MIN_SIZE

   285 #define CHAR_MATCHES(enc, p, c) \

   286  (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))

   287 static int PTRCALL

   288 sb_charMatches(const ENCODING *enc, const char *p, int c)

   289 {

   290   return *p == c;

   291 }

   292 #else

   293 /* c is an ASCII character */

   294 #define CHAR_MATCHES(enc, p, c) (*(p) == c)

   295 #endif

       /* Instantiate the tokenizer: every PREFIX(ident) used by the
          included file becomes normal_ident. */
   297 #define PREFIX(ident) normal_ ## ident

   298 #include "xmltok_impl.c"

   300 #undef MINBPC

   301 #undef BYTE_TYPE

   302 #undef BYTE_TO_ASCII

   303 #undef CHAR_MATCHES

   304 #undef IS_NAME_CHAR

   305 #undef IS_NAME_CHAR_MINBPC

   306 #undef IS_NMSTRT_CHAR

   307 #undef IS_NMSTRT_CHAR_MINBPC

   308 #undef IS_INVALID_CHAR

   /* Lead-byte marker bits for UTF-8 output.  The converters in this file
      OR UTF8_cval2, UTF8_cval3 and UTF8_cval4 into the first byte of the
      2, 3 and 4 byte sequences they emit. */
   310 enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */

   311   UTF8_cval1 = 0x00,

   312   UTF8_cval2 = 0xc0,

   313   UTF8_cval3 = 0xe0,

   314   UTF8_cval4 = 0xf0

   315 };

   317 static void PTRCALL

   318 utf8_toUtf8(const ENCODING *enc,

   319             const char **fromP, const char *fromLim,

   320             char **toP, const char *toLim)

   321 {

   322   char *to;

   323   const char *from;

   324   if (fromLim - *fromP > toLim - *toP) {

   325     /* Avoid copying partial characters. */

   326     for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)

   327       if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)

   328         break;

   329   }

   330   for (to = *toP, from = *fromP; from != fromLim; from++, to++)

   331     *to = *from;

   332   *fromP = from;

   333   *toP = to;

   334 }

   336 static void PTRCALL

   337 utf8_toUtf16(const ENCODING *enc,

   338              const char **fromP, const char *fromLim,

   339              unsigned short **toP, const unsigned short *toLim)

   340 {

   341   unsigned short *to = *toP;

   342   const char *from = *fromP;

   343   while (from != fromLim && to != toLim) {

   344     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {

   345     case BT_LEAD2:

   346       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));

   347       from += 2;

   348       break;

   349     case BT_LEAD3:

   350       *to++ = (unsigned short)(((from[0] & 0xf) << 12)

   351                                | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));

   352       from += 3;

   353       break;

   354     case BT_LEAD4:

   355       {

   356         unsigned long n;

   357         if (to + 1 == toLim)

   358           goto after;

   359         n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)

   360             | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);

   361         n -= 0x10000;

   362         to[0] = (unsigned short)((n >> 10) | 0xD800);

   363         to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);

   364         to += 2;

   365         from += 4;

   366       }

   367       break;

   368     default:

   369       *to++ = *from++;

   370       break;

   371     }

   372   }

   373 after:

   374   *fromP = from;

   375   *toP = to;

   376 }

   /* UTF-8 encoding tables.  Each initializer has three parts: the
      ENCODING member (VTABLE1, the two converters and three integers for
      the remaining ENCODING members, which are declared outside this
      file), the 256-entry byte-type table assembled from two included
      header fragments, and the hook members.

      The *_ns variants exist only under XML_NS and include the ASCII
      table unchanged.  The other variants define BT_COLON as BT_NMSTRT
      around the ASCII table include, so any BT_COLON entry in it becomes
      BT_NMSTRT.  The internal_* variants include iasciitab.h instead of
      asciitab.h. */
   378 #ifdef XML_NS

   379 static const struct normal_encoding utf8_encoding_ns = {

   380   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },

   381   {

   382 #include "asciitab.h"

   383 #include "utf8tab.h"

   384   },

   385   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)

   386 };

   387 #endif

   389 static const struct normal_encoding utf8_encoding = {

   390   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },

   391   {

   392 #define BT_COLON BT_NMSTRT

   393 #include "asciitab.h"

   394 #undef BT_COLON

   395 #include "utf8tab.h"

   396   },

   397   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)

   398 };

   400 #ifdef XML_NS

   402 static const struct normal_encoding internal_utf8_encoding_ns = {

   403   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },

   404   {

   405 #include "iasciitab.h"

   406 #include "utf8tab.h"

   407   },

   408   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)

   409 };

   411 #endif

   413 static const struct normal_encoding internal_utf8_encoding = {

   414   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },

   415   {

   416 #define BT_COLON BT_NMSTRT

   417 #include "iasciitab.h"

   418 #undef BT_COLON

   419 #include "utf8tab.h"

   420   },

   421   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)

   422 };

   424 static void PTRCALL

   425 latin1_toUtf8(const ENCODING *enc,

   426               const char **fromP, const char *fromLim,

   427               char **toP, const char *toLim)

   428 {

   429   for (;;) {

   430     unsigned char c;

   431     if (*fromP == fromLim)

   432       break;

   433     c = (unsigned char)**fromP;

   434     if (c & 0x80) {

   435       if (toLim - *toP < 2)

   436         break;

   437       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);

   438       *(*toP)++ = (char)((c & 0x3f) | 0x80);

   439       (*fromP)++;

   440     }

   441     else {

   442       if (*toP == toLim)

   443         break;

   444       *(*toP)++ = *(*fromP)++;

   445     }

   446   }

   447 }

   449 static void PTRCALL

   450 latin1_toUtf16(const ENCODING *enc,

   451                const char **fromP, const char *fromLim,

   452                unsigned short **toP, const unsigned short *toLim)

   453 {

   454   while (*fromP != fromLim && *toP != toLim)

   455     *(*toP)++ = (unsigned char)*(*fromP)++;

   456 }

   /* Latin-1 encoding tables.  The byte-type table is asciitab.h followed
      by latin1tab.h; only STANDARD_VTABLE(sb_) is given for the hooks, so
      the isName / isNmstrt / isInvalid members are left to default
      (zero) initialization.  The _ns variant exists only under XML_NS; the
      other variant defines BT_COLON as BT_NMSTRT while including the ASCII
      table. */
   458 #ifdef XML_NS

   460 static const struct normal_encoding latin1_encoding_ns = {

   461   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },

   462   {

   463 #include "asciitab.h"

   464 #include "latin1tab.h"

   465   },

   466   STANDARD_VTABLE(sb_)

   467 };

   469 #endif

   471 static const struct normal_encoding latin1_encoding = {

   472   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },

   473   {

   474 #define BT_COLON BT_NMSTRT

   475 #include "asciitab.h"

   476 #undef BT_COLON

   477 #include "latin1tab.h"

   478   },

   479   STANDARD_VTABLE(sb_)

   480 };

   482 static void PTRCALL

   483 ascii_toUtf8(const ENCODING *enc,

   484              const char **fromP, const char *fromLim,

   485              char **toP, const char *toLim)

   486 {

   487   while (*fromP != fromLim && *toP != toLim)

   488     *(*toP)++ = *(*fromP)++;

   489 }

   /* US-ASCII encoding tables.  Only asciitab.h is included for the
      byte-type table, so the rest of the 256 entries are left to default
      initialization (zero; see the BT_NONXML == 0 remarks below).
      ascii_toUtf8 is paired with latin1_toUtf16 as the 16-bit converter.
      The _ns variant exists only under XML_NS; the other variant defines
      BT_COLON as BT_NMSTRT while including the ASCII table. */
   491 #ifdef XML_NS

   493 static const struct normal_encoding ascii_encoding_ns = {

   494   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },

   495   {

   496 #include "asciitab.h"

   497 /* BT_NONXML == 0 */

   498   },

   499   STANDARD_VTABLE(sb_)

   500 };

   502 #endif

   504 static const struct normal_encoding ascii_encoding = {

   505   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },

   506   {

   507 #define BT_COLON BT_NMSTRT

   508 #include "asciitab.h"

   509 #undef BT_COLON

   510 /* BT_NONXML == 0 */

   511   },

   512   STANDARD_VTABLE(sb_)

   513 };

   515 static int PTRFASTCALL

   516 unicode_byte_type(char hi, char lo)

   517 {

   518   switch ((unsigned char)hi) {

   519   case 0xD8: case 0xD9: case 0xDA: case 0xDB:

   520     return BT_LEAD4;

   521   case 0xDC: case 0xDD: case 0xDE: case 0xDF:

   522     return BT_TRAIL;

   523   case 0xFF:

   524     switch ((unsigned char)lo) {

   525     case 0xFF:

   526     case 0xFE:

   527       return BT_NONXML;

   528     }

   529     break;

   530   }

   531   return BT_NONASCII;

   532 }

   /* Generates the function E##toUtf8, which converts 2-byte code units
      from [*fromP, fromLim) to UTF-8 at [*toP, toLim).  GET_LO and GET_HI
      must be defined where the macro is expanded; they pick the low and
      high byte of a unit.

      The high byte selects the output length: 0 with a low byte below 0x80
      gives 1 byte, 0x00-0x07 otherwise give 2 bytes, 0xD8-0xDB start a
      surrogate pair and give 4 bytes, and every other value gives 3 bytes.
      When the output lacks room for the next character, *fromP is set to
      that character and the function returns; *toP is advanced as bytes
      are written.

      NOTE(review): the loop advances by 2 and tests from != fromLim, and
      the surrogate case reads the following unit without comparing it
      against fromLim.  This appears to rely on callers passing an
      even-length range that ends on a complete pair -- confirm. */
   534 #define DEFINE_UTF16_TO_UTF8(E) \

   535 static void  PTRCALL \

   536 E ## toUtf8(const ENCODING *enc, \

   537             const char **fromP, const char *fromLim, \

   538             char **toP, const char *toLim) \

   539 { \

   540   const char *from; \

   541   for (from = *fromP; from != fromLim; from += 2) { \

   542     int plane; \

   543     unsigned char lo2; \

   544     unsigned char lo = GET_LO(from); \

   545     unsigned char hi = GET_HI(from); \

   546     switch (hi) { \

   547     case 0: \

   548       if (lo < 0x80) { \

   549         if (*toP == toLim) { \

   550           *fromP = from; \

   551           return; \

   552         } \

   553         *(*toP)++ = lo; \

   554         break; \

   555       } \

   556       /* fall through */ \

   557     case 0x1: case 0x2: case 0x3: \

   558     case 0x4: case 0x5: case 0x6: case 0x7: \

   559       if (toLim -  *toP < 2) { \

   560         *fromP = from; \

   561         return; \

   562       } \

   563       *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \

   564       *(*toP)++ = ((lo & 0x3f) | 0x80); \

   565       break; \

   566     default: \

   567       if (toLim -  *toP < 3)  { \

   568         *fromP = from; \

   569         return; \

   570       } \

   571       /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \

   572       *(*toP)++ = ((hi >> 4) | UTF8_cval3); \

   573       *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \

   574       *(*toP)++ = ((lo & 0x3f) | 0x80); \

   575       break; \

   576     case 0xD8: case 0xD9: case 0xDA: case 0xDB: \

   577       if (toLim -  *toP < 4) { \

   578         *fromP = from; \

   579         return; \

   580       } \

   581       plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \

   582       *(*toP)++ = ((plane >> 2) | UTF8_cval4); \

   583       *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \

   584       from += 2; \

   585       lo2 = GET_LO(from); \

   586       *(*toP)++ = (((lo & 0x3) << 4) \

   587                    | ((GET_HI(from) & 0x3) << 2) \

   588                    | (lo2 >> 6) \

   589                    | 0x80); \

   590       *(*toP)++ = ((lo2 & 0x3f) | 0x80); \

   591       break; \

   592     } \

   593   } \

   594   *fromP = from; \

   595 }

   /* Generates the function E##toUtf16, which copies 2-byte code units
      from [*fromP, fromLim) into unsigned shorts at [*toP, toLim), building
      each value from GET_HI and GET_LO (defined at the expansion site).

      If the input holds more units than the output has room for and the
      last unit in the input range has a high byte in 0xD8-0xDF, that unit
      is dropped from the range first.  Copying then runs until either
      range is exhausted; *fromP and *toP are advanced in place. */
   597 #define DEFINE_UTF16_TO_UTF16(E) \

   598 static void  PTRCALL \

   599 E ## toUtf16(const ENCODING *enc, \

   600              const char **fromP, const char *fromLim, \

   601              unsigned short **toP, const unsigned short *toLim) \

   602 { \

   603   /* Avoid copying first half only of surrogate */ \

   604   if (fromLim - *fromP > ((toLim - *toP) << 1) \

   605       && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \

   606     fromLim -= 2; \

   607   for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \

   608     *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \

   609 }

   /* Instantiate the UTF-16 converters twice.  First with the low byte at
      offset 0 and the high byte at offset 1, producing little2_toUtf8 and
      little2_toUtf16. */
   611 #define SET2(ptr, ch) \

   612   (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))

   613 #define GET_LO(ptr) ((unsigned char)(ptr)[0])

   614 #define GET_HI(ptr) ((unsigned char)(ptr)[1])

   616 DEFINE_UTF16_TO_UTF8(little2_)

   617 DEFINE_UTF16_TO_UTF16(little2_)

   619 #undef SET2

   620 #undef GET_LO

   621 #undef GET_HI

       /* Then with the byte offsets swapped, producing big2_toUtf8 and
          big2_toUtf16. */
   623 #define SET2(ptr, ch) \

   624   (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))

   625 #define GET_LO(ptr) ((unsigned char)(ptr)[1])

   626 #define GET_HI(ptr) ((unsigned char)(ptr)[0])

   628 DEFINE_UTF16_TO_UTF8(big2_)

   629 DEFINE_UTF16_TO_UTF16(big2_)

   631 #undef SET2

   632 #undef GET_LO

   633 #undef GET_HI

   /* Character-access macros for 2-byte units stored low byte first:
      p[0] is the low byte and p[1] the high byte.  When the high byte is 0
      the unit is classified through the encoding's byte-type table;
      otherwise unicode_byte_type decides. */
   635 #define LITTLE2_BYTE_TYPE(enc, p) \

   636  ((p)[1] == 0 \

   637   ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \

   638   : unicode_byte_type((p)[1], (p)[0]))

       /* Yields the low byte when the high byte is 0, otherwise -1. */
   639 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)

   640 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)

   641 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \

   642   UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])

   643 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \

   644   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])

       /* XML_MIN_SIZE builds wrap the macros in functions that are
          installed as hooks (see STANDARD_VTABLE) and do not include
          xmltok_impl.c again; VTABLE is redefined to combine VTABLE1 with
          the little2_ converters. */
   646 #ifdef XML_MIN_SIZE

   648 static int PTRFASTCALL

   649 little2_byteType(const ENCODING *enc, const char *p)

   650 {

   651   return LITTLE2_BYTE_TYPE(enc, p);

   652 }

   654 static int PTRFASTCALL

   655 little2_byteToAscii(const ENCODING *enc, const char *p)

   656 {

   657   return LITTLE2_BYTE_TO_ASCII(enc, p);

   658 }

   660 static int PTRCALL

   661 little2_charMatches(const ENCODING *enc, const char *p, int c)

   662 {

   663   return LITTLE2_CHAR_MATCHES(enc, p, c);

   664 }

   666 static int PTRFASTCALL

   667 little2_isNameMin(const ENCODING *enc, const char *p)

   668 {

   669   return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);

   670 }

   672 static int PTRFASTCALL

   673 little2_isNmstrtMin(const ENCODING *enc, const char *p)

   674 {

   675   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);

   676 }

   678 #undef VTABLE

   679 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

       /* Other builds instantiate xmltok_impl.c a second time with the
          little2_ prefix, MINBPC fixed at 2, and the multi-byte name tests
          (IS_NAME_CHAR / IS_NMSTRT_CHAR) fixed at 0. */
   681 #else /* not XML_MIN_SIZE */

   683 #undef PREFIX

   684 #define PREFIX(ident) little2_ ## ident

   685 #define MINBPC(enc) 2

   686 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */

   687 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)

   688 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)

   689 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)

   690 #define IS_NAME_CHAR(enc, p, n) 0

   691 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)

   692 #define IS_NMSTRT_CHAR(enc, p, n) (0)

   693 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

   695 #include "xmltok_impl.c"

   697 #undef MINBPC

   698 #undef BYTE_TYPE

   699 #undef BYTE_TO_ASCII

   700 #undef CHAR_MATCHES

   701 #undef IS_NAME_CHAR

   702 #undef IS_NAME_CHAR_MINBPC

   703 #undef IS_NMSTRT_CHAR

   704 #undef IS_NMSTRT_CHAR_MINBPC

   705 #undef IS_INVALID_CHAR

   707 #endif /* not XML_MIN_SIZE */

   /* Encoding tables for 2-byte units stored low byte first.  The ENCODING
      initializer is VTABLE followed by three integers for the remaining
      ENCODING members (declared outside this file); the last of them is 1
      only when BYTEORDER == 1234.  The byte-type table is the ASCII table
      followed by latin1tab.h, and only the STANDARD_VTABLE hooks are
      given.

      The _ns variants exist only under XML_NS; the others define BT_COLON
      as BT_NMSTRT while including the ASCII table.  The internal_*
      variants use iasciitab.h, pass 1 as the last integer, and are
      compiled only when BYTEORDER != 4321. */
   709 #ifdef XML_NS

   711 static const struct normal_encoding little2_encoding_ns = {

   712   { VTABLE, 2, 0,

   713 #if BYTEORDER == 1234

   714     1

   715 #else

   716     0

   717 #endif

   718   },

   719   {

   720 #include "asciitab.h"

   721 #include "latin1tab.h"

   722   },

   723   STANDARD_VTABLE(little2_)

   724 };

   726 #endif

   728 static const struct normal_encoding little2_encoding = {

   729   { VTABLE, 2, 0,

   730 #if BYTEORDER == 1234

   731     1

   732 #else

   733     0

   734 #endif

   735   },

   736   {

   737 #define BT_COLON BT_NMSTRT

   738 #include "asciitab.h"

   739 #undef BT_COLON

   740 #include "latin1tab.h"

   741   },

   742   STANDARD_VTABLE(little2_)

   743 };

   745 #if BYTEORDER != 4321

   747 #ifdef XML_NS

   749 static const struct normal_encoding internal_little2_encoding_ns = {

   750   { VTABLE, 2, 0, 1 },

   751   {

   752 #include "iasciitab.h"

   753 #include "latin1tab.h"

   754   },

   755   STANDARD_VTABLE(little2_)

   756 };

   758 #endif

   760 static const struct normal_encoding internal_little2_encoding = {

   761   { VTABLE, 2, 0, 1 },

   762   {

   763 #define BT_COLON BT_NMSTRT

   764 #include "iasciitab.h"

   765 #undef BT_COLON

   766 #include "latin1tab.h"

   767   },

   768   STANDARD_VTABLE(little2_)

   769 };

   771 #endif

   /* Character-access macros for 2-byte units stored high byte first:
      p[0] is the high byte and p[1] the low byte.  When the high byte is 0
      the unit is classified through the encoding's byte-type table;
      otherwise unicode_byte_type decides. */
   774 #define BIG2_BYTE_TYPE(enc, p) \

   775  ((p)[0] == 0 \

   776   ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \

   777   : unicode_byte_type((p)[0], (p)[1]))

       /* Yields the low byte when the high byte is 0, otherwise -1. */
   778 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)

   779 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)

   780 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \

   781   UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])

   782 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \

   783   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])

       /* XML_MIN_SIZE builds wrap the macros in functions that are
          installed as hooks (see STANDARD_VTABLE) and do not include
          xmltok_impl.c again; VTABLE is redefined to combine VTABLE1 with
          the big2_ converters. */
   785 #ifdef XML_MIN_SIZE

   787 static int PTRFASTCALL

   788 big2_byteType(const ENCODING *enc, const char *p)

   789 {

   790   return BIG2_BYTE_TYPE(enc, p);

   791 }

   793 static int PTRFASTCALL

   794 big2_byteToAscii(const ENCODING *enc, const char *p)

   795 {

   796   return BIG2_BYTE_TO_ASCII(enc, p);

   797 }

   799 static int PTRCALL

   800 big2_charMatches(const ENCODING *enc, const char *p, int c)

   801 {

   802   return BIG2_CHAR_MATCHES(enc, p, c);

   803 }

   805 static int PTRFASTCALL

   806 big2_isNameMin(const ENCODING *enc, const char *p)

   807 {

   808   return BIG2_IS_NAME_CHAR_MINBPC(enc, p);

   809 }

   811 static int PTRFASTCALL

   812 big2_isNmstrtMin(const ENCODING *enc, const char *p)

   813 {

   814   return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);

   815 }

   817 #undef VTABLE

   818 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

       /* Other builds instantiate xmltok_impl.c again with the big2_
          prefix, MINBPC fixed at 2, and the multi-byte name tests
          (IS_NAME_CHAR / IS_NMSTRT_CHAR) fixed at 0. */
   820 #else /* not XML_MIN_SIZE */

   822 #undef PREFIX

   823 #define PREFIX(ident) big2_ ## ident

   824 #define MINBPC(enc) 2

   825 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */

   826 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)

   827 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)

   828 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)

   829 #define IS_NAME_CHAR(enc, p, n) 0

   830 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)

   831 #define IS_NMSTRT_CHAR(enc, p, n) (0)

   832 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)

   834 #include "xmltok_impl.c"

   836 #undef MINBPC

   837 #undef BYTE_TYPE

   838 #undef BYTE_TO_ASCII

   839 #undef CHAR_MATCHES

   840 #undef IS_NAME_CHAR

   841 #undef IS_NAME_CHAR_MINBPC

   842 #undef IS_NMSTRT_CHAR

   843 #undef IS_NMSTRT_CHAR_MINBPC

   844 #undef IS_INVALID_CHAR

   846 #endif /* not XML_MIN_SIZE */

   /* Encoding tables for 2-byte units stored high byte first.  The
      ENCODING initializer is VTABLE followed by three integers for the
      remaining ENCODING members (declared outside this file); the last of
      them is 1 only when BYTEORDER == 4321.  The byte-type table is the
      ASCII table followed by latin1tab.h, and only the STANDARD_VTABLE
      hooks are given.

      The _ns variants exist only under XML_NS; the others define BT_COLON
      as BT_NMSTRT while including the ASCII table.  The internal_*
      variants use iasciitab.h, pass 1 as the last integer, and are
      compiled only when BYTEORDER != 1234. */
   848 #ifdef XML_NS

   850 static const struct normal_encoding big2_encoding_ns = {

   851   { VTABLE, 2, 0,

   852 #if BYTEORDER == 4321

   853   1

   854 #else

   855   0

   856 #endif

   857   },

   858   {

   859 #include "asciitab.h"

   860 #include "latin1tab.h"

   861   },

   862   STANDARD_VTABLE(big2_)

   863 };

   865 #endif

   867 static const struct normal_encoding big2_encoding = {

   868   { VTABLE, 2, 0,

   869 #if BYTEORDER == 4321

   870   1

   871 #else

   872   0

   873 #endif

   874   },

   875   {

   876 #define BT_COLON BT_NMSTRT

   877 #include "asciitab.h"

   878 #undef BT_COLON

   879 #include "latin1tab.h"

   880   },

   881   STANDARD_VTABLE(big2_)

   882 };

   884 #if BYTEORDER != 1234

   886 #ifdef XML_NS

   888 static const struct normal_encoding internal_big2_encoding_ns = {

   889   { VTABLE, 2, 0, 1 },

   890   {

   891 #include "iasciitab.h"

   892 #include "latin1tab.h"

   893   },

   894   STANDARD_VTABLE(big2_)

   895 };

   897 #endif

   899 static const struct normal_encoding internal_big2_encoding = {

   900   { VTABLE, 2, 0, 1 },

   901   {

   902 #define BT_COLON BT_NMSTRT

   903 #include "iasciitab.h"

   904 #undef BT_COLON

   905 #include "latin1tab.h"

   906   },

   907   STANDARD_VTABLE(big2_)

   908 };

   910 #endif

       /* No further tokenizer instantiations follow in this part of the
          file. */
   912 #undef PREFIX

   914 static int FASTCALL

   915 streqci(const char *s1, const char *s2)

   916 {

   917   for (;;) {

   918     char c1 = *s1++;

   919     char c2 = *s2++;

   920     if (ASCII_a <= c1 && c1 <= ASCII_z)

   921       c1 += ASCII_A - ASCII_a;

   922     if (ASCII_a <= c2 && c2 <= ASCII_z)

   923       c2 += ASCII_A - ASCII_a;

   924     if (c1 != c2)

   925       return 0;

   926     if (!c1)

   927       break;

   928   }

   929   return 1;

   930 }

   932 static void PTRCALL

   933 initUpdatePosition(const ENCODING *enc, const char *ptr,

   934                    const char *end, POSITION *pos)

   935 {

   936   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);

   937 }

   939 static int

   940 toAscii(const ENCODING *enc, const char *ptr, const char *end)

   941 {

   942   char buf[1];

   943   char *p = buf;

   944   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);

   945   if (p == buf)

   946     return -1;

   947   else

   948     return buf[0];

   949 }

   951 static int FASTCALL

   952 isSpace(int c)

   953 {

   954   switch (c) {

   955   case 0x20:

   956   case 0xD:

   957   case 0xA:

   958   case 0x9:

   959     return 1;

   960   }

   961   return 0;

   962 }

   964 /* Return 1 if there's just optional white space or there's an S

   965    followed by name=val.

   966 */

   967 static int

   968 parsePseudoAttribute(const ENCODING *enc,

   969                      const char *ptr,

   970                      const char *end,

   971                      const char **namePtr,

   972                      const char **nameEndPtr,

   973                      const char **valPtr,

   974                      const char **nextTokPtr)

   975 {

   976   int c;

   977   char open;

   978   if (ptr == end) {

   979     *namePtr = NULL;

   980     return 1;

   981   }

   982   if (!isSpace(toAscii(enc, ptr, end))) {

   983     *nextTokPtr = ptr;

   984     return 0;

   985   }

   986   do {

   987     ptr += enc->minBytesPerChar;

   988   } while (isSpace(toAscii(enc, ptr, end)));

   989   if (ptr == end) {

   990     *namePtr = NULL;

   991     return 1;

   992   }

   993   *namePtr = ptr;

   994   for (;;) {

   995     c = toAscii(enc, ptr, end);

   996     if (c == -1) {

   997       *nextTokPtr = ptr;

   998       return 0;

   999     }

  1000     if (c == ASCII_EQUALS) {

  1001       *nameEndPtr = ptr;

  1002       break;

  1003     }

  1004     if (isSpace(c)) {

  1005       *nameEndPtr = ptr;

  1006       do {

  1007         ptr += enc->minBytesPerChar;

  1008       } while (isSpace(c = toAscii(enc, ptr, end)));

  1009       if (c != ASCII_EQUALS) {

  1010         *nextTokPtr = ptr;

  1011         return 0;

  1012       }

  1013       break;

  1014     }

  1015     ptr += enc->minBytesPerChar;

  1016   }

  1017   if (ptr == *namePtr) {

  1018     *nextTokPtr = ptr;

  1019     return 0;

  1020   }

  1021   ptr += enc->minBytesPerChar;

  1022   c = toAscii(enc, ptr, end);

  1023   while (isSpace(c)) {

  1024     ptr += enc->minBytesPerChar;

  1025     c = toAscii(enc, ptr, end);

  1026   }

  1027   if (c != ASCII_QUOT && c != ASCII_APOS) {

  1028     *nextTokPtr = ptr;

  1029     return 0;

  1030   }

  1031   open = (char)c;

  1032   ptr += enc->minBytesPerChar;

  1033   *valPtr = ptr;

  1034   for (;; ptr += enc->minBytesPerChar) {

  1035     c = toAscii(enc, ptr, end);

  1036     if (c == open)

  1037       break;

  1038     if (!(ASCII_a <= c && c <= ASCII_z)

  1039         && !(ASCII_A <= c && c <= ASCII_Z)

  1040         && !(ASCII_0 <= c && c <= ASCII_9)

  1041         && c != ASCII_PERIOD

  1042         && c != ASCII_MINUS

  1043         && c != ASCII_UNDERSCORE) {

  1044       *nextTokPtr = ptr;

  1045       return 0;

  1046     }

  1047   }

  1048   *nextTokPtr = ptr + enc->minBytesPerChar;

  1049   return 1;

  1050 }

  1052 static const char KW_version[] = {

  1053   ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'

  1054 };

  1056 static const char KW_encoding[] = {

  1057   ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'

  1058 };

  1060 static const char KW_standalone[] = {

  1061   ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,

  1062   ASCII_n, ASCII_e, '\0'

  1063 };

  1065 static const char KW_yes[] = {

  1066   ASCII_y, ASCII_e, ASCII_s,  '\0'

  1067 };

  1069 static const char KW_no[] = {

  1070   ASCII_n, ASCII_o,  '\0'

  1071 };

  1073 /* BEGIN MOZILLA CHANGE (http://bugzilla.mozilla.org/show_bug.cgi?id=62157) */

  1074 static const char KW_XML_1_0[] = {

  1075   ASCII_1, ASCII_PERIOD, ASCII_0, '\0'

  1076 };

  1077 /* END MOZILLA CHANGE */

  1079 static int

  1080 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,

  1081                                                  const char *,

  1082                                                  const char *),

  1083                int isGeneralTextEntity,

  1084                const ENCODING *enc,

  1085                const char *ptr,

  1086                const char *end,

  1087                const char **badPtr,

  1088                const char **versionPtr,

  1089                const char **versionEndPtr,

  1090                const char **encodingName,

  1091                const ENCODING **encoding,

  1092                int *standalone)

  1093 {

  1094   const char *val = NULL;

  1095   const char *name = NULL;

  1096   const char *nameEnd = NULL;

  1097   ptr += 5 * enc->minBytesPerChar;

  1098   end -= 2 * enc->minBytesPerChar;

  1099   if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)

  1100       || !name) {

  1101     *badPtr = ptr;

  1102     return 0;

  1103   }

  1104   if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {

  1105     if (!isGeneralTextEntity) {

  1106       *badPtr = name;

  1107       return 0;

  1108     }

  1109   }

  1110   else {

  1111     if (versionPtr)

  1112       *versionPtr = val;

  1113     if (versionEndPtr)

  1114       *versionEndPtr = ptr;

  1115 /* BEGIN MOZILLA CHANGE (http://bugzilla.mozilla.org/show_bug.cgi?id=62157) */

  1116      /* Anything else but a version="1.0" is invalid for us, until we support later versions. */

  1117      if (!XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_XML_1_0)) {

  1118        *badPtr = val;

  1119        return 0;

  1120      }

  1121 /* END MOZILLA CHANGE */

  1122     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {

  1123       *badPtr = ptr;

  1124       return 0;

  1125     }

  1126     if (!name) {

  1127       if (isGeneralTextEntity) {

  1128         /* a TextDecl must have an EncodingDecl */

  1129         *badPtr = ptr;

  1130         return 0;

  1131       }

  1132       return 1;

  1133     }

  1134   }

  1135   if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {

  1136     int c = toAscii(enc, val, end);

  1137     if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {

  1138       *badPtr = val;

  1139       return 0;

  1140     }

  1141     if (encodingName)

  1142       *encodingName = val;

  1143     if (encoding)

  1144       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);

  1145     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {

  1146       *badPtr = ptr;

  1147       return 0;

  1148     }

  1149     if (!name)

  1150       return 1;

  1151   }

  1152   if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)

  1153       || isGeneralTextEntity) {

  1154     *badPtr = name;

  1155     return 0;

  1156   }

  1157   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {

  1158     if (standalone)

  1159       *standalone = 1;

  1160   }

  1161   else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {

  1162     if (standalone)

  1163       *standalone = 0;

  1164   }

  1165   else {

  1166     *badPtr = val;

  1167     return 0;

  1168   }

  1169   while (isSpace(toAscii(enc, ptr, end)))

  1170     ptr += enc->minBytesPerChar;

  1171   if (ptr != end) {

  1172     *badPtr = ptr;

  1173     return 0;

  1174   }

  1175   return 1;

  1176 }

  1178 static int FASTCALL

  1179 checkCharRefNumber(int result)

  1180 {

  1181   switch (result >> 8) {

  1182   case 0xD8: case 0xD9: case 0xDA: case 0xDB:

  1183   case 0xDC: case 0xDD: case 0xDE: case 0xDF:

  1184     return -1;

  1185   case 0:

  1186     if (latin1_encoding.type[result] == BT_NONXML)

  1187       return -1;

  1188     break;

  1189   case 0xFF:

  1190     if (result == 0xFFFE || result == 0xFFFF)

  1191       return -1;

  1192     break;

  1193   }

  1194   return result;

  1195 }

  1197 int FASTCALL

  1198 XmlUtf8Encode(int c, char *buf)

  1199 {

  1200   enum {

  1201     /* minN is minimum legal resulting value for N byte sequence */

  1202     min2 = 0x80,

  1203     min3 = 0x800,

  1204     min4 = 0x10000

  1205   };

  1207   if (c < 0)

  1208     return 0;

  1209   if (c < min2) {

  1210     buf[0] = (char)(c | UTF8_cval1);

  1211     return 1;

  1212   }

  1213   if (c < min3) {

  1214     buf[0] = (char)((c >> 6) | UTF8_cval2);

  1215     buf[1] = (char)((c & 0x3f) | 0x80);

  1216     return 2;

  1217   }

  1218   if (c < min4) {

  1219     buf[0] = (char)((c >> 12) | UTF8_cval3);

  1220     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);

  1221     buf[2] = (char)((c & 0x3f) | 0x80);

  1222     return 3;

  1223   }

  1224   if (c < 0x110000) {

  1225     buf[0] = (char)((c >> 18) | UTF8_cval4);

  1226     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);

  1227     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);

  1228     buf[3] = (char)((c & 0x3f) | 0x80);

  1229     return 4;

  1230   }

  1231   return 0;

  1232 }

  1234 int FASTCALL

  1235 XmlUtf16Encode(int charNum, unsigned short *buf)

  1236 {

  1237   if (charNum < 0)

  1238     return 0;

  1239   if (charNum < 0x10000) {

  1240     buf[0] = (unsigned short)charNum;

  1241     return 1;

  1242   }

  1243   if (charNum < 0x110000) {

  1244     charNum -= 0x10000;

  1245     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);

  1246     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);

  1247     return 2;

  1248   }

  1249   return 0;

  1250 }

  1252 struct unknown_encoding {

  1253   struct normal_encoding normal;

  1254   CONVERTER convert;

  1255   void *userData;

  1256   unsigned short utf16[256];

  1257   char utf8[256][4];

  1258 };

  1260 #define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))

  1262 int

  1263 XmlSizeOfUnknownEncoding(void)

  1264 {

  1265   return sizeof(struct unknown_encoding);

  1266 }

  1268 static int PTRFASTCALL

  1269 unknown_isName(const ENCODING *enc, const char *p)

  1270 {

  1271   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);

  1272   int c = uenc->convert(uenc->userData, p);

  1273   if (c & ~0xFFFF)

  1274     return 0;

  1275   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);

  1276 }

  1278 static int PTRFASTCALL

  1279 unknown_isNmstrt(const ENCODING *enc, const char *p)

  1280 {

  1281   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);

  1282   int c = uenc->convert(uenc->userData, p);

  1283   if (c & ~0xFFFF)

  1284     return 0;

  1285   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);

  1286 }

  1288 static int PTRFASTCALL

  1289 unknown_isInvalid(const ENCODING *enc, const char *p)

  1290 {

  1291   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);

  1292   int c = uenc->convert(uenc->userData, p);

  1293   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;

  1294 }

  1296 static void PTRCALL

  1297 unknown_toUtf8(const ENCODING *enc,

  1298                const char **fromP, const char *fromLim,

  1299                char **toP, const char *toLim)

  1300 {

  1301   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);

  1302   char buf[XML_UTF8_ENCODE_MAX];

  1303   for (;;) {

  1304     const char *utf8;

  1305     int n;

  1306     if (*fromP == fromLim)

  1307       break;

  1308     utf8 = uenc->utf8[(unsigned char)**fromP];

  1309     n = *utf8++;

  1310     if (n == 0) {

  1311       int c = uenc->convert(uenc->userData, *fromP);

  1312       n = XmlUtf8Encode(c, buf);

  1313       if (n > toLim - *toP)

  1314         break;

  1315       utf8 = buf;

  1316       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]

  1317                  - (BT_LEAD2 - 2));

  1318     }

  1319     else {

  1320       if (n > toLim - *toP)

  1321         break;

  1322       (*fromP)++;

  1323     }

  1324     do {

  1325       *(*toP)++ = *utf8++;

  1326     } while (--n != 0);

  1327   }

  1328 }

  1330 static void PTRCALL

  1331 unknown_toUtf16(const ENCODING *enc,

  1332                 const char **fromP, const char *fromLim,

  1333                 unsigned short **toP, const unsigned short *toLim)

  1334 {

  1335   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);

  1336   while (*fromP != fromLim && *toP != toLim) {

  1337     unsigned short c = uenc->utf16[(unsigned char)**fromP];

  1338     if (c == 0) {

  1339       c = (unsigned short)

  1340           uenc->convert(uenc->userData, *fromP);

  1341       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]

  1342                  - (BT_LEAD2 - 2));

  1343     }

  1344     else

  1345       (*fromP)++;

  1346     *(*toP)++ = c;

  1347   }

  1348 }

  1350 ENCODING *

  1351 XmlInitUnknownEncoding(void *mem,

  1352                        int *table,

  1353                        CONVERTER convert,

  1354                        void *userData)

  1355 {

  1356   int i;

  1357   struct unknown_encoding *e = (struct unknown_encoding *)mem;

  1358   for (i = 0; i < (int)sizeof(struct normal_encoding); i++)

  1359     ((char *)mem)[i] = ((char *)&latin1_encoding)[i];

  1360   for (i = 0; i < 128; i++)

  1361     if (latin1_encoding.type[i] != BT_OTHER

  1362         && latin1_encoding.type[i] != BT_NONXML

  1363         && table[i] != i)

  1364       return 0;

  1365   for (i = 0; i < 256; i++) {

  1366     int c = table[i];

  1367     if (c == -1) {

  1368       e->normal.type[i] = BT_MALFORM;

  1369       /* This shouldn't really get used. */

  1370       e->utf16[i] = 0xFFFF;

  1371       e->utf8[i][0] = 1;

  1372       e->utf8[i][1] = 0;

  1373     }

  1374     else if (c < 0) {

  1375       if (c < -4)

  1376         return 0;

  1377       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));

  1378       e->utf8[i][0] = 0;

  1379       e->utf16[i] = 0;

  1380     }

  1381     else if (c < 0x80) {

  1382       if (latin1_encoding.type[c] != BT_OTHER

  1383           && latin1_encoding.type[c] != BT_NONXML

  1384           && c != i)

  1385         return 0;

  1386       e->normal.type[i] = latin1_encoding.type[c];

  1387       e->utf8[i][0] = 1;

  1388       e->utf8[i][1] = (char)c;

  1389       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);

  1390     }

  1391     else if (checkCharRefNumber(c) < 0) {

  1392       e->normal.type[i] = BT_NONXML;

  1393       /* This shouldn't really get used. */

  1394       e->utf16[i] = 0xFFFF;

  1395       e->utf8[i][0] = 1;

  1396       e->utf8[i][1] = 0;

  1397     }

  1398     else {

  1399       if (c > 0xFFFF)

  1400         return 0;

  1401       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))

  1402         e->normal.type[i] = BT_NMSTRT;

  1403       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))

  1404         e->normal.type[i] = BT_NAME;

  1405       else

  1406         e->normal.type[i] = BT_OTHER;

  1407       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);

  1408       e->utf16[i] = (unsigned short)c;

  1409     }

  1410   }

  1411   e->userData = userData;

  1412   e->convert = convert;

  1413   if (convert) {

  1414     e->normal.isName2 = unknown_isName;

  1415     e->normal.isName3 = unknown_isName;

  1416     e->normal.isName4 = unknown_isName;

  1417     e->normal.isNmstrt2 = unknown_isNmstrt;

  1418     e->normal.isNmstrt3 = unknown_isNmstrt;

  1419     e->normal.isNmstrt4 = unknown_isNmstrt;

  1420     e->normal.isInvalid2 = unknown_isInvalid;

  1421     e->normal.isInvalid3 = unknown_isInvalid;

  1422     e->normal.isInvalid4 = unknown_isInvalid;

  1423   }

  1424   e->normal.enc.utf8Convert = unknown_toUtf8;

  1425   e->normal.enc.utf16Convert = unknown_toUtf16;

  1426   return &(e->normal.enc);

  1427 }

  /* Indices of the built-in encodings.  If this enumeration is changed,
     getEncodingIndex and encodings must also be changed. */
  enum {
    UNKNOWN_ENC = -1,     /* name not recognised */
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC = 1,
    UTF_8_ENC = 2,
    UTF_16_ENC = 3,
    UTF_16BE_ENC = 4,
    UTF_16LE_ENC = 5,
    /* must match encodingNames up to here */
    NO_ENC = 6            /* no name given */
  };

  1443 static const char KW_ISO_8859_1[] = {

  1444   ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,

  1445   ASCII_MINUS, ASCII_1, '\0'

  1446 };

  1447 static const char KW_US_ASCII[] = {

  1448   ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,

  1449   '\0'

  1450 };

  1451 static const char KW_UTF_8[] =  {

  1452   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'

  1453 };

  1454 static const char KW_UTF_16[] = {

  1455   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'

  1456 };

  1457 static const char KW_UTF_16BE[] = {

  1458   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,

  1459   '\0'

  1460 };

  1461 static const char KW_UTF_16LE[] = {

  1462   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,

  1463   '\0'

  1464 };

  1466 static int FASTCALL

  1467 getEncodingIndex(const char *name)

  1468 {

  1469   static const char * const encodingNames[] = {

  1470     KW_ISO_8859_1,

  1471     KW_US_ASCII,

  1472     KW_UTF_8,

  1473     KW_UTF_16,

  1474     KW_UTF_16BE,

  1475     KW_UTF_16LE,

  1476   };

  1477   int i;

  1478   if (name == NULL)

  1479     return NO_ENC;

  1480   for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)

  1481     if (streqci(name, encodingNames[i]))

  1482       return i;

  1483   return UNKNOWN_ENC;

  1484 }

  /* For binary compatibility, we store the index of the encoding
     specified at initialization in the isUtf16 member.

     INIT_ENC_INDEX(enc) reads that index back as an int.
     SET_INIT_ENC_INDEX(enc, i) stores i, narrowed to char.  The argument is
     parenthesized so that the cast applies to the whole expression passed
     as i rather than only to its first operand.
  */

  #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
  #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))

  1493 /* This is what detects the encoding.  encodingTable maps from

  1494    encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of

  1495    the external (protocol) specified encoding; state is

  1496    XML_CONTENT_STATE if we're parsing an external text entity, and

  1497    XML_PROLOG_STATE otherwise.

  1498 */

  1501 static int

  1502 initScan(const ENCODING * const *encodingTable,

  1503          const INIT_ENCODING *enc,

  1504          int state,

  1505          const char *ptr,

  1506          const char *end,

  1507          const char **nextTokPtr)

  1508 {

  1509   const ENCODING **encPtr;

  1511   if (ptr == end)

  1512     return XML_TOK_NONE;

  1513   encPtr = enc->encPtr;

  1514   if (ptr + 1 == end) {

  1515     /* only a single byte available for auto-detection */

  1516 #ifndef XML_DTD /* FIXME */

  1517     /* a well-formed document entity must have more than one byte */

  1518     if (state != XML_CONTENT_STATE)

  1519       return XML_TOK_PARTIAL;

  1520 #endif

  1521     /* so we're parsing an external text entity... */

  1522     /* if UTF-16 was externally specified, then we need at least 2 bytes */

  1523     switch (INIT_ENC_INDEX(enc)) {

  1524     case UTF_16_ENC:

  1525     case UTF_16LE_ENC:

  1526     case UTF_16BE_ENC:

  1527       return XML_TOK_PARTIAL;

  1528     }

  1529     switch ((unsigned char)*ptr) {

  1530     case 0xFE:

  1531     case 0xFF:

  1532     case 0xEF: /* possibly first byte of UTF-8 BOM */

  1533       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC

  1534           && state == XML_CONTENT_STATE)

  1535         break;

  1536       /* fall through */

  1537     case 0x00:

  1538     case 0x3C:

  1539       return XML_TOK_PARTIAL;

  1540     }

  1541   }

  1542   else {

  1543     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {

  1544     case 0xFEFF:

  1545       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC

  1546           && state == XML_CONTENT_STATE)

  1547         break;

  1548       *nextTokPtr = ptr + 2;

  1549       *encPtr = encodingTable[UTF_16BE_ENC];

  1550       return XML_TOK_BOM;

  1551     /* 00 3C is handled in the default case */

  1552     case 0x3C00:

  1553       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC

  1554            || INIT_ENC_INDEX(enc) == UTF_16_ENC)

  1555           && state == XML_CONTENT_STATE)

  1556         break;

  1557       *encPtr = encodingTable[UTF_16LE_ENC];

  1558       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);

  1559     case 0xFFFE:

  1560       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC

  1561           && state == XML_CONTENT_STATE)

  1562         break;

  1563       *nextTokPtr = ptr + 2;

  1564       *encPtr = encodingTable[UTF_16LE_ENC];

  1565       return XML_TOK_BOM;

  1566     case 0xEFBB:

  1567       /* Maybe a UTF-8 BOM (EF BB BF) */

  1568       /* If there's an explicitly specified (external) encoding

  1569          of ISO-8859-1 or some flavour of UTF-16

  1570          and this is an external text entity,

  1571          don't look for the BOM,

  1572          because it might be a legal data.

  1573       */

  1574       if (state == XML_CONTENT_STATE) {

  1575         int e = INIT_ENC_INDEX(enc);

  1576         if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC

  1577             || e == UTF_16LE_ENC || e == UTF_16_ENC)

  1578           break;

  1579       }

  1580       if (ptr + 2 == end)

  1581         return XML_TOK_PARTIAL;

  1582       if ((unsigned char)ptr[2] == 0xBF) {

  1583         *nextTokPtr = ptr + 3;

  1584         *encPtr = encodingTable[UTF_8_ENC];

  1585         return XML_TOK_BOM;

  1586       }

  1587       break;

  1588     default:

  1589       if (ptr[0] == '\0') {

  1590         /* 0 isn't a legal data character. Furthermore a document

  1591            entity can only start with ASCII characters.  So the only

  1592            way this can fail to be big-endian UTF-16 if it it's an

  1593            external parsed general entity that's labelled as

  1594            UTF-16LE.

  1595         */

  1596         if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)

  1597           break;

  1598         *encPtr = encodingTable[UTF_16BE_ENC];

  1599         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);

  1600       }

  1601       else if (ptr[1] == '\0') {

  1602         /* We could recover here in the case:

  1603             - parsing an external entity

  1604             - second byte is 0

  1605             - no externally specified encoding

  1606             - no encoding declaration

  1607            by assuming UTF-16LE.  But we don't, because this would mean when

  1608            presented just with a single byte, we couldn't reliably determine

  1609            whether we needed further bytes.

  1610         */

  1611         if (state == XML_CONTENT_STATE)

  1612           break;

  1613         *encPtr = encodingTable[UTF_16LE_ENC];

  1614         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);

  1615       }

  1616       break;

  1617     }

  1618   }

  1619   *encPtr = encodingTable[INIT_ENC_INDEX(enc)];

  1620   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);

  1621 }

  1624 #define NS(x) x

  1625 #define ns(x) x

  1626 #include "xmltok_ns.c"

  1627 #undef NS

  1628 #undef ns

  1630 #ifdef XML_NS

  1632 #define NS(x) x ## NS

  1633 #define ns(x) x ## _ns

  1635 #include "xmltok_ns.c"

  1637 #undef NS

  1638 #undef ns

  1640 ENCODING *

  1641 XmlInitUnknownEncodingNS(void *mem,

  1642                          int *table,

  1643                          CONVERTER convert,

  1644                          void *userData)

  1645 {

  1646   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);

  1647   if (enc)

  1648     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;

  1649   return enc;

  1650 }

  1652 #endif /* XML_NS */

  1654 /* BEGIN MOZILLA CHANGE (Mozilla extensions for QName checking) */

  1655 #ifdef MOZILLA_CLIENT

  1656 #include "moz_extensions.c"

  1657 #endif /* MOZILLA_CLIENT */

  1658 /* END MOZILLA CHANGE */

The Tor Browser / file revision

parser/expat/lib/xmltok.c@6474c204b198

parser/expat/lib/xmltok.c