parser/expat/lib/xmltok.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned from upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
     2    See the file COPYING for copying permission.
     3 */
     5 #include <stddef.h>
     7 #ifdef COMPILED_FROM_DSP
     8 #include "winconfig.h"
     9 #elif defined(MACOS_CLASSIC)
    10 #include "macconfig.h"
    11 #elif defined(__amigaos4__)
    12 #include "amigaconfig.h"
    13 #else
    14 #ifdef HAVE_EXPAT_CONFIG_H
    15 #include <expat_config.h>
    16 #endif
    17 #endif /* ndef COMPILED_FROM_DSP */
    19 #include "expat_external.h"
    20 #include "internal.h"
    21 #include "xmltok.h"
    22 #include "nametab.h"
    /* The scanner list gains an extra entry, the ignore-section tokenizer,
       only when XML_DTD is defined; otherwise the macro expands to nothing. */
    24 #ifdef XML_DTD
    25 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
    26 #else
    27 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
    28 #endif
    /* VTABLE1 expands to the leading initializers of the ENCODING member in
       the struct normal_encoding tables defined later in this file.  PREFIX
       is (re)defined before each use, so the same list names the normal_,
       little2_ or big2_ family of functions depending on where it expands. */
    30 #define VTABLE1 \
    31   { PREFIX(prologTok), PREFIX(contentTok), \
    32     PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
    33   { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
    34   PREFIX(sameName), \
    35   PREFIX(nameMatchesAscii), \
    36   PREFIX(nameLength), \
    37   PREFIX(skipS), \
    38   PREFIX(getAtts), \
    39   PREFIX(charRefNumber), \
    40   PREFIX(predefinedEntityName), \
    41   PREFIX(updatePosition), \
    42   PREFIX(isPublicId)
    /* VTABLE is VTABLE1 followed by the two converter entries. */
    44 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
    /* Test the naming bit for the 16-bit character whose high byte is hi and
       whose low byte is lo.  pages[hi] selects a group of eight bitmap words,
       the top 3 bits of lo select the word within the group and the bottom
       5 bits select the bit.  The mask is built from an unsigned 1 so that
       selecting bit 31 is well defined; the result is zero or non-zero.
    */
    #define UCS2_GET_NAMING(pages, hi, lo) \
       (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

    /* A 2 byte UTF-8 representation splits the character's 11 bits between
       the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
       pages, 3 bits to add to that index and 5 bits to generate the mask.
       byte must point to unsigned char data.
    */
    #define UTF8_GET_NAMING2(pages, byte) \
        (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                          + ((((byte)[0]) & 3) << 1) \
                          + ((((byte)[1]) >> 5) & 1)] \
             & (1u << (((byte)[1]) & 0x1F)))

    /* A 3 byte UTF-8 representation splits the character's 16 bits between
       the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
       into pages, 3 bits to add to that index and 5 bits to generate the
       mask.  byte must point to unsigned char data.
    */
    #define UTF8_GET_NAMING3(pages, byte) \
      (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                                 + ((((byte)[1]) >> 2) & 0xF)] \
                           << 3) \
                          + ((((byte)[1]) & 3) << 1) \
                          + ((((byte)[2]) >> 5) & 1)] \
             & (1u << (((byte)[2]) & 0x1F)))

    /* Dispatch on the sequence length n: 2 and 3 byte sequences are looked
       up in the bitmap, any other length yields 0.  p may be a plain char
       pointer; it is converted to unsigned char here.
    */
    #define UTF8_GET_NAMING(pages, p, n) \
      ((n) == 2 \
      ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
      : ((n) == 3 \
         ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
         : 0))
    79 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
    80    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
    81    with the additional restriction of not allowing the Unicode
    82    code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
    83    Implementation details:
    84      (A & 0x80) == 0     means A < 0x80
    85    and
    86      (A & 0xC0) == 0xC0  means A > 0xBF
    87 */
    /* Each macro yields non-zero when the sequence starting at p is not
       well-formed UTF-8 of that length.  p must point to unsigned char data
       and is evaluated several times, so it must have no side effects.  The
       argument is fully parenthesized so that expressions such as buf + 1
       are dereferenced as a whole.
    */

    /* 2 bytes: lead below 0xC2 (overlong) or second byte not 0x80..0xBF. */
    #define UTF8_INVALID2(p) \
      ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

    /* 3 bytes: third byte must be a continuation byte, and for EF BF xx it
       must not exceed 0xBD (rejects U+FFFE and U+FFFF); the second byte must
       be 0xA0..0xBF after E0 (no overlongs), 0x80..0x9F after ED (no
       surrogates) and 0x80..0xBF otherwise. */
    #define UTF8_INVALID3(p) \
      (((p)[2] & 0x80) == 0 \
      || \
      ((*(p)) == 0xEF && (p)[1] == 0xBF \
        ? \
        (p)[2] > 0xBD \
        : \
        ((p)[2] & 0xC0) == 0xC0) \
      || \
      ((*(p)) == 0xE0 \
        ? \
        (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
        : \
        ((p)[1] & 0x80) == 0 \
        || \
        ((*(p)) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))

    /* 4 bytes: third and fourth bytes must be continuation bytes; the second
       byte must be 0x90..0xBF after F0 (no overlongs), 0x80..0x8F after F4
       (nothing above U+10FFFF) and 0x80..0xBF otherwise. */
    #define UTF8_INVALID4(p) \
      (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
      || \
      ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
      || \
      ((*(p)) == 0xF0 \
        ? \
        (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
        : \
        ((p)[1] & 0x80) == 0 \
        || \
        ((*(p)) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
   /* Predicate that always answers "no"; used for table slots that can
      never match (see the utf8_isName4 / utf8_isNmstrt4 aliases below).
      Both parameters are ignored. */
   122 static int PTRFASTCALL
   123 isNever(const ENCODING *enc, const char *p)
   124 {
   125   return 0;
   126 }
   /* Non-zero if the 2-byte UTF-8 sequence at p is flagged in namePages. */
   128 static int PTRFASTCALL
   129 utf8_isName2(const ENCODING *enc, const char *p)
   130 {
   131   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
   132 }
   /* Non-zero if the 3-byte UTF-8 sequence at p is flagged in namePages. */
   134 static int PTRFASTCALL
   135 utf8_isName3(const ENCODING *enc, const char *p)
   136 {
   137   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
   138 }
   /* 4-byte sequences are never accepted by this predicate. */
   140 #define utf8_isName4 isNever
   /* Non-zero if the 2-byte UTF-8 sequence at p is flagged in nmstrtPages. */
   142 static int PTRFASTCALL
   143 utf8_isNmstrt2(const ENCODING *enc, const char *p)
   144 {
   145   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
   146 }
   /* Non-zero if the 3-byte UTF-8 sequence at p is flagged in nmstrtPages. */
   148 static int PTRFASTCALL
   149 utf8_isNmstrt3(const ENCODING *enc, const char *p)
   150 {
   151   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
   152 }
   /* 4-byte sequences are never accepted by this predicate. */
   154 #define utf8_isNmstrt4 isNever
   /* The three functions below wrap the UTF8_INVALIDn macros so they can be
      stored as function pointers; each returns non-zero for a malformed
      sequence of the given length starting at p.  enc is unused. */
   156 static int PTRFASTCALL
   157 utf8_isInvalid2(const ENCODING *enc, const char *p)
   158 {
   159   return UTF8_INVALID2((const unsigned char *)p);
   160 }
   162 static int PTRFASTCALL
   163 utf8_isInvalid3(const ENCODING *enc, const char *p)
   164 {
   165   return UTF8_INVALID3((const unsigned char *)p);
   166 }
   168 static int PTRFASTCALL
   169 utf8_isInvalid4(const ENCODING *enc, const char *p)
   170 {
   171   return UTF8_INVALID4((const unsigned char *)p);
   172 }
   /* An ENCODING extended with a per-byte classification table and the
      multi-byte predicates.  enc is the first member, which is what lets
      this file cast an ENCODING pointer to a struct normal_encoding pointer
      (see AS_NORMAL_ENCODING). */
   174 struct normal_encoding {
   175   ENCODING enc;
   /* Byte-type code for every possible byte value; indexed with the byte
      converted to unsigned char. */
   176   unsigned char type[256];
   /* With XML_MIN_SIZE the per-byte helpers are reached through these
      pointers instead of being expanded as macros. */
   177 #ifdef XML_MIN_SIZE
   178   int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
   179   int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
   180   int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
   181   int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
   182   int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
   183 #endif /* XML_MIN_SIZE */
   /* Predicates for 2, 3 and 4 byte sequences: name character, name start
      character and malformed-sequence checks.  They are invoked through
      IS_NAME_CHAR, IS_NMSTRT_CHAR and IS_INVALID_CHAR. */
   184   int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
   185   int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
   186   int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
   187   int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
   188   int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
   189   int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
   190   int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
   191   int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
   192   int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
   193 };
   /* View an ENCODING pointer as the struct normal_encoding that contains it. */
   195 #define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))
   /* STANDARD_VTABLE(E) initializes the XML_MIN_SIZE-only function pointers
      of struct normal_encoding with the E-prefixed helpers, in member
      order, and ends with a comma so that further initializers can follow.
      Without XML_MIN_SIZE those members do not exist and it expands to
      nothing. */
   197 #ifdef XML_MIN_SIZE
   199 #define STANDARD_VTABLE(E) \
   200  E ## byteType, \
   201  E ## isNameMin, \
   202  E ## isNmstrtMin, \
   203  E ## byteToAscii, \
   204  E ## charMatches,
   206 #else
   208 #define STANDARD_VTABLE(E) /* as nothing */
   210 #endif
   /* NORMAL_VTABLE(E) initializes the nine multi-byte predicate pointers
      with the E-prefixed functions, in member order. */
   212 #define NORMAL_VTABLE(E) \
   213  E ## isName2, \
   214  E ## isName3, \
   215  E ## isName4, \
   216  E ## isNmstrt2, \
   217  E ## isNmstrt3, \
   218  E ## isNmstrt4, \
   219  E ## isInvalid2, \
   220  E ## isInvalid3, \
   221  E ## isInvalid4
   223 static int FASTCALL checkCharRefNumber(int);
   225 #include "xmltok_impl.h"
   226 #include "ascii.h"
   /* Configuration for the first inclusion of xmltok_impl.c, which produces
      the normal_ family of functions (PREFIX is set just before the
      include).  With XML_MIN_SIZE the per-character operations go through
      function pointers stored in struct normal_encoding; otherwise they are
      expanded inline for one-byte units. */

   /* Single-byte encodings have no one-unit name characters beyond what the
      type table already classifies, so these slots never match. */
   228 #ifdef XML_MIN_SIZE
   229 #define sb_isNameMin isNever
   230 #define sb_isNmstrtMin isNever
   231 #endif
   233 #ifdef XML_MIN_SIZE
   234 #define MINBPC(enc) ((enc)->minBytesPerChar)
   235 #else
   236 /* minimum bytes per character */
   237 #define MINBPC(enc) 1
   238 #endif
   /* Classify the byte at p through the encoding's 256-entry type table. */
   240 #define SB_BYTE_TYPE(enc, p) \
   241   (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
   243 #ifdef XML_MIN_SIZE
   244 static int PTRFASTCALL
   245 sb_byteType(const ENCODING *enc, const char *p)
   246 {
   247   return SB_BYTE_TYPE(enc, p);
   248 }
   249 #define BYTE_TYPE(enc, p) \
   250  (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
   251 #else
   252 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
   253 #endif
   /* For single-byte units the ASCII value of a character is the byte itself. */
   255 #ifdef XML_MIN_SIZE
   256 #define BYTE_TO_ASCII(enc, p) \
   257  (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
   258 static int PTRFASTCALL
   259 sb_byteToAscii(const ENCODING *enc, const char *p)
   260 {
   261   return *p;
   262 }
   263 #else
   264 #define BYTE_TO_ASCII(enc, p) (*(p))
   265 #endif
   /* Multi-byte predicates always dispatch through the encoding's function
      pointers; n is pasted onto the member name and so must be a literal
      2, 3 or 4. */
   267 #define IS_NAME_CHAR(enc, p, n) \
   268  (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
   269 #define IS_NMSTRT_CHAR(enc, p, n) \
   270  (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
   271 #define IS_INVALID_CHAR(enc, p, n) \
   272  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
   274 #ifdef XML_MIN_SIZE
   275 #define IS_NAME_CHAR_MINBPC(enc, p) \
   276  (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
   277 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
   278  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
   279 #else
   280 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
   281 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
   282 #endif
   284 #ifdef XML_MIN_SIZE
   285 #define CHAR_MATCHES(enc, p, c) \
   286  (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
   287 static int PTRCALL
   288 sb_charMatches(const ENCODING *enc, const char *p, int c)
   289 {
   290   return *p == c;
   291 }
   292 #else
   293 /* c is an ASCII character */
   294 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
   295 #endif
   /* Instantiate the tokenizer implementation under the normal_ prefix. */
   297 #define PREFIX(ident) normal_ ## ident
   298 #include "xmltok_impl.c"
   /* Drop the per-instantiation macros so a later section can redefine
      them.  PREFIX itself stays defined as normal_. */
   300 #undef MINBPC
   301 #undef BYTE_TYPE
   302 #undef BYTE_TO_ASCII
   303 #undef CHAR_MATCHES
   304 #undef IS_NAME_CHAR
   305 #undef IS_NAME_CHAR_MINBPC
   306 #undef IS_NMSTRT_CHAR
   307 #undef IS_NMSTRT_CHAR_MINBPC
   308 #undef IS_INVALID_CHAR
   /* Marker bits OR-ed into the first byte of a UTF-8 sequence by the
      encoders in this file (for example latin1_toUtf8 uses UTF8_cval2). */
   310 enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
   311   UTF8_cval1 = 0x00,
   312   UTF8_cval2 = 0xc0,
   313   UTF8_cval3 = 0xe0,
   314   UTF8_cval4 = 0xf0
   315 };
   317 static void PTRCALL
   318 utf8_toUtf8(const ENCODING *enc,
   319             const char **fromP, const char *fromLim,
   320             char **toP, const char *toLim)
   321 {
   322   char *to;
   323   const char *from;
   324   if (fromLim - *fromP > toLim - *toP) {
   325     /* Avoid copying partial characters. */
   326     for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
   327       if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
   328         break;
   329   }
   330   for (to = *toP, from = *fromP; from != fromLim; from++, to++)
   331     *to = *from;
   332   *fromP = from;
   333   *toP = to;
   334 }
   336 static void PTRCALL
   337 utf8_toUtf16(const ENCODING *enc,
   338              const char **fromP, const char *fromLim,
   339              unsigned short **toP, const unsigned short *toLim)
   340 {
   341   unsigned short *to = *toP;
   342   const char *from = *fromP;
   343   while (from != fromLim && to != toLim) {
   344     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
   345     case BT_LEAD2:
   346       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
   347       from += 2;
   348       break;
   349     case BT_LEAD3:
   350       *to++ = (unsigned short)(((from[0] & 0xf) << 12)
   351                                | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
   352       from += 3;
   353       break;
   354     case BT_LEAD4:
   355       {
   356         unsigned long n;
   357         if (to + 1 == toLim)
   358           goto after;
   359         n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
   360             | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
   361         n -= 0x10000;
   362         to[0] = (unsigned short)((n >> 10) | 0xD800);
   363         to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
   364         to += 2;
   365         from += 4;
   366       }
   367       break;
   368     default:
   369       *to++ = *from++;
   370       break;
   371     }
   372   }
   373 after:
   374   *fromP = from;
   375   *toP = to;
   376 }
   /* UTF-8 encoding descriptors.  Each initializer supplies, in order: the
      ENCODING member (VTABLE1 functions, the two UTF-8 converters and three
      trailing integers; the first of those is presumably minBytesPerChar,
      confirm against xmltok.h), the 256-entry byte-type table assembled
      from included fragments, and the predicate function pointers.

      The _ns variants exist only with XML_NS.  The non-_ns variants
      temporarily define BT_COLON as BT_NMSTRT while including the ASCII
      table, so whichever entries that table spells BT_COLON are classified
      as name-start characters.  The internal_ variants use iasciitab.h in
      place of asciitab.h. */
   378 #ifdef XML_NS
   379 static const struct normal_encoding utf8_encoding_ns = {
   380   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
   381   {
   382 #include "asciitab.h"
   383 #include "utf8tab.h"
   384   },
   385   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
   386 };
   387 #endif
   389 static const struct normal_encoding utf8_encoding = {
   390   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
   391   {
   392 #define BT_COLON BT_NMSTRT
   393 #include "asciitab.h"
   394 #undef BT_COLON
   395 #include "utf8tab.h"
   396   },
   397   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
   398 };
   400 #ifdef XML_NS
   402 static const struct normal_encoding internal_utf8_encoding_ns = {
   403   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
   404   {
   405 #include "iasciitab.h"
   406 #include "utf8tab.h"
   407   },
   408   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
   409 };
   411 #endif
   413 static const struct normal_encoding internal_utf8_encoding = {
   414   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
   415   {
   416 #define BT_COLON BT_NMSTRT
   417 #include "iasciitab.h"
   418 #undef BT_COLON
   419 #include "utf8tab.h"
   420   },
   421   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
   422 };
   424 static void PTRCALL
   425 latin1_toUtf8(const ENCODING *enc,
   426               const char **fromP, const char *fromLim,
   427               char **toP, const char *toLim)
   428 {
   429   for (;;) {
   430     unsigned char c;
   431     if (*fromP == fromLim)
   432       break;
   433     c = (unsigned char)**fromP;
   434     if (c & 0x80) {
   435       if (toLim - *toP < 2)
   436         break;
   437       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
   438       *(*toP)++ = (char)((c & 0x3f) | 0x80);
   439       (*fromP)++;
   440     }
   441     else {
   442       if (*toP == toLim)
   443         break;
   444       *(*toP)++ = *(*fromP)++;
   445     }
   446   }
   447 }
   449 static void PTRCALL
   450 latin1_toUtf16(const ENCODING *enc,
   451                const char **fromP, const char *fromLim,
   452                unsigned short **toP, const unsigned short *toLim)
   453 {
   454   while (*fromP != fromLim && *toP != toLim)
   455     *(*toP)++ = (unsigned char)*(*fromP)++;
   456 }
   /* Latin-1 encoding descriptors.  They use the VTABLE1 functions with the
      Latin-1 converters, and a byte-type table built from asciitab.h
      followed by latin1tab.h.  Only the STANDARD_VTABLE members are
      initialized explicitly; the multi-byte predicate pointers are left to
      default (zero) initialization.  The non-_ns variant classifies the
      BT_COLON entries of the ASCII table as BT_NMSTRT. */
   458 #ifdef XML_NS
   460 static const struct normal_encoding latin1_encoding_ns = {
   461   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
   462   {
   463 #include "asciitab.h"
   464 #include "latin1tab.h"
   465   },
   466   STANDARD_VTABLE(sb_)
   467 };
   469 #endif
   471 static const struct normal_encoding latin1_encoding = {
   472   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
   473   {
   474 #define BT_COLON BT_NMSTRT
   475 #include "asciitab.h"
   476 #undef BT_COLON
   477 #include "latin1tab.h"
   478   },
   479   STANDARD_VTABLE(sb_)
   480 };
   482 static void PTRCALL
   483 ascii_toUtf8(const ENCODING *enc,
   484              const char **fromP, const char *fromLim,
   485              char **toP, const char *toLim)
   486 {
   487   while (*fromP != fromLim && *toP != toLim)
   488     *(*toP)++ = *(*fromP)++;
   489 }
   /* US-ASCII encoding descriptors.  Only the ASCII fragment of the
      byte-type table is supplied; the remaining entries are left to default
      initialization, which the in-table comment relies on being BT_NONXML
      (0).  UTF-16 conversion reuses latin1_toUtf16.  The non-_ns variant
      classifies the BT_COLON entries of the ASCII table as BT_NMSTRT. */
   491 #ifdef XML_NS
   493 static const struct normal_encoding ascii_encoding_ns = {
   494   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
   495   {
   496 #include "asciitab.h"
   497 /* BT_NONXML == 0 */
   498   },
   499   STANDARD_VTABLE(sb_)
   500 };
   502 #endif
   504 static const struct normal_encoding ascii_encoding = {
   505   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
   506   {
   507 #define BT_COLON BT_NMSTRT
   508 #include "asciitab.h"
   509 #undef BT_COLON
   510 /* BT_NONXML == 0 */
   511   },
   512   STANDARD_VTABLE(sb_)
   513 };
   515 static int PTRFASTCALL
   516 unicode_byte_type(char hi, char lo)
   517 {
   518   switch ((unsigned char)hi) {
   519   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
   520     return BT_LEAD4;
   521   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
   522     return BT_TRAIL;
   523   case 0xFF:
   524     switch ((unsigned char)lo) {
   525     case 0xFF:
   526     case 0xFE:
   527       return BT_NONXML;
   528     }
   529     break;
   530   }
   531   return BT_NONASCII;
   532 }
   /* DEFINE_UTF16_TO_UTF8(E) defines E##toUtf8, a converter from 16-bit
      units to UTF-8.  GET_LO and GET_HI must be defined at the point of
      expansion; they select the low and high byte of a unit and so fix the
      byte order.

      The generated function walks the input two bytes at a time.  Units
      below 0x80 produce one byte, units with a high byte of 0..7 produce
      two bytes, a unit with a high byte of 0xD8..0xDB is combined with the
      following unit into a four-byte sequence, and all other units produce
      three bytes.  Before writing, each case checks the remaining output
      space; if it is insufficient, *fromP is set to the current unit and
      the function returns.  *toP is advanced as bytes are written.

      NOTE(review): the loop condition is from != fromLim with a step of 2,
      and the four-byte case reads the following unit without comparing
      against fromLim.  Both appear to rely on the caller passing an even
      number of bytes and complete surrogate pairs -- confirm against the
      tokenizer.
   */
   534 #define DEFINE_UTF16_TO_UTF8(E) \
   535 static void  PTRCALL \
   536 E ## toUtf8(const ENCODING *enc, \
   537             const char **fromP, const char *fromLim, \
   538             char **toP, const char *toLim) \
   539 { \
   540   const char *from; \
   541   for (from = *fromP; from != fromLim; from += 2) { \
   542     int plane; \
   543     unsigned char lo2; \
   544     unsigned char lo = GET_LO(from); \
   545     unsigned char hi = GET_HI(from); \
   546     switch (hi) { \
   547     case 0: \
   548       if (lo < 0x80) { \
   549         if (*toP == toLim) { \
   550           *fromP = from; \
   551           return; \
   552         } \
   553         *(*toP)++ = lo; \
   554         break; \
   555       } \
   556       /* fall through */ \
   557     case 0x1: case 0x2: case 0x3: \
   558     case 0x4: case 0x5: case 0x6: case 0x7: \
   559       if (toLim -  *toP < 2) { \
   560         *fromP = from; \
   561         return; \
   562       } \
   563       *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
   564       *(*toP)++ = ((lo & 0x3f) | 0x80); \
   565       break; \
   566     default: \
   567       if (toLim -  *toP < 3)  { \
   568         *fromP = from; \
   569         return; \
   570       } \
   571       /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
   572       *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
   573       *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
   574       *(*toP)++ = ((lo & 0x3f) | 0x80); \
   575       break; \
   576     case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
   577       if (toLim -  *toP < 4) { \
   578         *fromP = from; \
   579         return; \
   580       } \
   581       plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
   582       *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
   583       *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
   584       from += 2; \
   585       lo2 = GET_LO(from); \
   586       *(*toP)++ = (((lo & 0x3) << 4) \
   587                    | ((GET_HI(from) & 0x3) << 2) \
   588                    | (lo2 >> 6) \
   589                    | 0x80); \
   590       *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
   591       break; \
   592     } \
   593   } \
   594   *fromP = from; \
   595 }
   /* DEFINE_UTF16_TO_UTF16(E) defines E##toUtf16, which repacks byte-ordered
      16-bit units (read with GET_HI / GET_LO) into unsigned short values,
      advancing *fromP and *toP as it goes.  If the input is longer than the
      output can hold and the last input unit has a high byte of 0xD8..0xDF,
      the input limit is pulled back by one unit first.  The loop stops when
      either the (possibly adjusted) input or the output is exhausted. */
   597 #define DEFINE_UTF16_TO_UTF16(E) \
   598 static void  PTRCALL \
   599 E ## toUtf16(const ENCODING *enc, \
   600              const char **fromP, const char *fromLim, \
   601              unsigned short **toP, const unsigned short *toLim) \
   602 { \
   603   /* Avoid copying first half only of surrogate */ \
   604   if (fromLim - *fromP > ((toLim - *toP) << 1) \
   605       && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
   606     fromLim -= 2; \
   607   for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
   608     *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
   609 }
   /* Instantiate the UTF-16 converters once per byte order.  GET_LO and
      GET_HI pick the low and high byte of the unit at ptr; SET2 stores a
      unit in the same order (it is not used in the part of the file shown
      here).  Little-endian first: low byte at offset 0. */
   611 #define SET2(ptr, ch) \
   612   (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
   613 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
   614 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
   616 DEFINE_UTF16_TO_UTF8(little2_)
   617 DEFINE_UTF16_TO_UTF16(little2_)
   619 #undef SET2
   620 #undef GET_LO
   621 #undef GET_HI
   /* Big-endian: high byte at offset 0. */
   623 #define SET2(ptr, ch) \
   624   (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
   625 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
   626 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
   628 DEFINE_UTF16_TO_UTF8(big2_)
   629 DEFINE_UTF16_TO_UTF16(big2_)
   631 #undef SET2
   632 #undef GET_LO
   633 #undef GET_HI
   /* Per-character operations for little-endian 16-bit units: p[0] is the
      low byte and p[1] the high byte.  A unit with a zero high byte is
      classified through the encoding's byte-type table; any other unit goes
      through unicode_byte_type.  BYTE_TO_ASCII yields -1 for units with a
      non-zero high byte. */
   635 #define LITTLE2_BYTE_TYPE(enc, p) \
   636  ((p)[1] == 0 \
   637   ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
   638   : unicode_byte_type((p)[1], (p)[0]))
   639 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
   640 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
   641 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
   642   UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
   643 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
   644   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
   /* With XML_MIN_SIZE the operations are exposed as functions (stored via
      STANDARD_VTABLE(little2_)), xmltok_impl.c is not included again, and
      VTABLE is redefined so that only the two converters differ from the
      normal_ function set. */
   646 #ifdef XML_MIN_SIZE
   648 static int PTRFASTCALL
   649 little2_byteType(const ENCODING *enc, const char *p)
   650 {
   651   return LITTLE2_BYTE_TYPE(enc, p);
   652 }
   654 static int PTRFASTCALL
   655 little2_byteToAscii(const ENCODING *enc, const char *p)
   656 {
   657   return LITTLE2_BYTE_TO_ASCII(enc, p);
   658 }
   660 static int PTRCALL
   661 little2_charMatches(const ENCODING *enc, const char *p, int c)
   662 {
   663   return LITTLE2_CHAR_MATCHES(enc, p, c);
   664 }
   666 static int PTRFASTCALL
   667 little2_isNameMin(const ENCODING *enc, const char *p)
   668 {
   669   return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
   670 }
   672 static int PTRFASTCALL
   673 little2_isNmstrtMin(const ENCODING *enc, const char *p)
   674 {
   675   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
   676 }
   678 #undef VTABLE
   679 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
   /* Otherwise the tokenizer implementation is instantiated a second time
      under the little2_ prefix with the operations expanded inline.  The
      multi-byte predicates are constant 0 here. */
   681 #else /* not XML_MIN_SIZE */
   683 #undef PREFIX
   684 #define PREFIX(ident) little2_ ## ident
   685 #define MINBPC(enc) 2
   686 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
   687 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
   688 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
   689 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
   690 #define IS_NAME_CHAR(enc, p, n) 0
   691 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
   692 #define IS_NMSTRT_CHAR(enc, p, n) (0)
   693 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
   695 #include "xmltok_impl.c"
   /* NOTE(review): IS_INVALID_CHAR is not defined in this section as shown,
      so its #undef below looks like a harmless leftover -- confirm against
      xmltok_impl.c. */
   697 #undef MINBPC
   698 #undef BYTE_TYPE
   699 #undef BYTE_TO_ASCII
   700 #undef CHAR_MATCHES
   701 #undef IS_NAME_CHAR
   702 #undef IS_NAME_CHAR_MINBPC
   703 #undef IS_NMSTRT_CHAR
   704 #undef IS_NMSTRT_CHAR_MINBPC
   705 #undef IS_INVALID_CHAR
   707 #endif /* not XML_MIN_SIZE */
   /* Little-endian 16-bit encoding descriptors.  VTABLE supplies the
      function set selected in the preceding section together with the
      little2_ converters.  The last ENCODING initializer is 1 only when
      BYTEORDER is 1234, i.e. when this byte order is the host's
      (presumably the isUtf16 flag -- confirm against xmltok.h).  The
      byte-type table covers units with a zero high byte.  The internal_
      variants use iasciitab.h and are compiled unless BYTEORDER is 4321. */
   709 #ifdef XML_NS
   711 static const struct normal_encoding little2_encoding_ns = {
   712   { VTABLE, 2, 0,
   713 #if BYTEORDER == 1234
   714     1
   715 #else
   716     0
   717 #endif
   718   },
   719   {
   720 #include "asciitab.h"
   721 #include "latin1tab.h"
   722   },
   723   STANDARD_VTABLE(little2_)
   724 };
   726 #endif
   728 static const struct normal_encoding little2_encoding = {
   729   { VTABLE, 2, 0,
   730 #if BYTEORDER == 1234
   731     1
   732 #else
   733     0
   734 #endif
   735   },
   736   {
   737 #define BT_COLON BT_NMSTRT
   738 #include "asciitab.h"
   739 #undef BT_COLON
   740 #include "latin1tab.h"
   741   },
   742   STANDARD_VTABLE(little2_)
   743 };
   745 #if BYTEORDER != 4321
   747 #ifdef XML_NS
   749 static const struct normal_encoding internal_little2_encoding_ns = {
   750   { VTABLE, 2, 0, 1 },
   751   {
   752 #include "iasciitab.h"
   753 #include "latin1tab.h"
   754   },
   755   STANDARD_VTABLE(little2_)
   756 };
   758 #endif
   760 static const struct normal_encoding internal_little2_encoding = {
   761   { VTABLE, 2, 0, 1 },
   762   {
   763 #define BT_COLON BT_NMSTRT
   764 #include "iasciitab.h"
   765 #undef BT_COLON
   766 #include "latin1tab.h"
   767   },
   768   STANDARD_VTABLE(little2_)
   769 };
   771 #endif
   /* Per-character operations for big-endian 16-bit units: p[0] is the high
      byte and p[1] the low byte.  A unit with a zero high byte is classified
      through the encoding's byte-type table; any other unit goes through
      unicode_byte_type.  BYTE_TO_ASCII yields -1 for units with a non-zero
      high byte. */
   774 #define BIG2_BYTE_TYPE(enc, p) \
   775  ((p)[0] == 0 \
   776   ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
   777   : unicode_byte_type((p)[0], (p)[1]))
   778 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
   779 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
   780 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
   781   UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
   782 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
   783   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
   /* With XML_MIN_SIZE the operations are exposed as functions (stored via
      STANDARD_VTABLE(big2_)), xmltok_impl.c is not included again, and
      VTABLE is redefined to use the big2_ converters. */
   785 #ifdef XML_MIN_SIZE
   787 static int PTRFASTCALL
   788 big2_byteType(const ENCODING *enc, const char *p)
   789 {
   790   return BIG2_BYTE_TYPE(enc, p);
   791 }
   793 static int PTRFASTCALL
   794 big2_byteToAscii(const ENCODING *enc, const char *p)
   795 {
   796   return BIG2_BYTE_TO_ASCII(enc, p);
   797 }
   799 static int PTRCALL
   800 big2_charMatches(const ENCODING *enc, const char *p, int c)
   801 {
   802   return BIG2_CHAR_MATCHES(enc, p, c);
   803 }
   805 static int PTRFASTCALL
   806 big2_isNameMin(const ENCODING *enc, const char *p)
   807 {
   808   return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
   809 }
   811 static int PTRFASTCALL
   812 big2_isNmstrtMin(const ENCODING *enc, const char *p)
   813 {
   814   return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
   815 }
   817 #undef VTABLE
   818 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
   /* Otherwise the tokenizer implementation is instantiated again under the
      big2_ prefix with the operations expanded inline.  The multi-byte
      predicates are constant 0 here. */
   820 #else /* not XML_MIN_SIZE */
   822 #undef PREFIX
   823 #define PREFIX(ident) big2_ ## ident
   824 #define MINBPC(enc) 2
   825 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
   826 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
   827 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
   828 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
   829 #define IS_NAME_CHAR(enc, p, n) 0
   830 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
   831 #define IS_NMSTRT_CHAR(enc, p, n) (0)
   832 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
   834 #include "xmltok_impl.c"
   /* NOTE(review): IS_INVALID_CHAR is not defined in this section as shown,
      so its #undef below looks like a harmless leftover -- confirm against
      xmltok_impl.c. */
   836 #undef MINBPC
   837 #undef BYTE_TYPE
   838 #undef BYTE_TO_ASCII
   839 #undef CHAR_MATCHES
   840 #undef IS_NAME_CHAR
   841 #undef IS_NAME_CHAR_MINBPC
   842 #undef IS_NMSTRT_CHAR
   843 #undef IS_NMSTRT_CHAR_MINBPC
   844 #undef IS_INVALID_CHAR
   846 #endif /* not XML_MIN_SIZE */
   /* Big-endian 16-bit encoding descriptors.  VTABLE supplies the function
      set selected in the preceding section together with the big2_
      converters.  The last ENCODING initializer is 1 only when BYTEORDER is
      4321, i.e. when this byte order is the host's (presumably the isUtf16
      flag -- confirm against xmltok.h).  The internal_ variants use
      iasciitab.h and are compiled unless BYTEORDER is 1234. */
   848 #ifdef XML_NS
   850 static const struct normal_encoding big2_encoding_ns = {
   851   { VTABLE, 2, 0,
   852 #if BYTEORDER == 4321
   853   1
   854 #else
   855   0
   856 #endif
   857   },
   858   {
   859 #include "asciitab.h"
   860 #include "latin1tab.h"
   861   },
   862   STANDARD_VTABLE(big2_)
   863 };
   865 #endif
   867 static const struct normal_encoding big2_encoding = {
   868   { VTABLE, 2, 0,
   869 #if BYTEORDER == 4321
   870   1
   871 #else
   872   0
   873 #endif
   874   },
   875   {
   876 #define BT_COLON BT_NMSTRT
   877 #include "asciitab.h"
   878 #undef BT_COLON
   879 #include "latin1tab.h"
   880   },
   881   STANDARD_VTABLE(big2_)
   882 };
   884 #if BYTEORDER != 1234
   886 #ifdef XML_NS
   888 static const struct normal_encoding internal_big2_encoding_ns = {
   889   { VTABLE, 2, 0, 1 },
   890   {
   891 #include "iasciitab.h"
   892 #include "latin1tab.h"
   893   },
   894   STANDARD_VTABLE(big2_)
   895 };
   897 #endif
   899 static const struct normal_encoding internal_big2_encoding = {
   900   { VTABLE, 2, 0, 1 },
   901   {
   902 #define BT_COLON BT_NMSTRT
   903 #include "iasciitab.h"
   904 #undef BT_COLON
   905 #include "latin1tab.h"
   906   },
   907   STANDARD_VTABLE(big2_)
   908 };
   910 #endif
   912 #undef PREFIX
   914 static int FASTCALL
   915 streqci(const char *s1, const char *s2)
   916 {
   917   for (;;) {
   918     char c1 = *s1++;
   919     char c2 = *s2++;
   920     if (ASCII_a <= c1 && c1 <= ASCII_z)
   921       c1 += ASCII_A - ASCII_a;
   922     if (ASCII_a <= c2 && c2 <= ASCII_z)
   923       c2 += ASCII_A - ASCII_a;
   924     if (c1 != c2)
   925       return 0;
   926     if (!c1)
   927       break;
   928   }
   929   return 1;
   930 }
   /* Position-update entry that ignores the enc argument and always
      delegates to normal_updatePosition with the UTF-8 encoding's
      ENCODING.  ptr, end and pos are passed through unchanged. */
   932 static void PTRCALL
   933 initUpdatePosition(const ENCODING *enc, const char *ptr,
   934                    const char *end, POSITION *pos)
   935 {
   936   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
   937 }
   939 static int
   940 toAscii(const ENCODING *enc, const char *ptr, const char *end)
   941 {
   942   char buf[1];
   943   char *p = buf;
   944   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
   945   if (p == buf)
   946     return -1;
   947   else
   948     return buf[0];
   949 }
   951 static int FASTCALL
   952 isSpace(int c)
   953 {
   954   switch (c) {
   955   case 0x20:
   956   case 0xD:
   957   case 0xA:
   958   case 0x9:
   959     return 1;
   960   }
   961   return 0;
   962 }
   964 /* Return 1 if there's just optional white space or there's an S
   965    followed by name=val.
   966 */
   967 static int
   968 parsePseudoAttribute(const ENCODING *enc,
   969                      const char *ptr,
   970                      const char *end,
   971                      const char **namePtr,
   972                      const char **nameEndPtr,
   973                      const char **valPtr,
   974                      const char **nextTokPtr)
   975 {
   976   int c;
   977   char open;
   978   if (ptr == end) {
   979     *namePtr = NULL;
   980     return 1;
   981   }
   982   if (!isSpace(toAscii(enc, ptr, end))) {
   983     *nextTokPtr = ptr;
   984     return 0;
   985   }
   986   do {
   987     ptr += enc->minBytesPerChar;
   988   } while (isSpace(toAscii(enc, ptr, end)));
   989   if (ptr == end) {
   990     *namePtr = NULL;
   991     return 1;
   992   }
   993   *namePtr = ptr;
   994   for (;;) {
   995     c = toAscii(enc, ptr, end);
   996     if (c == -1) {
   997       *nextTokPtr = ptr;
   998       return 0;
   999     }
  1000     if (c == ASCII_EQUALS) {
  1001       *nameEndPtr = ptr;
  1002       break;
  1004     if (isSpace(c)) {
  1005       *nameEndPtr = ptr;
  1006       do {
  1007         ptr += enc->minBytesPerChar;
  1008       } while (isSpace(c = toAscii(enc, ptr, end)));
  1009       if (c != ASCII_EQUALS) {
  1010         *nextTokPtr = ptr;
  1011         return 0;
  1013       break;
  1015     ptr += enc->minBytesPerChar;
  1017   if (ptr == *namePtr) {
  1018     *nextTokPtr = ptr;
  1019     return 0;
  1021   ptr += enc->minBytesPerChar;
  1022   c = toAscii(enc, ptr, end);
  1023   while (isSpace(c)) {
  1024     ptr += enc->minBytesPerChar;
  1025     c = toAscii(enc, ptr, end);
  1027   if (c != ASCII_QUOT && c != ASCII_APOS) {
  1028     *nextTokPtr = ptr;
  1029     return 0;
  1031   open = (char)c;
  1032   ptr += enc->minBytesPerChar;
  1033   *valPtr = ptr;
  1034   for (;; ptr += enc->minBytesPerChar) {
  1035     c = toAscii(enc, ptr, end);
  1036     if (c == open)
  1037       break;
  1038     if (!(ASCII_a <= c && c <= ASCII_z)
  1039         && !(ASCII_A <= c && c <= ASCII_Z)
  1040         && !(ASCII_0 <= c && c <= ASCII_9)
  1041         && c != ASCII_PERIOD
  1042         && c != ASCII_MINUS
  1043         && c != ASCII_UNDERSCORE) {
  1044       *nextTokPtr = ptr;
  1045       return 0;
  1048   *nextTokPtr = ptr + enc->minBytesPerChar;
  1049   return 1;
  /* Keyword strings spelled out with the ASCII_* character constants
     rather than string literals, each explicitly NUL-terminated.
     KW_version and KW_XML_1_0 are compared with XmlNameMatchesAscii in
     doParseXmlDecl; the others are presumably used later in that function,
     which continues beyond this excerpt. */
  1052 static const char KW_version[] = {
  1053   ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
  1054 };
  1056 static const char KW_encoding[] = {
  1057   ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
  1058 };
  1060 static const char KW_standalone[] = {
  1061   ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  1062   ASCII_n, ASCII_e, '\0'
  1063 };
  1065 static const char KW_yes[] = {
  1066   ASCII_y, ASCII_e, ASCII_s,  '\0'
  1067 };
  1069 static const char KW_no[] = {
  1070   ASCII_n, ASCII_o,  '\0'
  1071 };
  /* The only version string accepted by the Mozilla version check: "1.0". */
  1073 /* BEGIN MOZILLA CHANGE (http://bugzilla.mozilla.org/show_bug.cgi?id=62157) */
  1074 static const char KW_XML_1_0[] = {
  1075   ASCII_1, ASCII_PERIOD, ASCII_0, '\0'
  1076 };
  1077 /* END MOZILLA CHANGE */
  1079 static int
  1080 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
  1081                                                  const char *,
  1082                                                  const char *),
  1083                int isGeneralTextEntity,
  1084                const ENCODING *enc,
  1085                const char *ptr,
  1086                const char *end,
  1087                const char **badPtr,
  1088                const char **versionPtr,
  1089                const char **versionEndPtr,
  1090                const char **encodingName,
  1091                const ENCODING **encoding,
  1092                int *standalone)
  1094   const char *val = NULL;
  1095   const char *name = NULL;
  1096   const char *nameEnd = NULL;
  1097   ptr += 5 * enc->minBytesPerChar;
  1098   end -= 2 * enc->minBytesPerChar;
  1099   if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
  1100       || !name) {
  1101     *badPtr = ptr;
  1102     return 0;
  1104   if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
  1105     if (!isGeneralTextEntity) {
  1106       *badPtr = name;
  1107       return 0;
  1110   else {
  1111     if (versionPtr)
  1112       *versionPtr = val;
  1113     if (versionEndPtr)
  1114       *versionEndPtr = ptr;
  1115 /* BEGIN MOZILLA CHANGE (http://bugzilla.mozilla.org/show_bug.cgi?id=62157) */
  1116      /* Anything else but a version="1.0" is invalid for us, until we support later versions. */
  1117      if (!XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_XML_1_0)) {
  1118        *badPtr = val;
  1119        return 0;
  1121 /* END MOZILLA CHANGE */
  1122     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
  1123       *badPtr = ptr;
  1124       return 0;
  1126     if (!name) {
  1127       if (isGeneralTextEntity) {
  1128         /* a TextDecl must have an EncodingDecl */
  1129         *badPtr = ptr;
  1130         return 0;
  1132       return 1;
  1135   if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
  1136     int c = toAscii(enc, val, end);
  1137     if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
  1138       *badPtr = val;
  1139       return 0;
  1141     if (encodingName)
  1142       *encodingName = val;
  1143     if (encoding)
  1144       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
  1145     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
  1146       *badPtr = ptr;
  1147       return 0;
  1149     if (!name)
  1150       return 1;
  1152   if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
  1153       || isGeneralTextEntity) {
  1154     *badPtr = name;
  1155     return 0;
  1157   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
  1158     if (standalone)
  1159       *standalone = 1;
  1161   else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
  1162     if (standalone)
  1163       *standalone = 0;
  1165   else {
  1166     *badPtr = val;
  1167     return 0;
  1169   while (isSpace(toAscii(enc, ptr, end)))
  1170     ptr += enc->minBytesPerChar;
  1171   if (ptr != end) {
  1172     *badPtr = ptr;
  1173     return 0;
  1175   return 1;
  1178 static int FASTCALL
  1179 checkCharRefNumber(int result)
  1181   switch (result >> 8) {
  1182   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  1183   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
  1184     return -1;
  1185   case 0:
  1186     if (latin1_encoding.type[result] == BT_NONXML)
  1187       return -1;
  1188     break;
  1189   case 0xFF:
  1190     if (result == 0xFFFE || result == 0xFFFF)
  1191       return -1;
  1192     break;
  1194   return result;
  1197 int FASTCALL
  1198 XmlUtf8Encode(int c, char *buf)
  1200   enum {
  1201     /* minN is minimum legal resulting value for N byte sequence */
  1202     min2 = 0x80,
  1203     min3 = 0x800,
  1204     min4 = 0x10000
  1205   };
  1207   if (c < 0)
  1208     return 0;
  1209   if (c < min2) {
  1210     buf[0] = (char)(c | UTF8_cval1);
  1211     return 1;
  1213   if (c < min3) {
  1214     buf[0] = (char)((c >> 6) | UTF8_cval2);
  1215     buf[1] = (char)((c & 0x3f) | 0x80);
  1216     return 2;
  1218   if (c < min4) {
  1219     buf[0] = (char)((c >> 12) | UTF8_cval3);
  1220     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
  1221     buf[2] = (char)((c & 0x3f) | 0x80);
  1222     return 3;
  1224   if (c < 0x110000) {
  1225     buf[0] = (char)((c >> 18) | UTF8_cval4);
  1226     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
  1227     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
  1228     buf[3] = (char)((c & 0x3f) | 0x80);
  1229     return 4;
  1231   return 0;
  1234 int FASTCALL
  1235 XmlUtf16Encode(int charNum, unsigned short *buf)
  1237   if (charNum < 0)
  1238     return 0;
  1239   if (charNum < 0x10000) {
  1240     buf[0] = (unsigned short)charNum;
  1241     return 1;
  1243   if (charNum < 0x110000) {
  1244     charNum -= 0x10000;
  1245     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
  1246     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
  1247     return 2;
  1249   return 0;
  1252 struct unknown_encoding {
  1253   struct normal_encoding normal;
  1254   CONVERTER convert;
  1255   void *userData;
  1256   unsigned short utf16[256];
  1257   char utf8[256][4];
  1258 };
  1260 #define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
  1262 int
  1263 XmlSizeOfUnknownEncoding(void)
  1265   return sizeof(struct unknown_encoding);
  1268 static int PTRFASTCALL
  1269 unknown_isName(const ENCODING *enc, const char *p)
  1271   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1272   int c = uenc->convert(uenc->userData, p);
  1273   if (c & ~0xFFFF)
  1274     return 0;
  1275   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
  1278 static int PTRFASTCALL
  1279 unknown_isNmstrt(const ENCODING *enc, const char *p)
  1281   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1282   int c = uenc->convert(uenc->userData, p);
  1283   if (c & ~0xFFFF)
  1284     return 0;
  1285   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
  1288 static int PTRFASTCALL
  1289 unknown_isInvalid(const ENCODING *enc, const char *p)
  1291   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1292   int c = uenc->convert(uenc->userData, p);
  1293   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
  1296 static void PTRCALL
  1297 unknown_toUtf8(const ENCODING *enc,
  1298                const char **fromP, const char *fromLim,
  1299                char **toP, const char *toLim)
  1301   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1302   char buf[XML_UTF8_ENCODE_MAX];
  1303   for (;;) {
  1304     const char *utf8;
  1305     int n;
  1306     if (*fromP == fromLim)
  1307       break;
  1308     utf8 = uenc->utf8[(unsigned char)**fromP];
  1309     n = *utf8++;
  1310     if (n == 0) {
  1311       int c = uenc->convert(uenc->userData, *fromP);
  1312       n = XmlUtf8Encode(c, buf);
  1313       if (n > toLim - *toP)
  1314         break;
  1315       utf8 = buf;
  1316       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
  1317                  - (BT_LEAD2 - 2));
  1319     else {
  1320       if (n > toLim - *toP)
  1321         break;
  1322       (*fromP)++;
  1324     do {
  1325       *(*toP)++ = *utf8++;
  1326     } while (--n != 0);
  1330 static void PTRCALL
  1331 unknown_toUtf16(const ENCODING *enc,
  1332                 const char **fromP, const char *fromLim,
  1333                 unsigned short **toP, const unsigned short *toLim)
  1335   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1336   while (*fromP != fromLim && *toP != toLim) {
  1337     unsigned short c = uenc->utf16[(unsigned char)**fromP];
  1338     if (c == 0) {
  1339       c = (unsigned short)
  1340           uenc->convert(uenc->userData, *fromP);
  1341       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
  1342                  - (BT_LEAD2 - 2));
  1344     else
  1345       (*fromP)++;
  1346     *(*toP)++ = c;
  1350 ENCODING *
  1351 XmlInitUnknownEncoding(void *mem,
  1352                        int *table,
  1353                        CONVERTER convert, 
  1354                        void *userData)
  1356   int i;
  1357   struct unknown_encoding *e = (struct unknown_encoding *)mem;
  1358   for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
  1359     ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
  1360   for (i = 0; i < 128; i++)
  1361     if (latin1_encoding.type[i] != BT_OTHER
  1362         && latin1_encoding.type[i] != BT_NONXML
  1363         && table[i] != i)
  1364       return 0;
  1365   for (i = 0; i < 256; i++) {
  1366     int c = table[i];
  1367     if (c == -1) {
  1368       e->normal.type[i] = BT_MALFORM;
  1369       /* This shouldn't really get used. */
  1370       e->utf16[i] = 0xFFFF;
  1371       e->utf8[i][0] = 1;
  1372       e->utf8[i][1] = 0;
  1374     else if (c < 0) {
  1375       if (c < -4)
  1376         return 0;
  1377       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
  1378       e->utf8[i][0] = 0;
  1379       e->utf16[i] = 0;
  1381     else if (c < 0x80) {
  1382       if (latin1_encoding.type[c] != BT_OTHER
  1383           && latin1_encoding.type[c] != BT_NONXML
  1384           && c != i)
  1385         return 0;
  1386       e->normal.type[i] = latin1_encoding.type[c];
  1387       e->utf8[i][0] = 1;
  1388       e->utf8[i][1] = (char)c;
  1389       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
  1391     else if (checkCharRefNumber(c) < 0) {
  1392       e->normal.type[i] = BT_NONXML;
  1393       /* This shouldn't really get used. */
  1394       e->utf16[i] = 0xFFFF;
  1395       e->utf8[i][0] = 1;
  1396       e->utf8[i][1] = 0;
  1398     else {
  1399       if (c > 0xFFFF)
  1400         return 0;
  1401       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
  1402         e->normal.type[i] = BT_NMSTRT;
  1403       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
  1404         e->normal.type[i] = BT_NAME;
  1405       else
  1406         e->normal.type[i] = BT_OTHER;
  1407       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
  1408       e->utf16[i] = (unsigned short)c;
  1411   e->userData = userData;
  1412   e->convert = convert;
  1413   if (convert) {
  1414     e->normal.isName2 = unknown_isName;
  1415     e->normal.isName3 = unknown_isName;
  1416     e->normal.isName4 = unknown_isName;
  1417     e->normal.isNmstrt2 = unknown_isNmstrt;
  1418     e->normal.isNmstrt3 = unknown_isNmstrt;
  1419     e->normal.isNmstrt4 = unknown_isNmstrt;
  1420     e->normal.isInvalid2 = unknown_isInvalid;
  1421     e->normal.isInvalid3 = unknown_isInvalid;
  1422     e->normal.isInvalid4 = unknown_isInvalid;
  1424   e->normal.enc.utf8Convert = unknown_toUtf8;
  1425   e->normal.enc.utf16Convert = unknown_toUtf16;
  1426   return &(e->normal.enc);
  1429 /* If this enumeration is changed, getEncodingIndex and encodings
  1430 must also be changed. */
  1431 enum {
  1432   UNKNOWN_ENC = -1,
  1433   ISO_8859_1_ENC = 0,
  1434   US_ASCII_ENC,
  1435   UTF_8_ENC,
  1436   UTF_16_ENC,
  1437   UTF_16BE_ENC,
  1438   UTF_16LE_ENC,
  1439   /* must match encodingNames up to here */
  1440   NO_ENC
  1441 };
  1443 static const char KW_ISO_8859_1[] = {
  1444   ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
  1445   ASCII_MINUS, ASCII_1, '\0'
  1446 };
  1447 static const char KW_US_ASCII[] = {
  1448   ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
  1449   '\0'
  1450 };
  1451 static const char KW_UTF_8[] =  {
  1452   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
  1453 };
  1454 static const char KW_UTF_16[] = {
  1455   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
  1456 };
  1457 static const char KW_UTF_16BE[] = {
  1458   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
  1459   '\0'
  1460 };
  1461 static const char KW_UTF_16LE[] = {
  1462   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
  1463   '\0'
  1464 };
  1466 static int FASTCALL
  1467 getEncodingIndex(const char *name)
  1469   static const char * const encodingNames[] = {
  1470     KW_ISO_8859_1,
  1471     KW_US_ASCII,
  1472     KW_UTF_8,
  1473     KW_UTF_16,
  1474     KW_UTF_16BE,
  1475     KW_UTF_16LE,
  1476   };
  1477   int i;
  1478   if (name == NULL)
  1479     return NO_ENC;
  1480   for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
  1481     if (streqci(name, encodingNames[i]))
  1482       return i;
  1483   return UNKNOWN_ENC;
  1486 /* For binary compatibility, we store the index of the encoding
  1487    specified at initialization in the isUtf16 member.
  1488 */
  1490 #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
  1491 #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i)
  1493 /* This is what detects the encoding.  encodingTable maps from
  1494    encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
  1495    the external (protocol) specified encoding; state is
  1496    XML_CONTENT_STATE if we're parsing an external text entity, and
  1497    XML_PROLOG_STATE otherwise.
  1498 */
  1501 static int
  1502 initScan(const ENCODING * const *encodingTable,
  1503          const INIT_ENCODING *enc,
  1504          int state,
  1505          const char *ptr,
  1506          const char *end,
  1507          const char **nextTokPtr)
  1509   const ENCODING **encPtr;
  1511   if (ptr == end)
  1512     return XML_TOK_NONE;
  1513   encPtr = enc->encPtr;
  1514   if (ptr + 1 == end) {
  1515     /* only a single byte available for auto-detection */
  1516 #ifndef XML_DTD /* FIXME */
  1517     /* a well-formed document entity must have more than one byte */
  1518     if (state != XML_CONTENT_STATE)
  1519       return XML_TOK_PARTIAL;
  1520 #endif
  1521     /* so we're parsing an external text entity... */
  1522     /* if UTF-16 was externally specified, then we need at least 2 bytes */
  1523     switch (INIT_ENC_INDEX(enc)) {
  1524     case UTF_16_ENC:
  1525     case UTF_16LE_ENC:
  1526     case UTF_16BE_ENC:
  1527       return XML_TOK_PARTIAL;
  1529     switch ((unsigned char)*ptr) {
  1530     case 0xFE:
  1531     case 0xFF:
  1532     case 0xEF: /* possibly first byte of UTF-8 BOM */
  1533       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1534           && state == XML_CONTENT_STATE)
  1535         break;
  1536       /* fall through */
  1537     case 0x00:
  1538     case 0x3C:
  1539       return XML_TOK_PARTIAL;
  1542   else {
  1543     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
  1544     case 0xFEFF:
  1545       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1546           && state == XML_CONTENT_STATE)
  1547         break;
  1548       *nextTokPtr = ptr + 2;
  1549       *encPtr = encodingTable[UTF_16BE_ENC];
  1550       return XML_TOK_BOM;
  1551     /* 00 3C is handled in the default case */
  1552     case 0x3C00:
  1553       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
  1554            || INIT_ENC_INDEX(enc) == UTF_16_ENC)
  1555           && state == XML_CONTENT_STATE)
  1556         break;
  1557       *encPtr = encodingTable[UTF_16LE_ENC];
  1558       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1559     case 0xFFFE:
  1560       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1561           && state == XML_CONTENT_STATE)
  1562         break;
  1563       *nextTokPtr = ptr + 2;
  1564       *encPtr = encodingTable[UTF_16LE_ENC];
  1565       return XML_TOK_BOM;
  1566     case 0xEFBB:
  1567       /* Maybe a UTF-8 BOM (EF BB BF) */
  1568       /* If there's an explicitly specified (external) encoding
  1569          of ISO-8859-1 or some flavour of UTF-16
  1570          and this is an external text entity,
  1571          don't look for the BOM,
  1572          because it might be a legal data.
  1573       */
  1574       if (state == XML_CONTENT_STATE) {
  1575         int e = INIT_ENC_INDEX(enc);
  1576         if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
  1577             || e == UTF_16LE_ENC || e == UTF_16_ENC)
  1578           break;
  1580       if (ptr + 2 == end)
  1581         return XML_TOK_PARTIAL;
  1582       if ((unsigned char)ptr[2] == 0xBF) {
  1583         *nextTokPtr = ptr + 3;
  1584         *encPtr = encodingTable[UTF_8_ENC];
  1585         return XML_TOK_BOM;
  1587       break;
  1588     default:
  1589       if (ptr[0] == '\0') {
  1590         /* 0 isn't a legal data character. Furthermore a document
  1591            entity can only start with ASCII characters.  So the only
  1592            way this can fail to be big-endian UTF-16 if it it's an
  1593            external parsed general entity that's labelled as
  1594            UTF-16LE.
  1595         */
  1596         if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
  1597           break;
  1598         *encPtr = encodingTable[UTF_16BE_ENC];
  1599         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1601       else if (ptr[1] == '\0') {
  1602         /* We could recover here in the case:
  1603             - parsing an external entity
  1604             - second byte is 0
  1605             - no externally specified encoding
  1606             - no encoding declaration
  1607            by assuming UTF-16LE.  But we don't, because this would mean when
  1608            presented just with a single byte, we couldn't reliably determine
  1609            whether we needed further bytes.
  1610         */
  1611         if (state == XML_CONTENT_STATE)
  1612           break;
  1613         *encPtr = encodingTable[UTF_16LE_ENC];
  1614         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1616       break;
  1619   *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  1620   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1624 #define NS(x) x
  1625 #define ns(x) x
  1626 #include "xmltok_ns.c"
  1627 #undef NS
  1628 #undef ns
  1630 #ifdef XML_NS
  1632 #define NS(x) x ## NS
  1633 #define ns(x) x ## _ns
  1635 #include "xmltok_ns.c"
  1637 #undef NS
  1638 #undef ns
  1640 ENCODING *
  1641 XmlInitUnknownEncodingNS(void *mem,
  1642                          int *table,
  1643                          CONVERTER convert, 
  1644                          void *userData)
  1646   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
  1647   if (enc)
  1648     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
  1649   return enc;
  1652 #endif /* XML_NS */
  1654 /* BEGIN MOZILLA CHANGE (Mozilla extensions for QName checking) */
  1655 #ifdef MOZILLA_CLIENT
  1656 #include "moz_extensions.c"
  1657 #endif /* MOZILLA_CLIENT */
  1658 /* END MOZILLA CHANGE */

mercurial