parser/expat/lib/xmltok.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/parser/expat/lib/xmltok.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1658 @@
     1.4 +/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
     1.5 +   See the file COPYING for copying permission.
     1.6 +*/
     1.7 +
     1.8 +#include <stddef.h>
     1.9 +
    1.10 +#ifdef COMPILED_FROM_DSP
    1.11 +#include "winconfig.h"
    1.12 +#elif defined(MACOS_CLASSIC)
    1.13 +#include "macconfig.h"
    1.14 +#elif defined(__amigaos4__)
    1.15 +#include "amigaconfig.h"
    1.16 +#else
    1.17 +#ifdef HAVE_EXPAT_CONFIG_H
    1.18 +#include <expat_config.h>
    1.19 +#endif
    1.20 +#endif /* ndef COMPILED_FROM_DSP */
    1.21 +
    1.22 +#include "expat_external.h"
    1.23 +#include "internal.h"
    1.24 +#include "xmltok.h"
    1.25 +#include "nametab.h"
    1.26 +
         +/* Extra scanner entry spliced into VTABLE1: present only when XML_DTD
         +   is defined, otherwise it expands to nothing. */
    1.27 +#ifdef XML_DTD
    1.28 +#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
    1.29 +#else
    1.30 +#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
    1.31 +#endif
    1.32 +
         +/* Initializer fragment listing the PREFIX()-named tokenizer functions.
         +   The ENCODING initializers further down start with it, so the order of
         +   entries must follow the member order of ENCODING (declared outside
         +   this file). */
    1.33 +#define VTABLE1 \
    1.34 +  { PREFIX(prologTok), PREFIX(contentTok), \
    1.35 +    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
    1.36 +  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
    1.37 +  PREFIX(sameName), \
    1.38 +  PREFIX(nameMatchesAscii), \
    1.39 +  PREFIX(nameLength), \
    1.40 +  PREFIX(skipS), \
    1.41 +  PREFIX(getAtts), \
    1.42 +  PREFIX(charRefNumber), \
    1.43 +  PREFIX(predefinedEntityName), \
    1.44 +  PREFIX(updatePosition), \
    1.45 +  PREFIX(isPublicId)
    1.46 +
         +/* VTABLE1 followed by the PREFIX()-named UTF-8 and UTF-16 converters. */
    1.47 +#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
    1.48 +
         +/* Name-character lookups.  pages[] maps the high bits of a code point to
         +   a group of eight words in namingBitmap (declared in a header not shown
         +   here) and the low five bits select one bit of a word.  The mask is
         +   built from an unsigned 1 so that selecting bit 31 never shifts into
         +   the sign bit of an int.  Each macro yields non-zero when the bit is
         +   set. */
    1.49 +#define UCS2_GET_NAMING(pages, hi, lo) \
    1.50 +   (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
    1.51 +
    1.52 +/* A 2 byte UTF-8 representation splits the characters 11 bits between
    1.53 +   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
    1.54 +   pages, 3 bits to add to that index and 5 bits to generate the mask.
    1.55 +*/
    1.56 +#define UTF8_GET_NAMING2(pages, byte) \
    1.57 +    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
    1.58 +                      + ((((byte)[0]) & 3) << 1) \
    1.59 +                      + ((((byte)[1]) >> 5) & 1)] \
    1.60 +         & (1u << (((byte)[1]) & 0x1F)))
    1.61 +
    1.62 +/* A 3 byte UTF-8 representation splits the characters 16 bits between
    1.63 +   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
    1.64 +   into pages, 3 bits to add to that index and 5 bits to generate the
    1.65 +   mask.
    1.66 +*/
    1.67 +#define UTF8_GET_NAMING3(pages, byte) \
    1.68 +  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
    1.69 +                             + ((((byte)[1]) >> 2) & 0xF)] \
    1.70 +                       << 3) \
    1.71 +                      + ((((byte)[1]) & 3) << 1) \
    1.72 +                      + ((((byte)[2]) >> 5) & 1)] \
    1.73 +         & (1u << (((byte)[2]) & 0x1F)))
    1.74 +
         +/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked
         +   up, any other length yields 0. */
    1.75 +#define UTF8_GET_NAMING(pages, p, n) \
    1.76 +  ((n) == 2 \
    1.77 +  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
    1.78 +  : ((n) == 3 \
    1.79 +     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
    1.80 +     : 0))
    1.81 +
    1.82 +/* Detection of invalid UTF-8 sequences is based on Table 3.1B
    1.83 +   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
    1.84 +   with the additional restriction of not allowing the Unicode
    1.85 +   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
    1.86 +   Implementation details:
    1.87 +     (A & 0x80) == 0     means A < 0x80
    1.88 +   and
    1.89 +     (A & 0xC0) == 0xC0  means A > 0xBF
    1.90 +*/
    1.91 +
         +/* Each macro takes a pointer to unsigned bytes and is non-zero when the
         +   sequence starting there is rejected. */
         +
         +/* 2 bytes: the lead byte must be at least 0xC2 and the second byte must
         +   lie in 0x80..0xBF. */
    1.92 +#define UTF8_INVALID2(p) \
    1.93 +  ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
    1.94 +
         +/* 3 bytes: the third byte must lie in 0x80..0xBF (0x80..0xBD after
         +   EF,BF); the second byte must lie in 0xA0..0xBF after E0, in
         +   0x80..0x9F after ED, and in 0x80..0xBF otherwise. */
    1.95 +#define UTF8_INVALID3(p) \
   1.96 +  (((p)[2] & 0x80) == 0 \
   1.97 +  || \
   1.98 +  ((*p) == 0xEF && (p)[1] == 0xBF \
   1.99 +    ? \
   1.100 +    (p)[2] > 0xBD \
   1.101 +    : \
   1.102 +    ((p)[2] & 0xC0) == 0xC0) \
   1.103 +  || \
   1.104 +  ((*p) == 0xE0 \
   1.105 +    ? \
   1.106 +    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
   1.107 +    : \
   1.108 +    ((p)[1] & 0x80) == 0 \
   1.109 +    || \
   1.110 +    ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
   1.111 +
         +/* 4 bytes: the third and fourth bytes must lie in 0x80..0xBF; the
         +   second byte must lie in 0x90..0xBF after F0, in 0x80..0x8F after F4,
         +   and in 0x80..0xBF otherwise. */
   1.112 +#define UTF8_INVALID4(p) \
   1.113 +  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
   1.114 +  || \
   1.115 +  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
   1.116 +  || \
   1.117 +  ((*p) == 0xF0 \
   1.118 +    ? \
   1.119 +    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
   1.120 +    : \
   1.121 +    ((p)[1] & 0x80) == 0 \
   1.122 +    || \
   1.123 +    ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
   1.124 +
         +/* Predicate that rejects every character; neither argument is read. */
   1.125 +static int PTRFASTCALL
   1.126 +isNever(const ENCODING *enc, const char *p)
   1.127 +{
         +  (void)enc;
         +  (void)p;
   1.128 +  return 0;
   1.129 +}
   1.130 +
         +/* Non-zero when the 2 byte UTF-8 sequence at p has its bit set in the
         +   namePages table.  enc is not used. */
   1.131 +static int PTRFASTCALL
   1.132 +utf8_isName2(const ENCODING *enc, const char *p)
   1.133 +{
         +  const unsigned char *seq = (const unsigned char *)p;
   1.134 +  return UTF8_GET_NAMING2(namePages, seq);
   1.135 +}
   1.136 +
         +/* Non-zero when the 3 byte UTF-8 sequence at p has its bit set in the
         +   namePages table.  enc is not used. */
   1.137 +static int PTRFASTCALL
   1.138 +utf8_isName3(const ENCODING *enc, const char *p)
   1.139 +{
         +  const unsigned char *seq = (const unsigned char *)p;
   1.140 +  return UTF8_GET_NAMING3(namePages, seq);
   1.141 +}
   1.142 +
         +/* 4 byte sequences are never accepted as name characters. */
   1.143 +#define utf8_isName4 isNever
   1.144 +
         +/* Non-zero when the 2 byte UTF-8 sequence at p has its bit set in the
         +   nmstrtPages table.  enc is not used. */
   1.145 +static int PTRFASTCALL
   1.146 +utf8_isNmstrt2(const ENCODING *enc, const char *p)
   1.147 +{
         +  const unsigned char *seq = (const unsigned char *)p;
   1.148 +  return UTF8_GET_NAMING2(nmstrtPages, seq);
   1.149 +}
   1.150 +
         +/* Non-zero when the 3 byte UTF-8 sequence at p has its bit set in the
         +   nmstrtPages table.  enc is not used. */
   1.151 +static int PTRFASTCALL
   1.152 +utf8_isNmstrt3(const ENCODING *enc, const char *p)
   1.153 +{
         +  const unsigned char *seq = (const unsigned char *)p;
   1.154 +  return UTF8_GET_NAMING3(nmstrtPages, seq);
   1.155 +}
   1.156 +
         +/* 4 byte sequences are never accepted as name start characters. */
   1.157 +#define utf8_isNmstrt4 isNever
   1.158 +
         +/* Non-zero when the 2 byte sequence at p fails UTF8_INVALID2's checks.
         +   enc is not used. */
   1.159 +static int PTRFASTCALL
   1.160 +utf8_isInvalid2(const ENCODING *enc, const char *p)
   1.161 +{
         +  const unsigned char *seq = (const unsigned char *)p;
   1.162 +  return UTF8_INVALID2(seq);
   1.163 +}
   1.164 +
         +/* Non-zero when the 3 byte sequence at p fails UTF8_INVALID3's checks.
         +   enc is not used. */
   1.165 +static int PTRFASTCALL
   1.166 +utf8_isInvalid3(const ENCODING *enc, const char *p)
   1.167 +{
         +  const unsigned char *seq = (const unsigned char *)p;
   1.168 +  return UTF8_INVALID3(seq);
   1.169 +}
   1.170 +
         +/* Non-zero when the 4 byte sequence at p fails UTF8_INVALID4's checks.
         +   enc is not used. */
   1.171 +static int PTRFASTCALL
   1.172 +utf8_isInvalid4(const ENCODING *enc, const char *p)
   1.173 +{
         +  const unsigned char *seq = (const unsigned char *)p;
   1.174 +  return UTF8_INVALID4(seq);
   1.175 +}
   1.176 +
         +/* An ENCODING extended with a per-byte classification table and the
         +   character predicates used by the tokenizer macros below.  ENCODING is
         +   the first member, which is what allows AS_NORMAL_ENCODING to cast an
         +   ENCODING pointer back to this struct. */
   1.177 +struct normal_encoding {
   1.178 +  ENCODING enc;
         +  /* Indexed by byte value (see SB_BYTE_TYPE); filled from the *tab.h
         +     headers included inside the initializers. */
   1.179 +  unsigned char type[256];
   1.180 +#ifdef XML_MIN_SIZE
         +  /* Only present in XML_MIN_SIZE builds; filled by STANDARD_VTABLE. */
   1.181 +  int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
   1.182 +  int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
   1.183 +  int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
   1.184 +  int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
   1.185 +  int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
   1.186 +#endif /* XML_MIN_SIZE */
         +  /* Predicates for 2, 3 and 4 byte sequences; filled by NORMAL_VTABLE. */
   1.187 +  int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
   1.188 +  int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
   1.189 +  int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
   1.190 +  int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
   1.191 +  int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
   1.192 +  int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
   1.193 +  int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
   1.194 +  int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
   1.195 +  int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
   1.196 +};
   1.197 +
         +/* View an ENCODING pointer as the enclosing normal_encoding (read-only). */
   1.198 +#define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))
   1.199 +
         +/* Initializer fragment for the XML_MIN_SIZE-only members of
         +   normal_encoding, named with prefix E.  It ends with a comma so that
         +   NORMAL_VTABLE can follow it directly; without XML_MIN_SIZE it expands
         +   to nothing. */
   1.200 +#ifdef XML_MIN_SIZE
   1.201 +
   1.202 +#define STANDARD_VTABLE(E) \
   1.203 + E ## byteType, \
   1.204 + E ## isNameMin, \
   1.205 + E ## isNmstrtMin, \
   1.206 + E ## byteToAscii, \
   1.207 + E ## charMatches,
   1.208 +
   1.209 +#else
   1.210 +
   1.211 +#define STANDARD_VTABLE(E) /* as nothing */
   1.212 +
   1.213 +#endif
   1.214 +
         +/* Initializer fragment for the nine multi-byte predicates of
         +   normal_encoding, named with prefix E, in declaration order. */
   1.215 +#define NORMAL_VTABLE(E) \
   1.216 + E ## isName2, \
   1.217 + E ## isName3, \
   1.218 + E ## isName4, \
   1.219 + E ## isNmstrt2, \
   1.220 + E ## isNmstrt3, \
   1.221 + E ## isNmstrt4, \
   1.222 + E ## isInvalid2, \
   1.223 + E ## isInvalid3, \
   1.224 + E ## isInvalid4
   1.225 +
   1.226 +static int FASTCALL checkCharRefNumber(int);
   1.227 +
   1.228 +#include "xmltok_impl.h"
   1.229 +#include "ascii.h"
   1.230 +
         +/* In XML_MIN_SIZE builds the single-byte encodings use isNever for the
         +   isNameMin/isNmstrtMin slots. */
   1.231 +#ifdef XML_MIN_SIZE
   1.232 +#define sb_isNameMin isNever
   1.233 +#define sb_isNmstrtMin isNever
   1.234 +#endif
   1.235 +
         +/* XML_MIN_SIZE builds read the character width from the encoding at run
         +   time; otherwise it is the constant 1 for this instantiation. */
   1.236 +#ifdef XML_MIN_SIZE
   1.237 +#define MINBPC(enc) ((enc)->minBytesPerChar)
   1.238 +#else
   1.239 +/* minimum bytes per character */
   1.240 +#define MINBPC(enc) 1
   1.241 +#endif
   1.242 +
         +/* Classify the byte at p through the encoding's type[] table.  The table
         +   is only read, so the cast keeps the const qualifier of enc. */
   1.243 +#define SB_BYTE_TYPE(enc, p) \
   1.244 +  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
   1.245 +
         +/* BYTE_TYPE goes through the encoding's byteType pointer in XML_MIN_SIZE
         +   builds and straight to the table lookup otherwise. */
   1.246 +#ifdef XML_MIN_SIZE
   1.247 +static int PTRFASTCALL
   1.248 +sb_byteType(const ENCODING *enc, const char *p)
   1.249 +{
   1.250 +  return SB_BYTE_TYPE(enc, p);
   1.251 +}
   1.252 +#define BYTE_TYPE(enc, p) \
   1.253 + (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
   1.254 +#else
   1.255 +#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
   1.256 +#endif
   1.257 +
         +/* BYTE_TO_ASCII yields the byte at p; XML_MIN_SIZE builds reach it
         +   through the encoding's byteToAscii pointer. */
   1.258 +#ifdef XML_MIN_SIZE
   1.259 +#define BYTE_TO_ASCII(enc, p) \
   1.260 + (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
   1.261 +static int PTRFASTCALL
   1.262 +sb_byteToAscii(const ENCODING *enc, const char *p)
   1.263 +{
   1.264 +  return *p;
   1.265 +}
   1.266 +#else
   1.267 +#define BYTE_TO_ASCII(enc, p) (*(p))
   1.268 +#endif
   1.269 +
         +/* Multi-byte predicates: n (2, 3 or 4) is pasted onto the member name,
         +   selecting e.g. isName2 from the encoding. */
   1.270 +#define IS_NAME_CHAR(enc, p, n) \
   1.271 + (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
   1.272 +#define IS_NMSTRT_CHAR(enc, p, n) \
   1.273 + (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
   1.274 +#define IS_INVALID_CHAR(enc, p, n) \
   1.275 + (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
   1.276 +
         +/* Minimum-width predicates: dispatched through the encoding in
         +   XML_MIN_SIZE builds, constant 0 otherwise. */
   1.277 +#ifdef XML_MIN_SIZE
   1.278 +#define IS_NAME_CHAR_MINBPC(enc, p) \
   1.279 + (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
   1.280 +#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
   1.281 + (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
   1.282 +#else
   1.283 +#define IS_NAME_CHAR_MINBPC(enc, p) (0)
   1.284 +#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
   1.285 +#endif
   1.286 +
         +/* CHAR_MATCHES tests whether the byte at p equals c.  XML_MIN_SIZE builds
         +   dispatch through the encoding's charMatches pointer; otherwise the
         +   comparison is inlined, with c parenthesized so that any expression can
         +   be passed safely. */
   1.287 +#ifdef XML_MIN_SIZE
   1.288 +#define CHAR_MATCHES(enc, p, c) \
   1.289 + (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
   1.290 +static int PTRCALL
   1.291 +sb_charMatches(const ENCODING *enc, const char *p, int c)
   1.292 +{
   1.293 +  return *p == c;
   1.294 +}
   1.295 +#else
   1.296 +/* c is an ASCII character */
   1.297 +#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
   1.298 +#endif
   1.299 +
         +/* Instantiate the tokenizer template with the macros defined above; every
         +   function it defines through PREFIX() gets the normal_ prefix. */
   1.300 +#define PREFIX(ident) normal_ ## ident
   1.301 +#include "xmltok_impl.c"
   1.302 +
         +/* Drop the template parameters so they can be redefined for the next
         +   instantiation.  PREFIX itself stays defined. */
   1.303 +#undef MINBPC
   1.304 +#undef BYTE_TYPE
   1.305 +#undef BYTE_TO_ASCII
   1.306 +#undef CHAR_MATCHES
   1.307 +#undef IS_NAME_CHAR
   1.308 +#undef IS_NAME_CHAR_MINBPC
   1.309 +#undef IS_NMSTRT_CHAR
   1.310 +#undef IS_NMSTRT_CHAR_MINBPC
   1.311 +#undef IS_INVALID_CHAR
   1.312 +
         +/* Marker bits OR-ed into the first byte when a UTF-8 sequence is built
         +   (see latin1_toUtf8 and the UTF-16 to UTF-8 converter macro). */
   1.313 +enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
   1.314 +  UTF8_cval1 = 0x00,
   1.315 +  UTF8_cval2 = 0xc0,
   1.316 +  UTF8_cval3 = 0xe0,
   1.317 +  UTF8_cval4 = 0xf0
   1.318 +};
   1.319 +
         +/* Copy UTF-8 bytes from *fromP (up to fromLim) into *toP (up to toLim) and
         +   advance both cursors past what was copied.  enc is not used.
         +
         +   When the output has less room than the input, the copy is cut back to
         +   a character boundary so that no multi-byte sequence is split.  The
         +   previous loop stopped cutting back as soon as it met the lead byte of
         +   the straddling character and therefore still copied that lead byte on
         +   its own.  Here the length announced by the lead byte decides whether
         +   the character fits completely or is left for the next call.  As a
         +   consequence nothing is copied when the room is smaller than the first
         +   character. */
   1.320 +static void PTRCALL
   1.321 +utf8_toUtf8(const ENCODING *enc,
   1.322 +            const char **fromP, const char *fromLim,
   1.323 +            char **toP, const char *toLim)
   1.324 +{
   1.325 +  char *to;
   1.326 +  const char *from;
   1.327 +  if (fromLim - *fromP > toLim - *toP) {
         +    const char *cut = *fromP + (toLim - *toP);
         +    const char *q = cut;
         +    /* Step back over the continuation bytes (10xxxxxx) before the cut. */
         +    while (q > *fromP && ((unsigned char)q[-1] & 0xc0) == 0x80)
         +      q--;
         +    if (q > *fromP) {
         +      /* q[-1] starts the last character that reaches the cut. */
         +      const unsigned char lead = (unsigned char)q[-1];
         +      int trail = 0;  /* continuation bytes announced by the lead byte */
         +      if ((lead & 0xf8) == 0xf0)
         +        trail = 3;
         +      else if ((lead & 0xf0) == 0xe0)
         +        trail = 2;
         +      else if ((lead & 0xe0) == 0xc0)
         +        trail = 1;
         +      if (trail != 0) {
         +        if (cut - q >= trail)
         +          q += trail;  /* the whole character fits: keep it */
         +        else
         +          q--;         /* incomplete: leave its lead byte behind too */
         +      }
         +    }
         +    fromLim = q;
   1.332 +  }
   1.333 +  for (to = *toP, from = *fromP; from != fromLim; from++, to++)
   1.334 +    *to = *from;
   1.335 +  *fromP = from;
   1.336 +  *toP = to;
   1.337 +}
   1.338 +
         +/* Decode UTF-8 from *fromP (up to fromLim) into 16-bit units at *toP (up
         +   to toLim), then store the advanced cursors back.  The lead byte's
         +   class in the encoding's type[] table selects the sequence length.  A
         +   4 byte sequence produces a surrogate pair; if only one output unit is
         +   left, conversion stops before consuming that sequence.
         +   NOTE(review): the multi-byte cases read from[1..3] and advance by 2-4
         +   without comparing against fromLim - presumably callers only pass
         +   complete sequences; confirm. */
   1.339 +static void PTRCALL
   1.340 +utf8_toUtf16(const ENCODING *enc,
   1.341 +             const char **fromP, const char *fromLim,
   1.342 +             unsigned short **toP, const unsigned short *toLim)
   1.343 +{
   1.344 +  unsigned short *to = *toP;
   1.345 +  const char *from = *fromP;
   1.346 +  while (from != fromLim && to != toLim) {
   1.347 +    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
   1.348 +    case BT_LEAD2:
   1.349 +      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
   1.350 +      from += 2;
   1.351 +      break;
   1.352 +    case BT_LEAD3:
   1.353 +      *to++ = (unsigned short)(((from[0] & 0xf) << 12)
   1.354 +                               | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
   1.355 +      from += 3;
   1.356 +      break;
   1.357 +    case BT_LEAD4:
   1.358 +      {
   1.359 +        unsigned long n;
         +        /* A surrogate pair needs two output units. */
   1.360 +        if (to + 1 == toLim)
   1.361 +          goto after;
   1.362 +        n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
   1.363 +            | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
   1.364 +        n -= 0x10000;
   1.365 +        to[0] = (unsigned short)((n >> 10) | 0xD800);
   1.366 +        to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
   1.367 +        to += 2;
   1.368 +        from += 4;
   1.369 +      }
   1.370 +      break;
   1.371 +    default:
         +      /* Every other byte class is copied through as one unit. */
   1.372 +      *to++ = *from++;
   1.373 +      break;
   1.374 +    }
   1.375 +  }
   1.376 +after:
   1.377 +  *fromP = from;
   1.378 +  *toP = to;
   1.379 +}
   1.380 +
         +/* UTF-8 encoding objects.  Each one combines the normal_ tokenizer
         +   functions (VTABLE1), the UTF-8 converters, a byte table assembled from
         +   two included headers, and the utf8_ predicates.  The three integers
         +   after the converters are presumably minBytesPerChar, isUtf8 and
         +   isUtf16 from ENCODING - confirm against xmltok.h.
         +   The _ns variants are built only with XML_NS.  The non-_ns variants
         +   define BT_COLON as BT_NMSTRT while the ASCII table is included, so a
         +   colon gets the same class as a name start character.  The internal_
         +   variants take their ASCII part from iasciitab.h instead of
         +   asciitab.h. */
   1.381 +#ifdef XML_NS
   1.382 +static const struct normal_encoding utf8_encoding_ns = {
   1.383 +  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
   1.384 +  {
   1.385 +#include "asciitab.h"
   1.386 +#include "utf8tab.h"
   1.387 +  },
   1.388 +  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
   1.389 +};
   1.390 +#endif
   1.391 +
   1.392 +static const struct normal_encoding utf8_encoding = {
   1.393 +  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
   1.394 +  {
   1.395 +#define BT_COLON BT_NMSTRT
   1.396 +#include "asciitab.h"
   1.397 +#undef BT_COLON
   1.398 +#include "utf8tab.h"
   1.399 +  },
   1.400 +  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
   1.401 +};
   1.402 +
   1.403 +#ifdef XML_NS
   1.404 +
   1.405 +static const struct normal_encoding internal_utf8_encoding_ns = {
   1.406 +  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
   1.407 +  {
   1.408 +#include "iasciitab.h"
   1.409 +#include "utf8tab.h"
   1.410 +  },
   1.411 +  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
   1.412 +};
   1.413 +
   1.414 +#endif
   1.415 +
   1.416 +static const struct normal_encoding internal_utf8_encoding = {
   1.417 +  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
   1.418 +  {
   1.419 +#define BT_COLON BT_NMSTRT
   1.420 +#include "iasciitab.h"
   1.421 +#undef BT_COLON
   1.422 +#include "utf8tab.h"
   1.423 +  },
   1.424 +  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
   1.425 +};
   1.426 +
         +/* Convert Latin-1 bytes to UTF-8, advancing *fromP and *toP in place.
         +   Bytes below 0x80 are copied unchanged; the others become a two byte
         +   sequence.  Conversion stops at the end of the input or when the next
         +   character no longer fits in the output.  enc is not used. */
   1.427 +static void PTRCALL
   1.428 +latin1_toUtf8(const ENCODING *enc,
   1.429 +              const char **fromP, const char *fromLim,
   1.430 +              char **toP, const char *toLim)
   1.431 +{
         +  while (*fromP != fromLim) {
         +    const unsigned char ch = (unsigned char)**fromP;
         +    if ((ch & 0x80) == 0) {
         +      /* Plain ASCII: needs a single output byte. */
         +      if (*toP == toLim)
         +        break;
         +      **toP = **fromP;
         +      ++*toP;
         +      ++*fromP;
         +      continue;
         +    }
         +    /* High half: needs room for a two byte sequence. */
         +    if (toLim - *toP < 2)
         +      break;
         +    **toP = (char)((ch >> 6) | UTF8_cval2);
         +    ++*toP;
         +    **toP = (char)((ch & 0x3f) | 0x80);
         +    ++*toP;
         +    ++*fromP;
   1.449 +  }
   1.450 +}
   1.451 +
         +/* Widen Latin-1 bytes to 16-bit units one for one, advancing *fromP and
         +   *toP until either side is exhausted.  enc is not used. */
   1.452 +static void PTRCALL
   1.453 +latin1_toUtf16(const ENCODING *enc,
   1.454 +               const char **fromP, const char *fromLim,
   1.455 +               unsigned short **toP, const unsigned short *toLim)
   1.456 +{
         +  for (;;) {
         +    if (*fromP == fromLim || *toP == toLim)
         +      break;
         +    **toP = (unsigned char)**fromP;
         +    ++*toP;
         +    ++*fromP;
         +  }
   1.459 +}
   1.460 +
         +/* Latin-1 encoding objects: normal_ tokenizer functions, the Latin-1
         +   converters, and a byte table built from asciitab.h plus latin1tab.h.
         +   NORMAL_VTABLE is not listed, so the multi-byte predicate members are
         +   left to default (zero) initialization.  The _ns variant is built only
         +   with XML_NS; the other one classifies the colon as BT_NMSTRT. */
   1.461 +#ifdef XML_NS
   1.462 +
   1.463 +static const struct normal_encoding latin1_encoding_ns = {
   1.464 +  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
   1.465 +  {
   1.466 +#include "asciitab.h"
   1.467 +#include "latin1tab.h"
   1.468 +  },
   1.469 +  STANDARD_VTABLE(sb_)
   1.470 +};
   1.471 +
   1.472 +#endif
   1.473 +
   1.474 +static const struct normal_encoding latin1_encoding = {
   1.475 +  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
   1.476 +  {
   1.477 +#define BT_COLON BT_NMSTRT
   1.478 +#include "asciitab.h"
   1.479 +#undef BT_COLON
   1.480 +#include "latin1tab.h"
   1.481 +  },
   1.482 +  STANDARD_VTABLE(sb_)
   1.483 +};
   1.484 +
         +/* Copy bytes unchanged from *fromP to *toP, advancing both cursors until
         +   the input ends or the output is full.  enc is not used. */
   1.485 +static void PTRCALL
   1.486 +ascii_toUtf8(const ENCODING *enc,
   1.487 +             const char **fromP, const char *fromLim,
   1.488 +             char **toP, const char *toLim)
   1.489 +{
         +  for (;;) {
         +    if (*fromP == fromLim || *toP == toLim)
         +      break;
         +    **toP = **fromP;
         +    ++*toP;
         +    ++*fromP;
         +  }
   1.492 +}
   1.493 +
         +/* US-ASCII encoding objects: normal_ tokenizer functions, ascii_toUtf8
         +   and the Latin-1 UTF-16 converter.  Only asciitab.h fills the byte
         +   table, so the remaining entries stay zero, which the existing comment
         +   identifies as BT_NONXML.  The _ns variant is built only with XML_NS;
         +   the other one classifies the colon as BT_NMSTRT. */
   1.494 +#ifdef XML_NS
   1.495 +
   1.496 +static const struct normal_encoding ascii_encoding_ns = {
   1.497 +  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
   1.498 +  {
   1.499 +#include "asciitab.h"
   1.500 +/* BT_NONXML == 0 */
   1.501 +  },
   1.502 +  STANDARD_VTABLE(sb_)
   1.503 +};
   1.504 +
   1.505 +#endif
   1.506 +
   1.507 +static const struct normal_encoding ascii_encoding = {
   1.508 +  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
   1.509 +  {
   1.510 +#define BT_COLON BT_NMSTRT
   1.511 +#include "asciitab.h"
   1.512 +#undef BT_COLON
   1.513 +/* BT_NONXML == 0 */
   1.514 +  },
   1.515 +  STANDARD_VTABLE(sb_)
   1.516 +};
   1.517 +
         +/* Classify a 16-bit unit given as its high and low byte: 0xD8..0xDB in
         +   the high byte is the first half of a surrogate pair (BT_LEAD4),
         +   0xDC..0xDF the second half (BT_TRAIL), 0xFFFE and 0xFFFF are
         +   BT_NONXML, and everything else is BT_NONASCII. */
   1.518 +static int PTRFASTCALL
   1.519 +unicode_byte_type(char hi, char lo)
   1.520 +{
         +  const unsigned char high = (unsigned char)hi;
         +  const unsigned char low = (unsigned char)lo;
         +  if (high >= 0xD8 && high <= 0xDB)
         +    return BT_LEAD4;
         +  if (high >= 0xDC && high <= 0xDF)
         +    return BT_TRAIL;
         +  if (high == 0xFF && (low == 0xFF || low == 0xFE))
         +    return BT_NONXML;
   1.534 +  return BT_NONASCII;
   1.535 +}
   1.536 +
         +/* Generates E##toUtf8, a UTF-16 to UTF-8 converter.  Byte order comes
         +   from the GET_LO/GET_HI macros in effect where this macro is expanded.
         +   Each 16-bit unit becomes 1, 2 or 3 output bytes depending on its
         +   value; a unit whose high byte is 0xD8..0xDB is combined with the
         +   following unit into a 4 byte sequence.  Before each character the
         +   remaining output room is checked; if it is too small, *fromP is set to
         +   the unconverted unit and the function returns.  *toP is advanced in
         +   place as bytes are written.
         +   NOTE(review): the second unit of a surrogate pair is read without
         +   comparing against fromLim - presumably callers pass only complete
         +   pairs; confirm. */
   1.537 +#define DEFINE_UTF16_TO_UTF8(E) \
   1.538 +static void  PTRCALL \
   1.539 +E ## toUtf8(const ENCODING *enc, \
   1.540 +            const char **fromP, const char *fromLim, \
   1.541 +            char **toP, const char *toLim) \
   1.542 +{ \
   1.543 +  const char *from; \
   1.544 +  for (from = *fromP; from != fromLim; from += 2) { \
   1.545 +    int plane; \
   1.546 +    unsigned char lo2; \
   1.547 +    unsigned char lo = GET_LO(from); \
   1.548 +    unsigned char hi = GET_HI(from); \
   1.549 +    switch (hi) { \
   1.550 +    case 0: \
   1.551 +      if (lo < 0x80) { \
   1.552 +        if (*toP == toLim) { \
   1.553 +          *fromP = from; \
   1.554 +          return; \
   1.555 +        } \
   1.556 +        *(*toP)++ = lo; \
   1.557 +        break; \
   1.558 +      } \
   1.559 +      /* fall through */ \
   1.560 +    case 0x1: case 0x2: case 0x3: \
   1.561 +    case 0x4: case 0x5: case 0x6: case 0x7: \
   1.562 +      if (toLim -  *toP < 2) { \
   1.563 +        *fromP = from; \
   1.564 +        return; \
   1.565 +      } \
   1.566 +      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
   1.567 +      *(*toP)++ = ((lo & 0x3f) | 0x80); \
   1.568 +      break; \
   1.569 +    default: \
   1.570 +      if (toLim -  *toP < 3)  { \
   1.571 +        *fromP = from; \
   1.572 +        return; \
   1.573 +      } \
   1.574 +      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
   1.575 +      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
   1.576 +      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
   1.577 +      *(*toP)++ = ((lo & 0x3f) | 0x80); \
   1.578 +      break; \
   1.579 +    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
   1.580 +      if (toLim -  *toP < 4) { \
   1.581 +        *fromP = from; \
   1.582 +        return; \
   1.583 +      } \
   1.584 +      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
   1.585 +      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
   1.586 +      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
   1.587 +      from += 2; \
   1.588 +      lo2 = GET_LO(from); \
   1.589 +      *(*toP)++ = (((lo & 0x3) << 4) \
   1.590 +                   | ((GET_HI(from) & 0x3) << 2) \
   1.591 +                   | (lo2 >> 6) \
   1.592 +                   | 0x80); \
   1.593 +      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
   1.594 +      break; \
   1.595 +    } \
   1.596 +  } \
   1.597 +  *fromP = from; \
   1.598 +}
   1.599 +
         +/* Generates E##toUtf16, which copies 16-bit units from the byte stream
         +   at *fromP into *toP using the GET_HI/GET_LO macros in effect where
         +   this macro is expanded, advancing both cursors in place.  When the
         +   input holds more bytes than the output has room for and the high byte
         +   of the last input unit, masked with 0xF8, equals 0xD8, that unit is
         +   left out of this call. */
   1.600 +#define DEFINE_UTF16_TO_UTF16(E) \
   1.601 +static void  PTRCALL \
   1.602 +E ## toUtf16(const ENCODING *enc, \
   1.603 +             const char **fromP, const char *fromLim, \
   1.604 +             unsigned short **toP, const unsigned short *toLim) \
   1.605 +{ \
   1.606 +  /* Avoid copying first half only of surrogate */ \
   1.607 +  if (fromLim - *fromP > ((toLim - *toP) << 1) \
   1.608 +      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
   1.609 +    fromLim -= 2; \
   1.610 +  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
   1.611 +    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
   1.612 +}
   1.613 +
         +/* Little-endian instantiation: the low byte of each unit comes first.
         +   Expands to little2_toUtf8 and little2_toUtf16. */
   1.614 +#define SET2(ptr, ch) \
   1.615 +  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
   1.616 +#define GET_LO(ptr) ((unsigned char)(ptr)[0])
   1.617 +#define GET_HI(ptr) ((unsigned char)(ptr)[1])
   1.618 +
   1.619 +DEFINE_UTF16_TO_UTF8(little2_)
   1.620 +DEFINE_UTF16_TO_UTF16(little2_)
   1.621 +
   1.622 +#undef SET2
   1.623 +#undef GET_LO
   1.624 +#undef GET_HI
   1.625 +
         +/* Big-endian instantiation: the high byte of each unit comes first.
         +   Expands to big2_toUtf8 and big2_toUtf16. */
   1.626 +#define SET2(ptr, ch) \
   1.627 +  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
   1.628 +#define GET_LO(ptr) ((unsigned char)(ptr)[1])
   1.629 +#define GET_HI(ptr) ((unsigned char)(ptr)[0])
   1.630 +
   1.631 +DEFINE_UTF16_TO_UTF8(big2_)
   1.632 +DEFINE_UTF16_TO_UTF16(big2_)
   1.633 +
   1.634 +#undef SET2
   1.635 +#undef GET_LO
   1.636 +#undef GET_HI
   1.637 +
         +/* Character tests for little-endian 16-bit units: p[0] is the low byte
         +   and p[1] the high byte.  Units with a zero high byte are classified
         +   through the encoding's type[] table, all others through
         +   unicode_byte_type.  Every macro parameter is parenthesized and the
         +   table access keeps the const qualifier of enc. */
   1.638 +#define LITTLE2_BYTE_TYPE(enc, p) \
   1.639 + ((p)[1] == 0 \
   1.640 +  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
   1.641 +  : unicode_byte_type((p)[1], (p)[0]))
         +/* The low byte when the high byte is zero, otherwise -1. */
   1.642 +#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
   1.643 +#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
   1.644 +#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
   1.645 +  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
   1.646 +#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
   1.647 +  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
   1.648 +
         +/* Two ways of wiring up the little-endian encoding.  With XML_MIN_SIZE
         +   the LITTLE2_ macros are wrapped in functions (referenced by
         +   STANDARD_VTABLE(little2_)) and VTABLE is redefined to name the
         +   little2_ converters directly.  Without it the tokenizer template is
         +   instantiated a second time under the little2_ prefix with a fixed
         +   character width of 2. */
   1.649 +#ifdef XML_MIN_SIZE
   1.650 +
   1.651 +static int PTRFASTCALL
   1.652 +little2_byteType(const ENCODING *enc, const char *p)
   1.653 +{
   1.654 +  return LITTLE2_BYTE_TYPE(enc, p);
   1.655 +}
   1.656 +
   1.657 +static int PTRFASTCALL
   1.658 +little2_byteToAscii(const ENCODING *enc, const char *p)
   1.659 +{
   1.660 +  return LITTLE2_BYTE_TO_ASCII(enc, p);
   1.661 +}
   1.662 +
   1.663 +static int PTRCALL
   1.664 +little2_charMatches(const ENCODING *enc, const char *p, int c)
   1.665 +{
   1.666 +  return LITTLE2_CHAR_MATCHES(enc, p, c);
   1.667 +}
   1.668 +
   1.669 +static int PTRFASTCALL
   1.670 +little2_isNameMin(const ENCODING *enc, const char *p)
   1.671 +{
   1.672 +  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
   1.673 +}
   1.674 +
   1.675 +static int PTRFASTCALL
   1.676 +little2_isNmstrtMin(const ENCODING *enc, const char *p)
   1.677 +{
   1.678 +  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
   1.679 +}
   1.680 +
   1.681 +#undef VTABLE
   1.682 +#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
   1.683 +
   1.684 +#else /* not XML_MIN_SIZE */
   1.685 +
   1.686 +#undef PREFIX
   1.687 +#define PREFIX(ident) little2_ ## ident
   1.688 +#define MINBPC(enc) 2
   1.689 +/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
   1.690 +#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
   1.691 +#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
   1.692 +#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
         +/* The multi-byte predicates are constant 0 for this instantiation. */
   1.693 +#define IS_NAME_CHAR(enc, p, n) 0
   1.694 +#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
   1.695 +#define IS_NMSTRT_CHAR(enc, p, n) (0)
   1.696 +#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
   1.697 +
   1.698 +#include "xmltok_impl.c"
   1.699 +
   1.700 +#undef MINBPC
   1.701 +#undef BYTE_TYPE
   1.702 +#undef BYTE_TO_ASCII
   1.703 +#undef CHAR_MATCHES
   1.704 +#undef IS_NAME_CHAR
   1.705 +#undef IS_NAME_CHAR_MINBPC
   1.706 +#undef IS_NMSTRT_CHAR
   1.707 +#undef IS_NMSTRT_CHAR_MINBPC
   1.708 +#undef IS_INVALID_CHAR
   1.709 +
   1.710 +#endif /* not XML_MIN_SIZE */
   1.711 +
         +/* Little-endian 16-bit encoding objects.  The byte table (ASCII part
         +   plus latin1tab.h) serves units whose high byte is zero, see
         +   LITTLE2_BYTE_TYPE.  The last integer of the ENCODING part is 1 only
         +   when BYTEORDER is 1234 - presumably the "is native UTF-16" flag;
         +   confirm against xmltok.h.  The _ns variants need XML_NS, the non-_ns
         +   variants classify the colon as BT_NMSTRT, and the internal_ variants
         +   exist unless BYTEORDER is 4321. */
   1.712 +#ifdef XML_NS
   1.713 +
   1.714 +static const struct normal_encoding little2_encoding_ns = {
   1.715 +  { VTABLE, 2, 0,
   1.716 +#if BYTEORDER == 1234
   1.717 +    1
   1.718 +#else
   1.719 +    0
   1.720 +#endif
   1.721 +  },
   1.722 +  {
   1.723 +#include "asciitab.h"
   1.724 +#include "latin1tab.h"
   1.725 +  },
   1.726 +  STANDARD_VTABLE(little2_)
   1.727 +};
   1.728 +
   1.729 +#endif
   1.730 +
   1.731 +static const struct normal_encoding little2_encoding = {
   1.732 +  { VTABLE, 2, 0,
   1.733 +#if BYTEORDER == 1234
   1.734 +    1
   1.735 +#else
   1.736 +    0
   1.737 +#endif
   1.738 +  },
   1.739 +  {
   1.740 +#define BT_COLON BT_NMSTRT
   1.741 +#include "asciitab.h"
   1.742 +#undef BT_COLON
   1.743 +#include "latin1tab.h"
   1.744 +  },
   1.745 +  STANDARD_VTABLE(little2_)
   1.746 +};
   1.747 +
   1.748 +#if BYTEORDER != 4321
   1.749 +
   1.750 +#ifdef XML_NS
   1.751 +
   1.752 +static const struct normal_encoding internal_little2_encoding_ns = {
   1.753 +  { VTABLE, 2, 0, 1 },
   1.754 +  {
   1.755 +#include "iasciitab.h"
   1.756 +#include "latin1tab.h"
   1.757 +  },
   1.758 +  STANDARD_VTABLE(little2_)
   1.759 +};
   1.760 +
   1.761 +#endif
   1.762 +
   1.763 +static const struct normal_encoding internal_little2_encoding = {
   1.764 +  { VTABLE, 2, 0, 1 },
   1.765 +  {
   1.766 +#define BT_COLON BT_NMSTRT
   1.767 +#include "iasciitab.h"
   1.768 +#undef BT_COLON
   1.769 +#include "latin1tab.h"
   1.770 +  },
   1.771 +  STANDARD_VTABLE(little2_)
   1.772 +};
   1.773 +
   1.774 +#endif
   1.775 +
   1.776 +
         +/* Character tests for big-endian 16-bit units: p[0] is the high byte and
         +   p[1] the low byte.  Units with a zero high byte are classified through
         +   the encoding's type[] table, all others through unicode_byte_type.
         +   Every macro parameter is parenthesized and the table access keeps the
         +   const qualifier of enc. */
   1.777 +#define BIG2_BYTE_TYPE(enc, p) \
   1.778 + ((p)[0] == 0 \
   1.779 +  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
   1.780 +  : unicode_byte_type((p)[0], (p)[1]))
         +/* The low byte when the high byte is zero, otherwise -1. */
   1.781 +#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
   1.782 +#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
   1.783 +#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
   1.784 +  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
   1.785 +#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
   1.786 +  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
   1.787 +
         +/* Two ways of wiring up the big-endian encoding.  With XML_MIN_SIZE the
         +   BIG2_ macros are wrapped in functions (referenced by
         +   STANDARD_VTABLE(big2_)) and VTABLE is redefined to name the big2_
         +   converters directly.  Without it the tokenizer template is
         +   instantiated again under the big2_ prefix with a fixed character
         +   width of 2. */
   1.788 +#ifdef XML_MIN_SIZE
   1.789 +
   1.790 +static int PTRFASTCALL
   1.791 +big2_byteType(const ENCODING *enc, const char *p)
   1.792 +{
   1.793 +  return BIG2_BYTE_TYPE(enc, p);
   1.794 +}
   1.795 +
   1.796 +static int PTRFASTCALL
   1.797 +big2_byteToAscii(const ENCODING *enc, const char *p)
   1.798 +{
   1.799 +  return BIG2_BYTE_TO_ASCII(enc, p);
   1.800 +}
   1.801 +
   1.802 +static int PTRCALL
   1.803 +big2_charMatches(const ENCODING *enc, const char *p, int c)
   1.804 +{
   1.805 +  return BIG2_CHAR_MATCHES(enc, p, c);
   1.806 +}
   1.807 +
   1.808 +static int PTRFASTCALL
   1.809 +big2_isNameMin(const ENCODING *enc, const char *p)
   1.810 +{
   1.811 +  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
   1.812 +}
   1.813 +
   1.814 +static int PTRFASTCALL
   1.815 +big2_isNmstrtMin(const ENCODING *enc, const char *p)
   1.816 +{
   1.817 +  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
   1.818 +}
   1.819 +
   1.820 +#undef VTABLE
   1.821 +#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
   1.822 +
   1.823 +#else /* not XML_MIN_SIZE */
   1.824 +
   1.825 +#undef PREFIX
   1.826 +#define PREFIX(ident) big2_ ## ident
   1.827 +#define MINBPC(enc) 2
   1.828 +/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
   1.829 +#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
   1.830 +#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
   1.831 +#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
         +/* The multi-byte predicates are constant 0 for this instantiation. */
   1.832 +#define IS_NAME_CHAR(enc, p, n) 0
   1.833 +#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
   1.834 +#define IS_NMSTRT_CHAR(enc, p, n) (0)
   1.835 +#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
   1.836 +
   1.837 +#include "xmltok_impl.c"
   1.838 +
   1.839 +#undef MINBPC
   1.840 +#undef BYTE_TYPE
   1.841 +#undef BYTE_TO_ASCII
   1.842 +#undef CHAR_MATCHES
   1.843 +#undef IS_NAME_CHAR
   1.844 +#undef IS_NAME_CHAR_MINBPC
   1.845 +#undef IS_NMSTRT_CHAR
   1.846 +#undef IS_NMSTRT_CHAR_MINBPC
   1.847 +#undef IS_INVALID_CHAR
   1.848 +
   1.849 +#endif /* not XML_MIN_SIZE */
   1.850 +
         +/* Big-endian 16-bit encoding objects.  The byte table (ASCII part plus
         +   latin1tab.h) serves units whose high byte is zero, see BIG2_BYTE_TYPE.
         +   The last integer of the ENCODING part is 1 only when BYTEORDER is
         +   4321 - presumably the "is native UTF-16" flag; confirm against
         +   xmltok.h.  The _ns variants need XML_NS, the non-_ns variants classify
         +   the colon as BT_NMSTRT, and the internal_ variants exist unless
         +   BYTEORDER is 1234. */
   1.851 +#ifdef XML_NS
   1.852 +
   1.853 +static const struct normal_encoding big2_encoding_ns = {
   1.854 +  { VTABLE, 2, 0,
   1.855 +#if BYTEORDER == 4321
   1.856 +  1
   1.857 +#else
   1.858 +  0
   1.859 +#endif
   1.860 +  },
   1.861 +  {
   1.862 +#include "asciitab.h"
   1.863 +#include "latin1tab.h"
   1.864 +  },
   1.865 +  STANDARD_VTABLE(big2_)
   1.866 +};
   1.867 +
   1.868 +#endif
   1.869 +
   1.870 +static const struct normal_encoding big2_encoding = {
   1.871 +  { VTABLE, 2, 0,
   1.872 +#if BYTEORDER == 4321
   1.873 +  1
   1.874 +#else
   1.875 +  0
   1.876 +#endif
   1.877 +  },
   1.878 +  {
   1.879 +#define BT_COLON BT_NMSTRT
   1.880 +#include "asciitab.h"
   1.881 +#undef BT_COLON
   1.882 +#include "latin1tab.h"
   1.883 +  },
   1.884 +  STANDARD_VTABLE(big2_)
   1.885 +};
   1.886 +
   1.887 +#if BYTEORDER != 1234
   1.888 +
   1.889 +#ifdef XML_NS
   1.890 +
   1.891 +static const struct normal_encoding internal_big2_encoding_ns = {
   1.892 +  { VTABLE, 2, 0, 1 },
   1.893 +  {
   1.894 +#include "iasciitab.h"
   1.895 +#include "latin1tab.h"
   1.896 +  },
   1.897 +  STANDARD_VTABLE(big2_)
   1.898 +};
   1.899 +
   1.900 +#endif
   1.901 +
   1.902 +static const struct normal_encoding internal_big2_encoding = {
   1.903 +  { VTABLE, 2, 0, 1 },
   1.904 +  {
   1.905 +#define BT_COLON BT_NMSTRT
   1.906 +#include "iasciitab.h"
   1.907 +#undef BT_COLON
   1.908 +#include "latin1tab.h"
   1.909 +  },
   1.910 +  STANDARD_VTABLE(big2_)
   1.911 +};
   1.912 +
   1.913 +#endif
   1.914 +
         +/* No further template instantiations follow from here on. */
   1.915 +#undef PREFIX
   1.916 +
         +/* Compare two NUL-terminated strings, treating the ASCII letters a-z as
         +   equal to A-Z.  Returns 1 when they match over their whole length and 0
         +   at the first difference. */
   1.917 +static int FASTCALL
   1.918 +streqci(const char *s1, const char *s2)
   1.919 +{
         +  char a, b;
         +  do {
         +    a = *s1++;
         +    b = *s2++;
         +    /* Fold lower case onto upper case before comparing. */
         +    if (a >= ASCII_a && a <= ASCII_z)
         +      a += ASCII_A - ASCII_a;
         +    if (b >= ASCII_a && b <= ASCII_z)
         +      b += ASCII_A - ASCII_a;
         +    if (a != b)
         +      return 0;
         +  } while (a);
   1.932 +  return 1;
   1.933 +}
   1.934 +
   1.935 +static void PTRCALL
   1.936 +initUpdatePosition(const ENCODING *enc, const char *ptr,
   1.937 +                   const char *end, POSITION *pos)
   1.938 +{
   1.939 +  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
   1.940 +}
   1.941 +
   1.942 +static int
   1.943 +toAscii(const ENCODING *enc, const char *ptr, const char *end)
   1.944 +{
   1.945 +  char buf[1];
   1.946 +  char *p = buf;
   1.947 +  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
   1.948 +  if (p == buf)
   1.949 +    return -1;
   1.950 +  else
   1.951 +    return buf[0];
   1.952 +}
   1.953 +
   1.954 +static int FASTCALL
   1.955 +isSpace(int c)
   1.956 +{
   1.957 +  switch (c) {
   1.958 +  case 0x20:
   1.959 +  case 0xD:
   1.960 +  case 0xA:
   1.961 +  case 0x9:
   1.962 +    return 1;
   1.963 +  }
   1.964 +  return 0;
   1.965 +}
   1.966 +
   1.967 +/* Return 1 if there's just optional white space or there's an S
   1.968 +   followed by name=val.
   1.969 +*/
   1.970 +static int
   1.971 +parsePseudoAttribute(const ENCODING *enc,
   1.972 +                     const char *ptr,
   1.973 +                     const char *end,
   1.974 +                     const char **namePtr,
   1.975 +                     const char **nameEndPtr,
   1.976 +                     const char **valPtr,
   1.977 +                     const char **nextTokPtr)
   1.978 +{
   1.979 +  int c;
   1.980 +  char open;
   1.981 +  if (ptr == end) {
   1.982 +    *namePtr = NULL;
   1.983 +    return 1;
   1.984 +  }
   1.985 +  if (!isSpace(toAscii(enc, ptr, end))) {
   1.986 +    *nextTokPtr = ptr;
   1.987 +    return 0;
   1.988 +  }
   1.989 +  do {
   1.990 +    ptr += enc->minBytesPerChar;
   1.991 +  } while (isSpace(toAscii(enc, ptr, end)));
   1.992 +  if (ptr == end) {
   1.993 +    *namePtr = NULL;
   1.994 +    return 1;
   1.995 +  }
   1.996 +  *namePtr = ptr;
   1.997 +  for (;;) {
   1.998 +    c = toAscii(enc, ptr, end);
   1.999 +    if (c == -1) {
  1.1000 +      *nextTokPtr = ptr;
  1.1001 +      return 0;
  1.1002 +    }
  1.1003 +    if (c == ASCII_EQUALS) {
  1.1004 +      *nameEndPtr = ptr;
  1.1005 +      break;
  1.1006 +    }
  1.1007 +    if (isSpace(c)) {
  1.1008 +      *nameEndPtr = ptr;
  1.1009 +      do {
  1.1010 +        ptr += enc->minBytesPerChar;
  1.1011 +      } while (isSpace(c = toAscii(enc, ptr, end)));
  1.1012 +      if (c != ASCII_EQUALS) {
  1.1013 +        *nextTokPtr = ptr;
  1.1014 +        return 0;
  1.1015 +      }
  1.1016 +      break;
  1.1017 +    }
  1.1018 +    ptr += enc->minBytesPerChar;
  1.1019 +  }
  1.1020 +  if (ptr == *namePtr) {
  1.1021 +    *nextTokPtr = ptr;
  1.1022 +    return 0;
  1.1023 +  }
  1.1024 +  ptr += enc->minBytesPerChar;
  1.1025 +  c = toAscii(enc, ptr, end);
  1.1026 +  while (isSpace(c)) {
  1.1027 +    ptr += enc->minBytesPerChar;
  1.1028 +    c = toAscii(enc, ptr, end);
  1.1029 +  }
  1.1030 +  if (c != ASCII_QUOT && c != ASCII_APOS) {
  1.1031 +    *nextTokPtr = ptr;
  1.1032 +    return 0;
  1.1033 +  }
  1.1034 +  open = (char)c;
  1.1035 +  ptr += enc->minBytesPerChar;
  1.1036 +  *valPtr = ptr;
  1.1037 +  for (;; ptr += enc->minBytesPerChar) {
  1.1038 +    c = toAscii(enc, ptr, end);
  1.1039 +    if (c == open)
  1.1040 +      break;
  1.1041 +    if (!(ASCII_a <= c && c <= ASCII_z)
  1.1042 +        && !(ASCII_A <= c && c <= ASCII_Z)
  1.1043 +        && !(ASCII_0 <= c && c <= ASCII_9)
  1.1044 +        && c != ASCII_PERIOD
  1.1045 +        && c != ASCII_MINUS
  1.1046 +        && c != ASCII_UNDERSCORE) {
  1.1047 +      *nextTokPtr = ptr;
  1.1048 +      return 0;
  1.1049 +    }
  1.1050 +  }
  1.1051 +  *nextTokPtr = ptr + enc->minBytesPerChar;
  1.1052 +  return 1;
  1.1053 +}
  1.1054 +
  /* NUL-terminated keyword strings used by doParseXmlDecl when matching
     pseudo-attribute names and values with XmlNameMatchesAscii.  They are
     spelled with ASCII_* constants instead of string literals, presumably
     so the bytes are ASCII regardless of the compiler's execution
     character set -- confirm against ascii.h. */
  1.1055 +static const char KW_version[] = {
  1.1056 +  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
  1.1057 +};
  1.1058 +
  1.1059 +static const char KW_encoding[] = {
  1.1060 +  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
  1.1061 +};
  1.1062 +
  1.1063 +static const char KW_standalone[] = {
  1.1064 +  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  1.1065 +  ASCII_n, ASCII_e, '\0'
  1.1066 +};
  1.1067 +
  /* Accepted values of the standalone pseudo-attribute. */
  1.1068 +static const char KW_yes[] = {
  1.1069 +  ASCII_y, ASCII_e, ASCII_s,  '\0'
  1.1070 +};
  1.1071 +
  1.1072 +static const char KW_no[] = {
  1.1073 +  ASCII_n, ASCII_o,  '\0'
  1.1074 +};
  1.1075 +
  1.1076 +/* BEGIN MOZILLA CHANGE (http://bugzilla.mozilla.org/show_bug.cgi?id=62157) */
  /* The only version value doParseXmlDecl accepts: "1.0". */
  1.1077 +static const char KW_XML_1_0[] = {
  1.1078 +  ASCII_1, ASCII_PERIOD, ASCII_0, '\0'
  1.1079 +};
  1.1080 +/* END MOZILLA CHANGE */
  1.1081 +
  /* Parse the pseudo-attributes of an XML declaration or text declaration.

     ptr/end delimit the declaration.  The first 5 characters and the last
     2 characters are skipped before parsing (presumably "<?xml" and "?>"
     -- confirm against the callers).  Pseudo-attributes are accepted in
     the order version, encoding, standalone.  When isGeneralTextEntity is
     non-zero, version may be omitted, encoding is required and standalone
     is rejected.

     Outputs (each optional pointer is written only when it is non-NULL):
       *versionPtr     first character of the version value
       *versionEndPtr  position just after the version's closing quote
       *encodingName   first character of the encoding value
       *encoding       result of encodingFinder() for the encoding value
       *standalone     1 for "yes", 0 for "no"

     Returns 1 on success.  On failure returns 0 and stores the offending
     position in *badPtr. */
  1.1082 +static int
  1.1083 +doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
  1.1084 +                                                 const char *,
  1.1085 +                                                 const char *),
  1.1086 +               int isGeneralTextEntity,
  1.1087 +               const ENCODING *enc,
  1.1088 +               const char *ptr,
  1.1089 +               const char *end,
  1.1090 +               const char **badPtr,
  1.1091 +               const char **versionPtr,
  1.1092 +               const char **versionEndPtr,
  1.1093 +               const char **encodingName,
  1.1094 +               const ENCODING **encoding,
  1.1095 +               int *standalone)
  1.1096 +{
  1.1097 +  const char *val = NULL;
  1.1098 +  const char *name = NULL;
  1.1099 +  const char *nameEnd = NULL;
  /* Narrow [ptr, end) to the region holding the pseudo-attributes. */
  1.1100 +  ptr += 5 * enc->minBytesPerChar;
  1.1101 +  end -= 2 * enc->minBytesPerChar;
  /* At least one pseudo-attribute is mandatory. */
  1.1102 +  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
  1.1103 +      || !name) {
  1.1104 +    *badPtr = ptr;
  1.1105 +    return 0;
  1.1106 +  }
  /* A first attribute other than "version" is tolerated only for text
     declarations; it is then examined as "encoding" below. */
  1.1107 +  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
  1.1108 +    if (!isGeneralTextEntity) {
  1.1109 +      *badPtr = name;
  1.1110 +      return 0;
  1.1111 +    }
  1.1112 +  }
  1.1113 +  else {
  1.1114 +    if (versionPtr)
  1.1115 +      *versionPtr = val;
  1.1116 +    if (versionEndPtr)
  1.1117 +      *versionEndPtr = ptr;
  1.1118 +/* BEGIN MOZILLA CHANGE (http://bugzilla.mozilla.org/show_bug.cgi?id=62157) */
  1.1119 +     /* Anything else but a version="1.0" is invalid for us, until we support later versions. */
  /* ptr is one character past the closing quote, so the value ends at
     ptr - minBytesPerChar. */
  1.1120 +     if (!XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_XML_1_0)) {
  1.1121 +       *badPtr = val;
  1.1122 +       return 0;
  1.1123 +     }
  1.1124 +/* END MOZILLA CHANGE */
  /* Read the attribute following version, if any. */
  1.1125 +    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
  1.1126 +      *badPtr = ptr;
  1.1127 +      return 0;
  1.1128 +    }
  1.1129 +    if (!name) {
  1.1130 +      if (isGeneralTextEntity) {
  1.1131 +        /* a TextDecl must have an EncodingDecl */
  1.1132 +        *badPtr = ptr;
  1.1133 +        return 0;
  1.1134 +      }
  1.1135 +      return 1;
  1.1136 +    }
  1.1137 +  }
  1.1138 +  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
  /* The encoding value must start with an ASCII letter. */
  1.1139 +    int c = toAscii(enc, val, end);
  1.1140 +    if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
  1.1141 +      *badPtr = val;
  1.1142 +      return 0;
  1.1143 +    }
  1.1144 +    if (encodingName)
  1.1145 +      *encodingName = val;
  1.1146 +    if (encoding)
  1.1147 +      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
  /* Read the attribute following encoding, if any. */
  1.1148 +    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
  1.1149 +      *badPtr = ptr;
  1.1150 +      return 0;
  1.1151 +    }
  1.1152 +    if (!name)
  1.1153 +      return 1;
  1.1154 +  }
  /* Whatever remains must be "standalone", which is never allowed in a
     text declaration. */
  1.1155 +  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
  1.1156 +      || isGeneralTextEntity) {
  1.1157 +    *badPtr = name;
  1.1158 +    return 0;
  1.1159 +  }
  1.1160 +  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
  1.1161 +    if (standalone)
  1.1162 +      *standalone = 1;
  1.1163 +  }
  1.1164 +  else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
  1.1165 +    if (standalone)
  1.1166 +      *standalone = 0;
  1.1167 +  }
  1.1168 +  else {
  1.1169 +    *badPtr = val;
  1.1170 +    return 0;
  1.1171 +  }
  /* Only white space may follow the standalone attribute. */
  1.1172 +  while (isSpace(toAscii(enc, ptr, end)))
  1.1173 +    ptr += enc->minBytesPerChar;
  1.1174 +  if (ptr != end) {
  1.1175 +    *badPtr = ptr;
  1.1176 +    return 0;
  1.1177 +  }
  1.1178 +  return 1;
  1.1179 +}
  1.1180 +
  1.1181 +static int FASTCALL
  1.1182 +checkCharRefNumber(int result)
  1.1183 +{
  1.1184 +  switch (result >> 8) {
  1.1185 +  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  1.1186 +  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
  1.1187 +    return -1;
  1.1188 +  case 0:
  1.1189 +    if (latin1_encoding.type[result] == BT_NONXML)
  1.1190 +      return -1;
  1.1191 +    break;
  1.1192 +  case 0xFF:
  1.1193 +    if (result == 0xFFFE || result == 0xFFFF)
  1.1194 +      return -1;
  1.1195 +    break;
  1.1196 +  }
  1.1197 +  return result;
  1.1198 +}
  1.1199 +
  1.1200 +int FASTCALL
  1.1201 +XmlUtf8Encode(int c, char *buf)
  1.1202 +{
  1.1203 +  enum {
  1.1204 +    /* minN is minimum legal resulting value for N byte sequence */
  1.1205 +    min2 = 0x80,
  1.1206 +    min3 = 0x800,
  1.1207 +    min4 = 0x10000
  1.1208 +  };
  1.1209 +
  1.1210 +  if (c < 0)
  1.1211 +    return 0;
  1.1212 +  if (c < min2) {
  1.1213 +    buf[0] = (char)(c | UTF8_cval1);
  1.1214 +    return 1;
  1.1215 +  }
  1.1216 +  if (c < min3) {
  1.1217 +    buf[0] = (char)((c >> 6) | UTF8_cval2);
  1.1218 +    buf[1] = (char)((c & 0x3f) | 0x80);
  1.1219 +    return 2;
  1.1220 +  }
  1.1221 +  if (c < min4) {
  1.1222 +    buf[0] = (char)((c >> 12) | UTF8_cval3);
  1.1223 +    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
  1.1224 +    buf[2] = (char)((c & 0x3f) | 0x80);
  1.1225 +    return 3;
  1.1226 +  }
  1.1227 +  if (c < 0x110000) {
  1.1228 +    buf[0] = (char)((c >> 18) | UTF8_cval4);
  1.1229 +    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
  1.1230 +    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
  1.1231 +    buf[3] = (char)((c & 0x3f) | 0x80);
  1.1232 +    return 4;
  1.1233 +  }
  1.1234 +  return 0;
  1.1235 +}
  1.1236 +
  1.1237 +int FASTCALL
  1.1238 +XmlUtf16Encode(int charNum, unsigned short *buf)
  1.1239 +{
  1.1240 +  if (charNum < 0)
  1.1241 +    return 0;
  1.1242 +  if (charNum < 0x10000) {
  1.1243 +    buf[0] = (unsigned short)charNum;
  1.1244 +    return 1;
  1.1245 +  }
  1.1246 +  if (charNum < 0x110000) {
  1.1247 +    charNum -= 0x10000;
  1.1248 +    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
  1.1249 +    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
  1.1250 +    return 2;
  1.1251 +  }
  1.1252 +  return 0;
  1.1253 +}
  1.1254 +
  /* State for a user-described encoding, filled in by
     XmlInitUnknownEncoding from a 256-entry table. */
  1.1255 +struct unknown_encoding {
  /* Starts as a copy of latin1_encoding; type[] and several function
     pointers are then overridden. */
  1.1256 +  struct normal_encoding normal;
  /* Called with userData and a pointer to the input bytes for bytes whose
     utf16[]/utf8[] entry is 0; returns a character number. */
  1.1257 +  CONVERTER convert;
  1.1258 +  void *userData;
  /* Per-byte UTF-16 value; 0 means "ask convert". */
  1.1259 +  unsigned short utf16[256];
  /* Per-byte UTF-8 form: [0] is the length (0 means "ask convert"),
     followed by that many UTF-8 bytes. */
  1.1260 +  char utf8[256][4];
  1.1261 +};
  1.1262 +
  /* View an ENCODING pointer as the unknown_encoding that contains it. */
  1.1263 +#define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
  1.1264 +
  1.1265 +int
  1.1266 +XmlSizeOfUnknownEncoding(void)
  1.1267 +{
  1.1268 +  return sizeof(struct unknown_encoding);
  1.1269 +}
  1.1270 +
  1.1271 +static int PTRFASTCALL
  1.1272 +unknown_isName(const ENCODING *enc, const char *p)
  1.1273 +{
  1.1274 +  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1.1275 +  int c = uenc->convert(uenc->userData, p);
  1.1276 +  if (c & ~0xFFFF)
  1.1277 +    return 0;
  1.1278 +  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
  1.1279 +}
  1.1280 +
  1.1281 +static int PTRFASTCALL
  1.1282 +unknown_isNmstrt(const ENCODING *enc, const char *p)
  1.1283 +{
  1.1284 +  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1.1285 +  int c = uenc->convert(uenc->userData, p);
  1.1286 +  if (c & ~0xFFFF)
  1.1287 +    return 0;
  1.1288 +  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
  1.1289 +}
  1.1290 +
  1.1291 +static int PTRFASTCALL
  1.1292 +unknown_isInvalid(const ENCODING *enc, const char *p)
  1.1293 +{
  1.1294 +  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1.1295 +  int c = uenc->convert(uenc->userData, p);
  1.1296 +  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
  1.1297 +}
  1.1298 +
  1.1299 +static void PTRCALL
  1.1300 +unknown_toUtf8(const ENCODING *enc,
  1.1301 +               const char **fromP, const char *fromLim,
  1.1302 +               char **toP, const char *toLim)
  1.1303 +{
  1.1304 +  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1.1305 +  char buf[XML_UTF8_ENCODE_MAX];
  1.1306 +  for (;;) {
  1.1307 +    const char *utf8;
  1.1308 +    int n;
  1.1309 +    if (*fromP == fromLim)
  1.1310 +      break;
  1.1311 +    utf8 = uenc->utf8[(unsigned char)**fromP];
  1.1312 +    n = *utf8++;
  1.1313 +    if (n == 0) {
  1.1314 +      int c = uenc->convert(uenc->userData, *fromP);
  1.1315 +      n = XmlUtf8Encode(c, buf);
  1.1316 +      if (n > toLim - *toP)
  1.1317 +        break;
  1.1318 +      utf8 = buf;
  1.1319 +      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
  1.1320 +                 - (BT_LEAD2 - 2));
  1.1321 +    }
  1.1322 +    else {
  1.1323 +      if (n > toLim - *toP)
  1.1324 +        break;
  1.1325 +      (*fromP)++;
  1.1326 +    }
  1.1327 +    do {
  1.1328 +      *(*toP)++ = *utf8++;
  1.1329 +    } while (--n != 0);
  1.1330 +  }
  1.1331 +}
  1.1332 +
  1.1333 +static void PTRCALL
  1.1334 +unknown_toUtf16(const ENCODING *enc,
  1.1335 +                const char **fromP, const char *fromLim,
  1.1336 +                unsigned short **toP, const unsigned short *toLim)
  1.1337 +{
  1.1338 +  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1.1339 +  while (*fromP != fromLim && *toP != toLim) {
  1.1340 +    unsigned short c = uenc->utf16[(unsigned char)**fromP];
  1.1341 +    if (c == 0) {
  1.1342 +      c = (unsigned short)
  1.1343 +          uenc->convert(uenc->userData, *fromP);
  1.1344 +      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
  1.1345 +                 - (BT_LEAD2 - 2));
  1.1346 +    }
  1.1347 +    else
  1.1348 +      (*fromP)++;
  1.1349 +    *(*toP)++ = c;
  1.1350 +  }
  1.1351 +}
  1.1352 +
  1.1353 +ENCODING *
  1.1354 +XmlInitUnknownEncoding(void *mem,
  1.1355 +                       int *table,
  1.1356 +                       CONVERTER convert, 
  1.1357 +                       void *userData)
  1.1358 +{
  1.1359 +  int i;
  1.1360 +  struct unknown_encoding *e = (struct unknown_encoding *)mem;
  1.1361 +  for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
  1.1362 +    ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
  1.1363 +  for (i = 0; i < 128; i++)
  1.1364 +    if (latin1_encoding.type[i] != BT_OTHER
  1.1365 +        && latin1_encoding.type[i] != BT_NONXML
  1.1366 +        && table[i] != i)
  1.1367 +      return 0;
  1.1368 +  for (i = 0; i < 256; i++) {
  1.1369 +    int c = table[i];
  1.1370 +    if (c == -1) {
  1.1371 +      e->normal.type[i] = BT_MALFORM;
  1.1372 +      /* This shouldn't really get used. */
  1.1373 +      e->utf16[i] = 0xFFFF;
  1.1374 +      e->utf8[i][0] = 1;
  1.1375 +      e->utf8[i][1] = 0;
  1.1376 +    }
  1.1377 +    else if (c < 0) {
  1.1378 +      if (c < -4)
  1.1379 +        return 0;
  1.1380 +      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
  1.1381 +      e->utf8[i][0] = 0;
  1.1382 +      e->utf16[i] = 0;
  1.1383 +    }
  1.1384 +    else if (c < 0x80) {
  1.1385 +      if (latin1_encoding.type[c] != BT_OTHER
  1.1386 +          && latin1_encoding.type[c] != BT_NONXML
  1.1387 +          && c != i)
  1.1388 +        return 0;
  1.1389 +      e->normal.type[i] = latin1_encoding.type[c];
  1.1390 +      e->utf8[i][0] = 1;
  1.1391 +      e->utf8[i][1] = (char)c;
  1.1392 +      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
  1.1393 +    }
  1.1394 +    else if (checkCharRefNumber(c) < 0) {
  1.1395 +      e->normal.type[i] = BT_NONXML;
  1.1396 +      /* This shouldn't really get used. */
  1.1397 +      e->utf16[i] = 0xFFFF;
  1.1398 +      e->utf8[i][0] = 1;
  1.1399 +      e->utf8[i][1] = 0;
  1.1400 +    }
  1.1401 +    else {
  1.1402 +      if (c > 0xFFFF)
  1.1403 +        return 0;
  1.1404 +      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
  1.1405 +        e->normal.type[i] = BT_NMSTRT;
  1.1406 +      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
  1.1407 +        e->normal.type[i] = BT_NAME;
  1.1408 +      else
  1.1409 +        e->normal.type[i] = BT_OTHER;
  1.1410 +      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
  1.1411 +      e->utf16[i] = (unsigned short)c;
  1.1412 +    }
  1.1413 +  }
  1.1414 +  e->userData = userData;
  1.1415 +  e->convert = convert;
  1.1416 +  if (convert) {
  1.1417 +    e->normal.isName2 = unknown_isName;
  1.1418 +    e->normal.isName3 = unknown_isName;
  1.1419 +    e->normal.isName4 = unknown_isName;
  1.1420 +    e->normal.isNmstrt2 = unknown_isNmstrt;
  1.1421 +    e->normal.isNmstrt3 = unknown_isNmstrt;
  1.1422 +    e->normal.isNmstrt4 = unknown_isNmstrt;
  1.1423 +    e->normal.isInvalid2 = unknown_isInvalid;
  1.1424 +    e->normal.isInvalid3 = unknown_isInvalid;
  1.1425 +    e->normal.isInvalid4 = unknown_isInvalid;
  1.1426 +  }
  1.1427 +  e->normal.enc.utf8Convert = unknown_toUtf8;
  1.1428 +  e->normal.enc.utf16Convert = unknown_toUtf16;
  1.1429 +  return &(e->normal.enc);
  1.1430 +}
  1.1431 +
  1.1432 +/* If this enumeration is changed, getEncodingIndex and encodings
  1.1433 +must also be changed. */
  /* Encoding indices.  Values 0..NO_ENC-1 are the positions of the
     corresponding names in getEncodingIndex's encodingNames table.
     getEncodingIndex returns UNKNOWN_ENC for an unrecognized name and
     NO_ENC when no name was given. */
  1.1434 +enum {
  1.1435 +  UNKNOWN_ENC = -1,
  1.1436 +  ISO_8859_1_ENC = 0,
  1.1437 +  US_ASCII_ENC,
  1.1438 +  UTF_8_ENC,
  1.1439 +  UTF_16_ENC,
  1.1440 +  UTF_16BE_ENC,
  1.1441 +  UTF_16LE_ENC,
  1.1442 +  /* must match encodingNames up to here */
  1.1443 +  NO_ENC
  1.1444 +};
  1.1445 +
  /* NUL-terminated encoding names recognized by getEncodingIndex, spelled
     with ASCII_* constants.  getEncodingIndex compares them with streqci,
     so matching ignores letter case. */
  1.1446 +static const char KW_ISO_8859_1[] = {
  1.1447 +  ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
  1.1448 +  ASCII_MINUS, ASCII_1, '\0'
  1.1449 +};
  1.1450 +static const char KW_US_ASCII[] = {
  1.1451 +  ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
  1.1452 +  '\0'
  1.1453 +};
  1.1454 +static const char KW_UTF_8[] =  {
  1.1455 +  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
  1.1456 +};
  1.1457 +static const char KW_UTF_16[] = {
  1.1458 +  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
  1.1459 +};
  1.1460 +static const char KW_UTF_16BE[] = {
  1.1461 +  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
  1.1462 +  '\0'
  1.1463 +};
  1.1464 +static const char KW_UTF_16LE[] = {
  1.1465 +  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
  1.1466 +  '\0'
  1.1467 +};
  1.1468 +
  1.1469 +static int FASTCALL
  1.1470 +getEncodingIndex(const char *name)
  1.1471 +{
  1.1472 +  static const char * const encodingNames[] = {
  1.1473 +    KW_ISO_8859_1,
  1.1474 +    KW_US_ASCII,
  1.1475 +    KW_UTF_8,
  1.1476 +    KW_UTF_16,
  1.1477 +    KW_UTF_16BE,
  1.1478 +    KW_UTF_16LE,
  1.1479 +  };
  1.1480 +  int i;
  1.1481 +  if (name == NULL)
  1.1482 +    return NO_ENC;
  1.1483 +  for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
  1.1484 +    if (streqci(name, encodingNames[i]))
  1.1485 +      return i;
  1.1486 +  return UNKNOWN_ENC;
  1.1487 +}
  1.1488 +
  /* For binary compatibility, we store the index of the encoding
     specified at initialization in the isUtf16 member.

     INIT_ENC_INDEX(enc) reads that index back as an int;
     SET_INIT_ENC_INDEX(enc, i) stores i, converted to char.  The index
     argument is parenthesized so that an expression argument is converted
     as a whole.
  */

  #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
  #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
  1.1495 +
  1.1496 +/* This is what detects the encoding.  encodingTable maps from
  1.1497 +   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
  1.1498 +   the external (protocol) specified encoding; state is
  1.1499 +   XML_CONTENT_STATE if we're parsing an external text entity, and
  1.1500 +   XML_PROLOG_STATE otherwise.
  1.1501 +*/
  1.1502 +
  1.1503 +
  /* Returns XML_TOK_NONE for empty input, XML_TOK_PARTIAL when more bytes
     are needed to decide, XML_TOK_BOM after consuming a byte order mark
     (*nextTokPtr is set just past it), and otherwise the result of
     XmlTok() using the encoding that was selected.  Whenever an encoding
     is selected it is also stored through enc->encPtr. */
  1.1504 +static int
  1.1505 +initScan(const ENCODING * const *encodingTable,
  1.1506 +         const INIT_ENCODING *enc,
  1.1507 +         int state,
  1.1508 +         const char *ptr,
  1.1509 +         const char *end,
  1.1510 +         const char **nextTokPtr)
  1.1511 +{
  1.1512 +  const ENCODING **encPtr;
  1.1513 +
  1.1514 +  if (ptr == end)
  1.1515 +    return XML_TOK_NONE;
  1.1516 +  encPtr = enc->encPtr;
  1.1517 +  if (ptr + 1 == end) {
  1.1518 +    /* only a single byte available for auto-detection */
  1.1519 +#ifndef XML_DTD /* FIXME */
  1.1520 +    /* a well-formed document entity must have more than one byte */
  1.1521 +    if (state != XML_CONTENT_STATE)
  1.1522 +      return XML_TOK_PARTIAL;
  1.1523 +#endif
  1.1524 +    /* so we're parsing an external text entity... */
  1.1525 +    /* if UTF-16 was externally specified, then we need at least 2 bytes */
  1.1526 +    switch (INIT_ENC_INDEX(enc)) {
  1.1527 +    case UTF_16_ENC:
  1.1528 +    case UTF_16LE_ENC:
  1.1529 +    case UTF_16BE_ENC:
  1.1530 +      return XML_TOK_PARTIAL;
  1.1531 +    }
  /* A byte that could begin a BOM, a UTF-16 character or '<' needs a
     second byte before a decision can be made. */
  1.1532 +    switch ((unsigned char)*ptr) {
  1.1533 +    case 0xFE:
  1.1534 +    case 0xFF:
  1.1535 +    case 0xEF: /* possibly first byte of UTF-8 BOM */
  1.1536 +      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1.1537 +          && state == XML_CONTENT_STATE)
  1.1538 +        break;
  1.1539 +      /* fall through */
  1.1540 +    case 0x00:
  1.1541 +    case 0x3C:
  1.1542 +      return XML_TOK_PARTIAL;
  1.1543 +    }
  1.1544 +  }
  1.1545 +  else {
  /* Examine the first two bytes as one 16-bit value, first byte high. */
  1.1546 +    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
  /* FE FF: byte order mark selecting the UTF_16BE_ENC entry. */
  1.1547 +    case 0xFEFF:
  1.1548 +      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1.1549 +          && state == XML_CONTENT_STATE)
  1.1550 +        break;
  1.1551 +      *nextTokPtr = ptr + 2;
  1.1552 +      *encPtr = encodingTable[UTF_16BE_ENC];
  1.1553 +      return XML_TOK_BOM;
  1.1554 +    /* 00 3C is handled in the default case */
  /* 3C 00: selects the UTF_16LE_ENC entry without a BOM. */
  1.1555 +    case 0x3C00:
  1.1556 +      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
  1.1557 +           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
  1.1558 +          && state == XML_CONTENT_STATE)
  1.1559 +        break;
  1.1560 +      *encPtr = encodingTable[UTF_16LE_ENC];
  1.1561 +      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  /* FF FE: byte order mark selecting the UTF_16LE_ENC entry. */
  1.1562 +    case 0xFFFE:
  1.1563 +      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1.1564 +          && state == XML_CONTENT_STATE)
  1.1565 +        break;
  1.1566 +      *nextTokPtr = ptr + 2;
  1.1567 +      *encPtr = encodingTable[UTF_16LE_ENC];
  1.1568 +      return XML_TOK_BOM;
  1.1569 +    case 0xEFBB:
  1.1570 +      /* Maybe a UTF-8 BOM (EF BB BF) */
  1.1571 +      /* If there's an explicitly specified (external) encoding
  1.1572 +         of ISO-8859-1 or some flavour of UTF-16
  1.1573 +         and this is an external text entity,
  1.1574 +         don't look for the BOM,
  1.1575 +         because it might be legal data.
  1.1576 +      */
  1.1577 +      if (state == XML_CONTENT_STATE) {
  1.1578 +        int e = INIT_ENC_INDEX(enc);
  1.1579 +        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
  1.1580 +            || e == UTF_16LE_ENC || e == UTF_16_ENC)
  1.1581 +          break;
  1.1582 +      }
  /* The third byte is needed to confirm the BOM. */
  1.1583 +      if (ptr + 2 == end)
  1.1584 +        return XML_TOK_PARTIAL;
  1.1585 +      if ((unsigned char)ptr[2] == 0xBF) {
  1.1586 +        *nextTokPtr = ptr + 3;
  1.1587 +        *encPtr = encodingTable[UTF_8_ENC];
  1.1588 +        return XML_TOK_BOM;
  1.1589 +      }
  1.1590 +      break;
  1.1591 +    default:
  1.1592 +      if (ptr[0] == '\0') {
  1.1593 +        /* 0 isn't a legal data character. Furthermore a document
  1.1594 +           entity can only start with ASCII characters.  So the only
  1.1595 +           way this can fail to be big-endian UTF-16 is if it's an
  1.1596 +           external parsed general entity that's labelled as
  1.1597 +           UTF-16LE.
  1.1598 +        */
  1.1599 +        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
  1.1600 +          break;
  1.1601 +        *encPtr = encodingTable[UTF_16BE_ENC];
  1.1602 +        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1.1603 +      }
  1.1604 +      else if (ptr[1] == '\0') {
  1.1605 +        /* We could recover here in the case:
  1.1606 +            - parsing an external entity
  1.1607 +            - second byte is 0
  1.1608 +            - no externally specified encoding
  1.1609 +            - no encoding declaration
  1.1610 +           by assuming UTF-16LE.  But we don't, because this would mean when
  1.1611 +           presented just with a single byte, we couldn't reliably determine
  1.1612 +           whether we needed further bytes.
  1.1613 +        */
  1.1614 +        if (state == XML_CONTENT_STATE)
  1.1615 +          break;
  1.1616 +        *encPtr = encodingTable[UTF_16LE_ENC];
  1.1617 +        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1.1618 +      }
  1.1619 +      break;
  1.1620 +    }
  1.1621 +  }
  /* Nothing was detected: use the encoding recorded at initialization. */
  1.1622 +  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  1.1623 +  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1.1624 +}
  1.1625 +
  1.1626 +
  /* First inclusion of xmltok_ns.c: NS() and ns() leave their argument
     unchanged, so the included definitions keep their plain names. */
  1.1627 +#define NS(x) x
  1.1628 +#define ns(x) x
  1.1629 +#include "xmltok_ns.c"
  1.1630 +#undef NS
  1.1631 +#undef ns
  1.1632 +
  1.1633 +#ifdef XML_NS
  1.1634 +
  /* Second inclusion of xmltok_ns.c, only under XML_NS: NS() appends "NS"
     and ns() appends "_ns" to the argument, giving the included
     definitions distinct names from the first inclusion. */
  1.1635 +#define NS(x) x ## NS
  1.1636 +#define ns(x) x ## _ns
  1.1637 +
  1.1638 +#include "xmltok_ns.c"
  1.1639 +
  1.1640 +#undef NS
  1.1641 +#undef ns
  1.1642 +
  1.1643 +ENCODING *
  1.1644 +XmlInitUnknownEncodingNS(void *mem,
  1.1645 +                         int *table,
  1.1646 +                         CONVERTER convert, 
  1.1647 +                         void *userData)
  1.1648 +{
  1.1649 +  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
  1.1650 +  if (enc)
  1.1651 +    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
  1.1652 +  return enc;
  1.1653 +}
  1.1654 +
  1.1655 +#endif /* XML_NS */
  1.1656 +
  1.1657 +/* BEGIN MOZILLA CHANGE (Mozilla extensions for QName checking) */
  1.1658 +#ifdef MOZILLA_CLIENT
  1.1659 +#include "moz_extensions.c"
  1.1660 +#endif /* MOZILLA_CLIENT */
  1.1661 +/* END MOZILLA CHANGE */

mercurial