michael@0: /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd michael@0: See the file COPYING for copying permission. michael@0: */ michael@0: /* NOTE(review): the two bare #include directives in this header block carry no file name in this listing; presumably the names were lost during extraction - confirm against the upstream file. */ michael@0: #include michael@0: michael@0: #ifdef COMPILED_FROM_DSP michael@0: #include "winconfig.h" michael@0: #elif defined(MACOS_CLASSIC) michael@0: #include "macconfig.h" michael@0: #elif defined(__amigaos4__) michael@0: #include "amigaconfig.h" michael@0: #else michael@0: #ifdef HAVE_EXPAT_CONFIG_H michael@0: #include michael@0: #endif michael@0: #endif /* ndef COMPILED_FROM_DSP */ michael@0: michael@0: #include "expat_external.h" michael@0: #include "internal.h" michael@0: #include "xmltok.h" michael@0: #include "nametab.h" michael@0: /* Extra entry appended to the first brace group of VTABLE1; it is only non-empty when XML_DTD is defined. */ michael@0: #ifdef XML_DTD michael@0: #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok) michael@0: #else michael@0: #define IGNORE_SECTION_TOK_VTABLE /* as nothing */ michael@0: #endif michael@0: /* Initializer list made of PREFIX()-qualified identifiers: two brace groups of scanner entries followed by nine further entries. PREFIX is defined further down, before each inclusion of xmltok_impl.c. */ michael@0: #define VTABLE1 \ michael@0: { PREFIX(prologTok), PREFIX(contentTok), \ michael@0: PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \ michael@0: { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ michael@0: PREFIX(sameName), \ michael@0: PREFIX(nameMatchesAscii), \ michael@0: PREFIX(nameLength), \ michael@0: PREFIX(skipS), \ michael@0: PREFIX(getAtts), \ michael@0: PREFIX(charRefNumber), \ michael@0: PREFIX(predefinedEntityName), \ michael@0: PREFIX(updatePosition), \ michael@0: PREFIX(isPublicId) michael@0: /* VTABLE1 followed by the PREFIX()-qualified toUtf8 and toUtf16 entries. */ michael@0: #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) michael@0: /* Nonzero when the bit for (hi, lo) is set in namingBitmap: pages[hi] times 8 selects a group of words, the top 3 bits of lo pick the word and the low 5 bits of lo pick the bit. */ michael@0: #define UCS2_GET_NAMING(pages, hi, lo) \ michael@0: (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) michael@0: michael@0: /* A 2 byte UTF-8 representation splits the characters 11 bits between michael@0: the bottom 5 and 6 bits of the bytes. We need 8 bits to index into michael@0: pages, 3 bits to add to that index and 5 bits to generate the mask. 
michael@0: */ michael@0: #define UTF8_GET_NAMING2(pages, byte) \ michael@0: (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ michael@0: + ((((byte)[0]) & 3) << 1) \ michael@0: + ((((byte)[1]) >> 5) & 1)] \ michael@0: & (1 << (((byte)[1]) & 0x1F))) michael@0: michael@0: /* A 3 byte UTF-8 representation splits the characters 16 bits between michael@0: the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index michael@0: into pages, 3 bits to add to that index and 5 bits to generate the michael@0: mask. michael@0: */ michael@0: #define UTF8_GET_NAMING3(pages, byte) \ michael@0: (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \ michael@0: + ((((byte)[1]) >> 2) & 0xF)] \ michael@0: << 3) \ michael@0: + ((((byte)[1]) & 3) << 1) \ michael@0: + ((((byte)[2]) >> 5) & 1)] \ michael@0: & (1 << (((byte)[2]) & 0x1F))) michael@0: /* Selects the lookup by sequence length n: 2 and 3 use the macros above on p viewed as unsigned bytes, any other n yields 0. */ michael@0: #define UTF8_GET_NAMING(pages, p, n) \ michael@0: ((n) == 2 \ michael@0: ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ michael@0: : ((n) == 3 \ michael@0: ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ michael@0: : 0)) michael@0: michael@0: /* Detection of invalid UTF-8 sequences is based on Table 3.1B michael@0: of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ michael@0: with the additional restriction of not allowing the Unicode michael@0: code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE). michael@0: Implementation details: michael@0: (A & 0x80) == 0 means A < 0x80 michael@0: and michael@0: (A & 0xC0) == 0xC0 means A > 0xBF michael@0: */ michael@0: /* Nonzero when the lead byte is below 0xC2 or the second byte is outside 0x80..0xBF. The argument must point to unsigned bytes for the comparisons to work as written. */ michael@0: #define UTF8_INVALID2(p) \ michael@0: ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0) michael@0: /* Nonzero for a malformed 3 byte sequence: the third byte must be a trail byte (and at most 0xBD after EF BF), lead E0 needs a second byte of at least 0xA0, lead ED a second byte of at most 0x9F, and otherwise the second byte must be in 0x80..0xBF. */ michael@0: #define UTF8_INVALID3(p) \ michael@0: (((p)[2] & 0x80) == 0 \ michael@0: || \ michael@0: ((*p) == 0xEF && (p)[1] == 0xBF \ michael@0: ? \ michael@0: (p)[2] > 0xBD \ michael@0: : \ michael@0: ((p)[2] & 0xC0) == 0xC0) \ michael@0: || \ michael@0: ((*p) == 0xE0 \ michael@0: ? 
\ michael@0: (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \ michael@0: : \ michael@0: ((p)[1] & 0x80) == 0 \ michael@0: || \ michael@0: ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))) michael@0: /* Nonzero for a malformed 4 byte sequence: bytes three and four must be in 0x80..0xBF, lead F0 needs a second byte of at least 0x90, lead F4 a second byte of at most 0x8F, and otherwise the second byte must be in 0x80..0xBF. */ michael@0: #define UTF8_INVALID4(p) \ michael@0: (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \ michael@0: || \ michael@0: ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \ michael@0: || \ michael@0: ((*p) == 0xF0 \ michael@0: ? \ michael@0: (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \ michael@0: : \ michael@0: ((p)[1] & 0x80) == 0 \ michael@0: || \ michael@0: ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) michael@0: /* Predicate that always returns 0; both arguments are ignored. */ michael@0: static int PTRFASTCALL michael@0: isNever(const ENCODING *enc, const char *p) michael@0: { michael@0: return 0; michael@0: } michael@0: /* Name-character test for the 2 byte sequence at p, looked up through namePages. enc is unused. */ michael@0: static int PTRFASTCALL michael@0: utf8_isName2(const ENCODING *enc, const char *p) michael@0: { michael@0: return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); michael@0: } michael@0: /* Name-character test for the 3 byte sequence at p, looked up through namePages. enc is unused. */ michael@0: static int PTRFASTCALL michael@0: utf8_isName3(const ENCODING *enc, const char *p) michael@0: { michael@0: return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); michael@0: } michael@0: /* The 4 byte name test is isNever, so it always reports 0. */ michael@0: #define utf8_isName4 isNever michael@0: /* Name-start test for the 2 byte sequence at p, looked up through nmstrtPages. enc is unused. */ michael@0: static int PTRFASTCALL michael@0: utf8_isNmstrt2(const ENCODING *enc, const char *p) michael@0: { michael@0: return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); michael@0: } michael@0: /* Name-start test for the 3 byte sequence at p, looked up through nmstrtPages. enc is unused. */ michael@0: static int PTRFASTCALL michael@0: utf8_isNmstrt3(const ENCODING *enc, const char *p) michael@0: { michael@0: return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); michael@0: } michael@0: /* The 4 byte name-start test is isNever, so it always reports 0. */ michael@0: #define utf8_isNmstrt4 isNever michael@0: /* Function forms of UTF8_INVALID2, UTF8_INVALID3 and UTF8_INVALID4; each casts p to unsigned bytes before applying the macro and ignores enc. */ michael@0: static int PTRFASTCALL michael@0: utf8_isInvalid2(const ENCODING *enc, const char *p) michael@0: { michael@0: return UTF8_INVALID2((const unsigned char *)p); michael@0: } michael@0: michael@0: static int PTRFASTCALL michael@0: utf8_isInvalid3(const ENCODING *enc, const char 
*p) michael@0: { michael@0: return UTF8_INVALID3((const unsigned char *)p); michael@0: } michael@0: michael@0: static int PTRFASTCALL michael@0: utf8_isInvalid4(const ENCODING *enc, const char *p) michael@0: { michael@0: return UTF8_INVALID4((const unsigned char *)p); michael@0: } michael@0: /* An ENCODING as first member, then a 256-entry table indexed by byte value, then predicate pointers. The five pointers inside the XML_MIN_SIZE section exist only in that configuration; the nine isName, isNmstrt and isInvalid pointers are always present. */ michael@0: struct normal_encoding { michael@0: ENCODING enc; michael@0: unsigned char type[256]; michael@0: #ifdef XML_MIN_SIZE michael@0: int (PTRFASTCALL *byteType)(const ENCODING *, const char *); michael@0: int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *); michael@0: int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *); michael@0: int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *); michael@0: int (PTRCALL *charMatches)(const ENCODING *, const char *, int); michael@0: #endif /* XML_MIN_SIZE */ michael@0: int (PTRFASTCALL *isName2)(const ENCODING *, const char *); michael@0: int (PTRFASTCALL *isName3)(const ENCODING *, const char *); michael@0: int (PTRFASTCALL *isName4)(const ENCODING *, const char *); michael@0: int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *); michael@0: int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *); michael@0: int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *); michael@0: int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *); michael@0: int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *); michael@0: int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *); michael@0: }; michael@0: /* Casts an ENCODING pointer to a const struct normal_encoding pointer; enc is the first member of that struct. */ michael@0: #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc)) michael@0: /* Initializers for the five XML_MIN_SIZE-only members, formed by pasting the prefix E onto each member name; expands to nothing in the other configuration. */ michael@0: #ifdef XML_MIN_SIZE michael@0: michael@0: #define STANDARD_VTABLE(E) \ michael@0: E ## byteType, \ michael@0: E ## isNameMin, \ michael@0: E ## isNmstrtMin, \ michael@0: E ## byteToAscii, \ michael@0: E ## charMatches, michael@0: michael@0: #else michael@0: michael@0: #define STANDARD_VTABLE(E) /* as nothing */ michael@0: michael@0: #endif michael@0: /* Initializers for the nine isName, isNmstrt and isInvalid members, formed by pasting the prefix E onto each member name. */ michael@0: #define 
NORMAL_VTABLE(E) \ michael@0: E ## isName2, \ michael@0: E ## isName3, \ michael@0: E ## isName4, \ michael@0: E ## isNmstrt2, \ michael@0: E ## isNmstrt3, \ michael@0: E ## isNmstrt4, \ michael@0: E ## isInvalid2, \ michael@0: E ## isInvalid3, \ michael@0: E ## isInvalid4 michael@0: /* Forward declaration, placed ahead of the xmltok_impl.h and xmltok_impl.c inclusions that follow. */ michael@0: static int FASTCALL checkCharRefNumber(int); michael@0: michael@0: #include "xmltok_impl.h" michael@0: #include "ascii.h" michael@0: /* Under XML_MIN_SIZE the sb_ prefixed minimum-width name tests are aliases of isNever. */ michael@0: #ifdef XML_MIN_SIZE michael@0: #define sb_isNameMin isNever michael@0: #define sb_isNmstrtMin isNever michael@0: #endif michael@0: /* MINBPC reads minBytesPerChar from the encoding under XML_MIN_SIZE and is the constant 1 otherwise. */ michael@0: #ifdef XML_MIN_SIZE michael@0: #define MINBPC(enc) ((enc)->minBytesPerChar) michael@0: #else michael@0: /* minimum bytes per character */ michael@0: #define MINBPC(enc) 1 michael@0: #endif michael@0: /* Looks up the byte at p in the type[] table of the encoding. */ michael@0: #define SB_BYTE_TYPE(enc, p) \ michael@0: (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) michael@0: /* BYTE_TYPE calls through the byteType pointer under XML_MIN_SIZE and expands directly to SB_BYTE_TYPE otherwise. */ michael@0: #ifdef XML_MIN_SIZE michael@0: static int PTRFASTCALL michael@0: sb_byteType(const ENCODING *enc, const char *p) michael@0: { michael@0: return SB_BYTE_TYPE(enc, p); michael@0: } michael@0: #define BYTE_TYPE(enc, p) \ michael@0: (AS_NORMAL_ENCODING(enc)->byteType(enc, p)) michael@0: #else michael@0: #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) michael@0: #endif michael@0: /* BYTE_TO_ASCII calls through the byteToAscii pointer under XML_MIN_SIZE and is simply the byte at p otherwise. */ michael@0: #ifdef XML_MIN_SIZE michael@0: #define BYTE_TO_ASCII(enc, p) \ michael@0: (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p)) michael@0: static int PTRFASTCALL michael@0: sb_byteToAscii(const ENCODING *enc, const char *p) michael@0: { michael@0: return *p; michael@0: } michael@0: #else michael@0: #define BYTE_TO_ASCII(enc, p) (*(p)) michael@0: #endif michael@0: /* Multi-byte tests: n is pasted onto the member name, so n = 2, 3 or 4 selects the matching pointer in struct normal_encoding. */ michael@0: #define IS_NAME_CHAR(enc, p, n) \ michael@0: (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p)) michael@0: #define IS_NMSTRT_CHAR(enc, p, n) \ michael@0: (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p)) michael@0: #define IS_INVALID_CHAR(enc, p, n) \ michael@0: (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p)) michael@0: 
michael@0: /* Minimum-width name tests: calls through the isNameMin and isNmstrtMin pointers under XML_MIN_SIZE, the constant 0 otherwise. */ #ifdef XML_MIN_SIZE michael@0: #define IS_NAME_CHAR_MINBPC(enc, p) \ michael@0: (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p)) michael@0: #define IS_NMSTRT_CHAR_MINBPC(enc, p) \ michael@0: (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p)) michael@0: #else michael@0: #define IS_NAME_CHAR_MINBPC(enc, p) (0) michael@0: #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) michael@0: #endif michael@0: /* CHAR_MATCHES compares the byte at p with c, through the charMatches pointer under XML_MIN_SIZE and inline otherwise. */ michael@0: #ifdef XML_MIN_SIZE michael@0: #define CHAR_MATCHES(enc, p, c) \ michael@0: (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c)) michael@0: static int PTRCALL michael@0: sb_charMatches(const ENCODING *enc, const char *p, int c) michael@0: { michael@0: return *p == c; michael@0: } michael@0: #else michael@0: /* c is an ASCII character */ michael@0: #define CHAR_MATCHES(enc, p, c) (*(p) == c) michael@0: #endif michael@0: /* Include xmltok_impl.c with PREFIX() pasting normal_ onto identifiers and with the helper macros above in effect; the helper macros are undefined again right after. */ michael@0: #define PREFIX(ident) normal_ ## ident michael@0: #include "xmltok_impl.c" michael@0: michael@0: #undef MINBPC michael@0: #undef BYTE_TYPE michael@0: #undef BYTE_TO_ASCII michael@0: #undef CHAR_MATCHES michael@0: #undef IS_NAME_CHAR michael@0: #undef IS_NAME_CHAR_MINBPC michael@0: #undef IS_NMSTRT_CHAR michael@0: #undef IS_NMSTRT_CHAR_MINBPC michael@0: #undef IS_INVALID_CHAR michael@0: michael@0: enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ michael@0: UTF8_cval1 = 0x00, michael@0: UTF8_cval2 = 0xc0, michael@0: UTF8_cval3 = 0xe0, michael@0: UTF8_cval4 = 0xf0 michael@0: }; michael@0: /* Copies bytes from *fromP to *toP and stores the advanced positions back through both pointers. When the input is longer than the output space, fromLim is first moved to *fromP plus the output size and then stepped back while the byte just before it is a continuation byte (10xxxxxx). NOTE(review): that scan stops as soon as the preceding byte is not a continuation byte, so a lead byte sitting just before fromLim is still copied - confirm whether partial characters are fully avoided. enc is unused. */ michael@0: static void PTRCALL michael@0: utf8_toUtf8(const ENCODING *enc, michael@0: const char **fromP, const char *fromLim, michael@0: char **toP, const char *toLim) michael@0: { michael@0: char *to; michael@0: const char *from; michael@0: if (fromLim - *fromP > toLim - *toP) { michael@0: /* Avoid copying partial characters. 
*/ michael@0: for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) michael@0: if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) michael@0: break; michael@0: } michael@0: for (to = *toP, from = *fromP; from != fromLim; from++, to++) michael@0: *to = *from; michael@0: *fromP = from; michael@0: *toP = to; michael@0: } michael@0: /* Decodes from *fromP into 16-bit units at *toP until the input or the output limit is reached, then stores the advanced positions back. The type[] entry of the first byte selects the decoding: BT_LEAD2 and BT_LEAD3 produce one unit, BT_LEAD4 produces two units (0xD800 and 0xDC00 based), and any other byte is copied as one unit. NOTE(review): the bytes from[1] to from[3] are read without being compared against fromLim - presumably callers only pass complete sequences; confirm. */ michael@0: static void PTRCALL michael@0: utf8_toUtf16(const ENCODING *enc, michael@0: const char **fromP, const char *fromLim, michael@0: unsigned short **toP, const unsigned short *toLim) michael@0: { michael@0: unsigned short *to = *toP; michael@0: const char *from = *fromP; michael@0: while (from != fromLim && to != toLim) { michael@0: switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { michael@0: case BT_LEAD2: michael@0: *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); michael@0: from += 2; michael@0: break; michael@0: case BT_LEAD3: michael@0: *to++ = (unsigned short)(((from[0] & 0xf) << 12) michael@0: | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f)); michael@0: from += 3; michael@0: break; michael@0: case BT_LEAD4: michael@0: { michael@0: unsigned long n; michael@0: /* two output units are written below, so stop when only one is left */ if (to + 1 == toLim) michael@0: goto after; michael@0: n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) michael@0: | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); michael@0: n -= 0x10000; michael@0: to[0] = (unsigned short)((n >> 10) | 0xD800); michael@0: to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); michael@0: to += 2; michael@0: from += 4; michael@0: } michael@0: break; michael@0: default: michael@0: *to++ = *from++; michael@0: break; michael@0: } michael@0: } michael@0: after: michael@0: *fromP = from; michael@0: *toP = to; michael@0: } michael@0: /* UTF-8 encoding tables. Each one lists the entry points, then a type[] table assembled from an ASCII table header plus utf8tab.h, then the predicate initializers. The _ns tables are compiled only under XML_NS; the other tables include the ASCII table with BT_COLON defined as BT_NMSTRT. The internal_ tables use iasciitab.h instead of asciitab.h. */ michael@0: #ifdef XML_NS michael@0: static const struct normal_encoding utf8_encoding_ns = { michael@0: { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, michael@0: { michael@0: #include "asciitab.h" michael@0: #include "utf8tab.h" michael@0: }, michael@0: 
STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) michael@0: }; michael@0: #endif michael@0: michael@0: static const struct normal_encoding utf8_encoding = { michael@0: { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, michael@0: { michael@0: #define BT_COLON BT_NMSTRT michael@0: #include "asciitab.h" michael@0: #undef BT_COLON michael@0: #include "utf8tab.h" michael@0: }, michael@0: STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) michael@0: }; michael@0: michael@0: #ifdef XML_NS michael@0: michael@0: static const struct normal_encoding internal_utf8_encoding_ns = { michael@0: { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, michael@0: { michael@0: #include "iasciitab.h" michael@0: #include "utf8tab.h" michael@0: }, michael@0: STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) michael@0: }; michael@0: michael@0: #endif michael@0: michael@0: static const struct normal_encoding internal_utf8_encoding = { michael@0: { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, michael@0: { michael@0: #define BT_COLON BT_NMSTRT michael@0: #include "iasciitab.h" michael@0: #undef BT_COLON michael@0: #include "utf8tab.h" michael@0: }, michael@0: STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) michael@0: }; michael@0: /* Converts single bytes to UTF-8, advancing *fromP and *toP as it goes. A byte with the high bit set becomes two output bytes and needs two free bytes in the output; any other byte is copied as is. Stops when the input is consumed or the next character does not fit. enc is unused. */ michael@0: static void PTRCALL michael@0: latin1_toUtf8(const ENCODING *enc, michael@0: const char **fromP, const char *fromLim, michael@0: char **toP, const char *toLim) michael@0: { michael@0: for (;;) { michael@0: unsigned char c; michael@0: if (*fromP == fromLim) michael@0: break; michael@0: c = (unsigned char)**fromP; michael@0: if (c & 0x80) { michael@0: if (toLim - *toP < 2) michael@0: break; michael@0: *(*toP)++ = (char)((c >> 6) | UTF8_cval2); michael@0: *(*toP)++ = (char)((c & 0x3f) | 0x80); michael@0: (*fromP)++; michael@0: } michael@0: else { michael@0: if (*toP == toLim) michael@0: break; michael@0: *(*toP)++ = *(*fromP)++; michael@0: } michael@0: } michael@0: } michael@0: /* Widens each input byte, taken as unsigned, to one 16-bit output unit until the input or the output limit is reached. enc is unused. */ michael@0: static void PTRCALL michael@0: latin1_toUtf16(const ENCODING *enc, michael@0: const char **fromP, const 
char *fromLim, michael@0: unsigned short **toP, const unsigned short *toLim) michael@0: { michael@0: while (*fromP != fromLim && *toP != toLim) michael@0: *(*toP)++ = (unsigned char)*(*fromP)++; michael@0: } michael@0: /* Latin-1 tables: type[] comes from asciitab.h plus latin1tab.h, and only the STANDARD_VTABLE(sb_) initializers are listed after it. */ michael@0: #ifdef XML_NS michael@0: michael@0: static const struct normal_encoding latin1_encoding_ns = { michael@0: { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, michael@0: { michael@0: #include "asciitab.h" michael@0: #include "latin1tab.h" michael@0: }, michael@0: STANDARD_VTABLE(sb_) michael@0: }; michael@0: michael@0: #endif michael@0: michael@0: static const struct normal_encoding latin1_encoding = { michael@0: { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, michael@0: { michael@0: #define BT_COLON BT_NMSTRT michael@0: #include "asciitab.h" michael@0: #undef BT_COLON michael@0: #include "latin1tab.h" michael@0: }, michael@0: STANDARD_VTABLE(sb_) michael@0: }; michael@0: /* Copies bytes unchanged from *fromP to *toP until the input or the output limit is reached. enc is unused. */ michael@0: static void PTRCALL michael@0: ascii_toUtf8(const ENCODING *enc, michael@0: const char **fromP, const char *fromLim, michael@0: char **toP, const char *toLim) michael@0: { michael@0: while (*fromP != fromLim && *toP != toLim) michael@0: *(*toP)++ = *(*fromP)++; michael@0: } michael@0: /* ASCII tables: only asciitab.h fills type[], so the remaining entries are zero-initialized; latin1_toUtf16 is reused as the 16-bit converter. */ michael@0: #ifdef XML_NS michael@0: michael@0: static const struct normal_encoding ascii_encoding_ns = { michael@0: { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, michael@0: { michael@0: #include "asciitab.h" michael@0: /* BT_NONXML == 0 */ michael@0: }, michael@0: STANDARD_VTABLE(sb_) michael@0: }; michael@0: michael@0: #endif michael@0: michael@0: static const struct normal_encoding ascii_encoding = { michael@0: { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, michael@0: { michael@0: #define BT_COLON BT_NMSTRT michael@0: #include "asciitab.h" michael@0: #undef BT_COLON michael@0: /* BT_NONXML == 0 */ michael@0: }, michael@0: STANDARD_VTABLE(sb_) michael@0: }; michael@0: /* Classifies a 16-bit unit from its two bytes: high byte D8..DB gives BT_LEAD4, DC..DF gives BT_TRAIL, FF with low byte FE or FF gives BT_NONXML, and everything else gives BT_NONASCII. */ michael@0: static int PTRFASTCALL michael@0: unicode_byte_type(char hi, char lo) michael@0: { 
michael@0: switch ((unsigned char)hi) { michael@0: case 0xD8: case 0xD9: case 0xDA: case 0xDB: michael@0: return BT_LEAD4; michael@0: case 0xDC: case 0xDD: case 0xDE: case 0xDF: michael@0: return BT_TRAIL; michael@0: case 0xFF: michael@0: switch ((unsigned char)lo) { michael@0: case 0xFF: michael@0: case 0xFE: michael@0: return BT_NONXML; michael@0: } michael@0: break; michael@0: } michael@0: return BT_NONASCII; michael@0: } michael@0: /* Defines the function E##toUtf8, which converts 2 byte units read with GET_LO and GET_HI into UTF-8. A unit with high byte 0 and low byte below 0x80 becomes 1 byte, other units with high byte 0..7 become 2 bytes, a unit with high byte D8..DB is combined with the following unit into 4 bytes, and all remaining units become 3 bytes. When the output lacks room the function stores the current input position in *fromP and returns. NOTE(review): the loop steps by 2 and ends on from != fromLim, and the D8..DB case reads the next unit without comparing against fromLim - presumably the input always holds whole units and complete pairs; confirm. */ michael@0: #define DEFINE_UTF16_TO_UTF8(E) \ michael@0: static void PTRCALL \ michael@0: E ## toUtf8(const ENCODING *enc, \ michael@0: const char **fromP, const char *fromLim, \ michael@0: char **toP, const char *toLim) \ michael@0: { \ michael@0: const char *from; \ michael@0: for (from = *fromP; from != fromLim; from += 2) { \ michael@0: int plane; \ michael@0: unsigned char lo2; \ michael@0: unsigned char lo = GET_LO(from); \ michael@0: unsigned char hi = GET_HI(from); \ michael@0: switch (hi) { \ michael@0: case 0: \ michael@0: if (lo < 0x80) { \ michael@0: if (*toP == toLim) { \ michael@0: *fromP = from; \ michael@0: return; \ michael@0: } \ michael@0: *(*toP)++ = lo; \ michael@0: break; \ michael@0: } \ michael@0: /* fall through */ \ michael@0: case 0x1: case 0x2: case 0x3: \ michael@0: case 0x4: case 0x5: case 0x6: case 0x7: \ michael@0: if (toLim - *toP < 2) { \ michael@0: *fromP = from; \ michael@0: return; \ michael@0: } \ michael@0: *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ michael@0: *(*toP)++ = ((lo & 0x3f) | 0x80); \ michael@0: break; \ michael@0: default: \ michael@0: if (toLim - *toP < 3) { \ michael@0: *fromP = from; \ michael@0: return; \ michael@0: } \ michael@0: /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ michael@0: *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ michael@0: *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ michael@0: *(*toP)++ = ((lo & 0x3f) | 0x80); \ michael@0: break; \ michael@0: case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ michael@0: if (toLim - *toP < 4) { \ michael@0: *fromP = 
from; \ michael@0: return; \ michael@0: } \ michael@0: plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ michael@0: *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ michael@0: *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ michael@0: from += 2; \ michael@0: lo2 = GET_LO(from); \ michael@0: *(*toP)++ = (((lo & 0x3) << 4) \ michael@0: | ((GET_HI(from) & 0x3) << 2) \ michael@0: | (lo2 >> 6) \ michael@0: | 0x80); \ michael@0: *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ michael@0: break; \ michael@0: } \ michael@0: } \ michael@0: *fromP = from; \ michael@0: } michael@0: /* Defines the function E##toUtf16, which builds each 16-bit output unit from a byte pair as (GET_HI << 8) | GET_LO. When the input holds more bytes than twice the free output units and the high byte of the last input unit is in D8..DF, fromLim is moved back by one unit before copying. */ michael@0: #define DEFINE_UTF16_TO_UTF16(E) \ michael@0: static void PTRCALL \ michael@0: E ## toUtf16(const ENCODING *enc, \ michael@0: const char **fromP, const char *fromLim, \ michael@0: unsigned short **toP, const unsigned short *toLim) \ michael@0: { \ michael@0: /* Avoid copying first half only of surrogate */ \ michael@0: if (fromLim - *fromP > ((toLim - *toP) << 1) \ michael@0: && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \ michael@0: fromLim -= 2; \ michael@0: for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \ michael@0: *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ michael@0: } michael@0: /* Little-endian accessors (byte 0 is the low byte, byte 1 the high byte), used to expand the little2_ converters; the accessors are undefined again afterwards. */ michael@0: #define SET2(ptr, ch) \ michael@0: (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) michael@0: #define GET_LO(ptr) ((unsigned char)(ptr)[0]) michael@0: #define GET_HI(ptr) ((unsigned char)(ptr)[1]) michael@0: michael@0: DEFINE_UTF16_TO_UTF8(little2_) michael@0: DEFINE_UTF16_TO_UTF16(little2_) michael@0: michael@0: #undef SET2 michael@0: #undef GET_LO michael@0: #undef GET_HI michael@0: /* Big-endian accessors (byte 0 is the high byte, byte 1 the low byte), used to expand the big2_ converters; the accessors are undefined again afterwards. */ michael@0: #define SET2(ptr, ch) \ michael@0: (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) michael@0: #define GET_LO(ptr) ((unsigned char)(ptr)[1]) michael@0: #define GET_HI(ptr) ((unsigned char)(ptr)[0]) michael@0: michael@0: DEFINE_UTF16_TO_UTF8(big2_) michael@0: DEFINE_UTF16_TO_UTF16(big2_) michael@0: michael@0: #undef SET2 michael@0: #undef GET_LO michael@0: #undef GET_HI 
michael@0: /* Helper macros for 2 byte units stored low byte first. When the high byte p[1] is 0 the low byte is handled like a single-byte character (type[] lookup, ASCII value, comparison with c); otherwise the byte type comes from unicode_byte_type, the ASCII value is -1 and the comparison fails. The name tests index the naming tables with the high and low bytes. */ michael@0: #define LITTLE2_BYTE_TYPE(enc, p) \ michael@0: ((p)[1] == 0 \ michael@0: ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ michael@0: : unicode_byte_type((p)[1], (p)[0])) michael@0: #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1) michael@0: #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c) michael@0: #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \ michael@0: UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) michael@0: #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ michael@0: UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) michael@0: /* Under XML_MIN_SIZE the macros above are wrapped in functions and VTABLE is redefined to name the little2_ converters explicitly. In the other configuration xmltok_impl.c is included again with PREFIX() pasting little2_ and the helper macros mapped to the LITTLE2_ forms. */ michael@0: #ifdef XML_MIN_SIZE michael@0: michael@0: static int PTRFASTCALL michael@0: little2_byteType(const ENCODING *enc, const char *p) michael@0: { michael@0: return LITTLE2_BYTE_TYPE(enc, p); michael@0: } michael@0: michael@0: static int PTRFASTCALL michael@0: little2_byteToAscii(const ENCODING *enc, const char *p) michael@0: { michael@0: return LITTLE2_BYTE_TO_ASCII(enc, p); michael@0: } michael@0: michael@0: static int PTRCALL michael@0: little2_charMatches(const ENCODING *enc, const char *p, int c) michael@0: { michael@0: return LITTLE2_CHAR_MATCHES(enc, p, c); michael@0: } michael@0: michael@0: static int PTRFASTCALL michael@0: little2_isNameMin(const ENCODING *enc, const char *p) michael@0: { michael@0: return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p); michael@0: } michael@0: michael@0: static int PTRFASTCALL michael@0: little2_isNmstrtMin(const ENCODING *enc, const char *p) michael@0: { michael@0: return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p); michael@0: } michael@0: michael@0: #undef VTABLE michael@0: #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 michael@0: michael@0: #else /* not XML_MIN_SIZE */ michael@0: michael@0: #undef PREFIX michael@0: #define PREFIX(ident) little2_ ## ident michael@0: #define MINBPC(enc) 2 michael@0: /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. 
*/ michael@0: #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p) michael@0: #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p) michael@0: #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c) michael@0: #define IS_NAME_CHAR(enc, p, n) 0 michael@0: #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) michael@0: #define IS_NMSTRT_CHAR(enc, p, n) (0) michael@0: #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) michael@0: michael@0: #include "xmltok_impl.c" michael@0: michael@0: #undef MINBPC michael@0: #undef BYTE_TYPE michael@0: #undef BYTE_TO_ASCII michael@0: #undef CHAR_MATCHES michael@0: #undef IS_NAME_CHAR michael@0: #undef IS_NAME_CHAR_MINBPC michael@0: #undef IS_NMSTRT_CHAR michael@0: #undef IS_NMSTRT_CHAR_MINBPC michael@0: #undef IS_INVALID_CHAR michael@0: michael@0: #endif /* not XML_MIN_SIZE */ michael@0: /* little2_ tables: the last value in the ENCODING initializer is 1 only when BYTEORDER == 1234; type[] comes from asciitab.h plus latin1tab.h and only the STANDARD_VTABLE(little2_) initializers follow. */ michael@0: #ifdef XML_NS michael@0: michael@0: static const struct normal_encoding little2_encoding_ns = { michael@0: { VTABLE, 2, 0, michael@0: #if BYTEORDER == 1234 michael@0: 1 michael@0: #else michael@0: 0 michael@0: #endif michael@0: }, michael@0: { michael@0: #include "asciitab.h" michael@0: #include "latin1tab.h" michael@0: }, michael@0: STANDARD_VTABLE(little2_) michael@0: }; michael@0: michael@0: #endif michael@0: michael@0: static const struct normal_encoding little2_encoding = { michael@0: { VTABLE, 2, 0, michael@0: #if BYTEORDER == 1234 michael@0: 1 michael@0: #else michael@0: 0 michael@0: #endif michael@0: }, michael@0: { michael@0: #define BT_COLON BT_NMSTRT michael@0: #include "asciitab.h" michael@0: #undef BT_COLON michael@0: #include "latin1tab.h" michael@0: }, michael@0: STANDARD_VTABLE(little2_) michael@0: }; michael@0: /* The internal_little2_ tables are compiled unless BYTEORDER == 4321; they use iasciitab.h and always pass 1 as the last ENCODING value. */ michael@0: #if BYTEORDER != 4321 michael@0: michael@0: #ifdef XML_NS michael@0: michael@0: static const struct normal_encoding internal_little2_encoding_ns = { michael@0: { VTABLE, 2, 0, 1 }, michael@0: { michael@0: #include "iasciitab.h" michael@0: #include 
"latin1tab.h" michael@0: }, michael@0: STANDARD_VTABLE(little2_) michael@0: }; michael@0: michael@0: #endif michael@0: michael@0: static const struct normal_encoding internal_little2_encoding = { michael@0: { VTABLE, 2, 0, 1 }, michael@0: { michael@0: #define BT_COLON BT_NMSTRT michael@0: #include "iasciitab.h" michael@0: #undef BT_COLON michael@0: #include "latin1tab.h" michael@0: }, michael@0: STANDARD_VTABLE(little2_) michael@0: }; michael@0: michael@0: #endif michael@0: /* Helper macros for 2 byte units stored high byte first; they mirror the LITTLE2_ macros with the roles of p[0] and p[1] exchanged. */ michael@0: michael@0: #define BIG2_BYTE_TYPE(enc, p) \ michael@0: ((p)[0] == 0 \ michael@0: ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ michael@0: : unicode_byte_type((p)[0], (p)[1])) michael@0: #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1) michael@0: #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c) michael@0: #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \ michael@0: UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) michael@0: #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ michael@0: UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) michael@0: /* Same two configurations as for little2_: function wrappers plus an explicit VTABLE under XML_MIN_SIZE, otherwise another inclusion of xmltok_impl.c with PREFIX() pasting big2_. */ michael@0: #ifdef XML_MIN_SIZE michael@0: michael@0: static int PTRFASTCALL michael@0: big2_byteType(const ENCODING *enc, const char *p) michael@0: { michael@0: return BIG2_BYTE_TYPE(enc, p); michael@0: } michael@0: michael@0: static int PTRFASTCALL michael@0: big2_byteToAscii(const ENCODING *enc, const char *p) michael@0: { michael@0: return BIG2_BYTE_TO_ASCII(enc, p); michael@0: } michael@0: michael@0: static int PTRCALL michael@0: big2_charMatches(const ENCODING *enc, const char *p, int c) michael@0: { michael@0: return BIG2_CHAR_MATCHES(enc, p, c); michael@0: } michael@0: michael@0: static int PTRFASTCALL michael@0: big2_isNameMin(const ENCODING *enc, const char *p) michael@0: { michael@0: return BIG2_IS_NAME_CHAR_MINBPC(enc, p); michael@0: } michael@0: michael@0: static int PTRFASTCALL michael@0: big2_isNmstrtMin(const ENCODING *enc, const char *p) michael@0: { 
michael@0: return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p); michael@0: } michael@0: michael@0: #undef VTABLE michael@0: #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16 michael@0: michael@0: #else /* not XML_MIN_SIZE */ michael@0: michael@0: #undef PREFIX michael@0: #define PREFIX(ident) big2_ ## ident michael@0: #define MINBPC(enc) 2 michael@0: /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ michael@0: #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) michael@0: #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) michael@0: #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c) michael@0: #define IS_NAME_CHAR(enc, p, n) 0 michael@0: #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p) michael@0: #define IS_NMSTRT_CHAR(enc, p, n) (0) michael@0: #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) michael@0: michael@0: #include "xmltok_impl.c" michael@0: michael@0: #undef MINBPC michael@0: #undef BYTE_TYPE michael@0: #undef BYTE_TO_ASCII michael@0: #undef CHAR_MATCHES michael@0: #undef IS_NAME_CHAR michael@0: #undef IS_NAME_CHAR_MINBPC michael@0: #undef IS_NMSTRT_CHAR michael@0: #undef IS_NMSTRT_CHAR_MINBPC michael@0: #undef IS_INVALID_CHAR michael@0: michael@0: #endif /* not XML_MIN_SIZE */ michael@0: /* big2_ tables: the last value in the ENCODING initializer is 1 only when BYTEORDER == 4321; type[] comes from asciitab.h plus latin1tab.h and only the STANDARD_VTABLE(big2_) initializers follow. */ michael@0: #ifdef XML_NS michael@0: michael@0: static const struct normal_encoding big2_encoding_ns = { michael@0: { VTABLE, 2, 0, michael@0: #if BYTEORDER == 4321 michael@0: 1 michael@0: #else michael@0: 0 michael@0: #endif michael@0: }, michael@0: { michael@0: #include "asciitab.h" michael@0: #include "latin1tab.h" michael@0: }, michael@0: STANDARD_VTABLE(big2_) michael@0: }; michael@0: michael@0: #endif michael@0: michael@0: static const struct normal_encoding big2_encoding = { michael@0: { VTABLE, 2, 0, michael@0: #if BYTEORDER == 4321 michael@0: 1 michael@0: #else michael@0: 0 michael@0: #endif michael@0: }, michael@0: { michael@0: #define BT_COLON BT_NMSTRT michael@0: #include "asciitab.h" michael@0: 
#undef BT_COLON michael@0: #include "latin1tab.h" michael@0: }, michael@0: STANDARD_VTABLE(big2_) michael@0: }; michael@0: /* The internal_big2_ tables are compiled unless BYTEORDER == 1234; they use iasciitab.h and always pass 1 as the last ENCODING value. */ michael@0: #if BYTEORDER != 1234 michael@0: michael@0: #ifdef XML_NS michael@0: michael@0: static const struct normal_encoding internal_big2_encoding_ns = { michael@0: { VTABLE, 2, 0, 1 }, michael@0: { michael@0: #include "iasciitab.h" michael@0: #include "latin1tab.h" michael@0: }, michael@0: STANDARD_VTABLE(big2_) michael@0: }; michael@0: michael@0: #endif michael@0: michael@0: static const struct normal_encoding internal_big2_encoding = { michael@0: { VTABLE, 2, 0, 1 }, michael@0: { michael@0: #define BT_COLON BT_NMSTRT michael@0: #include "iasciitab.h" michael@0: #undef BT_COLON michael@0: #include "latin1tab.h" michael@0: }, michael@0: STANDARD_VTABLE(big2_) michael@0: }; michael@0: michael@0: #endif michael@0: michael@0: #undef PREFIX michael@0: /* Compares two NUL-terminated strings after mapping the letters ASCII_a..ASCII_z to upper case in both. Returns 1 when they are equal, 0 at the first difference. */ michael@0: static int FASTCALL michael@0: streqci(const char *s1, const char *s2) michael@0: { michael@0: for (;;) { michael@0: char c1 = *s1++; michael@0: char c2 = *s2++; michael@0: if (ASCII_a <= c1 && c1 <= ASCII_z) michael@0: c1 += ASCII_A - ASCII_a; michael@0: if (ASCII_a <= c2 && c2 <= ASCII_z) michael@0: c2 += ASCII_A - ASCII_a; michael@0: if (c1 != c2) michael@0: return 0; michael@0: if (!c1) michael@0: break; michael@0: } michael@0: return 1; michael@0: } michael@0: /* Forwards to normal_updatePosition with the utf8_encoding table; the enc argument is ignored. */ michael@0: static void PTRCALL michael@0: initUpdatePosition(const ENCODING *enc, const char *ptr, michael@0: const char *end, POSITION *pos) michael@0: { michael@0: normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); michael@0: } michael@0: /* Calls XmlUtf8Convert on [ptr, end) with a one-byte output buffer. Returns -1 when nothing was written, otherwise the byte that was written. */ michael@0: static int michael@0: toAscii(const ENCODING *enc, const char *ptr, const char *end) michael@0: { michael@0: char buf[1]; michael@0: char *p = buf; michael@0: XmlUtf8Convert(enc, &ptr, end, &p, p + 1); michael@0: if (p == buf) michael@0: return -1; michael@0: else michael@0: return buf[0]; michael@0: } michael@0: /* Returns 1 for 0x20, 0xD, 0xA and 0x9, and 0 for every other value. */ michael@0: static int FASTCALL michael@0: isSpace(int c) 
michael@0: { michael@0: switch (c) { michael@0: case 0x20: michael@0: case 0xD: michael@0: case 0xA: michael@0: case 0x9: michael@0: return 1; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: /* Return 1 if there's just optional white space or there's an S michael@0: followed by name=val. michael@0: */ michael@0: static int michael@0: parsePseudoAttribute(const ENCODING *enc, michael@0: const char *ptr, michael@0: const char *end, michael@0: const char **namePtr, michael@0: const char **nameEndPtr, michael@0: const char **valPtr, michael@0: const char **nextTokPtr) michael@0: { michael@0: int c; michael@0: char open; michael@0: if (ptr == end) { michael@0: *namePtr = NULL; michael@0: return 1; michael@0: } michael@0: if (!isSpace(toAscii(enc, ptr, end))) { michael@0: *nextTokPtr = ptr; michael@0: return 0; michael@0: } michael@0: do { michael@0: ptr += enc->minBytesPerChar; michael@0: } while (isSpace(toAscii(enc, ptr, end))); michael@0: if (ptr == end) { michael@0: *namePtr = NULL; michael@0: return 1; michael@0: } michael@0: *namePtr = ptr; michael@0: for (;;) { michael@0: c = toAscii(enc, ptr, end); michael@0: if (c == -1) { michael@0: *nextTokPtr = ptr; michael@0: return 0; michael@0: } michael@0: if (c == ASCII_EQUALS) { michael@0: *nameEndPtr = ptr; michael@0: break; michael@0: } michael@0: if (isSpace(c)) { michael@0: *nameEndPtr = ptr; michael@0: do { michael@0: ptr += enc->minBytesPerChar; michael@0: } while (isSpace(c = toAscii(enc, ptr, end))); michael@0: if (c != ASCII_EQUALS) { michael@0: *nextTokPtr = ptr; michael@0: return 0; michael@0: } michael@0: break; michael@0: } michael@0: ptr += enc->minBytesPerChar; michael@0: } michael@0: if (ptr == *namePtr) { michael@0: *nextTokPtr = ptr; michael@0: return 0; michael@0: } michael@0: ptr += enc->minBytesPerChar; michael@0: c = toAscii(enc, ptr, end); michael@0: while (isSpace(c)) { michael@0: ptr += enc->minBytesPerChar; michael@0: c = toAscii(enc, ptr, end); michael@0: } michael@0: if 
(c != ASCII_QUOT && c != ASCII_APOS) { michael@0: *nextTokPtr = ptr; michael@0: return 0; michael@0: } michael@0: open = (char)c; michael@0: ptr += enc->minBytesPerChar; michael@0: *valPtr = ptr; michael@0: for (;; ptr += enc->minBytesPerChar) { michael@0: c = toAscii(enc, ptr, end); michael@0: if (c == open) michael@0: break; michael@0: if (!(ASCII_a <= c && c <= ASCII_z) michael@0: && !(ASCII_A <= c && c <= ASCII_Z) michael@0: && !(ASCII_0 <= c && c <= ASCII_9) michael@0: && c != ASCII_PERIOD michael@0: && c != ASCII_MINUS michael@0: && c != ASCII_UNDERSCORE) { michael@0: *nextTokPtr = ptr; michael@0: return 0; michael@0: } michael@0: } michael@0: *nextTokPtr = ptr + enc->minBytesPerChar; michael@0: return 1; michael@0: } michael@0: michael@0: static const char KW_version[] = { michael@0: ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0' michael@0: }; michael@0: michael@0: static const char KW_encoding[] = { michael@0: ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0' michael@0: }; michael@0: michael@0: static const char KW_standalone[] = { michael@0: ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, michael@0: ASCII_n, ASCII_e, '\0' michael@0: }; michael@0: michael@0: static const char KW_yes[] = { michael@0: ASCII_y, ASCII_e, ASCII_s, '\0' michael@0: }; michael@0: michael@0: static const char KW_no[] = { michael@0: ASCII_n, ASCII_o, '\0' michael@0: }; michael@0: michael@0: /* BEGIN MOZILLA CHANGE (http://bugzilla.mozilla.org/show_bug.cgi?id=62157) */ michael@0: static const char KW_XML_1_0[] = { michael@0: ASCII_1, ASCII_PERIOD, ASCII_0, '\0' michael@0: }; michael@0: /* END MOZILLA CHANGE */ michael@0: michael@0: static int michael@0: doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, michael@0: const char *, michael@0: const char *), michael@0: int isGeneralTextEntity, michael@0: const ENCODING *enc, michael@0: const char *ptr, michael@0: const char *end, michael@0: const char 
**badPtr, michael@0: const char **versionPtr, michael@0: const char **versionEndPtr, michael@0: const char **encodingName, michael@0: const ENCODING **encoding, michael@0: int *standalone) michael@0: { michael@0: const char *val = NULL; michael@0: const char *name = NULL; michael@0: const char *nameEnd = NULL; michael@0: ptr += 5 * enc->minBytesPerChar; michael@0: end -= 2 * enc->minBytesPerChar; michael@0: if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) michael@0: || !name) { michael@0: *badPtr = ptr; michael@0: return 0; michael@0: } michael@0: if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) { michael@0: if (!isGeneralTextEntity) { michael@0: *badPtr = name; michael@0: return 0; michael@0: } michael@0: } michael@0: else { michael@0: if (versionPtr) michael@0: *versionPtr = val; michael@0: if (versionEndPtr) michael@0: *versionEndPtr = ptr; michael@0: /* BEGIN MOZILLA CHANGE (http://bugzilla.mozilla.org/show_bug.cgi?id=62157) */ michael@0: /* Anything else but a version="1.0" is invalid for us, until we support later versions. 
*/ michael@0: if (!XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_XML_1_0)) { michael@0: *badPtr = val; michael@0: return 0; michael@0: } michael@0: /* END MOZILLA CHANGE */ michael@0: if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { michael@0: *badPtr = ptr; michael@0: return 0; michael@0: } michael@0: if (!name) { michael@0: if (isGeneralTextEntity) { michael@0: /* a TextDecl must have an EncodingDecl */ michael@0: *badPtr = ptr; michael@0: return 0; michael@0: } michael@0: return 1; michael@0: } michael@0: } michael@0: if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) { michael@0: int c = toAscii(enc, val, end); michael@0: if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) { michael@0: *badPtr = val; michael@0: return 0; michael@0: } michael@0: if (encodingName) michael@0: *encodingName = val; michael@0: if (encoding) michael@0: *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); michael@0: if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { michael@0: *badPtr = ptr; michael@0: return 0; michael@0: } michael@0: if (!name) michael@0: return 1; michael@0: } michael@0: if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) michael@0: || isGeneralTextEntity) { michael@0: *badPtr = name; michael@0: return 0; michael@0: } michael@0: if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) { michael@0: if (standalone) michael@0: *standalone = 1; michael@0: } michael@0: else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) { michael@0: if (standalone) michael@0: *standalone = 0; michael@0: } michael@0: else { michael@0: *badPtr = val; michael@0: return 0; michael@0: } michael@0: while (isSpace(toAscii(enc, ptr, end))) michael@0: ptr += enc->minBytesPerChar; michael@0: if (ptr != end) { michael@0: *badPtr = ptr; michael@0: return 0; michael@0: } michael@0: return 1; michael@0: } michael@0: michael@0: static int FASTCALL michael@0: 
checkCharRefNumber(int result) michael@0: { michael@0: switch (result >> 8) { michael@0: case 0xD8: case 0xD9: case 0xDA: case 0xDB: michael@0: case 0xDC: case 0xDD: case 0xDE: case 0xDF: michael@0: return -1; michael@0: case 0: michael@0: if (latin1_encoding.type[result] == BT_NONXML) michael@0: return -1; michael@0: break; michael@0: case 0xFF: michael@0: if (result == 0xFFFE || result == 0xFFFF) michael@0: return -1; michael@0: break; michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: int FASTCALL michael@0: XmlUtf8Encode(int c, char *buf) michael@0: { michael@0: enum { michael@0: /* minN is minimum legal resulting value for N byte sequence */ michael@0: min2 = 0x80, michael@0: min3 = 0x800, michael@0: min4 = 0x10000 michael@0: }; michael@0: michael@0: if (c < 0) michael@0: return 0; michael@0: if (c < min2) { michael@0: buf[0] = (char)(c | UTF8_cval1); michael@0: return 1; michael@0: } michael@0: if (c < min3) { michael@0: buf[0] = (char)((c >> 6) | UTF8_cval2); michael@0: buf[1] = (char)((c & 0x3f) | 0x80); michael@0: return 2; michael@0: } michael@0: if (c < min4) { michael@0: buf[0] = (char)((c >> 12) | UTF8_cval3); michael@0: buf[1] = (char)(((c >> 6) & 0x3f) | 0x80); michael@0: buf[2] = (char)((c & 0x3f) | 0x80); michael@0: return 3; michael@0: } michael@0: if (c < 0x110000) { michael@0: buf[0] = (char)((c >> 18) | UTF8_cval4); michael@0: buf[1] = (char)(((c >> 12) & 0x3f) | 0x80); michael@0: buf[2] = (char)(((c >> 6) & 0x3f) | 0x80); michael@0: buf[3] = (char)((c & 0x3f) | 0x80); michael@0: return 4; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: int FASTCALL michael@0: XmlUtf16Encode(int charNum, unsigned short *buf) michael@0: { michael@0: if (charNum < 0) michael@0: return 0; michael@0: if (charNum < 0x10000) { michael@0: buf[0] = (unsigned short)charNum; michael@0: return 1; michael@0: } michael@0: if (charNum < 0x110000) { michael@0: charNum -= 0x10000; michael@0: buf[0] = (unsigned short)((charNum >> 10) 
+ 0xD800); michael@0: buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00); michael@0: return 2; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: struct unknown_encoding { michael@0: struct normal_encoding normal; michael@0: CONVERTER convert; michael@0: void *userData; michael@0: unsigned short utf16[256]; michael@0: char utf8[256][4]; michael@0: }; michael@0: michael@0: #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc)) michael@0: michael@0: int michael@0: XmlSizeOfUnknownEncoding(void) michael@0: { michael@0: return sizeof(struct unknown_encoding); michael@0: } michael@0: michael@0: static int PTRFASTCALL michael@0: unknown_isName(const ENCODING *enc, const char *p) michael@0: { michael@0: const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); michael@0: int c = uenc->convert(uenc->userData, p); michael@0: if (c & ~0xFFFF) michael@0: return 0; michael@0: return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); michael@0: } michael@0: michael@0: static int PTRFASTCALL michael@0: unknown_isNmstrt(const ENCODING *enc, const char *p) michael@0: { michael@0: const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); michael@0: int c = uenc->convert(uenc->userData, p); michael@0: if (c & ~0xFFFF) michael@0: return 0; michael@0: return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); michael@0: } michael@0: michael@0: static int PTRFASTCALL michael@0: unknown_isInvalid(const ENCODING *enc, const char *p) michael@0: { michael@0: const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); michael@0: int c = uenc->convert(uenc->userData, p); michael@0: return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; michael@0: } michael@0: michael@0: static void PTRCALL michael@0: unknown_toUtf8(const ENCODING *enc, michael@0: const char **fromP, const char *fromLim, michael@0: char **toP, const char *toLim) michael@0: { michael@0: const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); michael@0: char 
buf[XML_UTF8_ENCODE_MAX]; michael@0: for (;;) { michael@0: const char *utf8; michael@0: int n; michael@0: if (*fromP == fromLim) michael@0: break; michael@0: utf8 = uenc->utf8[(unsigned char)**fromP]; michael@0: n = *utf8++; michael@0: if (n == 0) { michael@0: int c = uenc->convert(uenc->userData, *fromP); michael@0: n = XmlUtf8Encode(c, buf); michael@0: if (n > toLim - *toP) michael@0: break; michael@0: utf8 = buf; michael@0: *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] michael@0: - (BT_LEAD2 - 2)); michael@0: } michael@0: else { michael@0: if (n > toLim - *toP) michael@0: break; michael@0: (*fromP)++; michael@0: } michael@0: do { michael@0: *(*toP)++ = *utf8++; michael@0: } while (--n != 0); michael@0: } michael@0: } michael@0: michael@0: static void PTRCALL michael@0: unknown_toUtf16(const ENCODING *enc, michael@0: const char **fromP, const char *fromLim, michael@0: unsigned short **toP, const unsigned short *toLim) michael@0: { michael@0: const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); michael@0: while (*fromP != fromLim && *toP != toLim) { michael@0: unsigned short c = uenc->utf16[(unsigned char)**fromP]; michael@0: if (c == 0) { michael@0: c = (unsigned short) michael@0: uenc->convert(uenc->userData, *fromP); michael@0: *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] michael@0: - (BT_LEAD2 - 2)); michael@0: } michael@0: else michael@0: (*fromP)++; michael@0: *(*toP)++ = c; michael@0: } michael@0: } michael@0: michael@0: ENCODING * michael@0: XmlInitUnknownEncoding(void *mem, michael@0: int *table, michael@0: CONVERTER convert, michael@0: void *userData) michael@0: { michael@0: int i; michael@0: struct unknown_encoding *e = (struct unknown_encoding *)mem; michael@0: for (i = 0; i < (int)sizeof(struct normal_encoding); i++) michael@0: ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; michael@0: for (i = 0; i < 128; i++) michael@0: if (latin1_encoding.type[i] != BT_OTHER michael@0: && latin1_encoding.type[i] 
!= BT_NONXML michael@0: && table[i] != i) michael@0: return 0; michael@0: for (i = 0; i < 256; i++) { michael@0: int c = table[i]; michael@0: if (c == -1) { michael@0: e->normal.type[i] = BT_MALFORM; michael@0: /* This shouldn't really get used. */ michael@0: e->utf16[i] = 0xFFFF; michael@0: e->utf8[i][0] = 1; michael@0: e->utf8[i][1] = 0; michael@0: } michael@0: else if (c < 0) { michael@0: if (c < -4) michael@0: return 0; michael@0: e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); michael@0: e->utf8[i][0] = 0; michael@0: e->utf16[i] = 0; michael@0: } michael@0: else if (c < 0x80) { michael@0: if (latin1_encoding.type[c] != BT_OTHER michael@0: && latin1_encoding.type[c] != BT_NONXML michael@0: && c != i) michael@0: return 0; michael@0: e->normal.type[i] = latin1_encoding.type[c]; michael@0: e->utf8[i][0] = 1; michael@0: e->utf8[i][1] = (char)c; michael@0: e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c); michael@0: } michael@0: else if (checkCharRefNumber(c) < 0) { michael@0: e->normal.type[i] = BT_NONXML; michael@0: /* This shouldn't really get used. 
*/ michael@0: e->utf16[i] = 0xFFFF; michael@0: e->utf8[i][0] = 1; michael@0: e->utf8[i][1] = 0; michael@0: } michael@0: else { michael@0: if (c > 0xFFFF) michael@0: return 0; michael@0: if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) michael@0: e->normal.type[i] = BT_NMSTRT; michael@0: else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) michael@0: e->normal.type[i] = BT_NAME; michael@0: else michael@0: e->normal.type[i] = BT_OTHER; michael@0: e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); michael@0: e->utf16[i] = (unsigned short)c; michael@0: } michael@0: } michael@0: e->userData = userData; michael@0: e->convert = convert; michael@0: if (convert) { michael@0: e->normal.isName2 = unknown_isName; michael@0: e->normal.isName3 = unknown_isName; michael@0: e->normal.isName4 = unknown_isName; michael@0: e->normal.isNmstrt2 = unknown_isNmstrt; michael@0: e->normal.isNmstrt3 = unknown_isNmstrt; michael@0: e->normal.isNmstrt4 = unknown_isNmstrt; michael@0: e->normal.isInvalid2 = unknown_isInvalid; michael@0: e->normal.isInvalid3 = unknown_isInvalid; michael@0: e->normal.isInvalid4 = unknown_isInvalid; michael@0: } michael@0: e->normal.enc.utf8Convert = unknown_toUtf8; michael@0: e->normal.enc.utf16Convert = unknown_toUtf16; michael@0: return &(e->normal.enc); michael@0: } michael@0: michael@0: /* If this enumeration is changed, getEncodingIndex and encodings michael@0: must also be changed. 
*/ michael@0: enum { michael@0: UNKNOWN_ENC = -1, michael@0: ISO_8859_1_ENC = 0, michael@0: US_ASCII_ENC, michael@0: UTF_8_ENC, michael@0: UTF_16_ENC, michael@0: UTF_16BE_ENC, michael@0: UTF_16LE_ENC, michael@0: /* must match encodingNames up to here */ michael@0: NO_ENC michael@0: }; michael@0: michael@0: static const char KW_ISO_8859_1[] = { michael@0: ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, michael@0: ASCII_MINUS, ASCII_1, '\0' michael@0: }; michael@0: static const char KW_US_ASCII[] = { michael@0: ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, michael@0: '\0' michael@0: }; michael@0: static const char KW_UTF_8[] = { michael@0: ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0' michael@0: }; michael@0: static const char KW_UTF_16[] = { michael@0: ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0' michael@0: }; michael@0: static const char KW_UTF_16BE[] = { michael@0: ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, michael@0: '\0' michael@0: }; michael@0: static const char KW_UTF_16LE[] = { michael@0: ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, michael@0: '\0' michael@0: }; michael@0: michael@0: static int FASTCALL michael@0: getEncodingIndex(const char *name) michael@0: { michael@0: static const char * const encodingNames[] = { michael@0: KW_ISO_8859_1, michael@0: KW_US_ASCII, michael@0: KW_UTF_8, michael@0: KW_UTF_16, michael@0: KW_UTF_16BE, michael@0: KW_UTF_16LE, michael@0: }; michael@0: int i; michael@0: if (name == NULL) michael@0: return NO_ENC; michael@0: for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++) michael@0: if (streqci(name, encodingNames[i])) michael@0: return i; michael@0: return UNKNOWN_ENC; michael@0: } michael@0: michael@0: /* For binary compatibility, we store the index of the encoding michael@0: specified at initialization in the isUtf16 member. 
michael@0: */ michael@0: michael@0: #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) michael@0: #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) michael@0: michael@0: /* This is what detects the encoding. encodingTable maps from michael@0: encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of michael@0: the external (protocol) specified encoding; state is michael@0: XML_CONTENT_STATE if we're parsing an external text entity, and michael@0: XML_PROLOG_STATE otherwise. michael@0: */ michael@0: michael@0: michael@0: static int michael@0: initScan(const ENCODING * const *encodingTable, michael@0: const INIT_ENCODING *enc, michael@0: int state, michael@0: const char *ptr, michael@0: const char *end, michael@0: const char **nextTokPtr) michael@0: { michael@0: const ENCODING **encPtr; michael@0: michael@0: if (ptr == end) michael@0: return XML_TOK_NONE; michael@0: encPtr = enc->encPtr; michael@0: if (ptr + 1 == end) { michael@0: /* only a single byte available for auto-detection */ michael@0: #ifndef XML_DTD /* FIXME */ michael@0: /* a well-formed document entity must have more than one byte */ michael@0: if (state != XML_CONTENT_STATE) michael@0: return XML_TOK_PARTIAL; michael@0: #endif michael@0: /* so we're parsing an external text entity... 
*/ michael@0: /* if UTF-16 was externally specified, then we need at least 2 bytes */ michael@0: switch (INIT_ENC_INDEX(enc)) { michael@0: case UTF_16_ENC: michael@0: case UTF_16LE_ENC: michael@0: case UTF_16BE_ENC: michael@0: return XML_TOK_PARTIAL; michael@0: } michael@0: switch ((unsigned char)*ptr) { michael@0: case 0xFE: michael@0: case 0xFF: michael@0: case 0xEF: /* possibly first byte of UTF-8 BOM */ michael@0: if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC michael@0: && state == XML_CONTENT_STATE) michael@0: break; michael@0: /* fall through */ michael@0: case 0x00: michael@0: case 0x3C: michael@0: return XML_TOK_PARTIAL; michael@0: } michael@0: } michael@0: else { michael@0: switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { michael@0: case 0xFEFF: michael@0: if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC michael@0: && state == XML_CONTENT_STATE) michael@0: break; michael@0: *nextTokPtr = ptr + 2; michael@0: *encPtr = encodingTable[UTF_16BE_ENC]; michael@0: return XML_TOK_BOM; michael@0: /* 00 3C is handled in the default case */ michael@0: case 0x3C00: michael@0: if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC michael@0: || INIT_ENC_INDEX(enc) == UTF_16_ENC) michael@0: && state == XML_CONTENT_STATE) michael@0: break; michael@0: *encPtr = encodingTable[UTF_16LE_ENC]; michael@0: return XmlTok(*encPtr, state, ptr, end, nextTokPtr); michael@0: case 0xFFFE: michael@0: if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC michael@0: && state == XML_CONTENT_STATE) michael@0: break; michael@0: *nextTokPtr = ptr + 2; michael@0: *encPtr = encodingTable[UTF_16LE_ENC]; michael@0: return XML_TOK_BOM; michael@0: case 0xEFBB: michael@0: /* Maybe a UTF-8 BOM (EF BB BF) */ michael@0: /* If there's an explicitly specified (external) encoding michael@0: of ISO-8859-1 or some flavour of UTF-16 michael@0: and this is an external text entity, michael@0: don't look for the BOM, michael@0: because it might be a legal data. 
michael@0: */ michael@0: if (state == XML_CONTENT_STATE) { michael@0: int e = INIT_ENC_INDEX(enc); michael@0: if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC michael@0: || e == UTF_16LE_ENC || e == UTF_16_ENC) michael@0: break; michael@0: } michael@0: if (ptr + 2 == end) michael@0: return XML_TOK_PARTIAL; michael@0: if ((unsigned char)ptr[2] == 0xBF) { michael@0: *nextTokPtr = ptr + 3; michael@0: *encPtr = encodingTable[UTF_8_ENC]; michael@0: return XML_TOK_BOM; michael@0: } michael@0: break; michael@0: default: michael@0: if (ptr[0] == '\0') { michael@0: /* 0 isn't a legal data character. Furthermore a document michael@0: entity can only start with ASCII characters. So the only michael@0: way this can fail to be big-endian UTF-16 if it it's an michael@0: external parsed general entity that's labelled as michael@0: UTF-16LE. michael@0: */ michael@0: if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC) michael@0: break; michael@0: *encPtr = encodingTable[UTF_16BE_ENC]; michael@0: return XmlTok(*encPtr, state, ptr, end, nextTokPtr); michael@0: } michael@0: else if (ptr[1] == '\0') { michael@0: /* We could recover here in the case: michael@0: - parsing an external entity michael@0: - second byte is 0 michael@0: - no externally specified encoding michael@0: - no encoding declaration michael@0: by assuming UTF-16LE. But we don't, because this would mean when michael@0: presented just with a single byte, we couldn't reliably determine michael@0: whether we needed further bytes. 
michael@0: */ michael@0: if (state == XML_CONTENT_STATE) michael@0: break; michael@0: *encPtr = encodingTable[UTF_16LE_ENC]; michael@0: return XmlTok(*encPtr, state, ptr, end, nextTokPtr); michael@0: } michael@0: break; michael@0: } michael@0: } michael@0: *encPtr = encodingTable[INIT_ENC_INDEX(enc)]; michael@0: return XmlTok(*encPtr, state, ptr, end, nextTokPtr); michael@0: } michael@0: michael@0: michael@0: #define NS(x) x michael@0: #define ns(x) x michael@0: #include "xmltok_ns.c" michael@0: #undef NS michael@0: #undef ns michael@0: michael@0: #ifdef XML_NS michael@0: michael@0: #define NS(x) x ## NS michael@0: #define ns(x) x ## _ns michael@0: michael@0: #include "xmltok_ns.c" michael@0: michael@0: #undef NS michael@0: #undef ns michael@0: michael@0: ENCODING * michael@0: XmlInitUnknownEncodingNS(void *mem, michael@0: int *table, michael@0: CONVERTER convert, michael@0: void *userData) michael@0: { michael@0: ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); michael@0: if (enc) michael@0: ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON; michael@0: return enc; michael@0: } michael@0: michael@0: #endif /* XML_NS */ michael@0: michael@0: /* BEGIN MOZILLA CHANGE (Mozilla extensions for QName checking) */ michael@0: #ifdef MOZILLA_CLIENT michael@0: #include "moz_extensions.c" michael@0: #endif /* MOZILLA_CLIENT */ michael@0: /* END MOZILLA CHANGE */