michael@0: /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd michael@0: See the file COPYING for copying permission. michael@0: */ michael@0: michael@0: #ifndef IS_INVALID_CHAR michael@0: #define IS_INVALID_CHAR(enc, ptr, n) (0) michael@0: #endif michael@0: michael@0: #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \ michael@0: case BT_LEAD ## n: \ michael@0: if (end - ptr < n) \ michael@0: return XML_TOK_PARTIAL_CHAR; \ michael@0: if (IS_INVALID_CHAR(enc, ptr, n)) { \ michael@0: *(nextTokPtr) = (ptr); \ michael@0: return XML_TOK_INVALID; \ michael@0: } \ michael@0: ptr += n; \ michael@0: break; michael@0: michael@0: #define INVALID_CASES(ptr, nextTokPtr) \ michael@0: INVALID_LEAD_CASE(2, ptr, nextTokPtr) \ michael@0: INVALID_LEAD_CASE(3, ptr, nextTokPtr) \ michael@0: INVALID_LEAD_CASE(4, ptr, nextTokPtr) \ michael@0: case BT_NONXML: \ michael@0: case BT_MALFORM: \ michael@0: case BT_TRAIL: \ michael@0: *(nextTokPtr) = (ptr); \ michael@0: return XML_TOK_INVALID; michael@0: michael@0: #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \ michael@0: case BT_LEAD ## n: \ michael@0: if (end - ptr < n) \ michael@0: return XML_TOK_PARTIAL_CHAR; \ michael@0: if (!IS_NAME_CHAR(enc, ptr, n)) { \ michael@0: *nextTokPtr = ptr; \ michael@0: return XML_TOK_INVALID; \ michael@0: } \ michael@0: ptr += n; \ michael@0: break; michael@0: michael@0: #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \ michael@0: case BT_NONASCII: \ michael@0: if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \ michael@0: *nextTokPtr = ptr; \ michael@0: return XML_TOK_INVALID; \ michael@0: } \ michael@0: case BT_NMSTRT: \ michael@0: case BT_HEX: \ michael@0: case BT_DIGIT: \ michael@0: case BT_NAME: \ michael@0: case BT_MINUS: \ michael@0: ptr += MINBPC(enc); \ michael@0: break; \ michael@0: CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \ michael@0: CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \ michael@0: CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr) michael@0: michael@0: #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \ michael@0: case BT_LEAD ## n: \ michael@0: if (end - ptr < n) \ michael@0: return XML_TOK_PARTIAL_CHAR; \ michael@0: if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \ michael@0: *nextTokPtr = ptr; \ michael@0: return XML_TOK_INVALID; \ michael@0: } \ michael@0: ptr += n; \ michael@0: break; michael@0: michael@0: #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \ michael@0: case BT_NONASCII: \ michael@0: if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \ michael@0: *nextTokPtr = ptr; \ michael@0: return XML_TOK_INVALID; \ michael@0: } \ michael@0: case BT_NMSTRT: \ michael@0: case BT_HEX: \ michael@0: ptr += MINBPC(enc); \ michael@0: break; \ michael@0: CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \ michael@0: CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \ michael@0: CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr) michael@0: michael@0: #ifndef PREFIX michael@0: #define PREFIX(ident) ident michael@0: #endif michael@0: michael@0: /* ptr points to character following " */ michael@0: switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) { michael@0: case BT_S: case BT_CR: case BT_LF: case BT_PERCNT: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: } michael@0: /* fall through */ michael@0: case BT_S: case BT_CR: case BT_LF: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DECL_OPEN; michael@0: case BT_NMSTRT: michael@0: case BT_HEX: michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: default: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: } michael@0: } michael@0: return XML_TOK_PARTIAL; michael@0: } michael@0: michael@0: static int PTRCALL michael@0: PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, michael@0: const char *end, int *tokPtr) michael@0: { michael@0: int upper = 0; michael@0: *tokPtr = XML_TOK_PI; michael@0: if (end - ptr != MINBPC(enc)*3) michael@0: return 1; michael@0: switch (BYTE_TO_ASCII(enc, ptr)) { michael@0: case ASCII_x: michael@0: break; michael@0: case ASCII_X: michael@0: upper = 1; michael@0: break; michael@0: default: michael@0: return 1; michael@0: } michael@0: ptr += MINBPC(enc); michael@0: switch (BYTE_TO_ASCII(enc, ptr)) { michael@0: case ASCII_m: michael@0: break; michael@0: case ASCII_M: michael@0: upper = 1; michael@0: break; michael@0: default: michael@0: return 1; michael@0: } michael@0: ptr += MINBPC(enc); michael@0: switch (BYTE_TO_ASCII(enc, ptr)) { michael@0: case ASCII_l: michael@0: break; michael@0: case ASCII_L: michael@0: upper = 1; michael@0: break; michael@0: default: michael@0: return 1; michael@0: } michael@0: if (upper) michael@0: return 0; michael@0: *tokPtr = XML_TOK_XML_DECL; michael@0: return 1; michael@0: } michael@0: michael@0: /* ptr points to character following " 1) { michael@0: size_t n = end - ptr; michael@0: if (n & (MINBPC(enc) - 1)) { michael@0: n &= ~(MINBPC(enc) - 1); michael@0: if (n == 0) michael@0: return XML_TOK_PARTIAL; michael@0: end = ptr + n; michael@0: } michael@0: } michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: case BT_RSQB: michael@0: ptr += MINBPC(enc); michael@0: if (ptr == end) michael@0: return XML_TOK_PARTIAL; michael@0: if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB)) michael@0: break; michael@0: ptr += MINBPC(enc); michael@0: if (ptr == end) michael@0: return XML_TOK_PARTIAL; michael@0: if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { michael@0: ptr -= MINBPC(enc); michael@0: break; michael@0: } michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_CDATA_SECT_CLOSE; michael@0: case BT_CR: michael@0: ptr += MINBPC(enc); michael@0: if (ptr == end) michael@0: return XML_TOK_PARTIAL; michael@0: if (BYTE_TYPE(enc, ptr) == BT_LF) michael@0: ptr += MINBPC(enc); michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_NEWLINE; michael@0: case BT_LF: michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_DATA_NEWLINE; michael@0: INVALID_CASES(ptr, nextTokPtr) michael@0: default: michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: } michael@0: while (ptr != end) { michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: #define LEAD_CASE(n) \ michael@0: case BT_LEAD ## n: \ michael@0: if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ michael@0: *nextTokPtr = ptr; \ michael@0: return XML_TOK_DATA_CHARS; \ michael@0: } \ michael@0: ptr += n; \ michael@0: break; michael@0: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) michael@0: #undef LEAD_CASE michael@0: case BT_NONXML: michael@0: case BT_MALFORM: michael@0: case BT_TRAIL: michael@0: case BT_CR: michael@0: case BT_LF: michael@0: case BT_RSQB: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_CHARS; michael@0: default: michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: } michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_CHARS; michael@0: } michael@0: michael@0: /* ptr points to character following " 1) { michael@0: size_t n = end - ptr; michael@0: if (n & (MINBPC(enc) - 1)) { michael@0: n &= ~(MINBPC(enc) - 1); michael@0: if (n == 0) michael@0: return XML_TOK_PARTIAL; michael@0: end = ptr + n; michael@0: } michael@0: } michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: case BT_LT: michael@0: return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr); michael@0: case BT_AMP: michael@0: return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); michael@0: case BT_CR: michael@0: ptr += MINBPC(enc); michael@0: if (ptr == end) michael@0: return XML_TOK_TRAILING_CR; michael@0: if (BYTE_TYPE(enc, ptr) == BT_LF) michael@0: ptr += MINBPC(enc); michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_NEWLINE; michael@0: case BT_LF: michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_DATA_NEWLINE; michael@0: case BT_RSQB: michael@0: ptr += MINBPC(enc); michael@0: if (ptr == end) michael@0: return XML_TOK_TRAILING_RSQB; michael@0: if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB)) michael@0: break; michael@0: ptr += MINBPC(enc); michael@0: if (ptr == end) michael@0: return XML_TOK_TRAILING_RSQB; michael@0: if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { michael@0: ptr -= MINBPC(enc); michael@0: break; michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: INVALID_CASES(ptr, nextTokPtr) michael@0: default: michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: } michael@0: while (ptr != end) { michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: #define LEAD_CASE(n) \ michael@0: case BT_LEAD ## n: \ michael@0: if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ michael@0: *nextTokPtr = ptr; \ michael@0: return XML_TOK_DATA_CHARS; \ michael@0: } \ michael@0: ptr += n; \ michael@0: break; michael@0: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) michael@0: #undef LEAD_CASE michael@0: case BT_RSQB: michael@0: if (ptr + MINBPC(enc) != end) { michael@0: if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) { michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: } michael@0: if (ptr + 2*MINBPC(enc) != end) { michael@0: if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) { michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: } michael@0: *nextTokPtr = ptr + 2*MINBPC(enc); michael@0: return XML_TOK_INVALID; michael@0: } michael@0: } michael@0: /* fall through */ michael@0: case BT_AMP: michael@0: case BT_LT: michael@0: case BT_NONXML: michael@0: case BT_MALFORM: michael@0: case BT_TRAIL: michael@0: case BT_CR: michael@0: case BT_LF: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_CHARS; michael@0: default: michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: } michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_CHARS; michael@0: } michael@0: michael@0: /* ptr points to character following "%" */ michael@0: michael@0: static int PTRCALL michael@0: PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end, michael@0: const char **nextTokPtr) michael@0: { michael@0: if (ptr == end) michael@0: return -XML_TOK_PERCENT; michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) michael@0: case BT_S: case BT_LF: case BT_CR: case BT_PERCNT: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_PERCENT; michael@0: default: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: } michael@0: while (ptr != end) { michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) michael@0: case BT_SEMI: michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_PARAM_ENTITY_REF; michael@0: default: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: } michael@0: } michael@0: return XML_TOK_PARTIAL; michael@0: } michael@0: michael@0: static int PTRCALL michael@0: PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end, michael@0: const char **nextTokPtr) michael@0: { michael@0: if (ptr == end) michael@0: return XML_TOK_PARTIAL; michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) michael@0: default: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: } michael@0: while (ptr != end) { michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) michael@0: case BT_CR: case BT_LF: case BT_S: michael@0: case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_POUND_NAME; michael@0: default: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: } michael@0: } michael@0: return -XML_TOK_POUND_NAME; michael@0: } michael@0: michael@0: static int PTRCALL michael@0: PREFIX(scanLit)(int open, const ENCODING *enc, michael@0: const char *ptr, const char *end, michael@0: const char **nextTokPtr) michael@0: { michael@0: while (ptr != end) { michael@0: int t = BYTE_TYPE(enc, ptr); michael@0: switch (t) { michael@0: INVALID_CASES(ptr, nextTokPtr) michael@0: case BT_QUOT: michael@0: case BT_APOS: michael@0: ptr += MINBPC(enc); michael@0: if (t != open) michael@0: break; michael@0: if (ptr == end) michael@0: return -XML_TOK_LITERAL; michael@0: *nextTokPtr = ptr; michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: case BT_S: case BT_CR: case BT_LF: michael@0: case BT_GT: case BT_PERCNT: case BT_LSQB: michael@0: return XML_TOK_LITERAL; michael@0: default: michael@0: return XML_TOK_INVALID; michael@0: } michael@0: default: michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: } michael@0: } michael@0: return XML_TOK_PARTIAL; michael@0: } michael@0: michael@0: static int PTRCALL michael@0: PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, michael@0: const char **nextTokPtr) michael@0: { michael@0: int tok; michael@0: if (ptr == end) michael@0: return XML_TOK_NONE; michael@0: if (MINBPC(enc) > 1) { michael@0: size_t n = end - ptr; michael@0: if (n & (MINBPC(enc) - 1)) { michael@0: n &= ~(MINBPC(enc) - 1); michael@0: if (n == 0) michael@0: return XML_TOK_PARTIAL; michael@0: end = ptr + n; michael@0: } michael@0: } michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: case BT_QUOT: michael@0: return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr); michael@0: case BT_APOS: michael@0: return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr); michael@0: case BT_LT: michael@0: { michael@0: ptr += MINBPC(enc); michael@0: if (ptr == end) michael@0: return XML_TOK_PARTIAL; michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: case BT_EXCL: michael@0: return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr); michael@0: case BT_QUEST: michael@0: return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr); michael@0: case BT_NMSTRT: michael@0: case BT_HEX: michael@0: case BT_NONASCII: michael@0: case BT_LEAD2: michael@0: case BT_LEAD3: michael@0: case BT_LEAD4: michael@0: *nextTokPtr = ptr - MINBPC(enc); michael@0: return XML_TOK_INSTANCE_START; michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: } michael@0: case BT_CR: michael@0: if (ptr + MINBPC(enc) == end) { michael@0: *nextTokPtr = end; michael@0: /* indicate that this might be part of a CR/LF pair */ michael@0: return -XML_TOK_PROLOG_S; michael@0: } michael@0: /* fall through */ michael@0: case BT_S: case BT_LF: michael@0: for (;;) { michael@0: ptr += MINBPC(enc); michael@0: if (ptr == end) michael@0: break; michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: case BT_S: case BT_LF: michael@0: break; michael@0: case BT_CR: michael@0: /* don't split CR/LF pair */ michael@0: if (ptr + MINBPC(enc) != end) michael@0: break; michael@0: /* fall through */ michael@0: default: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_PROLOG_S; michael@0: } michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_PROLOG_S; michael@0: case BT_PERCNT: michael@0: return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr); michael@0: case BT_COMMA: michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_COMMA; michael@0: case BT_LSQB: michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_OPEN_BRACKET; michael@0: case BT_RSQB: michael@0: ptr += MINBPC(enc); michael@0: if (ptr == end) michael@0: return -XML_TOK_CLOSE_BRACKET; michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { michael@0: if (ptr + MINBPC(enc) == end) michael@0: return XML_TOK_PARTIAL; michael@0: if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) { michael@0: *nextTokPtr = ptr + 2*MINBPC(enc); michael@0: return XML_TOK_COND_SECT_CLOSE; michael@0: } michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_CLOSE_BRACKET; michael@0: case BT_LPAR: michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_OPEN_PAREN; michael@0: case BT_RPAR: michael@0: ptr += MINBPC(enc); michael@0: if (ptr == end) michael@0: return -XML_TOK_CLOSE_PAREN; michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: case BT_AST: michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_CLOSE_PAREN_ASTERISK; michael@0: case BT_QUEST: michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_CLOSE_PAREN_QUESTION; michael@0: case BT_PLUS: michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_CLOSE_PAREN_PLUS; michael@0: case BT_CR: case BT_LF: case BT_S: michael@0: case BT_GT: case BT_COMMA: case BT_VERBAR: michael@0: case BT_RPAR: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_CLOSE_PAREN; michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: case BT_VERBAR: michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_OR; michael@0: case BT_GT: michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_DECL_CLOSE; michael@0: case BT_NUM: michael@0: return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr); michael@0: #define LEAD_CASE(n) \ michael@0: case BT_LEAD ## n: \ michael@0: if (end - ptr < n) \ michael@0: return XML_TOK_PARTIAL_CHAR; \ michael@0: if (IS_NMSTRT_CHAR(enc, ptr, n)) { \ michael@0: ptr += n; \ michael@0: tok = XML_TOK_NAME; \ michael@0: break; \ michael@0: } \ michael@0: if (IS_NAME_CHAR(enc, ptr, n)) { \ michael@0: ptr += n; \ michael@0: tok = XML_TOK_NMTOKEN; \ michael@0: break; \ michael@0: } \ michael@0: *nextTokPtr = ptr; \ michael@0: return XML_TOK_INVALID; michael@0: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) michael@0: #undef LEAD_CASE michael@0: case BT_NMSTRT: michael@0: case BT_HEX: michael@0: tok = XML_TOK_NAME; michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: case BT_DIGIT: michael@0: case BT_NAME: michael@0: case BT_MINUS: michael@0: #ifdef XML_NS michael@0: case BT_COLON: michael@0: #endif michael@0: tok = XML_TOK_NMTOKEN; michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: case BT_NONASCII: michael@0: if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { michael@0: ptr += MINBPC(enc); michael@0: tok = XML_TOK_NAME; michael@0: break; michael@0: } michael@0: if (IS_NAME_CHAR_MINBPC(enc, ptr)) { michael@0: ptr += MINBPC(enc); michael@0: tok = XML_TOK_NMTOKEN; michael@0: break; michael@0: } michael@0: /* fall through */ michael@0: default: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: } michael@0: while (ptr != end) { michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) michael@0: case BT_GT: case BT_RPAR: case BT_COMMA: michael@0: case BT_VERBAR: case BT_LSQB: case BT_PERCNT: michael@0: case BT_S: case BT_CR: case BT_LF: michael@0: *nextTokPtr = ptr; michael@0: return tok; michael@0: #ifdef XML_NS michael@0: case BT_COLON: michael@0: ptr += MINBPC(enc); michael@0: switch (tok) { michael@0: case XML_TOK_NAME: michael@0: if (ptr == end) michael@0: return XML_TOK_PARTIAL; michael@0: tok = XML_TOK_PREFIXED_NAME; michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) michael@0: default: michael@0: tok = XML_TOK_NMTOKEN; michael@0: break; michael@0: } michael@0: break; michael@0: case XML_TOK_PREFIXED_NAME: michael@0: tok = XML_TOK_NMTOKEN; michael@0: break; michael@0: } michael@0: break; michael@0: #endif michael@0: case BT_PLUS: michael@0: if (tok == XML_TOK_NMTOKEN) { michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: } michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_NAME_PLUS; michael@0: case BT_AST: michael@0: if (tok == XML_TOK_NMTOKEN) { michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: } michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_NAME_ASTERISK; michael@0: case BT_QUEST: michael@0: if (tok == XML_TOK_NMTOKEN) { michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: } michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_NAME_QUESTION; michael@0: default: michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: } michael@0: } michael@0: return -tok; michael@0: } michael@0: michael@0: static int PTRCALL michael@0: PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, michael@0: const char *end, const char **nextTokPtr) michael@0: { michael@0: const char *start; michael@0: if (ptr == end) michael@0: return XML_TOK_NONE; michael@0: start = ptr; michael@0: while (ptr != end) { michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: #define LEAD_CASE(n) \ michael@0: case BT_LEAD ## n: ptr += n; break; michael@0: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) michael@0: #undef LEAD_CASE michael@0: case BT_AMP: michael@0: if (ptr == start) michael@0: return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_CHARS; michael@0: case BT_LT: michael@0: /* this is for inside entity references */ michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_INVALID; michael@0: case BT_LF: michael@0: if (ptr == start) { michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_DATA_NEWLINE; michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_CHARS; michael@0: case BT_CR: michael@0: if (ptr == start) { michael@0: ptr += MINBPC(enc); michael@0: if (ptr == end) michael@0: return XML_TOK_TRAILING_CR; michael@0: if (BYTE_TYPE(enc, ptr) == BT_LF) michael@0: ptr += MINBPC(enc); michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_NEWLINE; michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_CHARS; michael@0: case BT_S: michael@0: if (ptr == start) { michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_ATTRIBUTE_VALUE_S; michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_CHARS; michael@0: default: michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: } michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_CHARS; michael@0: } michael@0: michael@0: static int PTRCALL michael@0: PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, michael@0: const char *end, const char **nextTokPtr) michael@0: { michael@0: const char *start; michael@0: if (ptr == end) michael@0: return XML_TOK_NONE; michael@0: start = ptr; michael@0: while (ptr != end) { michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: #define LEAD_CASE(n) \ michael@0: case BT_LEAD ## n: ptr += n; break; michael@0: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) michael@0: #undef LEAD_CASE michael@0: case BT_AMP: michael@0: if (ptr == start) michael@0: return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_CHARS; michael@0: case BT_PERCNT: michael@0: if (ptr == start) { michael@0: int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), michael@0: end, nextTokPtr); michael@0: return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok; michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_CHARS; michael@0: case BT_LF: michael@0: if (ptr == start) { michael@0: *nextTokPtr = ptr + MINBPC(enc); michael@0: return XML_TOK_DATA_NEWLINE; michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_CHARS; michael@0: case BT_CR: michael@0: if (ptr == start) { michael@0: ptr += MINBPC(enc); michael@0: if (ptr == end) michael@0: return XML_TOK_TRAILING_CR; michael@0: if (BYTE_TYPE(enc, ptr) == BT_LF) michael@0: ptr += MINBPC(enc); michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_NEWLINE; michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_CHARS; michael@0: default: michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: } michael@0: } michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_DATA_CHARS; michael@0: } michael@0: michael@0: #ifdef XML_DTD michael@0: michael@0: static int PTRCALL michael@0: PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, michael@0: const char *end, const char **nextTokPtr) michael@0: { michael@0: int level = 0; michael@0: if (MINBPC(enc) > 1) { michael@0: size_t n = end - ptr; michael@0: if (n & (MINBPC(enc) - 1)) { michael@0: n &= ~(MINBPC(enc) - 1); michael@0: end = ptr + n; michael@0: } michael@0: } michael@0: while (ptr != end) { michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: INVALID_CASES(ptr, nextTokPtr) michael@0: case BT_LT: michael@0: if ((ptr += MINBPC(enc)) == end) michael@0: return XML_TOK_PARTIAL; michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) { michael@0: if ((ptr += MINBPC(enc)) == end) michael@0: return XML_TOK_PARTIAL; michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) { michael@0: ++level; michael@0: ptr += MINBPC(enc); michael@0: } michael@0: } michael@0: break; michael@0: case BT_RSQB: michael@0: if ((ptr += MINBPC(enc)) == end) michael@0: return XML_TOK_PARTIAL; michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { michael@0: if ((ptr += MINBPC(enc)) == end) michael@0: return XML_TOK_PARTIAL; michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { michael@0: ptr += MINBPC(enc); michael@0: if (level == 0) { michael@0: *nextTokPtr = ptr; michael@0: return XML_TOK_IGNORE_SECT; michael@0: } michael@0: --level; michael@0: } michael@0: } michael@0: break; michael@0: default: michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: } michael@0: } michael@0: return XML_TOK_PARTIAL; michael@0: } michael@0: michael@0: #endif /* XML_DTD */ michael@0: michael@0: static int PTRCALL michael@0: PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end, michael@0: const char **badPtr) michael@0: { michael@0: ptr += MINBPC(enc); michael@0: end -= MINBPC(enc); michael@0: for (; ptr != end; ptr += MINBPC(enc)) { michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: case BT_DIGIT: michael@0: case BT_HEX: michael@0: case BT_MINUS: michael@0: case BT_APOS: michael@0: case BT_LPAR: michael@0: case BT_RPAR: michael@0: case BT_PLUS: michael@0: case BT_COMMA: michael@0: case BT_SOL: michael@0: case BT_EQUALS: michael@0: case BT_QUEST: michael@0: case BT_CR: michael@0: case BT_LF: michael@0: case BT_SEMI: michael@0: case BT_EXCL: michael@0: case BT_AST: michael@0: case BT_PERCNT: michael@0: case BT_NUM: michael@0: #ifdef XML_NS michael@0: case BT_COLON: michael@0: #endif michael@0: break; michael@0: case BT_S: michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) { michael@0: *badPtr = ptr; michael@0: return 0; michael@0: } michael@0: break; michael@0: case BT_NAME: michael@0: case BT_NMSTRT: michael@0: if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f)) michael@0: break; michael@0: default: michael@0: switch (BYTE_TO_ASCII(enc, ptr)) { michael@0: case 0x24: /* $ */ michael@0: case 0x40: /* @ */ michael@0: break; michael@0: default: michael@0: *badPtr = ptr; michael@0: return 0; michael@0: } michael@0: break; michael@0: } michael@0: } michael@0: return 1; michael@0: } michael@0: michael@0: /* This must only be called for a well-formed start-tag or empty michael@0: element tag. Returns the number of attributes. Pointers to the michael@0: first attsMax attributes are stored in atts. michael@0: */ michael@0: michael@0: static int PTRCALL michael@0: PREFIX(getAtts)(const ENCODING *enc, const char *ptr, michael@0: int attsMax, ATTRIBUTE *atts) michael@0: { michael@0: enum { other, inName, inValue } state = inName; michael@0: int nAtts = 0; michael@0: int open = 0; /* defined when state == inValue; michael@0: initialization just to shut up compilers */ michael@0: michael@0: for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) { michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: #define START_NAME \ michael@0: if (state == other) { \ michael@0: if (nAtts < attsMax) { \ michael@0: atts[nAtts].name = ptr; \ michael@0: atts[nAtts].normalized = 1; \ michael@0: } \ michael@0: state = inName; \ michael@0: } michael@0: #define LEAD_CASE(n) \ michael@0: case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break; michael@0: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) michael@0: #undef LEAD_CASE michael@0: case BT_NONASCII: michael@0: case BT_NMSTRT: michael@0: case BT_HEX: michael@0: START_NAME michael@0: break; michael@0: #undef START_NAME michael@0: case BT_QUOT: michael@0: if (state != inValue) { michael@0: if (nAtts < attsMax) michael@0: atts[nAtts].valuePtr = ptr + MINBPC(enc); michael@0: state = inValue; michael@0: open = BT_QUOT; michael@0: } michael@0: else if (open == BT_QUOT) { michael@0: state = other; michael@0: if (nAtts < attsMax) michael@0: atts[nAtts].valueEnd = ptr; michael@0: nAtts++; michael@0: } michael@0: break; michael@0: case BT_APOS: michael@0: if (state != inValue) { michael@0: if (nAtts < attsMax) michael@0: atts[nAtts].valuePtr = ptr + MINBPC(enc); michael@0: state = inValue; michael@0: open = BT_APOS; michael@0: } michael@0: else if (open == BT_APOS) { michael@0: state = other; michael@0: if (nAtts < attsMax) michael@0: atts[nAtts].valueEnd = ptr; michael@0: nAtts++; michael@0: } michael@0: break; michael@0: case BT_AMP: michael@0: if (nAtts < attsMax) michael@0: atts[nAtts].normalized = 0; michael@0: break; michael@0: case BT_S: michael@0: if (state == inName) michael@0: state = other; michael@0: else if (state == inValue michael@0: && nAtts < attsMax michael@0: && atts[nAtts].normalized michael@0: && (ptr == atts[nAtts].valuePtr michael@0: || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE michael@0: || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE michael@0: || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open)) michael@0: atts[nAtts].normalized = 0; michael@0: break; michael@0: case BT_CR: case BT_LF: michael@0: /* This case ensures that the first attribute name is counted michael@0: Apart from that we could just change state on the quote. */ michael@0: if (state == inName) michael@0: state = other; michael@0: else if (state == inValue && nAtts < attsMax) michael@0: atts[nAtts].normalized = 0; michael@0: break; michael@0: case BT_GT: michael@0: case BT_SOL: michael@0: if (state != inValue) michael@0: return nAtts; michael@0: break; michael@0: default: michael@0: break; michael@0: } michael@0: } michael@0: /* not reached */ michael@0: } michael@0: michael@0: static int PTRFASTCALL michael@0: PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) michael@0: { michael@0: int result = 0; michael@0: /* skip &# */ michael@0: ptr += 2*MINBPC(enc); michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_x)) { michael@0: for (ptr += MINBPC(enc); michael@0: !CHAR_MATCHES(enc, ptr, ASCII_SEMI); michael@0: ptr += MINBPC(enc)) { michael@0: int c = BYTE_TO_ASCII(enc, ptr); michael@0: switch (c) { michael@0: case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4: michael@0: case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9: michael@0: result <<= 4; michael@0: result |= (c - ASCII_0); michael@0: break; michael@0: case ASCII_A: case ASCII_B: case ASCII_C: michael@0: case ASCII_D: case ASCII_E: case ASCII_F: michael@0: result <<= 4; michael@0: result += 10 + (c - ASCII_A); michael@0: break; michael@0: case ASCII_a: case ASCII_b: case ASCII_c: michael@0: case ASCII_d: case ASCII_e: case ASCII_f: michael@0: result <<= 4; michael@0: result += 10 + (c - ASCII_a); michael@0: break; michael@0: } michael@0: if (result >= 0x110000) michael@0: return -1; michael@0: } michael@0: } michael@0: else { michael@0: for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) { michael@0: int c = BYTE_TO_ASCII(enc, ptr); michael@0: result *= 10; michael@0: result += (c - ASCII_0); michael@0: if (result >= 0x110000) michael@0: return -1; michael@0: } michael@0: } michael@0: return checkCharRefNumber(result); michael@0: } michael@0: michael@0: static int PTRCALL michael@0: PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, michael@0: const char *end) michael@0: { michael@0: switch ((end - ptr)/MINBPC(enc)) { michael@0: case 2: michael@0: if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) { michael@0: switch (BYTE_TO_ASCII(enc, ptr)) { michael@0: case ASCII_l: michael@0: return ASCII_LT; michael@0: case ASCII_g: michael@0: return ASCII_GT; michael@0: } michael@0: } michael@0: break; michael@0: case 3: michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_a)) { michael@0: ptr += MINBPC(enc); michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_m)) { michael@0: ptr += MINBPC(enc); michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_p)) michael@0: return ASCII_AMP; michael@0: } michael@0: } michael@0: break; michael@0: case 4: michael@0: switch (BYTE_TO_ASCII(enc, ptr)) { michael@0: case ASCII_q: michael@0: ptr += MINBPC(enc); michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_u)) { michael@0: ptr += MINBPC(enc); michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_o)) { michael@0: ptr += MINBPC(enc); michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_t)) michael@0: return ASCII_QUOT; michael@0: } michael@0: } michael@0: break; michael@0: case ASCII_a: michael@0: ptr += MINBPC(enc); michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_p)) { michael@0: ptr += MINBPC(enc); michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_o)) { michael@0: ptr += MINBPC(enc); michael@0: if (CHAR_MATCHES(enc, ptr, ASCII_s)) michael@0: return ASCII_APOS; michael@0: } michael@0: } michael@0: break; michael@0: } michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: static int PTRCALL michael@0: PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2) michael@0: { michael@0: for (;;) { michael@0: switch (BYTE_TYPE(enc, ptr1)) { michael@0: #define LEAD_CASE(n) \ michael@0: case BT_LEAD ## n: \ michael@0: if (*ptr1++ != *ptr2++) \ michael@0: return 0; michael@0: LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2) michael@0: #undef LEAD_CASE michael@0: /* fall through */ michael@0: if (*ptr1++ != *ptr2++) michael@0: return 0; michael@0: break; michael@0: case BT_NONASCII: michael@0: case BT_NMSTRT: michael@0: #ifdef XML_NS michael@0: case BT_COLON: michael@0: #endif michael@0: case BT_HEX: michael@0: case BT_DIGIT: michael@0: case BT_NAME: michael@0: case BT_MINUS: michael@0: if (*ptr2++ != *ptr1++) michael@0: return 0; michael@0: if (MINBPC(enc) > 1) { michael@0: if (*ptr2++ != *ptr1++) michael@0: return 0; michael@0: if (MINBPC(enc) > 2) { michael@0: if (*ptr2++ != *ptr1++) michael@0: return 0; michael@0: if (MINBPC(enc) > 3) { michael@0: if (*ptr2++ != *ptr1++) michael@0: return 0; michael@0: } michael@0: } michael@0: } michael@0: break; michael@0: default: michael@0: if (MINBPC(enc) == 1 && *ptr1 == *ptr2) michael@0: return 1; michael@0: switch (BYTE_TYPE(enc, ptr2)) { michael@0: case BT_LEAD2: michael@0: case BT_LEAD3: michael@0: case BT_LEAD4: michael@0: case BT_NONASCII: michael@0: case BT_NMSTRT: michael@0: #ifdef XML_NS michael@0: case BT_COLON: michael@0: #endif michael@0: case BT_HEX: michael@0: case BT_DIGIT: michael@0: case BT_NAME: michael@0: case BT_MINUS: michael@0: return 0; michael@0: default: michael@0: return 1; michael@0: } michael@0: } michael@0: } michael@0: /* not reached */ michael@0: } michael@0: michael@0: static int PTRCALL michael@0: PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, michael@0: const char *end1, const char *ptr2) michael@0: { michael@0: for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) { michael@0: if (ptr1 == end1) michael@0: return 0; michael@0: if (!CHAR_MATCHES(enc, ptr1, *ptr2)) michael@0: return 0; michael@0: } michael@0: return ptr1 == end1; michael@0: } michael@0: michael@0: static int PTRFASTCALL michael@0: PREFIX(nameLength)(const ENCODING *enc, const char *ptr) michael@0: { michael@0: const char *start = ptr; michael@0: for (;;) { michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: #define LEAD_CASE(n) \ michael@0: case BT_LEAD ## n: ptr += n; break; michael@0: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) michael@0: #undef LEAD_CASE michael@0: case BT_NONASCII: michael@0: case BT_NMSTRT: michael@0: #ifdef XML_NS michael@0: case BT_COLON: michael@0: #endif michael@0: case BT_HEX: michael@0: case BT_DIGIT: michael@0: case BT_NAME: michael@0: case BT_MINUS: michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: default: michael@0: return (int)(ptr - start); michael@0: } michael@0: } michael@0: } michael@0: michael@0: static const char * PTRFASTCALL michael@0: PREFIX(skipS)(const ENCODING *enc, const char *ptr) michael@0: { michael@0: for (;;) { michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: case BT_LF: michael@0: case BT_CR: michael@0: case BT_S: michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: default: michael@0: return ptr; michael@0: } michael@0: } michael@0: } michael@0: michael@0: static void PTRCALL michael@0: PREFIX(updatePosition)(const ENCODING *enc, michael@0: const char *ptr, michael@0: const char *end, michael@0: POSITION *pos) michael@0: { michael@0: while (ptr != end) { michael@0: switch (BYTE_TYPE(enc, ptr)) { michael@0: #define LEAD_CASE(n) \ michael@0: case BT_LEAD ## n: \ michael@0: ptr += n; \ michael@0: break; michael@0: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) michael@0: #undef LEAD_CASE michael@0: case BT_LF: michael@0: pos->columnNumber = (XML_Size)-1; michael@0: pos->lineNumber++; michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: case BT_CR: michael@0: pos->lineNumber++; michael@0: ptr += MINBPC(enc); michael@0: if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF) michael@0: ptr += MINBPC(enc); michael@0: pos->columnNumber = (XML_Size)-1; michael@0: break; michael@0: default: michael@0: ptr += MINBPC(enc); michael@0: break; michael@0: } michael@0: pos->columnNumber++; michael@0: } michael@0: } michael@0: michael@0: #undef DO_LEAD_CASE michael@0: #undef MULTIBYTE_CASES michael@0: #undef INVALID_CASES michael@0: #undef CHECK_NAME_CASE michael@0: #undef CHECK_NAME_CASES michael@0: #undef CHECK_NMSTRT_CASE michael@0: #undef CHECK_NMSTRT_CASES michael@0: