parser/expat/lib/xmltok_impl.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
     2    See the file COPYING for copying permission.
     3 */
     /* Fallback definition: when nothing has defined IS_INVALID_CHAR before
        this point, the check always evaluates to 0, so the IS_INVALID_CHAR
        branches in the lead-byte cases below are never taken. */
     5 #ifndef IS_INVALID_CHAR
     6 #define IS_INVALID_CHAR(enc, ptr, n) (0)
     7 #endif
     /* Expands to the `case BT_LEAD<n>:` arm of a switch on the byte type,
        for a character that occupies n bytes.  The expansion also uses `enc`
        and `end`, which are NOT macro parameters and must be in scope at the
        expansion site.
        - fewer than n bytes left before end: returns XML_TOK_PARTIAL_CHAR
        - IS_INVALID_CHAR(enc, ptr, n) is true: stores ptr through nextTokPtr
          and returns XML_TOK_INVALID
        - otherwise: advances ptr by n and breaks out of the switch */
     9 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    10     case BT_LEAD ## n: \
    11       if (end - ptr < n) \
    12         return XML_TOK_PARTIAL_CHAR; \
    13       if (IS_INVALID_CHAR(enc, ptr, n)) { \
    14         *(nextTokPtr) = (ptr); \
    15         return XML_TOK_INVALID; \
    16       } \
    17       ptr += n; \
    18       break;
    /* Expands to the switch arms shared by scanners that accept arbitrary
       character data: the 2-, 3- and 4-byte INVALID_LEAD_CASE arms, followed
       by BT_NONXML, BT_MALFORM and BT_TRAIL, which all store ptr through
       nextTokPtr and return XML_TOK_INVALID. */
    20 #define INVALID_CASES(ptr, nextTokPtr) \
    21   INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
    22   INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
    23   INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
    24   case BT_NONXML: \
    25   case BT_MALFORM: \
    26   case BT_TRAIL: \
    27     *(nextTokPtr) = (ptr); \
    28     return XML_TOK_INVALID;
    /* Expands to the `case BT_LEAD<n>:` arm used while scanning the inside of
       a name.
       - fewer than n bytes left before end: returns XML_TOK_PARTIAL_CHAR
       - IS_NAME_CHAR(enc, ptr, n) is false: stores ptr in *nextTokPtr and
         returns XML_TOK_INVALID
       - otherwise: advances ptr by n and breaks out of the switch */
    30 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
    31    case BT_LEAD ## n: \
    32      if (end - ptr < n) \
    33        return XML_TOK_PARTIAL_CHAR; \
    34      if (!IS_NAME_CHAR(enc, ptr, n)) { \
    35        *nextTokPtr = ptr; \
    36        return XML_TOK_INVALID; \
    37      } \
    38      ptr += n; \
    39      break;
    /* Expands to every switch arm that accepts one name character.
       BT_NONASCII is first validated with IS_NAME_CHAR_MINBPC (failure stores
       ptr in *nextTokPtr and returns XML_TOK_INVALID) and then deliberately
       falls through into the BT_NMSTRT / BT_HEX / BT_DIGIT / BT_NAME /
       BT_MINUS arm, which advances ptr by MINBPC(enc) and breaks.  The 2-, 3-
       and 4-byte lead cases are handled by CHECK_NAME_CASE. */
    41 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
    42   case BT_NONASCII: \
    43     if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
    44       *nextTokPtr = ptr; \
    45       return XML_TOK_INVALID; \
    46     } \
    47   case BT_NMSTRT: \
    48   case BT_HEX: \
    49   case BT_DIGIT: \
    50   case BT_NAME: \
    51   case BT_MINUS: \
    52     ptr += MINBPC(enc); \
    53     break; \
    54   CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
    55   CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
    56   CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
    /* Expands to the `case BT_LEAD<n>:` arm used for the FIRST character of a
       name.  Same shape as CHECK_NAME_CASE, but the character is validated
       with IS_NMSTRT_CHAR:
       - fewer than n bytes left before end: returns XML_TOK_PARTIAL_CHAR
       - IS_NMSTRT_CHAR(enc, ptr, n) is false: stores ptr in *nextTokPtr and
         returns XML_TOK_INVALID
       - otherwise: advances ptr by n and breaks out of the switch */
    58 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
    59    case BT_LEAD ## n: \
    60      if (end - ptr < n) \
    61        return XML_TOK_PARTIAL_CHAR; \
    62      if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
    63        *nextTokPtr = ptr; \
    64        return XML_TOK_INVALID; \
    65      } \
    66      ptr += n; \
    67      break;
    /* Expands to every switch arm that accepts one name-start character.
       BT_NONASCII is first validated with IS_NMSTRT_CHAR_MINBPC (failure
       stores ptr in *nextTokPtr and returns XML_TOK_INVALID) and then
       deliberately falls through into the BT_NMSTRT / BT_HEX arm, which
       advances ptr by MINBPC(enc) and breaks.  The 2-, 3- and 4-byte lead
       cases are handled by CHECK_NMSTRT_CASE. */
    69 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
    70   case BT_NONASCII: \
    71     if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
    72       *nextTokPtr = ptr; \
    73       return XML_TOK_INVALID; \
    74     } \
    75   case BT_NMSTRT: \
    76   case BT_HEX: \
    77     ptr += MINBPC(enc); \
    78     break; \
    79   CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
    80   CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
    81   CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
    /* Fallback definition: when nothing has defined PREFIX before this point,
       function names written as PREFIX(name) expand to plain `name`. */
    83 #ifndef PREFIX
    84 #define PREFIX(ident) ident
    85 #endif
    87 /* ptr points to character following "<!-" */
       /* Scans the rest of a comment.  The character at ptr must be the second
          '-' of "<!--"; the body is then scanned for the closing "-->".
          Returns:
          - XML_TOK_COMMENT, with *nextTokPtr just past the closing '>'
          - XML_TOK_INVALID, with *nextTokPtr at the offending character: the
            character at ptr is not '-', an INVALID_CASES byte type occurs in
            the body, or "--" is not immediately followed by '>'
          - XML_TOK_PARTIAL when end is reached first (*nextTokPtr is not
            written on this path)
          - XML_TOK_PARTIAL_CHAR from the lead-byte arms of INVALID_CASES */
    89 static int PTRCALL
    90 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
    91                     const char *end, const char **nextTokPtr)
    92 {
    93   if (ptr != end) {
    94     if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
    95       *nextTokPtr = ptr;
    96       return XML_TOK_INVALID;
    97     }
    98     ptr += MINBPC(enc);
    99     while (ptr != end) {
   100       switch (BYTE_TYPE(enc, ptr)) {
   101       INVALID_CASES(ptr, nextTokPtr)
   102       case BT_MINUS:
               /* step past this '-' and look at what follows it */
   103         if ((ptr += MINBPC(enc)) == end)
   104           return XML_TOK_PARTIAL;
   105         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
                 /* "--" seen: the only acceptable continuation is '>' */
   106           if ((ptr += MINBPC(enc)) == end)
   107             return XML_TOK_PARTIAL;
   108           if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
   109             *nextTokPtr = ptr;
   110             return XML_TOK_INVALID;
   111           }
   112           *nextTokPtr = ptr + MINBPC(enc);
   113           return XML_TOK_COMMENT;
   114         }
               /* lone '-': ptr already sits on the next character, which the
                  next loop iteration classifies */
   115         break;
   116       default:
   117         ptr += MINBPC(enc);
   118         break;
   119       }
   120     }
   121   }
   122   return XML_TOK_PARTIAL;
   123 }
   125 /* ptr points to character following "<!" */
       /* Scans the start of a markup declaration.
          First character:
          - '-' (BT_MINUS): delegates to scanComment, starting at the next
            character
          - '[' (BT_LSQB): returns XML_TOK_COND_SECT_OPEN with *nextTokPtr just
            past the '['
          - BT_NMSTRT / BT_HEX: start of the declaration keyword
          - anything else: XML_TOK_INVALID with *nextTokPtr at that character
          Further BT_NMSTRT / BT_HEX characters are then consumed until
          whitespace or '%' ends the keyword, giving XML_TOK_DECL_OPEN with
          *nextTokPtr at that terminating character.  Any other character is
          XML_TOK_INVALID.  Running out of input returns XML_TOK_PARTIAL
          without writing *nextTokPtr. */
   127 static int PTRCALL
   128 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
   129                  const char *end, const char **nextTokPtr)
   130 {
   131   if (ptr == end)
   132     return XML_TOK_PARTIAL;
   133   switch (BYTE_TYPE(enc, ptr)) {
   134   case BT_MINUS:
   135     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   136   case BT_LSQB:
   137     *nextTokPtr = ptr + MINBPC(enc);
   138     return XML_TOK_COND_SECT_OPEN;
   139   case BT_NMSTRT:
   140   case BT_HEX:
   141     ptr += MINBPC(enc);
   142     break;
   143   default:
   144     *nextTokPtr = ptr;
   145     return XML_TOK_INVALID;
   146   }
   147   while (ptr != end) {
   148     switch (BYTE_TYPE(enc, ptr)) {
   149     case BT_PERCNT:
             /* one character of lookahead is needed to judge the '%' */
   150       if (ptr + MINBPC(enc) == end)
   151         return XML_TOK_PARTIAL;
   152       /* don't allow <!ENTITY% foo "whatever"> */
   153       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
   154       case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
   155         *nextTokPtr = ptr;
   156         return XML_TOK_INVALID;
   157       }
   158       /* fall through */
   159     case BT_S: case BT_CR: case BT_LF:
   160       *nextTokPtr = ptr;
   161       return XML_TOK_DECL_OPEN;
   162     case BT_NMSTRT:
   163     case BT_HEX:
   164       ptr += MINBPC(enc);
   165       break;
   166     default:
   167       *nextTokPtr = ptr;
   168       return XML_TOK_INVALID;
   169     }
   170   }
   171   return XML_TOK_PARTIAL;
   172 }
   /* Classifies the processing-instruction target occupying [ptr, end).
      *tokPtr is always set: XML_TOK_PI by default, XML_TOK_XML_DECL only when
      the target is exactly the three characters "xml" in lower case.
      Returns 1 when the target is acceptable.  Returns 0 when the target
      spells "xml" with at least one upper-case letter (e.g. "XML", "xMl");
      *tokPtr is left as XML_TOK_PI in that case.  The callers visible in this
      file (scanPi) turn a 0 result into XML_TOK_INVALID. */
   174 static int PTRCALL
   175 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
   176                       const char *end, int *tokPtr)
   177 {
   178   int upper = 0;
   179   *tokPtr = XML_TOK_PI;
         /* only a target of exactly three characters can be a spelling of
            "xml" */
   180   if (end - ptr != MINBPC(enc)*3)
   181     return 1;
   182   switch (BYTE_TO_ASCII(enc, ptr)) {
   183   case ASCII_x:
   184     break;
   185   case ASCII_X:
   186     upper = 1;
   187     break;
   188   default:
   189     return 1;
   190   }
   191   ptr += MINBPC(enc);
   192   switch (BYTE_TO_ASCII(enc, ptr)) {
   193   case ASCII_m:
   194     break;
   195   case ASCII_M:
   196     upper = 1;
   197     break;
   198   default:
   199     return 1;
   200   }
   201   ptr += MINBPC(enc);
   202   switch (BYTE_TO_ASCII(enc, ptr)) {
   203   case ASCII_l:
   204     break;
   205   case ASCII_L:
   206     upper = 1;
   207     break;
   208   default:
   209     return 1;
   210   }
         /* all three letters matched case-insensitively; any upper-case letter
            makes the target unacceptable */
   211   if (upper)
   212     return 0;
   213   *tokPtr = XML_TOK_XML_DECL;
   214   return 1;
   215 }
   217 /* ptr points to character following "<?" */
       /* Scans a processing instruction (or XML declaration).
          The target must start with a name-start character and continue with
          name characters; `target` remembers where it began so that
          checkPiTarget can classify it once its end is known.
          - target followed by whitespace: the rest is scanned for "?>"; on
            success returns the token chosen by checkPiTarget (XML_TOK_PI or
            XML_TOK_XML_DECL) with *nextTokPtr just past the '>'
          - target followed directly by '?': the next character must be '>'
          - checkPiTarget returning 0 gives XML_TOK_INVALID with *nextTokPtr at
            the character that ended the target
          - any other malformed input gives XML_TOK_INVALID with *nextTokPtr at
            the offending character
          - running out of input returns XML_TOK_PARTIAL (or
            XML_TOK_PARTIAL_CHAR from a lead-byte arm) */
   219 static int PTRCALL
   220 PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
   221                const char *end, const char **nextTokPtr)
   222 {
   223   int tok;
   224   const char *target = ptr;
   225   if (ptr == end)
   226     return XML_TOK_PARTIAL;
   227   switch (BYTE_TYPE(enc, ptr)) {
   228   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
   229   default:
   230     *nextTokPtr = ptr;
   231     return XML_TOK_INVALID;
   232   }
   233   while (ptr != end) {
   234     switch (BYTE_TYPE(enc, ptr)) {
   235     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
   236     case BT_S: case BT_CR: case BT_LF:
             /* the target is [target, ptr); tok receives the token to return */
   237       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
   238         *nextTokPtr = ptr;
   239         return XML_TOK_INVALID;
   240       }
   241       ptr += MINBPC(enc);
             /* PI data: anything goes until a '?' immediately followed by '>' */
   242       while (ptr != end) {
   243         switch (BYTE_TYPE(enc, ptr)) {
   244         INVALID_CASES(ptr, nextTokPtr)
   245         case BT_QUEST:
   246           ptr += MINBPC(enc);
   247           if (ptr == end)
   248             return XML_TOK_PARTIAL;
   249           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
   250             *nextTokPtr = ptr + MINBPC(enc);
   251             return tok;
   252           }
   253           break;
   254         default:
   255           ptr += MINBPC(enc);
   256           break;
   257         }
   258       }
   259       return XML_TOK_PARTIAL;
   260     case BT_QUEST:
   261       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
   262         *nextTokPtr = ptr;
   263         return XML_TOK_INVALID;
   264       }
   265       ptr += MINBPC(enc);
   266       if (ptr == end)
   267         return XML_TOK_PARTIAL;
   268       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
   269         *nextTokPtr = ptr + MINBPC(enc);
   270         return tok;
   271       }
             /* '?' not followed by '>': ptr now addresses the character after
                the '?', and that is the position reported as invalid */
   272       /* fall through */
   273     default:
   274       *nextTokPtr = ptr;
   275       return XML_TOK_INVALID;
   276     }
   277   }
   278   return XML_TOK_PARTIAL;
   279 }
   /* Matches the six characters "CDATA[" starting at ptr.  The caller visible
      in this file (scanLt) invokes it just past "<![".
      Returns:
      - XML_TOK_PARTIAL when fewer than six characters are available
        (*nextTokPtr is not written)
      - XML_TOK_INVALID with *nextTokPtr at the first mismatching character
      - XML_TOK_CDATA_SECT_OPEN with *nextTokPtr just past the '[' */
   281 static int PTRCALL
   282 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
   283                          const char *end, const char **nextTokPtr)
   284 {
   285   static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
   286                                      ASCII_T, ASCII_A, ASCII_LSQB };
   287   int i;
   288   /* CDATA[ */
   289   if (end - ptr < 6 * MINBPC(enc))
   290     return XML_TOK_PARTIAL;
   291   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
   292     if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
   293       *nextTokPtr = ptr;
   294       return XML_TOK_INVALID;
   295     }
   296   }
   297   *nextTokPtr = ptr;
   298   return XML_TOK_CDATA_SECT_OPEN;
   299 }
   /* Returns the next token from the inside of a CDATA section.
      - XML_TOK_NONE: no input (ptr == end)
      - XML_TOK_CDATA_SECT_CLOSE: input starts with "]]>"; *nextTokPtr is just
        past the '>'
      - XML_TOK_DATA_NEWLINE: input starts with CR, CR LF or LF; *nextTokPtr is
        just past the line ending (CR LF is consumed as one newline)
      - XML_TOK_DATA_CHARS: a run of ordinary characters; *nextTokPtr is at the
        first character not included in the run
      - XML_TOK_INVALID: the first character has an INVALID_CASES byte type
      - XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR: more input is needed to decide
        (*nextTokPtr is not written) */
   301 static int PTRCALL
   302 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
   303                         const char *end, const char **nextTokPtr)
   304 {
   305   if (ptr == end)
   306     return XML_TOK_NONE;
   307   if (MINBPC(enc) > 1) {
           /* Ignore trailing bytes that do not make up a whole MINBPC(enc)-byte
              unit.  Note the masking equals a modulo only when MINBPC(enc) is
              a power of two. */
   308     size_t n = end - ptr;
   309     if (n & (MINBPC(enc) - 1)) {
   310       n &= ~(MINBPC(enc) - 1);
   311       if (n == 0)
   312         return XML_TOK_PARTIAL;
   313       end = ptr + n;
   314     }
   315   }
   316   switch (BYTE_TYPE(enc, ptr)) {
   317   case BT_RSQB:
   318     ptr += MINBPC(enc);
   319     if (ptr == end)
   320       return XML_TOK_PARTIAL;
   321     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
   322       break;
   323     ptr += MINBPC(enc);
   324     if (ptr == end)
   325       return XML_TOK_PARTIAL;
   326     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
             /* "]]" without '>': step back onto the second ']' so that the
                loop below stops there and only the first ']' is returned as
                data */
   327       ptr -= MINBPC(enc);
   328       break;
   329     }
   330     *nextTokPtr = ptr + MINBPC(enc);
   331     return XML_TOK_CDATA_SECT_CLOSE;
   332   case BT_CR:
   333     ptr += MINBPC(enc);
           /* cannot tell yet whether an LF follows */
   334     if (ptr == end)
   335       return XML_TOK_PARTIAL;
   336     if (BYTE_TYPE(enc, ptr) == BT_LF)
   337       ptr += MINBPC(enc);
   338     *nextTokPtr = ptr;
   339     return XML_TOK_DATA_NEWLINE;
   340   case BT_LF:
   341     *nextTokPtr = ptr + MINBPC(enc);
   342     return XML_TOK_DATA_NEWLINE;
   343   INVALID_CASES(ptr, nextTokPtr)
   344   default:
   345     ptr += MINBPC(enc);
   346     break;
   347   }
         /* At least one character has been accepted.  Extend the run until a
            character that must begin its own token, or an incomplete or
            invalid multi-byte sequence; these end the run here instead of
            returning an error. */
   348   while (ptr != end) {
   349     switch (BYTE_TYPE(enc, ptr)) {
   350 #define LEAD_CASE(n) \
   351     case BT_LEAD ## n: \
   352       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
   353         *nextTokPtr = ptr; \
   354         return XML_TOK_DATA_CHARS; \
   355       } \
   356       ptr += n; \
   357       break;
   358     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   359 #undef LEAD_CASE
   360     case BT_NONXML:
   361     case BT_MALFORM:
   362     case BT_TRAIL:
   363     case BT_CR:
   364     case BT_LF:
   365     case BT_RSQB:
   366       *nextTokPtr = ptr;
   367       return XML_TOK_DATA_CHARS;
   368     default:
   369       ptr += MINBPC(enc);
   370       break;
   371     }
   372   }
   373   *nextTokPtr = ptr;
   374   return XML_TOK_DATA_CHARS;
   375 }
   377 /* ptr points to character following "</" */
       /* Scans an end tag: a name, optional trailing whitespace, then '>'.
          Returns:
          - XML_TOK_END_TAG with *nextTokPtr just past the '>'
          - XML_TOK_INVALID with *nextTokPtr at the offending character (bad
            name character, or anything other than whitespace or '>' after the
            name)
          - XML_TOK_PARTIAL (or XML_TOK_PARTIAL_CHAR from a lead-byte arm) when
            the input ends first
          When XML_NS is defined, ':' is accepted anywhere after the first
          character without further checks. */
   379 static int PTRCALL
   380 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
   381                    const char *end, const char **nextTokPtr)
   382 {
   383   if (ptr == end)
   384     return XML_TOK_PARTIAL;
   385   switch (BYTE_TYPE(enc, ptr)) {
   386   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
   387   default:
   388     *nextTokPtr = ptr;
   389     return XML_TOK_INVALID;
   390   }
   391   while (ptr != end) {
   392     switch (BYTE_TYPE(enc, ptr)) {
   393     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
   394     case BT_S: case BT_CR: case BT_LF:
             /* after the name only whitespace may precede the closing '>' */
   395       for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
   396         switch (BYTE_TYPE(enc, ptr)) {
   397         case BT_S: case BT_CR: case BT_LF:
   398           break;
   399         case BT_GT:
   400           *nextTokPtr = ptr + MINBPC(enc);
   401           return XML_TOK_END_TAG;
   402         default:
   403           *nextTokPtr = ptr;
   404           return XML_TOK_INVALID;
   405         }
   406       }
   407       return XML_TOK_PARTIAL;
   408 #ifdef XML_NS
   409     case BT_COLON:
   410       /* no need to check qname syntax here,
   411          since end-tag must match exactly */
   412       ptr += MINBPC(enc);
   413       break;
   414 #endif
   415     case BT_GT:
   416       *nextTokPtr = ptr + MINBPC(enc);
   417       return XML_TOK_END_TAG;
   418     default:
   419       *nextTokPtr = ptr;
   420       return XML_TOK_INVALID;
   421     }
   422   }
   423   return XML_TOK_PARTIAL;
   424 }
   426 /* ptr points to character following "&#x" */
       /* Scans the digits of a hexadecimal character reference.  The only
          caller visible in this file (scanCharRef) dispatches here on a
          lower-case 'x'.
          At least one BT_DIGIT / BT_HEX character is required, followed by
          more of the same and a terminating ';'.
          Returns:
          - XML_TOK_CHAR_REF with *nextTokPtr just past the ';'
          - XML_TOK_INVALID with *nextTokPtr at the first character that is
            neither a hex digit nor the terminating ';'
          - XML_TOK_PARTIAL when the input ends first (*nextTokPtr not
            written) */
   428 static int PTRCALL
   429 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
   430                        const char *end, const char **nextTokPtr)
   431 {
   432   if (ptr != end) {
           /* the first character must be a digit: "&#x;" is rejected here */
   433     switch (BYTE_TYPE(enc, ptr)) {
   434     case BT_DIGIT:
   435     case BT_HEX:
   436       break;
   437     default:
   438       *nextTokPtr = ptr;
   439       return XML_TOK_INVALID;
   440     }
   441     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
   442       switch (BYTE_TYPE(enc, ptr)) {
   443       case BT_DIGIT:
   444       case BT_HEX:
   445         break;
   446       case BT_SEMI:
   447         *nextTokPtr = ptr + MINBPC(enc);
   448         return XML_TOK_CHAR_REF;
   449       default:
   450         *nextTokPtr = ptr;
   451         return XML_TOK_INVALID;
   452       }
   453     }
   454   }
   455   return XML_TOK_PARTIAL;
   456 }
   458 /* ptr points to character following "&#" */
       /* Scans a numeric character reference.  A lower-case 'x' at ptr hands
          the rest over to scanHexCharRef; otherwise at least one BT_DIGIT
          character is required, followed by more digits and a terminating ';'.
          Returns:
          - XML_TOK_CHAR_REF with *nextTokPtr just past the ';'
          - XML_TOK_INVALID with *nextTokPtr at the first character that is
            neither a digit nor the terminating ';'
          - XML_TOK_PARTIAL when the input ends first (*nextTokPtr not
            written) */
   460 static int PTRCALL
   461 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
   462                     const char *end, const char **nextTokPtr)
   463 {
   464   if (ptr != end) {
   465     if (CHAR_MATCHES(enc, ptr, ASCII_x))
   466       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
           /* decimal form: the first character must be a digit */
   467     switch (BYTE_TYPE(enc, ptr)) {
   468     case BT_DIGIT:
   469       break;
   470     default:
   471       *nextTokPtr = ptr;
   472       return XML_TOK_INVALID;
   473     }
   474     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
   475       switch (BYTE_TYPE(enc, ptr)) {
   476       case BT_DIGIT:
   477         break;
   478       case BT_SEMI:
   479         *nextTokPtr = ptr + MINBPC(enc);
   480         return XML_TOK_CHAR_REF;
   481       default:
   482         *nextTokPtr = ptr;
   483         return XML_TOK_INVALID;
   484       }
   485     }
   486   }
   487   return XML_TOK_PARTIAL;
   488 }
   490 /* ptr points to character following "&" */
       /* Scans a reference.
          - '#' (BT_NUM) at ptr: delegates to scanCharRef, starting at the next
            character
          - a name-start character: scans name characters up to ';' and returns
            XML_TOK_ENTITY_REF with *nextTokPtr just past the ';'
          - anything else, or a character inside the name that is neither a
            name character nor ';': XML_TOK_INVALID with *nextTokPtr at that
            character
          - input ends first: XML_TOK_PARTIAL (or XML_TOK_PARTIAL_CHAR from a
            lead-byte arm) */
   492 static int PTRCALL
   493 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
   494                 const char **nextTokPtr)
   495 {
   496   if (ptr == end)
   497     return XML_TOK_PARTIAL;
   498   switch (BYTE_TYPE(enc, ptr)) {
   499   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
   500   case BT_NUM:
   501     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   502   default:
   503     *nextTokPtr = ptr;
   504     return XML_TOK_INVALID;
   505   }
   506   while (ptr != end) {
   507     switch (BYTE_TYPE(enc, ptr)) {
   508     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
   509     case BT_SEMI:
   510       *nextTokPtr = ptr + MINBPC(enc);
   511       return XML_TOK_ENTITY_REF;
   512     default:
   513       *nextTokPtr = ptr;
   514       return XML_TOK_INVALID;
   515     }
   516   }
   517   return XML_TOK_PARTIAL;
   518 }
   520 /* ptr points to character following first character of attribute name */
       /* Scans the attribute list of a start tag through to the end of the tag.
          Each pass of the outer loop consumes one more character of the
          current attribute name; once '=' is reached (optionally after
          whitespace) the quoted value is scanned and the character after the
          closing quote decides whether the tag ends or another attribute
          follows.
          Returns:
          - XML_TOK_START_TAG_WITH_ATTS with *nextTokPtr just past '>'
          - XML_TOK_EMPTY_ELEMENT_WITH_ATTS with *nextTokPtr just past "/>"
          - XML_TOK_INVALID with *nextTokPtr at the offending character
          - XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the input ends first
          - any result <= 0 produced by scanRef for a reference inside an
            attribute value
          When XML_NS is defined, at most one ':' is accepted per attribute
          name and it must be followed by a name-start character. */
   522 static int PTRCALL
   523 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
   524                  const char **nextTokPtr)
   525 {
   526 #ifdef XML_NS
   527   int hadColon = 0;
   528 #endif
   529   while (ptr != end) {
   530     switch (BYTE_TYPE(enc, ptr)) {
   531     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
   532 #ifdef XML_NS
   533     case BT_COLON:
   534       if (hadColon) {
   535         *nextTokPtr = ptr;
   536         return XML_TOK_INVALID;
   537       }
   538       hadColon = 1;
   539       ptr += MINBPC(enc);
   540       if (ptr == end)
   541         return XML_TOK_PARTIAL;
   542       switch (BYTE_TYPE(enc, ptr)) {
   543       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
   544       default:
   545         *nextTokPtr = ptr;
   546         return XML_TOK_INVALID;
   547       }
   548       break;
   549 #endif
   550     case BT_S: case BT_CR: case BT_LF:
             /* whitespace after the name: skip it; only '=' may follow */
   551       for (;;) {
   552         int t;
   554         ptr += MINBPC(enc);
   555         if (ptr == end)
   556           return XML_TOK_PARTIAL;
   557         t = BYTE_TYPE(enc, ptr);
   558         if (t == BT_EQUALS)
   559           break;
   560         switch (t) {
   561         case BT_S:
   562         case BT_LF:
   563         case BT_CR:
   564           break;
   565         default:
   566           *nextTokPtr = ptr;
   567           return XML_TOK_INVALID;
   568         }
   569       }
   570     /* fall through */
   571     case BT_EQUALS:
   572       {
               /* byte type of the opening quote (BT_QUOT or BT_APOS); the
                  value ends at the next character of that same type */
   573         int open;
   574 #ifdef XML_NS
   575         hadColon = 0;
   576 #endif
               /* skip whitespace between '=' and the opening quote */
   577         for (;;) {
   578           ptr += MINBPC(enc);
   579           if (ptr == end)
   580             return XML_TOK_PARTIAL;
   581           open = BYTE_TYPE(enc, ptr);
   582           if (open == BT_QUOT || open == BT_APOS)
   583             break;
   584           switch (open) {
   585           case BT_S:
   586           case BT_LF:
   587           case BT_CR:
   588             break;
   589           default:
   590             *nextTokPtr = ptr;
   591             return XML_TOK_INVALID;
   592           }
   593         }
   594         ptr += MINBPC(enc);
   595         /* in attribute value */
   596         for (;;) {
   597           int t;
   598           if (ptr == end)
   599             return XML_TOK_PARTIAL;
   600           t = BYTE_TYPE(enc, ptr);
   601           if (t == open)
   602             break;
   603           switch (t) {
   604           INVALID_CASES(ptr, nextTokPtr)
   605           case BT_AMP:
   606             {
                     /* scanRef writes its end position straight into ptr, so
                        a successful reference is skipped as a whole */
   607               int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
   608               if (tok <= 0) {
   609                 if (tok == XML_TOK_INVALID)
   610                   *nextTokPtr = ptr;
   611                 return tok;
   612               }
   613               break;
   614             }
   615           case BT_LT:
   616             *nextTokPtr = ptr;
   617             return XML_TOK_INVALID;
   618           default:
   619             ptr += MINBPC(enc);
   620             break;
   621           }
   622         }
               /* step past the closing quote; whitespace, '/' or '>' must
                  follow it */
   623         ptr += MINBPC(enc);
   624         if (ptr == end)
   625           return XML_TOK_PARTIAL;
   626         switch (BYTE_TYPE(enc, ptr)) {
   627         case BT_S:
   628         case BT_CR:
   629         case BT_LF:
   630           break;
   631         case BT_SOL:
   632           goto sol;
   633         case BT_GT:
   634           goto gt;
   635         default:
   636           *nextTokPtr = ptr;
   637           return XML_TOK_INVALID;
   638         }
   639         /* ptr points to the whitespace character that follows the closing quote */
   640         for (;;) {
   641           ptr += MINBPC(enc);
   642           if (ptr == end)
   643             return XML_TOK_PARTIAL;
   644           switch (BYTE_TYPE(enc, ptr)) {
   645           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
   646           case BT_S: case BT_CR: case BT_LF:
   647             continue;
   648           case BT_GT:
   649           gt:
   650             *nextTokPtr = ptr + MINBPC(enc);
   651             return XML_TOK_START_TAG_WITH_ATTS;
   652           case BT_SOL:
   653           sol:
   654             ptr += MINBPC(enc);
   655             if (ptr == end)
   656               return XML_TOK_PARTIAL;
   657             if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
   658               *nextTokPtr = ptr;
   659               return XML_TOK_INVALID;
   660             }
   661             *nextTokPtr = ptr + MINBPC(enc);
   662             return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
   663           default:
   664             *nextTokPtr = ptr;
   665             return XML_TOK_INVALID;
   666           }
                 /* reached only via the break in CHECK_NMSTRT_CASES: the first
                    character of the next attribute name has been consumed, so
                    leave this loop and resume the outer name loop */
   667           break;
   668         }
   669         break;
   670       }
   671     default:
   672       *nextTokPtr = ptr;
   673       return XML_TOK_INVALID;
   674     }
   675   }
   676   return XML_TOK_PARTIAL;
   677 }
   679 /* ptr points to character following "<" */
       /* Scans markup that begins with '<' in content.
          Dispatch on the first character:
          - '!' then '-': scanComment; '!' then '[': scanCdataSection; '!'
            followed by anything else: XML_TOK_INVALID at that character
          - '?': scanPi
          - '/': scanEndTag
          - a name-start character: a start tag, scanned here
          - anything else: XML_TOK_INVALID at that character
          Start-tag results:
          - XML_TOK_START_TAG_NO_ATTS with *nextTokPtr just past '>'
          - XML_TOK_EMPTY_ELEMENT_NO_ATTS with *nextTokPtr just past "/>"
          - the result of scanAtts once an attribute name has begun
          - XML_TOK_INVALID with *nextTokPtr at the offending character
          - XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the input ends first
          When XML_NS is defined, at most one ':' is accepted in the element
          name and it must be followed by a name-start character. */
   681 static int PTRCALL
   682 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
   683                const char **nextTokPtr)
   684 {
   685 #ifdef XML_NS
   686   int hadColon;
   687 #endif
   688   if (ptr == end)
   689     return XML_TOK_PARTIAL;
   690   switch (BYTE_TYPE(enc, ptr)) {
   691   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
   692   case BT_EXCL:
   693     if ((ptr += MINBPC(enc)) == end)
   694       return XML_TOK_PARTIAL;
   695     switch (BYTE_TYPE(enc, ptr)) {
   696     case BT_MINUS:
   697       return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   698     case BT_LSQB:
   699       return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
   700                                       end, nextTokPtr);
   701     }
   702     *nextTokPtr = ptr;
   703     return XML_TOK_INVALID;
   704   case BT_QUEST:
   705     return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   706   case BT_SOL:
   707     return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   708   default:
   709     *nextTokPtr = ptr;
   710     return XML_TOK_INVALID;
   711   }
   712 #ifdef XML_NS
   713   hadColon = 0;
   714 #endif
   715   /* we have a start-tag */
   716   while (ptr != end) {
   717     switch (BYTE_TYPE(enc, ptr)) {
   718     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
   719 #ifdef XML_NS
   720     case BT_COLON:
   721       if (hadColon) {
   722         *nextTokPtr = ptr;
   723         return XML_TOK_INVALID;
   724       }
   725       hadColon = 1;
   726       ptr += MINBPC(enc);
   727       if (ptr == end)
   728         return XML_TOK_PARTIAL;
   729       switch (BYTE_TYPE(enc, ptr)) {
   730       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
   731       default:
   732         *nextTokPtr = ptr;
   733         return XML_TOK_INVALID;
   734       }
   735       break;
   736 #endif
   737     case BT_S: case BT_CR: case BT_LF:
   738       {
               /* whitespace after the element name: skip it, then expect '>',
                  '/' or the first character of an attribute name */
   739         ptr += MINBPC(enc);
   740         while (ptr != end) {
   741           switch (BYTE_TYPE(enc, ptr)) {
   742           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
   743           case BT_GT:
   744             goto gt;
   745           case BT_SOL:
   746             goto sol;
   747           case BT_S: case BT_CR: case BT_LF:
   748             ptr += MINBPC(enc);
   749             continue;
   750           default:
   751             *nextTokPtr = ptr;
   752             return XML_TOK_INVALID;
   753           }
                 /* reached only via the break in CHECK_NMSTRT_CASES: ptr is
                    now just past the first character of an attribute name */
   754           return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
   755         }
   756         return XML_TOK_PARTIAL;
   757       }
   758     case BT_GT:
   759     gt:
   760       *nextTokPtr = ptr + MINBPC(enc);
   761       return XML_TOK_START_TAG_NO_ATTS;
   762     case BT_SOL:
   763     sol:
   764       ptr += MINBPC(enc);
   765       if (ptr == end)
   766         return XML_TOK_PARTIAL;
   767       if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
   768         *nextTokPtr = ptr;
   769         return XML_TOK_INVALID;
   770       }
   771       *nextTokPtr = ptr + MINBPC(enc);
   772       return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
   773     default:
   774       *nextTokPtr = ptr;
   775       return XML_TOK_INVALID;
   776     }
   777   }
   778   return XML_TOK_PARTIAL;
   779 }
   /* Returns the next token from element content.
      - XML_TOK_NONE: no input (ptr == end)
      - '<' and '&' at ptr hand over to scanLt and scanRef respectively
      - XML_TOK_DATA_NEWLINE: input starts with CR LF, CR or LF; a CR that is
        the last available character gives XML_TOK_TRAILING_CR instead
      - XML_TOK_TRAILING_RSQB: the input is "]" or "]]" and then ends
      - XML_TOK_INVALID: "]]>" appears in character data (*nextTokPtr at the
        '>'), or the first character has an INVALID_CASES byte type
      - XML_TOK_DATA_CHARS: a run of ordinary characters; *nextTokPtr is at the
        first character not included in the run
      - XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR: more input is needed */
   781 static int PTRCALL
   782 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
   783                    const char **nextTokPtr)
   784 {
   785   if (ptr == end)
   786     return XML_TOK_NONE;
   787   if (MINBPC(enc) > 1) {
           /* Ignore trailing bytes that do not make up a whole MINBPC(enc)-byte
              unit.  Note the masking equals a modulo only when MINBPC(enc) is
              a power of two. */
   788     size_t n = end - ptr;
   789     if (n & (MINBPC(enc) - 1)) {
   790       n &= ~(MINBPC(enc) - 1);
   791       if (n == 0)
   792         return XML_TOK_PARTIAL;
   793       end = ptr + n;
   794     }
   795   }
   796   switch (BYTE_TYPE(enc, ptr)) {
   797   case BT_LT:
   798     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   799   case BT_AMP:
   800     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   801   case BT_CR:
   802     ptr += MINBPC(enc);
   803     if (ptr == end)
   804       return XML_TOK_TRAILING_CR;
   805     if (BYTE_TYPE(enc, ptr) == BT_LF)
   806       ptr += MINBPC(enc);
   807     *nextTokPtr = ptr;
   808     return XML_TOK_DATA_NEWLINE;
   809   case BT_LF:
   810     *nextTokPtr = ptr + MINBPC(enc);
   811     return XML_TOK_DATA_NEWLINE;
   812   case BT_RSQB:
   813     ptr += MINBPC(enc);
   814     if (ptr == end)
   815       return XML_TOK_TRAILING_RSQB;
   816     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
   817       break;
   818     ptr += MINBPC(enc);
   819     if (ptr == end)
   820       return XML_TOK_TRAILING_RSQB;
   821     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
             /* "]]" without '>': step back onto the second ']' and let the
                loop below re-examine it */
   822       ptr -= MINBPC(enc);
   823       break;
   824     }
   825     *nextTokPtr = ptr;
   826     return XML_TOK_INVALID;
   827   INVALID_CASES(ptr, nextTokPtr)
   828   default:
   829     ptr += MINBPC(enc);
   830     break;
   831   }
         /* At least one character has been accepted.  Extend the run until a
            character that must begin its own token, or an incomplete or
            invalid multi-byte sequence; these end the run here instead of
            returning an error. */
   832   while (ptr != end) {
   833     switch (BYTE_TYPE(enc, ptr)) {
   834 #define LEAD_CASE(n) \
   835     case BT_LEAD ## n: \
   836       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
   837         *nextTokPtr = ptr; \
   838         return XML_TOK_DATA_CHARS; \
   839       } \
   840       ptr += n; \
   841       break;
   842     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   843 #undef LEAD_CASE
   844     case BT_RSQB:
             /* A ']' joins the run only when the lookahead shows it cannot
                start "]]>".  If the input ends before that can be decided,
                the run stops in front of the ']' (fall through below). */
   845       if (ptr + MINBPC(enc) != end) {
   846          if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
   847            ptr += MINBPC(enc);
   848            break;
   849          }
   850          if (ptr + 2*MINBPC(enc) != end) {
   851            if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
   852              ptr += MINBPC(enc);
   853              break;
   854            }
   855            *nextTokPtr = ptr + 2*MINBPC(enc);
   856            return XML_TOK_INVALID;
   857          }
   858       }
   859       /* fall through */
   860     case BT_AMP:
   861     case BT_LT:
   862     case BT_NONXML:
   863     case BT_MALFORM:
   864     case BT_TRAIL:
   865     case BT_CR:
   866     case BT_LF:
   867       *nextTokPtr = ptr;
   868       return XML_TOK_DATA_CHARS;
   869     default:
   870       ptr += MINBPC(enc);
   871       break;
   872     }
   873   }
   874   *nextTokPtr = ptr;
   875   return XML_TOK_DATA_CHARS;
   876 }
   878 /* ptr points to character following "%" */
       /* Scans what follows a '%'.
          - input ends right after the '%': returns -XML_TOK_PERCENT
            (*nextTokPtr not written)
          - whitespace or another '%' follows: XML_TOK_PERCENT with *nextTokPtr
            at that following character (it is not consumed)
          - a name follows: scanned up to ';', returning
            XML_TOK_PARAM_ENTITY_REF with *nextTokPtr just past the ';'
          - anything else: XML_TOK_INVALID with *nextTokPtr at the offending
            character
          - input ends inside the name: XML_TOK_PARTIAL (or
            XML_TOK_PARTIAL_CHAR from a lead-byte arm) */
   880 static int PTRCALL
   881 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
   882                     const char **nextTokPtr)
   883 {
   884   if (ptr == end)
   885     return -XML_TOK_PERCENT;
   886   switch (BYTE_TYPE(enc, ptr)) {
   887   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
   888   case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
   889     *nextTokPtr = ptr;
   890     return XML_TOK_PERCENT;
   891   default:
   892     *nextTokPtr = ptr;
   893     return XML_TOK_INVALID;
   894   }
   895   while (ptr != end) {
   896     switch (BYTE_TYPE(enc, ptr)) {
   897     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
   898     case BT_SEMI:
   899       *nextTokPtr = ptr + MINBPC(enc);
   900       return XML_TOK_PARAM_ENTITY_REF;
   901     default:
   902       *nextTokPtr = ptr;
   903       return XML_TOK_INVALID;
   904     }
   905   }
   906   return XML_TOK_PARTIAL;
   907 }
   /* Scans a name starting at ptr; judging by the function name and the
      XML_TOK_POUND_NAME token, the caller has presumably just consumed a '#'
      (the call site is outside this view - TODO confirm).
      Returns:
      - XML_TOK_POUND_NAME with *nextTokPtr at the terminating character when
        the name is followed by whitespace, ')', '>', '%' or '|'
      - -XML_TOK_POUND_NAME when the input ends inside the name (*nextTokPtr
        not written)
      - XML_TOK_PARTIAL when there is no input at all
      - XML_TOK_PARTIAL_CHAR from a lead-byte arm
      - XML_TOK_INVALID with *nextTokPtr at the offending character
        otherwise */
   909 static int PTRCALL
   910 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
   911                       const char **nextTokPtr)
   912 {
   913   if (ptr == end)
   914     return XML_TOK_PARTIAL;
   915   switch (BYTE_TYPE(enc, ptr)) {
   916   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
   917   default:
   918     *nextTokPtr = ptr;
   919     return XML_TOK_INVALID;
   920   }
   921   while (ptr != end) {
   922     switch (BYTE_TYPE(enc, ptr)) {
   923     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
   924     case BT_CR: case BT_LF: case BT_S:
   925     case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
   926       *nextTokPtr = ptr;
   927       return XML_TOK_POUND_NAME;
   928     default:
   929       *nextTokPtr = ptr;
   930       return XML_TOK_INVALID;
   931     }
   932   }
   933   return -XML_TOK_POUND_NAME;
   934 }
   /* Scans a quoted literal whose opening quote has already been consumed.
      `open` is the byte type of that quote; the callers visible in this file
      pass BT_QUOT or BT_APOS.  A quote of the other kind is ordinary content.
      Returns:
      - XML_TOK_LITERAL with *nextTokPtr just past the closing quote, when the
        character after it is whitespace, '>', '%' or '['
      - XML_TOK_INVALID with *nextTokPtr just past the closing quote, when any
        other character follows it
      - -XML_TOK_LITERAL when the input ends immediately after the closing
        quote (*nextTokPtr not written)
      - XML_TOK_INVALID / XML_TOK_PARTIAL_CHAR from INVALID_CASES inside the
        literal
      - XML_TOK_PARTIAL when the input ends before the closing quote */
   936 static int PTRCALL
   937 PREFIX(scanLit)(int open, const ENCODING *enc,
   938                 const char *ptr, const char *end,
   939                 const char **nextTokPtr)
   940 {
   941   while (ptr != end) {
   942     int t = BYTE_TYPE(enc, ptr);
   943     switch (t) {
   944     INVALID_CASES(ptr, nextTokPtr)
   945     case BT_QUOT:
   946     case BT_APOS:
   947       ptr += MINBPC(enc);
             /* the other kind of quote is just part of the literal */
   948       if (t != open)
   949         break;
   950       if (ptr == end)
   951         return -XML_TOK_LITERAL;
   952       *nextTokPtr = ptr;
   953       switch (BYTE_TYPE(enc, ptr)) {
   954       case BT_S: case BT_CR: case BT_LF:
   955       case BT_GT: case BT_PERCNT: case BT_LSQB:
   956         return XML_TOK_LITERAL;
   957       default:
   958         return XML_TOK_INVALID;
   959       }
   960     default:
   961       ptr += MINBPC(enc);
   962       break;
   963     }
   964   }
   965   return XML_TOK_PARTIAL;
   966 }
   /* Scans one token starting at `ptr` and classifies it from the byte type
      of its first character.  On most paths *nextTokPtr receives the position
      where scanning stopped.

      - Returns XML_TOK_NONE when ptr == end.
      - When MINBPC(enc) > 1, `end` is first rounded down to a whole number of
        units; XML_TOK_PARTIAL is returned if not even one unit is available.
      - BT_QUOT / BT_APOS are handed to scanLit, BT_PERCNT to scanPercent,
        BT_NUM to scanPoundName, and BT_LT followed by BT_EXCL / BT_QUEST to
        scanDecl / scanPi.
      - A run of BT_S / BT_LF / BT_CR characters yields XML_TOK_PROLOG_S.
      - Single punctuation characters yield their own token codes.
      - Anything that starts with a name-start or name character is scanned to
        its end and returned as XML_TOK_NAME, XML_TOK_NMTOKEN,
        XML_TOK_PREFIXED_NAME (XML_NS builds only) or one of the
        XML_TOK_NAME_PLUS / _ASTERISK / _QUESTION tokens.

      A negated token code (-XML_TOK_PROLOG_S, -XML_TOK_CLOSE_BRACKET,
      -XML_TOK_CLOSE_PAREN, -tok) is returned when `end` is reached at a point
      where further input could still change or extend the token.  */
   968 static int PTRCALL
   969 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
   970                   const char **nextTokPtr)
   971 {
   972   int tok;
   973   if (ptr == end)
   974     return XML_TOK_NONE;
         /* Ignore a trailing fragment shorter than one unit.  The masking
            rounds correctly only when MINBPC(enc) is a power of two. */
   975   if (MINBPC(enc) > 1) {
   976     size_t n = end - ptr;
   977     if (n & (MINBPC(enc) - 1)) {
   978       n &= ~(MINBPC(enc) - 1);
   979       if (n == 0)
   980         return XML_TOK_PARTIAL;
   981       end = ptr + n;
   982     }
   983   }
   984   switch (BYTE_TYPE(enc, ptr)) {
   985   case BT_QUOT:
   986     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
   987   case BT_APOS:
   988     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
   989   case BT_LT:
   990     {
   991       ptr += MINBPC(enc);
   992       if (ptr == end)
   993         return XML_TOK_PARTIAL;
   994       switch (BYTE_TYPE(enc, ptr)) {
   995       case BT_EXCL:
   996         return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   997       case BT_QUEST:
   998         return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   999       case BT_NMSTRT:
  1000       case BT_HEX:
  1001       case BT_NONASCII:
  1002       case BT_LEAD2:
  1003       case BT_LEAD3:
  1004       case BT_LEAD4:
               /* ptr was advanced past the BT_LT above, so this reports the
                  position of the BT_LT itself: nothing is consumed. */
  1005         *nextTokPtr = ptr - MINBPC(enc);
  1006         return XML_TOK_INSTANCE_START;
             /* Any other character after BT_LT is rejected. */
  1008       *nextTokPtr = ptr;
  1009       return XML_TOK_INVALID;
  1011   case BT_CR:
  1012     if (ptr + MINBPC(enc) == end) {
  1013       *nextTokPtr = end;
  1014       /* indicate that this might be part of a CR/LF pair */
  1015       return -XML_TOK_PROLOG_S;
  1017     /* fall through */
  1018   case BT_S: case BT_LF:
  1019     for (;;) {
  1020       ptr += MINBPC(enc);
  1021       if (ptr == end)
  1022         break;
  1023       switch (BYTE_TYPE(enc, ptr)) {
  1024       case BT_S: case BT_LF:
  1025         break;
  1026       case BT_CR:
  1027         /* don't split CR/LF pair */
  1028         if (ptr + MINBPC(enc) != end)
  1029           break;
  1030         /* fall through */
  1031       default:
  1032         *nextTokPtr = ptr;
  1033         return XML_TOK_PROLOG_S;
  1036     *nextTokPtr = ptr;
  1037     return XML_TOK_PROLOG_S;
  1038   case BT_PERCNT:
  1039     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1040   case BT_COMMA:
  1041     *nextTokPtr = ptr + MINBPC(enc);
  1042     return XML_TOK_COMMA;
  1043   case BT_LSQB:
  1044     *nextTokPtr = ptr + MINBPC(enc);
  1045     return XML_TOK_OPEN_BRACKET;
         /* BT_RSQB on its own is XML_TOK_CLOSE_BRACKET; followed by ASCII_RSQB
            and ASCII_GT it is XML_TOK_COND_SECT_CLOSE.  If the input stops
            after the second ASCII_RSQB the answer is still open, hence
            XML_TOK_PARTIAL. */
  1046   case BT_RSQB:
  1047     ptr += MINBPC(enc);
  1048     if (ptr == end)
  1049       return -XML_TOK_CLOSE_BRACKET;
  1050     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
  1051       if (ptr + MINBPC(enc) == end)
  1052         return XML_TOK_PARTIAL;
  1053       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
  1054         *nextTokPtr = ptr + 2*MINBPC(enc);
  1055         return XML_TOK_COND_SECT_CLOSE;
  1058     *nextTokPtr = ptr;
  1059     return XML_TOK_CLOSE_BRACKET;
  1060   case BT_LPAR:
  1061     *nextTokPtr = ptr + MINBPC(enc);
  1062     return XML_TOK_OPEN_PAREN;
         /* BT_RPAR: a directly following BT_AST, BT_QUEST or BT_PLUS is
            consumed as part of the token.  The characters in the last case
            group end the token without being consumed; any other character
            is invalid. */
  1063   case BT_RPAR:
  1064     ptr += MINBPC(enc);
  1065     if (ptr == end)
  1066       return -XML_TOK_CLOSE_PAREN;
  1067     switch (BYTE_TYPE(enc, ptr)) {
  1068     case BT_AST:
  1069       *nextTokPtr = ptr + MINBPC(enc);
  1070       return XML_TOK_CLOSE_PAREN_ASTERISK;
  1071     case BT_QUEST:
  1072       *nextTokPtr = ptr + MINBPC(enc);
  1073       return XML_TOK_CLOSE_PAREN_QUESTION;
  1074     case BT_PLUS:
  1075       *nextTokPtr = ptr + MINBPC(enc);
  1076       return XML_TOK_CLOSE_PAREN_PLUS;
  1077     case BT_CR: case BT_LF: case BT_S:
  1078     case BT_GT: case BT_COMMA: case BT_VERBAR:
  1079     case BT_RPAR:
  1080       *nextTokPtr = ptr;
  1081       return XML_TOK_CLOSE_PAREN;
  1083     *nextTokPtr = ptr;
  1084     return XML_TOK_INVALID;
  1085   case BT_VERBAR:
  1086     *nextTokPtr = ptr + MINBPC(enc);
  1087     return XML_TOK_OR;
  1088   case BT_GT:
  1089     *nextTokPtr = ptr + MINBPC(enc);
  1090     return XML_TOK_DECL_CLOSE;
  1091   case BT_NUM:
  1092     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
         /* Multi-byte lead characters: the whole character is classified as a
            name start (XML_TOK_NAME) or as a name character (XML_TOK_NMTOKEN);
            otherwise it is rejected.  A character cut off by `end` gives
            XML_TOK_PARTIAL_CHAR. */
  1093 #define LEAD_CASE(n) \
  1094   case BT_LEAD ## n: \
  1095     if (end - ptr < n) \
  1096       return XML_TOK_PARTIAL_CHAR; \
  1097     if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
  1098       ptr += n; \
  1099       tok = XML_TOK_NAME; \
  1100       break; \
  1101     } \
  1102     if (IS_NAME_CHAR(enc, ptr, n)) { \
  1103       ptr += n; \
  1104       tok = XML_TOK_NMTOKEN; \
  1105       break; \
  1106     } \
  1107     *nextTokPtr = ptr; \
  1108     return XML_TOK_INVALID;
  1109     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1110 #undef LEAD_CASE
  1111   case BT_NMSTRT:
  1112   case BT_HEX:
  1113     tok = XML_TOK_NAME;
  1114     ptr += MINBPC(enc);
  1115     break;
  1116   case BT_DIGIT:
  1117   case BT_NAME:
  1118   case BT_MINUS:
  1119 #ifdef XML_NS
  1120   case BT_COLON:
  1121 #endif
  1122     tok = XML_TOK_NMTOKEN;
  1123     ptr += MINBPC(enc);
  1124     break;
  1125   case BT_NONASCII:
  1126     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
  1127       ptr += MINBPC(enc);
  1128       tok = XML_TOK_NAME;
  1129       break;
  1131     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
  1132       ptr += MINBPC(enc);
  1133       tok = XML_TOK_NMTOKEN;
  1134       break;
  1136     /* fall through */
  1137   default:
  1138     *nextTokPtr = ptr;
  1139     return XML_TOK_INVALID;
         /* The first character has been consumed and `tok` holds its
            provisional classification; scan the rest of the name. */
  1141   while (ptr != end) {
  1142     switch (BYTE_TYPE(enc, ptr)) {
  1143     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  1144     case BT_GT: case BT_RPAR: case BT_COMMA:
  1145     case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
  1146     case BT_S: case BT_CR: case BT_LF:
  1147       *nextTokPtr = ptr;
  1148       return tok;
  1149 #ifdef XML_NS
           /* First colon after a plain name: the token becomes
              XML_TOK_PREFIXED_NAME if the next character is handled by
              CHECK_NAME_CASES (defined earlier in this file), otherwise
              XML_TOK_NMTOKEN.  A colon inside an already prefixed name also
              turns the token into XML_TOK_NMTOKEN. */
  1150     case BT_COLON:
  1151       ptr += MINBPC(enc);
  1152       switch (tok) {
  1153       case XML_TOK_NAME:
  1154         if (ptr == end)
  1155           return XML_TOK_PARTIAL;
  1156         tok = XML_TOK_PREFIXED_NAME;
  1157         switch (BYTE_TYPE(enc, ptr)) {
  1158         CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  1159         default:
  1160           tok = XML_TOK_NMTOKEN;
  1161           break;
  1163         break;
  1164       case XML_TOK_PREFIXED_NAME:
  1165         tok = XML_TOK_NMTOKEN;
  1166         break;
  1168       break;
  1169 #endif
           /* BT_PLUS, BT_AST and BT_QUEST end the token and are consumed with
              it, but are rejected when the token so far is an
              XML_TOK_NMTOKEN. */
  1170     case BT_PLUS:
  1171       if (tok == XML_TOK_NMTOKEN)  {
  1172         *nextTokPtr = ptr;
  1173         return XML_TOK_INVALID;
  1175       *nextTokPtr = ptr + MINBPC(enc);
  1176       return XML_TOK_NAME_PLUS;
  1177     case BT_AST:
  1178       if (tok == XML_TOK_NMTOKEN)  {
  1179         *nextTokPtr = ptr;
  1180         return XML_TOK_INVALID;
  1182       *nextTokPtr = ptr + MINBPC(enc);
  1183       return XML_TOK_NAME_ASTERISK;
  1184     case BT_QUEST:
  1185       if (tok == XML_TOK_NMTOKEN)  {
  1186         *nextTokPtr = ptr;
  1187         return XML_TOK_INVALID;
  1189       *nextTokPtr = ptr + MINBPC(enc);
  1190       return XML_TOK_NAME_QUESTION;
  1191     default:
  1192       *nextTokPtr = ptr;
  1193       return XML_TOK_INVALID;
         /* `end` was reached while still inside the name: return the negated
            token code; *nextTokPtr is not written on this path. */
  1196   return -tok;
  /* Returns the next piece of an attribute value from [ptr, end).

     Characters that need special treatment are tokenized on their own only
     when they are the first character.  If one is met later, the ordinary
     characters before it are returned as XML_TOK_DATA_CHARS with *nextTokPtr
     pointing at the special character.

     At the first character:
       BT_AMP  -> whatever scanRef returns
       BT_LF   -> XML_TOK_DATA_NEWLINE
       BT_CR   -> XML_TOK_DATA_NEWLINE (a directly following BT_LF is consumed
                  as well), or XML_TOK_TRAILING_CR when the BT_CR is the last
                  character, in which case *nextTokPtr is not written
       BT_S    -> XML_TOK_ATTRIBUTE_VALUE_S
     BT_LT is rejected with XML_TOK_INVALID wherever it occurs.
     Returns XML_TOK_NONE when ptr == end, and XML_TOK_DATA_CHARS up to `end`
     when no special character is found.  */
  1199 static int PTRCALL
  1200 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
  1201                           const char *end, const char **nextTokPtr)
  1203   const char *start;
  1204   if (ptr == end)
  1205     return XML_TOK_NONE;
  1206   start = ptr;
  1207   while (ptr != end) {
  1208     switch (BYTE_TYPE(enc, ptr)) {
           /* NOTE(review): lead bytes are skipped without checking `end` or
              the validity of the character; presumably the input has already
              been validated by an earlier pass -- confirm. */
  1209 #define LEAD_CASE(n) \
  1210     case BT_LEAD ## n: ptr += n; break;
  1211     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1212 #undef LEAD_CASE
  1213     case BT_AMP:
  1214       if (ptr == start)
  1215         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1216       *nextTokPtr = ptr;
  1217       return XML_TOK_DATA_CHARS;
  1218     case BT_LT:
  1219       /* this is for inside entity references */
  1220       *nextTokPtr = ptr;
  1221       return XML_TOK_INVALID;
  1222     case BT_LF:
  1223       if (ptr == start) {
  1224         *nextTokPtr = ptr + MINBPC(enc);
  1225         return XML_TOK_DATA_NEWLINE;
  1227       *nextTokPtr = ptr;
  1228       return XML_TOK_DATA_CHARS;
  1229     case BT_CR:
  1230       if (ptr == start) {
  1231         ptr += MINBPC(enc);
  1232         if (ptr == end)
  1233           return XML_TOK_TRAILING_CR;
  1234         if (BYTE_TYPE(enc, ptr) == BT_LF)
  1235           ptr += MINBPC(enc);
  1236         *nextTokPtr = ptr;
  1237         return XML_TOK_DATA_NEWLINE;
  1239       *nextTokPtr = ptr;
  1240       return XML_TOK_DATA_CHARS;
  1241     case BT_S:
  1242       if (ptr == start) {
  1243         *nextTokPtr = ptr + MINBPC(enc);
  1244         return XML_TOK_ATTRIBUTE_VALUE_S;
  1246       *nextTokPtr = ptr;
  1247       return XML_TOK_DATA_CHARS;
  1248     default:
  1249       ptr += MINBPC(enc);
  1250       break;
  1253   *nextTokPtr = ptr;
  1254   return XML_TOK_DATA_CHARS;
  /* Returns the next piece of an entity value from [ptr, end).

     Characters that need special treatment are tokenized on their own only
     when they are the first character.  If one is met later, the ordinary
     characters before it are returned as XML_TOK_DATA_CHARS with *nextTokPtr
     pointing at the special character.

     At the first character:
       BT_AMP    -> whatever scanRef returns
       BT_PERCNT -> whatever scanPercent returns, except that a result of
                    XML_TOK_PERCENT is turned into XML_TOK_INVALID
       BT_LF     -> XML_TOK_DATA_NEWLINE
       BT_CR     -> XML_TOK_DATA_NEWLINE (a directly following BT_LF is
                    consumed as well), or XML_TOK_TRAILING_CR when the BT_CR
                    is the last character, in which case *nextTokPtr is not
                    written
     Returns XML_TOK_NONE when ptr == end, and XML_TOK_DATA_CHARS up to `end`
     when no special character is found.  */
  1257 static int PTRCALL
  1258 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
  1259                        const char *end, const char **nextTokPtr)
  1261   const char *start;
  1262   if (ptr == end)
  1263     return XML_TOK_NONE;
  1264   start = ptr;
  1265   while (ptr != end) {
  1266     switch (BYTE_TYPE(enc, ptr)) {
           /* NOTE(review): lead bytes are skipped without checking `end` or
              the validity of the character; presumably the input has already
              been validated by an earlier pass -- confirm. */
  1267 #define LEAD_CASE(n) \
  1268     case BT_LEAD ## n: ptr += n; break;
  1269     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1270 #undef LEAD_CASE
  1271     case BT_AMP:
  1272       if (ptr == start)
  1273         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1274       *nextTokPtr = ptr;
  1275       return XML_TOK_DATA_CHARS;
  1276     case BT_PERCNT:
  1277       if (ptr == start) {
  1278         int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
  1279                                        end, nextTokPtr);
  1280         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
  1282       *nextTokPtr = ptr;
  1283       return XML_TOK_DATA_CHARS;
  1284     case BT_LF:
  1285       if (ptr == start) {
  1286         *nextTokPtr = ptr + MINBPC(enc);
  1287         return XML_TOK_DATA_NEWLINE;
  1289       *nextTokPtr = ptr;
  1290       return XML_TOK_DATA_CHARS;
  1291     case BT_CR:
  1292       if (ptr == start) {
  1293         ptr += MINBPC(enc);
  1294         if (ptr == end)
  1295           return XML_TOK_TRAILING_CR;
  1296         if (BYTE_TYPE(enc, ptr) == BT_LF)
  1297           ptr += MINBPC(enc);
  1298         *nextTokPtr = ptr;
  1299         return XML_TOK_DATA_NEWLINE;
  1301       *nextTokPtr = ptr;
  1302       return XML_TOK_DATA_CHARS;
  1303     default:
  1304       ptr += MINBPC(enc);
  1305       break;
  1308   *nextTokPtr = ptr;
  1309   return XML_TOK_DATA_CHARS;
  1312 #ifdef XML_DTD
  1314 static int PTRCALL
  1315 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
  1316                          const char *end, const char **nextTokPtr)
  1318   int level = 0;
  1319   if (MINBPC(enc) > 1) {
  1320     size_t n = end - ptr;
  1321     if (n & (MINBPC(enc) - 1)) {
  1322       n &= ~(MINBPC(enc) - 1);
  1323       end = ptr + n;
  1326   while (ptr != end) {
  1327     switch (BYTE_TYPE(enc, ptr)) {
  1328     INVALID_CASES(ptr, nextTokPtr)
  1329     case BT_LT:
  1330       if ((ptr += MINBPC(enc)) == end)
  1331         return XML_TOK_PARTIAL;
  1332       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
  1333         if ((ptr += MINBPC(enc)) == end)
  1334           return XML_TOK_PARTIAL;
  1335         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
  1336           ++level;
  1337           ptr += MINBPC(enc);
  1340       break;
  1341     case BT_RSQB:
  1342       if ((ptr += MINBPC(enc)) == end)
  1343         return XML_TOK_PARTIAL;
  1344       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
  1345         if ((ptr += MINBPC(enc)) == end)
  1346           return XML_TOK_PARTIAL;
  1347         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  1348           ptr += MINBPC(enc);
  1349           if (level == 0) {
  1350             *nextTokPtr = ptr;
  1351             return XML_TOK_IGNORE_SECT;
  1353           --level;
  1356       break;
  1357     default:
  1358       ptr += MINBPC(enc);
  1359       break;
  1362   return XML_TOK_PARTIAL;
  1365 #endif /* XML_DTD */
  /* Checks the characters of [ptr, end) except the first and the last unit:
     both pointers are moved inward by MINBPC(enc) before the scan
     (presumably to skip the surrounding quotes -- confirm against the
     caller).

     Returns 1 if every checked character is acceptable.  Otherwise stores the
     position of the first unacceptable character in *badPtr and returns 0.  */
  1367 static int PTRCALL
  1368 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
  1369                    const char **badPtr)
  1371   ptr += MINBPC(enc);
  1372   end -= MINBPC(enc);
  1373   for (; ptr != end; ptr += MINBPC(enc)) {
  1374     switch (BYTE_TYPE(enc, ptr)) {
           /* Byte types that are always acceptable. */
  1375     case BT_DIGIT:
  1376     case BT_HEX:
  1377     case BT_MINUS:
  1378     case BT_APOS:
  1379     case BT_LPAR:
  1380     case BT_RPAR:
  1381     case BT_PLUS:
  1382     case BT_COMMA:
  1383     case BT_SOL:
  1384     case BT_EQUALS:
  1385     case BT_QUEST:
  1386     case BT_CR:
  1387     case BT_LF:
  1388     case BT_SEMI:
  1389     case BT_EXCL:
  1390     case BT_AST:
  1391     case BT_PERCNT:
  1392     case BT_NUM:
  1393 #ifdef XML_NS
  1394     case BT_COLON:
  1395 #endif
  1396       break;
           /* A BT_S character is rejected if it is ASCII_TAB. */
  1397     case BT_S:
  1398       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
  1399         *badPtr = ptr;
  1400         return 0;
  1402       break;
           /* Name characters are accepted only when their value has no bits
              set above 0x7f; all others go through the default check. */
  1403     case BT_NAME:
  1404     case BT_NMSTRT:
  1405       if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
  1406         break;
             /* fall through */
  1407     default:
             /* Of the remaining characters only 0x24 and 0x40 are accepted. */
  1408       switch (BYTE_TO_ASCII(enc, ptr)) {
  1409       case 0x24: /* $ */
  1410       case 0x40: /* @ */
  1411         break;
  1412       default:
  1413         *badPtr = ptr;
  1414         return 0;
  1416       break;
  1419   return 1;
  1422 /* This must only be called for a well-formed start-tag or empty
  1423    element tag.  Returns the number of attributes.  Pointers to the
  1424    first attsMax attributes are stored in atts.
  1425 */
  /* How the scan works:

     `state` starts as inName and the loop starts one unit after `ptr`
     (presumably because `ptr` addresses the start of the tag, so the scan
     begins inside the element name -- confirm).  START_NAME records a new
     attribute name only when a name character is seen in state `other`.  A
     quote character opens a value; the same quote type (kept in `open`)
     closes it and increments nAtts.  Entries are written only while
     nAtts < attsMax, but nAtts keeps counting beyond that.

     atts[i].normalized is set to 1 when the name starts and is cleared when
     a BT_AMP is seen, when a BT_CR or BT_LF occurs inside the value, or for
     the BT_S conditions described at that case.

     The loop has no end pointer; it returns at the first BT_GT or BT_SOL
     found outside a value.  */
  1427 static int PTRCALL
  1428 PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
  1429                 int attsMax, ATTRIBUTE *atts)
  1431   enum { other, inName, inValue } state = inName;
  1432   int nAtts = 0;
  1433   int open = 0; /* defined when state == inValue;
  1434                    initialization just to shut up compilers */
  1436   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
  1437     switch (BYTE_TYPE(enc, ptr)) {
           /* START_NAME: on the first name character after state `other`,
              remember where the attribute name starts and assume the value
              is normalized until shown otherwise.  LEAD_CASE additionally
              skips the extra bytes of a multi-byte character; the loop
              increment supplies the remaining MINBPC(enc). */
  1438 #define START_NAME \
  1439       if (state == other) { \
  1440         if (nAtts < attsMax) { \
  1441           atts[nAtts].name = ptr; \
  1442           atts[nAtts].normalized = 1; \
  1443         } \
  1444         state = inName; \
  1446 #define LEAD_CASE(n) \
  1447     case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
  1448     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1449 #undef LEAD_CASE
  1450     case BT_NONASCII:
  1451     case BT_NMSTRT:
  1452     case BT_HEX:
  1453       START_NAME
  1454       break;
  1455 #undef START_NAME
           /* A quote outside a value opens one; inside a value only the same
              quote type closes it, the other type is ordinary content. */
  1456     case BT_QUOT:
  1457       if (state != inValue) {
  1458         if (nAtts < attsMax)
  1459           atts[nAtts].valuePtr = ptr + MINBPC(enc);
  1460         state = inValue;
  1461         open = BT_QUOT;
  1463       else if (open == BT_QUOT) {
  1464         state = other;
  1465         if (nAtts < attsMax)
  1466           atts[nAtts].valueEnd = ptr;
  1467         nAtts++;
  1469       break;
  1470     case BT_APOS:
  1471       if (state != inValue) {
  1472         if (nAtts < attsMax)
  1473           atts[nAtts].valuePtr = ptr + MINBPC(enc);
  1474         state = inValue;
  1475         open = BT_APOS;
  1477       else if (open == BT_APOS) {
  1478         state = other;
  1479         if (nAtts < attsMax)
  1480           atts[nAtts].valueEnd = ptr;
  1481         nAtts++;
  1483       break;
  1484     case BT_AMP:
  1485       if (nAtts < attsMax)
  1486         atts[nAtts].normalized = 0;
  1487       break;
           /* Inside a value, a BT_S character clears `normalized` if it is
              the first character of the value, is not ASCII_SPACE, is
              followed by another ASCII_SPACE, or is followed by the closing
              quote. */
  1488     case BT_S:
  1489       if (state == inName)
  1490         state = other;
  1491       else if (state == inValue
  1492                && nAtts < attsMax
  1493                && atts[nAtts].normalized
  1494                && (ptr == atts[nAtts].valuePtr
  1495                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
  1496                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
  1497                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
  1498         atts[nAtts].normalized = 0;
  1499       break;
  1500     case BT_CR: case BT_LF:
  1501       /* This case ensures that the first attribute name is counted.
  1502          Apart from that we could just change state on the quote. */
  1503       if (state == inName)
  1504         state = other;
  1505       else if (state == inValue && nAtts < attsMax)
  1506         atts[nAtts].normalized = 0;
  1507       break;
           /* Outside a value these end the tag; inside a value they are
              ordinary content. */
  1508     case BT_GT:
  1509     case BT_SOL:
  1510       if (state != inValue)
  1511         return nAtts;
  1512       break;
  1513     default:
  1514       break;
  1517   /* not reached */
  /* Computes the value of a numeric character reference starting at `ptr`.
     The first two units are skipped.  If the next character is ASCII_x, the
     characters up to ASCII_SEMI are read as hexadecimal digits; otherwise
     they are read as decimal digits.

     Returns -1 as soon as the accumulated value reaches 0x110000; otherwise
     returns the result of checkCharRefNumber() for the value.

     NOTE(review): there is no end pointer and no digit validation here --
     both loops run until ASCII_SEMI is found, so the caller presumably only
     passes references that were already tokenized successfully; confirm.  */
  1520 static int PTRFASTCALL
  1521 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
  1523   int result = 0;
  1524   /* skip &# */
  1525   ptr += 2*MINBPC(enc);
  1526   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
  1527     for (ptr += MINBPC(enc);
  1528          !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
  1529          ptr += MINBPC(enc)) {
  1530       int c = BYTE_TO_ASCII(enc, ptr);
             /* Characters that are not hex digits match no case and leave
                `result` unchanged. */
  1531       switch (c) {
  1532       case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
  1533       case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
  1534         result <<= 4;
  1535         result |= (c - ASCII_0);
  1536         break;
  1537       case ASCII_A: case ASCII_B: case ASCII_C:
  1538       case ASCII_D: case ASCII_E: case ASCII_F:
  1539         result <<= 4;
  1540         result += 10 + (c - ASCII_A);
  1541         break;
  1542       case ASCII_a: case ASCII_b: case ASCII_c:
  1543       case ASCII_d: case ASCII_e: case ASCII_f:
  1544         result <<= 4;
  1545         result += 10 + (c - ASCII_a);
  1546         break;
             /* Checked after every digit, so `result` stays below 0x110000
                before the next shift. */
  1548       if (result >= 0x110000)
  1549         return -1;
  1552   else {
  1553     for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
  1554       int c = BYTE_TO_ASCII(enc, ptr);
  1555       result *= 10;
  1556       result += (c - ASCII_0);
  1557       if (result >= 0x110000)
  1558         return -1;
  1561   return checkCharRefNumber(result);
  /* Maps a name in [ptr, end) to a character code, selected by the name's
     length in characters, (end - ptr) / MINBPC(enc).  The letters below are
     the ones named by the ASCII_* constants the code compares against:
       2 characters  l t     -> ASCII_LT      g t     -> ASCII_GT
       3 characters  a m p   -> ASCII_AMP
       4 characters  q u o t -> ASCII_QUOT    a p o s -> ASCII_APOS
     Returns 0 for every other name.  */
  1564 static int PTRCALL
  1565 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
  1566                              const char *end)
  1568   switch ((end - ptr)/MINBPC(enc)) {
  1569   case 2:
           /* The second character is checked first, then the first one
              selects the result. */
  1570     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
  1571       switch (BYTE_TO_ASCII(enc, ptr)) {
  1572       case ASCII_l:
  1573         return ASCII_LT;
  1574       case ASCII_g:
  1575         return ASCII_GT;
  1578     break;
  1579   case 3:
  1580     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
  1581       ptr += MINBPC(enc);
  1582       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
  1583         ptr += MINBPC(enc);
  1584         if (CHAR_MATCHES(enc, ptr, ASCII_p))
  1585           return ASCII_AMP;
  1588     break;
  1589   case 4:
  1590     switch (BYTE_TO_ASCII(enc, ptr)) {
  1591     case ASCII_q:
  1592       ptr += MINBPC(enc);
  1593       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
  1594         ptr += MINBPC(enc);
  1595         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
  1596           ptr += MINBPC(enc);
  1597           if (CHAR_MATCHES(enc, ptr, ASCII_t))
  1598             return ASCII_QUOT;
  1601       break;
  1602     case ASCII_a:
  1603       ptr += MINBPC(enc);
  1604       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
  1605         ptr += MINBPC(enc);
  1606         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
  1607           ptr += MINBPC(enc);
  1608           if (CHAR_MATCHES(enc, ptr, ASCII_s))
  1609             return ASCII_APOS;
  1612       break;
  1615   return 0;
  /* Compares the name starting at ptr1 with the name starting at ptr2.
     Neither name has an explicit end: the scan follows ptr1 and stops at its
     first character that is not one of the name byte types handled below.

     Returns 0 at the first differing byte, or when the name at ptr1 ends
     while the one at ptr2 continues; returns 1 when both end together.  */
  1618 static int PTRCALL
  1619 PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
  1621   for (;;) {
  1622     switch (BYTE_TYPE(enc, ptr1)) {
           /* Multi-byte characters are compared byte by byte.  The BT_LEAD4
              case falls through BT_LEAD3 and BT_LEAD2 to the final
              comparison, so BT_LEAD<n> compares n bytes in total. */
  1623 #define LEAD_CASE(n) \
  1624     case BT_LEAD ## n: \
  1625       if (*ptr1++ != *ptr2++) \
  1626         return 0;
  1627     LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
  1628 #undef LEAD_CASE
  1629       /* fall through */
  1630       if (*ptr1++ != *ptr2++)
  1631         return 0;
  1632       break;
           /* Other name characters occupy MINBPC(enc) bytes; compare each. */
  1633     case BT_NONASCII:
  1634     case BT_NMSTRT:
  1635 #ifdef XML_NS
  1636     case BT_COLON:
  1637 #endif
  1638     case BT_HEX:
  1639     case BT_DIGIT:
  1640     case BT_NAME:
  1641     case BT_MINUS:
  1642       if (*ptr2++ != *ptr1++)
  1643         return 0;
  1644       if (MINBPC(enc) > 1) {
  1645         if (*ptr2++ != *ptr1++)
  1646           return 0;
  1647         if (MINBPC(enc) > 2) {
  1648           if (*ptr2++ != *ptr1++)
  1649             return 0;
  1650           if (MINBPC(enc) > 3) {
  1651             if (*ptr2++ != *ptr1++)
  1652               return 0;
  1656       break;
  1657     default:
             /* The name at ptr1 has ended.  With one-byte units an identical
                byte at ptr2 is taken to mean that name ended too (presumably
                because the byte type then depends on the byte alone --
                confirm).  Otherwise the names match only if ptr2 is not at
                another name character. */
  1658       if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
  1659         return 1;
  1660       switch (BYTE_TYPE(enc, ptr2)) {
  1661       case BT_LEAD2:
  1662       case BT_LEAD3:
  1663       case BT_LEAD4:
  1664       case BT_NONASCII:
  1665       case BT_NMSTRT:
  1666 #ifdef XML_NS
  1667       case BT_COLON:
  1668 #endif
  1669       case BT_HEX:
  1670       case BT_DIGIT:
  1671       case BT_NAME:
  1672       case BT_MINUS:
  1673         return 0;
  1674       default:
  1675         return 1;
  1679   /* not reached */
  /* Compares the encoded name [ptr1, end1) with the NUL-terminated string
     ptr2.  Each byte of ptr2 is matched against one MINBPC(enc)-sized unit of
     ptr1 using CHAR_MATCHES.

     Returns nonzero only if every byte of ptr2 matches and ptr1 reaches end1
     exactly when ptr2 reaches its terminating NUL; returns 0 otherwise.  */
  1682 static int PTRCALL
  1683 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
  1684                          const char *end1, const char *ptr2)
  1686   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
           /* The encoded name is shorter than the string. */
  1687     if (ptr1 == end1)
  1688       return 0;
  1689     if (!CHAR_MATCHES(enc, ptr1, *ptr2))
  1690       return 0;
         /* The string is exhausted; the name must be exhausted as well. */
  1692   return ptr1 == end1;
  /* Returns the length in bytes of the name that starts at `ptr`, i.e. the
     distance from `ptr` to the first character whose byte type is not one of
     the name byte types listed below.

     NOTE(review): there is no end pointer; the scan relies on such a
     terminating character being present -- confirm against the callers.  */
  1695 static int PTRFASTCALL
  1696 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
  1698   const char *start = ptr;
  1699   for (;;) {
  1700     switch (BYTE_TYPE(enc, ptr)) {
           /* A BT_LEAD<n> character advances by n bytes. */
  1701 #define LEAD_CASE(n) \
  1702     case BT_LEAD ## n: ptr += n; break;
  1703     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1704 #undef LEAD_CASE
  1705     case BT_NONASCII:
  1706     case BT_NMSTRT:
  1707 #ifdef XML_NS
  1708     case BT_COLON:
  1709 #endif
  1710     case BT_HEX:
  1711     case BT_DIGIT:
  1712     case BT_NAME:
  1713     case BT_MINUS:
  1714       ptr += MINBPC(enc);
  1715       break;
  1716     default:
  1717       return (int)(ptr - start);
  /* Returns the address of the first character at or after `ptr` whose byte
     type is not BT_LF, BT_CR or BT_S.

     NOTE(review): there is no end pointer; the scan relies on such a
     character being present -- confirm against the callers.  */
  1722 static const char * PTRFASTCALL
  1723 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
  1725   for (;;) {
  1726     switch (BYTE_TYPE(enc, ptr)) {
  1727     case BT_LF:
  1728     case BT_CR:
  1729     case BT_S:
  1730       ptr += MINBPC(enc);
  1731       break;
  1732     default:
  1733       return ptr;
  /* Advances the line/column counters in `pos` over the text [ptr, end).

     Every character adds one to pos->columnNumber; a BT_LEAD<n> character
     counts as one column although it spans n bytes.  BT_LF, and BT_CR
     together with a directly following BT_LF, add one to pos->lineNumber and
     restart the column count.  */
  1738 static void PTRCALL
  1739 PREFIX(updatePosition)(const ENCODING *enc,
  1740                        const char *ptr,
  1741                        const char *end,
  1742                        POSITION *pos)
  1744   while (ptr != end) {
  1745     switch (BYTE_TYPE(enc, ptr)) {
  1746 #define LEAD_CASE(n) \
  1747     case BT_LEAD ## n: \
  1748       ptr += n; \
  1749       break;
  1750     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1751 #undef LEAD_CASE
  1752     case BT_LF:
             /* Set to (XML_Size)-1 so that the increment after the switch
                brings the column to 0 (assuming XML_Size is an unsigned
                type -- confirm). */
  1753       pos->columnNumber = (XML_Size)-1;
  1754       pos->lineNumber++;
  1755       ptr += MINBPC(enc);
  1756       break;
  1757     case BT_CR:
  1758       pos->lineNumber++;
  1759       ptr += MINBPC(enc);
             /* A BT_LF right after the BT_CR belongs to the same line break
                and is consumed without counting another line. */
  1760       if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
  1761         ptr += MINBPC(enc);
  1762       pos->columnNumber = (XML_Size)-1;
  1763       break;
  1764     default:
  1765       ptr += MINBPC(enc);
  1766       break;
           /* Runs once per loop iteration, i.e. once per character or line
              break handled above. */
  1768     pos->columnNumber++;
  /* Remove the helper macros so they do not outlive this file.
     INVALID_CASES, CHECK_NAME_CASE and CHECK_NAME_CASES are defined at the
     top of this file; the others are presumably defined further up as well
     (not visible from here -- confirm).  */
  1772 #undef DO_LEAD_CASE
  1773 #undef MULTIBYTE_CASES
  1774 #undef INVALID_CASES
  1775 #undef CHECK_NAME_CASE
  1776 #undef CHECK_NAME_CASES
  1777 #undef CHECK_NMSTRT_CASE
  1778 #undef CHECK_NMSTRT_CASES

mercurial