parser/html/javasrc/Tokenizer.java

Fri, 16 Jan 2015 18:13:44 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Fri, 16 Jan 2015 18:13:44 +0100
branch
TOR_BUG_9701
changeset 14
925c144e1f1f
permissions
-rw-r--r--

Integrate suggestion from review to improve consistency with existing code.

     1 /*
     2  * Copyright (c) 2005-2007 Henri Sivonen
     3  * Copyright (c) 2007-2013 Mozilla Foundation
     4  * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla 
     5  * Foundation, and Opera Software ASA.
     6  *
     7  * Permission is hereby granted, free of charge, to any person obtaining a 
     8  * copy of this software and associated documentation files (the "Software"), 
     9  * to deal in the Software without restriction, including without limitation 
    10  * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
    11  * and/or sell copies of the Software, and to permit persons to whom the 
    12  * Software is furnished to do so, subject to the following conditions:
    13  *
    14  * The above copyright notice and this permission notice shall be included in 
    15  * all copies or substantial portions of the Software.
    16  *
    17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
    18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
    19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
    20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
    21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
    22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
    23  * DEALINGS IN THE SOFTWARE.
    24  */
    26 /*
    27  * The comments following this one that use the same comment syntax as this 
    28  * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 
    29  * amended as of June 18 2008 and May 31 2010.
    30  * That document came with this statement:
    31  * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and 
    32  * Opera Software ASA. You are granted a license to use, reproduce and 
    33  * create derivative works of this document."
    34  */
    36 package nu.validator.htmlparser.impl;
    38 import nu.validator.htmlparser.annotation.Auto;
    39 import nu.validator.htmlparser.annotation.CharacterName;
    40 import nu.validator.htmlparser.annotation.Const;
    41 import nu.validator.htmlparser.annotation.Inline;
    42 import nu.validator.htmlparser.annotation.Local;
    43 import nu.validator.htmlparser.annotation.NoLength;
    44 import nu.validator.htmlparser.common.EncodingDeclarationHandler;
    45 import nu.validator.htmlparser.common.Interner;
    46 import nu.validator.htmlparser.common.TokenHandler;
    47 import nu.validator.htmlparser.common.XmlViolationPolicy;
    49 import org.xml.sax.ErrorHandler;
    50 import org.xml.sax.Locator;
    51 import org.xml.sax.SAXException;
    52 import org.xml.sax.SAXParseException;
    54 /**
    55  * An implementation of
    56  * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
    57  * 
    58  * This class implements the <code>Locator</code> interface. This is not an
    59  * incidental implementation detail: Users of this class are encouraged to make
    60  * use of the <code>Locator</code> nature.
    61  * 
    62  * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
    63  * can be configured to treat these conditions as fatal or to coerce the infoset
    64  * to something that XML 1.0 allows.
    65  * 
    66  * @version $Id$
    67  * @author hsivonen
    68  */
    69 public class Tokenizer implements Locator {
    71     private static final int DATA_AND_RCDATA_MASK = ~1;
    73     public static final int DATA = 0;
    75     public static final int RCDATA = 1;
    77     public static final int SCRIPT_DATA = 2;
    79     public static final int RAWTEXT = 3;
    81     public static final int SCRIPT_DATA_ESCAPED = 4;
    83     public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
    85     public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
    87     public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
    89     public static final int PLAINTEXT = 8;
    91     public static final int TAG_OPEN = 9;
    93     public static final int CLOSE_TAG_OPEN = 10;
    95     public static final int TAG_NAME = 11;
    97     public static final int BEFORE_ATTRIBUTE_NAME = 12;
    99     public static final int ATTRIBUTE_NAME = 13;
   101     public static final int AFTER_ATTRIBUTE_NAME = 14;
   103     public static final int BEFORE_ATTRIBUTE_VALUE = 15;
   105     public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
   107     public static final int BOGUS_COMMENT = 17;
   109     public static final int MARKUP_DECLARATION_OPEN = 18;
   111     public static final int DOCTYPE = 19;
   113     public static final int BEFORE_DOCTYPE_NAME = 20;
   115     public static final int DOCTYPE_NAME = 21;
   117     public static final int AFTER_DOCTYPE_NAME = 22;
   119     public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
   121     public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
   123     public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
   125     public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
   127     public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
   129     public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
   131     public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
   133     public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
   135     public static final int BOGUS_DOCTYPE = 31;
   137     public static final int COMMENT_START = 32;
   139     public static final int COMMENT_START_DASH = 33;
   141     public static final int COMMENT = 34;
   143     public static final int COMMENT_END_DASH = 35;
   145     public static final int COMMENT_END = 36;
   147     public static final int COMMENT_END_BANG = 37;
   149     public static final int NON_DATA_END_TAG_NAME = 38;
   151     public static final int MARKUP_DECLARATION_HYPHEN = 39;
   153     public static final int MARKUP_DECLARATION_OCTYPE = 40;
   155     public static final int DOCTYPE_UBLIC = 41;
   157     public static final int DOCTYPE_YSTEM = 42;
   159     public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
   161     public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
   163     public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
   165     public static final int CONSUME_CHARACTER_REFERENCE = 46;
   167     public static final int CONSUME_NCR = 47;
   169     public static final int CHARACTER_REFERENCE_TAIL = 48;
   171     public static final int HEX_NCR_LOOP = 49;
   173     public static final int DECIMAL_NRC_LOOP = 50;
   175     public static final int HANDLE_NCR_VALUE = 51;
   177     public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
   179     public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
   181     public static final int SELF_CLOSING_START_TAG = 54;
   183     public static final int CDATA_START = 55;
   185     public static final int CDATA_SECTION = 56;
   187     public static final int CDATA_RSQB = 57;
   189     public static final int CDATA_RSQB_RSQB = 58;
   191     public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
   193     public static final int SCRIPT_DATA_ESCAPE_START = 60;
   195     public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
   197     public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
   199     public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
   201     public static final int BOGUS_COMMENT_HYPHEN = 64;
   203     public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
   205     public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
   207     public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
   209     public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
   211     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
   213     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
   215     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
   217     public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
   219     public static final int PROCESSING_INSTRUCTION = 73;
   221     public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
   223     /**
   224      * Magic value for UTF-16 operations.
   225      */
   226     private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
   228     /**
   229      * UTF-16 code unit array containing less than and greater than for emitting
   230      * those characters on certain parse errors.
   231      */
   232     private static final @NoLength char[] LT_GT = { '<', '>' };
   234     /**
   235      * UTF-16 code unit array containing less than and solidus for emitting
   236      * those characters on certain parse errors.
   237      */
   238     private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
   240     /**
   241      * UTF-16 code unit array containing ]] for emitting those characters on
   242      * state transitions.
   243      */
   244     private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
   246     /**
   247      * Array version of U+FFFD.
   248      */
   249     private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
   251     // [NOCPP[
   253     /**
   254      * Array version of space.
   255      */
   256     private static final @NoLength char[] SPACE = { ' ' };
   258     // ]NOCPP]
   260     /**
   261      * Array version of line feed.
   262      */
   263     private static final @NoLength char[] LF = { '\n' };
   265     /**
   266      * Buffer growth parameter.
   267      */
   268     private static final int BUFFER_GROW_BY = 1024;
   270     /**
   271      * "CDATA[" as <code>char[]</code>
   272      */
   273     private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T',
   274             'A', '[' };
   276     /**
   277      * "octype" as <code>char[]</code>
   278      */
   279     private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p',
   280             'e' };
   282     /**
   283      * "ublic" as <code>char[]</code>
   284      */
   285     private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' };
   287     /**
   288      * "ystem" as <code>char[]</code>
   289      */
   290     private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' };
   292     private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
   294     private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
   296     private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
   298     private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
   299             'e', 'x', 't' };
   301     private static final char[] XMP_ARR = { 'x', 'm', 'p' };
   303     private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
   304             'e', 'a' };
   306     private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
   308     private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
   309             'd' };
   311     private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
   312             'p', 't' };
   314     private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
   315             'e', 's' };
   317     /**
   318      * The token handler.
   319      */
   320     protected final TokenHandler tokenHandler;
   322     protected EncodingDeclarationHandler encodingDeclarationHandler;
   324     // [NOCPP[
   326     /**
   327      * The error handler.
   328      */
   329     protected ErrorHandler errorHandler;
   331     // ]NOCPP]
   333     /**
   334      * Whether the previous char read was CR.
   335      */
   336     protected boolean lastCR;
   338     protected int stateSave;
   340     private int returnStateSave;
   342     protected int index;
   344     private boolean forceQuirks;
   346     private char additional;
   348     private int entCol;
   350     private int firstCharKey;
   352     private int lo;
   354     private int hi;
   356     private int candidate;
   358     private int strBufMark;
   360     private int prevValue;
   362     protected int value;
   364     private boolean seenDigits;
   366     protected int cstart;
   368     /**
   369      * The SAX public id for the resource being tokenized. (Only passed back
   370      * as part of locator data.)
   371      */
   372     private String publicId;
   374     /**
   375      * The SAX system id for the resource being tokenized. (Only passed back
   376      * as part of locator data.)
   377      */
   378     private String systemId;
   380     /**
   381      * Buffer for short identifiers.
   382      */
   383     private @Auto char[] strBuf;
   385     /**
   386      * Number of significant <code>char</code>s in <code>strBuf</code>.
   387      */
   388     private int strBufLen;
   390     /**
   391      * <code>-1</code> to indicate that <code>strBuf</code> is used or otherwise
   392      * an offset to the main buffer.
   393      */
   394     // private int strBufOffset = -1;
   395     /**
   396      * Buffer for long strings.
   397      */
   398     private @Auto char[] longStrBuf;
   400     /**
   401      * Number of significant <code>char</code>s in <code>longStrBuf</code>.
   402      */
   403     private int longStrBufLen;
   405     /**
   406      * <code>-1</code> to indicate that <code>longStrBuf</code> is used or
   407      * otherwise an offset to the main buffer.
   408      */
   409     // private int longStrBufOffset = -1;
   411     /**
   412      * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
   413      */
   414     private final @Auto char[] bmpChar;
   416     /**
   417      * Buffer for expanding astral NCRs.
   418      */
   419     private final @Auto char[] astralChar;
   421     /**
   422      * The element whose end tag closes the current CDATA or RCDATA element.
   423      */
   424     protected ElementName endTagExpectation = null;
   426     private char[] endTagExpectationAsArray; // not @Auto!
   428     /**
   429      * <code>true</code> if tokenizing an end tag
   430      */
   431     protected boolean endTag;
   433     /**
   434      * The current tag token name.
   435      */
   436     private ElementName tagName = null;
   438     /**
   439      * The current attribute name.
   440      */
   441     protected AttributeName attributeName = null;
   443     // [NOCPP[
   445     /**
   446      * Whether comment tokens are emitted.
   447      */
   448     private boolean wantsComments = false;
   450     /**
   451      * <code>true</code> when HTML4-specific additional errors are requested.
   452      */
   453     protected boolean html4;
   455     /**
   456      * Whether the stream is past the first 512 bytes.
   457      */
   458     private boolean metaBoundaryPassed;
   460     // ]NOCPP]
   462     /**
   463      * The name of the current doctype token.
   464      */
   465     private @Local String doctypeName;
   467     /**
   468      * The public id of the current doctype token.
   469      */
   470     private String publicIdentifier;
   472     /**
   473      * The system id of the current doctype token.
   474      */
   475     private String systemIdentifier;
   477     /**
   478      * The attribute holder.
   479      */
   480     private HtmlAttributes attributes;
   482     // [NOCPP[
   484     /**
   485      * The policy for vertical tab and form feed.
   486      */
   487     private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
   489     /**
   490      * The policy for comments.
   491      */
   492     private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
   494     private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
   496     private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
   498     private boolean html4ModeCompatibleWithXhtml1Schemata;
   500     private int mappingLangToXmlLang;
   502     // ]NOCPP]
   504     private final boolean newAttributesEachTime;
   506     private boolean shouldSuspend;
   508     protected boolean confident;
   510     private int line;
   512     private Interner interner;
   514     // CPPONLY: private boolean viewingXmlSource;
   516     // [NOCPP[
   518     protected LocatorImpl ampersandLocation;
   520     public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
   521         this.tokenHandler = tokenHandler;
   522         this.encodingDeclarationHandler = null;
   523         this.newAttributesEachTime = newAttributesEachTime;
   524         this.bmpChar = new char[1];
   525         this.astralChar = new char[2];
   526         this.tagName = null;
   527         this.attributeName = null;
   528         this.doctypeName = null;
   529         this.publicIdentifier = null;
   530         this.systemIdentifier = null;
   531         this.attributes = null;
   532     }
   534     // ]NOCPP]
    /**
     * Creates a tokenizer that reports tokens to the given handler. In the
     * Java build <code>newAttributesEachTime</code> is fixed to
     * <code>false</code>, so <code>emptyAttributes()</code> returns the shared
     * <code>HtmlAttributes.EMPTY_ATTRIBUTES</code> instance.
     * 
     * @param tokenHandler
     *            the handler for receiving tokens
     */
    public Tokenizer(TokenHandler tokenHandler
    // CPPONLY: , boolean viewingXmlSource        
    ) {
        // NOTE(review): the CPPONLY / NOCPP comments below look like
        // directives for a Java-to-C++ translation step; keep them and the
        // statement order intact unless that tooling has been checked.
        this.tokenHandler = tokenHandler;
        this.encodingDeclarationHandler = null;
        // [NOCPP[
        this.newAttributesEachTime = false;
        // ]NOCPP]
        // One code unit for BMP NCRs, two code units for astral NCRs.
        this.bmpChar = new char[1];
        this.astralChar = new char[2];
        // No token is in progress at construction time.
        this.tagName = null;
        this.attributeName = null;
        this.doctypeName = null;
        this.publicIdentifier = null;
        this.systemIdentifier = null;
        // [NOCPP[
        this.attributes = null;
        // ]NOCPP]
        // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
        // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
        // CPPONLY: this.viewingXmlSource = viewingXmlSource;
    }
   565     public void setInterner(Interner interner) {
   566         this.interner = interner;
   567     }
   569     public void initLocation(String newPublicId, String newSystemId) {
   570         this.systemId = newSystemId;
   571         this.publicId = newPublicId;
   573     }
   575     // CPPONLY: boolean isViewingXmlSource() {
   576     // CPPONLY: return viewingXmlSource;
   577     // CPPONLY: }
   579     // [NOCPP[
   581     /**
   582      * Returns the mappingLangToXmlLang.
   583      * 
   584      * @return the mappingLangToXmlLang
   585      */
   586     public boolean isMappingLangToXmlLang() {
   587         return mappingLangToXmlLang == AttributeName.HTML_LANG;
   588     }
   590     /**
   591      * Sets the mappingLangToXmlLang.
   592      * 
   593      * @param mappingLangToXmlLang
   594      *            the mappingLangToXmlLang to set
   595      */
   596     public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
   597         this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
   598                 : AttributeName.HTML;
   599     }
   601     /**
   602      * Sets the error handler.
   603      * 
   604      * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
   605      */
   606     public void setErrorHandler(ErrorHandler eh) {
   607         this.errorHandler = eh;
   608     }
   610     public ErrorHandler getErrorHandler() {
   611         return this.errorHandler;
   612     }
   614     /**
   615      * Sets the commentPolicy.
   616      * 
   617      * @param commentPolicy
   618      *            the commentPolicy to set
   619      */
   620     public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
   621         this.commentPolicy = commentPolicy;
   622     }
   624     /**
   625      * Sets the contentNonXmlCharPolicy.
   626      * 
   627      * @param contentNonXmlCharPolicy
   628      *            the contentNonXmlCharPolicy to set
   629      */
   630     public void setContentNonXmlCharPolicy(
   631             XmlViolationPolicy contentNonXmlCharPolicy) {
   632         if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
   633             throw new IllegalArgumentException(
   634                     "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
   635         }
   636     }
   638     /**
   639      * Sets the contentSpacePolicy.
   640      * 
   641      * @param contentSpacePolicy
   642      *            the contentSpacePolicy to set
   643      */
   644     public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
   645         this.contentSpacePolicy = contentSpacePolicy;
   646     }
   648     /**
   649      * Sets the xmlnsPolicy.
   650      * 
   651      * @param xmlnsPolicy
   652      *            the xmlnsPolicy to set
   653      */
   654     public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
   655         if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
   656             throw new IllegalArgumentException("Can't use FATAL here.");
   657         }
   658         this.xmlnsPolicy = xmlnsPolicy;
   659     }
   661     public void setNamePolicy(XmlViolationPolicy namePolicy) {
   662         this.namePolicy = namePolicy;
   663     }
   665     /**
   666      * Sets the html4ModeCompatibleWithXhtml1Schemata.
   667      * 
   668      * @param html4ModeCompatibleWithXhtml1Schemata
   669      *            the html4ModeCompatibleWithXhtml1Schemata to set
   670      */
   671     public void setHtml4ModeCompatibleWithXhtml1Schemata(
   672             boolean html4ModeCompatibleWithXhtml1Schemata) {
   673         this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
   674     }
   676     // ]NOCPP]
   678     // For the token handler to call
   679     /**
   680      * Sets the tokenizer state and the associated element name. This should 
   681      * only ever used to put the tokenizer into one of the states that have
   682      * a special end tag expectation.
   683      * 
   684      * @param specialTokenizerState
   685      *            the tokenizer state to set
   686      * @param endTagExpectation
   687      *            the expected end tag for transitioning back to normal
   688      */
   689     public void setStateAndEndTagExpectation(int specialTokenizerState,
   690             @Local String endTagExpectation) {
   691         this.stateSave = specialTokenizerState;
   692         if (specialTokenizerState == Tokenizer.DATA) {
   693             return;
   694         }
   695         @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
   696         this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,
   697                 asArray.length, interner);
   698         endTagExpectationToArray();
   699     }
   701     /**
   702      * Sets the tokenizer state and the associated element name. This should 
   703      * only ever used to put the tokenizer into one of the states that have
   704      * a special end tag expectation.
   705      * 
   706      * @param specialTokenizerState
   707      *            the tokenizer state to set
   708      * @param endTagExpectation
   709      *            the expected end tag for transitioning back to normal
   710      */
   711     public void setStateAndEndTagExpectation(int specialTokenizerState,
   712             ElementName endTagExpectation) {
   713         this.stateSave = specialTokenizerState;
   714         this.endTagExpectation = endTagExpectation;
   715         endTagExpectationToArray();
   716     }
   718     private void endTagExpectationToArray() {
   719         switch (endTagExpectation.getGroup()) {
   720             case TreeBuilder.TITLE:
   721                 endTagExpectationAsArray = TITLE_ARR;
   722                 return;
   723             case TreeBuilder.SCRIPT:
   724                 endTagExpectationAsArray = SCRIPT_ARR;
   725                 return;
   726             case TreeBuilder.STYLE:
   727                 endTagExpectationAsArray = STYLE_ARR;
   728                 return;
   729             case TreeBuilder.PLAINTEXT:
   730                 endTagExpectationAsArray = PLAINTEXT_ARR;
   731                 return;
   732             case TreeBuilder.XMP:
   733                 endTagExpectationAsArray = XMP_ARR;
   734                 return;
   735             case TreeBuilder.TEXTAREA:
   736                 endTagExpectationAsArray = TEXTAREA_ARR;
   737                 return;
   738             case TreeBuilder.IFRAME:
   739                 endTagExpectationAsArray = IFRAME_ARR;
   740                 return;
   741             case TreeBuilder.NOEMBED:
   742                 endTagExpectationAsArray = NOEMBED_ARR;
   743                 return;
   744             case TreeBuilder.NOSCRIPT:
   745                 endTagExpectationAsArray = NOSCRIPT_ARR;
   746                 return;
   747             case TreeBuilder.NOFRAMES:
   748                 endTagExpectationAsArray = NOFRAMES_ARR;
   749                 return;
   750             default:
   751                 assert false: "Bad end tag expectation.";
   752                 return;
   753         }
   754     }
   756     /**
   757      * For C++ use only.
   758      */
   759     public void setLineNumber(int line) {
   760         this.line = line;
   761     }
   763     // start Locator impl
   765     /**
   766      * @see org.xml.sax.Locator#getLineNumber()
   767      */
   768     @Inline public int getLineNumber() {
   769         return line;
   770     }
   772     // [NOCPP[
   774     /**
   775      * @see org.xml.sax.Locator#getColumnNumber()
   776      */
   777     @Inline public int getColumnNumber() {
   778         return -1;
   779     }
   781     /**
   782      * @see org.xml.sax.Locator#getPublicId()
   783      */
   784     public String getPublicId() {
   785         return publicId;
   786     }
   788     /**
   789      * @see org.xml.sax.Locator#getSystemId()
   790      */
   791     public String getSystemId() {
   792         return systemId;
   793     }
   795     // end Locator impl
   797     // end public API
   799     public void notifyAboutMetaBoundary() {
   800         metaBoundaryPassed = true;
   801     }
   803     void turnOnAdditionalHtml4Errors() {
   804         html4 = true;
   805     }
   807     // ]NOCPP]
   809     HtmlAttributes emptyAttributes() {
   810         // [NOCPP[
   811         if (newAttributesEachTime) {
   812             return new HtmlAttributes(mappingLangToXmlLang);
   813         } else {
   814             // ]NOCPP]
   815             return HtmlAttributes.EMPTY_ATTRIBUTES;
   816             // [NOCPP[
   817         }
   818         // ]NOCPP]
   819     }
   821     @Inline private void clearStrBufAndAppend(char c) {
   822         strBuf[0] = c;
   823         strBufLen = 1;
   824     }
   826     @Inline private void clearStrBuf() {
   827         strBufLen = 0;
   828     }
   830     /**
   831      * Appends to the smaller buffer.
   832      * 
   833      * @param c
   834      *            the UTF-16 code unit to append
   835      */
   836     private void appendStrBuf(char c) {
   837         if (strBufLen == strBuf.length) {
   838             char[] newBuf = new char[strBuf.length + Tokenizer.BUFFER_GROW_BY];
   839             System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
   840             strBuf = newBuf;
   841         }
   842         strBuf[strBufLen++] = c;
   843     }
   845     /**
   846      * The smaller buffer as a String. Currently only used for error reporting.
   847      * 
   848      * <p>
   849      * C++ memory note: The return value must be released.
   850      * 
   851      * @return the smaller buffer as a string
   852      */
   853     protected String strBufToString() {
   854         return Portability.newStringFromBuffer(strBuf, 0, strBufLen);
   855     }
   857     /**
   858      * Returns the short buffer as a local name. The return value is released in
   859      * emitDoctypeToken().
   860      * 
   861      * @return the smaller buffer as local name
   862      */
   863     private void strBufToDoctypeName() {
   864         doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
   865                 interner);
   866     }
   868     /**
   869      * Emits the smaller buffer as character tokens.
   870      * 
   871      * @throws SAXException
   872      *             if the token handler threw
   873      */
   874     private void emitStrBuf() throws SAXException {
   875         if (strBufLen > 0) {
   876             tokenHandler.characters(strBuf, 0, strBufLen);
   877         }
   878     }
   880     @Inline private void clearLongStrBuf() {
   881         longStrBufLen = 0;
   882     }
   884     @Inline private void clearLongStrBufAndAppend(char c) {
   885         longStrBuf[0] = c;
   886         longStrBufLen = 1;
   887     }
   889     /**
   890      * Appends to the larger buffer.
   891      * 
   892      * @param c
   893      *            the UTF-16 code unit to append
   894      */
   895     private void appendLongStrBuf(char c) {
   896         if (longStrBufLen == longStrBuf.length) {
   897             char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)];
   898             System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
   899             longStrBuf = newBuf;
   900         }
   901         longStrBuf[longStrBufLen++] = c;
   902     }
   904     @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
   905         // [NOCPP[
   906         switch (commentPolicy) {
   907             case ALTER_INFOSET:
   908                 // detachLongStrBuf();
   909                 appendLongStrBuf(' ');
   910                 // FALLTHROUGH
   911             case ALLOW:
   912                 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
   913                 // ]NOCPP]
   914                 appendLongStrBuf('-');
   915                 // [NOCPP[
   916                 break;
   917             case FATAL:
   918                 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
   919                 break;
   920         }
   921         // ]NOCPP]
   922     }
   924     // [NOCPP[
   925     private void maybeAppendSpaceToBogusComment() throws SAXException {
   926         switch (commentPolicy) {
   927             case ALTER_INFOSET:
   928                 // detachLongStrBuf();
   929                 appendLongStrBuf(' ');
   930                 // FALLTHROUGH
   931             case ALLOW:
   932                 warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
   933                 break;
   934             case FATAL:
   935                 fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
   936                 break;
   937         }
   938     }
   940     // ]NOCPP]
   942     @Inline private void adjustDoubleHyphenAndAppendToLongStrBufAndErr(char c)
   943             throws SAXException {
   944         errConsecutiveHyphens();
   945         // [NOCPP[
   946         switch (commentPolicy) {
   947             case ALTER_INFOSET:
   948                 // detachLongStrBuf();
   949                 longStrBufLen--;
   950                 appendLongStrBuf(' ');
   951                 appendLongStrBuf('-');
   952                 // FALLTHROUGH
   953             case ALLOW:
   954                 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
   955                 // ]NOCPP]
   956                 appendLongStrBuf(c);
   957                 // [NOCPP[
   958                 break;
   959             case FATAL:
   960                 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
   961                 break;
   962         }
   963         // ]NOCPP]
   964     }
   966     private void appendLongStrBuf(@NoLength char[] buffer, int offset, int length) {
   967         int reqLen = longStrBufLen + length;
   968         if (longStrBuf.length < reqLen) {
   969             char[] newBuf = new char[reqLen + (reqLen >> 1)];
   970             System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
   971             longStrBuf = newBuf;
   972         }
   973         System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length);
   974         longStrBufLen = reqLen;
   975     }
   977     /**
   978      * Append the contents of the smaller buffer to the larger one.
   979      */
   980     @Inline private void appendStrBufToLongStrBuf() {
   981         appendLongStrBuf(strBuf, 0, strBufLen);
   982     }
   984     /**
   985      * The larger buffer as a string.
   986      * 
   987      * <p>
   988      * C++ memory note: The return value must be released.
   989      * 
   990      * @return the larger buffer as a string
   991      */
   992     private String longStrBufToString() {
   993         return Portability.newStringFromBuffer(longStrBuf, 0, longStrBufLen);
   994     }
   996     /**
   997      * Emits the current comment token.
   998      * 
   999      * @param pos
  1000      *            TODO
  1002      * @throws SAXException
  1003      */
  1004     private void emitComment(int provisionalHyphens, int pos)
  1005             throws SAXException {
  1006         // [NOCPP[
  1007         if (wantsComments) {
  1008             // ]NOCPP]
  1009             // if (longStrBufOffset != -1) {
  1010             // tokenHandler.comment(buf, longStrBufOffset, longStrBufLen
  1011             // - provisionalHyphens);
  1012             // } else {
  1013             tokenHandler.comment(longStrBuf, 0, longStrBufLen
  1014                     - provisionalHyphens);
  1015             // }
  1016             // [NOCPP[
  1018         // ]NOCPP]
  1019         cstart = pos + 1;
  1022     /**
  1023      * Flushes coalesced character tokens.
  1025      * @param buf
  1026      *            TODO
  1027      * @param pos
  1028      *            TODO
  1030      * @throws SAXException
  1031      */
  1032     protected void flushChars(@NoLength char[] buf, int pos)
  1033             throws SAXException {
  1034         if (pos > cstart) {
  1035             tokenHandler.characters(buf, cstart, pos - cstart);
  1037         cstart = Integer.MAX_VALUE;
  1040     /**
  1041      * Reports an condition that would make the infoset incompatible with XML
  1042      * 1.0 as fatal.
  1044      * @param message
  1045      *            the message
  1046      * @throws SAXException
  1047      * @throws SAXParseException
  1048      */
  1049     public void fatal(String message) throws SAXException {
  1050         SAXParseException spe = new SAXParseException(message, this);
  1051         if (errorHandler != null) {
  1052             errorHandler.fatalError(spe);
  1054         throw spe;
  1057     /**
  1058      * Reports a Parse Error.
  1060      * @param message
  1061      *            the message
  1062      * @throws SAXException
  1063      */
  1064     public void err(String message) throws SAXException {
  1065         if (errorHandler == null) {
  1066             return;
  1068         SAXParseException spe = new SAXParseException(message, this);
  1069         errorHandler.error(spe);
  1072     public void errTreeBuilder(String message) throws SAXException {
  1073         ErrorHandler eh = null;
  1074         if (tokenHandler instanceof TreeBuilder<?>) {
  1075             TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
  1076             eh = treeBuilder.getErrorHandler();
  1078         if (eh == null) {
  1079             eh = errorHandler;
  1081         if (eh == null) {
  1082             return;
  1084         SAXParseException spe = new SAXParseException(message, this);
  1085         eh.error(spe);
  1088     /**
  1089      * Reports a warning
  1091      * @param message
  1092      *            the message
  1093      * @throws SAXException
  1094      */
  1095     public void warn(String message) throws SAXException {
  1096         if (errorHandler == null) {
  1097             return;
  1099         SAXParseException spe = new SAXParseException(message, this);
  1100         errorHandler.warning(spe);
  1103     private void strBufToElementNameString() {
  1104         // if (strBufOffset != -1) {
  1105         // return ElementName.elementNameByBuffer(buf, strBufOffset, strBufLen);
  1106         // } else {
  1107         tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
  1108                 interner);
  1109         // }
  1112     private int emitCurrentTagToken(boolean selfClosing, int pos)
  1113             throws SAXException {
  1114         cstart = pos + 1;
  1115         maybeErrSlashInEndTag(selfClosing);
  1116         stateSave = Tokenizer.DATA;
  1117         HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
  1118                 : attributes);
  1119         if (endTag) {
  1120             /*
  1121              * When an end tag token is emitted, the content model flag must be
  1122              * switched to the PCDATA state.
  1123              */
  1124             maybeErrAttributesOnEndTag(attrs);
  1125             // CPPONLY: if (!viewingXmlSource) {
  1126             tokenHandler.endTag(tagName);
  1127             // CPPONLY: }
  1128             // CPPONLY: if (newAttributesEachTime) {
  1129             // CPPONLY:   Portability.delete(attributes);
  1130             // CPPONLY:   attributes = null;
  1131             // CPPONLY: }
  1132         } else {
  1133             // CPPONLY: if (viewingXmlSource) {
  1134             // CPPONLY:   assert newAttributesEachTime;
  1135             // CPPONLY:   Portability.delete(attributes);
  1136             // CPPONLY:   attributes = null;
  1137             // CPPONLY: } else {
  1138             tokenHandler.startTag(tagName, attrs, selfClosing);
  1139             // CPPONLY: }
  1141         tagName.release();
  1142         tagName = null;
  1143         if (newAttributesEachTime) {
  1144             attributes = null;
  1145         } else {
  1146             attributes.clear(mappingLangToXmlLang);
  1148         /*
  1149          * The token handler may have called setStateAndEndTagExpectation
  1150          * and changed stateSave since the start of this method.
  1151          */
  1152         return stateSave;
  1155     private void attributeNameComplete() throws SAXException {
  1156         // if (strBufOffset != -1) {
  1157         // attributeName = AttributeName.nameByBuffer(buf, strBufOffset,
  1158         // strBufLen, namePolicy != XmlViolationPolicy.ALLOW);
  1159         // } else {
  1160         attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
  1161         // [NOCPP[
  1162                 , namePolicy != XmlViolationPolicy.ALLOW
  1163                 // ]NOCPP]
  1164                 , interner);
  1165         // }
  1167         if (attributes == null) {
  1168             attributes = new HtmlAttributes(mappingLangToXmlLang);
  1171         /*
  1172          * When the user agent leaves the attribute name state (and before
  1173          * emitting the tag token, if appropriate), the complete attribute's
  1174          * name must be compared to the other attributes on the same token; if
  1175          * there is already an attribute on the token with the exact same name,
  1176          * then this is a parse error and the new attribute must be dropped,
  1177          * along with the value that gets associated with it (if any).
  1178          */
  1179         if (attributes.contains(attributeName)) {
  1180             errDuplicateAttribute();
  1181             attributeName.release();
  1182             attributeName = null;
  1186     private void addAttributeWithoutValue() throws SAXException {
  1187         noteAttributeWithoutValue();
  1189         // [NOCPP[
  1190         if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
  1191                 && ElementName.META == tagName) {
  1192             err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
  1194         // ]NOCPP]
  1195         if (attributeName != null) {
  1196             // [NOCPP[
  1197             if (html4) {
  1198                 if (attributeName.isBoolean()) {
  1199                     if (html4ModeCompatibleWithXhtml1Schemata) {
  1200                         attributes.addAttribute(attributeName,
  1201                                 attributeName.getLocal(AttributeName.HTML),
  1202                                 xmlnsPolicy);
  1203                     } else {
  1204                         attributes.addAttribute(attributeName, "", xmlnsPolicy);
  1206                 } else {
  1207                     if (AttributeName.BORDER != attributeName) {
  1208                         err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
  1209                         attributes.addAttribute(attributeName, "", xmlnsPolicy);
  1212             } else {
  1213                 if (AttributeName.SRC == attributeName
  1214                         || AttributeName.HREF == attributeName) {
  1215                     warn("Attribute \u201C"
  1216                             + attributeName.getLocal(AttributeName.HTML)
  1217                             + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
  1219                 // ]NOCPP]
  1220                 attributes.addAttribute(attributeName,
  1221                         Portability.newEmptyString()
  1222                         // [NOCPP[
  1223                         , xmlnsPolicy
  1224                 // ]NOCPP]
  1225                 );
  1226                 // [NOCPP[
  1228             // ]NOCPP]
  1229             attributeName = null; // attributeName has been adopted by the
  1230             // |attributes| object
  1234     private void addAttributeWithValue() throws SAXException {
  1235         // [NOCPP[
  1236         if (metaBoundaryPassed && ElementName.META == tagName
  1237                 && AttributeName.CHARSET == attributeName) {
  1238             err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
  1240         // ]NOCPP]
  1241         if (attributeName != null) {
  1242             String val = longStrBufToString(); // Ownership transferred to
  1243             // HtmlAttributes
  1244             // CPPONLY: if (mViewSource) {
  1245             // CPPONLY:   mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
  1246             // CPPONLY: }
  1247             // [NOCPP[
  1248             if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
  1249                     && attributeName.isCaseFolded()) {
  1250                 val = newAsciiLowerCaseStringFromString(val);
  1252             // ]NOCPP]
  1253             attributes.addAttribute(attributeName, val
  1254             // [NOCPP[
  1255                     , xmlnsPolicy
  1256             // ]NOCPP]
  1257             );
  1258             attributeName = null; // attributeName has been adopted by the
  1259             // |attributes| object
  1263     // [NOCPP[
  1265     private static String newAsciiLowerCaseStringFromString(String str) {
  1266         if (str == null) {
  1267             return null;
  1269         char[] buf = new char[str.length()];
  1270         for (int i = 0; i < str.length(); i++) {
  1271             char c = str.charAt(i);
  1272             if (c >= 'A' && c <= 'Z') {
  1273                 c += 0x20;
  1275             buf[i] = c;
  1277         return new String(buf);
  1280     protected void startErrorReporting() throws SAXException {
  1284     // ]NOCPP]
  1286     public void start() throws SAXException {
  1287         initializeWithoutStarting();
  1288         tokenHandler.startTokenization(this);
  1289         // [NOCPP[
  1290         startErrorReporting();
  1291         // ]NOCPP]
  1294     public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
  1295         int state = stateSave;
  1296         int returnState = returnStateSave;
  1297         char c = '\u0000';
  1298         shouldSuspend = false;
  1299         lastCR = false;
  1301         int start = buffer.getStart();
  1302         /**
  1303          * The index of the last <code>char</code> read from <code>buf</code>.
  1304          */
  1305         int pos = start - 1;
  1307         /**
  1308          * The index of the first <code>char</code> in <code>buf</code> that is
  1309          * part of a coalesced run of character tokens or
  1310          * <code>Integer.MAX_VALUE</code> if there is not a current run being
  1311          * coalesced.
  1312          */
  1313         switch (state) {
  1314             case DATA:
  1315             case RCDATA:
  1316             case SCRIPT_DATA:
  1317             case PLAINTEXT:
  1318             case RAWTEXT:
  1319             case CDATA_SECTION:
  1320             case SCRIPT_DATA_ESCAPED:
  1321             case SCRIPT_DATA_ESCAPE_START:
  1322             case SCRIPT_DATA_ESCAPE_START_DASH:
  1323             case SCRIPT_DATA_ESCAPED_DASH:
  1324             case SCRIPT_DATA_ESCAPED_DASH_DASH:
  1325             case SCRIPT_DATA_DOUBLE_ESCAPE_START:
  1326             case SCRIPT_DATA_DOUBLE_ESCAPED:
  1327             case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
  1328             case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
  1329             case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
  1330             case SCRIPT_DATA_DOUBLE_ESCAPE_END:
  1331                 cstart = start;
  1332                 break;
  1333             default:
  1334                 cstart = Integer.MAX_VALUE;
  1335                 break;
  1338         /**
  1339          * The number of <code>char</code>s in <code>buf</code> that have
  1340          * meaning. (The rest of the array is garbage and should not be
  1341          * examined.)
  1342          */
  1343         // CPPONLY: if (mViewSource) {
  1344         // CPPONLY:   mViewSource.SetBuffer(buffer);
  1345         // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
  1346         // CPPONLY:   mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
  1347         // CPPONLY: } else {
  1348         // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
  1349         // CPPONLY: }
  1350         // [NOCPP[
  1351         pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
  1352                 buffer.getEnd());
  1353         // ]NOCPP]
  1354         if (pos == buffer.getEnd()) {
  1355             // exiting due to end of buffer
  1356             buffer.setStart(pos);
  1357         } else {
  1358             buffer.setStart(pos + 1);
  1360         return lastCR;
  1363     @SuppressWarnings("unused") private int stateLoop(int state, char c,
  1364             int pos, @NoLength char[] buf, boolean reconsume, int returnState,
  1365             int endPos) throws SAXException {
  1366         /*
  1367          * Idioms used in this code:
  1370          * Consuming the next input character
  1372          * To consume the next input character, the code does this: if (++pos ==
  1373          * endPos) { break stateloop; } c = checkChar(buf, pos);
  1376          * Staying in a state
  1378          * When there's a state that the tokenizer may stay in over multiple
  1379          * input characters, the state has a wrapper |for(;;)| loop and staying
  1380          * in the state continues the loop.
  1383          * Switching to another state
  1385          * To switch to another state, the code sets the state variable to the
  1386          * magic number of the new state. Then it either continues stateloop or
  1387          * breaks out of the state's own wrapper loop if the target state is
  1388          * right after the current state in source order. (This is a partial
  1389          * workaround for Java's lack of goto.)
  1392          * Reconsume support
  1394          * The spec sometimes says that an input character is reconsumed in
  1395          * another state. If a state can ever be entered so that an input
  1396          * character can be reconsumed in it, the state's code starts with an
  1397          * |if (reconsume)| that sets reconsume to false and skips over the
  1398          * normal code for consuming a new character.
  1400          * To reconsume the current character in another state, the code sets
  1401          * |reconsume| to true and then switches to the other state.
  1404          * Emitting character tokens
  1406          * This method emits character tokens lazily. Whenever a new range of
  1407          * character tokens starts, the field cstart must be set to the start
  1408          * index of the range. The flushChars() method must be called at the end
  1409          * of a range to flush it.
  1412          * U+0000 handling
  1414          * The various states have to handle the replacement of U+0000 with
  1415          * U+FFFD. However, if U+0000 would be reconsumed in another state, the
  1416          * replacement doesn't need to happen, because it's handled by the
  1417          * reconsuming state.
  1420          * LF handling
  1422          * Every state needs to increment the line number upon LF unless the LF
  1423          * gets reconsumed by another state which increments the line number.
  1426          * CR handling
  1428          * Every state needs to handle CR unless the CR gets reconsumed and is
  1429          * handled by the reconsuming state. The CR needs to be handled as if it
  1430          * were an LF, the lastCR field must be set to true and then this
  1431          * method must return. The IO driver will then swallow the next
  1432          * character if it is an LF to coalesce CRLF.
  1433          */
  1434         stateloop: for (;;) {
  1435             switch (state) {
  1436                 case DATA:
  1437                     dataloop: for (;;) {
  1438                         if (reconsume) {
  1439                             reconsume = false;
  1440                         } else {
  1441                             if (++pos == endPos) {
  1442                                 break stateloop;
  1444                             c = checkChar(buf, pos);
  1446                         switch (c) {
  1447                             case '&':
  1448                                 /*
  1449                                  * U+0026 AMPERSAND (&) Switch to the character
  1450                                  * reference in data state.
  1451                                  */
  1452                                 flushChars(buf, pos);
  1453                                 clearStrBufAndAppend(c);
  1454                                 setAdditionalAndRememberAmpersandLocation('\u0000');
  1455                                 returnState = state;
  1456                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
  1457                                 continue stateloop;
  1458                             case '<':
  1459                                 /*
  1460                                  * U+003C LESS-THAN SIGN (<) Switch to the tag
  1461                                  * open state.
  1462                                  */
  1463                                 flushChars(buf, pos);
  1465                                 state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
  1466                                 break dataloop; // FALL THROUGH continue
  1467                             // stateloop;
  1468                             case '\u0000':
  1469                                 emitReplacementCharacter(buf, pos);
  1470                                 continue;
  1471                             case '\r':
  1472                                 emitCarriageReturn(buf, pos);
  1473                                 break stateloop;
  1474                             case '\n':
  1475                                 silentLineFeed();
  1476                             default:
  1477                                 /*
  1478                                  * Anything else Emit the input character as a
  1479                                  * character token.
  1481                                  * Stay in the data state.
  1482                                  */
  1483                                 continue;
  1486                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  1487                 case TAG_OPEN:
  1488                     tagopenloop: for (;;) {
  1489                         /*
  1490                          * The behavior of this state depends on the content
  1491                          * model flag.
  1492                          */
  1493                         if (++pos == endPos) {
  1494                             break stateloop;
  1496                         c = checkChar(buf, pos);
  1497                         /*
  1498                          * If the content model flag is set to the PCDATA state
  1499                          * Consume the next input character:
  1500                          */
  1501                         if (c >= 'A' && c <= 'Z') {
  1502                             /*
  1503                              * U+0041 LATIN CAPITAL LETTER A through to U+005A
  1504                              * LATIN CAPITAL LETTER Z Create a new start tag
  1505                              * token,
  1506                              */
  1507                             endTag = false;
  1508                             /*
  1509                              * set its tag name to the lowercase version of the
  1510                              * input character (add 0x0020 to the character's
  1511                              * code point),
  1512                              */
  1513                             clearStrBufAndAppend((char) (c + 0x20));
  1514                             /* then switch to the tag name state. */
  1515                             state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
  1516                             /*
  1517                              * (Don't emit the token yet; further details will
  1518                              * be filled in before it is emitted.)
  1519                              */
  1520                             break tagopenloop;
  1521                             // continue stateloop;
  1522                         } else if (c >= 'a' && c <= 'z') {
  1523                             /*
  1524                              * U+0061 LATIN SMALL LETTER A through to U+007A
  1525                              * LATIN SMALL LETTER Z Create a new start tag
  1526                              * token,
  1527                              */
  1528                             endTag = false;
  1529                             /*
  1530                              * set its tag name to the input character,
  1531                              */
  1532                             clearStrBufAndAppend(c);
  1533                             /* then switch to the tag name state. */
  1534                             state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
  1535                             /*
  1536                              * (Don't emit the token yet; further details will
  1537                              * be filled in before it is emitted.)
  1538                              */
  1539                             break tagopenloop;
  1540                             // continue stateloop;
  1542                         switch (c) {
  1543                             case '!':
  1544                                 /*
  1545                                  * U+0021 EXCLAMATION MARK (!) Switch to the
  1546                                  * markup declaration open state.
  1547                                  */
  1548                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
  1549                                 continue stateloop;
  1550                             case '/':
  1551                                 /*
  1552                                  * U+002F SOLIDUS (/) Switch to the close tag
  1553                                  * open state.
  1554                                  */
  1555                                 state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
  1556                                 continue stateloop;
  1557                             case '?':
  1558                                 // CPPONLY: if (viewingXmlSource) {
  1559                                 // CPPONLY: state = transition(state,
  1560                                 // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION,
  1561                                 // CPPONLY: reconsume,
  1562                                 // CPPONLY: pos);
  1563                                 // CPPONLY: continue stateloop;
  1564                                 // CPPONLY: }
  1565                                 /*
  1566                                  * U+003F QUESTION MARK (?) Parse error.
  1567                                  */
  1568                                 errProcessingInstruction();
  1569                                 /*
  1570                                  * Switch to the bogus comment state.
  1571                                  */
  1572                                 clearLongStrBufAndAppend(c);
  1573                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  1574                                 continue stateloop;
  1575                             case '>':
  1576                                 /*
  1577                                  * U+003E GREATER-THAN SIGN (>) Parse error.
  1578                                  */
  1579                                 errLtGt();
  1580                                 /*
  1581                                  * Emit a U+003C LESS-THAN SIGN character token
  1582                                  * and a U+003E GREATER-THAN SIGN character
  1583                                  * token.
  1584                                  */
  1585                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 2);
  1586                                 /* Switch to the data state. */
  1587                                 cstart = pos + 1;
  1588                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  1589                                 continue stateloop;
  1590                             default:
  1591                                 /*
  1592                                  * Anything else Parse error.
  1593                                  */
  1594                                 errBadCharAfterLt(c);
  1595                                 /*
  1596                                  * Emit a U+003C LESS-THAN SIGN character token
  1597                                  */
  1598                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  1599                                 /*
  1600                                  * and reconsume the current input character in
  1601                                  * the data state.
  1602                                  */
  1603                                 cstart = pos;
  1604                                 reconsume = true;
  1605                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  1606                                 continue stateloop;
  1609                     // FALL THROUGH DON'T REORDER
  1610                 case TAG_NAME:
  1611                     tagnameloop: for (;;) {
  1612                         if (++pos == endPos) {
  1613                             break stateloop;
  1615                         c = checkChar(buf, pos);
  1616                         /*
  1617                          * Consume the next input character:
  1618                          */
  1619                         switch (c) {
  1620                             case '\r':
  1621                                 silentCarriageReturn();
  1622                                 strBufToElementNameString();
  1623                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  1624                                 break stateloop;
  1625                             case '\n':
  1626                                 silentLineFeed();
  1627                             case ' ':
  1628                             case '\t':
  1629                             case '\u000C':
  1630                                 /*
  1631                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  1632                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  1633                                  * Switch to the before attribute name state.
  1634                                  */
  1635                                 strBufToElementNameString();
  1636                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  1637                                 break tagnameloop;
  1638                             // continue stateloop;
  1639                             case '/':
  1640                                 /*
  1641                                  * U+002F SOLIDUS (/) Switch to the self-closing
  1642                                  * start tag state.
  1643                                  */
  1644                                 strBufToElementNameString();
  1645                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
  1646                                 continue stateloop;
  1647                             case '>':
  1648                                 /*
  1649                                  * U+003E GREATER-THAN SIGN (>) Emit the current
  1650                                  * tag token.
  1651                                  */
  1652                                 strBufToElementNameString();
  1653                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  1654                                 if (shouldSuspend) {
  1655                                     break stateloop;
  1657                                 /*
  1658                                  * Switch to the data state.
  1659                                  */
  1660                                 continue stateloop;
  1661                             case '\u0000':
  1662                                 c = '\uFFFD';
  1663                                 // fall thru
  1664                             default:
  1665                                 if (c >= 'A' && c <= 'Z') {
  1666                                     /*
  1667                                      * U+0041 LATIN CAPITAL LETTER A through to
  1668                                      * U+005A LATIN CAPITAL LETTER Z Append the
  1669                                      * lowercase version of the current input
  1670                                      * character (add 0x0020 to the character's
  1671                                      * code point) to the current tag token's
  1672                                      * tag name.
  1673                                      */
  1674                                     c += 0x20;
  1676                                 /*
  1677                                  * Anything else Append the current input
  1678                                  * character to the current tag token's tag
  1679                                  * name.
  1680                                  */
  1681                                 appendStrBuf(c);
  1682                                 /*
  1683                                  * Stay in the tag name state.
  1684                                  */
  1685                                 continue;
  1688                     // FALLTHRU DON'T REORDER
  1689                 case BEFORE_ATTRIBUTE_NAME:
  1690                     beforeattributenameloop: for (;;) {
  1691                         if (reconsume) {
  1692                             reconsume = false;
  1693                         } else {
  1694                             if (++pos == endPos) {
  1695                                 break stateloop;
  1697                             c = checkChar(buf, pos);
  1699                         /*
  1700                          * Consume the next input character:
  1701                          */
  1702                         switch (c) {
  1703                             case '\r':
  1704                                 silentCarriageReturn();
  1705                                 break stateloop;
  1706                             case '\n':
  1707                                 silentLineFeed();
  1708                                 // fall thru
  1709                             case ' ':
  1710                             case '\t':
  1711                             case '\u000C':
  1712                                 /*
  1713                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  1714                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  1715                                  * in the before attribute name state.
  1716                                  */
  1717                                 continue;
  1718                             case '/':
  1719                                 /*
  1720                                  * U+002F SOLIDUS (/) Switch to the self-closing
  1721                                  * start tag state.
  1722                                  */
  1723                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
  1724                                 continue stateloop;
  1725                             case '>':
  1726                                 /*
  1727                                  * U+003E GREATER-THAN SIGN (>) Emit the current
  1728                                  * tag token.
  1729                                  */
  1730                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  1731                                 if (shouldSuspend) {
  1732                                     break stateloop;
  1734                                 /*
  1735                                  * Switch to the data state.
  1736                                  */
  1737                                 continue stateloop;
  1738                             case '\u0000':
  1739                                 c = '\uFFFD';
  1740                                 // fall thru
  1741                             case '\"':
  1742                             case '\'':
  1743                             case '<':
  1744                             case '=':
  1745                                 /*
  1746                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
  1747                                  * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
  1748                                  * SIGN (=) Parse error.
  1749                                  */
  1750                                 errBadCharBeforeAttributeNameOrNull(c);
  1751                                 /*
  1752                                  * Treat it as per the "anything else" entry
  1753                                  * below.
  1754                                  */
  1755                             default:
  1756                                 /*
  1757                                  * Anything else Start a new attribute in the
  1758                                  * current tag token.
  1759                                  */
  1760                                 if (c >= 'A' && c <= 'Z') {
  1761                                     /*
  1762                                      * U+0041 LATIN CAPITAL LETTER A through to
  1763                                      * U+005A LATIN CAPITAL LETTER Z Set that
  1764                                      * attribute's name to the lowercase version
  1765                                      * of the current input character (add
  1766                                      * 0x0020 to the character's code point)
  1767                                      */
  1768                                     c += 0x20;
  1770                                 /*
  1771                                  * Set that attribute's name to the current
  1772                                  * input character,
  1773                                  */
  1774                                 clearStrBufAndAppend(c);
  1775                                 /*
  1776                                  * and its value to the empty string.
  1777                                  */
  1778                                 // Will do later.
  1779                                 /*
  1780                                  * Switch to the attribute name state.
  1781                                  */
  1782                                 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
  1783                                 break beforeattributenameloop;
  1784                             // continue stateloop;
  1787                     // FALLTHRU DON'T REORDER
  1788                 case ATTRIBUTE_NAME:
  1789                     attributenameloop: for (;;) {
  1790                         if (++pos == endPos) {
  1791                             break stateloop;
  1793                         c = checkChar(buf, pos);
  1794                         /*
  1795                          * Consume the next input character:
  1796                          */
  1797                         switch (c) {
  1798                             case '\r':
  1799                                 silentCarriageReturn();
  1800                                 attributeNameComplete();
  1801                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
  1802                                 break stateloop;
  1803                             case '\n':
  1804                                 silentLineFeed();
  1805                                 // fall thru
  1806                             case ' ':
  1807                             case '\t':
  1808                             case '\u000C':
  1809                                 /*
  1810                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  1811                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  1812                                  * Switch to the after attribute name state.
  1813                                  */
  1814                                 attributeNameComplete();
  1815                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
  1816                                 continue stateloop;
  1817                             case '/':
  1818                                 /*
  1819                                  * U+002F SOLIDUS (/) Switch to the self-closing
  1820                                  * start tag state.
  1821                                  */
  1822                                 attributeNameComplete();
  1823                                 addAttributeWithoutValue();
  1824                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
  1825                                 continue stateloop;
  1826                             case '=':
  1827                                 /*
  1828                                  * U+003D EQUALS SIGN (=) Switch to the before
  1829                                  * attribute value state.
  1830                                  */
  1831                                 attributeNameComplete();
  1832                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
  1833                                 break attributenameloop;
  1834                             // continue stateloop;
  1835                             case '>':
  1836                                 /*
  1837                                  * U+003E GREATER-THAN SIGN (>) Emit the current
  1838                                  * tag token.
  1839                                  */
  1840                                 attributeNameComplete();
  1841                                 addAttributeWithoutValue();
  1842                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  1843                                 if (shouldSuspend) {
  1844                                     break stateloop;
  1846                                 /*
  1847                                  * Switch to the data state.
  1848                                  */
  1849                                 continue stateloop;
  1850                             case '\u0000':
  1851                                 c = '\uFFFD';
  1852                                 // fall thru
  1853                             case '\"':
  1854                             case '\'':
  1855                             case '<':
  1856                                 /*
  1857                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
  1858                                  * (') U+003C LESS-THAN SIGN (<) Parse error.
  1859                                  */
  1860                                 errQuoteOrLtInAttributeNameOrNull(c);
  1861                                 /*
  1862                                  * Treat it as per the "anything else" entry
  1863                                  * below.
  1864                                  */
  1865                             default:
  1866                                 if (c >= 'A' && c <= 'Z') {
  1867                                     /*
  1868                                      * U+0041 LATIN CAPITAL LETTER A through to
  1869                                      * U+005A LATIN CAPITAL LETTER Z Append the
  1870                                      * lowercase version of the current input
  1871                                      * character (add 0x0020 to the character's
  1872                                      * code point) to the current attribute's
  1873                                      * name.
  1874                                      */
  1875                                     c += 0x20;
  1877                                 /*
  1878                                  * Anything else Append the current input
  1879                                  * character to the current attribute's name.
  1880                                  */
  1881                                 appendStrBuf(c);
  1882                                 /*
  1883                                  * Stay in the attribute name state.
  1884                                  */
  1885                                 continue;
  1888                     // FALLTHRU DON'T REORDER
  1889                 case BEFORE_ATTRIBUTE_VALUE:
  1890                     beforeattributevalueloop: for (;;) {
  1891                         if (++pos == endPos) {
  1892                             break stateloop;
  1894                         c = checkChar(buf, pos);
  1895                         /*
  1896                          * Consume the next input character:
  1897                          */
  1898                         switch (c) {
  1899                             case '\r':
  1900                                 silentCarriageReturn();
  1901                                 break stateloop;
  1902                             case '\n':
  1903                                 silentLineFeed();
  1904                                 // fall thru
  1905                             case ' ':
  1906                             case '\t':
  1907                             case '\u000C':
  1908                                 /*
  1909                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  1910                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  1911                                  * in the before attribute value state.
  1912                                  */
  1913                                 continue;
  1914                             case '"':
  1915                                 /*
  1916                                  * U+0022 QUOTATION MARK (") Switch to the
  1917                                  * attribute value (double-quoted) state.
  1918                                  */
  1919                                 clearLongStrBuf();
  1920                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
  1921                                 break beforeattributevalueloop;
  1922                             // continue stateloop;
  1923                             case '&':
  1924                                 /*
  1925                                  * U+0026 AMPERSAND (&) Switch to the attribute
  1926                                  * value (unquoted) state and reconsume this
  1927                                  * input character.
  1928                                  */
  1929                                 clearLongStrBuf();
  1930                                 reconsume = true;
  1931                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
  1932                                 noteUnquotedAttributeValue();
  1933                                 continue stateloop;
  1934                             case '\'':
  1935                                 /*
  1936                                  * U+0027 APOSTROPHE (') Switch to the attribute
  1937                                  * value (single-quoted) state.
  1938                                  */
  1939                                 clearLongStrBuf();
  1940                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
  1941                                 continue stateloop;
  1942                             case '>':
  1943                                 /*
  1944                                  * U+003E GREATER-THAN SIGN (>) Parse error.
  1945                                  */
  1946                                 errAttributeValueMissing();
  1947                                 /*
  1948                                  * Emit the current tag token.
  1949                                  */
  1950                                 addAttributeWithoutValue();
  1951                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  1952                                 if (shouldSuspend) {
  1953                                     break stateloop;
  1955                                 /*
  1956                                  * Switch to the data state.
  1957                                  */
  1958                                 continue stateloop;
  1959                             case '\u0000':
  1960                                 c = '\uFFFD';
  1961                                 // fall thru
  1962                             case '<':
  1963                             case '=':
  1964                             case '`':
  1965                                 /*
  1966                                  * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
  1967                                  * (=) U+0060 GRAVE ACCENT (`) Parse error.
  1968                                  */
  1969                                 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
  1970                                 /*
  1971                                  * Treat it as per the "anything else" entry
  1972                                  * below.
  1973                                  */
  1974                             default:
  1975                                 // [NOCPP[
  1976                                 errHtml4NonNameInUnquotedAttribute(c);
  1977                                 // ]NOCPP]
  1978                                 /*
  1979                                  * Anything else Append the current input
  1980                                  * character to the current attribute's value.
  1981                                  */
  1982                                 clearLongStrBufAndAppend(c);
  1983                                 /*
  1984                                  * Switch to the attribute value (unquoted)
  1985                                  * state.
  1986                                  */
  1988                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
  1989                                 noteUnquotedAttributeValue();
  1990                                 continue stateloop;
  1993                     // FALLTHRU DON'T REORDER
  1994                 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
  1995                     attributevaluedoublequotedloop: for (;;) {
  1996                         if (reconsume) {
  1997                             reconsume = false;
  1998                         } else {
  1999                             if (++pos == endPos) {
  2000                                 break stateloop;
  2002                             c = checkChar(buf, pos);
  2004                         /*
  2005                          * Consume the next input character:
  2006                          */
  2007                         switch (c) {
  2008                             case '"':
  2009                                 /*
  2010                                  * U+0022 QUOTATION MARK (") Switch to the after
  2011                                  * attribute value (quoted) state.
  2012                                  */
  2013                                 addAttributeWithValue();
  2015                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
  2016                                 break attributevaluedoublequotedloop;
  2017                             // continue stateloop;
  2018                             case '&':
  2019                                 /*
  2020                                  * U+0026 AMPERSAND (&) Switch to the character
  2021                                  * reference in attribute value state, with the
  2022                                  * additional allowed character being U+0022
  2023                                  * QUOTATION MARK (").
  2024                                  */
  2025                                 clearStrBufAndAppend(c);
  2026                                 setAdditionalAndRememberAmpersandLocation('\"');
  2027                                 returnState = state;
  2028                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
  2029                                 continue stateloop;
  2030                             case '\r':
  2031                                 appendLongStrBufCarriageReturn();
  2032                                 break stateloop;
  2033                             case '\n':
  2034                                 appendLongStrBufLineFeed();
  2035                                 continue;
  2036                             case '\u0000':
  2037                                 c = '\uFFFD';
  2038                                 // fall thru
  2039                             default:
  2040                                 /*
  2041                                  * Anything else Append the current input
  2042                                  * character to the current attribute's value.
  2043                                  */
  2044                                 appendLongStrBuf(c);
  2045                                 /*
  2046                                  * Stay in the attribute value (double-quoted)
  2047                                  * state.
  2048                                  */
  2049                                 continue;
  2052                     // FALLTHRU DON'T REORDER
  2053                 case AFTER_ATTRIBUTE_VALUE_QUOTED:
  2054                     afterattributevaluequotedloop: for (;;) {
  2055                         if (++pos == endPos) {
  2056                             break stateloop;
  2058                         c = checkChar(buf, pos);
  2059                         /*
  2060                          * Consume the next input character:
  2061                          */
  2062                         switch (c) {
  2063                             case '\r':
  2064                                 silentCarriageReturn();
  2065                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  2066                                 break stateloop;
  2067                             case '\n':
  2068                                 silentLineFeed();
  2069                                 // fall thru
  2070                             case ' ':
  2071                             case '\t':
  2072                             case '\u000C':
  2073                                 /*
  2074                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  2075                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  2076                                  * Switch to the before attribute name state.
  2077                                  */
  2078                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  2079                                 continue stateloop;
  2080                             case '/':
  2081                                 /*
  2082                                  * U+002F SOLIDUS (/) Switch to the self-closing
  2083                                  * start tag state.
  2084                                  */
  2085                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
  2086                                 break afterattributevaluequotedloop;
  2087                             // continue stateloop;
  2088                             case '>':
  2089                                 /*
  2090                                  * U+003E GREATER-THAN SIGN (>) Emit the current
  2091                                  * tag token.
  2092                                  */
  2093                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  2094                                 if (shouldSuspend) {
  2095                                     break stateloop;
  2097                                 /*
  2098                                  * Switch to the data state.
  2099                                  */
  2100                                 continue stateloop;
  2101                             default:
  2102                                 /*
  2103                                  * Anything else Parse error.
  2104                                  */
  2105                                 errNoSpaceBetweenAttributes();
  2106                                 /*
  2107                                  * Reconsume the character in the before
  2108                                  * attribute name state.
  2109                                  */
  2110                                 reconsume = true;
  2111                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  2112                                 continue stateloop;
  2115                     // FALLTHRU DON'T REORDER
  2116                 case SELF_CLOSING_START_TAG:
  2117                     if (++pos == endPos) {
  2118                         break stateloop;
  2120                     c = checkChar(buf, pos);
  2121                     /*
  2122                      * Consume the next input character:
  2123                      */
  2124                     switch (c) {
  2125                         case '>':
  2126                             /*
  2127                              * U+003E GREATER-THAN SIGN (>) Set the self-closing
  2128                              * flag of the current tag token. Emit the current
  2129                              * tag token.
  2130                              */
  2131                             // [NOCPP[
  2132                             errHtml4XmlVoidSyntax();
  2133                             // ]NOCPP]
  2134                             state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);
  2135                             if (shouldSuspend) {
  2136                                 break stateloop;
  2138                             /*
  2139                              * Switch to the data state.
  2140                              */
  2141                             continue stateloop;
  2142                         default:
  2143                             /* Anything else Parse error. */
  2144                             errSlashNotFollowedByGt();
  2145                             /*
  2146                              * Reconsume the character in the before attribute
  2147                              * name state.
  2148                              */
  2149                             reconsume = true;
  2150                             state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  2151                             continue stateloop;
  2153                     // XXX reorder point
  2154                 case ATTRIBUTE_VALUE_UNQUOTED:
  2155                     for (;;) {
  2156                         if (reconsume) {
  2157                             reconsume = false;
  2158                         } else {
  2159                             if (++pos == endPos) {
  2160                                 break stateloop;
  2162                             c = checkChar(buf, pos);
  2164                         /*
  2165                          * Consume the next input character:
  2166                          */
  2167                         switch (c) {
  2168                             case '\r':
  2169                                 silentCarriageReturn();
  2170                                 addAttributeWithValue();
  2171                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  2172                                 break stateloop;
  2173                             case '\n':
  2174                                 silentLineFeed();
  2175                                 // fall thru
  2176                             case ' ':
  2177                             case '\t':
  2178                             case '\u000C':
  2179                                 /*
  2180                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  2181                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  2182                                  * Switch to the before attribute name state.
  2183                                  */
  2184                                 addAttributeWithValue();
  2185                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  2186                                 continue stateloop;
  2187                             case '&':
  2188                                 /*
  2189                                  * U+0026 AMPERSAND (&) Switch to the character
  2190                                  * reference in attribute value state, with the
  2191                                  * additional allowed character being U+003E
  2192                                  * GREATER-THAN SIGN (>)
  2193                                  */
  2194                                 clearStrBufAndAppend(c);
  2195                                 setAdditionalAndRememberAmpersandLocation('>');
  2196                                 returnState = state;
  2197                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
  2198                                 continue stateloop;
  2199                             case '>':
  2200                                 /*
  2201                                  * U+003E GREATER-THAN SIGN (>) Emit the current
  2202                                  * tag token.
  2203                                  */
  2204                                 addAttributeWithValue();
  2205                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  2206                                 if (shouldSuspend) {
  2207                                     break stateloop;
  2209                                 /*
  2210                                  * Switch to the data state.
  2211                                  */
  2212                                 continue stateloop;
  2213                             case '\u0000':
  2214                                 c = '\uFFFD';
  2215                                 // fall thru
  2216                             case '<':
  2217                             case '\"':
  2218                             case '\'':
  2219                             case '=':
  2220                             case '`':
  2221                                 /*
  2222                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
  2223                                  * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
  2224                                  * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
  2225                                  */
  2226                                 errUnquotedAttributeValOrNull(c);
  2227                                 /*
  2228                                  * Treat it as per the "anything else" entry
  2229                                  * below.
  2230                                  */
  2231                                 // fall through
  2232                             default:
  2233                                 // [NOCPP]
  2234                                 errHtml4NonNameInUnquotedAttribute(c);
  2235                                 // ]NOCPP]
  2236                                 /*
  2237                                  * Anything else Append the current input
  2238                                  * character to the current attribute's value.
  2239                                  */
  2240                                 appendLongStrBuf(c);
  2241                                 /*
  2242                                  * Stay in the attribute value (unquoted) state.
  2243                                  */
  2244                                 continue;
  2247                     // XXX reorder point
  2248                 case AFTER_ATTRIBUTE_NAME:
  2249                     for (;;) {
  2250                         if (++pos == endPos) {
  2251                             break stateloop;
  2253                         c = checkChar(buf, pos);
  2254                         /*
  2255                          * Consume the next input character:
  2256                          */
  2257                         switch (c) {
  2258                             case '\r':
  2259                                 silentCarriageReturn();
  2260                                 break stateloop;
  2261                             case '\n':
  2262                                 silentLineFeed();
  2263                                 // fall thru
  2264                             case ' ':
  2265                             case '\t':
  2266                             case '\u000C':
  2267                                 /*
  2268                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  2269                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  2270                                  * in the after attribute name state.
  2271                                  */
  2272                                 continue;
  2273                             case '/':
  2274                                 /*
  2275                                  * U+002F SOLIDUS (/) Switch to the self-closing
  2276                                  * start tag state.
  2277                                  */
  2278                                 addAttributeWithoutValue();
  2279                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
  2280                                 continue stateloop;
  2281                             case '=':
  2282                                 /*
  2283                                  * U+003D EQUALS SIGN (=) Switch to the before
  2284                                  * attribute value state.
  2285                                  */
  2286                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
  2287                                 continue stateloop;
  2288                             case '>':
  2289                                 /*
  2290                                  * U+003E GREATER-THAN SIGN (>) Emit the current
  2291                                  * tag token.
  2292                                  */
  2293                                 addAttributeWithoutValue();
  2294                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  2295                                 if (shouldSuspend) {
  2296                                     break stateloop;
  2298                                 /*
  2299                                  * Switch to the data state.
  2300                                  */
  2301                                 continue stateloop;
  2302                             case '\u0000':
  2303                                 c = '\uFFFD';
  2304                                 // fall thru
  2305                             case '\"':
  2306                             case '\'':
  2307                             case '<':
  2308                                 errQuoteOrLtInAttributeNameOrNull(c);
  2309                                 /*
  2310                                  * Treat it as per the "anything else" entry
  2311                                  * below.
  2312                                  */
  2313                             default:
  2314                                 addAttributeWithoutValue();
  2315                                 /*
  2316                                  * Anything else Start a new attribute in the
  2317                                  * current tag token.
  2318                                  */
  2319                                 if (c >= 'A' && c <= 'Z') {
  2320                                     /*
  2321                                      * U+0041 LATIN CAPITAL LETTER A through to
  2322                                      * U+005A LATIN CAPITAL LETTER Z Set that
  2323                                      * attribute's name to the lowercase version
  2324                                      * of the current input character (add
  2325                                      * 0x0020 to the character's code point)
  2326                                      */
  2327                                     c += 0x20;
  2329                                 /*
  2330                                  * Set that attribute's name to the current
  2331                                  * input character,
  2332                                  */
  2333                                 clearStrBufAndAppend(c);
  2334                                 /*
  2335                                  * and its value to the empty string.
  2336                                  */
  2337                                 // Will do later.
  2338                                 /*
  2339                                  * Switch to the attribute name state.
  2340                                  */
  2341                                 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
  2342                                 continue stateloop;
  2345                     // XXX reorder point
  2346                 case MARKUP_DECLARATION_OPEN:
  2347                     markupdeclarationopenloop: for (;;) {
  2348                         if (++pos == endPos) {
  2349                             break stateloop;
  2351                         c = checkChar(buf, pos);
  2352                         /*
  2353                          * If the next two characters are both U+002D
  2354                          * HYPHEN-MINUS characters (-), consume those two
  2355                          * characters, create a comment token whose data is the
  2356                          * empty string, and switch to the comment start state.
  2358                          * Otherwise, if the next seven characters are an ASCII
  2359                          * case-insensitive match for the word "DOCTYPE", then
  2360                          * consume those characters and switch to the DOCTYPE
  2361                          * state.
  2363                          * Otherwise, if the insertion mode is
  2364                          * "in foreign content" and the current node is not an
  2365                          * element in the HTML namespace and the next seven
  2366                          * characters are a case-sensitive match for the string
  2367                          * "[CDATA[" (the five uppercase letters "CDATA" with a
  2368                          * U+005B LEFT SQUARE BRACKET character before and
  2369                          * after), then consume those characters and switch to
  2370                          * the CDATA section state.
  2372                          * Otherwise, it is a parse error. Switch to the bogus
  2373                          * comment state. The next character that is consumed,
  2374                          * if any, is the first character that will be in the
  2375                          * comment.
  2376                          */
  2377                         switch (c) {
  2378                             case '-':
  2379                                 clearLongStrBufAndAppend(c);
  2380                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
  2381                                 break markupdeclarationopenloop;
  2382                             // continue stateloop;
  2383                             case 'd':
  2384                             case 'D':
  2385                                 clearLongStrBufAndAppend(c);
  2386                                 index = 0;
  2387                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
  2388                                 continue stateloop;
  2389                             case '[':
  2390                                 if (tokenHandler.cdataSectionAllowed()) {
  2391                                     clearLongStrBufAndAppend(c);
  2392                                     index = 0;
  2393                                     state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
  2394                                     continue stateloop;
  2396                                 // else fall through
  2397                             default:
  2398                                 errBogusComment();
  2399                                 clearLongStrBuf();
  2400                                 reconsume = true;
  2401                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  2402                                 continue stateloop;
  2405                     // FALLTHRU DON'T REORDER
  2406                 case MARKUP_DECLARATION_HYPHEN:
  2407                     markupdeclarationhyphenloop: for (;;) {
  2408                         if (++pos == endPos) {
  2409                             break stateloop;
  2411                         c = checkChar(buf, pos);
  2412                         switch (c) {
  2413                             case '\u0000':
  2414                                 break stateloop;
  2415                             case '-':
  2416                                 clearLongStrBuf();
  2417                                 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
  2418                                 break markupdeclarationhyphenloop;
  2419                             // continue stateloop;
  2420                             default:
  2421                                 errBogusComment();
  2422                                 reconsume = true;
  2423                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  2424                                 continue stateloop;
  2427                     // FALLTHRU DON'T REORDER
  2428                 case COMMENT_START:
  2429                     commentstartloop: for (;;) {
  2430                         if (++pos == endPos) {
  2431                             break stateloop;
  2433                         c = checkChar(buf, pos);
  2434                         /*
  2435                          * Comment start state
  2438                          * Consume the next input character:
  2439                          */
  2440                         switch (c) {
  2441                             case '-':
  2442                                 /*
  2443                                  * U+002D HYPHEN-MINUS (-) Switch to the comment
  2444                                  * start dash state.
  2445                                  */
  2446                                 appendLongStrBuf(c);
  2447                                 state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);
  2448                                 continue stateloop;
  2449                             case '>':
  2450                                 /*
  2451                                  * U+003E GREATER-THAN SIGN (>) Parse error.
  2452                                  */
  2453                                 errPrematureEndOfComment();
  2454                                 /* Emit the comment token. */
  2455                                 emitComment(0, pos);
  2456                                 /*
  2457                                  * Switch to the data state.
  2458                                  */
  2459                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  2460                                 continue stateloop;
  2461                             case '\r':
  2462                                 appendLongStrBufCarriageReturn();
  2463                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2464                                 break stateloop;
  2465                             case '\n':
  2466                                 appendLongStrBufLineFeed();
  2467                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2468                                 break commentstartloop;
  2469                             case '\u0000':
  2470                                 c = '\uFFFD';
  2471                                 // fall thru
  2472                             default:
  2473                                 /*
  2474                                  * Anything else Append the input character to
  2475                                  * the comment token's data.
  2476                                  */
  2477                                 appendLongStrBuf(c);
  2478                                 /*
  2479                                  * Switch to the comment state.
  2480                                  */
  2481                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2482                                 break commentstartloop;
  2483                             // continue stateloop;
  2486                     // FALLTHRU DON'T REORDER
  2487                 case COMMENT:
  2488                     commentloop: for (;;) {
  2489                         if (++pos == endPos) {
  2490                             break stateloop;
  2492                         c = checkChar(buf, pos);
  2493                         /*
  2494                          * Comment state Consume the next input character:
  2495                          */
  2496                         switch (c) {
  2497                             case '-':
  2498                                 /*
  2499                                  * U+002D HYPHEN-MINUS (-) Switch to the comment
  2500                                  * end dash state
  2501                                  */
  2502                                 appendLongStrBuf(c);
  2503                                 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
  2504                                 break commentloop;
  2505                             // continue stateloop;
  2506                             case '\r':
  2507                                 appendLongStrBufCarriageReturn();
  2508                                 break stateloop;
  2509                             case '\n':
  2510                                 appendLongStrBufLineFeed();
  2511                                 continue;
  2512                             case '\u0000':
  2513                                 c = '\uFFFD';
  2514                                 // fall thru
  2515                             default:
  2516                                 /*
  2517                                  * Anything else Append the input character to
  2518                                  * the comment token's data.
  2519                                  */
  2520                                 appendLongStrBuf(c);
  2521                                 /*
  2522                                  * Stay in the comment state.
  2523                                  */
  2524                                 continue;
  2527                     // FALLTHRU DON'T REORDER
  2528                 case COMMENT_END_DASH:
  2529                     commentenddashloop: for (;;) {
  2530                         if (++pos == endPos) {
  2531                             break stateloop;
  2533                         c = checkChar(buf, pos);
  2534                         /*
  2535                          * Comment end dash state Consume the next input
  2536                          * character:
  2537                          */
  2538                         switch (c) {
  2539                             case '-':
  2540                                 /*
  2541                                  * U+002D HYPHEN-MINUS (-) Switch to the comment
  2542                                  * end state
  2543                                  */
  2544                                 appendLongStrBuf(c);
  2545                                 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
  2546                                 break commentenddashloop;
  2547                             // continue stateloop;
  2548                             case '\r':
  2549                                 appendLongStrBufCarriageReturn();
  2550                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2551                                 break stateloop;
  2552                             case '\n':
  2553                                 appendLongStrBufLineFeed();
  2554                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2555                                 continue stateloop;
  2556                             case '\u0000':
  2557                                 c = '\uFFFD';
  2558                                 // fall thru
  2559                             default:
  2560                                 /*
  2561                                  * Anything else Append a U+002D HYPHEN-MINUS
  2562                                  * (-) character and the input character to the
  2563                                  * comment token's data.
  2564                                  */
  2565                                 appendLongStrBuf(c);
  2566                                 /*
  2567                                  * Switch to the comment state.
  2568                                  */
  2569                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2570                                 continue stateloop;
  2573                     // FALLTHRU DON'T REORDER
  2574                 case COMMENT_END:
  2575                     commentendloop: for (;;) {
  2576                         if (++pos == endPos) {
  2577                             break stateloop;
  2579                         c = checkChar(buf, pos);
  2580                         /*
  2581                          * Comment end state Consume the next input
  2582                          * character:
  2583                          */
  2584                         switch (c) {
  2585                             case '>':
  2586                                 /*
  2587                                  * U+003E GREATER-THAN SIGN (>) Emit the comment
  2588                                  * token.
  2589                                  */
  2590                                 emitComment(2, pos);
  2591                                 /*
  2592                                  * Switch to the data state.
  2593                                  */
  2594                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  2595                                 continue stateloop;
  2596                             case '-':
  2597                                 /* U+002D HYPHEN-MINUS (-) Parse error. */
  2598                                 /*
  2599                                  * Append a U+002D HYPHEN-MINUS (-) character to
  2600                                  * the comment token's data.
  2601                                  */
  2602                                 adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
  2603                                 /*
  2604                                  * Stay in the comment end state.
  2605                                  */
  2606                                 continue;
  2607                             case '\r':
  2608                                 adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn();
  2609                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2610                                 break stateloop;
  2611                             case '\n':
  2612                                 adjustDoubleHyphenAndAppendToLongStrBufLineFeed();
  2613                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2614                                 continue stateloop;
  2615                             case '!':
  2616                                 errHyphenHyphenBang();
  2617                                 appendLongStrBuf(c);
  2618                                 state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
  2619                                 continue stateloop;
  2620                             case '\u0000':
  2621                                 c = '\uFFFD';
  2622                                 // fall thru
  2623                             default:
  2624                                 /*
  2625                                  * Append two U+002D HYPHEN-MINUS (-) characters
  2626                                  * and the input character to the comment
  2627                                  * token's data.
  2628                                  */
  2629                                 adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
  2630                                 /*
  2631                                  * Switch to the comment state.
  2632                                  */
  2633                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2634                                 continue stateloop;
  2637                     // XXX reorder point
  2638                 case COMMENT_END_BANG:
  2639                     for (;;) {
  2640                         if (++pos == endPos) {
  2641                             break stateloop;
  2643                         c = checkChar(buf, pos);
  2644                         /*
  2645                          * Comment end bang state
  2647                          * Consume the next input character:
  2648                          */
  2649                         switch (c) {
  2650                             case '>':
  2651                                 /*
  2652                                  * U+003E GREATER-THAN SIGN (>) Emit the comment
  2653                                  * token.
  2654                                  */
  2655                                 emitComment(3, pos);
  2656                                 /*
  2657                                  * Switch to the data state.
  2658                                  */
  2659                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  2660                                 continue stateloop;
  2661                             case '-':
  2662                                 /*
  2663                                  * Append two U+002D HYPHEN-MINUS (-) characters
  2664                                  * and a U+0021 EXCLAMATION MARK (!) character
  2665                                  * to the comment token's data.
  2666                                  */
  2667                                 appendLongStrBuf(c);
  2668                                 /*
  2669                                  * Switch to the comment end dash state.
  2670                                  */
  2671                                 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
  2672                                 continue stateloop;
  2673                             case '\r':
  2674                                 appendLongStrBufCarriageReturn();
  2675                                 break stateloop;
  2676                             case '\n':
  2677                                 appendLongStrBufLineFeed();
  2678                                 continue;
  2679                             case '\u0000':
  2680                                 c = '\uFFFD';
  2681                                 // fall thru
  2682                             default:
  2683                                 /*
  2684                                  * Anything else Append two U+002D HYPHEN-MINUS
  2685                                  * (-) characters, a U+0021 EXCLAMATION MARK (!)
  2686                                  * character, and the input character to the
  2687                                  * comment token's data. Switch to the comment
  2688                                  * state.
  2689                                  */
  2690                                 appendLongStrBuf(c);
  2691                                 /*
  2692                                  * Switch to the comment state.
  2693                                  */
  2694                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2695                                 continue stateloop;
  2698                     // XXX reorder point
  2699                 case COMMENT_START_DASH:
  2700                     if (++pos == endPos) {
  2701                         break stateloop;
  2703                     c = checkChar(buf, pos);
  2704                     /*
  2705                      * Comment start dash state
  2707                      * Consume the next input character:
  2708                      */
  2709                     switch (c) {
  2710                         case '-':
  2711                             /*
  2712                              * U+002D HYPHEN-MINUS (-) Switch to the comment end
  2713                              * state
  2714                              */
  2715                             appendLongStrBuf(c);
  2716                             state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
  2717                             continue stateloop;
  2718                         case '>':
  2719                             errPrematureEndOfComment();
  2720                             /* Emit the comment token. */
  2721                             emitComment(1, pos);
  2722                             /*
  2723                              * Switch to the data state.
  2724                              */
  2725                             state = transition(state, Tokenizer.DATA, reconsume, pos);
  2726                             continue stateloop;
  2727                         case '\r':
  2728                             appendLongStrBufCarriageReturn();
  2729                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2730                             break stateloop;
  2731                         case '\n':
  2732                             appendLongStrBufLineFeed();
  2733                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2734                             continue stateloop;
  2735                         case '\u0000':
  2736                             c = '\uFFFD';
  2737                             // fall thru
  2738                         default:
  2739                             /*
  2740                              * Append a U+002D HYPHEN-MINUS character (-) and
  2741                              * the current input character to the comment
  2742                              * token's data.
  2743                              */
  2744                             appendLongStrBuf(c);
  2745                             /*
  2746                              * Switch to the comment state.
  2747                              */
  2748                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2749                             continue stateloop;
  2751                     // XXX reorder point
  2752                 case CDATA_START:
  2753                     for (;;) {
  2754                         if (++pos == endPos) {
  2755                             break stateloop;
  2757                         c = checkChar(buf, pos);
  2758                         if (index < 6) { // CDATA_LSQB.length
  2759                             if (c == Tokenizer.CDATA_LSQB[index]) {
  2760                                 appendLongStrBuf(c);
  2761                             } else {
  2762                                 errBogusComment();
  2763                                 reconsume = true;
  2764                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  2765                                 continue stateloop;
  2767                             index++;
  2768                             continue;
  2769                         } else {
  2770                             cstart = pos; // start coalescing
  2771                             reconsume = true;
  2772                             state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
  2773                             break; // FALL THROUGH continue stateloop;
  2776                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  2777                 case CDATA_SECTION:
  2778                     cdatasectionloop: for (;;) {
  2779                         if (reconsume) {
  2780                             reconsume = false;
  2781                         } else {
  2782                             if (++pos == endPos) {
  2783                                 break stateloop;
  2785                             c = checkChar(buf, pos);
  2787                         switch (c) {
  2788                             case ']':
  2789                                 flushChars(buf, pos);
  2790                                 state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
  2791                                 break cdatasectionloop; // FALL THROUGH
  2792                             case '\u0000':
  2793                                 emitReplacementCharacter(buf, pos);
  2794                                 continue;
  2795                             case '\r':
  2796                                 emitCarriageReturn(buf, pos);
  2797                                 break stateloop;
  2798                             case '\n':
  2799                                 silentLineFeed();
  2800                                 // fall thru
  2801                             default:
  2802                                 continue;
  2805                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  2806                 case CDATA_RSQB:
  2807                     cdatarsqb: for (;;) {
  2808                         if (++pos == endPos) {
  2809                             break stateloop;
  2811                         c = checkChar(buf, pos);
  2812                         switch (c) {
  2813                             case ']':
  2814                                 state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);
  2815                                 break cdatarsqb;
  2816                             default:
  2817                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0,
  2818                                         1);
  2819                                 cstart = pos;
  2820                                 reconsume = true;
  2821                                 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
  2822                                 continue stateloop;
  2825                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  2826                 case CDATA_RSQB_RSQB:
  2827                     cdatarsqbrsqb: for (;;) {
  2828                         if (++pos == endPos) {
  2829                             break stateloop;
  2831                         c = checkChar(buf, pos);
  2832                         switch (c) {
  2833                             case ']':
  2834                                 // Saw a third ]. Emit one ] (logically the 
  2835                                 // first one) and stay in this state to 
  2836                                 // remember that the last two characters seen
  2837                                 // have been ]].
  2838                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);                                
  2839                                 continue;
  2840                             case '>':
  2841                                 cstart = pos + 1;
  2842                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  2843                                 continue stateloop;
  2844                             default:
  2845                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
  2846                                 cstart = pos;
  2847                                 reconsume = true;
  2848                                 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
  2849                                 continue stateloop;
  2852                     // XXX reorder point
  2853                 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
  2854                     attributevaluesinglequotedloop: for (;;) {
  2855                         if (reconsume) {
  2856                             reconsume = false;
  2857                         } else {
  2858                             if (++pos == endPos) {
  2859                                 break stateloop;
  2861                             c = checkChar(buf, pos);
  2863                         /*
  2864                          * Consume the next input character:
  2865                          */
  2866                         switch (c) {
  2867                             case '\'':
  2868                                 /*
  2869                                  * U+0027 APOSTROPHE (') Switch to the after
  2870                                  * attribute value (quoted) state.
  2871                                  */
  2872                                 addAttributeWithValue();
  2874                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
  2875                                 continue stateloop;
  2876                             case '&':
  2877                                 /*
  2878                                  * U+0026 AMPERSAND (&) Switch to the character
  2879                                  * reference in attribute value state, with the
  2880                                  * additional allowed character being U+0027
  2881                                  * APOSTROPHE (').
  2882                                  */
  2883                                 clearStrBufAndAppend(c);
  2884                                 setAdditionalAndRememberAmpersandLocation('\'');
  2885                                 returnState = state;
  2886                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
  2887                                 break attributevaluesinglequotedloop;
  2888                             // continue stateloop;
  2889                             case '\r':
  2890                                 appendLongStrBufCarriageReturn();
  2891                                 break stateloop;
  2892                             case '\n':
  2893                                 appendLongStrBufLineFeed();
  2894                                 continue;
  2895                             case '\u0000':
  2896                                 c = '\uFFFD';
  2897                                 // fall thru
  2898                             default:
  2899                                 /*
  2900                                  * Anything else Append the current input
  2901                                  * character to the current attribute's value.
  2902                                  */
  2903                                 appendLongStrBuf(c);
  2904                                 /*
  2905                                  * Stay in the attribute value (single-quoted)
  2906                                  * state.
  2907                                  */
  2908                                 continue;
  2911                     // FALLTHRU DON'T REORDER
  2912                 case CONSUME_CHARACTER_REFERENCE:
  2913                     if (++pos == endPos) {
  2914                         break stateloop;
  2916                     c = checkChar(buf, pos);
  2917                     if (c == '\u0000') {
  2918                         break stateloop;
  2920                     /*
  2921                      * Unlike the definition in the spec, this state does not
  2922                      * return a value and never requires the caller to
  2923                      * backtrack. This state takes care of emitting characters
  2924                      * or appending to the current attribute value. It also
  2925                      * takes care of that in the case when consuming the
  2926                      * character reference fails.
  2927                      */
  2928                     /*
  2929                      * This section defines how to consume a character
  2930                      * reference. This definition is used when parsing character
  2931                      * references in text and in attributes.
  2933                      * The behavior depends on the identity of the next
  2934                      * character (the one immediately after the U+0026 AMPERSAND
  2935                      * character):
  2936                      */
  2937                     switch (c) {
  2938                         case ' ':
  2939                         case '\t':
  2940                         case '\n':
  2941                         case '\r': // we'll reconsume!
  2942                         case '\u000C':
  2943                         case '<':
  2944                         case '&':
  2945                             emitOrAppendStrBuf(returnState);
  2946                             if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  2947                                 cstart = pos;
  2949                             reconsume = true;
  2950                             state = transition(state, returnState, reconsume, pos);
  2951                             continue stateloop;
  2952                         case '#':
  2953                             /*
  2954                              * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
  2955                              * SIGN.
  2956                              */
  2957                             appendStrBuf('#');
  2958                             state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);
  2959                             continue stateloop;
  2960                         default:
  2961                             if (c == additional) {
  2962                                 emitOrAppendStrBuf(returnState);
  2963                                 reconsume = true;
  2964                                 state = transition(state, returnState, reconsume, pos);
  2965                                 continue stateloop;
  2967                             if (c >= 'a' && c <= 'z') {
  2968                                 firstCharKey = c - 'a' + 26;
  2969                             } else if (c >= 'A' && c <= 'Z') {
  2970                                 firstCharKey = c - 'A';
  2971                             } else {
  2972                                 // No match
  2973                                 /*
  2974                                  * If no match can be made, then this is a parse
  2975                                  * error.
  2976                                  */
  2977                                 errNoNamedCharacterMatch();
  2978                                 emitOrAppendStrBuf(returnState);
  2979                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  2980                                     cstart = pos;
  2982                                 reconsume = true;
  2983                                 state = transition(state, returnState, reconsume, pos);
  2984                                 continue stateloop;
  2986                             // Didn't fail yet
  2987                             appendStrBuf(c);
  2988                             state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);
  2989                             // FALL THROUGH continue stateloop;
  2991                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  2992                 case CHARACTER_REFERENCE_HILO_LOOKUP:
  2994                         if (++pos == endPos) {
  2995                             break stateloop;
  2997                         c = checkChar(buf, pos);
  2998                         if (c == '\u0000') {
  2999                             break stateloop;
  3001                         /*
  3002                          * The data structure is as follows:
  3004                          * HILO_ACCEL is a two-dimensional int array whose major
  3005                          * index corresponds to the second character of the
  3006                          * character reference (code point as index) and the
  3007                          * minor index corresponds to the first character of the
  3008                          * character reference (packed so that A-Z runs from 0
  3009                          * to 25 and a-z runs from 26 to 51). This layout makes
  3010                          * it easier to use the sparseness of the data structure
  3011                          * to omit parts of it: The second dimension of the
  3012                          * table is null when no character reference starts with
  3013                          * the character corresponding to that row.
  3015                          * The int value HILO_ACCEL (by these indices) is zero
  3016                          * if there exists no character reference starting with
  3017                          * that two-letter prefix. Otherwise, the value is an
  3018                          * int that packs two shorts so that the higher short is
  3019                          * the index of the highest character reference name
  3020                          * with that prefix in NAMES and the lower short
  3021                          * corresponds to the index of the lowest character
  3022                          * reference name with that prefix. (It happens that the
  3023                          * first two character reference names share their
  3024                          * prefix so the packed int cannot be 0 by packing the
  3025                          * two shorts.)
  3027                          * NAMES is an array of byte arrays where each byte
  3028                          * array encodes the name of a character reference as
  3029                          * ASCII. The names omit the first two letters of the
  3030                          * name. (Since storing the first two letters would be
  3031                          * redundant with the data contained in HILO_ACCEL.) The
  3032                          * entries are lexically sorted.
  3034                          * For a given index in NAMES, the same index in VALUES
  3035                          * contains the corresponding expansion as an array of
  3036                          * two UTF-16 code units (either the character and
  3037                          * U+0000 or a surrogate pair).
  3038                          */
  3039                         int hilo = 0;
  3040                         if (c <= 'z') {
  3041                             @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];
  3042                             if (row != null) {
  3043                                 hilo = row[firstCharKey];
  3046                         if (hilo == 0) {
  3047                             /*
  3048                              * If no match can be made, then this is a parse
  3049                              * error.
  3050                              */
  3051                             errNoNamedCharacterMatch();
  3052                             emitOrAppendStrBuf(returnState);
  3053                             if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3054                                 cstart = pos;
  3056                             reconsume = true;
  3057                             state = transition(state, returnState, reconsume, pos);
  3058                             continue stateloop;
  3060                         // Didn't fail yet
  3061                         appendStrBuf(c);
  3062                         lo = hilo & 0xFFFF;
  3063                         hi = hilo >> 16;
  3064                         entCol = -1;
  3065                         candidate = -1;
  3066                         strBufMark = 0;
  3067                         state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);
  3068                         // FALL THROUGH continue stateloop;
  3070                 case CHARACTER_REFERENCE_TAIL:
  3071                     outer: for (;;) {
  3072                         if (++pos == endPos) {
  3073                             break stateloop;
  3075                         c = checkChar(buf, pos);
  3076                         if (c == '\u0000') {
  3077                             break stateloop;
  3079                         entCol++;
  3080                         /*
  3081                          * Consume the maximum number of characters possible,
  3082                          * with the consumed characters matching one of the
  3083                          * identifiers in the first column of the named
  3084                          * character references table (in a case-sensitive
  3085                          * manner).
  3086                          */
  3087                         loloop: for (;;) {
  3088                             if (hi < lo) {
  3089                                 break outer;
  3091                             if (entCol == NamedCharacters.NAMES[lo].length()) {
  3092                                 candidate = lo;
  3093                                 strBufMark = strBufLen;
  3094                                 lo++;
  3095                             } else if (entCol > NamedCharacters.NAMES[lo].length()) {
  3096                                 break outer;
  3097                             } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
  3098                                 lo++;
  3099                             } else {
  3100                                 break loloop;
  3104                         hiloop: for (;;) {
  3105                             if (hi < lo) {
  3106                                 break outer;
  3108                             if (entCol == NamedCharacters.NAMES[hi].length()) {
  3109                                 break hiloop;
  3111                             if (entCol > NamedCharacters.NAMES[hi].length()) {
  3112                                 break outer;
  3113                             } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
  3114                                 hi--;
  3115                             } else {
  3116                                 break hiloop;
  3120                         if (c == ';') {
  3121                             // If we see a semicolon, there cannot be a 
  3122                             // longer match. Break the loop. However, before
  3123                             // breaking, take the longest match so far as the 
  3124                             // candidate, if we are just about to complete a 
  3125                             // match.
  3126                             if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {
  3127                                 candidate = lo;
  3128                                 strBufMark = strBufLen;
  3130                             break outer;
  3133                         if (hi < lo) {
  3134                             break outer;
  3136                         appendStrBuf(c);
  3137                         continue;
  3140                     if (candidate == -1) {
  3141                         // reconsume deals with CR, LF or nul
  3142                         /*
  3143                          * If no match can be made, then this is a parse error.
  3144                          */
  3145                         errNoNamedCharacterMatch();
  3146                         emitOrAppendStrBuf(returnState);
  3147                         if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3148                             cstart = pos;
  3150                         reconsume = true;
  3151                         state = transition(state, returnState, reconsume, pos);
  3152                         continue stateloop;
  3153                     } else {
  3154                         // c can't be CR, LF or nul if we got here
  3155                         @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
  3156                         if (candidateName.length() == 0
  3157                                 || candidateName.charAt(candidateName.length() - 1) != ';') {
  3158                             /*
  3159                              * If the last character matched is not a U+003B
  3160                              * SEMICOLON (;), there is a parse error.
  3161                              */
  3162                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  3163                                 /*
  3164                                  * If the entity is being consumed as part of an
  3165                                  * attribute, and the last character matched is
  3166                                  * not a U+003B SEMICOLON (;),
  3167                                  */
  3168                                 char ch;
  3169                                 if (strBufMark == strBufLen) {
  3170                                     ch = c;
  3171                                 } else {
  3172                                     // if (strBufOffset != -1) {
  3173                                     // ch = buf[strBufOffset + strBufMark];
  3174                                     // } else {
  3175                                     ch = strBuf[strBufMark];
  3176                                     // }
  3178                                 if (ch == '=' || (ch >= '0' && ch <= '9')
  3179                                         || (ch >= 'A' && ch <= 'Z')
  3180                                         || (ch >= 'a' && ch <= 'z')) {
  3181                                     /*
  3182                                      * and the next character is either a U+003D
  3183                                      * EQUALS SIGN character (=) or in the range
  3184                                      * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
  3185                                      * U+0041 LATIN CAPITAL LETTER A to U+005A
  3186                                      * LATIN CAPITAL LETTER Z, or U+0061 LATIN
  3187                                      * SMALL LETTER A to U+007A LATIN SMALL
  3188                                      * LETTER Z, then, for historical reasons,
  3189                                      * all the characters that were matched
  3190                                      * after the U+0026 AMPERSAND (&) must be
  3191                                      * unconsumed, and nothing is returned.
  3192                                      */
  3193                                     errNoNamedCharacterMatch();
  3194                                     appendStrBufToLongStrBuf();
  3195                                     reconsume = true;
  3196                                     state = transition(state, returnState, reconsume, pos);
  3197                                     continue stateloop;
  3200                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  3201                                 errUnescapedAmpersandInterpretedAsCharacterReference();
  3202                             } else {
  3203                                 errNotSemicolonTerminated();
  3207                         /*
  3208                          * Otherwise, return a character token for the character
  3209                          * corresponding to the entity name (as given by the
  3210                          * second column of the named character references
  3211                          * table).
  3212                          */
  3213                         // CPPONLY: completedNamedCharacterReference();
  3214                         @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
  3215                         if (
  3216                         // [NOCPP[
  3217                         val.length == 1
  3218                         // ]NOCPP]
  3219                         // CPPONLY: val[1] == 0
  3220                         ) {
  3221                             emitOrAppendOne(val, returnState);
  3222                         } else {
  3223                             emitOrAppendTwo(val, returnState);
  3225                         // this is so complicated!
  3226                         if (strBufMark < strBufLen) {
  3227                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  3228                                 for (int i = strBufMark; i < strBufLen; i++) {
  3229                                     appendLongStrBuf(strBuf[i]);
  3231                             } else {
  3232                                 tokenHandler.characters(strBuf, strBufMark,
  3233                                         strBufLen - strBufMark);
  3236                         // Check if we broke out early with c being the last
  3237                         // character that matched as opposed to being the
  3238                         // first one that didn't match. In the case of an 
  3239                         // early break, the next run on text should start
  3240                         // *after* the current character and the current 
  3241                         // character shouldn't be reconsumed.
  3242                         boolean earlyBreak = (c == ';' && strBufMark == strBufLen);
  3243                         if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3244                             cstart = earlyBreak ? pos + 1 : pos;
  3246                         reconsume = !earlyBreak;
  3247                         state = transition(state, returnState, reconsume, pos);
  3248                         continue stateloop;
  3249                         /*
  3250                          * If the markup contains I'm &notit; I tell you, the
  3251                          * entity is parsed as "not", as in, I'm ¬it; I tell
  3252                          * you. But if the markup was I'm &notin; I tell you,
  3253                          * the entity would be parsed as "notin;", resulting in
  3254                          * I'm ∉ I tell you.
  3255                          */
  3257                     // XXX reorder point
  3258                 case CONSUME_NCR:
  3259                     if (++pos == endPos) {
  3260                         break stateloop;
  3262                     c = checkChar(buf, pos);
  3263                     prevValue = -1;
  3264                     value = 0;
  3265                     seenDigits = false;
  3266                     /*
  3267                      * The behavior further depends on the character after the
  3268                      * U+0023 NUMBER SIGN:
  3269                      */
  3270                     switch (c) {
  3271                         case 'x':
  3272                         case 'X':
  3274                             /*
  3275                              * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
  3276                              * LETTER X Consume the X.
  3278                              * Follow the steps below, but using the range of
  3279                              * characters U+0030 DIGIT ZERO through to U+0039
  3280                              * DIGIT NINE, U+0061 LATIN SMALL LETTER A through
  3281                              * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
  3282                              * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
  3283                              * LETTER F (in other words, 0-9, A-F, a-f).
  3285                              * When it comes to interpreting the number,
  3286                              * interpret it as a hexadecimal number.
  3287                              */
  3288                             appendStrBuf(c);
  3289                             state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);
  3290                             continue stateloop;
  3291                         default:
  3292                             /*
  3293                              * Anything else Follow the steps below, but using
  3294                              * the range of characters U+0030 DIGIT ZERO through
  3295                              * to U+0039 DIGIT NINE (i.e. just 0-9).
  3297                              * When it comes to interpreting the number,
  3298                              * interpret it as a decimal number.
  3299                              */
  3300                             reconsume = true;
  3301                             state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);
  3302                             // FALL THROUGH continue stateloop;
  3304                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  3305                 case DECIMAL_NRC_LOOP:
  3306                     decimalloop: for (;;) {
  3307                         if (reconsume) {
  3308                             reconsume = false;
  3309                         } else {
  3310                             if (++pos == endPos) {
  3311                                 break stateloop;
  3313                             c = checkChar(buf, pos);
  3315                         // Deal with overflow gracefully
  3316                         if (value < prevValue) {
  3317                             value = 0x110000; // Value above Unicode range but
  3318                             // within int
  3319                             // range
  3321                         prevValue = value;
  3322                         /*
  3323                          * Consume as many characters as match the range of
  3324                          * characters given above.
  3325                          */
  3326                         if (c >= '0' && c <= '9') {
  3327                             seenDigits = true;
  3328                             value *= 10;
  3329                             value += c - '0';
  3330                             continue;
  3331                         } else if (c == ';') {
  3332                             if (seenDigits) {
  3333                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3334                                     cstart = pos + 1;
  3336                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
  3337                                 // FALL THROUGH continue stateloop;
  3338                                 break decimalloop;
  3339                             } else {
  3340                                 errNoDigitsInNCR();
  3341                                 appendStrBuf(';');
  3342                                 emitOrAppendStrBuf(returnState);
  3343                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3344                                     cstart = pos + 1;
  3346                                 state = transition(state, returnState, reconsume, pos);
  3347                                 continue stateloop;
  3349                         } else {
  3350                             /*
  3351                              * If no characters match the range, then don't
  3352                              * consume any characters (and unconsume the U+0023
  3353                              * NUMBER SIGN character and, if appropriate, the X
  3354                              * character). This is a parse error; nothing is
  3355                              * returned.
  3357                              * Otherwise, if the next character is a U+003B
  3358                              * SEMICOLON, consume that too. If it isn't, there
  3359                              * is a parse error.
  3360                              */
  3361                             if (!seenDigits) {
  3362                                 errNoDigitsInNCR();
  3363                                 emitOrAppendStrBuf(returnState);
  3364                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3365                                     cstart = pos;
  3367                                 reconsume = true;
  3368                                 state = transition(state, returnState, reconsume, pos);
  3369                                 continue stateloop;
  3370                             } else {
  3371                                 errCharRefLacksSemicolon();
  3372                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3373                                     cstart = pos;
  3375                                 reconsume = true;
  3376                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
  3377                                 // FALL THROUGH continue stateloop;
  3378                                 break decimalloop;
  3382                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  3383                 case HANDLE_NCR_VALUE:
  3384                     // WARNING previous state sets reconsume
  3385                     // XXX inline this case if the method size can take it
  3386                     handleNcrValue(returnState);
  3387                     state = transition(state, returnState, reconsume, pos);
  3388                     continue stateloop;
  3389                     // XXX reorder point
  3390                 case HEX_NCR_LOOP:
  3391                     for (;;) {
  3392                         if (++pos == endPos) {
  3393                             break stateloop;
  3395                         c = checkChar(buf, pos);
  3396                         // Deal with overflow gracefully
  3397                         if (value < prevValue) {
  3398                             value = 0x110000; // Value above Unicode range but
  3399                             // within int
  3400                             // range
  3402                         prevValue = value;
  3403                         /*
  3404                          * Consume as many characters as match the range of
  3405                          * characters given above.
  3406                          */
  3407                         if (c >= '0' && c <= '9') {
  3408                             seenDigits = true;
  3409                             value *= 16;
  3410                             value += c - '0';
  3411                             continue;
  3412                         } else if (c >= 'A' && c <= 'F') {
  3413                             seenDigits = true;
  3414                             value *= 16;
  3415                             value += c - 'A' + 10;
  3416                             continue;
  3417                         } else if (c >= 'a' && c <= 'f') {
  3418                             seenDigits = true;
  3419                             value *= 16;
  3420                             value += c - 'a' + 10;
  3421                             continue;
  3422                         } else if (c == ';') {
  3423                             if (seenDigits) {
  3424                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3425                                     cstart = pos + 1;
  3427                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
  3428                                 continue stateloop;
  3429                             } else {
  3430                                 errNoDigitsInNCR();
  3431                                 appendStrBuf(';');
  3432                                 emitOrAppendStrBuf(returnState);
  3433                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3434                                     cstart = pos + 1;
  3436                                 state = transition(state, returnState, reconsume, pos);
  3437                                 continue stateloop;
  3439                         } else {
  3440                             /*
  3441                              * If no characters match the range, then don't
  3442                              * consume any characters (and unconsume the U+0023
  3443                              * NUMBER SIGN character and, if appropriate, the X
  3444                              * character). This is a parse error; nothing is
  3445                              * returned.
  3447                              * Otherwise, if the next character is a U+003B
  3448                              * SEMICOLON, consume that too. If it isn't, there
  3449                              * is a parse error.
  3450                              */
  3451                             if (!seenDigits) {
  3452                                 errNoDigitsInNCR();
  3453                                 emitOrAppendStrBuf(returnState);
  3454                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3455                                     cstart = pos;
  3457                                 reconsume = true;
  3458                                 state = transition(state, returnState, reconsume, pos);
  3459                                 continue stateloop;
  3460                             } else {
  3461                                 errCharRefLacksSemicolon();
  3462                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3463                                     cstart = pos;
  3465                                 reconsume = true;
  3466                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
  3467                                 continue stateloop;
  3471                     // XXX reorder point
  3472                 case PLAINTEXT:
  3473                     plaintextloop: for (;;) {
  3474                         if (reconsume) {
  3475                             reconsume = false;
  3476                         } else {
  3477                             if (++pos == endPos) {
  3478                                 break stateloop;
  3480                             c = checkChar(buf, pos);
  3482                         switch (c) {
  3483                             case '\u0000':
  3484                                 emitPlaintextReplacementCharacter(buf, pos);
  3485                                 continue;
  3486                             case '\r':
  3487                                 emitCarriageReturn(buf, pos);
  3488                                 break stateloop;
  3489                             case '\n':
  3490                                 silentLineFeed();
  3491                             default:
  3492                                 /*
  3493                                  * Anything else Emit the current input
  3494                                  * character as a character token. Stay in the
  3495                                  * PLAINTEXT state.
  3496                                  */
  3497                                 continue;
  3500                     // XXX reorder point
  3501                 case CLOSE_TAG_OPEN:
  3502                     if (++pos == endPos) {
  3503                         break stateloop;
  3505                     c = checkChar(buf, pos);
  3506                     /*
  3507                      * Otherwise, if the content model flag is set to the PCDATA
  3508                      * state, or if the next few characters do match that tag
  3509                      * name, consume the next input character:
  3510                      */
  3511                     switch (c) {
  3512                         case '>':
  3513                             /* U+003E GREATER-THAN SIGN (>) Parse error. */
  3514                             errLtSlashGt();
  3515                             /*
  3516                              * Switch to the data state.
  3517                              */
  3518                             cstart = pos + 1;
  3519                             state = transition(state, Tokenizer.DATA, reconsume, pos);
  3520                             continue stateloop;
  3521                         case '\r':
  3522                             silentCarriageReturn();
  3523                             /* Anything else Parse error. */
  3524                             errGarbageAfterLtSlash();
  3525                             /*
  3526                              * Switch to the bogus comment state.
  3527                              */
  3528                             clearLongStrBufAndAppend('\n');
  3529                             state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  3530                             break stateloop;
  3531                         case '\n':
  3532                             silentLineFeed();
  3533                             /* Anything else Parse error. */
  3534                             errGarbageAfterLtSlash();
  3535                             /*
  3536                              * Switch to the bogus comment state.
  3537                              */
  3538                             clearLongStrBufAndAppend('\n');
  3539                             state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  3540                             continue stateloop;
  3541                         case '\u0000':
  3542                             c = '\uFFFD';
  3543                             // fall thru
  3544                         default:
  3545                             if (c >= 'A' && c <= 'Z') {
  3546                                 c += 0x20;
  3548                             if (c >= 'a' && c <= 'z') {
  3549                                 /*
  3550                                  * U+0061 LATIN SMALL LETTER A through to U+007A
  3551                                  * LATIN SMALL LETTER Z Create a new end tag
  3552                                  * token,
  3553                                  */
  3554                                 endTag = true;
  3555                                 /*
  3556                                  * set its tag name to the input character,
  3557                                  */
  3558                                 clearStrBufAndAppend(c);
  3559                                 /*
  3560                                  * then switch to the tag name state. (Don't
  3561                                  * emit the token yet; further details will be
  3562                                  * filled in before it is emitted.)
  3563                                  */
  3564                                 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
  3565                                 continue stateloop;
  3566                             } else {
  3567                                 /* Anything else Parse error. */
  3568                                 errGarbageAfterLtSlash();
  3569                                 /*
  3570                                  * Switch to the bogus comment state.
  3571                                  */
  3572                                 clearLongStrBufAndAppend(c);
  3573                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  3574                                 continue stateloop;
  3577                     // XXX reorder point
  3578                 case RCDATA:
  3579                     rcdataloop: for (;;) {
  3580                         if (reconsume) {
  3581                             reconsume = false;
  3582                         } else {
  3583                             if (++pos == endPos) {
  3584                                 break stateloop;
  3586                             c = checkChar(buf, pos);
  3588                         switch (c) {
  3589                             case '&':
  3590                                 /*
  3591                                  * U+0026 AMPERSAND (&) Switch to the character
  3592                                  * reference in RCDATA state.
  3593                                  */
  3594                                 flushChars(buf, pos);
  3595                                 clearStrBufAndAppend(c);
  3596                                 additional = '\u0000';
  3597                                 returnState = state;
  3598                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
  3599                                 continue stateloop;
  3600                             case '<':
  3601                                 /*
  3602                                  * U+003C LESS-THAN SIGN (<) Switch to the
  3603                                  * RCDATA less-than sign state.
  3604                                  */
  3605                                 flushChars(buf, pos);
  3607                                 returnState = state;
  3608                                 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
  3609                                 continue stateloop;
  3610                             case '\u0000':
  3611                                 emitReplacementCharacter(buf, pos);
  3612                                 continue;
  3613                             case '\r':
  3614                                 emitCarriageReturn(buf, pos);
  3615                                 break stateloop;
  3616                             case '\n':
  3617                                 silentLineFeed();
  3618                             default:
  3619                                 /*
  3620                                  * Emit the current input character as a
  3621                                  * character token. Stay in the RCDATA state.
  3622                                  */
  3623                                 continue;
  3626                     // XXX reorder point
  3627                 case RAWTEXT:
  3628                     rawtextloop: for (;;) {
  3629                         if (reconsume) {
  3630                             reconsume = false;
  3631                         } else {
  3632                             if (++pos == endPos) {
  3633                                 break stateloop;
  3635                             c = checkChar(buf, pos);
  3637                         switch (c) {
  3638                             case '<':
  3639                                 /*
  3640                                  * U+003C LESS-THAN SIGN (<) Switch to the
  3641                                  * RAWTEXT less-than sign state.
  3642                                  */
  3643                                 flushChars(buf, pos);
  3645                                 returnState = state;
  3646                                 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
  3647                                 break rawtextloop;
  3648                             // FALL THRU continue stateloop;
  3649                             case '\u0000':
  3650                                 emitReplacementCharacter(buf, pos);
  3651                                 continue;
  3652                             case '\r':
  3653                                 emitCarriageReturn(buf, pos);
  3654                                 break stateloop;
  3655                             case '\n':
  3656                                 silentLineFeed();
  3657                             default:
  3658                                 /*
  3659                                  * Emit the current input character as a
  3660                                  * character token. Stay in the RAWTEXT state.
  3661                                  */
  3662                                 continue;
  3665                     // XXX fallthru don't reorder
  3666                 case RAWTEXT_RCDATA_LESS_THAN_SIGN:
  3667                     rawtextrcdatalessthansignloop: for (;;) {
  3668                         if (++pos == endPos) {
  3669                             break stateloop;
  3671                         c = checkChar(buf, pos);
  3672                         switch (c) {
  3673                             case '/':
  3674                                 /*
  3675                                  * U+002F SOLIDUS (/) Set the temporary buffer
  3676                                  * to the empty string. Switch to the RCDATA or
  3677                                  * RAWTEXT end tag open state.
  3678                                  */
  3679                                 index = 0;
  3680                                 clearStrBuf();
  3681                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
  3682                                 break rawtextrcdatalessthansignloop;
  3683                             // FALL THRU continue stateloop;
  3684                             default:
  3685                                 /*
  3686                                  * Otherwise, emit a U+003C LESS-THAN SIGN
  3687                                  * character token
  3688                                  */
  3689                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  3690                                 /*
  3691                                  * and reconsume the current input character in
  3692                                  * the state saved in returnState.
  3693                                  */
  3694                                 cstart = pos;
  3695                                 reconsume = true;
  3696                                 state = transition(state, returnState, reconsume, pos);
  3697                                 continue stateloop;
  3700                     // XXX fall thru. don't reorder.
  3701                 case NON_DATA_END_TAG_NAME:
  3702                     for (;;) {
  3703                         if (++pos == endPos) {
  3704                             break stateloop;
  3706                         c = checkChar(buf, pos);
  3707                         /*
  3708                          * ASSERT! when entering this state, set index to 0 and
  3709                          * call clearStrBuf() assert (contentModelElement !=
  3710                          * null); Let's implement the above without lookahead.
  3711                          * strBuf is the 'temporary buffer'.
  3712                          */
  3713                         if (index < endTagExpectationAsArray.length) {
  3714                             char e = endTagExpectationAsArray[index];
  3715                             char folded = c;
  3716                             if (c >= 'A' && c <= 'Z') {
  3717                                 folded += 0x20;
  3719                             if (folded != e) {
  3720                                 // [NOCPP[
  3721                                 errHtml4LtSlashInRcdata(folded);
  3722                                 // ]NOCPP]
  3723                                 tokenHandler.characters(Tokenizer.LT_SOLIDUS,
  3724                                         0, 2);
  3725                                 emitStrBuf();
  3726                                 cstart = pos;
  3727                                 reconsume = true;
  3728                                 state = transition(state, returnState, reconsume, pos);
  3729                                 continue stateloop;
  3731                             appendStrBuf(c);
  3732                             index++;
  3733                             continue;
  3734                         } else {
  3735                             endTag = true;
  3736                             // XXX replace contentModelElement with different
  3737                             // type
  3738                             tagName = endTagExpectation;
  3739                             switch (c) {
  3740                                 case '\r':
  3741                                     silentCarriageReturn();
  3742                                     state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  3743                                     break stateloop;
  3744                                 case '\n':
  3745                                     silentLineFeed();
  3746                                     // fall thru
  3747                                 case ' ':
  3748                                 case '\t':
  3749                                 case '\u000C':
  3750                                     /*
  3751                                      * U+0009 CHARACTER TABULATION U+000A LINE
  3752                                      * FEED (LF) U+000C FORM FEED (FF) U+0020
  3753                                      * SPACE If the current end tag token is an
  3754                                      * appropriate end tag token, then switch to
  3755                                      * the before attribute name state.
  3756                                      */
  3757                                     state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  3758                                     continue stateloop;
  3759                                 case '/':
  3760                                     /*
  3761                                      * U+002F SOLIDUS (/) If the current end tag
  3762                                      * token is an appropriate end tag token,
  3763                                      * then switch to the self-closing start tag
  3764                                      * state.
  3765                                      */
  3766                                     state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
  3767                                     continue stateloop;
  3768                                 case '>':
  3769                                     /*
  3770                                      * U+003E GREATER-THAN SIGN (>) If the
  3771                                      * current end tag token is an appropriate
  3772                                      * end tag token, then emit the current tag
  3773                                      * token and switch to the data state.
  3774                                      */
  3775                                     state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  3776                                     if (shouldSuspend) {
  3777                                         break stateloop;
  3779                                     continue stateloop;
  3780                                 default:
  3781                                     /*
  3782                                      * Emit a U+003C LESS-THAN SIGN character
  3783                                      * token, a U+002F SOLIDUS character token,
  3784                                      * a character token for each of the
  3785                                      * characters in the temporary buffer (in
  3786                                      * the order they were added to the buffer),
  3787                                      * and reconsume the current input character
  3788                                      * in the state saved in returnState.
  3789                                      */
  3790                                     // [NOCPP[
  3791                                     errWarnLtSlashInRcdata();
  3792                                     // ]NOCPP]
  3793                                     tokenHandler.characters(
  3794                                             Tokenizer.LT_SOLIDUS, 0, 2);
  3795                                     emitStrBuf();
  3796                                     if (c == '\u0000') {
  3797                                         emitReplacementCharacter(buf, pos);
  3798                                     } else {
  3799                                         cstart = pos; // don't drop the
  3800                                         // character
  3802                                     state = transition(state, returnState, reconsume, pos);
  3803                                     continue stateloop;
  3807                     // XXX reorder point
  3808                     // BEGIN HOTSPOT WORKAROUND
  3809                 case BOGUS_COMMENT:
  3810                     boguscommentloop: for (;;) {
  3811                         if (reconsume) {
  3812                             reconsume = false;
  3813                         } else {
  3814                             if (++pos == endPos) {
  3815                                 break stateloop;
  3817                             c = checkChar(buf, pos);
  3819                         /*
  3820                          * Consume every character up to and including the first
  3821                          * U+003E GREATER-THAN SIGN character (>) or the end of
  3822                          * the file (EOF), whichever comes first. Emit a comment
  3823                          * token whose data is the concatenation of all the
  3824                          * characters starting from and including the character
  3825                          * that caused the state machine to switch into the
  3826                          * bogus comment state, up to and including the
  3827                          * character immediately before the last consumed
  3828                          * character (i.e. up to the character just before the
  3829                          * U+003E or EOF character). (If the comment was started
  3830                          * by the end of the file (EOF), the token is empty.)
  3832                          * Switch to the data state.
  3834                          * If the end of the file was reached, reconsume the EOF
  3835                          * character.
  3836                          */
  3837                         switch (c) {
  3838                             case '>':
  3839                                 emitComment(0, pos);
  3840                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  3841                                 continue stateloop;
  3842                             case '-':
  3843                                 appendLongStrBuf(c);
  3844                                 state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);
  3845                                 break boguscommentloop;
  3846                             case '\r':
  3847                                 appendLongStrBufCarriageReturn();
  3848                                 break stateloop;
  3849                             case '\n':
  3850                                 appendLongStrBufLineFeed();
  3851                                 continue;
  3852                             case '\u0000':
  3853                                 c = '\uFFFD';
  3854                                 // fall thru
  3855                             default:
  3856                                 appendLongStrBuf(c);
  3857                                 continue;
  3860                     // FALLTHRU DON'T REORDER
  3861                 case BOGUS_COMMENT_HYPHEN:
  3862                     boguscommenthyphenloop: for (;;) {
  3863                         if (++pos == endPos) {
  3864                             break stateloop;
  3866                         c = checkChar(buf, pos);
  3867                         switch (c) {
  3868                             case '>':
  3869                                 // [NOCPP[
  3870                                 maybeAppendSpaceToBogusComment();
  3871                                 // ]NOCPP]
  3872                                 emitComment(0, pos);
  3873                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  3874                                 continue stateloop;
  3875                             case '-':
  3876                                 appendSecondHyphenToBogusComment();
  3877                                 continue boguscommenthyphenloop;
  3878                             case '\r':
  3879                                 appendLongStrBufCarriageReturn();
  3880                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  3881                                 break stateloop;
  3882                             case '\n':
  3883                                 appendLongStrBufLineFeed();
  3884                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  3885                                 continue stateloop;
  3886                             case '\u0000':
  3887                                 c = '\uFFFD';
  3888                                 // fall thru
  3889                             default:
  3890                                 appendLongStrBuf(c);
  3891                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  3892                                 continue stateloop;
  3895                     // XXX reorder point
  3896                 case SCRIPT_DATA:
  3897                     scriptdataloop: for (;;) {
  3898                         if (reconsume) {
  3899                             reconsume = false;
  3900                         } else {
  3901                             if (++pos == endPos) {
  3902                                 break stateloop;
  3904                             c = checkChar(buf, pos);
  3906                         switch (c) {
  3907                             case '<':
  3908                                 /*
  3909                                  * U+003C LESS-THAN SIGN (<) Switch to the
  3910                                  * script data less-than sign state.
  3911                                  */
  3912                                 flushChars(buf, pos);
  3913                                 returnState = state;
  3914                                 state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);
  3915                                 break scriptdataloop; // FALL THRU continue
  3916                             // stateloop;
  3917                             case '\u0000':
  3918                                 emitReplacementCharacter(buf, pos);
  3919                                 continue;
  3920                             case '\r':
  3921                                 emitCarriageReturn(buf, pos);
  3922                                 break stateloop;
  3923                             case '\n':
  3924                                 silentLineFeed();
  3925                             default:
  3926                                 /*
  3927                                  * Anything else Emit the current input
  3928                                  * character as a character token. Stay in the
  3929                                  * script data state.
  3930                                  */
  3931                                 continue;
  3934                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  3935                 case SCRIPT_DATA_LESS_THAN_SIGN:
  3936                     scriptdatalessthansignloop: for (;;) {
  3937                         if (++pos == endPos) {
  3938                             break stateloop;
  3940                         c = checkChar(buf, pos);
  3941                         switch (c) {
  3942                             case '/':
  3943                                 /*
  3944                                  * U+002F SOLIDUS (/) Set the temporary buffer
  3945                                  * to the empty string. Switch to the script
  3946                                  * data end tag open state.
  3947                                  */
  3948                                 index = 0;
  3949                                 clearStrBuf();
  3950                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
  3951                                 continue stateloop;
  3952                             case '!':
  3953                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  3954                                 cstart = pos;
  3955                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);
  3956                                 break scriptdatalessthansignloop; // FALL THRU
  3957                             // continue
  3958                             // stateloop;
  3959                             default:
  3960                                 /*
  3961                                  * Otherwise, emit a U+003C LESS-THAN SIGN
  3962                                  * character token
  3963                                  */
  3964                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  3965                                 /*
  3966                                  * and reconsume the current input character in
  3967                                  * the data state.
  3968                                  */
  3969                                 cstart = pos;
  3970                                 reconsume = true;
  3971                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
  3972                                 continue stateloop;
  3975                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  3976                 case SCRIPT_DATA_ESCAPE_START:
  3977                     scriptdataescapestartloop: for (;;) {
  3978                         if (++pos == endPos) {
  3979                             break stateloop;
  3981                         c = checkChar(buf, pos);
  3982                         /*
  3983                          * Consume the next input character:
  3984                          */
  3985                         switch (c) {
  3986                             case '-':
  3987                                 /*
  3988                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
  3989                                  * HYPHEN-MINUS character token. Switch to the
  3990                                  * script data escape start dash state.
  3991                                  */
  3992                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos);
  3993                                 break scriptdataescapestartloop; // FALL THRU
  3994                             // continue
  3995                             // stateloop;
  3996                             default:
  3997                                 /*
  3998                                  * Anything else Reconsume the current input
  3999                                  * character in the script data state.
  4000                                  */
  4001                                 reconsume = true;
  4002                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
  4003                                 continue stateloop;
  4006                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4007                 case SCRIPT_DATA_ESCAPE_START_DASH:
  4008                     scriptdataescapestartdashloop: for (;;) {
  4009                         if (++pos == endPos) {
  4010                             break stateloop;
  4012                         c = checkChar(buf, pos);
  4013                         /*
  4014                          * Consume the next input character:
  4015                          */
  4016                         switch (c) {
  4017                             case '-':
  4018                                 /*
  4019                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
  4020                                  * HYPHEN-MINUS character token. Switch to the
  4021                                  * script data escaped dash dash state.
  4022                                  */
  4023                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
  4024                                 break scriptdataescapestartdashloop;
  4025                             // continue stateloop;
  4026                             default:
  4027                                 /*
  4028                                  * Anything else Reconsume the current input
  4029                                  * character in the script data state.
  4030                                  */
  4031                                 reconsume = true;
  4032                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
  4033                                 continue stateloop;
  4036                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4037                 case SCRIPT_DATA_ESCAPED_DASH_DASH:
  4038                     scriptdataescapeddashdashloop: for (;;) {
  4039                         if (++pos == endPos) {
  4040                             break stateloop;
  4042                         c = checkChar(buf, pos);
  4043                         /*
  4044                          * Consume the next input character:
  4045                          */
  4046                         switch (c) {
  4047                             case '-':
  4048                                 /*
  4049                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
  4050                                  * HYPHEN-MINUS character token. Stay in the
  4051                                  * script data escaped dash dash state.
  4052                                  */
  4053                                 continue;
  4054                             case '<':
  4055                                 /*
  4056                                  * U+003C LESS-THAN SIGN (<) Switch to the
  4057                                  * script data escaped less-than sign state.
  4058                                  */
  4059                                 flushChars(buf, pos);
  4060                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
  4061                                 continue stateloop;
  4062                             case '>':
  4063                                 /*
  4064                                  * U+003E GREATER-THAN SIGN (>) Emit a U+003E
  4065                                  * GREATER-THAN SIGN character token. Switch to
  4066                                  * the script data state.
  4067                                  */
  4068                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
  4069                                 continue stateloop;
  4070                             case '\u0000':
  4071                                 emitReplacementCharacter(buf, pos);
  4072                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4073                                 break scriptdataescapeddashdashloop;
  4074                             case '\r':
  4075                                 emitCarriageReturn(buf, pos);
  4076                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4077                                 break stateloop;
  4078                             case '\n':
  4079                                 silentLineFeed();
  4080                             default:
  4081                                 /*
  4082                                  * Anything else Emit the current input
  4083                                  * character as a character token. Switch to the
  4084                                  * script data escaped state.
  4085                                  */
  4086                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4087                                 break scriptdataescapeddashdashloop;
  4088                             // continue stateloop;
  4091                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4092                 case SCRIPT_DATA_ESCAPED:
  4093                     scriptdataescapedloop: for (;;) {
  4094                         if (reconsume) {
  4095                             reconsume = false;
  4096                         } else {
  4097                             if (++pos == endPos) {
  4098                                 break stateloop;
  4100                             c = checkChar(buf, pos);
  4102                         /*
  4103                          * Consume the next input character:
  4104                          */
  4105                         switch (c) {
  4106                             case '-':
  4107                                 /*
  4108                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
  4109                                  * HYPHEN-MINUS character token. Switch to the
  4110                                  * script data escaped dash state.
  4111                                  */
  4112                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);
  4113                                 break scriptdataescapedloop; // FALL THRU
  4114                             // continue
  4115                             // stateloop;
  4116                             case '<':
  4117                                 /*
  4118                                  * U+003C LESS-THAN SIGN (<) Switch to the
  4119                                  * script data escaped less-than sign state.
  4120                                  */
  4121                                 flushChars(buf, pos);
  4122                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
  4123                                 continue stateloop;
  4124                             case '\u0000':
  4125                                 emitReplacementCharacter(buf, pos);
  4126                                 continue;
  4127                             case '\r':
  4128                                 emitCarriageReturn(buf, pos);
  4129                                 break stateloop;
  4130                             case '\n':
  4131                                 silentLineFeed();
  4132                             default:
  4133                                 /*
  4134                                  * Anything else Emit the current input
  4135                                  * character as a character token. Stay in the
  4136                                  * script data escaped state.
  4137                                  */
  4138                                 continue;
  4141                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4142                 case SCRIPT_DATA_ESCAPED_DASH:
  4143                     scriptdataescapeddashloop: for (;;) {
  4144                         if (++pos == endPos) {
  4145                             break stateloop;
  4147                         c = checkChar(buf, pos);
  4148                         /*
  4149                          * Consume the next input character:
  4150                          */
  4151                         switch (c) {
  4152                             case '-':
  4153                                 /*
  4154                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
  4155                                  * HYPHEN-MINUS character token. Switch to the
  4156                                  * script data escaped dash dash state.
  4157                                  */
  4158                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
  4159                                 continue stateloop;
  4160                             case '<':
  4161                                 /*
  4162                                  * U+003C LESS-THAN SIGN (<) Switch to the
  4163                                  * script data escaped less-than sign state.
  4164                                  */
  4165                                 flushChars(buf, pos);
  4166                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
  4167                                 break scriptdataescapeddashloop;
  4168                             // continue stateloop;
  4169                             case '\u0000':
  4170                                 emitReplacementCharacter(buf, pos);
  4171                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4172                                 continue stateloop;
  4173                             case '\r':
  4174                                 emitCarriageReturn(buf, pos);
  4175                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4176                                 break stateloop;
  4177                             case '\n':
  4178                                 silentLineFeed();
  4179                             default:
  4180                                 /*
  4181                                  * Anything else Emit the current input
  4182                                  * character as a character token. Switch to the
  4183                                  * script data escaped state.
  4184                                  */
  4185                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4186                                 continue stateloop;
  4189                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4190                 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
  4191                     scriptdataescapedlessthanloop: for (;;) {
  4192                         if (++pos == endPos) {
  4193                             break stateloop;
  4195                         c = checkChar(buf, pos);
  4196                         /*
  4197                          * Consume the next input character:
  4198                          */
  4199                         switch (c) {
  4200                             case '/':
  4201                                 /*
  4202                                  * U+002F SOLIDUS (/) Set the temporary buffer
  4203                                  * to the empty string. Switch to the script
  4204                                  * data escaped end tag open state.
  4205                                  */
  4206                                 index = 0;
  4207                                 clearStrBuf();
  4208                                 returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
  4209                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
  4210                                 continue stateloop;
  4211                             case 'S':
  4212                             case 's':
  4213                                 /*
  4214                                  * U+0041 LATIN CAPITAL LETTER A through to
  4215                                  * U+005A LATIN CAPITAL LETTER Z Emit a U+003C
  4216                                  * LESS-THAN SIGN character token and the
  4217                                  * current input character as a character token.
  4218                                  */
  4219                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  4220                                 cstart = pos;
  4221                                 index = 1;
  4222                                 /*
  4223                                  * Set the temporary buffer to the empty string.
  4224                                  * Append the lowercase version of the current
  4225                                  * input character (add 0x0020 to the
  4226                                  * character's code point) to the temporary
  4227                                  * buffer. Switch to the script data double
  4228                                  * escape start state.
  4229                                  */
  4230                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);
  4231                                 break scriptdataescapedlessthanloop;
  4232                             // continue stateloop;
  4233                             default:
  4234                                 /*
  4235                                  * Anything else Emit a U+003C LESS-THAN SIGN
  4236                                  * character token and reconsume the current
  4237                                  * input character in the script data escaped
  4238                                  * state.
  4239                                  */
  4240                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  4241                                 cstart = pos;
  4242                                 reconsume = true;
  4243                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4244                                 continue stateloop;
  4247                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4248                 case SCRIPT_DATA_DOUBLE_ESCAPE_START:
  4249                     scriptdatadoubleescapestartloop: for (;;) {
  4250                         if (++pos == endPos) {
  4251                             break stateloop;
  4253                         c = checkChar(buf, pos);
  4254                         assert index > 0;
  4255                         if (index < 6) { // SCRIPT_ARR.length
  4256                             char folded = c;
  4257                             if (c >= 'A' && c <= 'Z') {
  4258                                 folded += 0x20;
  4260                             if (folded != Tokenizer.SCRIPT_ARR[index]) {
  4261                                 reconsume = true;
  4262                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4263                                 continue stateloop;
  4265                             index++;
  4266                             continue;
  4268                         switch (c) {
  4269                             case '\r':
  4270                                 emitCarriageReturn(buf, pos);
  4271                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4272                                 break stateloop;
  4273                             case '\n':
  4274                                 silentLineFeed();
  4275                             case ' ':
  4276                             case '\t':
  4277                             case '\u000C':
  4278                             case '/':
  4279                             case '>':
  4280                                 /*
  4281                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4282                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  4283                                  * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
  4284                                  * (>) Emit the current input character as a
  4285                                  * character token. If the temporary buffer is
  4286                                  * the string "script", then switch to the
  4287                                  * script data double escaped state.
  4288                                  */
  4289                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4290                                 break scriptdatadoubleescapestartloop;
  4291                             // continue stateloop;
  4292                             default:
  4293                                 /*
  4294                                  * Anything else Reconsume the current input
  4295                                  * character in the script data escaped state.
  4296                                  */
  4297                                 reconsume = true;
  4298                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4299                                 continue stateloop;
  4302                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4303                 case SCRIPT_DATA_DOUBLE_ESCAPED:
  4304                     scriptdatadoubleescapedloop: for (;;) {
  4305                         if (reconsume) {
  4306                             reconsume = false;
  4307                         } else {
  4308                             if (++pos == endPos) {
  4309                                 break stateloop;
  4311                             c = checkChar(buf, pos);
  4313                         /*
  4314                          * Consume the next input character:
  4315                          */
  4316                         switch (c) {
  4317                             case '-':
  4318                                 /*
  4319                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
  4320                                  * HYPHEN-MINUS character token. Switch to the
  4321                                  * script data double escaped dash state.
  4322                                  */
  4323                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);
  4324                                 break scriptdatadoubleescapedloop; // FALL THRU
  4325                             // continue
  4326                             // stateloop;
  4327                             case '<':
  4328                                 /*
  4329                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C
  4330                                  * LESS-THAN SIGN character token. Switch to the
  4331                                  * script data double escaped less-than sign
  4332                                  * state.
  4333                                  */
  4334                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
  4335                                 continue stateloop;
  4336                             case '\u0000':
  4337                                 emitReplacementCharacter(buf, pos);
  4338                                 continue;
  4339                             case '\r':
  4340                                 emitCarriageReturn(buf, pos);
  4341                                 break stateloop;
  4342                             case '\n':
  4343                                 silentLineFeed();
  4344                             default:
  4345                                 /*
  4346                                  * Anything else Emit the current input
  4347                                  * character as a character token. Stay in the
  4348                                  * script data double escaped state.
  4349                                  */
  4350                                 continue;
  4353                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4354                 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
  4355                     scriptdatadoubleescapeddashloop: for (;;) {
  4356                         if (++pos == endPos) {
  4357                             break stateloop;
  4359                         c = checkChar(buf, pos);
  4360                         /*
  4361                          * Consume the next input character:
  4362                          */
  4363                         switch (c) {
  4364                             case '-':
  4365                                 /*
  4366                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
  4367                                  * HYPHEN-MINUS character token. Switch to the
  4368                                  * script data double escaped dash dash state.
  4369                                  */
  4370                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);
  4371                                 break scriptdatadoubleescapeddashloop;
  4372                             // continue stateloop;
  4373                             case '<':
  4374                                 /*
  4375                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C
  4376                                  * LESS-THAN SIGN character token. Switch to the
  4377                                  * script data double escaped less-than sign
  4378                                  * state.
  4379                                  */
  4380                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
  4381                                 continue stateloop;
  4382                             case '\u0000':
  4383                                 emitReplacementCharacter(buf, pos);
  4384                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4385                                 continue stateloop;
  4386                             case '\r':
  4387                                 emitCarriageReturn(buf, pos);
  4388                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4389                                 break stateloop;
  4390                             case '\n':
  4391                                 silentLineFeed();
  4392                             default:
  4393                                 /*
  4394                                  * Anything else Emit the current input
  4395                                  * character as a character token. Switch to the
  4396                                  * script data double escaped state.
  4397                                  */
  4398                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4399                                 continue stateloop;
  4402                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4403                 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
  4404                     scriptdatadoubleescapeddashdashloop: for (;;) {
  4405                         if (++pos == endPos) {
  4406                             break stateloop;
  4408                         c = checkChar(buf, pos);
  4409                         /*
  4410                          * Consume the next input character:
  4411                          */
  4412                         switch (c) {
  4413                             case '-':
  4414                                 /*
  4415                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D
  4416                                  * HYPHEN-MINUS character token. Stay in the
  4417                                  * script data double escaped dash dash state.
  4418                                  */
  4419                                 continue;
  4420                             case '<':
  4421                                 /*
  4422                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C
  4423                                  * LESS-THAN SIGN character token. Switch to the
  4424                                  * script data double escaped less-than sign
  4425                                  * state.
  4426                                  */
  4427                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
  4428                                 break scriptdatadoubleescapeddashdashloop;
  4429                             case '>':
  4430                                 /*
  4431                                  * U+003E GREATER-THAN SIGN (>) Emit a U+003E
  4432                                  * GREATER-THAN SIGN character token. Switch to
  4433                                  * the script data state.
  4434                                  */
  4435                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
  4436                                 continue stateloop;
  4437                             case '\u0000':
  4438                                 emitReplacementCharacter(buf, pos);
  4439                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4440                                 continue stateloop;
  4441                             case '\r':
  4442                                 emitCarriageReturn(buf, pos);
  4443                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4444                                 break stateloop;
  4445                             case '\n':
  4446                                 silentLineFeed();
  4447                             default:
  4448                                 /*
  4449                                  * Anything else Emit the current input
  4450                                  * character as a character token. Switch to the
  4451                                  * script data double escaped state.
  4452                                  */
  4453                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4454                                 continue stateloop;
  4457                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4458                 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
  4459                     scriptdatadoubleescapedlessthanloop: for (;;) {
  4460                         if (++pos == endPos) {
  4461                             break stateloop;
  4463                         c = checkChar(buf, pos);
  4464                         /*
  4465                          * Consume the next input character:
  4466                          */
  4467                         switch (c) {
  4468                             case '/':
  4469                                 /*
  4470                                  * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS
  4471                                  * character token. Set the temporary buffer to
  4472                                  * the empty string. Switch to the script data
  4473                                  * double escape end state.
  4474                                  */
  4475                                 index = 0;
  4476                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);
  4477                                 break scriptdatadoubleescapedlessthanloop;
  4478                             default:
  4479                                 /*
  4480                                  * Anything else Reconsume the current input
  4481                                  * character in the script data double escaped
  4482                                  * state.
  4483                                  */
  4484                                 reconsume = true;
  4485                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4486                                 continue stateloop;
  4489                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4490                 case SCRIPT_DATA_DOUBLE_ESCAPE_END:
  4491                     scriptdatadoubleescapeendloop: for (;;) {
  4492                         if (++pos == endPos) {
  4493                             break stateloop;
  4495                         c = checkChar(buf, pos);
  4496                         if (index < 6) { // SCRIPT_ARR.length
  4497                             char folded = c;
  4498                             if (c >= 'A' && c <= 'Z') {
  4499                                 folded += 0x20;
  4501                             if (folded != Tokenizer.SCRIPT_ARR[index]) {
  4502                                 reconsume = true;
  4503                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4504                                 continue stateloop;
  4506                             index++;
  4507                             continue;
  4509                         switch (c) {
  4510                             case '\r':
  4511                                 emitCarriageReturn(buf, pos);
  4512                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4513                                 break stateloop;
  4514                             case '\n':
  4515                                 silentLineFeed();
  4516                             case ' ':
  4517                             case '\t':
  4518                             case '\u000C':
  4519                             case '/':
  4520                             case '>':
  4521                                 /*
  4522                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4523                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  4524                                  * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
  4525                                  * (>) Emit the current input character as a
  4526                                  * character token. If the temporary buffer is
  4527                                  * the string "script", then switch to the
  4528                                  * script data escaped state.
  4529                                  */
  4530                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4531                                 continue stateloop;
  4532                             default:
  4533                                 /*
  4534                                  * Reconsume the current input character in the
  4535                                  * script data double escaped state.
  4536                                  */
  4537                                 reconsume = true;
  4538                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4539                                 continue stateloop;
  4542                     // XXX reorder point
  4543                 case MARKUP_DECLARATION_OCTYPE:
  4544                     markupdeclarationdoctypeloop: for (;;) {
  4545                         if (++pos == endPos) {
  4546                             break stateloop;
  4548                         c = checkChar(buf, pos);
  4549                         if (index < 6) { // OCTYPE.length
  4550                             char folded = c;
  4551                             if (c >= 'A' && c <= 'Z') {
  4552                                 folded += 0x20;
  4554                             if (folded == Tokenizer.OCTYPE[index]) {
  4555                                 appendLongStrBuf(c);
  4556                             } else {
  4557                                 errBogusComment();
  4558                                 reconsume = true;
  4559                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  4560                                 continue stateloop;
  4562                             index++;
  4563                             continue;
  4564                         } else {
  4565                             reconsume = true;
  4566                             state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);
  4567                             break markupdeclarationdoctypeloop;
  4568                             // continue stateloop;
  4571                     // FALLTHRU DON'T REORDER
  4572                 case DOCTYPE:
  4573                     doctypeloop: for (;;) {
  4574                         if (reconsume) {
  4575                             reconsume = false;
  4576                         } else {
  4577                             if (++pos == endPos) {
  4578                                 break stateloop;
  4580                             c = checkChar(buf, pos);
  4582                         initDoctypeFields();
  4583                         /*
  4584                          * Consume the next input character:
  4585                          */
  4586                         switch (c) {
  4587                             case '\r':
  4588                                 silentCarriageReturn();
  4589                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
  4590                                 break stateloop;
  4591                             case '\n':
  4592                                 silentLineFeed();
  4593                                 // fall thru
  4594                             case ' ':
  4595                             case '\t':
  4596                             case '\u000C':
  4597                                 /*
  4598                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4599                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  4600                                  * Switch to the before DOCTYPE name state.
  4601                                  */
  4602                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
  4603                                 break doctypeloop;
  4604                             // continue stateloop;
  4605                             default:
  4606                                 /*
  4607                                  * Anything else Parse error.
  4608                                  */
  4609                                 errMissingSpaceBeforeDoctypeName();
  4610                                 /*
  4611                                  * Reconsume the current character in the before
  4612                                  * DOCTYPE name state.
  4613                                  */
  4614                                 reconsume = true;
  4615                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
  4616                                 break doctypeloop;
  4617                             // continue stateloop;
  4620                     // FALLTHRU DON'T REORDER
  4621                 case BEFORE_DOCTYPE_NAME:
  4622                     beforedoctypenameloop: for (;;) {
  4623                         if (reconsume) {
  4624                             reconsume = false;
  4625                         } else {
  4626                             if (++pos == endPos) {
  4627                                 break stateloop;
  4629                             c = checkChar(buf, pos);
  4631                         /*
  4632                          * Consume the next input character:
  4633                          */
  4634                         switch (c) {
  4635                             case '\r':
  4636                                 silentCarriageReturn();
  4637                                 break stateloop;
  4638                             case '\n':
  4639                                 silentLineFeed();
  4640                                 // fall thru
  4641                             case ' ':
  4642                             case '\t':
  4643                             case '\u000C':
  4644                                 /*
  4645                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4646                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  4647                                  * in the before DOCTYPE name state.
  4648                                  */
  4649                                 continue;
  4650                             case '>':
  4651                                 /*
  4652                                  * U+003E GREATER-THAN SIGN (>) Parse error.
  4653                                  */
  4654                                 errNamelessDoctype();
  4655                                 /*
  4656                                  * Create a new DOCTYPE token. Set its
  4657                                  * force-quirks flag to on.
  4658                                  */
  4659                                 forceQuirks = true;
  4660                                 /*
  4661                                  * Emit the token.
  4662                                  */
  4663                                 emitDoctypeToken(pos);
  4664                                 /*
  4665                                  * Switch to the data state.
  4666                                  */
  4667                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  4668                                 continue stateloop;
  4669                             case '\u0000':
  4670                                 c = '\uFFFD';
  4671                                 // fall thru
  4672                             default:
  4673                                 if (c >= 'A' && c <= 'Z') {
  4674                                     /*
  4675                                      * U+0041 LATIN CAPITAL LETTER A through to
  4676                                      * U+005A LATIN CAPITAL LETTER Z Create a
  4677                                      * new DOCTYPE token. Set the token's name
  4678                                      * to the lowercase version of the input
  4679                                      * character (add 0x0020 to the character's
  4680                                      * code point).
  4681                                      */
  4682                                     c += 0x20;
  4684                                 /* Anything else Create a new DOCTYPE token. */
  4685                                 /*
  4686                                  * Set the token's name to the current
  4687                                  * input character.
  4688                                  */
  4689                                 clearStrBufAndAppend(c);
  4690                                 /*
  4691                                  * Switch to the DOCTYPE name state.
  4692                                  */
  4693                                 state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);
  4694                                 break beforedoctypenameloop;
  4695                             // continue stateloop;
  4698                     // FALLTHRU DON'T REORDER
  4699                 case DOCTYPE_NAME:
  4700                     doctypenameloop: for (;;) {
  4701                         if (++pos == endPos) {
  4702                             break stateloop;
  4704                         c = checkChar(buf, pos);
  4705                         /*
  4706                          * Consume the next input character:
  4707                          */
  4708                         switch (c) {
  4709                             case '\r':
  4710                                 silentCarriageReturn();
  4711                                 strBufToDoctypeName();
  4712                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
  4713                                 break stateloop;
  4714                             case '\n':
  4715                                 silentLineFeed();
  4716                                 // fall thru
  4717                             case ' ':
  4718                             case '\t':
  4719                             case '\u000C':
  4720                                 /*
  4721                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4722                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  4723                                  * Switch to the after DOCTYPE name state.
  4724                                  */
  4725                                 strBufToDoctypeName();
  4726                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
  4727                                 break doctypenameloop;
  4728                             // continue stateloop;
  4729                             case '>':
  4730                                 /*
  4731                                  * U+003E GREATER-THAN SIGN (>) Emit the current
  4732                                  * DOCTYPE token.
  4733                                  */
  4734                                 strBufToDoctypeName();
  4735                                 emitDoctypeToken(pos);
  4736                                 /*
  4737                                  * Switch to the data state.
  4738                                  */
  4739                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  4740                                 continue stateloop;
  4741                             case '\u0000':
  4742                                 c = '\uFFFD';
  4743                                 // fall thru
  4744                             default:
  4745                                 /*
  4746                                  * U+0041 LATIN CAPITAL LETTER A through to
  4747                                  * U+005A LATIN CAPITAL LETTER Z Append the
  4748                                  * lowercase version of the input character (add
  4749                                  * 0x0020 to the character's code point) to the
  4750                                  * current DOCTYPE token's name.
  4751                                  */
  4752                                 if (c >= 'A' && c <= 'Z') {
  4753                                     c += 0x0020;
  4755                                 /*
  4756                                  * Anything else Append the current input
  4757                                  * character to the current DOCTYPE token's
  4758                                  * name.
  4759                                  */
  4760                                 appendStrBuf(c);
  4761                                 /*
  4762                                  * Stay in the DOCTYPE name state.
  4763                                  */
  4764                                 continue;
  4767                     // FALLTHRU DON'T REORDER
  4768                 case AFTER_DOCTYPE_NAME:
  4769                     afterdoctypenameloop: for (;;) {
  4770                         if (++pos == endPos) {
  4771                             break stateloop;
  4773                         c = checkChar(buf, pos);
  4774                         /*
  4775                          * Consume the next input character:
  4776                          */
  4777                         switch (c) {
  4778                             case '\r':
  4779                                 silentCarriageReturn();
  4780                                 break stateloop;
  4781                             case '\n':
  4782                                 silentLineFeed();
  4783                                 // fall thru
  4784                             case ' ':
  4785                             case '\t':
  4786                             case '\u000C':
  4787                                 /*
  4788                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4789                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  4790                                  * in the after DOCTYPE name state.
  4791                                  */
  4792                                 continue;
  4793                             case '>':
  4794                                 /*
  4795                                  * U+003E GREATER-THAN SIGN (>) Emit the current
  4796                                  * DOCTYPE token.
  4797                                  */
  4798                                 emitDoctypeToken(pos);
  4799                                 /*
  4800                                  * Switch to the data state.
  4801                                  */
  4802                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  4803                                 continue stateloop;
  4804                             case 'p':
  4805                             case 'P':
  4806                                 index = 0;
  4807                                 state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);
  4808                                 break afterdoctypenameloop;
  4809                             // continue stateloop;
  4810                             case 's':
  4811                             case 'S':
  4812                                 index = 0;
  4813                                 state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);
  4814                                 continue stateloop;
  4815                             default:
  4816                                 /*
  4817                                  * Otherwise, this is the parse error.
  4818                                  */
  4819                                 bogusDoctype();
  4821                                 /*
  4822                                  * Set the DOCTYPE token's force-quirks flag to
  4823                                  * on.
  4824                                  */
  4825                                 // done by bogusDoctype();
  4826                                 /*
  4827                                  * Switch to the bogus DOCTYPE state.
  4828                                  */
  4829                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  4830                                 continue stateloop;
  4833                     // FALLTHRU DON'T REORDER
  4834                 case DOCTYPE_UBLIC:
  4835                     doctypeublicloop: for (;;) {
  4836                         if (++pos == endPos) {
  4837                             break stateloop;
  4839                         c = checkChar(buf, pos);
  4840                         /*
  4841                          * If the six characters starting from the current input
  4842                          * character are an ASCII case-insensitive match for the
  4843                          * word "PUBLIC", then consume those characters and
  4844                          * switch to the after DOCTYPE public keyword state.
  4845                          */
  4846                         if (index < 5) { // UBLIC.length
  4847                             char folded = c;
  4848                             if (c >= 'A' && c <= 'Z') {
  4849                                 folded += 0x20;
  4851                             if (folded != Tokenizer.UBLIC[index]) {
  4852                                 bogusDoctype();
  4853                                 // forceQuirks = true;
  4854                                 reconsume = true;
  4855                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  4856                                 continue stateloop;
  4858                             index++;
  4859                             continue;
  4860                         } else {
  4861                             reconsume = true;
  4862                             state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);
  4863                             break doctypeublicloop;
  4864                             // continue stateloop;
  4867                     // FALLTHRU DON'T REORDER
  4868                 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
  4869                     afterdoctypepublickeywordloop: for (;;) {
  4870                         if (reconsume) {
  4871                             reconsume = false;
  4872                         } else {
  4873                             if (++pos == endPos) {
  4874                                 break stateloop;
  4876                             c = checkChar(buf, pos);
  4878                         /*
  4879                          * Consume the next input character:
  4880                          */
  4881                         switch (c) {
  4882                             case '\r':
  4883                                 silentCarriageReturn();
  4884                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
  4885                                 break stateloop;
  4886                             case '\n':
  4887                                 silentLineFeed();
  4888                                 // fall thru
  4889                             case ' ':
  4890                             case '\t':
  4891                             case '\u000C':
  4892                                 /*
  4893                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4894                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  4895                                  * Switch to the before DOCTYPE public
  4896                                  * identifier state.
  4897                                  */
  4898                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
  4899                                 break afterdoctypepublickeywordloop;
  4900                             // FALL THROUGH continue stateloop
  4901                             case '"':
  4902                                 /*
  4903                                  * U+0022 QUOTATION MARK (") Parse Error.
  4904                                  */
  4905                                 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
  4906                                 /*
  4907                                  * Set the DOCTYPE token's public identifier to
  4908                                  * the empty string (not missing),
  4909                                  */
  4910                                 clearLongStrBuf();
  4911                                 /*
  4912                                  * then switch to the DOCTYPE public identifier
  4913                                  * (double-quoted) state.
  4914                                  */
  4915                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
  4916                                 continue stateloop;
  4917                             case '\'':
  4918                                 /*
  4919                                  * U+0027 APOSTROPHE (') Parse Error.
  4920                                  */
  4921                                 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
  4922                                 /*
  4923                                  * Set the DOCTYPE token's public identifier to
  4924                                  * the empty string (not missing),
  4925                                  */
  4926                                 clearLongStrBuf();
  4927                                 /*
  4928                                  * then switch to the DOCTYPE public identifier
  4929                                  * (single-quoted) state.
  4930                                  */
  4931                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
  4932                                 continue stateloop;
  4933                             case '>':
  4934                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
  4935                                 errExpectedPublicId();
  4936                                 /*
  4937                                  * Set the DOCTYPE token's force-quirks flag to
  4938                                  * on.
  4939                                  */
  4940                                 forceQuirks = true;
  4941                                 /*
  4942                                  * Emit that DOCTYPE token.
  4943                                  */
  4944                                 emitDoctypeToken(pos);
  4945                                 /*
  4946                                  * Switch to the data state.
  4947                                  */
  4948                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  4949                                 continue stateloop;
  4950                             default:
  4951                                 bogusDoctype();
  4952                                 /*
  4953                                  * Set the DOCTYPE token's force-quirks flag to
  4954                                  * on.
  4955                                  */
  4956                                 // done by bogusDoctype();
  4957                                 /*
  4958                                  * Switch to the bogus DOCTYPE state.
  4959                                  */
  4960                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  4961                                 continue stateloop;
  4964                     // FALLTHRU DON'T REORDER
  4965                 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
  4966                     beforedoctypepublicidentifierloop: for (;;) {
  4967                         if (++pos == endPos) {
  4968                             break stateloop;
  4970                         c = checkChar(buf, pos);
  4971                         /*
  4972                          * Consume the next input character:
  4973                          */
  4974                         switch (c) {
  4975                             case '\r':
  4976                                 silentCarriageReturn();
  4977                                 break stateloop;
  4978                             case '\n':
  4979                                 silentLineFeed();
  4980                                 // fall thru
  4981                             case ' ':
  4982                             case '\t':
  4983                             case '\u000C':
  4984                                 /*
  4985                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4986                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  4987                                  * in the before DOCTYPE public identifier
  4988                                  * state.
  4989                                  */
  4990                                 continue;
  4991                             case '"':
  4992                                 /*
  4993                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE
  4994                                  * token's public identifier to the empty string
  4995                                  * (not missing),
  4996                                  */
  4997                                 clearLongStrBuf();
  4998                                 /*
  4999                                  * then switch to the DOCTYPE public identifier
  5000                                  * (double-quoted) state.
  5001                                  */
  5002                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
  5003                                 break beforedoctypepublicidentifierloop;
  5004                             // continue stateloop;
  5005                             case '\'':
  5006                                 /*
  5007                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's
  5008                                  * public identifier to the empty string (not
  5009                                  * missing),
  5010                                  */
  5011                                 clearLongStrBuf();
  5012                                 /*
  5013                                  * then switch to the DOCTYPE public identifier
  5014                                  * (single-quoted) state.
  5015                                  */
  5016                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
  5017                                 continue stateloop;
  5018                             case '>':
  5019                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
  5020                                 errExpectedPublicId();
  5021                                 /*
  5022                                  * Set the DOCTYPE token's force-quirks flag to
  5023                                  * on.
  5024                                  */
  5025                                 forceQuirks = true;
  5026                                 /*
  5027                                  * Emit that DOCTYPE token.
  5028                                  */
  5029                                 emitDoctypeToken(pos);
  5030                                 /*
  5031                                  * Switch to the data state.
  5032                                  */
  5033                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  5034                                 continue stateloop;
  5035                             default:
  5036                                 bogusDoctype();
  5037                                 /*
  5038                                  * Set the DOCTYPE token's force-quirks flag to
  5039                                  * on.
  5040                                  */
  5041                                 // done by bogusDoctype();
  5042                                 /*
  5043                                  * Switch to the bogus DOCTYPE state.
  5044                                  */
  5045                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  5046                                 continue stateloop;
  5049                     // FALLTHRU DON'T REORDER
  5050                 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
  5051                     doctypepublicidentifierdoublequotedloop: for (;;) {
  5052                         if (++pos == endPos) {
  5053                             break stateloop;
  5055                         c = checkChar(buf, pos);
  5056                         /*
  5057                          * Consume the next input character:
  5058                          */
  5059                         switch (c) {
  5060                             case '"':
  5061                                 /*
  5062                                  * U+0022 QUOTATION MARK (") Switch to the after
  5063                                  * DOCTYPE public identifier state.
  5064                                  */
  5065                                 publicIdentifier = longStrBufToString();
  5066                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
  5067                                 break doctypepublicidentifierdoublequotedloop;
  5068                             // continue stateloop;
  5069                             case '>':
  5070                                 /*
  5071                                  * U+003E GREATER-THAN SIGN (>) Parse error.
  5072                                  */
  5073                                 errGtInPublicId();
  5074                                 /*
  5075                                  * Set the DOCTYPE token's force-quirks flag to
  5076                                  * on.
  5077                                  */
  5078                                 forceQuirks = true;
  5079                                 /*
  5080                                  * Emit that DOCTYPE token.
  5081                                  */
  5082                                 publicIdentifier = longStrBufToString();
  5083                                 emitDoctypeToken(pos);
  5084                                 /*
  5085                                  * Switch to the data state.
  5086                                  */
  5087                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  5088                                 continue stateloop;
  5089                             case '\r':
  5090                                 appendLongStrBufCarriageReturn();
  5091                                 break stateloop;
  5092                             case '\n':
  5093                                 appendLongStrBufLineFeed();
  5094                                 continue;
  5095                             case '\u0000':
  5096                                 c = '\uFFFD';
  5097                                 // fall thru
  5098                             default:
  5099                                 /*
  5100                                  * Anything else Append the current input
  5101                                  * character to the current DOCTYPE token's
  5102                                  * public identifier.
  5103                                  */
  5104                                 appendLongStrBuf(c);
  5105                                 /*
  5106                                  * Stay in the DOCTYPE public identifier
  5107                                  * (double-quoted) state.
  5108                                  */
  5109                                 continue;
  5112                     // FALLTHRU DON'T REORDER
  5113                 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
  5114                     afterdoctypepublicidentifierloop: for (;;) {
  5115                         if (++pos == endPos) {
  5116                             break stateloop;
  5118                         c = checkChar(buf, pos);
  5119                         /*
  5120                          * Consume the next input character:
  5121                          */
  5122                         switch (c) {
  5123                             case '\r':
  5124                                 silentCarriageReturn();
  5125                                 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
  5126                                 break stateloop;
  5127                             case '\n':
  5128                                 silentLineFeed();
  5129                                 // fall thru
  5130                             case ' ':
  5131                             case '\t':
  5132                             case '\u000C':
  5133                                 /*
  5134                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  5135                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  5136                                  * Switch to the between DOCTYPE public and
  5137                                  * system identifiers state.
  5138                                  */
  5139                                 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
  5140                                 break afterdoctypepublicidentifierloop;
  5141                             // continue stateloop;
  5142                             case '>':
  5143                                 /*
  5144                                  * U+003E GREATER-THAN SIGN (>) Emit the current
  5145                                  * DOCTYPE token.
  5146                                  */
  5147                                 emitDoctypeToken(pos);
  5148                                 /*
  5149                                  * Switch to the data state.
  5150                                  */
  5151                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  5152                                 continue stateloop;
  5153                             case '"':
  5154                                 /*
  5155                                  * U+0022 QUOTATION MARK (") Parse error.
  5156                                  */
  5157                                 errNoSpaceBetweenPublicAndSystemIds();
  5158                                 /*
  5159                                  * Set the DOCTYPE token's system identifier to
  5160                                  * the empty string (not missing),
  5161                                  */
  5162                                 clearLongStrBuf();
  5163                                 /*
  5164                                  * then switch to the DOCTYPE system identifier
  5165                                  * (double-quoted) state.
  5166                                  */
  5167                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
  5168                                 continue stateloop;
  5169                             case '\'':
  5170                                 /*
  5171                                  * U+0027 APOSTROPHE (') Parse error.
  5172                                  */
  5173                                 errNoSpaceBetweenPublicAndSystemIds();
  5174                                 /*
  5175                                  * Set the DOCTYPE token's system identifier to
  5176                                  * the empty string (not missing),
  5177                                  */
  5178                                 clearLongStrBuf();
  5179                                 /*
  5180                                  * then switch to the DOCTYPE system identifier
  5181                                  * (single-quoted) state.
  5182                                  */
  5183                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
  5184                                 continue stateloop;
  5185                             default:
  5186                                 bogusDoctype();
  5187                                 /*
  5188                                  * Set the DOCTYPE token's force-quirks flag to
  5189                                  * on.
  5190                                  */
  5191                                 // done by bogusDoctype();
  5192                                 /*
  5193                                  * Switch to the bogus DOCTYPE state.
  5194                                  */
  5195                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  5196                                 continue stateloop;
  5199                     // FALLTHRU DON'T REORDER
  5200                 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
  5201                     betweendoctypepublicandsystemidentifiersloop: for (;;) {
  5202                         if (++pos == endPos) {
  5203                             break stateloop;
  5205                         c = checkChar(buf, pos);
  5206                         /*
  5207                          * Consume the next input character:
  5208                          */
  5209                         switch (c) {
  5210                             case '\r':
  5211                                 silentCarriageReturn();
  5212                                 break stateloop;
  5213                             case '\n':
  5214                                 silentLineFeed();
  5215                                 // fall thru
  5216                             case ' ':
  5217                             case '\t':
  5218                             case '\u000C':
  5219                                 /*
  5220                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  5221                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  5222                                  * in the between DOCTYPE public and system
  5223                                  * identifiers state.
  5224                                  */
  5225                                 continue;
  5226                             case '>':
  5227                                 /*
  5228                                  * U+003E GREATER-THAN SIGN (>) Emit the current
  5229                                  * DOCTYPE token.
  5230                                  */
  5231                                 emitDoctypeToken(pos);
  5232                                 /*
  5233                                  * Switch to the data state.
  5234                                  */
  5235                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  5236                                 continue stateloop;
  5237                             case '"':
  5238                                 /*
  5239                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE
  5240                                  * token's system identifier to the empty string
  5241                                  * (not missing),
  5242                                  */
  5243                                 clearLongStrBuf();
  5244                                 /*
  5245                                  * then switch to the DOCTYPE system identifier
  5246                                  * (double-quoted) state.
  5247                                  */
  5248                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
  5249                                 break betweendoctypepublicandsystemidentifiersloop;
  5250                             // continue stateloop;
  5251                             case '\'':
  5252                                 /*
  5253                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's
  5254                                  * system identifier to the empty string (not
  5255                                  * missing),
  5256                                  */
  5257                                 clearLongStrBuf();
  5258                                 /*
  5259                                  * then switch to the DOCTYPE system identifier
  5260                                  * (single-quoted) state.
  5261                                  */
  5262                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
  5263                                 continue stateloop;
  5264                             default:
  5265                                 bogusDoctype();
  5266                                 /*
  5267                                  * Set the DOCTYPE token's force-quirks flag to
  5268                                  * on.
  5269                                  */
  5270                                 // done by bogusDoctype();
  5271                                 /*
  5272                                  * Switch to the bogus DOCTYPE state.
  5273                                  */
  5274                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  5275                                 continue stateloop;
  5278                     // FALLTHRU DON'T REORDER
  5279                 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
  5280                     doctypesystemidentifierdoublequotedloop: for (;;) {
  5281                         if (++pos == endPos) {
  5282                             break stateloop;
  5284                         c = checkChar(buf, pos);
  5285                         /*
  5286                          * Consume the next input character:
  5287                          */
  5288                         switch (c) {
  5289                             case '"':
  5290                                 /*
  5291                                  * U+0022 QUOTATION MARK (") Switch to the after
  5292                                  * DOCTYPE system identifier state.
  5293                                  */
  5294                                 systemIdentifier = longStrBufToString();
  5295                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
  5296                                 continue stateloop;
  5297                             case '>':
  5298                                 /*
  5299                                  * U+003E GREATER-THAN SIGN (>) Parse error.
  5300                                  */
  5301                                 errGtInSystemId();
  5302                                 /*
  5303                                  * Set the DOCTYPE token's force-quirks flag to
  5304                                  * on.
  5305                                  */
  5306                                 forceQuirks = true;
  5307                                 /*
  5308                                  * Emit that DOCTYPE token.
  5309                                  */
  5310                                 systemIdentifier = longStrBufToString();
  5311                                 emitDoctypeToken(pos);
  5312                                 /*
  5313                                  * Switch to the data state.
  5314                                  */
  5315                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  5316                                 continue stateloop;
  5317                             case '\r':
  5318                                 appendLongStrBufCarriageReturn();
  5319                                 break stateloop;
  5320                             case '\n':
  5321                                 appendLongStrBufLineFeed();
  5322                                 continue;
  5323                             case '\u0000':
  5324                                 c = '\uFFFD';
  5325                                 // fall thru
  5326                             default:
  5327                                 /*
  5328                                  * Anything else Append the current input
  5329                                  * character to the current DOCTYPE token's
  5330                                  * system identifier.
  5331                                  */
  5332                                 appendLongStrBuf(c);
  5333                                 /*
  5334                                  * Stay in the DOCTYPE system identifier
  5335                                  * (double-quoted) state.
  5336                                  */
  5337                                 continue;
  5340                     // FALLTHRU DON'T REORDER
  5341                 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
  5342                     afterdoctypesystemidentifierloop: for (;;) {
  5343                         if (++pos == endPos) {
  5344                             break stateloop;
  5346                         c = checkChar(buf, pos);
  5347                         /*
  5348                          * Consume the next input character:
  5349                          */
  5350                         switch (c) {
  5351                             case '\r':
  5352                                 silentCarriageReturn();
  5353                                 break stateloop;
  5354                             case '\n':
  5355                                 silentLineFeed();
  5356                                 // fall thru
  5357                             case ' ':
  5358                             case '\t':
  5359                             case '\u000C':
  5360                                 /*
  5361                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  5362                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  5363                                  * in the after DOCTYPE system identifier state.
  5364                                  */
  5365                                 continue;
  5366                             case '>':
  5367                                 /*
  5368                                  * U+003E GREATER-THAN SIGN (>) Emit the current
  5369                                  * DOCTYPE token.
  5370                                  */
  5371                                 emitDoctypeToken(pos);
  5372                                 /*
  5373                                  * Switch to the data state.
  5374                                  */
  5375                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  5376                                 continue stateloop;
  5377                             default:
  5378                                 /*
  5379                                  * Switch to the bogus DOCTYPE state. (This does
  5380                                  * not set the DOCTYPE token's force-quirks flag
  5381                                  * to on.)
  5382                                  */
  5383                                 bogusDoctypeWithoutQuirks();
  5384                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  5385                                 break afterdoctypesystemidentifierloop;
  5386                             // continue stateloop;
  5389                     // FALLTHRU DON'T REORDER
  5390                 case BOGUS_DOCTYPE:
  5391                     for (;;) {
  5392                         if (reconsume) {
  5393                             reconsume = false;
  5394                         } else {
  5395                             if (++pos == endPos) {
  5396                                 break stateloop;
  5398                             c = checkChar(buf, pos);
  5400                         /*
  5401                          * Consume the next input character:
  5402                          */
  5403                         switch (c) {
  5404                             case '>':
  5405                                 /*
  5406                                  * U+003E GREATER-THAN SIGN (>) Emit that
  5407                                  * DOCTYPE token.
  5408                                  */
  5409                                 emitDoctypeToken(pos);
  5410                                 /*
  5411                                  * Switch to the data state.
  5412                                  */
  5413                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  5414                                 continue stateloop;
  5415                             case '\r':
  5416                                 silentCarriageReturn();
  5417                                 break stateloop;
  5418                             case '\n':
  5419                                 silentLineFeed();
  5420                                 // fall thru
  5421                             default:
  5422                                 /*
  5423                                  * Anything else Stay in the bogus DOCTYPE
  5424                                  * state.
  5425                                  */
  5426                                 continue;
  5429                     // XXX reorder point
  5430                 case DOCTYPE_YSTEM:
  5431                     doctypeystemloop: for (;;) {
  5432                         if (++pos == endPos) {
  5433                             break stateloop;
  5435                         c = checkChar(buf, pos);
  5436                         /*
  5437                          * Otherwise, if the six characters starting from the
  5438                          * current input character are an ASCII case-insensitive
  5439                          * match for the word "SYSTEM", then consume those
  5440                          * characters and switch to the after DOCTYPE system
  5441                          * keyword state.
  5442                          */
  5443                         if (index < 5) { // YSTEM.length
  5444                             char folded = c;
  5445                             if (c >= 'A' && c <= 'Z') {
  5446                                 folded += 0x20;
  5448                             if (folded != Tokenizer.YSTEM[index]) {
  5449                                 bogusDoctype();
  5450                                 reconsume = true;
  5451                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  5452                                 continue stateloop;
  5454                             index++;
  5455                             continue stateloop;
  5456                         } else {
  5457                             reconsume = true;
  5458                             state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);
  5459                             break doctypeystemloop;
  5460                             // continue stateloop;
  5463                     // FALLTHRU DON'T REORDER
  5464                 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
  5465                     afterdoctypesystemkeywordloop: for (;;) {
  5466                         if (reconsume) {
  5467                             reconsume = false;
  5468                         } else {
  5469                             if (++pos == endPos) {
  5470                                 break stateloop;
  5472                             c = checkChar(buf, pos);
  5474                         /*
  5475                          * Consume the next input character:
  5476                          */
  5477                         switch (c) {
  5478                             case '\r':
  5479                                 silentCarriageReturn();
  5480                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
  5481                                 break stateloop;
  5482                             case '\n':
  5483                                 silentLineFeed();
  5484                                 // fall thru
  5485                             case ' ':
  5486                             case '\t':
  5487                             case '\u000C':
  5488                                 /*
  5489                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  5490                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  5491                                  * Switch to the before DOCTYPE system
  5492                                  * identifier state.
  5493                                  */
  5494                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
  5495                                 break afterdoctypesystemkeywordloop;
  5496                             // FALL THROUGH continue stateloop
  5497                             case '"':
  5498                                 /*
  5499                                  * U+0022 QUOTATION MARK (") Parse Error.
  5500                                  */
  5501                                 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
  5502                                 /*
  5503                                  * Set the DOCTYPE token's system identifier to
  5504                                  * the empty string (not missing),
  5505                                  */
  5506                                 clearLongStrBuf();
  5507                                 /*
  5508                                  * then switch to the DOCTYPE system identifier
  5509                                  * (double-quoted) state.
  5510                                  */
  5511                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
  5512                                 continue stateloop;
  5513                             case '\'':
  5514                                 /*
  5515                                  * U+0027 APOSTROPHE (') Parse Error.
  5516                                  */
  5517                                 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
  5518                                 /*
  5519                                  * Set the DOCTYPE token's system identifier to
  5520                                  * the empty string (not missing),
  5521                                  */
  5522                                 clearLongStrBuf();
  5523                                 /*
  5524                                  * then switch to the DOCTYPE system identifier
  5525                                  * (single-quoted) state.
  5526                                  */
  5527                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
  5528                                 continue stateloop;
  5529                             case '>':
  5530                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
  5531                                 errExpectedPublicId();
  5532                                 /*
  5533                                  * Set the DOCTYPE token's force-quirks flag to
  5534                                  * on.
  5535                                  */
  5536                                 forceQuirks = true;
  5537                                 /*
  5538                                  * Emit that DOCTYPE token.
  5539                                  */
  5540                                 emitDoctypeToken(pos);
  5541                                 /*
  5542                                  * Switch to the data state.
  5543                                  */
  5544                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  5545                                 continue stateloop;
  5546                             default:
  5547                                 bogusDoctype();
  5548                                 /*
  5549                                  * Set the DOCTYPE token's force-quirks flag to
  5550                                  * on.
  5551                                  */
  5552                                 // done by bogusDoctype();
  5553                                 /*
  5554                                  * Switch to the bogus DOCTYPE state.
  5555                                  */
  5556                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  5557                                 continue stateloop;
  5560                     // FALLTHRU DON'T REORDER
  5561                 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
  5562                     beforedoctypesystemidentifierloop: for (;;) {
  5563                         if (++pos == endPos) {
  5564                             break stateloop;
  5566                         c = checkChar(buf, pos);
  5567                         /*
  5568                          * Consume the next input character:
  5569                          */
  5570                         switch (c) {
  5571                             case '\r':
  5572                                 silentCarriageReturn();
  5573                                 break stateloop;
  5574                             case '\n':
  5575                                 silentLineFeed();
  5576                                 // fall thru
  5577                             case ' ':
  5578                             case '\t':
  5579                             case '\u000C':
  5580                                 /*
  5581                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED
  5582                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  5583                                  * in the before DOCTYPE system identifier
  5584                                  * state.
  5585                                  */
  5586                                 continue;
  5587                             case '"':
  5588                                 /*
  5589                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE
  5590                                  * token's system identifier to the empty string
  5591                                  * (not missing),
  5592                                  */
  5593                                 clearLongStrBuf();
  5594                                 /*
  5595                                  * then switch to the DOCTYPE system identifier
  5596                                  * (double-quoted) state.
  5597                                  */
  5598                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
  5599                                 continue stateloop;
  5600                             case '\'':
  5601                                 /*
  5602                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's
  5603                                  * system identifier to the empty string (not
  5604                                  * missing),
  5605                                  */
  5606                                 clearLongStrBuf();
  5607                                 /*
  5608                                  * then switch to the DOCTYPE system identifier
  5609                                  * (single-quoted) state.
  5610                                  */
  5611                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
  5612                                 break beforedoctypesystemidentifierloop;
  5613                             // continue stateloop;
  5614                             case '>':
  5615                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */
  5616                                 errExpectedSystemId();
  5617                                 /*
  5618                                  * Set the DOCTYPE token's force-quirks flag to
  5619                                  * on.
  5620                                  */
  5621                                 forceQuirks = true;
  5622                                 /*
  5623                                  * Emit that DOCTYPE token.
  5624                                  */
  5625                                 emitDoctypeToken(pos);
  5626                                 /*
  5627                                  * Switch to the data state.
  5628                                  */
  5629                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  5630                                 continue stateloop;
  5631                             default:
  5632                                 bogusDoctype();
  5633                                 /*
  5634                                  * Set the DOCTYPE token's force-quirks flag to
  5635                                  * on.
  5636                                  */
  5637                                 // done by bogusDoctype();
  5638                                 /*
  5639                                  * Switch to the bogus DOCTYPE state.
  5640                                  */
  5641                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  5642                                 continue stateloop;
  5645                     // FALLTHRU DON'T REORDER
  5646                 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
  5647                     for (;;) {
  5648                         if (++pos == endPos) {
  5649                             break stateloop;
  5651                         c = checkChar(buf, pos);
  5652                         /*
  5653                          * Consume the next input character:
  5654                          */
  5655                         switch (c) {
  5656                             case '\'':
  5657                                 /*
  5658                                  * U+0027 APOSTROPHE (') Switch to the after
  5659                                  * DOCTYPE system identifier state.
  5660                                  */
  5661                                 systemIdentifier = longStrBufToString();
  5662                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
  5663                                 continue stateloop;
  5664                             case '>':
  5665                                 errGtInSystemId();
  5666                                 /*
  5667                                  * Set the DOCTYPE token's force-quirks flag to
  5668                                  * on.
  5669                                  */
  5670                                 forceQuirks = true;
  5671                                 /*
  5672                                  * Emit that DOCTYPE token.
  5673                                  */
  5674                                 systemIdentifier = longStrBufToString();
  5675                                 emitDoctypeToken(pos);
  5676                                 /*
  5677                                  * Switch to the data state.
  5678                                  */
  5679                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  5680                                 continue stateloop;
  5681                             case '\r':
  5682                                 appendLongStrBufCarriageReturn();
  5683                                 break stateloop;
  5684                             case '\n':
  5685                                 appendLongStrBufLineFeed();
  5686                                 continue;
  5687                             case '\u0000':
  5688                                 c = '\uFFFD';
  5689                                 // fall thru
  5690                             default:
  5691                                 /*
  5692                                  * Anything else Append the current input
  5693                                  * character to the current DOCTYPE token's
  5694                                  * system identifier.
  5695                                  */
  5696                                 appendLongStrBuf(c);
  5697                                 /*
  5698                                  * Stay in the DOCTYPE system identifier
  5699                                  * (single-quoted) state.
  5700                                  */
  5701                                 continue;
  5704                     // XXX reorder point
  5705                 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
  5706                     for (;;) {
  5707                         if (++pos == endPos) {
  5708                             break stateloop;
  5710                         c = checkChar(buf, pos);
  5711                         /*
  5712                          * Consume the next input character:
  5713                          */
  5714                         switch (c) {
  5715                             case '\'':
  5716                                 /*
  5717                                  * U+0027 APOSTROPHE (') Switch to the after
  5718                                  * DOCTYPE public identifier state.
  5719                                  */
  5720                                 publicIdentifier = longStrBufToString();
  5721                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
  5722                                 continue stateloop;
  5723                             case '>':
  5724                                 errGtInPublicId();
  5725                                 /*
  5726                                  * Set the DOCTYPE token's force-quirks flag to
  5727                                  * on.
  5728                                  */
  5729                                 forceQuirks = true;
  5730                                 /*
  5731                                  * Emit that DOCTYPE token.
  5732                                  */
  5733                                 publicIdentifier = longStrBufToString();
  5734                                 emitDoctypeToken(pos);
  5735                                 /*
  5736                                  * Switch to the data state.
  5737                                  */
  5738                                 state = transition(state, Tokenizer.DATA, reconsume, pos);
  5739                                 continue stateloop;
  5740                             case '\r':
  5741                                 appendLongStrBufCarriageReturn();
  5742                                 break stateloop;
  5743                             case '\n':
  5744                                 appendLongStrBufLineFeed();
  5745                                 continue;
  5746                             case '\u0000':
  5747                                 c = '\uFFFD';
  5748                                 // fall thru
  5749                             default:
  5750                                 /*
  5751                                  * Anything else Append the current input
  5752                                  * character to the current DOCTYPE token's
  5753                                  * public identifier.
  5754                                  */
  5755                                 appendLongStrBuf(c);
  5756                                 /*
  5757                                  * Stay in the DOCTYPE public identifier
  5758                                  * (single-quoted) state.
  5759                                  */
  5760                                 continue;
  5763                     // XXX reorder point
  5764                 case PROCESSING_INSTRUCTION:
  5765                     processinginstructionloop: for (;;) {
  5766                         if (++pos == endPos) {
  5767                             break stateloop;
  5769                         c = checkChar(buf, pos);
  5770                         switch (c) {
  5771                             case '?':
  5772                                 state = transition(
  5773                                         state,
  5774                                         Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,
  5775                                         reconsume, pos);
  5776                                 break processinginstructionloop;
  5777                             // continue stateloop;
  5778                             default:
  5779                                 continue;
  5782                 case PROCESSING_INSTRUCTION_QUESTION_MARK:
  5783                     if (++pos == endPos) {
  5784                         break stateloop;
  5786                     c = checkChar(buf, pos);
  5787                     switch (c) {
  5788                         case '>':
  5789                             state = transition(state, Tokenizer.DATA,
  5790                                     reconsume, pos);
  5791                             continue stateloop;
  5792                         default:
  5793                             state = transition(state,
  5794                                     Tokenizer.PROCESSING_INSTRUCTION,
  5795                                     reconsume, pos);
  5796                             continue stateloop;
  5798                     // END HOTSPOT WORKAROUND
  5801         flushChars(buf, pos);
  5802         /*
  5803          * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
  5804          */
  5805         // Save locals
  5806         stateSave = state;
  5807         returnStateSave = returnState;
  5808         return pos;
  5811     // HOTSPOT WORKAROUND INSERTION POINT
  5813     // [NOCPP[
  5815     protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
  5816         return to;
  5819     // ]NOCPP]
  5821     private void initDoctypeFields() {
  5822         doctypeName = "";
  5823         if (systemIdentifier != null) {
  5824             Portability.releaseString(systemIdentifier);
  5825             systemIdentifier = null;
  5827         if (publicIdentifier != null) {
  5828             Portability.releaseString(publicIdentifier);
  5829             publicIdentifier = null;
  5831         forceQuirks = false;
  5834     @Inline private void adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn()
  5835             throws SAXException {
  5836         silentCarriageReturn();
  5837         adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
  5840     @Inline private void adjustDoubleHyphenAndAppendToLongStrBufLineFeed()
  5841             throws SAXException {
  5842         silentLineFeed();
  5843         adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
  5846     @Inline private void appendLongStrBufLineFeed() {
  5847         silentLineFeed();
  5848         appendLongStrBuf('\n');
  5851     @Inline private void appendLongStrBufCarriageReturn() {
  5852         silentCarriageReturn();
  5853         appendLongStrBuf('\n');
  5856     @Inline protected void silentCarriageReturn() {
  5857         ++line;
  5858         lastCR = true;
  5861     @Inline protected void silentLineFeed() {
  5862         ++line;
  5865     private void emitCarriageReturn(@NoLength char[] buf, int pos)
  5866             throws SAXException {
  5867         silentCarriageReturn();
  5868         flushChars(buf, pos);
  5869         tokenHandler.characters(Tokenizer.LF, 0, 1);
  5870         cstart = Integer.MAX_VALUE;
  5873     private void emitReplacementCharacter(@NoLength char[] buf, int pos)
  5874             throws SAXException {
  5875         flushChars(buf, pos);
  5876         tokenHandler.zeroOriginatingReplacementCharacter();
  5877         cstart = pos + 1;
  5880     private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
  5881             throws SAXException {
  5882         flushChars(buf, pos);
  5883         tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
  5884         cstart = pos + 1;
  5887     private void setAdditionalAndRememberAmpersandLocation(char add) {
  5888         additional = add;
  5889         // [NOCPP[
  5890         ampersandLocation = new LocatorImpl(this);
  5891         // ]NOCPP]
  5894     private void bogusDoctype() throws SAXException {
  5895         errBogusDoctype();
  5896         forceQuirks = true;
  5899     private void bogusDoctypeWithoutQuirks() throws SAXException {
  5900         errBogusDoctype();
  5901         forceQuirks = false;
  5904     private void emitOrAppendStrBuf(int returnState) throws SAXException {
  5905         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  5906             appendStrBufToLongStrBuf();
  5907         } else {
  5908             emitStrBuf();
  5912     private void handleNcrValue(int returnState) throws SAXException {
  5913         /*
  5914          * If one or more characters match the range, then take them all and
  5915          * interpret the string of characters as a number (either hexadecimal or
  5916          * decimal as appropriate).
  5917          */
  5918         if (value <= 0xFFFF) {
  5919             if (value >= 0x80 && value <= 0x9f) {
  5920                 /*
  5921                  * If that number is one of the numbers in the first column of
  5922                  * the following table, then this is a parse error.
  5923                  */
  5924                 errNcrInC1Range();
  5925                 /*
  5926                  * Find the row with that number in the first column, and return
  5927                  * a character token for the Unicode character given in the
  5928                  * second column of that row.
  5929                  */
  5930                 @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
  5931                 emitOrAppendOne(val, returnState);
  5932                 // [NOCPP[
  5933             } else if (value == 0xC
  5934                     && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
  5935                 if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
  5936                     emitOrAppendOne(Tokenizer.SPACE, returnState);
  5937                 } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
  5938                     fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
  5940                 // ]NOCPP]
  5941             } else if (value == 0x0) {
  5942                 errNcrZero();
  5943                 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
  5944             } else if ((value & 0xF800) == 0xD800) {
  5945                 errNcrSurrogate();
  5946                 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
  5947             } else {
  5948                 /*
  5949                  * Otherwise, return a character token for the Unicode character
  5950                  * whose code point is that number.
  5951                  */
  5952                 char ch = (char) value;
  5953                 // [NOCPP[
  5954                 if (value == 0x0D) {
  5955                     errNcrCr();
  5956                 } else if ((value <= 0x0008) || (value == 0x000B)
  5957                         || (value >= 0x000E && value <= 0x001F)) {
  5958                     ch = errNcrControlChar(ch);
  5959                 } else if (value >= 0xFDD0 && value <= 0xFDEF) {
  5960                     errNcrUnassigned();
  5961                 } else if ((value & 0xFFFE) == 0xFFFE) {
  5962                     ch = errNcrNonCharacter(ch);
  5963                 } else if (value >= 0x007F && value <= 0x009F) {
  5964                     errNcrControlChar();
  5965                 } else {
  5966                     maybeWarnPrivateUse(ch);
  5968                 // ]NOCPP]
  5969                 bmpChar[0] = ch;
  5970                 emitOrAppendOne(bmpChar, returnState);
  5972         } else if (value <= 0x10FFFF) {
  5973             // [NOCPP[
  5974             maybeWarnPrivateUseAstral();
  5975             if ((value & 0xFFFE) == 0xFFFE) {
  5976                 errAstralNonCharacter(value);
  5978             // ]NOCPP]
  5979             astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
  5980             astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
  5981             emitOrAppendTwo(astralChar, returnState);
  5982         } else {
  5983             errNcrOutOfRange();
  5984             emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
  5988     public void eof() throws SAXException {
  5989         int state = stateSave;
  5990         int returnState = returnStateSave;
  5992         eofloop: for (;;) {
  5993             switch (state) {
  5994                 case SCRIPT_DATA_LESS_THAN_SIGN:
  5995                 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
  5996                     /*
  5997                      * Otherwise, emit a U+003C LESS-THAN SIGN character token
  5998                      */
  5999                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  6000                     /*
  6001                      * and reconsume the current input character in the data
  6002                      * state.
  6003                      */
  6004                     break eofloop;
  6005                 case TAG_OPEN:
  6006                     /*
  6007                      * The behavior of this state depends on the content model
  6008                      * flag.
  6009                      */
  6010                     /*
  6011                      * Anything else Parse error.
  6012                      */
  6013                     errEofAfterLt();
  6014                     /*
  6015                      * Emit a U+003C LESS-THAN SIGN character token
  6016                      */
  6017                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  6018                     /*
  6019                      * and reconsume the current input character in the data
  6020                      * state.
  6021                      */
  6022                     break eofloop;
  6023                 case RAWTEXT_RCDATA_LESS_THAN_SIGN:
  6024                     /*
  6025                      * Emit a U+003C LESS-THAN SIGN character token
  6026                      */
  6027                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  6028                     /*
  6029                      * and reconsume the current input character in the RCDATA
  6030                      * state.
  6031                      */
  6032                     break eofloop;
  6033                 case NON_DATA_END_TAG_NAME:
  6034                     /*
  6035                      * Emit a U+003C LESS-THAN SIGN character token, a U+002F
  6036                      * SOLIDUS character token,
  6037                      */
  6038                     tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
  6039                     /*
  6040                      * a character token for each of the characters in the
  6041                      * temporary buffer (in the order they were added to the
  6042                      * buffer),
  6043                      */
  6044                     emitStrBuf();
  6045                     /*
  6046                      * and reconsume the current input character in the RCDATA
  6047                      * state.
  6048                      */
  6049                     break eofloop;
  6050                 case CLOSE_TAG_OPEN:
  6051                     /* EOF Parse error. */
  6052                     errEofAfterLt();
  6053                     /*
  6054                      * Emit a U+003C LESS-THAN SIGN character token and a U+002F
  6055                      * SOLIDUS character token.
  6056                      */
  6057                     tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
  6058                     /*
  6059                      * Reconsume the EOF character in the data state.
  6060                      */
  6061                     break eofloop;
  6062                 case TAG_NAME:
  6063                     /*
  6064                      * EOF Parse error.
  6065                      */
  6066                     errEofInTagName();
  6067                     /*
  6068                      * Reconsume the EOF character in the data state.
  6069                      */
  6070                     break eofloop;
  6071                 case BEFORE_ATTRIBUTE_NAME:
  6072                 case AFTER_ATTRIBUTE_VALUE_QUOTED:
  6073                 case SELF_CLOSING_START_TAG:
  6074                     /* EOF Parse error. */
  6075                     errEofWithoutGt();
  6076                     /*
  6077                      * Reconsume the EOF character in the data state.
  6078                      */
  6079                     break eofloop;
  6080                 case ATTRIBUTE_NAME:
  6081                     /*
  6082                      * EOF Parse error.
  6083                      */
  6084                     errEofInAttributeName();
  6085                     /*
  6086                      * Reconsume the EOF character in the data state.
  6087                      */
  6088                     break eofloop;
  6089                 case AFTER_ATTRIBUTE_NAME:
  6090                 case BEFORE_ATTRIBUTE_VALUE:
  6091                     /* EOF Parse error. */
  6092                     errEofWithoutGt();
  6093                     /*
  6094                      * Reconsume the EOF character in the data state.
  6095                      */
  6096                     break eofloop;
  6097                 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
  6098                 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
  6099                 case ATTRIBUTE_VALUE_UNQUOTED:
  6100                     /* EOF Parse error. */
  6101                     errEofInAttributeValue();
  6102                     /*
  6103                      * Reconsume the EOF character in the data state.
  6104                      */
  6105                     break eofloop;
  6106                 case BOGUS_COMMENT:
  6107                     emitComment(0, 0);
  6108                     break eofloop;
  6109                 case BOGUS_COMMENT_HYPHEN:
  6110                     // [NOCPP[
  6111                     maybeAppendSpaceToBogusComment();
  6112                     // ]NOCPP]
  6113                     emitComment(0, 0);
  6114                     break eofloop;
  6115                 case MARKUP_DECLARATION_OPEN:
  6116                     errBogusComment();
  6117                     clearLongStrBuf();
  6118                     emitComment(0, 0);
  6119                     break eofloop;
  6120                 case MARKUP_DECLARATION_HYPHEN:
  6121                     errBogusComment();
  6122                     emitComment(0, 0);
  6123                     break eofloop;
  6124                 case MARKUP_DECLARATION_OCTYPE:
  6125                     if (index < 6) {
  6126                         errBogusComment();
  6127                         emitComment(0, 0);
  6128                     } else {
  6129                         /* EOF Parse error. */
  6130                         errEofInDoctype();
  6131                         /*
  6132                          * Create a new DOCTYPE token. Set its force-quirks flag
  6133                          * to on.
  6134                          */
  6135                         doctypeName = "";
  6136                         if (systemIdentifier != null) {
  6137                             Portability.releaseString(systemIdentifier);
  6138                             systemIdentifier = null;
  6140                         if (publicIdentifier != null) {
  6141                             Portability.releaseString(publicIdentifier);
  6142                             publicIdentifier = null;
  6144                         forceQuirks = true;
  6145                         /*
  6146                          * Emit the token.
  6147                          */
  6148                         emitDoctypeToken(0);
  6149                         /*
  6150                          * Reconsume the EOF character in the data state.
  6151                          */
  6152                         break eofloop;
  6154                     break eofloop;
  6155                 case COMMENT_START:
  6156                 case COMMENT:
  6157                     /*
  6158                      * EOF Parse error.
  6159                      */
  6160                     errEofInComment();
  6161                     /* Emit the comment token. */
  6162                     emitComment(0, 0);
  6163                     /*
  6164                      * Reconsume the EOF character in the data state.
  6165                      */
  6166                     break eofloop;
  6167                 case COMMENT_END:
  6168                     errEofInComment();
  6169                     /* Emit the comment token. */
  6170                     emitComment(2, 0);
  6171                     /*
  6172                      * Reconsume the EOF character in the data state.
  6173                      */
  6174                     break eofloop;
  6175                 case COMMENT_END_DASH:
  6176                 case COMMENT_START_DASH:
  6177                     errEofInComment();
  6178                     /* Emit the comment token. */
  6179                     emitComment(1, 0);
  6180                     /*
  6181                      * Reconsume the EOF character in the data state.
  6182                      */
  6183                     break eofloop;
  6184                 case COMMENT_END_BANG:
  6185                     errEofInComment();
  6186                     /* Emit the comment token. */
  6187                     emitComment(3, 0);
  6188                     /*
  6189                      * Reconsume the EOF character in the data state.
  6190                      */
  6191                     break eofloop;
  6192                 case DOCTYPE:
  6193                 case BEFORE_DOCTYPE_NAME:
  6194                     errEofInDoctype();
  6195                     /*
  6196                      * Create a new DOCTYPE token. Set its force-quirks flag to
  6197                      * on.
  6198                      */
  6199                     forceQuirks = true;
  6200                     /*
  6201                      * Emit the token.
  6202                      */
  6203                     emitDoctypeToken(0);
  6204                     /*
  6205                      * Reconsume the EOF character in the data state.
  6206                      */
  6207                     break eofloop;
  6208                 case DOCTYPE_NAME:
  6209                     errEofInDoctype();
  6210                     strBufToDoctypeName();
  6211                     /*
  6212                      * Set the DOCTYPE token's force-quirks flag to on.
  6213                      */
  6214                     forceQuirks = true;
  6215                     /*
  6216                      * Emit that DOCTYPE token.
  6217                      */
  6218                     emitDoctypeToken(0);
  6219                     /*
  6220                      * Reconsume the EOF character in the data state.
  6221                      */
  6222                     break eofloop;
  6223                 case DOCTYPE_UBLIC:
  6224                 case DOCTYPE_YSTEM:
  6225                 case AFTER_DOCTYPE_NAME:
  6226                 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
  6227                 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
  6228                 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
  6229                     errEofInDoctype();
  6230                     /*
  6231                      * Set the DOCTYPE token's force-quirks flag to on.
  6232                      */
  6233                     forceQuirks = true;
  6234                     /*
  6235                      * Emit that DOCTYPE token.
  6236                      */
  6237                     emitDoctypeToken(0);
  6238                     /*
  6239                      * Reconsume the EOF character in the data state.
  6240                      */
  6241                     break eofloop;
  6242                 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
  6243                 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
  6244                     /* EOF Parse error. */
  6245                     errEofInPublicId();
  6246                     /*
  6247                      * Set the DOCTYPE token's force-quirks flag to on.
  6248                      */
  6249                     forceQuirks = true;
  6250                     /*
  6251                      * Emit that DOCTYPE token.
  6252                      */
  6253                     publicIdentifier = longStrBufToString();
  6254                     emitDoctypeToken(0);
  6255                     /*
  6256                      * Reconsume the EOF character in the data state.
  6257                      */
  6258                     break eofloop;
  6259                 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
  6260                 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
  6261                 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
  6262                     errEofInDoctype();
  6263                     /*
  6264                      * Set the DOCTYPE token's force-quirks flag to on.
  6265                      */
  6266                     forceQuirks = true;
  6267                     /*
  6268                      * Emit that DOCTYPE token.
  6269                      */
  6270                     emitDoctypeToken(0);
  6271                     /*
  6272                      * Reconsume the EOF character in the data state.
  6273                      */
  6274                     break eofloop;
  6275                 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
  6276                 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
  6277                     /* EOF Parse error. */
  6278                     errEofInSystemId();
  6279                     /*
  6280                      * Set the DOCTYPE token's force-quirks flag to on.
  6281                      */
  6282                     forceQuirks = true;
  6283                     /*
  6284                      * Emit that DOCTYPE token.
  6285                      */
  6286                     systemIdentifier = longStrBufToString();
  6287                     emitDoctypeToken(0);
  6288                     /*
  6289                      * Reconsume the EOF character in the data state.
  6290                      */
  6291                     break eofloop;
  6292                 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
  6293                     errEofInDoctype();
  6294                     /*
  6295                      * Set the DOCTYPE token's force-quirks flag to on.
  6296                      */
  6297                     forceQuirks = true;
  6298                     /*
  6299                      * Emit that DOCTYPE token.
  6300                      */
  6301                     emitDoctypeToken(0);
  6302                     /*
  6303                      * Reconsume the EOF character in the data state.
  6304                      */
  6305                     break eofloop;
  6306                 case BOGUS_DOCTYPE:
  6307                     /*
  6308                      * Emit that DOCTYPE token.
  6309                      */
  6310                     emitDoctypeToken(0);
  6311                     /*
  6312                      * Reconsume the EOF character in the data state.
  6313                      */
  6314                     break eofloop;
  6315                 case CONSUME_CHARACTER_REFERENCE:
  6316                     /*
  6317                      * Unlike the definition in the spec, this state does not
  6318                      * return a value and never requires the caller to
  6319                      * backtrack. This state takes care of emitting characters
  6320                      * or appending to the current attribute value. It also
  6321                      * takes care of that in the case when consuming the entity
  6322                      * fails.
  6323                      */
  6324                     /*
  6325                      * This section defines how to consume an entity. This
  6326                      * definition is used when parsing entities in text and in
  6327                      * attributes.
  6329                      * The behavior depends on the identity of the next
  6330                      * character (the one immediately after the U+0026 AMPERSAND
  6331                      * character):
  6332                      */
  6334                     emitOrAppendStrBuf(returnState);
  6335                     state = returnState;
  6336                     continue;
  6337                 case CHARACTER_REFERENCE_HILO_LOOKUP:
  6338                     errNoNamedCharacterMatch();
  6339                     emitOrAppendStrBuf(returnState);
  6340                     state = returnState;
  6341                     continue;
  6342                 case CHARACTER_REFERENCE_TAIL:
  6343                     outer: for (;;) {
  6344                         char c = '\u0000';
  6345                         entCol++;
  6346                         /*
  6347                          * Consume the maximum number of characters possible,
  6348                          * with the consumed characters matching one of the
  6349                          * identifiers in the first column of the named
  6350                          * character references table (in a case-sensitive
  6351                          * manner).
  6352                          */
  6353                         hiloop: for (;;) {
  6354                             if (hi == -1) {
  6355                                 break hiloop;
  6357                             if (entCol == NamedCharacters.NAMES[hi].length()) {
  6358                                 break hiloop;
  6360                             if (entCol > NamedCharacters.NAMES[hi].length()) {
  6361                                 break outer;
  6362                             } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
  6363                                 hi--;
  6364                             } else {
  6365                                 break hiloop;
  6369                         loloop: for (;;) {
  6370                             if (hi < lo) {
  6371                                 break outer;
  6373                             if (entCol == NamedCharacters.NAMES[lo].length()) {
  6374                                 candidate = lo;
  6375                                 strBufMark = strBufLen;
  6376                                 lo++;
  6377                             } else if (entCol > NamedCharacters.NAMES[lo].length()) {
  6378                                 break outer;
  6379                             } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
  6380                                 lo++;
  6381                             } else {
  6382                                 break loloop;
  6385                         if (hi < lo) {
  6386                             break outer;
  6388                         continue;
  6391                     if (candidate == -1) {
  6392                         /*
  6393                          * If no match can be made, then this is a parse error.
  6394                          */
  6395                         errNoNamedCharacterMatch();
  6396                         emitOrAppendStrBuf(returnState);
  6397                         state = returnState;
  6398                         continue eofloop;
  6399                     } else {
  6400                         @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
  6401                         if (candidateName.length() == 0
  6402                                 || candidateName.charAt(candidateName.length() - 1) != ';') {
  6403                             /*
  6404                              * If the last character matched is not a U+003B
  6405                              * SEMICOLON (;), there is a parse error.
  6406                              */
  6407                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  6408                                 /*
  6409                                  * If the entity is being consumed as part of an
  6410                                  * attribute, and the last character matched is
  6411                                  * not a U+003B SEMICOLON (;),
  6412                                  */
  6413                                 char ch;
  6414                                 if (strBufMark == strBufLen) {
  6415                                     ch = '\u0000';
  6416                                 } else {
  6417                                     ch = strBuf[strBufMark];
  6419                                 if ((ch >= '0' && ch <= '9')
  6420                                         || (ch >= 'A' && ch <= 'Z')
  6421                                         || (ch >= 'a' && ch <= 'z')) {
  6422                                     /*
  6423                                      * and the next character is in the range
  6424                                      * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
  6425                                      * U+0041 LATIN CAPITAL LETTER A to U+005A
  6426                                      * LATIN CAPITAL LETTER Z, or U+0061 LATIN
  6427                                      * SMALL LETTER A to U+007A LATIN SMALL
  6428                                      * LETTER Z, then, for historical reasons,
  6429                                      * all the characters that were matched
  6430                                      * after the U+0026 AMPERSAND (&) must be
  6431                                      * unconsumed, and nothing is returned.
  6432                                      */
  6433                                     errNoNamedCharacterMatch();
  6434                                     appendStrBufToLongStrBuf();
  6435                                     state = returnState;
  6436                                     continue eofloop;
  6439                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  6440                                 errUnescapedAmpersandInterpretedAsCharacterReference();
  6441                             } else {
  6442                                 errNotSemicolonTerminated();
  6446                         /*
  6447                          * Otherwise, return a character token for the character
  6448                          * corresponding to the entity name (as given by the
  6449                          * second column of the named character references
  6450                          * table).
  6451                          */
  6452                         @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
  6453                         if (
  6454                         // [NOCPP[
  6455                         val.length == 1
  6456                         // ]NOCPP]
  6457                         // CPPONLY: val[1] == 0
  6458                         ) {
  6459                             emitOrAppendOne(val, returnState);
  6460                         } else {
  6461                             emitOrAppendTwo(val, returnState);
  6463                         // this is so complicated!
  6464                         if (strBufMark < strBufLen) {
  6465                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  6466                                 for (int i = strBufMark; i < strBufLen; i++) {
  6467                                     appendLongStrBuf(strBuf[i]);
  6469                             } else {
  6470                                 tokenHandler.characters(strBuf, strBufMark,
  6471                                         strBufLen - strBufMark);
  6474                         state = returnState;
  6475                         continue eofloop;
  6476                         /*
  6477                          * If the markup contains I'm &notit; I tell you, the
  6478                          * entity is parsed as "not", as in, I'm ¬it; I tell
  6479                          * you. But if the markup was I'm &notin; I tell you,
  6480                          * the entity would be parsed as "notin;", resulting in
  6481                          * I'm ∉ I tell you.
  6482                          */
  6484                 case CONSUME_NCR:
  6485                 case DECIMAL_NRC_LOOP:
  6486                 case HEX_NCR_LOOP:
  6487                     /*
  6488                      * If no characters match the range, then don't consume any
  6489                      * characters (and unconsume the U+0023 NUMBER SIGN
  6490                      * character and, if appropriate, the X character). This is
  6491                      * a parse error; nothing is returned.
  6493                      * Otherwise, if the next character is a U+003B SEMICOLON,
  6494                      * consume that too. If it isn't, there is a parse error.
  6495                      */
  6496                     if (!seenDigits) {
  6497                         errNoDigitsInNCR();
  6498                         emitOrAppendStrBuf(returnState);
  6499                         state = returnState;
  6500                         continue;
  6501                     } else {
  6502                         errCharRefLacksSemicolon();
  6504                     // WARNING previous state sets reconsume
  6505                     handleNcrValue(returnState);
  6506                     state = returnState;
  6507                     continue;
  6508                 case CDATA_RSQB:
  6509                     tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
  6510                     break eofloop;
  6511                 case CDATA_RSQB_RSQB:
  6512                     tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
  6513                     break eofloop;
  6514                 case DATA:
  6515                 default:
  6516                     break eofloop;
  6519         // case DATA:
  6520         /*
  6521          * EOF Emit an end-of-file token.
  6522          */
  6523         tokenHandler.eof();
  6524         return;
  6527     private void emitDoctypeToken(int pos) throws SAXException {
  6528         cstart = pos + 1;
  6529         tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
  6530                 forceQuirks);
  6531         // It is OK and sufficient to release these here, since
  6532         // there's no way out of the doctype states than through paths
  6533         // that call this method.
  6534         doctypeName = null;
  6535         Portability.releaseString(publicIdentifier);
  6536         publicIdentifier = null;
  6537         Portability.releaseString(systemIdentifier);
  6538         systemIdentifier = null;
  6541     @Inline protected char checkChar(@NoLength char[] buf, int pos)
  6542             throws SAXException {
  6543         return buf[pos];
  6546     public boolean internalEncodingDeclaration(String internalCharset)
  6547             throws SAXException {
  6548         if (encodingDeclarationHandler != null) {
  6549             return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
  6551         return false;
  6554     /**
  6555      * @param val
  6556      * @throws SAXException
  6557      */
  6558     private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
  6559             throws SAXException {
  6560         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  6561             appendLongStrBuf(val[0]);
  6562             appendLongStrBuf(val[1]);
  6563         } else {
  6564             tokenHandler.characters(val, 0, 2);
  6568     private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
  6569             throws SAXException {
  6570         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  6571             appendLongStrBuf(val[0]);
  6572         } else {
  6573             tokenHandler.characters(val, 0, 1);
  6577     public void end() throws SAXException {
  6578         strBuf = null;
  6579         longStrBuf = null;
  6580         doctypeName = null;
  6581         if (systemIdentifier != null) {
  6582             Portability.releaseString(systemIdentifier);
  6583             systemIdentifier = null;
  6585         if (publicIdentifier != null) {
  6586             Portability.releaseString(publicIdentifier);
  6587             publicIdentifier = null;
  6589         if (tagName != null) {
  6590             tagName.release();
  6591             tagName = null;
  6593         if (attributeName != null) {
  6594             attributeName.release();
  6595             attributeName = null;
  6597         tokenHandler.endTokenization();
  6598         if (attributes != null) {
  6599             // [NOCPP[
  6600             attributes = null;
  6601             // ]NOCPP]
  6602             // CPPONLY: attributes.clear(mappingLangToXmlLang);
  6606     public void requestSuspension() {
  6607         shouldSuspend = true;
  6610     // [NOCPP[
  6612     public void becomeConfident() {
  6613         confident = true;
  6616     /**
  6617      * Returns the nextCharOnNewLine.
  6619      * @return the nextCharOnNewLine
  6620      */
  6621     public boolean isNextCharOnNewLine() {
  6622         return false;
  6625     public boolean isPrevCR() {
  6626         return lastCR;
  6629     /**
  6630      * Returns the line.
  6632      * @return the line
  6633      */
  6634     public int getLine() {
  6635         return -1;
  6638     /**
  6639      * Returns the col.
  6641      * @return the col
  6642      */
  6643     public int getCol() {
  6644         return -1;
  6647     // ]NOCPP]
  6649     public boolean isInDataState() {
  6650         return (stateSave == DATA);
  6653     public void resetToDataState() {
  6654         strBufLen = 0;
  6655         longStrBufLen = 0;
  6656         stateSave = Tokenizer.DATA;
  6657         // line = 1; XXX line numbers
  6658         lastCR = false;
  6659         index = 0;
  6660         forceQuirks = false;
  6661         additional = '\u0000';
  6662         entCol = -1;
  6663         firstCharKey = -1;
  6664         lo = 0;
  6665         hi = 0; // will always be overwritten before use anyway
  6666         candidate = -1;
  6667         strBufMark = 0;
  6668         prevValue = -1;
  6669         value = 0;
  6670         seenDigits = false;
  6671         endTag = false;
  6672         shouldSuspend = false;
  6673         initDoctypeFields();
  6674         if (tagName != null) {
  6675             tagName.release();
  6676             tagName = null;
  6678         if (attributeName != null) {
  6679             attributeName.release();
  6680             attributeName = null;
  6682         if (newAttributesEachTime) {
  6683             if (attributes != null) {
  6684                 Portability.delete(attributes);
  6685                 attributes = null;
  6690     public void loadState(Tokenizer other) throws SAXException {
  6691         strBufLen = other.strBufLen;
  6692         if (strBufLen > strBuf.length) {
  6693             strBuf = new char[strBufLen];
  6695         System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);
  6697         longStrBufLen = other.longStrBufLen;
  6698         if (longStrBufLen > longStrBuf.length) {
  6699             longStrBuf = new char[longStrBufLen];
  6701         System.arraycopy(other.longStrBuf, 0, longStrBuf, 0, longStrBufLen);
  6703         stateSave = other.stateSave;
  6704         returnStateSave = other.returnStateSave;
  6705         endTagExpectation = other.endTagExpectation;
  6706         endTagExpectationAsArray = other.endTagExpectationAsArray;
  6707         // line = 1; XXX line numbers
  6708         lastCR = other.lastCR;
  6709         index = other.index;
  6710         forceQuirks = other.forceQuirks;
  6711         additional = other.additional;
  6712         entCol = other.entCol;
  6713         firstCharKey = other.firstCharKey;
  6714         lo = other.lo;
  6715         hi = other.hi;
  6716         candidate = other.candidate;
  6717         strBufMark = other.strBufMark;
  6718         prevValue = other.prevValue;
  6719         value = other.value;
  6720         seenDigits = other.seenDigits;
  6721         endTag = other.endTag;
  6722         shouldSuspend = false;
  6724         if (other.doctypeName == null) {
  6725             doctypeName = null;
  6726         } else {
  6727             doctypeName = Portability.newLocalFromLocal(other.doctypeName,
  6728                     interner);
  6731         Portability.releaseString(systemIdentifier);
  6732         if (other.systemIdentifier == null) {
  6733             systemIdentifier = null;
  6734         } else {
  6735             systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
  6738         Portability.releaseString(publicIdentifier);
  6739         if (other.publicIdentifier == null) {
  6740             publicIdentifier = null;
  6741         } else {
  6742             publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
  6745         if (tagName != null) {
  6746             tagName.release();
  6748         if (other.tagName == null) {
  6749             tagName = null;
  6750         } else {
  6751             tagName = other.tagName.cloneElementName(interner);
  6754         if (attributeName != null) {
  6755             attributeName.release();
  6757         if (other.attributeName == null) {
  6758             attributeName = null;
  6759         } else {
  6760             attributeName = other.attributeName.cloneAttributeName(interner);
  6763         Portability.delete(attributes);
  6764         if (other.attributes == null) {
  6765             attributes = null;
  6766         } else {
  6767             attributes = other.attributes.cloneAttributes(interner);
  6771     public void initializeWithoutStarting() throws SAXException {
  6772         confident = false;
  6773         strBuf = new char[64];
  6774         longStrBuf = new char[1024];
  6775         line = 1;
  6776         // [NOCPP[
  6777         html4 = false;
  6778         metaBoundaryPassed = false;
  6779         wantsComments = tokenHandler.wantsComments();
  6780         if (!newAttributesEachTime) {
  6781             attributes = new HtmlAttributes(mappingLangToXmlLang);
  6783         // ]NOCPP]
  6784         resetToDataState();
  6787     protected void errGarbageAfterLtSlash() throws SAXException {
  6790     protected void errLtSlashGt() throws SAXException {
  6793     protected void errWarnLtSlashInRcdata() throws SAXException {
  6796     protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
  6799     protected void errCharRefLacksSemicolon() throws SAXException {
  6802     protected void errNoDigitsInNCR() throws SAXException {
  6805     protected void errGtInSystemId() throws SAXException {
  6808     protected void errGtInPublicId() throws SAXException {
  6811     protected void errNamelessDoctype() throws SAXException {
  6814     protected void errConsecutiveHyphens() throws SAXException {
  6817     protected void errPrematureEndOfComment() throws SAXException {
  6820     protected void errBogusComment() throws SAXException {
  6823     protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
  6826     protected void errSlashNotFollowedByGt() throws SAXException {
  6829     protected void errHtml4XmlVoidSyntax() throws SAXException {
  6832     protected void errNoSpaceBetweenAttributes() throws SAXException {
  6835     protected void errHtml4NonNameInUnquotedAttribute(char c)
  6836             throws SAXException {
  6839     protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
  6840             throws SAXException {
  6843     protected void errAttributeValueMissing() throws SAXException {
  6846     protected void errBadCharBeforeAttributeNameOrNull(char c)
  6847             throws SAXException {
  6850     protected void errEqualsSignBeforeAttributeName() throws SAXException {
  6853     protected void errBadCharAfterLt(char c) throws SAXException {
  6856     protected void errLtGt() throws SAXException {
  6859     protected void errProcessingInstruction() throws SAXException {
  6862     protected void errUnescapedAmpersandInterpretedAsCharacterReference()
  6863             throws SAXException {
  6866     protected void errNotSemicolonTerminated() throws SAXException {
  6869     protected void errNoNamedCharacterMatch() throws SAXException {
  6872     protected void errQuoteBeforeAttributeName(char c) throws SAXException {
  6875     protected void errQuoteOrLtInAttributeNameOrNull(char c)
  6876             throws SAXException {
  6879     protected void errExpectedPublicId() throws SAXException {
  6882     protected void errBogusDoctype() throws SAXException {
  6885     protected void maybeWarnPrivateUseAstral() throws SAXException {
  6888     protected void maybeWarnPrivateUse(char ch) throws SAXException {
  6891     protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
  6892             throws SAXException {
  6895     protected void maybeErrSlashInEndTag(boolean selfClosing)
  6896             throws SAXException {
  6899     protected char errNcrNonCharacter(char ch) throws SAXException {
  6900         return ch;
  6903     protected void errAstralNonCharacter(int ch) throws SAXException {
  6906     protected void errNcrSurrogate() throws SAXException {
  6909     protected char errNcrControlChar(char ch) throws SAXException {
  6910         return ch;
  6913     protected void errNcrCr() throws SAXException {
  6916     protected void errNcrInC1Range() throws SAXException {
  6919     protected void errEofInPublicId() throws SAXException {
  6922     protected void errEofInComment() throws SAXException {
  6925     protected void errEofInDoctype() throws SAXException {
  6928     protected void errEofInAttributeValue() throws SAXException {
  6931     protected void errEofInAttributeName() throws SAXException {
  6934     protected void errEofWithoutGt() throws SAXException {
  6937     protected void errEofInTagName() throws SAXException {
  6940     protected void errEofInEndTag() throws SAXException {
  6943     protected void errEofAfterLt() throws SAXException {
  6946     protected void errNcrOutOfRange() throws SAXException {
  6949     protected void errNcrUnassigned() throws SAXException {
  6952     protected void errDuplicateAttribute() throws SAXException {
  6955     protected void errEofInSystemId() throws SAXException {
  6958     protected void errExpectedSystemId() throws SAXException {
  6961     protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
  6964     protected void errHyphenHyphenBang() throws SAXException {
  6967     protected void errNcrControlChar() throws SAXException {
  6970     protected void errNcrZero() throws SAXException {
  6973     protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
  6974             throws SAXException {
  6977     protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
  6980     protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
  6981             throws SAXException {
  6984     protected void noteAttributeWithoutValue() throws SAXException {
  6987     protected void noteUnquotedAttributeValue() throws SAXException {
  6990     /**
  6991      * Sets the encodingDeclarationHandler.
  6993      * @param encodingDeclarationHandler
  6994      *            the encodingDeclarationHandler to set
  6995      */
  6996     public void setEncodingDeclarationHandler(
  6997             EncodingDeclarationHandler encodingDeclarationHandler) {
  6998         this.encodingDeclarationHandler = encodingDeclarationHandler;
  7001     void destructor() {
  7002         // The translator will write refcount tracing stuff here
  7003         Portability.delete(attributes);
  7004         attributes = null;
  7007     // [NOCPP[
  7009     /**
  7010      * Sets an offset to be added to the position reported to 
  7011      * <code>TransitionHandler</code>.
  7013      * @param offset the offset
  7014      */
  7015     public void setTransitionBaseOffset(int offset) {
  7019     // ]NOCPP]

mercurial