The Tor Browser: parser/html/javasrc/Tokenizer.java@925c144e1f1f

     1 /*

     2  * Copyright (c) 2005-2007 Henri Sivonen

     3  * Copyright (c) 2007-2013 Mozilla Foundation

     4  * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla

     5  * Foundation, and Opera Software ASA.

     6  *

     7  * Permission is hereby granted, free of charge, to any person obtaining a

     8  * copy of this software and associated documentation files (the "Software"),

     9  * to deal in the Software without restriction, including without limitation

    10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,

    11  * and/or sell copies of the Software, and to permit persons to whom the

    12  * Software is furnished to do so, subject to the following conditions:

    13  *

    14  * The above copyright notice and this permission notice shall be included in

    15  * all copies or substantial portions of the Software.

    16  *

    17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

    18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

    19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL

    20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

    21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING

    22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER

    23  * DEALINGS IN THE SOFTWARE.

    24  */

    26 /*

    27  * The comments following this one that use the same comment syntax as this

    28  * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007

    29  * amended as of June 18 2008 and May 31 2010.

    30  * That document came with this statement:

    31  * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and

    32  * Opera Software ASA. You are granted a license to use, reproduce and

    33  * create derivative works of this document."

    34  */

    36 package nu.validator.htmlparser.impl;

    38 import nu.validator.htmlparser.annotation.Auto;

    39 import nu.validator.htmlparser.annotation.CharacterName;

    40 import nu.validator.htmlparser.annotation.Const;

    41 import nu.validator.htmlparser.annotation.Inline;

    42 import nu.validator.htmlparser.annotation.Local;

    43 import nu.validator.htmlparser.annotation.NoLength;

    44 import nu.validator.htmlparser.common.EncodingDeclarationHandler;

    45 import nu.validator.htmlparser.common.Interner;

    46 import nu.validator.htmlparser.common.TokenHandler;

    47 import nu.validator.htmlparser.common.XmlViolationPolicy;

    49 import org.xml.sax.ErrorHandler;

    50 import org.xml.sax.Locator;

    51 import org.xml.sax.SAXException;

    52 import org.xml.sax.SAXParseException;

    54 /**

    55  * An implementation of

    56  * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html

    57  *

    58  * This class implements the <code>Locator</code> interface. This is not an

    59  * incidental implementation detail: Users of this class are encouraged to make

    60  * use of the <code>Locator</code> nature.

    61  *

    62  * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer

    63  * can be configured to treat these conditions as fatal or to coerce the infoset

    64  * to something that XML 1.0 allows.

    65  *

    66  * @version $Id$

    67  * @author hsivonen

    68  */

    69 public class Tokenizer implements Locator {

    71     private static final int DATA_AND_RCDATA_MASK = ~1;

    73     public static final int DATA = 0;

    75     public static final int RCDATA = 1;

    77     public static final int SCRIPT_DATA = 2;

    79     public static final int RAWTEXT = 3;

    81     public static final int SCRIPT_DATA_ESCAPED = 4;

    83     public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;

    85     public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;

    87     public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;

    89     public static final int PLAINTEXT = 8;

    91     public static final int TAG_OPEN = 9;

    93     public static final int CLOSE_TAG_OPEN = 10;

    95     public static final int TAG_NAME = 11;

    97     public static final int BEFORE_ATTRIBUTE_NAME = 12;

    99     public static final int ATTRIBUTE_NAME = 13;

   101     public static final int AFTER_ATTRIBUTE_NAME = 14;

   103     public static final int BEFORE_ATTRIBUTE_VALUE = 15;

   105     public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;

   107     public static final int BOGUS_COMMENT = 17;

   109     public static final int MARKUP_DECLARATION_OPEN = 18;

   111     public static final int DOCTYPE = 19;

   113     public static final int BEFORE_DOCTYPE_NAME = 20;

   115     public static final int DOCTYPE_NAME = 21;

   117     public static final int AFTER_DOCTYPE_NAME = 22;

   119     public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;

   121     public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;

   123     public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;

   125     public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;

   127     public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;

   129     public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;

   131     public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;

   133     public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;

   135     public static final int BOGUS_DOCTYPE = 31;

   137     public static final int COMMENT_START = 32;

   139     public static final int COMMENT_START_DASH = 33;

   141     public static final int COMMENT = 34;

   143     public static final int COMMENT_END_DASH = 35;

   145     public static final int COMMENT_END = 36;

   147     public static final int COMMENT_END_BANG = 37;

   149     public static final int NON_DATA_END_TAG_NAME = 38;

   151     public static final int MARKUP_DECLARATION_HYPHEN = 39;

   153     public static final int MARKUP_DECLARATION_OCTYPE = 40;

   155     public static final int DOCTYPE_UBLIC = 41;

   157     public static final int DOCTYPE_YSTEM = 42;

   159     public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;

   161     public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;

   163     public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;

   165     public static final int CONSUME_CHARACTER_REFERENCE = 46;

   167     public static final int CONSUME_NCR = 47;

   169     public static final int CHARACTER_REFERENCE_TAIL = 48;

   171     public static final int HEX_NCR_LOOP = 49;

   173     public static final int DECIMAL_NRC_LOOP = 50;

   175     public static final int HANDLE_NCR_VALUE = 51;

   177     public static final int HANDLE_NCR_VALUE_RECONSUME = 52;

   179     public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;

   181     public static final int SELF_CLOSING_START_TAG = 54;

   183     public static final int CDATA_START = 55;

   185     public static final int CDATA_SECTION = 56;

   187     public static final int CDATA_RSQB = 57;

   189     public static final int CDATA_RSQB_RSQB = 58;

   191     public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;

   193     public static final int SCRIPT_DATA_ESCAPE_START = 60;

   195     public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;

   197     public static final int SCRIPT_DATA_ESCAPED_DASH = 62;

   199     public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;

   201     public static final int BOGUS_COMMENT_HYPHEN = 64;

   203     public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;

   205     public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;

   207     public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;

   209     public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;

   211     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;

   213     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;

   215     public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;

   217     public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;

   219     public static final int PROCESSING_INSTRUCTION = 73;

   221     public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;

   223     /**

   224      * Magic value for UTF-16 operations.

   225      */

   226     private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));

   228     /**

   229      * UTF-16 code unit array containing less than and greater than for emitting

   230      * those characters on certain parse errors.

   231      */

   232     private static final @NoLength char[] LT_GT = { '<', '>' };

   234     /**

   235      * UTF-16 code unit array containing less than and solidus for emitting

   236      * those characters on certain parse errors.

   237      */

   238     private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };

   240     /**

   241      * UTF-16 code unit array containing ]] for emitting those characters on

   242      * state transitions.

   243      */

   244     private static final @NoLength char[] RSQB_RSQB = { ']', ']' };

   246     /**

   247      * Array version of U+FFFD.

   248      */

   249     private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };

   251     // [NOCPP[

   253     /**

   254      * Array version of space.

   255      */

   256     private static final @NoLength char[] SPACE = { ' ' };

   258     // ]NOCPP]

   260     /**

   261      * Array version of line feed.

   262      */

   263     private static final @NoLength char[] LF = { '\n' };

   265     /**

   266      * Buffer growth parameter.

   267      */

   268     private static final int BUFFER_GROW_BY = 1024;

   270     /**

   271      * "CDATA[" as <code>char[]</code>

   272      */

   273     private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T',

   274             'A', '[' };

   276     /**

   277      * "octype" as <code>char[]</code>

   278      */

   279     private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p',

   280             'e' };

   282     /**

   283      * "ublic" as <code>char[]</code>

   284      */

   285     private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' };

   287     /**

   288      * "ystem" as <code>char[]</code>

   289      */

   290     private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' };

   292     private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };

   294     private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };

   296     private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };

   298     private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',

   299             'e', 'x', 't' };

   301     private static final char[] XMP_ARR = { 'x', 'm', 'p' };

   303     private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',

   304             'e', 'a' };

   306     private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };

   308     private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',

   309             'd' };

   311     private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',

   312             'p', 't' };

   314     private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',

   315             'e', 's' };

   317     /**

   318      * The token handler.

   319      */

   320     protected final TokenHandler tokenHandler;

   322     protected EncodingDeclarationHandler encodingDeclarationHandler;

   324     // [NOCPP[

   326     /**

   327      * The error handler.

   328      */

   329     protected ErrorHandler errorHandler;

   331     // ]NOCPP]

   333     /**

   334      * Whether the previous char read was CR.

   335      */

   336     protected boolean lastCR;

   338     protected int stateSave;

   340     private int returnStateSave;

   342     protected int index;

   344     private boolean forceQuirks;

   346     private char additional;

   348     private int entCol;

   350     private int firstCharKey;

   352     private int lo;

   354     private int hi;

   356     private int candidate;

   358     private int strBufMark;

   360     private int prevValue;

   362     protected int value;

   364     private boolean seenDigits;

   366     protected int cstart;

   368     /**

   369      * The SAX public id for the resource being tokenized. (Only passed back

   370      * as part of locator data.)

   371      */

   372     private String publicId;

   374     /**

   375      * The SAX system id for the resource being tokenized. (Only passed back

   376      * as part of locator data.)

   377      */

   378     private String systemId;

   380     /**

   381      * Buffer for short identifiers.

   382      */

   383     private @Auto char[] strBuf;

   385     /**

   386      * Number of significant <code>char</code>s in <code>strBuf</code>.

   387      */

   388     private int strBufLen;

   390     /**

   391      * <code>-1</code> to indicate that <code>strBuf</code> is used or otherwise

   392      * an offset to the main buffer.

   393      */

   394     // private int strBufOffset = -1;

   395     /**

   396      * Buffer for long strings.

   397      */

   398     private @Auto char[] longStrBuf;

   400     /**

   401      * Number of significant <code>char</code>s in <code>longStrBuf</code>.

   402      */

   403     private int longStrBufLen;

   405     /**

   406      * <code>-1</code> to indicate that <code>longStrBuf</code> is used or

   407      * otherwise an offset to the main buffer.

   408      */

   409     // private int longStrBufOffset = -1;

   411     /**

   412      * Buffer for expanding NCRs falling into the Basic Multilingual Plane.

   413      */

   414     private final @Auto char[] bmpChar;

   416     /**

   417      * Buffer for expanding astral NCRs.

   418      */

   419     private final @Auto char[] astralChar;

   421     /**

   422      * The element whose end tag closes the current CDATA or RCDATA element.

   423      */

   424     protected ElementName endTagExpectation = null;

   426     private char[] endTagExpectationAsArray; // not @Auto!

   428     /**

   429      * <code>true</code> if tokenizing an end tag

   430      */

   431     protected boolean endTag;

   433     /**

   434      * The current tag token name.

   435      */

   436     private ElementName tagName = null;

   438     /**

   439      * The current attribute name.

   440      */

   441     protected AttributeName attributeName = null;

   443     // [NOCPP[

   445     /**

   446      * Whether comment tokens are emitted.

   447      */

   448     private boolean wantsComments = false;

   450     /**

   451      * <code>true</code> when HTML4-specific additional errors are requested.

   452      */

   453     protected boolean html4;

   455     /**

   456      * Whether the stream is past the first 512 bytes.

   457      */

   458     private boolean metaBoundaryPassed;

   460     // ]NOCPP]

   462     /**

   463      * The name of the current doctype token.

   464      */

   465     private @Local String doctypeName;

   467     /**

   468      * The public id of the current doctype token.

   469      */

   470     private String publicIdentifier;

   472     /**

   473      * The system id of the current doctype token.

   474      */

   475     private String systemIdentifier;

   477     /**

   478      * The attribute holder.

   479      */

   480     private HtmlAttributes attributes;

   482     // [NOCPP[

   484     /**

   485      * The policy for vertical tab and form feed.

   486      */

   487     private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;

   489     /**

   490      * The policy for comments.

   491      */

   492     private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;

   494     private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;

   496     private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;

   498     private boolean html4ModeCompatibleWithXhtml1Schemata;

   500     private int mappingLangToXmlLang;

   502     // ]NOCPP]

   504     private final boolean newAttributesEachTime;

   506     private boolean shouldSuspend;

   508     protected boolean confident;

   510     private int line;

   512     private Interner interner;

   514     // CPPONLY: private boolean viewingXmlSource;

   516     // [NOCPP[

   518     protected LocatorImpl ampersandLocation;

   520     public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {

   521         this.tokenHandler = tokenHandler;

   522         this.encodingDeclarationHandler = null;

   523         this.newAttributesEachTime = newAttributesEachTime;

   524         this.bmpChar = new char[1];

   525         this.astralChar = new char[2];

   526         this.tagName = null;

   527         this.attributeName = null;

   528         this.doctypeName = null;

   529         this.publicIdentifier = null;

   530         this.systemIdentifier = null;

   531         this.attributes = null;

   532     }

   534     // ]NOCPP]

   536     /**

   537      * The constructor.

   538      *

   539      * @param tokenHandler

   540      *            the handler for receiving tokens

   541      */

   542     public Tokenizer(TokenHandler tokenHandler

   543     // CPPONLY: , boolean viewingXmlSource

   544     ) {

   545         this.tokenHandler = tokenHandler;

   546         this.encodingDeclarationHandler = null;

   547         // [NOCPP[

   548         this.newAttributesEachTime = false;

   549         // ]NOCPP]

   550         this.bmpChar = new char[1];

   551         this.astralChar = new char[2];

   552         this.tagName = null;

   553         this.attributeName = null;

   554         this.doctypeName = null;

   555         this.publicIdentifier = null;

   556         this.systemIdentifier = null;

   557         // [NOCPP[

   558         this.attributes = null;

   559         // ]NOCPP]

   560         // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;

   561         // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();

   562         // CPPONLY: this.viewingXmlSource = viewingXmlSource;

   563     }

   565     public void setInterner(Interner interner) {

   566         this.interner = interner;

   567     }

   569     public void initLocation(String newPublicId, String newSystemId) {

   570         this.systemId = newSystemId;

   571         this.publicId = newPublicId;

   573     }

   575     // CPPONLY: boolean isViewingXmlSource() {

   576     // CPPONLY: return viewingXmlSource;

   577     // CPPONLY: }

   579     // [NOCPP[

   581     /**

   582      * Returns the mappingLangToXmlLang.

   583      *

   584      * @return the mappingLangToXmlLang

   585      */

   586     public boolean isMappingLangToXmlLang() {

   587         return mappingLangToXmlLang == AttributeName.HTML_LANG;

   588     }

   590     /**

   591      * Sets the mappingLangToXmlLang.

   592      *

   593      * @param mappingLangToXmlLang

   594      *            the mappingLangToXmlLang to set

   595      */

   596     public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {

   597         this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG

   598                 : AttributeName.HTML;

   599     }

   601     /**

   602      * Sets the error handler.

   603      *

   604      * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)

   605      */

   606     public void setErrorHandler(ErrorHandler eh) {

   607         this.errorHandler = eh;

   608     }

   610     public ErrorHandler getErrorHandler() {

   611         return this.errorHandler;

   612     }

   614     /**

   615      * Sets the commentPolicy.

   616      *

   617      * @param commentPolicy

   618      *            the commentPolicy to set

   619      */

   620     public void setCommentPolicy(XmlViolationPolicy commentPolicy) {

   621         this.commentPolicy = commentPolicy;

   622     }

   624     /**

   625      * Sets the contentNonXmlCharPolicy.

   626      *

   627      * @param contentNonXmlCharPolicy

   628      *            the contentNonXmlCharPolicy to set

   629      */

   630     public void setContentNonXmlCharPolicy(

   631             XmlViolationPolicy contentNonXmlCharPolicy) {

   632         if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {

   633             throw new IllegalArgumentException(

   634                     "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");

   635         }

   636     }

   638     /**

   639      * Sets the contentSpacePolicy.

   640      *

   641      * @param contentSpacePolicy

   642      *            the contentSpacePolicy to set

   643      */

   644     public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {

   645         this.contentSpacePolicy = contentSpacePolicy;

   646     }

   648     /**

   649      * Sets the xmlnsPolicy.

   650      *

   651      * @param xmlnsPolicy

   652      *            the xmlnsPolicy to set

   653      */

   654     public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {

   655         if (xmlnsPolicy == XmlViolationPolicy.FATAL) {

   656             throw new IllegalArgumentException("Can't use FATAL here.");

   657         }

   658         this.xmlnsPolicy = xmlnsPolicy;

   659     }

   661     public void setNamePolicy(XmlViolationPolicy namePolicy) {

   662         this.namePolicy = namePolicy;

   663     }

   665     /**

   666      * Sets the html4ModeCompatibleWithXhtml1Schemata.

   667      *

   668      * @param html4ModeCompatibleWithXhtml1Schemata

   669      *            the html4ModeCompatibleWithXhtml1Schemata to set

   670      */

   671     public void setHtml4ModeCompatibleWithXhtml1Schemata(

   672             boolean html4ModeCompatibleWithXhtml1Schemata) {

   673         this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;

   674     }

   676     // ]NOCPP]

   678     // For the token handler to call

   679     /**

   680      * Sets the tokenizer state and the associated element name. This should

   681      * only ever used to put the tokenizer into one of the states that have

   682      * a special end tag expectation.

   683      *

   684      * @param specialTokenizerState

   685      *            the tokenizer state to set

   686      * @param endTagExpectation

   687      *            the expected end tag for transitioning back to normal

   688      */

   689     public void setStateAndEndTagExpectation(int specialTokenizerState,

   690             @Local String endTagExpectation) {

   691         this.stateSave = specialTokenizerState;

   692         if (specialTokenizerState == Tokenizer.DATA) {

   693             return;

   694         }

   695         @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);

   696         this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,

   697                 asArray.length, interner);

   698         endTagExpectationToArray();

   699     }

   701     /**

   702      * Sets the tokenizer state and the associated element name. This should

   703      * only ever used to put the tokenizer into one of the states that have

   704      * a special end tag expectation.

   705      *

   706      * @param specialTokenizerState

   707      *            the tokenizer state to set

   708      * @param endTagExpectation

   709      *            the expected end tag for transitioning back to normal

   710      */

   711     public void setStateAndEndTagExpectation(int specialTokenizerState,

   712             ElementName endTagExpectation) {

   713         this.stateSave = specialTokenizerState;

   714         this.endTagExpectation = endTagExpectation;

   715         endTagExpectationToArray();

   716     }

   718     private void endTagExpectationToArray() {

   719         switch (endTagExpectation.getGroup()) {

   720             case TreeBuilder.TITLE:

   721                 endTagExpectationAsArray = TITLE_ARR;

   722                 return;

   723             case TreeBuilder.SCRIPT:

   724                 endTagExpectationAsArray = SCRIPT_ARR;

   725                 return;

   726             case TreeBuilder.STYLE:

   727                 endTagExpectationAsArray = STYLE_ARR;

   728                 return;

   729             case TreeBuilder.PLAINTEXT:

   730                 endTagExpectationAsArray = PLAINTEXT_ARR;

   731                 return;

   732             case TreeBuilder.XMP:

   733                 endTagExpectationAsArray = XMP_ARR;

   734                 return;

   735             case TreeBuilder.TEXTAREA:

   736                 endTagExpectationAsArray = TEXTAREA_ARR;

   737                 return;

   738             case TreeBuilder.IFRAME:

   739                 endTagExpectationAsArray = IFRAME_ARR;

   740                 return;

   741             case TreeBuilder.NOEMBED:

   742                 endTagExpectationAsArray = NOEMBED_ARR;

   743                 return;

   744             case TreeBuilder.NOSCRIPT:

   745                 endTagExpectationAsArray = NOSCRIPT_ARR;

   746                 return;

   747             case TreeBuilder.NOFRAMES:

   748                 endTagExpectationAsArray = NOFRAMES_ARR;

   749                 return;

   750             default:

   751                 assert false: "Bad end tag expectation.";

   752                 return;

   753         }

   754     }

   756     /**

   757      * For C++ use only.

   758      */

   759     public void setLineNumber(int line) {

   760         this.line = line;

   761     }

   763     // start Locator impl

   765     /**

   766      * @see org.xml.sax.Locator#getLineNumber()

   767      */

   768     @Inline public int getLineNumber() {

   769         return line;

   770     }

   772     // [NOCPP[

   774     /**

   775      * @see org.xml.sax.Locator#getColumnNumber()

   776      */

   777     @Inline public int getColumnNumber() {

   778         return -1;

   779     }

   781     /**

   782      * @see org.xml.sax.Locator#getPublicId()

   783      */

   784     public String getPublicId() {

   785         return publicId;

   786     }

   788     /**

   789      * @see org.xml.sax.Locator#getSystemId()

   790      */

   791     public String getSystemId() {

   792         return systemId;

   793     }

   795     // end Locator impl

   797     // end public API

   799     public void notifyAboutMetaBoundary() {

   800         metaBoundaryPassed = true;

   801     }

   803     void turnOnAdditionalHtml4Errors() {

   804         html4 = true;

   805     }

   807     // ]NOCPP]

   809     HtmlAttributes emptyAttributes() {

   810         // [NOCPP[

   811         if (newAttributesEachTime) {

   812             return new HtmlAttributes(mappingLangToXmlLang);

   813         } else {

   814             // ]NOCPP]

   815             return HtmlAttributes.EMPTY_ATTRIBUTES;

   816             // [NOCPP[

   817         }

   818         // ]NOCPP]

   819     }

   821     @Inline private void clearStrBufAndAppend(char c) {

   822         strBuf[0] = c;

   823         strBufLen = 1;

   824     }

   826     @Inline private void clearStrBuf() {

   827         strBufLen = 0;

   828     }

   830     /**

   831      * Appends to the smaller buffer.

   832      *

   833      * @param c

   834      *            the UTF-16 code unit to append

   835      */

   836     private void appendStrBuf(char c) {

   837         if (strBufLen == strBuf.length) {

   838             char[] newBuf = new char[strBuf.length + Tokenizer.BUFFER_GROW_BY];

   839             System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);

   840             strBuf = newBuf;

   841         }

   842         strBuf[strBufLen++] = c;

   843     }

   845     /**

   846      * The smaller buffer as a String. Currently only used for error reporting.

   847      *

   848      * <p>

   849      * C++ memory note: The return value must be released.

   850      *

   851      * @return the smaller buffer as a string

   852      */

   853     protected String strBufToString() {

   854         return Portability.newStringFromBuffer(strBuf, 0, strBufLen);

   855     }

   857     /**

   858      * Returns the short buffer as a local name. The return value is released in

   859      * emitDoctypeToken().

   860      *

   861      * @return the smaller buffer as local name

   862      */

   863     private void strBufToDoctypeName() {

   864         doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,

   865                 interner);

   866     }

   868     /**

   869      * Emits the smaller buffer as character tokens.

   870      *

   871      * @throws SAXException

   872      *             if the token handler threw

   873      */

   874     private void emitStrBuf() throws SAXException {

   875         if (strBufLen > 0) {

   876             tokenHandler.characters(strBuf, 0, strBufLen);

   877         }

   878     }

   880     @Inline private void clearLongStrBuf() {

   881         longStrBufLen = 0;

   882     }

   884     @Inline private void clearLongStrBufAndAppend(char c) {

   885         longStrBuf[0] = c;

   886         longStrBufLen = 1;

   887     }

   889     /**

   890      * Appends to the larger buffer.

   891      *

   892      * @param c

   893      *            the UTF-16 code unit to append

   894      */

   895     private void appendLongStrBuf(char c) {

   896         if (longStrBufLen == longStrBuf.length) {

   897             char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)];

   898             System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);

   899             longStrBuf = newBuf;

   900         }

   901         longStrBuf[longStrBufLen++] = c;

   902     }

   904     @Inline private void appendSecondHyphenToBogusComment() throws SAXException {

   905         // [NOCPP[

   906         switch (commentPolicy) {

   907             case ALTER_INFOSET:

   908                 // detachLongStrBuf();

   909                 appendLongStrBuf(' ');

   910                 // FALLTHROUGH

   911             case ALLOW:

   912                 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");

   913                 // ]NOCPP]

   914                 appendLongStrBuf('-');

   915                 // [NOCPP[

   916                 break;

   917             case FATAL:

   918                 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");

   919                 break;

   920         }

   921         // ]NOCPP]

   922     }

   924     // [NOCPP[

   925     private void maybeAppendSpaceToBogusComment() throws SAXException {

   926         switch (commentPolicy) {

   927             case ALTER_INFOSET:

   928                 // detachLongStrBuf();

   929                 appendLongStrBuf(' ');

   930                 // FALLTHROUGH

   931             case ALLOW:

   932                 warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");

   933                 break;

   934             case FATAL:

   935                 fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");

   936                 break;

   937         }

   938     }

   940     // ]NOCPP]

   942     @Inline private void adjustDoubleHyphenAndAppendToLongStrBufAndErr(char c)

   943             throws SAXException {

   944         errConsecutiveHyphens();

   945         // [NOCPP[

   946         switch (commentPolicy) {

   947             case ALTER_INFOSET:

   948                 // detachLongStrBuf();

   949                 longStrBufLen--;

   950                 appendLongStrBuf(' ');

   951                 appendLongStrBuf('-');

   952                 // FALLTHROUGH

   953             case ALLOW:

   954                 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");

   955                 // ]NOCPP]

   956                 appendLongStrBuf(c);

   957                 // [NOCPP[

   958                 break;

   959             case FATAL:

   960                 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");

   961                 break;

   962         }

   963         // ]NOCPP]

   964     }

   966     private void appendLongStrBuf(@NoLength char[] buffer, int offset, int length) {

   967         int reqLen = longStrBufLen + length;

   968         if (longStrBuf.length < reqLen) {

   969             char[] newBuf = new char[reqLen + (reqLen >> 1)];

   970             System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);

   971             longStrBuf = newBuf;

   972         }

   973         System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length);

   974         longStrBufLen = reqLen;

   975     }

   977     /**

   978      * Append the contents of the smaller buffer to the larger one.

   979      */

   980     @Inline private void appendStrBufToLongStrBuf() {

   981         appendLongStrBuf(strBuf, 0, strBufLen);

   982     }

   984     /**

   985      * The larger buffer as a string.

   986      *

   987      * <p>

   988      * C++ memory note: The return value must be released.

   989      *

   990      * @return the larger buffer as a string

   991      */

   992     private String longStrBufToString() {

   993         return Portability.newStringFromBuffer(longStrBuf, 0, longStrBufLen);

   994     }

   996     /**

   997      * Emits the current comment token.

   998      *

   999      * @param pos

  1000      *            TODO

  1001      *

  1002      * @throws SAXException

  1003      */

  1004     private void emitComment(int provisionalHyphens, int pos)

  1005             throws SAXException {

  1006         // [NOCPP[

  1007         if (wantsComments) {

  1008             // ]NOCPP]

  1009             // if (longStrBufOffset != -1) {

  1010             // tokenHandler.comment(buf, longStrBufOffset, longStrBufLen

  1011             // - provisionalHyphens);

  1012             // } else {

  1013             tokenHandler.comment(longStrBuf, 0, longStrBufLen

  1014                     - provisionalHyphens);

  1015             // }

  1016             // [NOCPP[

  1017         }

  1018         // ]NOCPP]

  1019         cstart = pos + 1;

  1020     }

  1022     /**

  1023      * Flushes coalesced character tokens.

  1024      *

  1025      * @param buf

  1026      *            TODO

  1027      * @param pos

  1028      *            TODO

  1029      *

  1030      * @throws SAXException

  1031      */

  1032     protected void flushChars(@NoLength char[] buf, int pos)

  1033             throws SAXException {

  1034         if (pos > cstart) {

  1035             tokenHandler.characters(buf, cstart, pos - cstart);

  1036         }

  1037         cstart = Integer.MAX_VALUE;

  1038     }

  1040     /**

  1041      * Reports an condition that would make the infoset incompatible with XML

  1042      * 1.0 as fatal.

  1043      *

  1044      * @param message

  1045      *            the message

  1046      * @throws SAXException

  1047      * @throws SAXParseException

  1048      */

  1049     public void fatal(String message) throws SAXException {

  1050         SAXParseException spe = new SAXParseException(message, this);

  1051         if (errorHandler != null) {

  1052             errorHandler.fatalError(spe);

  1053         }

  1054         throw spe;

  1055     }

  1057     /**

  1058      * Reports a Parse Error.

  1059      *

  1060      * @param message

  1061      *            the message

  1062      * @throws SAXException

  1063      */

  1064     public void err(String message) throws SAXException {

  1065         if (errorHandler == null) {

  1066             return;

  1067         }

  1068         SAXParseException spe = new SAXParseException(message, this);

  1069         errorHandler.error(spe);

  1070     }

  1072     public void errTreeBuilder(String message) throws SAXException {

  1073         ErrorHandler eh = null;

  1074         if (tokenHandler instanceof TreeBuilder<?>) {

  1075             TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;

  1076             eh = treeBuilder.getErrorHandler();

  1077         }

  1078         if (eh == null) {

  1079             eh = errorHandler;

  1080         }

  1081         if (eh == null) {

  1082             return;

  1083         }

  1084         SAXParseException spe = new SAXParseException(message, this);

  1085         eh.error(spe);

  1086     }

  1088     /**

  1089      * Reports a warning

  1090      *

  1091      * @param message

  1092      *            the message

  1093      * @throws SAXException

  1094      */

  1095     public void warn(String message) throws SAXException {

  1096         if (errorHandler == null) {

  1097             return;

  1098         }

  1099         SAXParseException spe = new SAXParseException(message, this);

  1100         errorHandler.warning(spe);

  1101     }

  1103     private void strBufToElementNameString() {

  1104         // if (strBufOffset != -1) {

  1105         // return ElementName.elementNameByBuffer(buf, strBufOffset, strBufLen);

  1106         // } else {

  1107         tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,

  1108                 interner);

  1109         // }

  1110     }

  1112     private int emitCurrentTagToken(boolean selfClosing, int pos)

  1113             throws SAXException {

  1114         cstart = pos + 1;

  1115         maybeErrSlashInEndTag(selfClosing);

  1116         stateSave = Tokenizer.DATA;

  1117         HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES

  1118                 : attributes);

  1119         if (endTag) {

  1120             /*

  1121              * When an end tag token is emitted, the content model flag must be

  1122              * switched to the PCDATA state.

  1123              */

  1124             maybeErrAttributesOnEndTag(attrs);

  1125             // CPPONLY: if (!viewingXmlSource) {

  1126             tokenHandler.endTag(tagName);

  1127             // CPPONLY: }

  1128             // CPPONLY: if (newAttributesEachTime) {

  1129             // CPPONLY:   Portability.delete(attributes);

  1130             // CPPONLY:   attributes = null;

  1131             // CPPONLY: }

  1132         } else {

  1133             // CPPONLY: if (viewingXmlSource) {

  1134             // CPPONLY:   assert newAttributesEachTime;

  1135             // CPPONLY:   Portability.delete(attributes);

  1136             // CPPONLY:   attributes = null;

  1137             // CPPONLY: } else {

  1138             tokenHandler.startTag(tagName, attrs, selfClosing);

  1139             // CPPONLY: }

  1140         }

  1141         tagName.release();

  1142         tagName = null;

  1143         if (newAttributesEachTime) {

  1144             attributes = null;

  1145         } else {

  1146             attributes.clear(mappingLangToXmlLang);

  1147         }

  1148         /*

  1149          * The token handler may have called setStateAndEndTagExpectation

  1150          * and changed stateSave since the start of this method.

  1151          */

  1152         return stateSave;

  1153     }

  1155     private void attributeNameComplete() throws SAXException {

  1156         // if (strBufOffset != -1) {

  1157         // attributeName = AttributeName.nameByBuffer(buf, strBufOffset,

  1158         // strBufLen, namePolicy != XmlViolationPolicy.ALLOW);

  1159         // } else {

  1160         attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen

  1161         // [NOCPP[

  1162                 , namePolicy != XmlViolationPolicy.ALLOW

  1163                 // ]NOCPP]

  1164                 , interner);

  1165         // }

  1167         if (attributes == null) {

  1168             attributes = new HtmlAttributes(mappingLangToXmlLang);

  1169         }

  1171         /*

  1172          * When the user agent leaves the attribute name state (and before

  1173          * emitting the tag token, if appropriate), the complete attribute's

  1174          * name must be compared to the other attributes on the same token; if

  1175          * there is already an attribute on the token with the exact same name,

  1176          * then this is a parse error and the new attribute must be dropped,

  1177          * along with the value that gets associated with it (if any).

  1178          */

  1179         if (attributes.contains(attributeName)) {

  1180             errDuplicateAttribute();

  1181             attributeName.release();

  1182             attributeName = null;

  1183         }

  1184     }

  1186     private void addAttributeWithoutValue() throws SAXException {

  1187         noteAttributeWithoutValue();

  1189         // [NOCPP[

  1190         if (metaBoundaryPassed && AttributeName.CHARSET == attributeName

  1191                 && ElementName.META == tagName) {

  1192             err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");

  1193         }

  1194         // ]NOCPP]

  1195         if (attributeName != null) {

  1196             // [NOCPP[

  1197             if (html4) {

  1198                 if (attributeName.isBoolean()) {

  1199                     if (html4ModeCompatibleWithXhtml1Schemata) {

  1200                         attributes.addAttribute(attributeName,

  1201                                 attributeName.getLocal(AttributeName.HTML),

  1202                                 xmlnsPolicy);

  1203                     } else {

  1204                         attributes.addAttribute(attributeName, "", xmlnsPolicy);

  1205                     }

  1206                 } else {

  1207                     if (AttributeName.BORDER != attributeName) {

  1208                         err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");

  1209                         attributes.addAttribute(attributeName, "", xmlnsPolicy);

  1210                     }

  1211                 }

  1212             } else {

  1213                 if (AttributeName.SRC == attributeName

  1214                         || AttributeName.HREF == attributeName) {

  1215                     warn("Attribute \u201C"

  1216                             + attributeName.getLocal(AttributeName.HTML)

  1217                             + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");

  1218                 }

  1219                 // ]NOCPP]

  1220                 attributes.addAttribute(attributeName,

  1221                         Portability.newEmptyString()

  1222                         // [NOCPP[

  1223                         , xmlnsPolicy

  1224                 // ]NOCPP]

  1225                 );

  1226                 // [NOCPP[

  1227             }

  1228             // ]NOCPP]

  1229             attributeName = null; // attributeName has been adopted by the

  1230             // |attributes| object

  1231         }

  1232     }

  1234     private void addAttributeWithValue() throws SAXException {

  1235         // [NOCPP[

  1236         if (metaBoundaryPassed && ElementName.META == tagName

  1237                 && AttributeName.CHARSET == attributeName) {

  1238             err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");

  1239         }

  1240         // ]NOCPP]

  1241         if (attributeName != null) {

  1242             String val = longStrBufToString(); // Ownership transferred to

  1243             // HtmlAttributes

  1244             // CPPONLY: if (mViewSource) {

  1245             // CPPONLY:   mViewSource.MaybeLinkifyAttributeValue(attributeName, val);

  1246             // CPPONLY: }

  1247             // [NOCPP[

  1248             if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata

  1249                     && attributeName.isCaseFolded()) {

  1250                 val = newAsciiLowerCaseStringFromString(val);

  1251             }

  1252             // ]NOCPP]

  1253             attributes.addAttribute(attributeName, val

  1254             // [NOCPP[

  1255                     , xmlnsPolicy

  1256             // ]NOCPP]

  1257             );

  1258             attributeName = null; // attributeName has been adopted by the

  1259             // |attributes| object

  1260         }

  1261     }

  1263     // [NOCPP[

  1265     private static String newAsciiLowerCaseStringFromString(String str) {

  1266         if (str == null) {

  1267             return null;

  1268         }

  1269         char[] buf = new char[str.length()];

  1270         for (int i = 0; i < str.length(); i++) {

  1271             char c = str.charAt(i);

  1272             if (c >= 'A' && c <= 'Z') {

  1273                 c += 0x20;

  1274             }

  1275             buf[i] = c;

  1276         }

  1277         return new String(buf);

  1278     }

  1280     protected void startErrorReporting() throws SAXException {

  1282     }

  1284     // ]NOCPP]

  1286     public void start() throws SAXException {

  1287         initializeWithoutStarting();

  1288         tokenHandler.startTokenization(this);

  1289         // [NOCPP[

  1290         startErrorReporting();

  1291         // ]NOCPP]

  1292     }

  1294     public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {

  1295         int state = stateSave;

  1296         int returnState = returnStateSave;

  1297         char c = '\u0000';

  1298         shouldSuspend = false;

  1299         lastCR = false;

  1301         int start = buffer.getStart();

  1302         /**

  1303          * The index of the last <code>char</code> read from <code>buf</code>.

  1304          */

  1305         int pos = start - 1;

  1307         /**

  1308          * The index of the first <code>char</code> in <code>buf</code> that is

  1309          * part of a coalesced run of character tokens or

  1310          * <code>Integer.MAX_VALUE</code> if there is not a current run being

  1311          * coalesced.

  1312          */

  1313         switch (state) {

  1314             case DATA:

  1315             case RCDATA:

  1316             case SCRIPT_DATA:

  1317             case PLAINTEXT:

  1318             case RAWTEXT:

  1319             case CDATA_SECTION:

  1320             case SCRIPT_DATA_ESCAPED:

  1321             case SCRIPT_DATA_ESCAPE_START:

  1322             case SCRIPT_DATA_ESCAPE_START_DASH:

  1323             case SCRIPT_DATA_ESCAPED_DASH:

  1324             case SCRIPT_DATA_ESCAPED_DASH_DASH:

  1325             case SCRIPT_DATA_DOUBLE_ESCAPE_START:

  1326             case SCRIPT_DATA_DOUBLE_ESCAPED:

  1327             case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:

  1328             case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:

  1329             case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:

  1330             case SCRIPT_DATA_DOUBLE_ESCAPE_END:

  1331                 cstart = start;

  1332                 break;

  1333             default:

  1334                 cstart = Integer.MAX_VALUE;

  1335                 break;

  1336         }

  1338         /**

  1339          * The number of <code>char</code>s in <code>buf</code> that have

  1340          * meaning. (The rest of the array is garbage and should not be

  1341          * examined.)

  1342          */

  1343         // CPPONLY: if (mViewSource) {

  1344         // CPPONLY:   mViewSource.SetBuffer(buffer);

  1345         // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());

  1346         // CPPONLY:   mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);

  1347         // CPPONLY: } else {

  1348         // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());

  1349         // CPPONLY: }

  1350         // [NOCPP[

  1351         pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,

  1352                 buffer.getEnd());

  1353         // ]NOCPP]

  1354         if (pos == buffer.getEnd()) {

  1355             // exiting due to end of buffer

  1356             buffer.setStart(pos);

  1357         } else {

  1358             buffer.setStart(pos + 1);

  1359         }

  1360         return lastCR;

  1361     }

  1363     @SuppressWarnings("unused") private int stateLoop(int state, char c,

  1364             int pos, @NoLength char[] buf, boolean reconsume, int returnState,

  1365             int endPos) throws SAXException {

  1366         /*

  1367          * Idioms used in this code:

  1368          *

  1369          *

  1370          * Consuming the next input character

  1371          *

  1372          * To consume the next input character, the code does this: if (++pos ==

  1373          * endPos) { break stateloop; } c = checkChar(buf, pos);

  1374          *

  1375          *

  1376          * Staying in a state

  1377          *

  1378          * When there's a state that the tokenizer may stay in over multiple

  1379          * input characters, the state has a wrapper |for(;;)| loop and staying

  1380          * in the state continues the loop.

  1381          *

  1382          *

  1383          * Switching to another state

  1384          *

  1385          * To switch to another state, the code sets the state variable to the

  1386          * magic number of the new state. Then it either continues stateloop or

  1387          * breaks out of the state's own wrapper loop if the target state is

  1388          * right after the current state in source order. (This is a partial

  1389          * workaround for Java's lack of goto.)

  1390          *

  1391          *

  1392          * Reconsume support

  1393          *

  1394          * The spec sometimes says that an input character is reconsumed in

  1395          * another state. If a state can ever be entered so that an input

  1396          * character can be reconsumed in it, the state's code starts with an

  1397          * |if (reconsume)| that sets reconsume to false and skips over the

  1398          * normal code for consuming a new character.

  1399          *

  1400          * To reconsume the current character in another state, the code sets

  1401          * |reconsume| to true and then switches to the other state.

  1402          *

  1403          *

  1404          * Emitting character tokens

  1405          *

  1406          * This method emits character tokens lazily. Whenever a new range of

  1407          * character tokens starts, the field cstart must be set to the start

  1408          * index of the range. The flushChars() method must be called at the end

  1409          * of a range to flush it.

  1410          *

  1411          *

  1412          * U+0000 handling

  1413          *

  1414          * The various states have to handle the replacement of U+0000 with

  1415          * U+FFFD. However, if U+0000 would be reconsumed in another state, the

  1416          * replacement doesn't need to happen, because it's handled by the

  1417          * reconsuming state.

  1418          *

  1419          *

  1420          * LF handling

  1421          *

  1422          * Every state needs to increment the line number upon LF unless the LF

  1423          * gets reconsumed by another state which increments the line number.

  1424          *

  1425          *

  1426          * CR handling

  1427          *

  1428          * Every state needs to handle CR unless the CR gets reconsumed and is

  1429          * handled by the reconsuming state. The CR needs to be handled as if it

  1430          * were an LF, the lastCR field must be set to true and then this

  1431          * method must return. The IO driver will then swallow the next

  1432          * character if it is an LF to coalesce CRLF.

  1433          */

  1434         stateloop: for (;;) {

  1435             switch (state) {

  1436                 case DATA:

  1437                     dataloop: for (;;) {

  1438                         if (reconsume) {

  1439                             reconsume = false;

  1440                         } else {

  1441                             if (++pos == endPos) {

  1442                                 break stateloop;

  1443                             }

  1444                             c = checkChar(buf, pos);

  1445                         }

  1446                         switch (c) {

  1447                             case '&':

  1448                                 /*

  1449                                  * U+0026 AMPERSAND (&) Switch to the character

  1450                                  * reference in data state.

  1451                                  */

  1452                                 flushChars(buf, pos);

  1453                                 clearStrBufAndAppend(c);

  1454                                 setAdditionalAndRememberAmpersandLocation('\u0000');

  1455                                 returnState = state;

  1456                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);

  1457                                 continue stateloop;

  1458                             case '<':

  1459                                 /*

  1460                                  * U+003C LESS-THAN SIGN (<) Switch to the tag

  1461                                  * open state.

  1462                                  */

  1463                                 flushChars(buf, pos);

  1465                                 state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);

  1466                                 break dataloop; // FALL THROUGH continue

  1467                             // stateloop;

  1468                             case '\u0000':

  1469                                 emitReplacementCharacter(buf, pos);

  1470                                 continue;

  1471                             case '\r':

  1472                                 emitCarriageReturn(buf, pos);

  1473                                 break stateloop;

  1474                             case '\n':

  1475                                 silentLineFeed();

  1476                             default:

  1477                                 /*

  1478                                  * Anything else Emit the input character as a

  1479                                  * character token.

  1480                                  *

  1481                                  * Stay in the data state.

  1482                                  */

  1483                                 continue;

  1484                         }

  1485                     }

  1486                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  1487                 case TAG_OPEN:

  1488                     tagopenloop: for (;;) {

  1489                         /*

  1490                          * The behavior of this state depends on the content

  1491                          * model flag.

  1492                          */

  1493                         if (++pos == endPos) {

  1494                             break stateloop;

  1495                         }

  1496                         c = checkChar(buf, pos);

  1497                         /*

  1498                          * If the content model flag is set to the PCDATA state

  1499                          * Consume the next input character:

  1500                          */

  1501                         if (c >= 'A' && c <= 'Z') {

  1502                             /*

  1503                              * U+0041 LATIN CAPITAL LETTER A through to U+005A

  1504                              * LATIN CAPITAL LETTER Z Create a new start tag

  1505                              * token,

  1506                              */

  1507                             endTag = false;

  1508                             /*

  1509                              * set its tag name to the lowercase version of the

  1510                              * input character (add 0x0020 to the character's

  1511                              * code point),

  1512                              */

  1513                             clearStrBufAndAppend((char) (c + 0x20));

  1514                             /* then switch to the tag name state. */

  1515                             state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);

  1516                             /*

  1517                              * (Don't emit the token yet; further details will

  1518                              * be filled in before it is emitted.)

  1519                              */

  1520                             break tagopenloop;

  1521                             // continue stateloop;

  1522                         } else if (c >= 'a' && c <= 'z') {

  1523                             /*

  1524                              * U+0061 LATIN SMALL LETTER A through to U+007A

  1525                              * LATIN SMALL LETTER Z Create a new start tag

  1526                              * token,

  1527                              */

  1528                             endTag = false;

  1529                             /*

  1530                              * set its tag name to the input character,

  1531                              */

  1532                             clearStrBufAndAppend(c);

  1533                             /* then switch to the tag name state. */

  1534                             state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);

  1535                             /*

  1536                              * (Don't emit the token yet; further details will

  1537                              * be filled in before it is emitted.)

  1538                              */

  1539                             break tagopenloop;

  1540                             // continue stateloop;

  1541                         }

  1542                         switch (c) {

  1543                             case '!':

  1544                                 /*

  1545                                  * U+0021 EXCLAMATION MARK (!) Switch to the

  1546                                  * markup declaration open state.

  1547                                  */

  1548                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);

  1549                                 continue stateloop;

  1550                             case '/':

  1551                                 /*

  1552                                  * U+002F SOLIDUS (/) Switch to the close tag

  1553                                  * open state.

  1554                                  */

  1555                                 state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);

  1556                                 continue stateloop;

  1557                             case '?':

  1558                                 // CPPONLY: if (viewingXmlSource) {

  1559                                 // CPPONLY: state = transition(state,

  1560                                 // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION,

  1561                                 // CPPONLY: reconsume,

  1562                                 // CPPONLY: pos);

  1563                                 // CPPONLY: continue stateloop;

  1564                                 // CPPONLY: }

  1565                                 /*

  1566                                  * U+003F QUESTION MARK (?) Parse error.

  1567                                  */

  1568                                 errProcessingInstruction();

  1569                                 /*

  1570                                  * Switch to the bogus comment state.

  1571                                  */

  1572                                 clearLongStrBufAndAppend(c);

  1573                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);

  1574                                 continue stateloop;

  1575                             case '>':

  1576                                 /*

  1577                                  * U+003E GREATER-THAN SIGN (>) Parse error.

  1578                                  */

  1579                                 errLtGt();

  1580                                 /*

  1581                                  * Emit a U+003C LESS-THAN SIGN character token

  1582                                  * and a U+003E GREATER-THAN SIGN character

  1583                                  * token.

  1584                                  */

  1585                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 2);

  1586                                 /* Switch to the data state. */

  1587                                 cstart = pos + 1;

  1588                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  1589                                 continue stateloop;

  1590                             default:

  1591                                 /*

  1592                                  * Anything else Parse error.

  1593                                  */

  1594                                 errBadCharAfterLt(c);

  1595                                 /*

  1596                                  * Emit a U+003C LESS-THAN SIGN character token

  1597                                  */

  1598                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);

  1599                                 /*

  1600                                  * and reconsume the current input character in

  1601                                  * the data state.

  1602                                  */

  1603                                 cstart = pos;

  1604                                 reconsume = true;

  1605                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  1606                                 continue stateloop;

  1607                         }

  1608                     }

  1609                     // FALL THROUGH DON'T REORDER

  1610                 case TAG_NAME:

  1611                     tagnameloop: for (;;) {

  1612                         if (++pos == endPos) {

  1613                             break stateloop;

  1614                         }

  1615                         c = checkChar(buf, pos);

  1616                         /*

  1617                          * Consume the next input character:

  1618                          */

  1619                         switch (c) {

  1620                             case '\r':

  1621                                 silentCarriageReturn();

  1622                                 strBufToElementNameString();

  1623                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);

  1624                                 break stateloop;

  1625                             case '\n':

  1626                                 silentLineFeed();

  1627                             case ' ':

  1628                             case '\t':

  1629                             case '\u000C':

  1630                                 /*

  1631                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  1632                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE

  1633                                  * Switch to the before attribute name state.

  1634                                  */

  1635                                 strBufToElementNameString();

  1636                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);

  1637                                 break tagnameloop;

  1638                             // continue stateloop;

  1639                             case '/':

  1640                                 /*

  1641                                  * U+002F SOLIDUS (/) Switch to the self-closing

  1642                                  * start tag state.

  1643                                  */

  1644                                 strBufToElementNameString();

  1645                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);

  1646                                 continue stateloop;

  1647                             case '>':

  1648                                 /*

  1649                                  * U+003E GREATER-THAN SIGN (>) Emit the current

  1650                                  * tag token.

  1651                                  */

  1652                                 strBufToElementNameString();

  1653                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);

  1654                                 if (shouldSuspend) {

  1655                                     break stateloop;

  1656                                 }

  1657                                 /*

  1658                                  * Switch to the data state.

  1659                                  */

  1660                                 continue stateloop;

  1661                             case '\u0000':

  1662                                 c = '\uFFFD';

  1663                                 // fall thru

  1664                             default:

  1665                                 if (c >= 'A' && c <= 'Z') {

  1666                                     /*

  1667                                      * U+0041 LATIN CAPITAL LETTER A through to

  1668                                      * U+005A LATIN CAPITAL LETTER Z Append the

  1669                                      * lowercase version of the current input

  1670                                      * character (add 0x0020 to the character's

  1671                                      * code point) to the current tag token's

  1672                                      * tag name.

  1673                                      */

  1674                                     c += 0x20;

  1675                                 }

  1676                                 /*

  1677                                  * Anything else Append the current input

  1678                                  * character to the current tag token's tag

  1679                                  * name.

  1680                                  */

  1681                                 appendStrBuf(c);

  1682                                 /*

  1683                                  * Stay in the tag name state.

  1684                                  */

  1685                                 continue;

  1686                         }

  1687                     }

  1688                     // FALLTHRU DON'T REORDER

  1689                 case BEFORE_ATTRIBUTE_NAME:

  1690                     beforeattributenameloop: for (;;) {

  1691                         if (reconsume) {

  1692                             reconsume = false;

  1693                         } else {

  1694                             if (++pos == endPos) {

  1695                                 break stateloop;

  1696                             }

  1697                             c = checkChar(buf, pos);

  1698                         }

  1699                         /*

  1700                          * Consume the next input character:

  1701                          */

  1702                         switch (c) {

  1703                             case '\r':

  1704                                 silentCarriageReturn();

  1705                                 break stateloop;

  1706                             case '\n':

  1707                                 silentLineFeed();

  1708                                 // fall thru

  1709                             case ' ':

  1710                             case '\t':

  1711                             case '\u000C':

  1712                                 /*

  1713                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  1714                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay

  1715                                  * in the before attribute name state.

  1716                                  */

  1717                                 continue;

  1718                             case '/':

  1719                                 /*

  1720                                  * U+002F SOLIDUS (/) Switch to the self-closing

  1721                                  * start tag state.

  1722                                  */

  1723                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);

  1724                                 continue stateloop;

  1725                             case '>':

  1726                                 /*

  1727                                  * U+003E GREATER-THAN SIGN (>) Emit the current

  1728                                  * tag token.

  1729                                  */

  1730                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);

  1731                                 if (shouldSuspend) {

  1732                                     break stateloop;

  1733                                 }

  1734                                 /*

  1735                                  * Switch to the data state.

  1736                                  */

  1737                                 continue stateloop;

  1738                             case '\u0000':

  1739                                 c = '\uFFFD';

  1740                                 // fall thru

  1741                             case '\"':

  1742                             case '\'':

  1743                             case '<':

  1744                             case '=':

  1745                                 /*

  1746                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE

  1747                                  * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS

  1748                                  * SIGN (=) Parse error.

  1749                                  */

  1750                                 errBadCharBeforeAttributeNameOrNull(c);

  1751                                 /*

  1752                                  * Treat it as per the "anything else" entry

  1753                                  * below.

  1754                                  */

  1755                             default:

  1756                                 /*

  1757                                  * Anything else Start a new attribute in the

  1758                                  * current tag token.

  1759                                  */

  1760                                 if (c >= 'A' && c <= 'Z') {

  1761                                     /*

  1762                                      * U+0041 LATIN CAPITAL LETTER A through to

  1763                                      * U+005A LATIN CAPITAL LETTER Z Set that

  1764                                      * attribute's name to the lowercase version

  1765                                      * of the current input character (add

  1766                                      * 0x0020 to the character's code point)

  1767                                      */

  1768                                     c += 0x20;

  1769                                 }

  1770                                 /*

  1771                                  * Set that attribute's name to the current

  1772                                  * input character,

  1773                                  */

  1774                                 clearStrBufAndAppend(c);

  1775                                 /*

  1776                                  * and its value to the empty string.

  1777                                  */

  1778                                 // Will do later.

  1779                                 /*

  1780                                  * Switch to the attribute name state.

  1781                                  */

  1782                                 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);

  1783                                 break beforeattributenameloop;

  1784                             // continue stateloop;

  1785                         }

  1786                     }

  1787                     // FALLTHRU DON'T REORDER

  1788                 case ATTRIBUTE_NAME:

  1789                     attributenameloop: for (;;) {

  1790                         if (++pos == endPos) {

  1791                             break stateloop;

  1792                         }

  1793                         c = checkChar(buf, pos);

  1794                         /*

  1795                          * Consume the next input character:

  1796                          */

  1797                         switch (c) {

  1798                             case '\r':

  1799                                 silentCarriageReturn();

  1800                                 attributeNameComplete();

  1801                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);

  1802                                 break stateloop;

  1803                             case '\n':

  1804                                 silentLineFeed();

  1805                                 // fall thru

  1806                             case ' ':

  1807                             case '\t':

  1808                             case '\u000C':

  1809                                 /*

  1810                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  1811                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE

  1812                                  * Switch to the after attribute name state.

  1813                                  */

  1814                                 attributeNameComplete();

  1815                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);

  1816                                 continue stateloop;

  1817                             case '/':

  1818                                 /*

  1819                                  * U+002F SOLIDUS (/) Switch to the self-closing

  1820                                  * start tag state.

  1821                                  */

  1822                                 attributeNameComplete();

  1823                                 addAttributeWithoutValue();

  1824                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);

  1825                                 continue stateloop;

  1826                             case '=':

  1827                                 /*

  1828                                  * U+003D EQUALS SIGN (=) Switch to the before

  1829                                  * attribute value state.

  1830                                  */

  1831                                 attributeNameComplete();

  1832                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);

  1833                                 break attributenameloop;

  1834                             // continue stateloop;

  1835                             case '>':

  1836                                 /*

  1837                                  * U+003E GREATER-THAN SIGN (>) Emit the current

  1838                                  * tag token.

  1839                                  */

  1840                                 attributeNameComplete();

  1841                                 addAttributeWithoutValue();

  1842                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);

  1843                                 if (shouldSuspend) {

  1844                                     break stateloop;

  1845                                 }

  1846                                 /*

  1847                                  * Switch to the data state.

  1848                                  */

  1849                                 continue stateloop;

  1850                             case '\u0000':

  1851                                 c = '\uFFFD';

  1852                                 // fall thru

  1853                             case '\"':

  1854                             case '\'':

  1855                             case '<':

  1856                                 /*

  1857                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE

  1858                                  * (') U+003C LESS-THAN SIGN (<) Parse error.

  1859                                  */

  1860                                 errQuoteOrLtInAttributeNameOrNull(c);

  1861                                 /*

  1862                                  * Treat it as per the "anything else" entry

  1863                                  * below.

  1864                                  */

  1865                             default:

  1866                                 if (c >= 'A' && c <= 'Z') {

  1867                                     /*

  1868                                      * U+0041 LATIN CAPITAL LETTER A through to

  1869                                      * U+005A LATIN CAPITAL LETTER Z Append the

  1870                                      * lowercase version of the current input

  1871                                      * character (add 0x0020 to the character's

  1872                                      * code point) to the current attribute's

  1873                                      * name.

  1874                                      */

  1875                                     c += 0x20;

  1876                                 }

  1877                                 /*

  1878                                  * Anything else Append the current input

  1879                                  * character to the current attribute's name.

  1880                                  */

  1881                                 appendStrBuf(c);

  1882                                 /*

  1883                                  * Stay in the attribute name state.

  1884                                  */

  1885                                 continue;

  1886                         }

  1887                     }

  1888                     // FALLTHRU DON'T REORDER

  1889                 case BEFORE_ATTRIBUTE_VALUE:

  1890                     beforeattributevalueloop: for (;;) {

  1891                         if (++pos == endPos) {

  1892                             break stateloop;

  1893                         }

  1894                         c = checkChar(buf, pos);

  1895                         /*

  1896                          * Consume the next input character:

  1897                          */

  1898                         switch (c) {

  1899                             case '\r':

  1900                                 silentCarriageReturn();

  1901                                 break stateloop;

  1902                             case '\n':

  1903                                 silentLineFeed();

  1904                                 // fall thru

  1905                             case ' ':

  1906                             case '\t':

  1907                             case '\u000C':

  1908                                 /*

  1909                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  1910                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay

  1911                                  * in the before attribute value state.

  1912                                  */

  1913                                 continue;

  1914                             case '"':

  1915                                 /*

  1916                                  * U+0022 QUOTATION MARK (") Switch to the

  1917                                  * attribute value (double-quoted) state.

  1918                                  */

  1919                                 clearLongStrBuf();

  1920                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);

  1921                                 break beforeattributevalueloop;

  1922                             // continue stateloop;

  1923                             case '&':

  1924                                 /*

  1925                                  * U+0026 AMPERSAND (&) Switch to the attribute

  1926                                  * value (unquoted) state and reconsume this

  1927                                  * input character.

  1928                                  */

  1929                                 clearLongStrBuf();

  1930                                 reconsume = true;

  1931                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);

  1932                                 noteUnquotedAttributeValue();

  1933                                 continue stateloop;

  1934                             case '\'':

  1935                                 /*

  1936                                  * U+0027 APOSTROPHE (') Switch to the attribute

  1937                                  * value (single-quoted) state.

  1938                                  */

  1939                                 clearLongStrBuf();

  1940                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);

  1941                                 continue stateloop;

  1942                             case '>':

  1943                                 /*

  1944                                  * U+003E GREATER-THAN SIGN (>) Parse error.

  1945                                  */

  1946                                 errAttributeValueMissing();

  1947                                 /*

  1948                                  * Emit the current tag token.

  1949                                  */

  1950                                 addAttributeWithoutValue();

  1951                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);

  1952                                 if (shouldSuspend) {

  1953                                     break stateloop;

  1954                                 }

  1955                                 /*

  1956                                  * Switch to the data state.

  1957                                  */

  1958                                 continue stateloop;

  1959                             case '\u0000':

  1960                                 c = '\uFFFD';

  1961                                 // fall thru

  1962                             case '<':

  1963                             case '=':

  1964                             case '`':

  1965                                 /*

  1966                                  * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN

  1967                                  * (=) U+0060 GRAVE ACCENT (`)

  1968                                  */

  1969                                 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);

  1970                                 /*

  1971                                  * Treat it as per the "anything else" entry

  1972                                  * below.

  1973                                  */

  1974                             default:

  1975                                 // [NOCPP[

  1976                                 errHtml4NonNameInUnquotedAttribute(c);

  1977                                 // ]NOCPP]

  1978                                 /*

  1979                                  * Anything else Append the current input

  1980                                  * character to the current attribute's value.

  1981                                  */

  1982                                 clearLongStrBufAndAppend(c);

  1983                                 /*

  1984                                  * Switch to the attribute value (unquoted)

  1985                                  * state.

  1986                                  */

  1988                                 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);

  1989                                 noteUnquotedAttributeValue();

  1990                                 continue stateloop;

  1991                         }

  1992                     }

  1993                     // FALLTHRU DON'T REORDER

  1994                 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:

  1995                     attributevaluedoublequotedloop: for (;;) {

  1996                         if (reconsume) {

  1997                             reconsume = false;

  1998                         } else {

  1999                             if (++pos == endPos) {

  2000                                 break stateloop;

  2001                             }

  2002                             c = checkChar(buf, pos);

  2003                         }

  2004                         /*

  2005                          * Consume the next input character:

  2006                          */

  2007                         switch (c) {

  2008                             case '"':

  2009                                 /*

  2010                                  * U+0022 QUOTATION MARK (") Switch to the after

  2011                                  * attribute value (quoted) state.

  2012                                  */

  2013                                 addAttributeWithValue();

  2015                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);

  2016                                 break attributevaluedoublequotedloop;

  2017                             // continue stateloop;

  2018                             case '&':

  2019                                 /*

  2020                                  * U+0026 AMPERSAND (&) Switch to the character

  2021                                  * reference in attribute value state, with the

  2022                                  * additional allowed character being U+0022

  2023                                  * QUOTATION MARK (").

  2024                                  */

  2025                                 clearStrBufAndAppend(c);

  2026                                 setAdditionalAndRememberAmpersandLocation('\"');

  2027                                 returnState = state;

  2028                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);

  2029                                 continue stateloop;

  2030                             case '\r':

  2031                                 appendLongStrBufCarriageReturn();

  2032                                 break stateloop;

  2033                             case '\n':

  2034                                 appendLongStrBufLineFeed();

  2035                                 continue;

  2036                             case '\u0000':

  2037                                 c = '\uFFFD';

  2038                                 // fall thru

  2039                             default:

  2040                                 /*

  2041                                  * Anything else Append the current input

  2042                                  * character to the current attribute's value.

  2043                                  */

  2044                                 appendLongStrBuf(c);

  2045                                 /*

  2046                                  * Stay in the attribute value (double-quoted)

  2047                                  * state.

  2048                                  */

  2049                                 continue;

  2050                         }

  2051                     }

  2052                     // FALLTHRU DON'T REORDER

  2053                 case AFTER_ATTRIBUTE_VALUE_QUOTED:

  2054                     afterattributevaluequotedloop: for (;;) {

  2055                         if (++pos == endPos) {

  2056                             break stateloop;

  2057                         }

  2058                         c = checkChar(buf, pos);

  2059                         /*

  2060                          * Consume the next input character:

  2061                          */

  2062                         switch (c) {

  2063                             case '\r':

  2064                                 silentCarriageReturn();

  2065                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);

  2066                                 break stateloop;

  2067                             case '\n':

  2068                                 silentLineFeed();

  2069                                 // fall thru

  2070                             case ' ':

  2071                             case '\t':

  2072                             case '\u000C':

  2073                                 /*

  2074                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  2075                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE

  2076                                  * Switch to the before attribute name state.

  2077                                  */

  2078                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);

  2079                                 continue stateloop;

  2080                             case '/':

  2081                                 /*

  2082                                  * U+002F SOLIDUS (/) Switch to the self-closing

  2083                                  * start tag state.

  2084                                  */

  2085                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);

  2086                                 break afterattributevaluequotedloop;

  2087                             // continue stateloop;

  2088                             case '>':

  2089                                 /*

  2090                                  * U+003E GREATER-THAN SIGN (>) Emit the current

  2091                                  * tag token.

  2092                                  */

  2093                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);

  2094                                 if (shouldSuspend) {

  2095                                     break stateloop;

  2096                                 }

  2097                                 /*

  2098                                  * Switch to the data state.

  2099                                  */

  2100                                 continue stateloop;

  2101                             default:

  2102                                 /*

  2103                                  * Anything else Parse error.

  2104                                  */

  2105                                 errNoSpaceBetweenAttributes();

  2106                                 /*

  2107                                  * Reconsume the character in the before

  2108                                  * attribute name state.

  2109                                  */

  2110                                 reconsume = true;

  2111                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);

  2112                                 continue stateloop;

  2113                         }

  2114                     }

  2115                     // FALLTHRU DON'T REORDER

  2116                 case SELF_CLOSING_START_TAG:

  2117                     if (++pos == endPos) {

  2118                         break stateloop;

  2119                     }

  2120                     c = checkChar(buf, pos);

  2121                     /*

  2122                      * Consume the next input character:

  2123                      */

  2124                     switch (c) {

  2125                         case '>':

  2126                             /*

  2127                              * U+003E GREATER-THAN SIGN (>) Set the self-closing

  2128                              * flag of the current tag token. Emit the current

  2129                              * tag token.

  2130                              */

  2131                             // [NOCPP[

  2132                             errHtml4XmlVoidSyntax();

  2133                             // ]NOCPP]

  2134                             state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);

  2135                             if (shouldSuspend) {

  2136                                 break stateloop;

  2137                             }

  2138                             /*

  2139                              * Switch to the data state.

  2140                              */

  2141                             continue stateloop;

  2142                         default:

  2143                             /* Anything else Parse error. */

  2144                             errSlashNotFollowedByGt();

  2145                             /*

  2146                              * Reconsume the character in the before attribute

  2147                              * name state.

  2148                              */

  2149                             reconsume = true;

  2150                             state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);

  2151                             continue stateloop;

  2152                     }

  2153                     // XXX reorder point

  2154                 case ATTRIBUTE_VALUE_UNQUOTED:

  2155                     for (;;) {

  2156                         if (reconsume) {

  2157                             reconsume = false;

  2158                         } else {

  2159                             if (++pos == endPos) {

  2160                                 break stateloop;

  2161                             }

  2162                             c = checkChar(buf, pos);

  2163                         }

  2164                         /*

  2165                          * Consume the next input character:

  2166                          */

  2167                         switch (c) {

  2168                             case '\r':

  2169                                 silentCarriageReturn();

  2170                                 addAttributeWithValue();

  2171                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);

  2172                                 break stateloop;

  2173                             case '\n':

  2174                                 silentLineFeed();

  2175                                 // fall thru

  2176                             case ' ':

  2177                             case '\t':

  2178                             case '\u000C':

  2179                                 /*

  2180                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  2181                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE

  2182                                  * Switch to the before attribute name state.

  2183                                  */

  2184                                 addAttributeWithValue();

  2185                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);

  2186                                 continue stateloop;

  2187                             case '&':

  2188                                 /*

  2189                                  * U+0026 AMPERSAND (&) Switch to the character

  2190                                  * reference in attribute value state, with the

  2191                                  * additional allowed character being U+003E

  2192                                  * GREATER-THAN SIGN (>)

  2193                                  */

  2194                                 clearStrBufAndAppend(c);

  2195                                 setAdditionalAndRememberAmpersandLocation('>');

  2196                                 returnState = state;

  2197                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);

  2198                                 continue stateloop;

  2199                             case '>':

  2200                                 /*

  2201                                  * U+003E GREATER-THAN SIGN (>) Emit the current

  2202                                  * tag token.

  2203                                  */

  2204                                 addAttributeWithValue();

  2205                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);

  2206                                 if (shouldSuspend) {

  2207                                     break stateloop;

  2208                                 }

  2209                                 /*

  2210                                  * Switch to the data state.

  2211                                  */

  2212                                 continue stateloop;

  2213                             case '\u0000':

  2214                                 c = '\uFFFD';

  2215                                 // fall thru

  2216                             case '<':

  2217                             case '\"':

  2218                             case '\'':

  2219                             case '=':

  2220                             case '`':

  2221                                 /*

  2222                                  * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE

  2223                                  * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS

  2224                                  * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.

  2225                                  */

  2226                                 errUnquotedAttributeValOrNull(c);

  2227                                 /*

  2228                                  * Treat it as per the "anything else" entry

  2229                                  * below.

  2230                                  */

  2231                                 // fall through

  2232                             default:

  2233                                 // [NOCPP[

  2234                                 errHtml4NonNameInUnquotedAttribute(c);

  2235                                 // ]NOCPP]

  2236                                 /*

  2237                                  * Anything else Append the current input

  2238                                  * character to the current attribute's value.

  2239                                  */

  2240                                 appendLongStrBuf(c);

  2241                                 /*

  2242                                  * Stay in the attribute value (unquoted) state.

  2243                                  */

  2244                                 continue;

  2245                         }

  2246                     }

  2247                     // XXX reorder point

  2248                 case AFTER_ATTRIBUTE_NAME:

  2249                     for (;;) {

  2250                         if (++pos == endPos) {

  2251                             break stateloop;

  2252                         }

  2253                         c = checkChar(buf, pos);

  2254                         /*

  2255                          * Consume the next input character:

  2256                          */

  2257                         switch (c) {

  2258                             case '\r':

  2259                                 silentCarriageReturn();

  2260                                 break stateloop;

  2261                             case '\n':

  2262                                 silentLineFeed();

  2263                                 // fall thru

  2264                             case ' ':

  2265                             case '\t':

  2266                             case '\u000C':

  2267                                 /*

  2268                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  2269                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay

  2270                                  * in the after attribute name state.

  2271                                  */

  2272                                 continue;

  2273                             case '/':

  2274                                 /*

  2275                                  * U+002F SOLIDUS (/) Switch to the self-closing

  2276                                  * start tag state.

  2277                                  */

  2278                                 addAttributeWithoutValue();

  2279                                 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);

  2280                                 continue stateloop;

  2281                             case '=':

  2282                                 /*

  2283                                  * U+003D EQUALS SIGN (=) Switch to the before

  2284                                  * attribute value state.

  2285                                  */

  2286                                 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);

  2287                                 continue stateloop;

  2288                             case '>':

  2289                                 /*

  2290                                  * U+003E GREATER-THAN SIGN (>) Emit the current

  2291                                  * tag token.

  2292                                  */

  2293                                 addAttributeWithoutValue();

  2294                                 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);

  2295                                 if (shouldSuspend) {

  2296                                     break stateloop;

  2297                                 }

  2298                                 /*

  2299                                  * Switch to the data state.

  2300                                  */

  2301                                 continue stateloop;

  2302                             case '\u0000':

  2303                                 c = '\uFFFD';

  2304                                 // fall thru

  2305                             case '\"':

  2306                             case '\'':

  2307                             case '<':

  2308                                 errQuoteOrLtInAttributeNameOrNull(c);

  2309                                 /*

  2310                                  * Treat it as per the "anything else" entry

  2311                                  * below.

  2312                                  */

  2313                             default:

  2314                                 addAttributeWithoutValue();

  2315                                 /*

  2316                                  * Anything else Start a new attribute in the

  2317                                  * current tag token.

  2318                                  */

  2319                                 if (c >= 'A' && c <= 'Z') {

  2320                                     /*

  2321                                      * U+0041 LATIN CAPITAL LETTER A through to

  2322                                      * U+005A LATIN CAPITAL LETTER Z Set that

  2323                                      * attribute's name to the lowercase version

  2324                                      * of the current input character (add

  2325                                      * 0x0020 to the character's code point)

  2326                                      */

  2327                                     c += 0x20;

  2328                                 }

  2329                                 /*

  2330                                  * Set that attribute's name to the current

  2331                                  * input character,

  2332                                  */

  2333                                 clearStrBufAndAppend(c);

  2334                                 /*

  2335                                  * and its value to the empty string.

  2336                                  */

  2337                                 // Will do later.

  2338                                 /*

  2339                                  * Switch to the attribute name state.

  2340                                  */

  2341                                 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);

  2342                                 continue stateloop;

  2343                         }

  2344                     }

  2345                     // XXX reorder point

  2346                 case MARKUP_DECLARATION_OPEN:

  2347                     markupdeclarationopenloop: for (;;) {

  2348                         if (++pos == endPos) {

  2349                             break stateloop;

  2350                         }

  2351                         c = checkChar(buf, pos);

  2352                         /*

  2353                          * If the next two characters are both U+002D

  2354                          * HYPHEN-MINUS characters (-), consume those two

  2355                          * characters, create a comment token whose data is the

  2356                          * empty string, and switch to the comment start state.

  2357                          *

  2358                          * Otherwise, if the next seven characters are an ASCII

  2359                          * case-insensitive match for the word "DOCTYPE", then

  2360                          * consume those characters and switch to the DOCTYPE

  2361                          * state.

  2362                          *

  2363                          * Otherwise, if the insertion mode is

  2364                          * "in foreign content" and the current node is not an

  2365                          * element in the HTML namespace and the next seven

  2366                          * characters are a case-sensitive match for the string

  2367                          * "[CDATA[" (the five uppercase letters "CDATA" with a

  2368                          * U+005B LEFT SQUARE BRACKET character before and

  2369                          * after), then consume those characters and switch to

  2370                          * the CDATA section state.

  2371                          *

  2372                          * Otherwise, it is a parse error. Switch to the bogus

  2373                          * comment state. The next character that is consumed,

  2374                          * if any, is the first character that will be in the

  2375                          * comment.

  2376                          */

  2377                         switch (c) {

  2378                             case '-':

  2379                                 clearLongStrBufAndAppend(c);

  2380                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);

  2381                                 break markupdeclarationopenloop;

  2382                             // continue stateloop;

  2383                             case 'd':

  2384                             case 'D':

  2385                                 clearLongStrBufAndAppend(c);

  2386                                 index = 0;

  2387                                 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);

  2388                                 continue stateloop;

  2389                             case '[':

  2390                                 if (tokenHandler.cdataSectionAllowed()) {

  2391                                     clearLongStrBufAndAppend(c);

  2392                                     index = 0;

  2393                                     state = transition(state, Tokenizer.CDATA_START, reconsume, pos);

  2394                                     continue stateloop;

  2395                                 }

  2396                                 // else fall through

  2397                             default:

  2398                                 errBogusComment();

  2399                                 clearLongStrBuf();

  2400                                 reconsume = true;

  2401                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);

  2402                                 continue stateloop;

  2403                         }

  2404                     }

  2405                     // FALLTHRU DON'T REORDER

  2406                 case MARKUP_DECLARATION_HYPHEN:

  2407                     markupdeclarationhyphenloop: for (;;) {

  2408                         if (++pos == endPos) {

  2409                             break stateloop;

  2410                         }

  2411                         c = checkChar(buf, pos);

  2412                         switch (c) {

  2413                             case '\u0000':

  2414                                 break stateloop;

  2415                             case '-':

  2416                                 clearLongStrBuf();

  2417                                 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);

  2418                                 break markupdeclarationhyphenloop;

  2419                             // continue stateloop;

  2420                             default:

  2421                                 errBogusComment();

  2422                                 reconsume = true;

  2423                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);

  2424                                 continue stateloop;

  2425                         }

  2426                     }

  2427                     // FALLTHRU DON'T REORDER

  2428                 case COMMENT_START:

  2429                     commentstartloop: for (;;) {

  2430                         if (++pos == endPos) {

  2431                             break stateloop;

  2432                         }

  2433                         c = checkChar(buf, pos);

  2434                         /*

  2435                          * Comment start state

  2436                          *

  2437                          *

  2438                          * Consume the next input character:

  2439                          */

  2440                         switch (c) {

  2441                             case '-':

  2442                                 /*

  2443                                  * U+002D HYPHEN-MINUS (-) Switch to the comment

  2444                                  * start dash state.

  2445                                  */

  2446                                 appendLongStrBuf(c);

  2447                                 state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);

  2448                                 continue stateloop;

  2449                             case '>':

  2450                                 /*

  2451                                  * U+003E GREATER-THAN SIGN (>) Parse error.

  2452                                  */

  2453                                 errPrematureEndOfComment();

  2454                                 /* Emit the comment token. */

  2455                                 emitComment(0, pos);

  2456                                 /*

  2457                                  * Switch to the data state.

  2458                                  */

  2459                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  2460                                 continue stateloop;

  2461                             case '\r':

  2462                                 appendLongStrBufCarriageReturn();

  2463                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);

  2464                                 break stateloop;

  2465                             case '\n':

  2466                                 appendLongStrBufLineFeed();

  2467                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);

  2468                                 break commentstartloop;

  2469                             case '\u0000':

  2470                                 c = '\uFFFD';

  2471                                 // fall thru

  2472                             default:

  2473                                 /*

  2474                                  * Anything else Append the input character to

  2475                                  * the comment token's data.

  2476                                  */

  2477                                 appendLongStrBuf(c);

  2478                                 /*

  2479                                  * Switch to the comment state.

  2480                                  */

  2481                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);

  2482                                 break commentstartloop;

  2483                             // continue stateloop;

  2484                         }

  2485                     }

  2486                     // FALLTHRU DON'T REORDER

  2487                 case COMMENT:

  2488                     commentloop: for (;;) {

  2489                         if (++pos == endPos) {

  2490                             break stateloop;

  2491                         }

  2492                         c = checkChar(buf, pos);

  2493                         /*

  2494                          * Comment state Consume the next input character:

  2495                          */

  2496                         switch (c) {

  2497                             case '-':

  2498                                 /*

  2499                                  * U+002D HYPHEN-MINUS (-) Switch to the comment

  2500                                  * end dash state

  2501                                  */

  2502                                 appendLongStrBuf(c);

  2503                                 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);

  2504                                 break commentloop;

  2505                             // continue stateloop;

  2506                             case '\r':

  2507                                 appendLongStrBufCarriageReturn();

  2508                                 break stateloop;

  2509                             case '\n':

  2510                                 appendLongStrBufLineFeed();

  2511                                 continue;

  2512                             case '\u0000':

  2513                                 c = '\uFFFD';

  2514                                 // fall thru

  2515                             default:

  2516                                 /*

  2517                                  * Anything else Append the input character to

  2518                                  * the comment token's data.

  2519                                  */

  2520                                 appendLongStrBuf(c);

  2521                                 /*

  2522                                  * Stay in the comment state.

  2523                                  */

  2524                                 continue;

  2525                         }

  2526                     }

  2527                     // FALLTHRU DON'T REORDER

  2528                 case COMMENT_END_DASH:

  2529                     commentenddashloop: for (;;) {

  2530                         if (++pos == endPos) {

  2531                             break stateloop;

  2532                         }

  2533                         c = checkChar(buf, pos);

  2534                         /*

  2535                          * Comment end dash state Consume the next input

  2536                          * character:

  2537                          */

  2538                         switch (c) {

  2539                             case '-':

  2540                                 /*

  2541                                  * U+002D HYPHEN-MINUS (-) Switch to the comment

  2542                                  * end state

  2543                                  */

  2544                                 appendLongStrBuf(c);

  2545                                 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);

  2546                                 break commentenddashloop;

  2547                             // continue stateloop;

  2548                             case '\r':

  2549                                 appendLongStrBufCarriageReturn();

  2550                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);

  2551                                 break stateloop;

  2552                             case '\n':

  2553                                 appendLongStrBufLineFeed();

  2554                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);

  2555                                 continue stateloop;

  2556                             case '\u0000':

  2557                                 c = '\uFFFD';

  2558                                 // fall thru

  2559                             default:

  2560                                 /*

  2561                                  * Anything else Append a U+002D HYPHEN-MINUS

  2562                                  * (-) character and the input character to the

  2563                                  * comment token's data.

  2564                                  */

  2565                                 appendLongStrBuf(c);

  2566                                 /*

  2567                                  * Switch to the comment state.

  2568                                  */

  2569                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);

  2570                                 continue stateloop;

  2571                         }

  2572                     }

  2573                     // FALLTHRU DON'T REORDER

  2574                 case COMMENT_END:

  2575                     commentendloop: for (;;) {

  2576                         if (++pos == endPos) {

  2577                             break stateloop;

  2578                         }

  2579                         c = checkChar(buf, pos);

  2580                         /*

  2581                          * Comment end state Consume the next input

  2582                          * character:

  2583                          */

  2584                         switch (c) {

  2585                             case '>':

  2586                                 /*

  2587                                  * U+003E GREATER-THAN SIGN (>) Emit the comment

  2588                                  * token.

  2589                                  */

  2590                                 emitComment(2, pos);

  2591                                 /*

  2592                                  * Switch to the data state.

  2593                                  */

  2594                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  2595                                 continue stateloop;

  2596                             case '-':

  2597                                 /* U+002D HYPHEN-MINUS (-) Parse error. */

  2598                                 /*

  2599                                  * Append a U+002D HYPHEN-MINUS (-) character to

  2600                                  * the comment token's data.

  2601                                  */

  2602                                 adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);

  2603                                 /*

  2604                                  * Stay in the comment end state.

  2605                                  */

  2606                                 continue;

  2607                             case '\r':

  2608                                 adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn();

  2609                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);

  2610                                 break stateloop;

  2611                             case '\n':

  2612                                 adjustDoubleHyphenAndAppendToLongStrBufLineFeed();

  2613                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);

  2614                                 continue stateloop;

  2615                             case '!':

  2616                                 errHyphenHyphenBang();

  2617                                 appendLongStrBuf(c);

  2618                                 state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);

  2619                                 continue stateloop;

  2620                             case '\u0000':

  2621                                 c = '\uFFFD';

  2622                                 // fall thru

  2623                             default:

  2624                                 /*

  2625                                  * Append two U+002D HYPHEN-MINUS (-) characters

  2626                                  * and the input character to the comment

  2627                                  * token's data.

  2628                                  */

  2629                                 adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);

  2630                                 /*

  2631                                  * Switch to the comment state.

  2632                                  */

  2633                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);

  2634                                 continue stateloop;

  2635                         }

  2636                     }

  2637                     // XXX reorder point

  2638                 case COMMENT_END_BANG:

  2639                     for (;;) {

  2640                         if (++pos == endPos) {

  2641                             break stateloop;

  2642                         }

  2643                         c = checkChar(buf, pos);

  2644                         /*

  2645                          * Comment end bang state

  2646                          *

  2647                          * Consume the next input character:

  2648                          */

  2649                         switch (c) {

  2650                             case '>':

  2651                                 /*

  2652                                  * U+003E GREATER-THAN SIGN (>) Emit the comment

  2653                                  * token.

  2654                                  */

  2655                                 emitComment(3, pos);

  2656                                 /*

  2657                                  * Switch to the data state.

  2658                                  */

  2659                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  2660                                 continue stateloop;

  2661                             case '-':

  2662                                 /*

  2663                                  * Append two U+002D HYPHEN-MINUS (-) characters

  2664                                  * and a U+0021 EXCLAMATION MARK (!) character

  2665                                  * to the comment token's data.

  2666                                  */

  2667                                 appendLongStrBuf(c);

  2668                                 /*

  2669                                  * Switch to the comment end dash state.

  2670                                  */

  2671                                 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);

  2672                                 continue stateloop;

  2673                             case '\r':

  2674                                 appendLongStrBufCarriageReturn();

  2675                                 break stateloop;

  2676                             case '\n':

  2677                                 appendLongStrBufLineFeed();

  2678                                 continue;

  2679                             case '\u0000':

  2680                                 c = '\uFFFD';

  2681                                 // fall thru

  2682                             default:

  2683                                 /*

  2684                                  * Anything else Append two U+002D HYPHEN-MINUS

  2685                                  * (-) characters, a U+0021 EXCLAMATION MARK (!)

  2686                                  * character, and the input character to the

  2687                                  * comment token's data. Switch to the comment

  2688                                  * state.

  2689                                  */

  2690                                 appendLongStrBuf(c);

  2691                                 /*

  2692                                  * Switch to the comment state.

  2693                                  */

  2694                                 state = transition(state, Tokenizer.COMMENT, reconsume, pos);

  2695                                 continue stateloop;

  2696                         }

  2697                     }

  2698                     // XXX reorder point

  2699                 case COMMENT_START_DASH:

  2700                     if (++pos == endPos) {

  2701                         break stateloop;

  2702                     }

  2703                     c = checkChar(buf, pos);

  2704                     /*

  2705                      * Comment start dash state

  2706                      *

  2707                      * Consume the next input character:

  2708                      */

  2709                     switch (c) {

  2710                         case '-':

  2711                             /*

  2712                              * U+002D HYPHEN-MINUS (-) Switch to the comment end

  2713                              * state

  2714                              */

  2715                             appendLongStrBuf(c);

  2716                             state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);

  2717                             continue stateloop;

  2718                         case '>':

  2719                             errPrematureEndOfComment();

  2720                             /* Emit the comment token. */

  2721                             emitComment(1, pos);

  2722                             /*

  2723                              * Switch to the data state.

  2724                              */

  2725                             state = transition(state, Tokenizer.DATA, reconsume, pos);

  2726                             continue stateloop;

  2727                         case '\r':

  2728                             appendLongStrBufCarriageReturn();

  2729                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);

  2730                             break stateloop;

  2731                         case '\n':

  2732                             appendLongStrBufLineFeed();

  2733                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);

  2734                             continue stateloop;

  2735                         case '\u0000':

  2736                             c = '\uFFFD';

  2737                             // fall thru

  2738                         default:

  2739                             /*

  2740                              * Append a U+002D HYPHEN-MINUS character (-) and

  2741                              * the current input character to the comment

  2742                              * token's data.

  2743                              */

  2744                             appendLongStrBuf(c);

  2745                             /*

  2746                              * Switch to the comment state.

  2747                              */

  2748                             state = transition(state, Tokenizer.COMMENT, reconsume, pos);

  2749                             continue stateloop;

  2750                     }

  2751                     // XXX reorder point

  2752                 case CDATA_START:

  2753                     for (;;) {

  2754                         if (++pos == endPos) {

  2755                             break stateloop;

  2756                         }

  2757                         c = checkChar(buf, pos);

  2758                         if (index < 6) { // CDATA_LSQB.length

  2759                             if (c == Tokenizer.CDATA_LSQB[index]) {

  2760                                 appendLongStrBuf(c);

  2761                             } else {

  2762                                 errBogusComment();

  2763                                 reconsume = true;

  2764                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);

  2765                                 continue stateloop;

  2766                             }

  2767                             index++;

  2768                             continue;

  2769                         } else {

  2770                             cstart = pos; // start coalescing

  2771                             reconsume = true;

  2772                             state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);

  2773                             break; // FALL THROUGH continue stateloop;

  2774                         }

  2775                     }

  2776                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  2777                 case CDATA_SECTION:

  2778                     cdatasectionloop: for (;;) {

  2779                         if (reconsume) {

  2780                             reconsume = false;

  2781                         } else {

  2782                             if (++pos == endPos) {

  2783                                 break stateloop;

  2784                             }

  2785                             c = checkChar(buf, pos);

  2786                         }

  2787                         switch (c) {

  2788                             case ']':

  2789                                 flushChars(buf, pos);

  2790                                 state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);

  2791                                 break cdatasectionloop; // FALL THROUGH

  2792                             case '\u0000':

  2793                                 emitReplacementCharacter(buf, pos);

  2794                                 continue;

  2795                             case '\r':

  2796                                 emitCarriageReturn(buf, pos);

  2797                                 break stateloop;

  2798                             case '\n':

  2799                                 silentLineFeed();

  2800                                 // fall thru

  2801                             default:

  2802                                 continue;

  2803                         }

  2804                     }

  2805                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  2806                 case CDATA_RSQB:

  2807                     cdatarsqb: for (;;) {

  2808                         if (++pos == endPos) {

  2809                             break stateloop;

  2810                         }

  2811                         c = checkChar(buf, pos);

  2812                         switch (c) {

  2813                             case ']':

  2814                                 state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);

  2815                                 break cdatarsqb;

  2816                             default:

  2817                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0,

  2818                                         1);

  2819                                 cstart = pos;

  2820                                 reconsume = true;

  2821                                 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);

  2822                                 continue stateloop;

  2823                         }

  2824                     }

  2825                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  2826                 case CDATA_RSQB_RSQB:

  2827                     cdatarsqbrsqb: for (;;) {

  2828                         if (++pos == endPos) {

  2829                             break stateloop;

  2830                         }

  2831                         c = checkChar(buf, pos);

  2832                         switch (c) {

  2833                             case ']':

  2834                                 // Saw a third ]. Emit one ] (logically the

  2835                                 // first one) and stay in this state to

  2836                                 // remember that the last two characters seen

  2837                                 // have been ]].

  2838                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);

  2839                                 continue;

  2840                             case '>':

  2841                                 cstart = pos + 1;

  2842                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  2843                                 continue stateloop;

  2844                             default:

  2845                                 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);

  2846                                 cstart = pos;

  2847                                 reconsume = true;

  2848                                 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);

  2849                                 continue stateloop;

  2850                         }

  2851                     }

  2852                     // XXX reorder point

  2853                 case ATTRIBUTE_VALUE_SINGLE_QUOTED:

  2854                     attributevaluesinglequotedloop: for (;;) {

  2855                         if (reconsume) {

  2856                             reconsume = false;

  2857                         } else {

  2858                             if (++pos == endPos) {

  2859                                 break stateloop;

  2860                             }

  2861                             c = checkChar(buf, pos);

  2862                         }

  2863                         /*

  2864                          * Consume the next input character:

  2865                          */

  2866                         switch (c) {

  2867                             case '\'':

  2868                                 /*

  2869                                  * U+0027 APOSTROPHE (') Switch to the after

  2870                                  * attribute value (quoted) state.

  2871                                  */

  2872                                 addAttributeWithValue();

  2874                                 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);

  2875                                 continue stateloop;

  2876                             case '&':

  2877                                 /*

  2878                                  * U+0026 AMPERSAND (&) Switch to the character

  2879                                  * reference in attribute value state, with the

  2880                                  * additional allowed character being U+0027

  2881                                  * APOSTROPHE (').

  2882                                  */

  2883                                 clearStrBufAndAppend(c);

  2884                                 setAdditionalAndRememberAmpersandLocation('\'');

  2885                                 returnState = state;

  2886                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);

  2887                                 break attributevaluesinglequotedloop;

  2888                             // continue stateloop;

  2889                             case '\r':

  2890                                 appendLongStrBufCarriageReturn();

  2891                                 break stateloop;

  2892                             case '\n':

  2893                                 appendLongStrBufLineFeed();

  2894                                 continue;

  2895                             case '\u0000':

  2896                                 c = '\uFFFD';

  2897                                 // fall thru

  2898                             default:

  2899                                 /*

  2900                                  * Anything else Append the current input

  2901                                  * character to the current attribute's value.

  2902                                  */

  2903                                 appendLongStrBuf(c);

  2904                                 /*

  2905                                  * Stay in the attribute value (single-quoted)

  2906                                  * state.

  2907                                  */

  2908                                 continue;

  2909                         }

  2910                     }

  2911                     // FALLTHRU DON'T REORDER

  2912                 case CONSUME_CHARACTER_REFERENCE:

  2913                     if (++pos == endPos) {

  2914                         break stateloop;

  2915                     }

  2916                     c = checkChar(buf, pos);

  2917                     if (c == '\u0000') {

  2918                         break stateloop;

  2919                     }

  2920                     /*

  2921                      * Unlike the definition in the spec, this state does not

  2922                      * return a value and never requires the caller to

  2923                      * backtrack. This state takes care of emitting characters

  2924                      * or appending to the current attribute value. It also

  2925                      * takes care of that in the case when consuming the

  2926                      * character reference fails.

  2927                      */

  2928                     /*

  2929                      * This section defines how to consume a character

  2930                      * reference. This definition is used when parsing character

  2931                      * references in text and in attributes.

  2932                      *

  2933                      * The behavior depends on the identity of the next

  2934                      * character (the one immediately after the U+0026 AMPERSAND

  2935                      * character):

  2936                      */

  2937                     switch (c) {

  2938                         case ' ':

  2939                         case '\t':

  2940                         case '\n':

  2941                         case '\r': // we'll reconsume!

  2942                         case '\u000C':

  2943                         case '<':

  2944                         case '&':

  2945                             emitOrAppendStrBuf(returnState);

  2946                             if ((returnState & DATA_AND_RCDATA_MASK) == 0) {

  2947                                 cstart = pos;

  2948                             }

  2949                             reconsume = true;

  2950                             state = transition(state, returnState, reconsume, pos);

  2951                             continue stateloop;

  2952                         case '#':

  2953                             /*

  2954                              * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER

  2955                              * SIGN.

  2956                              */

  2957                             appendStrBuf('#');

  2958                             state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);

  2959                             continue stateloop;

  2960                         default:

  2961                             if (c == additional) {

  2962                                 emitOrAppendStrBuf(returnState);

  2963                                 reconsume = true;

  2964                                 state = transition(state, returnState, reconsume, pos);

  2965                                 continue stateloop;

  2966                             }

  2967                             if (c >= 'a' && c <= 'z') {

  2968                                 firstCharKey = c - 'a' + 26;

  2969                             } else if (c >= 'A' && c <= 'Z') {

  2970                                 firstCharKey = c - 'A';

  2971                             } else {

  2972                                 // No match

  2973                                 /*

  2974                                  * If no match can be made, then this is a parse

  2975                                  * error.

  2976                                  */

  2977                                 errNoNamedCharacterMatch();

  2978                                 emitOrAppendStrBuf(returnState);

  2979                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {

  2980                                     cstart = pos;

  2981                                 }

  2982                                 reconsume = true;

  2983                                 state = transition(state, returnState, reconsume, pos);

  2984                                 continue stateloop;

  2985                             }

  2986                             // Didn't fail yet

  2987                             appendStrBuf(c);

  2988                             state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);

  2989                             // FALL THROUGH continue stateloop;

  2990                     }

  2991                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  2992                 case CHARACTER_REFERENCE_HILO_LOOKUP:

  2993                     {

  2994                         if (++pos == endPos) {

  2995                             break stateloop;

  2996                         }

  2997                         c = checkChar(buf, pos);

  2998                         if (c == '\u0000') {

  2999                             break stateloop;

  3000                         }

  3001                         /*

  3002                          * The data structure is as follows:

  3003                          *

  3004                          * HILO_ACCEL is a two-dimensional int array whose major

  3005                          * index corresponds to the second character of the

  3006                          * character reference (code point as index) and the

  3007                          * minor index corresponds to the first character of the

  3008                          * character reference (packed so that A-Z runs from 0

  3009                          * to 25 and a-z runs from 26 to 51). This layout makes

  3010                          * it easier to use the sparseness of the data structure

  3011                          * to omit parts of it: The second dimension of the

  3012                          * table is null when no character reference starts with

  3013                          * the character corresponding to that row.

  3014                          *

  3015                          * The int value HILO_ACCEL (by these indices) is zero

  3016                          * if there exists no character reference starting with

  3017                          * that two-letter prefix. Otherwise, the value is an

  3018                          * int that packs two shorts so that the higher short is

  3019                          * the index of the highest character reference name

  3020                          * with that prefix in NAMES and the lower short

  3021                          * corresponds to the index of the lowest character

  3022                          * reference name with that prefix. (It happens that the

  3023                          * first two character reference names share their

  3024                          * prefix so the packed int cannot be 0 by packing the

  3025                          * two shorts.)

  3026                          *

  3027                          * NAMES is an array of byte arrays where each byte

  3028                          * array encodes the name of a character reference as

  3029                          * ASCII. The names omit the first two letters of the

  3030                          * name. (Since storing the first two letters would be

  3031                          * redundant with the data contained in HILO_ACCEL.) The

  3032                          * entries are lexically sorted.

  3033                          *

  3034                          * For a given index in NAMES, the same index in VALUES

  3035                          * contains the corresponding expansion as an array of

  3036                          * two UTF-16 code units (either the character and

  3037                          * U+0000 or a surrogate pair).

  3038                          */

  3039                         int hilo = 0;

  3040                         if (c <= 'z') {

  3041                             @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];

  3042                             if (row != null) {

  3043                                 hilo = row[firstCharKey];

  3044                             }

  3045                         }

  3046                         if (hilo == 0) {

  3047                             /*

  3048                              * If no match can be made, then this is a parse

  3049                              * error.

  3050                              */

  3051                             errNoNamedCharacterMatch();

  3052                             emitOrAppendStrBuf(returnState);

  3053                             if ((returnState & DATA_AND_RCDATA_MASK) == 0) {

  3054                                 cstart = pos;

  3055                             }

  3056                             reconsume = true;

  3057                             state = transition(state, returnState, reconsume, pos);

  3058                             continue stateloop;

  3059                         }

  3060                         // Didn't fail yet

  3061                         appendStrBuf(c);

  3062                         lo = hilo & 0xFFFF;

  3063                         hi = hilo >> 16;

  3064                         entCol = -1;

  3065                         candidate = -1;

  3066                         strBufMark = 0;

  3067                         state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);

  3068                         // FALL THROUGH continue stateloop;

  3069                     }

  3070                 case CHARACTER_REFERENCE_TAIL:

  3071                     outer: for (;;) {

  3072                         if (++pos == endPos) {

  3073                             break stateloop;

  3074                         }

  3075                         c = checkChar(buf, pos);

  3076                         if (c == '\u0000') {

  3077                             break stateloop;

  3078                         }

  3079                         entCol++;

  3080                         /*

  3081                          * Consume the maximum number of characters possible,

  3082                          * with the consumed characters matching one of the

  3083                          * identifiers in the first column of the named

  3084                          * character references table (in a case-sensitive

  3085                          * manner).

  3086                          */

  3087                         loloop: for (;;) {

  3088                             if (hi < lo) {

  3089                                 break outer;

  3090                             }

  3091                             if (entCol == NamedCharacters.NAMES[lo].length()) {

  3092                                 candidate = lo;

  3093                                 strBufMark = strBufLen;

  3094                                 lo++;

  3095                             } else if (entCol > NamedCharacters.NAMES[lo].length()) {

  3096                                 break outer;

  3097                             } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {

  3098                                 lo++;

  3099                             } else {

  3100                                 break loloop;

  3101                             }

  3102                         }

  3104                         hiloop: for (;;) {

  3105                             if (hi < lo) {

  3106                                 break outer;

  3107                             }

  3108                             if (entCol == NamedCharacters.NAMES[hi].length()) {

  3109                                 break hiloop;

  3110                             }

  3111                             if (entCol > NamedCharacters.NAMES[hi].length()) {

  3112                                 break outer;

  3113                             } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {

  3114                                 hi--;

  3115                             } else {

  3116                                 break hiloop;

  3117                             }

  3118                         }

  3120                         if (c == ';') {

  3121                             // If we see a semicolon, there cannot be a

  3122                             // longer match. Break the loop. However, before

  3123                             // breaking, take the longest match so far as the

  3124                             // candidate, if we are just about to complete a

  3125                             // match.

  3126                             if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {

  3127                                 candidate = lo;

  3128                                 strBufMark = strBufLen;

  3129                             }

  3130                             break outer;

  3131                         }

  3133                         if (hi < lo) {

  3134                             break outer;

  3135                         }

  3136                         appendStrBuf(c);

  3137                         continue;

  3138                     }

  3140                     if (candidate == -1) {

  3141                         // reconsume deals with CR, LF or nul

  3142                         /*

  3143                          * If no match can be made, then this is a parse error.

  3144                          */

  3145                         errNoNamedCharacterMatch();

  3146                         emitOrAppendStrBuf(returnState);

  3147                         if ((returnState & DATA_AND_RCDATA_MASK) == 0) {

  3148                             cstart = pos;

  3149                         }

  3150                         reconsume = true;

  3151                         state = transition(state, returnState, reconsume, pos);

  3152                         continue stateloop;

  3153                     } else {

  3154                         // c can't be CR, LF or nul if we got here

  3155                         @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];

  3156                         if (candidateName.length() == 0

  3157                                 || candidateName.charAt(candidateName.length() - 1) != ';') {

  3158                             /*

  3159                              * If the last character matched is not a U+003B

  3160                              * SEMICOLON (;), there is a parse error.

  3161                              */

  3162                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {

  3163                                 /*

  3164                                  * If the entity is being consumed as part of an

  3165                                  * attribute, and the last character matched is

  3166                                  * not a U+003B SEMICOLON (;),

  3167                                  */

  3168                                 char ch;

  3169                                 if (strBufMark == strBufLen) {

  3170                                     ch = c;

  3171                                 } else {

  3172                                     // if (strBufOffset != -1) {

  3173                                     // ch = buf[strBufOffset + strBufMark];

  3174                                     // } else {

  3175                                     ch = strBuf[strBufMark];

  3176                                     // }

  3177                                 }

  3178                                 if (ch == '=' || (ch >= '0' && ch <= '9')

  3179                                         || (ch >= 'A' && ch <= 'Z')

  3180                                         || (ch >= 'a' && ch <= 'z')) {

  3181                                     /*

  3182                                      * and the next character is either a U+003D

  3183                                      * EQUALS SIGN character (=) or in the range

  3184                                      * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,

  3185                                      * U+0041 LATIN CAPITAL LETTER A to U+005A

  3186                                      * LATIN CAPITAL LETTER Z, or U+0061 LATIN

  3187                                      * SMALL LETTER A to U+007A LATIN SMALL

  3188                                      * LETTER Z, then, for historical reasons,

  3189                                      * all the characters that were matched

  3190                                      * after the U+0026 AMPERSAND (&) must be

  3191                                      * unconsumed, and nothing is returned.

  3192                                      */

  3193                                     errNoNamedCharacterMatch();

  3194                                     appendStrBufToLongStrBuf();

  3195                                     reconsume = true;

  3196                                     state = transition(state, returnState, reconsume, pos);

  3197                                     continue stateloop;

  3198                                 }

  3199                             }

  3200                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {

  3201                                 errUnescapedAmpersandInterpretedAsCharacterReference();

  3202                             } else {

  3203                                 errNotSemicolonTerminated();

  3204                             }

  3205                         }

  3207                         /*

  3208                          * Otherwise, return a character token for the character

  3209                          * corresponding to the entity name (as given by the

  3210                          * second column of the named character references

  3211                          * table).

  3212                          */

  3213                         // CPPONLY: completedNamedCharacterReference();

  3214                         @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];

  3215                         if (

  3216                         // [NOCPP[

  3217                         val.length == 1

  3218                         // ]NOCPP]

  3219                         // CPPONLY: val[1] == 0

  3220                         ) {

  3221                             emitOrAppendOne(val, returnState);

  3222                         } else {

  3223                             emitOrAppendTwo(val, returnState);

  3224                         }

  3225                         // this is so complicated!

  3226                         if (strBufMark < strBufLen) {

  3227                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {

  3228                                 for (int i = strBufMark; i < strBufLen; i++) {

  3229                                     appendLongStrBuf(strBuf[i]);

  3230                                 }

  3231                             } else {

  3232                                 tokenHandler.characters(strBuf, strBufMark,

  3233                                         strBufLen - strBufMark);

  3234                             }

  3235                         }

  3236                         // Check if we broke out early with c being the last

  3237                         // character that matched as opposed to being the

  3238                         // first one that didn't match. In the case of an

  3239                         // early break, the next run on text should start

  3240                         // *after* the current character and the current

  3241                         // character shouldn't be reconsumed.

  3242                         boolean earlyBreak = (c == ';' && strBufMark == strBufLen);

  3243                         if ((returnState & DATA_AND_RCDATA_MASK) == 0) {

  3244                             cstart = earlyBreak ? pos + 1 : pos;

  3245                         }

  3246                         reconsume = !earlyBreak;

  3247                         state = transition(state, returnState, reconsume, pos);

  3248                         continue stateloop;

  3249                         /*

  3250                          * If the markup contains I'm &notit; I tell you, the

  3251                          * entity is parsed as "not", as in, I'm ¬it; I tell

  3252                          * you. But if the markup was I'm &notin; I tell you,

  3253                          * the entity would be parsed as "notin;", resulting in

  3254                          * I'm ∉ I tell you.

  3255                          */

  3256                     }

  3257                     // XXX reorder point

  3258                 case CONSUME_NCR:

  3259                     if (++pos == endPos) {

  3260                         break stateloop;

  3261                     }

  3262                     c = checkChar(buf, pos);

  3263                     prevValue = -1;

  3264                     value = 0;

  3265                     seenDigits = false;

  3266                     /*

  3267                      * The behavior further depends on the character after the

  3268                      * U+0023 NUMBER SIGN:

  3269                      */

  3270                     switch (c) {

  3271                         case 'x':

  3272                         case 'X':

  3274                             /*

  3275                              * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL

  3276                              * LETTER X Consume the X.

  3277                              *

  3278                              * Follow the steps below, but using the range of

  3279                              * characters U+0030 DIGIT ZERO through to U+0039

  3280                              * DIGIT NINE, U+0061 LATIN SMALL LETTER A through

  3281                              * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN

  3282                              * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL

  3283                              * LETTER F (in other words, 0-9, A-F, a-f).

  3284                              *

  3285                              * When it comes to interpreting the number,

  3286                              * interpret it as a hexadecimal number.

  3287                              */

  3288                             appendStrBuf(c);

  3289                             state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);

  3290                             continue stateloop;

  3291                         default:

  3292                             /*

  3293                              * Anything else Follow the steps below, but using

  3294                              * the range of characters U+0030 DIGIT ZERO through

  3295                              * to U+0039 DIGIT NINE (i.e. just 0-9).

  3296                              *

  3297                              * When it comes to interpreting the number,

  3298                              * interpret it as a decimal number.

  3299                              */

  3300                             reconsume = true;

  3301                             state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);

  3302                             // FALL THROUGH continue stateloop;

  3303                     }

  3304                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  3305                 case DECIMAL_NRC_LOOP:

  3306                     decimalloop: for (;;) {

  3307                         if (reconsume) {

  3308                             reconsume = false;

  3309                         } else {

  3310                             if (++pos == endPos) {

  3311                                 break stateloop;

  3312                             }

  3313                             c = checkChar(buf, pos);

  3314                         }

  3315                         // Deal with overflow gracefully

  3316                         if (value < prevValue) {

  3317                             value = 0x110000; // Value above Unicode range but

  3318                             // within int

  3319                             // range

  3320                         }

  3321                         prevValue = value;

  3322                         /*

  3323                          * Consume as many characters as match the range of

  3324                          * characters given above.

  3325                          */

  3326                         if (c >= '0' && c <= '9') {

  3327                             seenDigits = true;

  3328                             value *= 10;

  3329                             value += c - '0';

  3330                             continue;

  3331                         } else if (c == ';') {

  3332                             if (seenDigits) {

  3333                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {

  3334                                     cstart = pos + 1;

  3335                                 }

  3336                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);

  3337                                 // FALL THROUGH continue stateloop;

  3338                                 break decimalloop;

  3339                             } else {

  3340                                 errNoDigitsInNCR();

  3341                                 appendStrBuf(';');

  3342                                 emitOrAppendStrBuf(returnState);

  3343                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {

  3344                                     cstart = pos + 1;

  3345                                 }

  3346                                 state = transition(state, returnState, reconsume, pos);

  3347                                 continue stateloop;

  3348                             }

  3349                         } else {

  3350                             /*

  3351                              * If no characters match the range, then don't

  3352                              * consume any characters (and unconsume the U+0023

  3353                              * NUMBER SIGN character and, if appropriate, the X

  3354                              * character). This is a parse error; nothing is

  3355                              * returned.

  3356                              *

  3357                              * Otherwise, if the next character is a U+003B

  3358                              * SEMICOLON, consume that too. If it isn't, there

  3359                              * is a parse error.

  3360                              */

  3361                             if (!seenDigits) {

  3362                                 errNoDigitsInNCR();

  3363                                 emitOrAppendStrBuf(returnState);

  3364                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {

  3365                                     cstart = pos;

  3366                                 }

  3367                                 reconsume = true;

  3368                                 state = transition(state, returnState, reconsume, pos);

  3369                                 continue stateloop;

  3370                             } else {

  3371                                 errCharRefLacksSemicolon();

  3372                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {

  3373                                     cstart = pos;

  3374                                 }

  3375                                 reconsume = true;

  3376                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);

  3377                                 // FALL THROUGH continue stateloop;

  3378                                 break decimalloop;

  3379                             }

  3380                         }

  3381                     }

  3382                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  3383                 case HANDLE_NCR_VALUE:

  3384                     // WARNING previous state sets reconsume

  3385                     // XXX inline this case if the method size can take it

  3386                     handleNcrValue(returnState);

  3387                     state = transition(state, returnState, reconsume, pos);

  3388                     continue stateloop;

  3389                     // XXX reorder point

  3390                 case HEX_NCR_LOOP:

  3391                     for (;;) {

  3392                         if (++pos == endPos) {

  3393                             break stateloop;

  3394                         }

  3395                         c = checkChar(buf, pos);

  3396                         // Deal with overflow gracefully

  3397                         if (value < prevValue) {

  3398                             value = 0x110000; // Value above Unicode range but

  3399                             // within int

  3400                             // range

  3401                         }

  3402                         prevValue = value;

  3403                         /*

  3404                          * Consume as many characters as match the range of

  3405                          * characters given above.

  3406                          */

  3407                         if (c >= '0' && c <= '9') {

  3408                             seenDigits = true;

  3409                             value *= 16;

  3410                             value += c - '0';

  3411                             continue;

  3412                         } else if (c >= 'A' && c <= 'F') {

  3413                             seenDigits = true;

  3414                             value *= 16;

  3415                             value += c - 'A' + 10;

  3416                             continue;

  3417                         } else if (c >= 'a' && c <= 'f') {

  3418                             seenDigits = true;

  3419                             value *= 16;

  3420                             value += c - 'a' + 10;

  3421                             continue;

  3422                         } else if (c == ';') {

  3423                             if (seenDigits) {

  3424                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {

  3425                                     cstart = pos + 1;

  3426                                 }

  3427                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);

  3428                                 continue stateloop;

  3429                             } else {

  3430                                 errNoDigitsInNCR();

  3431                                 appendStrBuf(';');

  3432                                 emitOrAppendStrBuf(returnState);

  3433                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {

  3434                                     cstart = pos + 1;

  3435                                 }

  3436                                 state = transition(state, returnState, reconsume, pos);

  3437                                 continue stateloop;

  3438                             }

  3439                         } else {

  3440                             /*

  3441                              * If no characters match the range, then don't

  3442                              * consume any characters (and unconsume the U+0023

  3443                              * NUMBER SIGN character and, if appropriate, the X

  3444                              * character). This is a parse error; nothing is

  3445                              * returned.

  3446                              *

  3447                              * Otherwise, if the next character is a U+003B

  3448                              * SEMICOLON, consume that too. If it isn't, there

  3449                              * is a parse error.

  3450                              */

  3451                             if (!seenDigits) {

  3452                                 errNoDigitsInNCR();

  3453                                 emitOrAppendStrBuf(returnState);

  3454                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {

  3455                                     cstart = pos;

  3456                                 }

  3457                                 reconsume = true;

  3458                                 state = transition(state, returnState, reconsume, pos);

  3459                                 continue stateloop;

  3460                             } else {

  3461                                 errCharRefLacksSemicolon();

  3462                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {

  3463                                     cstart = pos;

  3464                                 }

  3465                                 reconsume = true;

  3466                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);

  3467                                 continue stateloop;

  3468                             }

  3469                         }

  3470                     }

  3471                     // XXX reorder point

  3472                 case PLAINTEXT:

  3473                     plaintextloop: for (;;) {

  3474                         if (reconsume) {

  3475                             reconsume = false;

  3476                         } else {

  3477                             if (++pos == endPos) {

  3478                                 break stateloop;

  3479                             }

  3480                             c = checkChar(buf, pos);

  3481                         }

  3482                         switch (c) {

  3483                             case '\u0000':

  3484                                 emitPlaintextReplacementCharacter(buf, pos);

  3485                                 continue;

  3486                             case '\r':

  3487                                 emitCarriageReturn(buf, pos);

  3488                                 break stateloop;

  3489                             case '\n':

  3490                                 silentLineFeed();

  3491                             default:

  3492                                 /*

  3493                                  * Anything else Emit the current input

  3494                                  * character as a character token. Stay in the

  3495                                  * PLAINTEXT state.

  3496                                  */

  3497                                 continue;

  3498                         }

  3499                     }

  3500                     // XXX reorder point

  3501                 case CLOSE_TAG_OPEN:

  3502                     if (++pos == endPos) {

  3503                         break stateloop;

  3504                     }

  3505                     c = checkChar(buf, pos);

  3506                     /*

  3507                      * Otherwise, if the content model flag is set to the PCDATA

  3508                      * state, or if the next few characters do match that tag

  3509                      * name, consume the next input character:

  3510                      */

  3511                     switch (c) {

  3512                         case '>':

  3513                             /* U+003E GREATER-THAN SIGN (>) Parse error. */

  3514                             errLtSlashGt();

  3515                             /*

  3516                              * Switch to the data state.

  3517                              */

  3518                             cstart = pos + 1;

  3519                             state = transition(state, Tokenizer.DATA, reconsume, pos);

  3520                             continue stateloop;

  3521                         case '\r':

  3522                             silentCarriageReturn();

  3523                             /* Anything else Parse error. */

  3524                             errGarbageAfterLtSlash();

  3525                             /*

  3526                              * Switch to the bogus comment state.

  3527                              */

  3528                             clearLongStrBufAndAppend('\n');

  3529                             state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);

  3530                             break stateloop;

  3531                         case '\n':

  3532                             silentLineFeed();

  3533                             /* Anything else Parse error. */

  3534                             errGarbageAfterLtSlash();

  3535                             /*

  3536                              * Switch to the bogus comment state.

  3537                              */

  3538                             clearLongStrBufAndAppend('\n');

  3539                             state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);

  3540                             continue stateloop;

  3541                         case '\u0000':

  3542                             c = '\uFFFD';

  3543                             // fall thru

  3544                         default:

  3545                             if (c >= 'A' && c <= 'Z') {

  3546                                 c += 0x20;

  3547                             }

  3548                             if (c >= 'a' && c <= 'z') {

  3549                                 /*

  3550                                  * U+0061 LATIN SMALL LETTER A through to U+007A

  3551                                  * LATIN SMALL LETTER Z Create a new end tag

  3552                                  * token,

  3553                                  */

  3554                                 endTag = true;

  3555                                 /*

  3556                                  * set its tag name to the input character,

  3557                                  */

  3558                                 clearStrBufAndAppend(c);

  3559                                 /*

  3560                                  * then switch to the tag name state. (Don't

  3561                                  * emit the token yet; further details will be

  3562                                  * filled in before it is emitted.)

  3563                                  */

  3564                                 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);

  3565                                 continue stateloop;

  3566                             } else {

  3567                                 /* Anything else Parse error. */

  3568                                 errGarbageAfterLtSlash();

  3569                                 /*

  3570                                  * Switch to the bogus comment state.

  3571                                  */

  3572                                 clearLongStrBufAndAppend(c);

  3573                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);

  3574                                 continue stateloop;

  3575                             }

  3576                     }

  3577                     // XXX reorder point

  3578                 case RCDATA:

  3579                     rcdataloop: for (;;) {

  3580                         if (reconsume) {

  3581                             reconsume = false;

  3582                         } else {

  3583                             if (++pos == endPos) {

  3584                                 break stateloop;

  3585                             }

  3586                             c = checkChar(buf, pos);

  3587                         }

  3588                         switch (c) {

  3589                             case '&':

  3590                                 /*

  3591                                  * U+0026 AMPERSAND (&) Switch to the character

  3592                                  * reference in RCDATA state.

  3593                                  */

  3594                                 flushChars(buf, pos);

  3595                                 clearStrBufAndAppend(c);

  3596                                 additional = '\u0000';

  3597                                 returnState = state;

  3598                                 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);

  3599                                 continue stateloop;

  3600                             case '<':

  3601                                 /*

  3602                                  * U+003C LESS-THAN SIGN (<) Switch to the

  3603                                  * RCDATA less-than sign state.

  3604                                  */

  3605                                 flushChars(buf, pos);

  3607                                 returnState = state;

  3608                                 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);

  3609                                 continue stateloop;

  3610                             case '\u0000':

  3611                                 emitReplacementCharacter(buf, pos);

  3612                                 continue;

  3613                             case '\r':

  3614                                 emitCarriageReturn(buf, pos);

  3615                                 break stateloop;

  3616                             case '\n':

  3617                                 silentLineFeed();

  3618                             default:

  3619                                 /*

  3620                                  * Emit the current input character as a

  3621                                  * character token. Stay in the RCDATA state.

  3622                                  */

  3623                                 continue;

  3624                         }

  3625                     }

  3626                     // XXX reorder point

  3627                 case RAWTEXT:

  3628                     rawtextloop: for (;;) {

  3629                         if (reconsume) {

  3630                             reconsume = false;

  3631                         } else {

  3632                             if (++pos == endPos) {

  3633                                 break stateloop;

  3634                             }

  3635                             c = checkChar(buf, pos);

  3636                         }

  3637                         switch (c) {

  3638                             case '<':

  3639                                 /*

  3640                                  * U+003C LESS-THAN SIGN (<) Switch to the

  3641                                  * RAWTEXT less-than sign state.

  3642                                  */

  3643                                 flushChars(buf, pos);

  3645                                 returnState = state;

  3646                                 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);

  3647                                 break rawtextloop;

  3648                             // FALL THRU continue stateloop;

  3649                             case '\u0000':

  3650                                 emitReplacementCharacter(buf, pos);

  3651                                 continue;

  3652                             case '\r':

  3653                                 emitCarriageReturn(buf, pos);

  3654                                 break stateloop;

  3655                             case '\n':

  3656                                 silentLineFeed();

  3657                             default:

  3658                                 /*

  3659                                  * Emit the current input character as a

  3660                                  * character token. Stay in the RAWTEXT state.

  3661                                  */

  3662                                 continue;

  3663                         }

  3664                     }

  3665                     // XXX fallthru don't reorder

  3666                 case RAWTEXT_RCDATA_LESS_THAN_SIGN:

  3667                     rawtextrcdatalessthansignloop: for (;;) {

  3668                         if (++pos == endPos) {

  3669                             break stateloop;

  3670                         }

  3671                         c = checkChar(buf, pos);

  3672                         switch (c) {

  3673                             case '/':

  3674                                 /*

  3675                                  * U+002F SOLIDUS (/) Set the temporary buffer

  3676                                  * to the empty string. Switch to the script

  3677                                  * data end tag open state.

  3678                                  */

  3679                                 index = 0;

  3680                                 clearStrBuf();

  3681                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);

  3682                                 break rawtextrcdatalessthansignloop;

  3683                             // FALL THRU continue stateloop;

  3684                             default:

  3685                                 /*

  3686                                  * Otherwise, emit a U+003C LESS-THAN SIGN

  3687                                  * character token

  3688                                  */

  3689                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);

  3690                                 /*

  3691                                  * and reconsume the current input character in

  3692                                  * the data state.

  3693                                  */

  3694                                 cstart = pos;

  3695                                 reconsume = true;

  3696                                 state = transition(state, returnState, reconsume, pos);

  3697                                 continue stateloop;

  3698                         }

  3699                     }

  3700                     // XXX fall thru. don't reorder.

  3701                 case NON_DATA_END_TAG_NAME:

  3702                     for (;;) {

  3703                         if (++pos == endPos) {

  3704                             break stateloop;

  3705                         }

  3706                         c = checkChar(buf, pos);

  3707                         /*

  3708                          * ASSERT! when entering this state, set index to 0 and

  3709                          * call clearStrBuf() assert (contentModelElement !=

  3710                          * null); Let's implement the above without lookahead.

  3711                          * strBuf is the 'temporary buffer'.

  3712                          */

  3713                         if (index < endTagExpectationAsArray.length) {

  3714                             char e = endTagExpectationAsArray[index];

  3715                             char folded = c;

  3716                             if (c >= 'A' && c <= 'Z') {

  3717                                 folded += 0x20;

  3718                             }

  3719                             if (folded != e) {

  3720                                 // [NOCPP[

  3721                                 errHtml4LtSlashInRcdata(folded);

  3722                                 // ]NOCPP]

  3723                                 tokenHandler.characters(Tokenizer.LT_SOLIDUS,

  3724                                         0, 2);

  3725                                 emitStrBuf();

  3726                                 cstart = pos;

  3727                                 reconsume = true;

  3728                                 state = transition(state, returnState, reconsume, pos);

  3729                                 continue stateloop;

  3730                             }

  3731                             appendStrBuf(c);

  3732                             index++;

  3733                             continue;

  3734                         } else {

  3735                             endTag = true;

  3736                             // XXX replace contentModelElement with different

  3737                             // type

  3738                             tagName = endTagExpectation;

  3739                             switch (c) {

  3740                                 case '\r':

  3741                                     silentCarriageReturn();

  3742                                     state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);

  3743                                     break stateloop;

  3744                                 case '\n':

  3745                                     silentLineFeed();

  3746                                     // fall thru

  3747                                 case ' ':

  3748                                 case '\t':

  3749                                 case '\u000C':

  3750                                     /*

  3751                                      * U+0009 CHARACTER TABULATION U+000A LINE

  3752                                      * FEED (LF) U+000C FORM FEED (FF) U+0020

  3753                                      * SPACE If the current end tag token is an

  3754                                      * appropriate end tag token, then switch to

  3755                                      * the before attribute name state.

  3756                                      */

  3757                                     state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);

  3758                                     continue stateloop;

  3759                                 case '/':

  3760                                     /*

  3761                                      * U+002F SOLIDUS (/) If the current end tag

  3762                                      * token is an appropriate end tag token,

  3763                                      * then switch to the self-closing start tag

  3764                                      * state.

  3765                                      */

  3766                                     state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);

  3767                                     continue stateloop;

  3768                                 case '>':

  3769                                     /*

  3770                                      * U+003E GREATER-THAN SIGN (>) If the

  3771                                      * current end tag token is an appropriate

  3772                                      * end tag token, then emit the current tag

  3773                                      * token and switch to the data state.

  3774                                      */

  3775                                     state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);

  3776                                     if (shouldSuspend) {

  3777                                         break stateloop;

  3778                                     }

  3779                                     continue stateloop;

  3780                                 default:

  3781                                     /*

  3782                                      * Emit a U+003C LESS-THAN SIGN character

  3783                                      * token, a U+002F SOLIDUS character token,

  3784                                      * a character token for each of the

  3785                                      * characters in the temporary buffer (in

  3786                                      * the order they were added to the buffer),

  3787                                      * and reconsume the current input character

  3788                                      * in the RAWTEXT state.

  3789                                      */

  3790                                     // [NOCPP[

  3791                                     errWarnLtSlashInRcdata();

  3792                                     // ]NOCPP]

  3793                                     tokenHandler.characters(

  3794                                             Tokenizer.LT_SOLIDUS, 0, 2);

  3795                                     emitStrBuf();

  3796                                     if (c == '\u0000') {

  3797                                         emitReplacementCharacter(buf, pos);

  3798                                     } else {

  3799                                         cstart = pos; // don't drop the

  3800                                         // character

  3801                                     }

  3802                                     state = transition(state, returnState, reconsume, pos);

  3803                                     continue stateloop;

  3804                             }

  3805                         }

  3806                     }

  3807                     // XXX reorder point

  3808                     // BEGIN HOTSPOT WORKAROUND

  3809                 case BOGUS_COMMENT:

  3810                     boguscommentloop: for (;;) {

  3811                         if (reconsume) {

  3812                             reconsume = false;

  3813                         } else {

  3814                             if (++pos == endPos) {

  3815                                 break stateloop;

  3816                             }

  3817                             c = checkChar(buf, pos);

  3818                         }

  3819                         /*

  3820                          * Consume every character up to and including the first

  3821                          * U+003E GREATER-THAN SIGN character (>) or the end of

  3822                          * the file (EOF), whichever comes first. Emit a comment

  3823                          * token whose data is the concatenation of all the

  3824                          * characters starting from and including the character

  3825                          * that caused the state machine to switch into the

  3826                          * bogus comment state, up to and including the

  3827                          * character immediately before the last consumed

  3828                          * character (i.e. up to the character just before the

  3829                          * U+003E or EOF character). (If the comment was started

  3830                          * by the end of the file (EOF), the token is empty.)

  3831                          *

  3832                          * Switch to the data state.

  3833                          *

  3834                          * If the end of the file was reached, reconsume the EOF

  3835                          * character.

  3836                          */

  3837                         switch (c) {

  3838                             case '>':

  3839                                 emitComment(0, pos);

  3840                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  3841                                 continue stateloop;

  3842                             case '-':

  3843                                 appendLongStrBuf(c);

  3844                                 state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);

  3845                                 break boguscommentloop;

  3846                             case '\r':

  3847                                 appendLongStrBufCarriageReturn();

  3848                                 break stateloop;

  3849                             case '\n':

  3850                                 appendLongStrBufLineFeed();

  3851                                 continue;

  3852                             case '\u0000':

  3853                                 c = '\uFFFD';

  3854                                 // fall thru

  3855                             default:

  3856                                 appendLongStrBuf(c);

  3857                                 continue;

  3858                         }

  3859                     }

  3860                     // FALLTHRU DON'T REORDER

  3861                 case BOGUS_COMMENT_HYPHEN:

  3862                     boguscommenthyphenloop: for (;;) {

  3863                         if (++pos == endPos) {

  3864                             break stateloop;

  3865                         }

  3866                         c = checkChar(buf, pos);

  3867                         switch (c) {

  3868                             case '>':

  3869                                 // [NOCPP[

  3870                                 maybeAppendSpaceToBogusComment();

  3871                                 // ]NOCPP]

  3872                                 emitComment(0, pos);

  3873                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  3874                                 continue stateloop;

  3875                             case '-':

  3876                                 appendSecondHyphenToBogusComment();

  3877                                 continue boguscommenthyphenloop;

  3878                             case '\r':

  3879                                 appendLongStrBufCarriageReturn();

  3880                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);

  3881                                 break stateloop;

  3882                             case '\n':

  3883                                 appendLongStrBufLineFeed();

  3884                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);

  3885                                 continue stateloop;

  3886                             case '\u0000':

  3887                                 c = '\uFFFD';

  3888                                 // fall thru

  3889                             default:

  3890                                 appendLongStrBuf(c);

  3891                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);

  3892                                 continue stateloop;

  3893                         }

  3894                     }

  3895                     // XXX reorder point

  3896                 case SCRIPT_DATA:

  3897                     scriptdataloop: for (;;) {

  3898                         if (reconsume) {

  3899                             reconsume = false;

  3900                         } else {

  3901                             if (++pos == endPos) {

  3902                                 break stateloop;

  3903                             }

  3904                             c = checkChar(buf, pos);

  3905                         }

  3906                         switch (c) {

  3907                             case '<':

  3908                                 /*

  3909                                  * U+003C LESS-THAN SIGN (<) Switch to the

  3910                                  * script data less-than sign state.

  3911                                  */

  3912                                 flushChars(buf, pos);

  3913                                 returnState = state;

  3914                                 state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);

  3915                                 break scriptdataloop; // FALL THRU continue

  3916                             // stateloop;

  3917                             case '\u0000':

  3918                                 emitReplacementCharacter(buf, pos);

  3919                                 continue;

  3920                             case '\r':

  3921                                 emitCarriageReturn(buf, pos);

  3922                                 break stateloop;

  3923                             case '\n':

  3924                                 silentLineFeed();

  3925                             default:

  3926                                 /*

  3927                                  * Anything else Emit the current input

  3928                                  * character as a character token. Stay in the

  3929                                  * script data state.

  3930                                  */

  3931                                 continue;

  3932                         }

  3933                     }

  3934                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  3935                 case SCRIPT_DATA_LESS_THAN_SIGN:

  3936                     scriptdatalessthansignloop: for (;;) {

  3937                         if (++pos == endPos) {

  3938                             break stateloop;

  3939                         }

  3940                         c = checkChar(buf, pos);

  3941                         switch (c) {

  3942                             case '/':

  3943                                 /*

  3944                                  * U+002F SOLIDUS (/) Set the temporary buffer

  3945                                  * to the empty string. Switch to the script

  3946                                  * data end tag open state.

  3947                                  */

  3948                                 index = 0;

  3949                                 clearStrBuf();

  3950                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);

  3951                                 continue stateloop;

  3952                             case '!':

  3953                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);

  3954                                 cstart = pos;

  3955                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);

  3956                                 break scriptdatalessthansignloop; // FALL THRU

  3957                             // continue

  3958                             // stateloop;

  3959                             default:

  3960                                 /*

  3961                                  * Otherwise, emit a U+003C LESS-THAN SIGN

  3962                                  * character token

  3963                                  */

  3964                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);

  3965                                 /*

  3966                                  * and reconsume the current input character in

  3967                                      * the script data state.

  3968                                  */

  3969                                 cstart = pos;

  3970                                 reconsume = true;

  3971                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);

  3972                                 continue stateloop;

  3973                         }

  3974                     }

  3975                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  3976                 case SCRIPT_DATA_ESCAPE_START:

  3977                     scriptdataescapestartloop: for (;;) {

  3978                         if (++pos == endPos) {

  3979                             break stateloop;

  3980                         }

  3981                         c = checkChar(buf, pos);

  3982                         /*

  3983                          * Consume the next input character:

  3984                          */

  3985                         switch (c) {

  3986                             case '-':

  3987                                 /*

  3988                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D

  3989                                  * HYPHEN-MINUS character token. Switch to the

  3990                                  * script data escape start dash state.

  3991                                  */

  3992                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos);

  3993                                 break scriptdataescapestartloop; // FALL THRU

  3994                             // continue

  3995                             // stateloop;

  3996                             default:

  3997                                 /*

  3998                                  * Anything else Reconsume the current input

  3999                                  * character in the script data state.

  4000                                  */

  4001                                 reconsume = true;

  4002                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);

  4003                                 continue stateloop;

  4004                         }

  4005                     }

  4006                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  4007                 case SCRIPT_DATA_ESCAPE_START_DASH:

  4008                     scriptdataescapestartdashloop: for (;;) {

  4009                         if (++pos == endPos) {

  4010                             break stateloop;

  4011                         }

  4012                         c = checkChar(buf, pos);

  4013                         /*

  4014                          * Consume the next input character:

  4015                          */

  4016                         switch (c) {

  4017                             case '-':

  4018                                 /*

  4019                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D

  4020                                  * HYPHEN-MINUS character token. Switch to the

  4021                                  * script data escaped dash dash state.

  4022                                  */

  4023                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);

  4024                                 break scriptdataescapestartdashloop;

  4025                             // continue stateloop;

  4026                             default:

  4027                                 /*

  4028                                  * Anything else Reconsume the current input

  4029                                  * character in the script data state.

  4030                                  */

  4031                                 reconsume = true;

  4032                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);

  4033                                 continue stateloop;

  4034                         }

  4035                     }

  4036                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  4037                 case SCRIPT_DATA_ESCAPED_DASH_DASH:

  4038                     scriptdataescapeddashdashloop: for (;;) {

  4039                         if (++pos == endPos) {

  4040                             break stateloop;

  4041                         }

  4042                         c = checkChar(buf, pos);

  4043                         /*

  4044                          * Consume the next input character:

  4045                          */

  4046                         switch (c) {

  4047                             case '-':

  4048                                 /*

  4049                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D

  4050                                  * HYPHEN-MINUS character token. Stay in the

  4051                                  * script data escaped dash dash state.

  4052                                  */

  4053                                 continue;

  4054                             case '<':

  4055                                 /*

  4056                                  * U+003C LESS-THAN SIGN (<) Switch to the

  4057                                  * script data escaped less-than sign state.

  4058                                  */

  4059                                 flushChars(buf, pos);

  4060                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);

  4061                                 continue stateloop;

  4062                             case '>':

  4063                                 /*

  4064                                  * U+003E GREATER-THAN SIGN (>) Emit a U+003E

  4065                                  * GREATER-THAN SIGN character token. Switch to

  4066                                  * the script data state.

  4067                                  */

  4068                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);

  4069                                 continue stateloop;

  4070                             case '\u0000':

  4071                                 emitReplacementCharacter(buf, pos);

  4072                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);

  4073                                 break scriptdataescapeddashdashloop;

  4074                             case '\r':

  4075                                 emitCarriageReturn(buf, pos);

  4076                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);

  4077                                 break stateloop;

  4078                             case '\n':

  4079                                 silentLineFeed();

  4080                             default:

  4081                                 /*

  4082                                  * Anything else Emit the current input

  4083                                  * character as a character token. Switch to the

  4084                                  * script data escaped state.

  4085                                  */

  4086                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);

  4087                                 break scriptdataescapeddashdashloop;

  4088                             // continue stateloop;

  4089                         }

  4090                     }

  4091                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  4092                 case SCRIPT_DATA_ESCAPED:

  4093                     scriptdataescapedloop: for (;;) {

  4094                         if (reconsume) {

  4095                             reconsume = false;

  4096                         } else {

  4097                             if (++pos == endPos) {

  4098                                 break stateloop;

  4099                             }

  4100                             c = checkChar(buf, pos);

  4101                         }

  4102                         /*

  4103                          * Consume the next input character:

  4104                          */

  4105                         switch (c) {

  4106                             case '-':

  4107                                 /*

  4108                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D

  4109                                  * HYPHEN-MINUS character token. Switch to the

  4110                                  * script data escaped dash state.

  4111                                  */

  4112                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);

  4113                                 break scriptdataescapedloop; // FALL THRU

  4114                             // continue

  4115                             // stateloop;

  4116                             case '<':

  4117                                 /*

  4118                                  * U+003C LESS-THAN SIGN (<) Switch to the

  4119                                  * script data escaped less-than sign state.

  4120                                  */

  4121                                 flushChars(buf, pos);

  4122                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);

  4123                                 continue stateloop;

  4124                             case '\u0000':

  4125                                 emitReplacementCharacter(buf, pos);

  4126                                 continue;

  4127                             case '\r':

  4128                                 emitCarriageReturn(buf, pos);

  4129                                 break stateloop;

  4130                             case '\n':

  4131                                 silentLineFeed();

  4132                             default:

  4133                                 /*

  4134                                  * Anything else Emit the current input

  4135                                  * character as a character token. Stay in the

  4136                                  * script data escaped state.

  4137                                  */

  4138                                 continue;

  4139                         }

  4140                     }

  4141                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  4142                 case SCRIPT_DATA_ESCAPED_DASH:

  4143                     scriptdataescapeddashloop: for (;;) {

  4144                         if (++pos == endPos) {

  4145                             break stateloop;

  4146                         }

  4147                         c = checkChar(buf, pos);

  4148                         /*

  4149                          * Consume the next input character:

  4150                          */

  4151                         switch (c) {

  4152                             case '-':

  4153                                 /*

  4154                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D

  4155                                  * HYPHEN-MINUS character token. Switch to the

  4156                                  * script data escaped dash dash state.

  4157                                  */

  4158                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);

  4159                                 continue stateloop;

  4160                             case '<':

  4161                                 /*

  4162                                  * U+003C LESS-THAN SIGN (<) Switch to the

  4163                                  * script data escaped less-than sign state.

  4164                                  */

  4165                                 flushChars(buf, pos);

  4166                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);

  4167                                 break scriptdataescapeddashloop;

  4168                             // continue stateloop;

  4169                             case '\u0000':

  4170                                 emitReplacementCharacter(buf, pos);

  4171                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);

  4172                                 continue stateloop;

  4173                             case '\r':

  4174                                 emitCarriageReturn(buf, pos);

  4175                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);

  4176                                 break stateloop;

  4177                             case '\n':

  4178                                 silentLineFeed();

  4179                             default:

  4180                                 /*

  4181                                  * Anything else Emit the current input

  4182                                  * character as a character token. Switch to the

  4183                                  * script data escaped state.

  4184                                  */

  4185                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);

  4186                                 continue stateloop;

  4187                         }

  4188                     }

  4189                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  4190                 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:

  4191                     scriptdataescapedlessthanloop: for (;;) {

  4192                         if (++pos == endPos) {

  4193                             break stateloop;

  4194                         }

  4195                         c = checkChar(buf, pos);

  4196                         /*

  4197                          * Consume the next input character:

  4198                          */

  4199                         switch (c) {

  4200                             case '/':

  4201                                 /*

  4202                                  * U+002F SOLIDUS (/) Set the temporary buffer

  4203                                  * to the empty string. Switch to the script

  4204                                  * data escaped end tag open state.

  4205                                  */

  4206                                 index = 0;

  4207                                 clearStrBuf();

  4208                                 returnState = Tokenizer.SCRIPT_DATA_ESCAPED;

  4209                                 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);

  4210                                 continue stateloop;

  4211                             case 'S':

  4212                             case 's':

  4213                                 /*

  4214                                  * U+0041 LATIN CAPITAL LETTER A through to

  4215                                  * U+005A LATIN CAPITAL LETTER Z Emit a U+003C

  4216                                  * LESS-THAN SIGN character token and the

  4217                                  * current input character as a character token.

  4218                                  */

  4219                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);

  4220                                 cstart = pos;

  4221                                 index = 1;

  4222                                 /*

  4223                                  * Set the temporary buffer to the empty string.

  4224                                  * Append the lowercase version of the current

  4225                                  * input character (add 0x0020 to the

  4226                                  * character's code point) to the temporary

  4227                                  * buffer. Switch to the script data double

  4228                                  * escape start state.

  4229                                  */

  4230                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);

  4231                                 break scriptdataescapedlessthanloop;

  4232                             // continue stateloop;

  4233                             default:

  4234                                 /*

  4235                                  * Anything else Emit a U+003C LESS-THAN SIGN

  4236                                  * character token and reconsume the current

  4237                                  * input character in the script data escaped

  4238                                  * state.

  4239                                  */

  4240                                 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);

  4241                                 cstart = pos;

  4242                                 reconsume = true;

  4243                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);

  4244                                 continue stateloop;

  4245                         }

  4246                     }

  4247                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  4248                 case SCRIPT_DATA_DOUBLE_ESCAPE_START:

  4249                     scriptdatadoubleescapestartloop: for (;;) {

  4250                         if (++pos == endPos) {

  4251                             break stateloop;

  4252                         }

  4253                         c = checkChar(buf, pos);

  4254                         assert index > 0;

  4255                         if (index < 6) { // SCRIPT_ARR.length

  4256                             char folded = c;

  4257                             if (c >= 'A' && c <= 'Z') {

  4258                                 folded += 0x20;

  4259                             }

  4260                             if (folded != Tokenizer.SCRIPT_ARR[index]) {

  4261                                 reconsume = true;

  4262                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);

  4263                                 continue stateloop;

  4264                             }

  4265                             index++;

  4266                             continue;

  4267                         }

  4268                         switch (c) {

  4269                             case '\r':

  4270                                 emitCarriageReturn(buf, pos);

  4271                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);

  4272                                 break stateloop;

  4273                             case '\n':

  4274                                 silentLineFeed();

  4275                             case ' ':

  4276                             case '\t':

  4277                             case '\u000C':

  4278                             case '/':

  4279                             case '>':

  4280                                 /*

  4281                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  4282                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE

  4283                                  * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN

  4284                                  * (>) Emit the current input character as a

  4285                                  * character token. If the temporary buffer is

  4286                                  * the string "script", then switch to the

  4287                                  * script data double escaped state.

  4288                                  */

  4289                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);

  4290                                 break scriptdatadoubleescapestartloop;

  4291                             // continue stateloop;

  4292                             default:

  4293                                 /*

  4294                                  * Anything else Reconsume the current input

  4295                                  * character in the script data escaped state.

  4296                                  */

  4297                                 reconsume = true;

  4298                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);

  4299                                 continue stateloop;

  4300                         }

  4301                     }

  4302                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  4303                 case SCRIPT_DATA_DOUBLE_ESCAPED:

  4304                     scriptdatadoubleescapedloop: for (;;) {

  4305                         if (reconsume) {

  4306                             reconsume = false;

  4307                         } else {

  4308                             if (++pos == endPos) {

  4309                                 break stateloop;

  4310                             }

  4311                             c = checkChar(buf, pos);

  4312                         }

  4313                         /*

  4314                          * Consume the next input character:

  4315                          */

  4316                         switch (c) {

  4317                             case '-':

  4318                                 /*

  4319                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D

  4320                                  * HYPHEN-MINUS character token. Switch to the

  4321                                  * script data double escaped dash state.

  4322                                  */

  4323                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);

  4324                                 break scriptdatadoubleescapedloop; // FALL THRU

  4325                             // continue

  4326                             // stateloop;

  4327                             case '<':

  4328                                 /*

  4329                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C

  4330                                  * LESS-THAN SIGN character token. Switch to the

  4331                                  * script data double escaped less-than sign

  4332                                  * state.

  4333                                  */

  4334                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);

  4335                                 continue stateloop;

  4336                             case '\u0000':

  4337                                 emitReplacementCharacter(buf, pos);

  4338                                 continue;

  4339                             case '\r':

  4340                                 emitCarriageReturn(buf, pos);

  4341                                 break stateloop;

  4342                             case '\n':

  4343                                 silentLineFeed();

  4344                             default:

  4345                                 /*

  4346                                  * Anything else Emit the current input

  4347                                  * character as a character token. Stay in the

  4348                                  * script data double escaped state.

  4349                                  */

  4350                                 continue;

  4351                         }

  4352                     }

  4353                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  4354                 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:

  4355                     scriptdatadoubleescapeddashloop: for (;;) {

  4356                         if (++pos == endPos) {

  4357                             break stateloop;

  4358                         }

  4359                         c = checkChar(buf, pos);

  4360                         /*

  4361                          * Consume the next input character:

  4362                          */

  4363                         switch (c) {

  4364                             case '-':

  4365                                 /*

  4366                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D

  4367                                  * HYPHEN-MINUS character token. Switch to the

  4368                                  * script data double escaped dash dash state.

  4369                                  */

  4370                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);

  4371                                 break scriptdatadoubleescapeddashloop;

  4372                             // continue stateloop;

  4373                             case '<':

  4374                                 /*

  4375                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C

  4376                                  * LESS-THAN SIGN character token. Switch to the

  4377                                  * script data double escaped less-than sign

  4378                                  * state.

  4379                                  */

  4380                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);

  4381                                 continue stateloop;

  4382                             case '\u0000':

  4383                                 emitReplacementCharacter(buf, pos);

  4384                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);

  4385                                 continue stateloop;

  4386                             case '\r':

  4387                                 emitCarriageReturn(buf, pos);

  4388                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);

  4389                                 break stateloop;

  4390                             case '\n':

  4391                                 silentLineFeed();

  4392                             default:

  4393                                 /*

  4394                                  * Anything else Emit the current input

  4395                                  * character as a character token. Switch to the

  4396                                  * script data double escaped state.

  4397                                  */

  4398                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);

  4399                                 continue stateloop;

  4400                         }

  4401                     }

  4402                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  4403                 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:

  4404                     scriptdatadoubleescapeddashdashloop: for (;;) {

  4405                         if (++pos == endPos) {

  4406                             break stateloop;

  4407                         }

  4408                         c = checkChar(buf, pos);

  4409                         /*

  4410                          * Consume the next input character:

  4411                          */

  4412                         switch (c) {

  4413                             case '-':

  4414                                 /*

  4415                                  * U+002D HYPHEN-MINUS (-) Emit a U+002D

  4416                                  * HYPHEN-MINUS character token. Stay in the

  4417                                  * script data double escaped dash dash state.

  4418                                  */

  4419                                 continue;

  4420                             case '<':

  4421                                 /*

  4422                                  * U+003C LESS-THAN SIGN (<) Emit a U+003C

  4423                                  * LESS-THAN SIGN character token. Switch to the

  4424                                  * script data double escaped less-than sign

  4425                                  * state.

  4426                                  */

  4427                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);

  4428                                 break scriptdatadoubleescapeddashdashloop;

  4429                             case '>':

  4430                                 /*

  4431                                  * U+003E GREATER-THAN SIGN (>) Emit a U+003E

  4432                                  * GREATER-THAN SIGN character token. Switch to

  4433                                  * the script data state.

  4434                                  */

  4435                                 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);

  4436                                 continue stateloop;

  4437                             case '\u0000':

  4438                                 emitReplacementCharacter(buf, pos);

  4439                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);

  4440                                 continue stateloop;

  4441                             case '\r':

  4442                                 emitCarriageReturn(buf, pos);

  4443                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);

  4444                                 break stateloop;

  4445                             case '\n':

  4446                                 silentLineFeed();

  4447                             default:

  4448                                 /*

  4449                                  * Anything else Emit the current input

  4450                                  * character as a character token. Switch to the

  4451                                  * script data double escaped state.

  4452                                  */

  4453                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);

  4454                                 continue stateloop;

  4455                         }

  4456                     }

  4457                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  4458                 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:

  4459                     scriptdatadoubleescapedlessthanloop: for (;;) {

  4460                         if (++pos == endPos) {

  4461                             break stateloop;

  4462                         }

  4463                         c = checkChar(buf, pos);

  4464                         /*

  4465                          * Consume the next input character:

  4466                          */

  4467                         switch (c) {

  4468                             case '/':

  4469                                 /*

  4470                                  * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS

  4471                                  * character token. Set the temporary buffer to

  4472                                  * the empty string. Switch to the script data

  4473                                  * double escape end state.

  4474                                  */

  4475                                 index = 0;

  4476                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);

  4477                                 break scriptdatadoubleescapedlessthanloop;

  4478                             default:

  4479                                 /*

  4480                                  * Anything else Reconsume the current input

  4481                                  * character in the script data double escaped

  4482                                  * state.

  4483                                  */

  4484                                 reconsume = true;

  4485                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);

  4486                                 continue stateloop;

  4487                         }

  4488                     }

  4489                     // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER

  4490                 case SCRIPT_DATA_DOUBLE_ESCAPE_END:

  4491                     scriptdatadoubleescapeendloop: for (;;) {

  4492                         if (++pos == endPos) {

  4493                             break stateloop;

  4494                         }

  4495                         c = checkChar(buf, pos);

  4496                         if (index < 6) { // SCRIPT_ARR.length

  4497                             char folded = c;

  4498                             if (c >= 'A' && c <= 'Z') {

  4499                                 folded += 0x20;

  4500                             }

  4501                             if (folded != Tokenizer.SCRIPT_ARR[index]) {

  4502                                 reconsume = true;

  4503                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);

  4504                                 continue stateloop;

  4505                             }

  4506                             index++;

  4507                             continue;

  4508                         }

  4509                         switch (c) {

  4510                             case '\r':

  4511                                 emitCarriageReturn(buf, pos);

  4512                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);

  4513                                 break stateloop;

  4514                             case '\n':

  4515                                 silentLineFeed();

  4516                             case ' ':

  4517                             case '\t':

  4518                             case '\u000C':

  4519                             case '/':

  4520                             case '>':

  4521                                 /*

  4522                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  4523                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE

  4524                                  * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN

  4525                                  * (>) Emit the current input character as a

  4526                                  * character token. If the temporary buffer is

  4527                                  * the string "script", then switch to the

  4528                                  * script data escaped state.

  4529                                  */

  4530                                 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);

  4531                                 continue stateloop;

  4532                             default:

  4533                                 /*

  4534                                  * Reconsume the current input character in the

  4535                                  * script data double escaped state.

  4536                                  */

  4537                                 reconsume = true;

  4538                                 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);

  4539                                 continue stateloop;

  4540                         }

  4541                     }

  4542                     // XXX reorder point

  4543                 case MARKUP_DECLARATION_OCTYPE:

  4544                     markupdeclarationdoctypeloop: for (;;) {

  4545                         if (++pos == endPos) {

  4546                             break stateloop;

  4547                         }

  4548                         c = checkChar(buf, pos);

  4549                         if (index < 6) { // OCTYPE.length

  4550                             char folded = c;

  4551                             if (c >= 'A' && c <= 'Z') {

  4552                                 folded += 0x20;

  4553                             }

  4554                             if (folded == Tokenizer.OCTYPE[index]) {

  4555                                 appendLongStrBuf(c);

  4556                             } else {

  4557                                 errBogusComment();

  4558                                 reconsume = true;

  4559                                 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);

  4560                                 continue stateloop;

  4561                             }

  4562                             index++;

  4563                             continue;

  4564                         } else {

  4565                             reconsume = true;

  4566                             state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);

  4567                             break markupdeclarationdoctypeloop;

  4568                             // continue stateloop;

  4569                         }

  4570                     }

  4571                     // FALLTHRU DON'T REORDER

  4572                 case DOCTYPE:

  4573                     doctypeloop: for (;;) {

  4574                         if (reconsume) {

  4575                             reconsume = false;

  4576                         } else {

  4577                             if (++pos == endPos) {

  4578                                 break stateloop;

  4579                             }

  4580                             c = checkChar(buf, pos);

  4581                         }

  4582                         initDoctypeFields();

  4583                         /*

  4584                          * Consume the next input character:

  4585                          */

  4586                         switch (c) {

  4587                             case '\r':

  4588                                 silentCarriageReturn();

  4589                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);

  4590                                 break stateloop;

  4591                             case '\n':

  4592                                 silentLineFeed();

  4593                                 // fall thru

  4594                             case ' ':

  4595                             case '\t':

  4596                             case '\u000C':

  4597                                 /*

  4598                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  4599                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE

  4600                                  * Switch to the before DOCTYPE name state.

  4601                                  */

  4602                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);

  4603                                 break doctypeloop;

  4604                             // continue stateloop;

  4605                             default:

  4606                                 /*

  4607                                  * Anything else Parse error.

  4608                                  */

  4609                                 errMissingSpaceBeforeDoctypeName();

  4610                                 /*

  4611                                  * Reconsume the current character in the before

  4612                                  * DOCTYPE name state.

  4613                                  */

  4614                                 reconsume = true;

  4615                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);

  4616                                 break doctypeloop;

  4617                             // continue stateloop;

  4618                         }

  4619                     }

  4620                     // FALLTHRU DON'T REORDER

  4621                 case BEFORE_DOCTYPE_NAME:

  4622                     beforedoctypenameloop: for (;;) {

  4623                         if (reconsume) {

  4624                             reconsume = false;

  4625                         } else {

  4626                             if (++pos == endPos) {

  4627                                 break stateloop;

  4628                             }

  4629                             c = checkChar(buf, pos);

  4630                         }

  4631                         /*

  4632                          * Consume the next input character:

  4633                          */

  4634                         switch (c) {

  4635                             case '\r':

  4636                                 silentCarriageReturn();

  4637                                 break stateloop;

  4638                             case '\n':

  4639                                 silentLineFeed();

  4640                                 // fall thru

  4641                             case ' ':

  4642                             case '\t':

  4643                             case '\u000C':

  4644                                 /*

  4645                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  4646                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay

  4647                                  * in the before DOCTYPE name state.

  4648                                  */

  4649                                 continue;

  4650                             case '>':

  4651                                 /*

  4652                                  * U+003E GREATER-THAN SIGN (>) Parse error.

  4653                                  */

  4654                                 errNamelessDoctype();

  4655                                 /*

  4656                                  * Create a new DOCTYPE token. Set its

  4657                                  * force-quirks flag to on.

  4658                                  */

  4659                                 forceQuirks = true;

  4660                                 /*

  4661                                  * Emit the token.

  4662                                  */

  4663                                 emitDoctypeToken(pos);

  4664                                 /*

  4665                                  * Switch to the data state.

  4666                                  */

  4667                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  4668                                 continue stateloop;

  4669                             case '\u0000':

  4670                                 c = '\uFFFD';

  4671                                 // fall thru

  4672                             default:

  4673                                 if (c >= 'A' && c <= 'Z') {

  4674                                     /*

  4675                                      * U+0041 LATIN CAPITAL LETTER A through to

  4676                                      * U+005A LATIN CAPITAL LETTER Z Create a

  4677                                      * new DOCTYPE token. Set the token's name

  4678                                      * to the lowercase version of the input

  4679                                      * character (add 0x0020 to the character's

  4680                                      * code point).

  4681                                      */

  4682                                     c += 0x20;

  4683                                 }

  4684                                 /* Anything else Create a new DOCTYPE token. */

  4685                                 /*

  4686                                  * Set the token's name name to the current

  4687                                  * input character.

  4688                                  */

  4689                                 clearStrBufAndAppend(c);

  4690                                 /*

  4691                                  * Switch to the DOCTYPE name state.

  4692                                  */

  4693                                 state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);

  4694                                 break beforedoctypenameloop;

  4695                             // continue stateloop;

  4696                         }

  4697                     }

  4698                     // FALLTHRU DON'T REORDER

  4699                 case DOCTYPE_NAME:

  4700                     doctypenameloop: for (;;) {

  4701                         if (++pos == endPos) {

  4702                             break stateloop;

  4703                         }

  4704                         c = checkChar(buf, pos);

  4705                         /*

  4706                          * Consume the next input character:

  4707                          */

  4708                         switch (c) {

  4709                             case '\r':

  4710                                 silentCarriageReturn();

  4711                                 strBufToDoctypeName();

  4712                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);

  4713                                 break stateloop;

  4714                             case '\n':

  4715                                 silentLineFeed();

  4716                                 // fall thru

  4717                             case ' ':

  4718                             case '\t':

  4719                             case '\u000C':

  4720                                 /*

  4721                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  4722                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE

  4723                                  * Switch to the after DOCTYPE name state.

  4724                                  */

  4725                                 strBufToDoctypeName();

  4726                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);

  4727                                 break doctypenameloop;

  4728                             // continue stateloop;

  4729                             case '>':

  4730                                 /*

  4731                                  * U+003E GREATER-THAN SIGN (>) Emit the current

  4732                                  * DOCTYPE token.

  4733                                  */

  4734                                 strBufToDoctypeName();

  4735                                 emitDoctypeToken(pos);

  4736                                 /*

  4737                                  * Switch to the data state.

  4738                                  */

  4739                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  4740                                 continue stateloop;

  4741                             case '\u0000':

  4742                                 c = '\uFFFD';

  4743                                 // fall thru

  4744                             default:

  4745                                 /*

  4746                                  * U+0041 LATIN CAPITAL LETTER A through to

  4747                                  * U+005A LATIN CAPITAL LETTER Z Append the

  4748                                  * lowercase version of the input character (add

  4749                                  * 0x0020 to the character's code point) to the

  4750                                  * current DOCTYPE token's name.

  4751                                  */

  4752                                 if (c >= 'A' && c <= 'Z') {

  4753                                     c += 0x0020;

  4754                                 }

  4755                                 /*

  4756                                  * Anything else Append the current input

  4757                                  * character to the current DOCTYPE token's

  4758                                  * name.

  4759                                  */

  4760                                 appendStrBuf(c);

  4761                                 /*

  4762                                  * Stay in the DOCTYPE name state.

  4763                                  */

  4764                                 continue;

  4765                         }

  4766                     }

  4767                     // FALLTHRU DON'T REORDER

  4768                 case AFTER_DOCTYPE_NAME:

  4769                     afterdoctypenameloop: for (;;) {

  4770                         if (++pos == endPos) {

  4771                             break stateloop;

  4772                         }

  4773                         c = checkChar(buf, pos);

  4774                         /*

  4775                          * Consume the next input character:

  4776                          */

  4777                         switch (c) {

  4778                             case '\r':

  4779                                 silentCarriageReturn();

  4780                                 break stateloop;

  4781                             case '\n':

  4782                                 silentLineFeed();

  4783                                 // fall thru

  4784                             case ' ':

  4785                             case '\t':

  4786                             case '\u000C':

  4787                                 /*

  4788                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  4789                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay

  4790                                  * in the after DOCTYPE name state.

  4791                                  */

  4792                                 continue;

  4793                             case '>':

  4794                                 /*

  4795                                  * U+003E GREATER-THAN SIGN (>) Emit the current

  4796                                  * DOCTYPE token.

  4797                                  */

  4798                                 emitDoctypeToken(pos);

  4799                                 /*

  4800                                  * Switch to the data state.

  4801                                  */

  4802                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  4803                                 continue stateloop;

  4804                             case 'p':

  4805                             case 'P':

  4806                                 index = 0;

  4807                                 state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);

  4808                                 break afterdoctypenameloop;

  4809                             // continue stateloop;

  4810                             case 's':

  4811                             case 'S':

  4812                                 index = 0;

  4813                                 state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);

  4814                                 continue stateloop;

  4815                             default:

  4816                                 /*

  4817                                  * Otherwise, this is the parse error.

  4818                                  */

  4819                                 bogusDoctype();

  4821                                 /*

  4822                                  * Set the DOCTYPE token's force-quirks flag to

  4823                                  * on.

  4824                                  */

  4825                                 // done by bogusDoctype();

  4826                                 /*

  4827                                  * Switch to the bogus DOCTYPE state.

  4828                                  */

  4829                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);

  4830                                 continue stateloop;

  4831                         }

  4832                     }

  4833                     // FALLTHRU DON'T REORDER

  4834                 case DOCTYPE_UBLIC:

  4835                     doctypeublicloop: for (;;) {

  4836                         if (++pos == endPos) {

  4837                             break stateloop;

  4838                         }

  4839                         c = checkChar(buf, pos);

  4840                         /*

  4841                          * If the six characters starting from the current input

  4842                          * character are an ASCII case-insensitive match for the

  4843                          * word "PUBLIC", then consume those characters and

  4844                          * switch to the before DOCTYPE public identifier state.

  4845                          */

  4846                         if (index < 5) { // UBLIC.length

  4847                             char folded = c;

  4848                             if (c >= 'A' && c <= 'Z') {

  4849                                 folded += 0x20;

  4850                             }

  4851                             if (folded != Tokenizer.UBLIC[index]) {

  4852                                 bogusDoctype();

  4853                                 // forceQuirks = true;

  4854                                 reconsume = true;

  4855                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);

  4856                                 continue stateloop;

  4857                             }

  4858                             index++;

  4859                             continue;

  4860                         } else {

  4861                             reconsume = true;

  4862                             state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);

  4863                             break doctypeublicloop;

  4864                             // continue stateloop;

  4865                         }

  4866                     }

  4867                     // FALLTHRU DON'T REORDER

  4868                 case AFTER_DOCTYPE_PUBLIC_KEYWORD:

  4869                     afterdoctypepublickeywordloop: for (;;) {

  4870                         if (reconsume) {

  4871                             reconsume = false;

  4872                         } else {

  4873                             if (++pos == endPos) {

  4874                                 break stateloop;

  4875                             }

  4876                             c = checkChar(buf, pos);

  4877                         }

  4878                         /*

  4879                          * Consume the next input character:

  4880                          */

  4881                         switch (c) {

  4882                             case '\r':

  4883                                 silentCarriageReturn();

  4884                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);

  4885                                 break stateloop;

  4886                             case '\n':

  4887                                 silentLineFeed();

  4888                                 // fall thru

  4889                             case ' ':

  4890                             case '\t':

  4891                             case '\u000C':

  4892                                 /*

  4893                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  4894                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE

  4895                                  * Switch to the before DOCTYPE public

  4896                                  * identifier state.

  4897                                  */

  4898                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);

  4899                                 break afterdoctypepublickeywordloop;

  4900                             // FALL THROUGH continue stateloop

  4901                             case '"':

  4902                                 /*

  4903                                  * U+0022 QUOTATION MARK (") Parse Error.

  4904                                  */

  4905                                 errNoSpaceBetweenDoctypePublicKeywordAndQuote();

  4906                                 /*

  4907                                  * Set the DOCTYPE token's public identifier to

  4908                                  * the empty string (not missing),

  4909                                  */

  4910                                 clearLongStrBuf();

  4911                                 /*

  4912                                  * then switch to the DOCTYPE public identifier

  4913                                  * (double-quoted) state.

  4914                                  */

  4915                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);

  4916                                 continue stateloop;

  4917                             case '\'':

  4918                                 /*

  4919                                  * U+0027 APOSTROPHE (') Parse Error.

  4920                                  */

  4921                                 errNoSpaceBetweenDoctypePublicKeywordAndQuote();

  4922                                 /*

  4923                                  * Set the DOCTYPE token's public identifier to

  4924                                  * the empty string (not missing),

  4925                                  */

  4926                                 clearLongStrBuf();

  4927                                 /*

  4928                                  * then switch to the DOCTYPE public identifier

  4929                                  * (single-quoted) state.

  4930                                  */

  4931                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);

  4932                                 continue stateloop;

  4933                             case '>':

  4934                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */

  4935                                 errExpectedPublicId();

  4936                                 /*

  4937                                  * Set the DOCTYPE token's force-quirks flag to

  4938                                  * on.

  4939                                  */

  4940                                 forceQuirks = true;

  4941                                 /*

  4942                                  * Emit that DOCTYPE token.

  4943                                  */

  4944                                 emitDoctypeToken(pos);

  4945                                 /*

  4946                                  * Switch to the data state.

  4947                                  */

  4948                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  4949                                 continue stateloop;

  4950                             default:

  4951                                 bogusDoctype();

  4952                                 /*

  4953                                  * Set the DOCTYPE token's force-quirks flag to

  4954                                  * on.

  4955                                  */

  4956                                 // done by bogusDoctype();

  4957                                 /*

  4958                                  * Switch to the bogus DOCTYPE state.

  4959                                  */

  4960                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);

  4961                                 continue stateloop;

  4962                         }

  4963                     }

  4964                     // FALLTHRU DON'T REORDER

  4965                 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:

  4966                     beforedoctypepublicidentifierloop: for (;;) {

  4967                         if (++pos == endPos) {

  4968                             break stateloop;

  4969                         }

  4970                         c = checkChar(buf, pos);

  4971                         /*

  4972                          * Consume the next input character:

  4973                          */

  4974                         switch (c) {

  4975                             case '\r':

  4976                                 silentCarriageReturn();

  4977                                 break stateloop;

  4978                             case '\n':

  4979                                 silentLineFeed();

  4980                                 // fall thru

  4981                             case ' ':

  4982                             case '\t':

  4983                             case '\u000C':

  4984                                 /*

  4985                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  4986                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay

  4987                                  * in the before DOCTYPE public identifier

  4988                                  * state.

  4989                                  */

  4990                                 continue;

  4991                             case '"':

  4992                                 /*

  4993                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE

  4994                                  * token's public identifier to the empty string

  4995                                  * (not missing),

  4996                                  */

  4997                                 clearLongStrBuf();

  4998                                 /*

  4999                                  * then switch to the DOCTYPE public identifier

  5000                                  * (double-quoted) state.

  5001                                  */

  5002                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);

  5003                                 break beforedoctypepublicidentifierloop;

  5004                             // continue stateloop;

  5005                             case '\'':

  5006                                 /*

  5007                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's

  5008                                  * public identifier to the empty string (not

  5009                                  * missing),

  5010                                  */

  5011                                 clearLongStrBuf();

  5012                                 /*

  5013                                  * then switch to the DOCTYPE public identifier

  5014                                  * (single-quoted) state.

  5015                                  */

  5016                                 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);

  5017                                 continue stateloop;

  5018                             case '>':

  5019                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */

  5020                                 errExpectedPublicId();

  5021                                 /*

  5022                                  * Set the DOCTYPE token's force-quirks flag to

  5023                                  * on.

  5024                                  */

  5025                                 forceQuirks = true;

  5026                                 /*

  5027                                  * Emit that DOCTYPE token.

  5028                                  */

  5029                                 emitDoctypeToken(pos);

  5030                                 /*

  5031                                  * Switch to the data state.

  5032                                  */

  5033                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  5034                                 continue stateloop;

  5035                             default:

  5036                                 bogusDoctype();

  5037                                 /*

  5038                                  * Set the DOCTYPE token's force-quirks flag to

  5039                                  * on.

  5040                                  */

  5041                                 // done by bogusDoctype();

  5042                                 /*

  5043                                  * Switch to the bogus DOCTYPE state.

  5044                                  */

  5045                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);

  5046                                 continue stateloop;

  5047                         }

  5048                     }

  5049                     // FALLTHRU DON'T REORDER

  5050                 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:

  5051                     doctypepublicidentifierdoublequotedloop: for (;;) {

  5052                         if (++pos == endPos) {

  5053                             break stateloop;

  5054                         }

  5055                         c = checkChar(buf, pos);

  5056                         /*

  5057                          * Consume the next input character:

  5058                          */

  5059                         switch (c) {

  5060                             case '"':

  5061                                 /*

  5062                                  * U+0022 QUOTATION MARK (") Switch to the after

  5063                                  * DOCTYPE public identifier state.

  5064                                  */

  5065                                 publicIdentifier = longStrBufToString();

  5066                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);

  5067                                 break doctypepublicidentifierdoublequotedloop;

  5068                             // continue stateloop;

  5069                             case '>':

  5070                                 /*

  5071                                  * U+003E GREATER-THAN SIGN (>) Parse error.

  5072                                  */

  5073                                 errGtInPublicId();

  5074                                 /*

  5075                                  * Set the DOCTYPE token's force-quirks flag to

  5076                                  * on.

  5077                                  */

  5078                                 forceQuirks = true;

  5079                                 /*

  5080                                  * Emit that DOCTYPE token.

  5081                                  */

  5082                                 publicIdentifier = longStrBufToString();

  5083                                 emitDoctypeToken(pos);

  5084                                 /*

  5085                                  * Switch to the data state.

  5086                                  */

  5087                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  5088                                 continue stateloop;

  5089                             case '\r':

  5090                                 appendLongStrBufCarriageReturn();

  5091                                 break stateloop;

  5092                             case '\n':

  5093                                 appendLongStrBufLineFeed();

  5094                                 continue;

  5095                             case '\u0000':

  5096                                 c = '\uFFFD';

  5097                                 // fall thru

  5098                             default:

  5099                                 /*

  5100                                  * Anything else Append the current input

  5101                                  * character to the current DOCTYPE token's

  5102                                  * public identifier.

  5103                                  */

  5104                                 appendLongStrBuf(c);

  5105                                 /*

  5106                                  * Stay in the DOCTYPE public identifier

  5107                                  * (double-quoted) state.

  5108                                  */

  5109                                 continue;

  5110                         }

  5111                     }

  5112                     // FALLTHRU DON'T REORDER

  5113                 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:

  5114                     afterdoctypepublicidentifierloop: for (;;) {

  5115                         if (++pos == endPos) {

  5116                             break stateloop;

  5117                         }

  5118                         c = checkChar(buf, pos);

  5119                         /*

  5120                          * Consume the next input character:

  5121                          */

  5122                         switch (c) {

  5123                             case '\r':

  5124                                 silentCarriageReturn();

  5125                                 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);

  5126                                 break stateloop;

  5127                             case '\n':

  5128                                 silentLineFeed();

  5129                                 // fall thru

  5130                             case ' ':

  5131                             case '\t':

  5132                             case '\u000C':

  5133                                 /*

  5134                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  5135                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE

  5136                                  * Switch to the between DOCTYPE public and

  5137                                  * system identifiers state.

  5138                                  */

  5139                                 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);

  5140                                 break afterdoctypepublicidentifierloop;

  5141                             // continue stateloop;

  5142                             case '>':

  5143                                 /*

  5144                                  * U+003E GREATER-THAN SIGN (>) Emit the current

  5145                                  * DOCTYPE token.

  5146                                  */

  5147                                 emitDoctypeToken(pos);

  5148                                 /*

  5149                                  * Switch to the data state.

  5150                                  */

  5151                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  5152                                 continue stateloop;

  5153                             case '"':

  5154                                 /*

  5155                                  * U+0022 QUOTATION MARK (") Parse error.

  5156                                  */

  5157                                 errNoSpaceBetweenPublicAndSystemIds();

  5158                                 /*

  5159                                  * Set the DOCTYPE token's system identifier to

  5160                                  * the empty string (not missing),

  5161                                  */

  5162                                 clearLongStrBuf();

  5163                                 /*

  5164                                  * then switch to the DOCTYPE system identifier

  5165                                  * (double-quoted) state.

  5166                                  */

  5167                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);

  5168                                 continue stateloop;

  5169                             case '\'':

  5170                                 /*

  5171                                  * U+0027 APOSTROPHE (') Parse error.

  5172                                  */

  5173                                 errNoSpaceBetweenPublicAndSystemIds();

  5174                                 /*

  5175                                  * Set the DOCTYPE token's system identifier to

  5176                                  * the empty string (not missing),

  5177                                  */

  5178                                 clearLongStrBuf();

  5179                                 /*

  5180                                  * then switch to the DOCTYPE system identifier

  5181                                  * (single-quoted) state.

  5182                                  */

  5183                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);

  5184                                 continue stateloop;

  5185                             default:

  5186                                 bogusDoctype();

  5187                                 /*

  5188                                  * Set the DOCTYPE token's force-quirks flag to

  5189                                  * on.

  5190                                  */

  5191                                 // done by bogusDoctype();

  5192                                 /*

  5193                                  * Switch to the bogus DOCTYPE state.

  5194                                  */

  5195                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);

  5196                                 continue stateloop;

  5197                         }

  5198                     }

  5199                     // FALLTHRU DON'T REORDER

  5200                 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:

  5201                     betweendoctypepublicandsystemidentifiersloop: for (;;) {

  5202                         if (++pos == endPos) {

  5203                             break stateloop;

  5204                         }

  5205                         c = checkChar(buf, pos);

  5206                         /*

  5207                          * Consume the next input character:

  5208                          */

  5209                         switch (c) {

  5210                             case '\r':

  5211                                 silentCarriageReturn();

  5212                                 break stateloop;

  5213                             case '\n':

  5214                                 silentLineFeed();

  5215                                 // fall thru

  5216                             case ' ':

  5217                             case '\t':

  5218                             case '\u000C':

  5219                                 /*

  5220                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  5221                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay

  5222                                  * in the between DOCTYPE public and system

  5223                                  * identifiers state.

  5224                                  */

  5225                                 continue;

  5226                             case '>':

  5227                                 /*

  5228                                  * U+003E GREATER-THAN SIGN (>) Emit the current

  5229                                  * DOCTYPE token.

  5230                                  */

  5231                                 emitDoctypeToken(pos);

  5232                                 /*

  5233                                  * Switch to the data state.

  5234                                  */

  5235                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  5236                                 continue stateloop;

  5237                             case '"':

  5238                                 /*

  5239                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE

  5240                                  * token's system identifier to the empty string

  5241                                  * (not missing),

  5242                                  */

  5243                                 clearLongStrBuf();

  5244                                 /*

  5245                                  * then switch to the DOCTYPE system identifier

  5246                                  * (double-quoted) state.

  5247                                  */

  5248                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);

  5249                                 break betweendoctypepublicandsystemidentifiersloop;

  5250                             // continue stateloop;

  5251                             case '\'':

  5252                                 /*

  5253                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's

  5254                                  * system identifier to the empty string (not

  5255                                  * missing),

  5256                                  */

  5257                                 clearLongStrBuf();

  5258                                 /*

  5259                                  * then switch to the DOCTYPE system identifier

  5260                                  * (single-quoted) state.

  5261                                  */

  5262                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);

  5263                                 continue stateloop;

  5264                             default:

  5265                                 bogusDoctype();

  5266                                 /*

  5267                                  * Set the DOCTYPE token's force-quirks flag to

  5268                                  * on.

  5269                                  */

  5270                                 // done by bogusDoctype();

  5271                                 /*

  5272                                  * Switch to the bogus DOCTYPE state.

  5273                                  */

  5274                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);

  5275                                 continue stateloop;

  5276                         }

  5277                     }

  5278                     // FALLTHRU DON'T REORDER

  5279                 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:

  5280                     doctypesystemidentifierdoublequotedloop: for (;;) {

  5281                         if (++pos == endPos) {

  5282                             break stateloop;

  5283                         }

  5284                         c = checkChar(buf, pos);

  5285                         /*

  5286                          * Consume the next input character:

  5287                          */

  5288                         switch (c) {

  5289                             case '"':

  5290                                 /*

  5291                                  * U+0022 QUOTATION MARK (") Switch to the after

  5292                                  * DOCTYPE system identifier state.

  5293                                  */

  5294                                 systemIdentifier = longStrBufToString();

  5295                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);

  5296                                 continue stateloop;

  5297                             case '>':

  5298                                 /*

  5299                                  * U+003E GREATER-THAN SIGN (>) Parse error.

  5300                                  */

  5301                                 errGtInSystemId();

  5302                                 /*

  5303                                  * Set the DOCTYPE token's force-quirks flag to

  5304                                  * on.

  5305                                  */

  5306                                 forceQuirks = true;

  5307                                 /*

  5308                                  * Emit that DOCTYPE token.

  5309                                  */

  5310                                 systemIdentifier = longStrBufToString();

  5311                                 emitDoctypeToken(pos);

  5312                                 /*

  5313                                  * Switch to the data state.

  5314                                  */

  5315                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  5316                                 continue stateloop;

  5317                             case '\r':

  5318                                 appendLongStrBufCarriageReturn();

  5319                                 break stateloop;

  5320                             case '\n':

  5321                                 appendLongStrBufLineFeed();

  5322                                 continue;

  5323                             case '\u0000':

  5324                                 c = '\uFFFD';

  5325                                 // fall thru

  5326                             default:

  5327                                 /*

  5328                                  * Anything else Append the current input

  5329                                  * character to the current DOCTYPE token's

  5330                                  * system identifier.

  5331                                  */

  5332                                 appendLongStrBuf(c);

  5333                                 /*

  5334                                  * Stay in the DOCTYPE system identifier

  5335                                  * (double-quoted) state.

  5336                                  */

  5337                                 continue;

  5338                         }

  5339                     }

  5340                     // FALLTHRU DON'T REORDER

  5341                 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:

  5342                     afterdoctypesystemidentifierloop: for (;;) {

  5343                         if (++pos == endPos) {

  5344                             break stateloop;

  5345                         }

  5346                         c = checkChar(buf, pos);

  5347                         /*

  5348                          * Consume the next input character:

  5349                          */

  5350                         switch (c) {

  5351                             case '\r':

  5352                                 silentCarriageReturn();

  5353                                 break stateloop;

  5354                             case '\n':

  5355                                 silentLineFeed();

  5356                                 // fall thru

  5357                             case ' ':

  5358                             case '\t':

  5359                             case '\u000C':

  5360                                 /*

  5361                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  5362                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay

  5363                                  * in the after DOCTYPE system identifier state.

  5364                                  */

  5365                                 continue;

  5366                             case '>':

  5367                                 /*

  5368                                  * U+003E GREATER-THAN SIGN (>) Emit the current

  5369                                  * DOCTYPE token.

  5370                                  */

  5371                                 emitDoctypeToken(pos);

  5372                                 /*

  5373                                  * Switch to the data state.

  5374                                  */

  5375                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  5376                                 continue stateloop;

  5377                             default:

  5378                                 /*

  5379                                  * Switch to the bogus DOCTYPE state. (This does

  5380                                  * not set the DOCTYPE token's force-quirks flag

  5381                                  * to on.)

  5382                                  */

  5383                                 bogusDoctypeWithoutQuirks();

  5384                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);

  5385                                 break afterdoctypesystemidentifierloop;

  5386                             // continue stateloop;

  5387                         }

  5388                     }

  5389                     // FALLTHRU DON'T REORDER

  5390                 case BOGUS_DOCTYPE:

  5391                     for (;;) {

  5392                         if (reconsume) {

  5393                             reconsume = false;

  5394                         } else {

  5395                             if (++pos == endPos) {

  5396                                 break stateloop;

  5397                             }

  5398                             c = checkChar(buf, pos);

  5399                         }

  5400                         /*

  5401                          * Consume the next input character:

  5402                          */

  5403                         switch (c) {

  5404                             case '>':

  5405                                 /*

  5406                                  * U+003E GREATER-THAN SIGN (>) Emit that

  5407                                  * DOCTYPE token.

  5408                                  */

  5409                                 emitDoctypeToken(pos);

  5410                                 /*

  5411                                  * Switch to the data state.

  5412                                  */

  5413                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  5414                                 continue stateloop;

  5415                             case '\r':

  5416                                 silentCarriageReturn();

  5417                                 break stateloop;

  5418                             case '\n':

  5419                                 silentLineFeed();

  5420                                 // fall thru

  5421                             default:

  5422                                 /*

  5423                                  * Anything else Stay in the bogus DOCTYPE

  5424                                  * state.

  5425                                  */

  5426                                 continue;

  5427                         }

  5428                     }

  5429                     // XXX reorder point

  5430                 case DOCTYPE_YSTEM:

  5431                     doctypeystemloop: for (;;) {

  5432                         if (++pos == endPos) {

  5433                             break stateloop;

  5434                         }

  5435                         c = checkChar(buf, pos);

  5436                         /*

  5437                          * Otherwise, if the six characters starting from the

  5438                          * current input character are an ASCII case-insensitive

  5439                          * match for the word "SYSTEM", then consume those

  5440                          * characters and switch to the before DOCTYPE system

  5441                          * identifier state.

  5442                          */

  5443                         if (index < 5) { // YSTEM.length

  5444                             char folded = c;

  5445                             if (c >= 'A' && c <= 'Z') {

  5446                                 folded += 0x20;

  5447                             }

  5448                             if (folded != Tokenizer.YSTEM[index]) {

  5449                                 bogusDoctype();

  5450                                 reconsume = true;

  5451                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);

  5452                                 continue stateloop;

  5453                             }

  5454                             index++;

  5455                             continue stateloop;

  5456                         } else {

  5457                             reconsume = true;

  5458                             state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);

  5459                             break doctypeystemloop;

  5460                             // continue stateloop;

  5461                         }

  5462                     }

  5463                     // FALLTHRU DON'T REORDER

  5464                 case AFTER_DOCTYPE_SYSTEM_KEYWORD:

  5465                     afterdoctypesystemkeywordloop: for (;;) {

  5466                         if (reconsume) {

  5467                             reconsume = false;

  5468                         } else {

  5469                             if (++pos == endPos) {

  5470                                 break stateloop;

  5471                             }

  5472                             c = checkChar(buf, pos);

  5473                         }

  5474                         /*

  5475                          * Consume the next input character:

  5476                          */

  5477                         switch (c) {

  5478                             case '\r':

  5479                                 silentCarriageReturn();

  5480                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);

  5481                                 break stateloop;

  5482                             case '\n':

  5483                                 silentLineFeed();

  5484                                 // fall thru

  5485                             case ' ':

  5486                             case '\t':

  5487                             case '\u000C':

  5488                                 /*

  5489                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  5490                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE

  5491                                  * Switch to the before DOCTYPE system

  5492                                  * identifier state.

  5493                                  */

  5494                                 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);

  5495                                 break afterdoctypesystemkeywordloop;

  5496                             // FALL THROUGH continue stateloop

  5497                             case '"':

  5498                                 /*

  5499                                  * U+0022 QUOTATION MARK (") Parse Error.

  5500                                  */

  5501                                 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();

  5502                                 /*

  5503                                  * Set the DOCTYPE token's system identifier to

  5504                                  * the empty string (not missing),

  5505                                  */

  5506                                 clearLongStrBuf();

  5507                                 /*

  5508                                  * then switch to the DOCTYPE system identifier

  5509                                  * (double-quoted) state.

  5510                                  */

  5511                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);

  5512                                 continue stateloop;

  5513                             case '\'':

  5514                                 /*

  5515                                  * U+0027 APOSTROPHE (') Parse Error.

  5516                                  */

  5517                                 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();

  5518                                 /*

  5519                                  * Set the DOCTYPE token's system identifier to

  5520                                  * the empty string (not missing),

  5521                                  */

  5522                                 clearLongStrBuf();

  5523                                 /*

  5524                                  * then switch to the DOCTYPE system identifier

  5525                                  * (single-quoted) state.

  5526                                  */

  5527                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);

  5528                                 continue stateloop;

  5529                             case '>':

  5530                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */

  5531                                 errExpectedPublicId();

  5532                                 /*

  5533                                  * Set the DOCTYPE token's force-quirks flag to

  5534                                  * on.

  5535                                  */

  5536                                 forceQuirks = true;

  5537                                 /*

  5538                                  * Emit that DOCTYPE token.

  5539                                  */

  5540                                 emitDoctypeToken(pos);

  5541                                 /*

  5542                                  * Switch to the data state.

  5543                                  */

  5544                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  5545                                 continue stateloop;

  5546                             default:

  5547                                 bogusDoctype();

  5548                                 /*

  5549                                  * Set the DOCTYPE token's force-quirks flag to

  5550                                  * on.

  5551                                  */

  5552                                 // done by bogusDoctype();

  5553                                 /*

  5554                                  * Switch to the bogus DOCTYPE state.

  5555                                  */

  5556                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);

  5557                                 continue stateloop;

  5558                         }

  5559                     }

  5560                     // FALLTHRU DON'T REORDER

  5561                 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:

  5562                     beforedoctypesystemidentifierloop: for (;;) {

  5563                         if (++pos == endPos) {

  5564                             break stateloop;

  5565                         }

  5566                         c = checkChar(buf, pos);

  5567                         /*

  5568                          * Consume the next input character:

  5569                          */

  5570                         switch (c) {

  5571                             case '\r':

  5572                                 silentCarriageReturn();

  5573                                 break stateloop;

  5574                             case '\n':

  5575                                 silentLineFeed();

  5576                                 // fall thru

  5577                             case ' ':

  5578                             case '\t':

  5579                             case '\u000C':

  5580                                 /*

  5581                                  * U+0009 CHARACTER TABULATION U+000A LINE FEED

  5582                                  * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay

  5583                                  * in the before DOCTYPE system identifier

  5584                                  * state.

  5585                                  */

  5586                                 continue;

  5587                             case '"':

  5588                                 /*

  5589                                  * U+0022 QUOTATION MARK (") Set the DOCTYPE

  5590                                  * token's system identifier to the empty string

  5591                                  * (not missing),

  5592                                  */

  5593                                 clearLongStrBuf();

  5594                                 /*

  5595                                  * then switch to the DOCTYPE system identifier

  5596                                  * (double-quoted) state.

  5597                                  */

  5598                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);

  5599                                 continue stateloop;

  5600                             case '\'':

  5601                                 /*

  5602                                  * U+0027 APOSTROPHE (') Set the DOCTYPE token's

  5603                                  * system identifier to the empty string (not

  5604                                  * missing),

  5605                                  */

  5606                                 clearLongStrBuf();

  5607                                 /*

  5608                                  * then switch to the DOCTYPE system identifier

  5609                                  * (single-quoted) state.

  5610                                  */

  5611                                 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);

  5612                                 break beforedoctypesystemidentifierloop;

  5613                             // continue stateloop;

  5614                             case '>':

  5615                                 /* U+003E GREATER-THAN SIGN (>) Parse error. */

  5616                                 errExpectedSystemId();

  5617                                 /*

  5618                                  * Set the DOCTYPE token's force-quirks flag to

  5619                                  * on.

  5620                                  */

  5621                                 forceQuirks = true;

  5622                                 /*

  5623                                  * Emit that DOCTYPE token.

  5624                                  */

  5625                                 emitDoctypeToken(pos);

  5626                                 /*

  5627                                  * Switch to the data state.

  5628                                  */

  5629                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  5630                                 continue stateloop;

  5631                             default:

  5632                                 bogusDoctype();

  5633                                 /*

  5634                                  * Set the DOCTYPE token's force-quirks flag to

  5635                                  * on.

  5636                                  */

  5637                                 // done by bogusDoctype();

  5638                                 /*

  5639                                  * Switch to the bogus DOCTYPE state.

  5640                                  */

  5641                                 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);

  5642                                 continue stateloop;

  5643                         }

  5644                     }

  5645                     // FALLTHRU DON'T REORDER

  5646                 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:

  5647                     for (;;) {

  5648                         if (++pos == endPos) {

  5649                             break stateloop;

  5650                         }

  5651                         c = checkChar(buf, pos);

  5652                         /*

  5653                          * Consume the next input character:

  5654                          */

  5655                         switch (c) {

  5656                             case '\'':

  5657                                 /*

  5658                                  * U+0027 APOSTROPHE (') Switch to the after

  5659                                  * DOCTYPE system identifier state.

  5660                                  */

  5661                                 systemIdentifier = longStrBufToString();

  5662                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);

  5663                                 continue stateloop;

  5664                             case '>':

  5665                                 errGtInSystemId();

  5666                                 /*

  5667                                  * Set the DOCTYPE token's force-quirks flag to

  5668                                  * on.

  5669                                  */

  5670                                 forceQuirks = true;

  5671                                 /*

  5672                                  * Emit that DOCTYPE token.

  5673                                  */

  5674                                 systemIdentifier = longStrBufToString();

  5675                                 emitDoctypeToken(pos);

  5676                                 /*

  5677                                  * Switch to the data state.

  5678                                  */

  5679                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  5680                                 continue stateloop;

  5681                             case '\r':

  5682                                 appendLongStrBufCarriageReturn();

  5683                                 break stateloop;

  5684                             case '\n':

  5685                                 appendLongStrBufLineFeed();

  5686                                 continue;

  5687                             case '\u0000':

  5688                                 c = '\uFFFD';

  5689                                 // fall thru

  5690                             default:

  5691                                 /*

  5692                                  * Anything else Append the current input

  5693                                  * character to the current DOCTYPE token's

  5694                                  * system identifier.

  5695                                  */

  5696                                 appendLongStrBuf(c);

  5697                                 /*

  5698                                  * Stay in the DOCTYPE system identifier

  5699                                  * (single-quoted) state.

  5700                                  */

  5701                                 continue;

  5702                         }

  5703                     }

  5704                     // XXX reorder point

  5705                 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:

  5706                     for (;;) {

  5707                         if (++pos == endPos) {

  5708                             break stateloop;

  5709                         }

  5710                         c = checkChar(buf, pos);

  5711                         /*

  5712                          * Consume the next input character:

  5713                          */

  5714                         switch (c) {

  5715                             case '\'':

  5716                                 /*

  5717                                  * U+0027 APOSTROPHE (') Switch to the after

  5718                                  * DOCTYPE public identifier state.

  5719                                  */

  5720                                 publicIdentifier = longStrBufToString();

  5721                                 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);

  5722                                 continue stateloop;

  5723                             case '>':

  5724                                 errGtInPublicId();

  5725                                 /*

  5726                                  * Set the DOCTYPE token's force-quirks flag to

  5727                                  * on.

  5728                                  */

  5729                                 forceQuirks = true;

  5730                                 /*

  5731                                  * Emit that DOCTYPE token.

  5732                                  */

  5733                                 publicIdentifier = longStrBufToString();

  5734                                 emitDoctypeToken(pos);

  5735                                 /*

  5736                                  * Switch to the data state.

  5737                                  */

  5738                                 state = transition(state, Tokenizer.DATA, reconsume, pos);

  5739                                 continue stateloop;

  5740                             case '\r':

  5741                                 appendLongStrBufCarriageReturn();

  5742                                 break stateloop;

  5743                             case '\n':

  5744                                 appendLongStrBufLineFeed();

  5745                                 continue;

  5746                             case '\u0000':

  5747                                 c = '\uFFFD';

  5748                                 // fall thru

  5749                             default:

  5750                                 /*

  5751                                  * Anything else Append the current input

  5752                                  * character to the current DOCTYPE token's

  5753                                  * public identifier.

  5754                                  */

  5755                                 appendLongStrBuf(c);

  5756                                 /*

  5757                                  * Stay in the DOCTYPE public identifier

  5758                                  * (single-quoted) state.

  5759                                  */

  5760                                 continue;

  5761                         }

  5762                     }

  5763                     // XXX reorder point

  5764                 case PROCESSING_INSTRUCTION:

  5765                     processinginstructionloop: for (;;) {

  5766                         if (++pos == endPos) {

  5767                             break stateloop;

  5768                         }

  5769                         c = checkChar(buf, pos);

  5770                         switch (c) {

  5771                             case '?':

  5772                                 state = transition(

  5773                                         state,

  5774                                         Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,

  5775                                         reconsume, pos);

  5776                                 break processinginstructionloop;

  5777                             // continue stateloop;

  5778                             default:

  5779                                 continue;

  5780                         }

  5781                     }

  5782                 case PROCESSING_INSTRUCTION_QUESTION_MARK:

  5783                     if (++pos == endPos) {

  5784                         break stateloop;

  5785                     }

  5786                     c = checkChar(buf, pos);

  5787                     switch (c) {

  5788                         case '>':

  5789                             state = transition(state, Tokenizer.DATA,

  5790                                     reconsume, pos);

  5791                             continue stateloop;

  5792                         default:

  5793                             state = transition(state,

  5794                                     Tokenizer.PROCESSING_INSTRUCTION,

  5795                                     reconsume, pos);

  5796                             continue stateloop;

  5797                     }

  5798                     // END HOTSPOT WORKAROUND

  5799             }

  5800         }

  5801         flushChars(buf, pos);

  5802         /*

  5803          * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }

  5804          */

  5805         // Save locals

  5806         stateSave = state;

  5807         returnStateSave = returnState;

  5808         return pos;

  5809     }

  5811     // HOTSPOT WORKAROUND INSERTION POINT

  5813     // [NOCPP[

  5815     protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {

               // Called by the state loops in the form
               // "state = transition(state, <target>, reconsume, pos)". This
               // implementation ignores from, reconsume and pos and returns the
               // requested target state unchanged.
               // NOTE(review): the method is protected and declares SAXException
               // although nothing here can throw, so it is presumably an override
               // point for subclasses that want to observe transitions - confirm.
  5816         return to;

  5817     }

  5819     // ]NOCPP]

  5821     private void initDoctypeFields() {

               // Begin a fresh DOCTYPE token: the quirks flag is off until a later
               // step turns it on.
  5822         forceQuirks = false;

               // Hand any public identifier left over from an earlier DOCTYPE to
               // Portability.releaseString before forgetting it.
  5823         if (publicIdentifier != null) {

  5824             Portability.releaseString(publicIdentifier);

  5825             publicIdentifier = null;

  5826         }

               // Likewise for the system identifier.
  5827         if (systemIdentifier != null) {

  5828             Portability.releaseString(systemIdentifier);

  5829             systemIdentifier = null;

  5830         }

               // The name starts out as the empty string rather than null.
  5831         doctypeName = "";

  5832     }

  5834     @Inline private void adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn()

  5835             throws SAXException {

               // Carriage-return variant: record the CR (line count and lastCR
               // flag) and then pass a line feed, not the CR itself, to
               // adjustDoubleHyphenAndAppendToLongStrBufAndErr.
  5836         silentCarriageReturn();

  5837         adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');

  5838     }

  5840     @Inline private void adjustDoubleHyphenAndAppendToLongStrBufLineFeed()

  5841             throws SAXException {

               // Line-feed variant: count the line break, then pass '\n' to
               // adjustDoubleHyphenAndAppendToLongStrBufAndErr.
  5842         silentLineFeed();

  5843         adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');

  5844     }

  5846     @Inline private void appendLongStrBufLineFeed() {

               // Count the line break, then append '\n' through appendLongStrBuf.
  5847         silentLineFeed();

  5848         appendLongStrBuf('\n');

  5849     }

  5851     @Inline private void appendLongStrBufCarriageReturn() {

               // Record the CR (line count and lastCR flag) and append a line feed
               // in its place, so appendLongStrBuf receives '\n' rather than '\r'.
  5852         silentCarriageReturn();

  5853         appendLongStrBuf('\n');

  5854     }

  5856     @Inline protected void silentCarriageReturn() {

               // Bookkeeping for a carriage return; nothing is emitted here. The
               // line counter advances and lastCR is raised.
               // NOTE(review): lastCR is presumably checked when the next character
               // is read so that the LF of a CRLF pair is not counted as a second
               // line break - confirm where the flag is consumed.
  5857         ++line;

  5858         lastCR = true;

  5859     }

  5861     @Inline protected void silentLineFeed() {

               // Bookkeeping for a line feed: only the line counter advances.
  5862         ++line;

  5863     }

  5865     private void emitCarriageReturn(@NoLength char[] buf, int pos)

  5866             throws SAXException {

               // Record the CR first (line count and lastCR flag).
  5867         silentCarriageReturn();

               // Let flushChars deal with buf up to pos, then report one character
               // taken from Tokenizer.LF to the token handler.
               // NOTE(review): Tokenizer.LF is presumably a one-element array
               // holding '\n' - confirm.
  5868         flushChars(buf, pos);

  5869         tokenHandler.characters(Tokenizer.LF, 0, 1);

               // NOTE(review): Integer.MAX_VALUE looks like a sentinel meaning "no
               // character run start recorded"; confirm against flushChars.
  5870         cstart = Integer.MAX_VALUE;

  5871     }

  5873     private void emitReplacementCharacter(@NoLength char[] buf, int pos)

  5874             throws SAXException {

               // Let flushChars deal with buf up to pos, notify the token handler
               // through zeroOriginatingReplacementCharacter(), and set cstart to
               // the index right after pos.
  5875         flushChars(buf, pos);

  5876         tokenHandler.zeroOriginatingReplacementCharacter();

  5877         cstart = pos + 1;

  5878     }

  5880     private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)

  5881             throws SAXException {

               // Same sequence as emitReplacementCharacter, except that one
               // character from REPLACEMENT_CHARACTER is reported through the
               // ordinary characters() callback of the token handler.
  5882         flushChars(buf, pos);

  5883         tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);

               // Set cstart to the index right after pos.
  5884         cstart = pos + 1;

  5885     }

  5887     private void setAdditionalAndRememberAmpersandLocation(char add) {

               // Store the given character in the additional field.
  5888         additional = add;

  5889         // [NOCPP[

               // Keep a new LocatorImpl built from this tokenizer in
               // ampersandLocation.
               // NOTE(review): presumably a snapshot of the current position, and
               // the NOCPP markers presumably exclude this statement from a C++
               // build of the file - confirm both.
  5890         ampersandLocation = new LocatorImpl(this);

  5891         // ]NOCPP]

  5892     }

  5894     private void bogusDoctype() throws SAXException {

               // Report the bogus-DOCTYPE error, then turn the force-quirks flag on.
               // The flag is only set if errBogusDoctype() returns normally.
  5895         errBogusDoctype();

  5896         forceQuirks = true;

  5897     }

  5899     private void bogusDoctypeWithoutQuirks() throws SAXException {

               // Report the same error as bogusDoctype(), but set forceQuirks to
               // false instead of true. Used by the default case of the
               // after-DOCTYPE-system-identifier state, whose spec text says the
               // force-quirks flag is not turned on there.
  5900         errBogusDoctype();

  5901         forceQuirks = false;

  5902     }

  5904     private void emitOrAppendStrBuf(int returnState) throws SAXException {

               // A return state that shares no bit with DATA_AND_RCDATA_MASK takes
               // the emitStrBuf() path; any other return state takes the
               // appendStrBufToLongStrBuf() path.
  5905         if ((returnState & DATA_AND_RCDATA_MASK) == 0) {

  5906             emitStrBuf();

  5907         } else {

  5908             appendStrBufToLongStrBuf();

  5909         }

  5910     }

  5912     private void handleNcrValue(int returnState) throws SAXException {

               // Turns the numeric character reference held in the value field into
               // output: the branches below report the applicable error and hand the
               // resulting character(s) to emitOrAppendOne / emitOrAppendTwo
               // together with returnState. The order of the tests matters, because
               // later branches rely on earlier ones having removed their ranges.
  5913         /*

  5914          * If one or more characters match the range, then take them all and

  5915          * interpret the string of characters as a number (either hexadecimal or

  5916          * decimal as appropriate).

  5917          */

  5918         if (value <= 0xFFFF) {

                   // The value fits in a single UTF-16 code unit.
  5919             if (value >= 0x80 && value <= 0x9f) {

  5920                 /*

  5921                  * If that number is one of the numbers in the first column of

  5922                  * the following table, then this is a parse error.

  5923                  */

  5924                 errNcrInC1Range();

  5925                 /*

  5926                  * Find the row with that number in the first column, and return

  5927                  * a character token for the Unicode character given in the

  5928                  * second column of that row.

  5929                  */

                       // The table is indexed by the offset from 0x80.
  5930                 @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];

  5931                 emitOrAppendOne(val, returnState);

  5932                 // [NOCPP[

  5933             } else if (value == 0xC

  5934                     && contentSpacePolicy != XmlViolationPolicy.ALLOW) {

                       // Form feed (0xC) under a content-space policy other than
                       // ALLOW: replaced by Tokenizer.SPACE or reported via fatal().
                       // NOTE(review): if the policy is neither ALTER_INFOSET nor
                       // FATAL, this branch emits nothing for the reference -
                       // confirm that no other policy value can occur here.
  5935                 if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {

  5936                     emitOrAppendOne(Tokenizer.SPACE, returnState);

  5937                 } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {

  5938                     fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");

  5939                 }

  5940                 // ]NOCPP]

  5941             } else if (value == 0x0) {

                       // A reference to U+0000 is an error and yields
                       // Tokenizer.REPLACEMENT_CHARACTER instead.
  5942                 errNcrZero();

  5943                 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);

  5944             } else if ((value & 0xF800) == 0xD800) {

                       // The mask test matches 0xD800..0xDFFF, the UTF-16 surrogate
                       // range; these also yield Tokenizer.REPLACEMENT_CHARACTER.
  5945                 errNcrSurrogate();

  5946                 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);

  5947             } else {

  5948                 /*

  5949                  * Otherwise, return a character token for the Unicode character

  5950                  * whose code point is that number.

  5951                  */

  5952                 char ch = (char) value;

  5953                 // [NOCPP[

                       // Checks between the NOCPP markers only report or warn; two
                       // of the helpers may also substitute a different ch.
  5954                 if (value == 0x0D) {

  5955                     errNcrCr();

  5956                 } else if ((value <= 0x0008) || (value == 0x000B)

  5957                         || (value >= 0x000E && value <= 0x001F)) {

                           // Zero was handled above, so this covers 0x01..0x08,
                           // 0x0B and 0x0E..0x1F.
  5958                     ch = errNcrControlChar(ch);

  5959                 } else if (value >= 0xFDD0 && value <= 0xFDEF) {

  5960                     errNcrUnassigned();

  5961                 } else if ((value & 0xFFFE) == 0xFFFE) {

                           // Matches 0xFFFE and 0xFFFF.
  5962                     ch = errNcrNonCharacter(ch);

  5963                 } else if (value >= 0x007F && value <= 0x009F) {

                           // 0x80..0x9F were already consumed by the first branch,
                           // so only 0x7F can reach this point.
  5964                     errNcrControlChar();

  5965                 } else {

  5966                     maybeWarnPrivateUse(ch);

  5967                 }

  5968                 // ]NOCPP]

                       // Emit through the bmpChar field, used here as a one-element
                       // scratch array.
  5969                 bmpChar[0] = ch;

  5970                 emitOrAppendOne(bmpChar, returnState);

  5971             }

  5972         } else if (value <= 0x10FFFF) {

                   // Above 0xFFFF but within Unicode: emitted as two UTF-16 code
                   // units.
  5973             // [NOCPP[

  5974             maybeWarnPrivateUseAstral();

                   // Low 16 bits of 0xFFFE or 0xFFFF: the last two code points of a
                   // plane.
  5975             if ((value & 0xFFFE) == 0xFFFE) {

  5976                 errAstralNonCharacter(value);

  5977             }

  5978             // ]NOCPP]

                   // The first unit comes from the bits above the low ten plus
                   // Tokenizer.LEAD_OFFSET; the second is 0xDC00 plus the low ten
                   // bits.
                   // NOTE(review): LEAD_OFFSET is presumably
                   // 0xD800 - (0x10000 >> 10); its definition is not shown here.
  5979             astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));

  5980             astralChar[1] = (char) (0xDC00 + (value & 0x3FF));

  5981             emitOrAppendTwo(astralChar, returnState);

  5982         } else {

                   // Greater than 0x10FFFF: report the error and emit
                   // Tokenizer.REPLACEMENT_CHARACTER.
  5983             errNcrOutOfRange();

  5984             emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);

  5985         }

  5986     }

  /**
   * Finishes tokenization when the input has ended.
   *
   * Starting from the saved tokenizer state, this reports the EOF-related
   * error for whatever construct was left open, emits what had been
   * collected for it (pending characters, a comment, or a DOCTYPE), and
   * finally calls {@code tokenHandler.eof()}.
   *
   * States that were in the middle of a character reference first resolve
   * or flush the reference and then run the switch again for the state the
   * reference was found in ({@code returnState}).
   *
   * @throws SAXException
   *             declared by this method; presumably propagated from the
   *             token handler or from the error-reporting helpers called
   *             here (their bodies are not visible in this part of the
   *             file)
   */
  5988     public void eof() throws SAXException {
               // Work on local copies; this method does not assign to
               // stateSave or returnStateSave directly.
  5989         int state = stateSave;
  5990         int returnState = returnStateSave;
               // Every case either leaves via "break eofloop" or sets
               // state = returnState and continues, which dispatches again on
               // the state the character reference occurred in.
  5992         eofloop: for (;;) {
  5993             switch (state) {
  5994                 case SCRIPT_DATA_LESS_THAN_SIGN:
  5995                 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
  5996                     /*
  5997                      * Otherwise, emit a U+003C LESS-THAN SIGN character token
  5998                      */
  5999                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  6000                     /*
  6001                      * and reconsume the current input character in the data
  6002                      * state.
  6003                      */
  6004                     break eofloop;
  6005                 case TAG_OPEN:
  6006                     /*
  6007                      * The behavior of this state depends on the content model
  6008                      * flag.
  6009                      */
  6010                     /*
  6011                      * Anything else Parse error.
  6012                      */
  6013                     errEofAfterLt();
  6014                     /*
  6015                      * Emit a U+003C LESS-THAN SIGN character token
  6016                      */
  6017                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  6018                     /*
  6019                      * and reconsume the current input character in the data
  6020                      * state.
  6021                      */
  6022                     break eofloop;
  6023                 case RAWTEXT_RCDATA_LESS_THAN_SIGN:
  6024                     /*
  6025                      * Emit a U+003C LESS-THAN SIGN character token
  6026                      */
  6027                     tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  6028                     /*
  6029                      * and reconsume the current input character in the RCDATA
  6030                      * state.
  6031                      */
  6032                     break eofloop;
  6033                 case NON_DATA_END_TAG_NAME:
  6034                     /*
  6035                      * Emit a U+003C LESS-THAN SIGN character token, a U+002F
  6036                      * SOLIDUS character token,
  6037                      */
  6038                     tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
  6039                     /*
  6040                      * a character token for each of the characters in the
  6041                      * temporary buffer (in the order they were added to the
  6042                      * buffer),
  6043                      */
  6044                     emitStrBuf();
  6045                     /*
  6046                      * and reconsume the current input character in the RCDATA
  6047                      * state.
  6048                      */
  6049                     break eofloop;
  6050                 case CLOSE_TAG_OPEN:
  6051                     /* EOF Parse error. */
  6052                     errEofAfterLt();
  6053                     /*
  6054                      * Emit a U+003C LESS-THAN SIGN character token and a U+002F
  6055                      * SOLIDUS character token.
  6056                      */
  6057                     tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
  6058                     /*
  6059                      * Reconsume the EOF character in the data state.
  6060                      */
  6061                     break eofloop;
  6062                 case TAG_NAME:
  6063                     /*
  6064                      * EOF Parse error.
  6065                      */
  6066                     errEofInTagName();
  6067                     /*
  6068                      * Reconsume the EOF character in the data state.
  6069                      */
  6070                     break eofloop;
  6071                 case BEFORE_ATTRIBUTE_NAME:
  6072                 case AFTER_ATTRIBUTE_VALUE_QUOTED:
  6073                 case SELF_CLOSING_START_TAG:
  6074                     /* EOF Parse error. */
  6075                     errEofWithoutGt();
  6076                     /*
  6077                      * Reconsume the EOF character in the data state.
  6078                      */
  6079                     break eofloop;
  6080                 case ATTRIBUTE_NAME:
  6081                     /*
  6082                      * EOF Parse error.
  6083                      */
  6084                     errEofInAttributeName();
  6085                     /*
  6086                      * Reconsume the EOF character in the data state.
  6087                      */
  6088                     break eofloop;
  6089                 case AFTER_ATTRIBUTE_NAME:
  6090                 case BEFORE_ATTRIBUTE_VALUE:
  6091                     /* EOF Parse error. */
  6092                     errEofWithoutGt();
  6093                     /*
  6094                      * Reconsume the EOF character in the data state.
  6095                      */
  6096                     break eofloop;
  6097                 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
  6098                 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
  6099                 case ATTRIBUTE_VALUE_UNQUOTED:
  6100                     /* EOF Parse error. */
  6101                     errEofInAttributeValue();
  6102                     /*
  6103                      * Reconsume the EOF character in the data state.
  6104                      */
  6105                     break eofloop;
  6106                 case BOGUS_COMMENT:
  6107                     emitComment(0, 0);
  6108                     break eofloop;
  6109                 case BOGUS_COMMENT_HYPHEN:
  6110                     // [NOCPP[
  6111                     maybeAppendSpaceToBogusComment();
  6112                     // ]NOCPP]
  6113                     emitComment(0, 0);
  6114                     break eofloop;
  6115                 case MARKUP_DECLARATION_OPEN:
  6116                     errBogusComment();
  6117                     clearLongStrBuf();
  6118                     emitComment(0, 0);
  6119                     break eofloop;
  6120                 case MARKUP_DECLARATION_HYPHEN:
  6121                     errBogusComment();
  6122                     emitComment(0, 0);
  6123                     break eofloop;
  6124                 case MARKUP_DECLARATION_OCTYPE:
                           // NOTE(review): index < 6 presumably means the
                           // "doctype" keyword had not been matched completely,
                           // so the input is treated as a bogus comment; confirm
                           // against the MARKUP_DECLARATION_OCTYPE state code.
  6125                     if (index < 6) {
  6126                         errBogusComment();
  6127                         emitComment(0, 0);
  6128                     } else {
  6129                         /* EOF Parse error. */
  6130                         errEofInDoctype();
  6131                         /*
  6132                          * Create a new DOCTYPE token. Set its force-quirks flag
  6133                          * to on.
  6134                          */
  6135                         doctypeName = "";
  6136                         if (systemIdentifier != null) {
  6137                             Portability.releaseString(systemIdentifier);
  6138                             systemIdentifier = null;
  6139                         }
  6140                         if (publicIdentifier != null) {
  6141                             Portability.releaseString(publicIdentifier);
  6142                             publicIdentifier = null;
  6143                         }
  6144                         forceQuirks = true;
  6145                         /*
  6146                          * Emit the token.
  6147                          */
  6148                         emitDoctypeToken(0);
  6149                         /*
  6150                          * Reconsume the EOF character in the data state.
  6151                          */
                               // Same effect as the break that follows the
                               // if/else; kept as in the original.
  6152                         break eofloop;
  6153                     }
  6154                     break eofloop;
  6155                 case COMMENT_START:
  6156                 case COMMENT:
  6157                     /*
  6158                      * EOF Parse error.
  6159                      */
  6160                     errEofInComment();
  6161                     /* Emit the comment token. */
  6162                     emitComment(0, 0);
  6163                     /*
  6164                      * Reconsume the EOF character in the data state.
  6165                      */
  6166                     break eofloop;
  6167                 case COMMENT_END:
  6168                     errEofInComment();
  6169                     /* Emit the comment token. */
                           // NOTE(review): the first argument differs per comment
                           // state (0, 1, 2 or 3). Presumably it is the number of
                           // trailing buffered characters to leave out of the
                           // comment text; confirm in emitComment.
  6170                     emitComment(2, 0);
  6171                     /*
  6172                      * Reconsume the EOF character in the data state.
  6173                      */
  6174                     break eofloop;
  6175                 case COMMENT_END_DASH:
  6176                 case COMMENT_START_DASH:
  6177                     errEofInComment();
  6178                     /* Emit the comment token. */
  6179                     emitComment(1, 0);
  6180                     /*
  6181                      * Reconsume the EOF character in the data state.
  6182                      */
  6183                     break eofloop;
  6184                 case COMMENT_END_BANG:
  6185                     errEofInComment();
  6186                     /* Emit the comment token. */
  6187                     emitComment(3, 0);
  6188                     /*
  6189                      * Reconsume the EOF character in the data state.
  6190                      */
  6191                     break eofloop;
  6192                 case DOCTYPE:
  6193                 case BEFORE_DOCTYPE_NAME:
  6194                     errEofInDoctype();
  6195                     /*
  6196                      * Create a new DOCTYPE token. Set its force-quirks flag to
  6197                      * on.
  6198                      */
  6199                     forceQuirks = true;
  6200                     /*
  6201                      * Emit the token.
  6202                      */
  6203                     emitDoctypeToken(0);
  6204                     /*
  6205                      * Reconsume the EOF character in the data state.
  6206                      */
  6207                     break eofloop;
  6208                 case DOCTYPE_NAME:
  6209                     errEofInDoctype();
  6210                     strBufToDoctypeName();
  6211                     /*
  6212                      * Set the DOCTYPE token's force-quirks flag to on.
  6213                      */
  6214                     forceQuirks = true;
  6215                     /*
  6216                      * Emit that DOCTYPE token.
  6217                      */
  6218                     emitDoctypeToken(0);
  6219                     /*
  6220                      * Reconsume the EOF character in the data state.
  6221                      */
  6222                     break eofloop;
  6223                 case DOCTYPE_UBLIC:
  6224                 case DOCTYPE_YSTEM:
  6225                 case AFTER_DOCTYPE_NAME:
  6226                 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
  6227                 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
  6228                 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
  6229                     errEofInDoctype();
  6230                     /*
  6231                      * Set the DOCTYPE token's force-quirks flag to on.
  6232                      */
  6233                     forceQuirks = true;
  6234                     /*
  6235                      * Emit that DOCTYPE token.
  6236                      */
  6237                     emitDoctypeToken(0);
  6238                     /*
  6239                      * Reconsume the EOF character in the data state.
  6240                      */
  6241                     break eofloop;
  6242                 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
  6243                 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
  6244                     /* EOF Parse error. */
  6245                     errEofInPublicId();
  6246                     /*
  6247                      * Set the DOCTYPE token's force-quirks flag to on.
  6248                      */
  6249                     forceQuirks = true;
  6250                     /*
  6251                      * Emit that DOCTYPE token.
  6252                      */
                           // The unterminated identifier text collected so far is
                           // still reported as the public identifier.
  6253                     publicIdentifier = longStrBufToString();
  6254                     emitDoctypeToken(0);
  6255                     /*
  6256                      * Reconsume the EOF character in the data state.
  6257                      */
  6258                     break eofloop;
  6259                 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
  6260                 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
  6261                 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
  6262                     errEofInDoctype();
  6263                     /*
  6264                      * Set the DOCTYPE token's force-quirks flag to on.
  6265                      */
  6266                     forceQuirks = true;
  6267                     /*
  6268                      * Emit that DOCTYPE token.
  6269                      */
  6270                     emitDoctypeToken(0);
  6271                     /*
  6272                      * Reconsume the EOF character in the data state.
  6273                      */
  6274                     break eofloop;
  6275                 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
  6276                 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
  6277                     /* EOF Parse error. */
  6278                     errEofInSystemId();
  6279                     /*
  6280                      * Set the DOCTYPE token's force-quirks flag to on.
  6281                      */
  6282                     forceQuirks = true;
  6283                     /*
  6284                      * Emit that DOCTYPE token.
  6285                      */
                           // As with the public identifier: the partial text
                           // becomes the system identifier.
  6286                     systemIdentifier = longStrBufToString();
  6287                     emitDoctypeToken(0);
  6288                     /*
  6289                      * Reconsume the EOF character in the data state.
  6290                      */
  6291                     break eofloop;
  6292                 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
  6293                     errEofInDoctype();
  6294                     /*
  6295                      * Set the DOCTYPE token's force-quirks flag to on.
  6296                      */
  6297                     forceQuirks = true;
  6298                     /*
  6299                      * Emit that DOCTYPE token.
  6300                      */
  6301                     emitDoctypeToken(0);
  6302                     /*
  6303                      * Reconsume the EOF character in the data state.
  6304                      */
  6305                     break eofloop;
  6306                 case BOGUS_DOCTYPE:
  6307                     /*
  6308                      * Emit that DOCTYPE token.
  6309                      */
  6310                     emitDoctypeToken(0);
  6311                     /*
  6312                      * Reconsume the EOF character in the data state.
  6313                      */
  6314                     break eofloop;
  6315                 case CONSUME_CHARACTER_REFERENCE:
  6316                     /*
  6317                      * Unlike the definition in the spec, this state does not
  6318                      * return a value and never requires the caller to
  6319                      * backtrack. This state takes care of emitting characters
  6320                      * or appending to the current attribute value. It also
  6321                      * takes care of that in the case when consuming the entity
  6322                      * fails.
  6323                      */
  6324                     /*
  6325                      * This section defines how to consume an entity. This
  6326                      * definition is used when parsing entities in text and in
  6327                      * attributes.
  6328                      *
  6329                      * The behavior depends on the identity of the next
  6330                      * character (the one immediately after the U+0026 AMPERSAND
  6331                      * character):
  6332                      */
                           // Nothing follows the ampersand at EOF: pass on what
                           // is buffered and handle EOF for the return state.
  6334                     emitOrAppendStrBuf(returnState);
  6335                     state = returnState;
  6336                     continue;
  6337                 case CHARACTER_REFERENCE_HILO_LOOKUP:
  6338                     errNoNamedCharacterMatch();
  6339                     emitOrAppendStrBuf(returnState);
  6340                     state = returnState;
  6341                     continue;
  6342                 case CHARACTER_REFERENCE_TAIL:
  6343                     outer: for (;;) {
                               // c is never reassigned in this loop, so the name
                               // table is always compared against U+0000 here.
  6344                         char c = '\u0000';
  6345                         entCol++;
  6346                         /*
  6347                          * Consume the maximum number of characters possible,
  6348                          * with the consumed characters matching one of the
  6349                          * identifiers in the first column of the named
  6350                          * character references table (in a case-sensitive
  6351                          * manner).
  6352                          */
  6353                         hiloop: for (;;) {
  6354                             if (hi == -1) {
  6355                                 break hiloop;
  6356                             }
  6357                             if (entCol == NamedCharacters.NAMES[hi].length()) {
  6358                                 break hiloop;
  6359                             }
  6360                             if (entCol > NamedCharacters.NAMES[hi].length()) {
  6361                                 break outer;
  6362                             } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
  6363                                 hi--;
  6364                             } else {
  6365                                 break hiloop;
  6366                             }
  6367                         }
  6369                         loloop: for (;;) {
  6370                             if (hi < lo) {
  6371                                 break outer;
  6372                             }
  6373                             if (entCol == NamedCharacters.NAMES[lo].length()) {
                                       // A name ends exactly at this column:
                                       // remember it and how much of strBuf it
                                       // covers.
  6374                                 candidate = lo;
  6375                                 strBufMark = strBufLen;
  6376                                 lo++;
  6377                             } else if (entCol > NamedCharacters.NAMES[lo].length()) {
  6378                                 break outer;
  6379                             } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
  6380                                 lo++;
  6381                             } else {
  6382                                 break loloop;
  6383                             }
  6384                         }
  6385                         if (hi < lo) {
  6386                             break outer;
  6387                         }
  6388                         continue;
  6389                     }
  6391                     if (candidate == -1) {
  6392                         /*
  6393                          * If no match can be made, then this is a parse error.
  6394                          */
  6395                         errNoNamedCharacterMatch();
  6396                         emitOrAppendStrBuf(returnState);
  6397                         state = returnState;
  6398                         continue eofloop;
  6399                     } else {
  6400                         @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
  6401                         if (candidateName.length() == 0
  6402                                 || candidateName.charAt(candidateName.length() - 1) != ';') {
  6403                             /*
  6404                              * If the last character matched is not a U+003B
  6405                              * SEMICOLON (;), there is a parse error.
  6406                              */
  6407                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  6408                                 /*
  6409                                  * If the entity is being consumed as part of an
  6410                                  * attribute, and the last character matched is
  6411                                  * not a U+003B SEMICOLON (;),
  6412                                  */
                                       // ch is the first buffered character after
                                       // the matched name, or U+0000 when the
                                       // match used up the whole buffer.
  6413                                 char ch;
  6414                                 if (strBufMark == strBufLen) {
  6415                                     ch = '\u0000';
  6416                                 } else {
  6417                                     ch = strBuf[strBufMark];
  6418                                 }
  6419                                 if ((ch >= '0' && ch <= '9')
  6420                                         || (ch >= 'A' && ch <= 'Z')
  6421                                         || (ch >= 'a' && ch <= 'z')) {
  6422                                     /*
  6423                                      * and the next character is in the range
  6424                                      * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
  6425                                      * U+0041 LATIN CAPITAL LETTER A to U+005A
  6426                                      * LATIN CAPITAL LETTER Z, or U+0061 LATIN
  6427                                      * SMALL LETTER A to U+007A LATIN SMALL
  6428                                      * LETTER Z, then, for historical reasons,
  6429                                      * all the characters that were matched
  6430                                      * after the U+0026 AMPERSAND (&) must be
  6431                                      * unconsumed, and nothing is returned.
  6432                                      */
  6433                                     errNoNamedCharacterMatch();
  6434                                     appendStrBufToLongStrBuf();
  6435                                     state = returnState;
  6436                                     continue eofloop;
  6437                                 }
  6438                             }
  6439                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  6440                                 errUnescapedAmpersandInterpretedAsCharacterReference();
  6441                             } else {
  6442                                 errNotSemicolonTerminated();
  6443                             }
  6444                         }
  6446                         /*
  6447                          * Otherwise, return a character token for the character
  6448                          * corresponding to the entity name (as given by the
  6449                          * second column of the named character references
  6450                          * table).
  6451                          */
  6452                         @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
  6453                         if (
  6454                         // [NOCPP[
  6455                         val.length == 1
  6456                         // ]NOCPP]
  6457                         // CPPONLY: val[1] == 0
  6458                         ) {
  6459                             emitOrAppendOne(val, returnState);
  6460                         } else {
  6461                             emitOrAppendTwo(val, returnState);
  6462                         }
  6463                         // this is so complicated!
                               // Characters in strBuf from strBufMark onwards were
                               // read after the matched name; pass them on
                               // unchanged (to the long buffer or as characters).
  6464                         if (strBufMark < strBufLen) {
  6465                             if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  6466                                 for (int i = strBufMark; i < strBufLen; i++) {
  6467                                     appendLongStrBuf(strBuf[i]);
  6468                                 }
  6469                             } else {
  6470                                 tokenHandler.characters(strBuf, strBufMark,
  6471                                         strBufLen - strBufMark);
  6472                             }
  6473                         }
  6474                         state = returnState;
  6475                         continue eofloop;
  6476                         /*
  6477                          * If the markup contains I'm &notit; I tell you, the
  6478                          * entity is parsed as "not", as in, I'm ¬it; I tell
  6479                          * you. But if the markup was I'm &notin; I tell you,
  6480                          * the entity would be parsed as "notin;", resulting in
  6481                          * I'm ∉ I tell you.
  6482                          */
  6483                     }
  6484                 case CONSUME_NCR:
  6485                 case DECIMAL_NRC_LOOP:
  6486                 case HEX_NCR_LOOP:
  6487                     /*
  6488                      * If no characters match the range, then don't consume any
  6489                      * characters (and unconsume the U+0023 NUMBER SIGN
  6490                      * character and, if appropriate, the X character). This is
  6491                      * a parse error; nothing is returned.
  6492                      *
  6493                      * Otherwise, if the next character is a U+003B SEMICOLON,
  6494                      * consume that too. If it isn't, there is a parse error.
  6495                      */
  6496                     if (!seenDigits) {
  6497                         errNoDigitsInNCR();
  6498                         emitOrAppendStrBuf(returnState);
  6499                         state = returnState;
  6500                         continue;
  6501                     } else {
                               // Digits were read but the input ended before a
                               // terminating semicolon.
  6502                         errCharRefLacksSemicolon();
  6503                     }
  6504                     // WARNING previous state sets reconsume
  6505                     handleNcrValue(returnState);
  6506                     state = returnState;
  6507                     continue;
  6508                 case CDATA_RSQB:
                           // NOTE(review): RSQB_RSQB presumably holds "]]"; one
                           // character of it is emitted here and two in the next
                           // case.
  6509                     tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
  6510                     break eofloop;
  6511                 case CDATA_RSQB_RSQB:
  6512                     tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
  6513                     break eofloop;
  6514                 case DATA:
  6515                 default:
  6516                     break eofloop;
  6517             }
  6518         }
  6519         // case DATA:
  6520         /*
  6521          * EOF Emit an end-of-file token.
  6522          */
  6523         tokenHandler.eof();
  6524         return;
  6525     }

  6527     private void emitDoctypeToken(int pos) throws SAXException {

  6528         cstart = pos + 1;

  6529         tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,

  6530                 forceQuirks);

  6531         // It is OK and sufficient to release these here, since

  6532         // there's no way out of the doctype states than through paths

  6533         // that call this method.

  6534         doctypeName = null;

  6535         Portability.releaseString(publicIdentifier);

  6536         publicIdentifier = null;

  6537         Portability.releaseString(systemIdentifier);

  6538         systemIdentifier = null;

  6539     }

  6541     @Inline protected char checkChar(@NoLength char[] buf, int pos)

  6542             throws SAXException {

  6543         return buf[pos];

  6544     }

  6546     public boolean internalEncodingDeclaration(String internalCharset)

  6547             throws SAXException {

  6548         if (encodingDeclarationHandler != null) {

  6549             return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);

  6550         }

  6551         return false;

  6552     }

  6554     /**

  6555      * @param val

  6556      * @throws SAXException

  6557      */

  6558     private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)

  6559             throws SAXException {

  6560         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {

  6561             appendLongStrBuf(val[0]);

  6562             appendLongStrBuf(val[1]);

  6563         } else {

  6564             tokenHandler.characters(val, 0, 2);

  6565         }

  6566     }

  6568     private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)

  6569             throws SAXException {

  6570         if ((returnState & DATA_AND_RCDATA_MASK) != 0) {

  6571             appendLongStrBuf(val[0]);

  6572         } else {

  6573             tokenHandler.characters(val, 0, 1);

  6574         }

  6575     }

  /**
   * Ends tokenization: drops the tokenizer's buffers and any partially
   * built token pieces, then tells the token handler that tokenization has
   * ended.
   *
   * The working buffers and the DOCTYPE name are set to null; identifier
   * strings, the tag name and the attribute name are released first when
   * present. The attribute holder is dealt with only after the handler has
   * been notified.
   *
   * @throws SAXException
   *             declared by this method; presumably propagated from
   *             {@code tokenHandler.endTokenization()}
   */
  6577     public void end() throws SAXException {
  6578         strBuf = null;
  6579         longStrBuf = null;
  6580         doctypeName = null;
  6581         if (systemIdentifier != null) {
  6582             Portability.releaseString(systemIdentifier);
  6583             systemIdentifier = null;
  6584         }
  6585         if (publicIdentifier != null) {
  6586             Portability.releaseString(publicIdentifier);
  6587             publicIdentifier = null;
  6588         }
  6589         if (tagName != null) {
  6590             tagName.release();
  6591             tagName = null;
  6592         }
  6593         if (attributeName != null) {
  6594             attributeName.release();
  6595             attributeName = null;
  6596         }
               // The handler is notified after the releases above and before
               // the attribute holder is dropped.
  6597         tokenHandler.endTokenization();
  6598         if (attributes != null) {
                   // The markers below select per-target code: Java drops the
                   // reference, while the CPPONLY line clears the holder
                   // instead. Keep the marker comments exactly as they are.
  6599             // [NOCPP[
  6600             attributes = null;
  6601             // ]NOCPP]
  6602             // CPPONLY: attributes.clear(mappingLangToXmlLang);
  6603         }
  6604     }

    /**
     * Sets the <code>shouldSuspend</code> flag. NOTE(review): the code that
     * reads the flag is not visible here; presumably the tokenization loop
     * checks it to stop early -- confirm.
     */
    public void requestSuspension() {
        shouldSuspend = true;
    }

  6610     // [NOCPP[

    /**
     * Sets the <code>confident</code> flag to <code>true</code>.
     * NOTE(review): presumably this refers to confidence in the character
     * encoding -- confirm against the field's declaration.
     */
    public void becomeConfident() {
        confident = true;
    }

    /**
     * Stub accessor: this implementation consults no state and always
     * reports <code>false</code>.
     *
     * @return always <code>false</code>
     */
    public boolean isNextCharOnNewLine() {
        return false;
    }

    /**
     * Returns the current value of the <code>lastCR</code> flag.
     * NOTE(review): by its name the flag presumably records that the
     * previously processed character was a carriage return -- confirm.
     *
     * @return the value of <code>lastCR</code>
     */
    public boolean isPrevCR() {
        return lastCR;
    }

    /**
     * Stub accessor for the line number: this implementation does not
     * report a real position and always returns <code>-1</code>.
     *
     * @return always <code>-1</code>
     */
    public int getLine() {
        return -1;
    }

    /**
     * Stub accessor for the column number: this implementation does not
     * report a real position and always returns <code>-1</code>.
     *
     * @return always <code>-1</code>
     */
    public int getCol() {
        return -1;
    }

  6647     // ]NOCPP]

  6649     public boolean isInDataState() {

  6650         return (stateSave == DATA);

  6651     }

  6653     public void resetToDataState() {

  6654         strBufLen = 0;

  6655         longStrBufLen = 0;

  6656         stateSave = Tokenizer.DATA;

  6657         // line = 1; XXX line numbers

  6658         lastCR = false;

  6659         index = 0;

  6660         forceQuirks = false;

  6661         additional = '\u0000';

  6662         entCol = -1;

  6663         firstCharKey = -1;

  6664         lo = 0;

  6665         hi = 0; // will always be overwritten before use anyway

  6666         candidate = -1;

  6667         strBufMark = 0;

  6668         prevValue = -1;

  6669         value = 0;

  6670         seenDigits = false;

  6671         endTag = false;

  6672         shouldSuspend = false;

  6673         initDoctypeFields();

  6674         if (tagName != null) {

  6675             tagName.release();

  6676             tagName = null;

  6677         }

  6678         if (attributeName != null) {

  6679             attributeName.release();

  6680             attributeName = null;

  6681         }

  6682         if (newAttributesEachTime) {

  6683             if (attributes != null) {

  6684                 Portability.delete(attributes);

  6685                 attributes = null;

  6686             }

  6687         }

  6688     }

  6690     public void loadState(Tokenizer other) throws SAXException {

  6691         strBufLen = other.strBufLen;

  6692         if (strBufLen > strBuf.length) {

  6693             strBuf = new char[strBufLen];

  6694         }

  6695         System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);

  6697         longStrBufLen = other.longStrBufLen;

  6698         if (longStrBufLen > longStrBuf.length) {

  6699             longStrBuf = new char[longStrBufLen];

  6700         }

  6701         System.arraycopy(other.longStrBuf, 0, longStrBuf, 0, longStrBufLen);

  6703         stateSave = other.stateSave;

  6704         returnStateSave = other.returnStateSave;

  6705         endTagExpectation = other.endTagExpectation;

  6706         endTagExpectationAsArray = other.endTagExpectationAsArray;

  6707         // line = 1; XXX line numbers

  6708         lastCR = other.lastCR;

  6709         index = other.index;

  6710         forceQuirks = other.forceQuirks;

  6711         additional = other.additional;

  6712         entCol = other.entCol;

  6713         firstCharKey = other.firstCharKey;

  6714         lo = other.lo;

  6715         hi = other.hi;

  6716         candidate = other.candidate;

  6717         strBufMark = other.strBufMark;

  6718         prevValue = other.prevValue;

  6719         value = other.value;

  6720         seenDigits = other.seenDigits;

  6721         endTag = other.endTag;

  6722         shouldSuspend = false;

  6724         if (other.doctypeName == null) {

  6725             doctypeName = null;

  6726         } else {

  6727             doctypeName = Portability.newLocalFromLocal(other.doctypeName,

  6728                     interner);

  6729         }

  6731         Portability.releaseString(systemIdentifier);

  6732         if (other.systemIdentifier == null) {

  6733             systemIdentifier = null;

  6734         } else {

  6735             systemIdentifier = Portability.newStringFromString(other.systemIdentifier);

  6736         }

  6738         Portability.releaseString(publicIdentifier);

  6739         if (other.publicIdentifier == null) {

  6740             publicIdentifier = null;

  6741         } else {

  6742             publicIdentifier = Portability.newStringFromString(other.publicIdentifier);

  6743         }

  6745         if (tagName != null) {

  6746             tagName.release();

  6747         }

  6748         if (other.tagName == null) {

  6749             tagName = null;

  6750         } else {

  6751             tagName = other.tagName.cloneElementName(interner);

  6752         }

  6754         if (attributeName != null) {

  6755             attributeName.release();

  6756         }

  6757         if (other.attributeName == null) {

  6758             attributeName = null;

  6759         } else {

  6760             attributeName = other.attributeName.cloneAttributeName(interner);

  6761         }

  6763         Portability.delete(attributes);

  6764         if (other.attributes == null) {

  6765             attributes = null;

  6766         } else {

  6767             attributes = other.attributes.cloneAttributes(interner);

  6768         }

  6769     }

    /**
     * Prepares the tokenizer for a new run: clears <code>confident</code>,
     * allocates fresh buffers (64 and 1024 chars), sets <code>line</code>
     * to 1, re-reads <code>wantsComments</code> from the token handler,
     * creates the shared attribute holder when holders are not created
     * anew each time, and finishes with <code>resetToDataState()</code>.
     * The only token handler method called in this body is
     * <code>wantsComments()</code>.
     *
     * @throws SAXException
     *             declared on the signature; which call (if any) can raise
     *             it is not visible from this body
     */
    public void initializeWithoutStarting() throws SAXException {
        confident = false;
        strBuf = new char[64];
        longStrBuf = new char[1024];
        line = 1;
        // The section between the markers is a translator directive region
        // (Java-only code).
        // [NOCPP[
        html4 = false;
        metaBoundaryPassed = false;
        wantsComments = tokenHandler.wantsComments();
        if (!newAttributesEachTime) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }
        // ]NOCPP]
        resetToDataState();
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report garbage following {@code </} --
     * confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errGarbageAfterLtSlash() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report the sequence {@code </>} -- confirm
     * at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errLtSlashGt() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to warn about {@code </} inside RCDATA --
     * confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errWarnLtSlashInRcdata() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report an HTML4-specific problem with
     * {@code </} inside RCDATA -- confirm at the call sites.
     *
     * @param folded
     *            ignored by this empty body
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a character reference that lacks its
     * terminating semicolon -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errCharRefLacksSemicolon() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a numeric character reference without
     * digits -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNoDigitsInNCR() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report {@code >} inside a system identifier
     * -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errGtInSystemId() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report {@code >} inside a public identifier
     * -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errGtInPublicId() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a doctype without a name -- confirm
     * at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNamelessDoctype() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report consecutive hyphens (likely within a
     * comment) -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errConsecutiveHyphens() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a comment that ends prematurely --
     * confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errPrematureEndOfComment() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a bogus comment -- confirm at the
     * call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errBogusComment() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a problematic character (or NUL) in
     * an unquoted attribute value -- confirm at the call sites.
     *
     * @param c
     *            ignored by this empty body
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a slash that is not followed by
     * {@code >} -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errSlashNotFollowedByGt() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report XML-style void element syntax in an
     * HTML4 context -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errHtml4XmlVoidSyntax() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report attributes that are not separated by
     * whitespace -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNoSpaceBetweenAttributes() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a non-name character in an unquoted
     * attribute value in an HTML4 context -- confirm at the call sites.
     *
     * @param c
     *            ignored by this empty body
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errHtml4NonNameInUnquotedAttribute(char c)
            throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report {@code <}, {@code =}, a grave accent
     * or NUL in an unquoted attribute value -- confirm at the call sites.
     *
     * @param c
     *            ignored by this empty body
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
            throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a missing attribute value -- confirm
     * at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errAttributeValueMissing() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a bad character (or NUL) before an
     * attribute name -- confirm at the call sites.
     *
     * @param c
     *            ignored by this empty body
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errBadCharBeforeAttributeNameOrNull(char c)
            throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report an equals sign before an attribute
     * name -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errEqualsSignBeforeAttributeName() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a bad character after {@code <} --
     * confirm at the call sites.
     *
     * @param c
     *            ignored by this empty body
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errBadCharAfterLt(char c) throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report the sequence {@code <>} -- confirm at
     * the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errLtGt() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a processing instruction in the input
     * -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errProcessingInstruction() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report an unescaped ampersand that got
     * interpreted as a character reference -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errUnescapedAmpersandInterpretedAsCharacterReference()
            throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a reference that is not terminated by
     * a semicolon -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNotSemicolonTerminated() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report that no named character reference
     * matched -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNoNamedCharacterMatch() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a quote character before an attribute
     * name -- confirm at the call sites.
     *
     * @param c
     *            ignored by this empty body
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errQuoteBeforeAttributeName(char c) throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a quote, {@code <} or NUL inside an
     * attribute name -- confirm at the call sites.
     *
     * @param c
     *            ignored by this empty body
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errQuoteOrLtInAttributeNameOrNull(char c)
            throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report that a public identifier was expected
     * -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errExpectedPublicId() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a bogus doctype -- confirm at the
     * call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errBogusDoctype() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to warn about an astral private use character
     * -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void maybeWarnPrivateUseAstral() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to warn when the character is a private use
     * character -- confirm at the call sites.
     *
     * @param ch
     *            ignored by this empty body
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void maybeWarnPrivateUse(char ch) throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report attributes found on an end tag --
     * confirm at the call sites.
     *
     * @param attrs
     *            ignored by this empty body
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
            throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a self-closing slash on an end tag --
     * confirm at the call sites.
     *
     * @param selfClosing
     *            ignored by this empty body
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void maybeErrSlashInEndTag(boolean selfClosing)
            throws SAXException {
    }

    /**
     * Pass-through hook: this implementation reports nothing and returns
     * its argument unchanged. NOTE(review): by its name, presumably the
     * place to report a numeric character reference that expands to a
     * non-character -- confirm at the call sites.
     *
     * @param ch
     *            the character handed to the hook
     * @return <code>ch</code>, unchanged
     * @throws SAXException
     *             not thrown by this body
     */
    protected char errNcrNonCharacter(char ch) throws SAXException {
        return ch;
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report an astral non-character -- confirm at
     * the call sites.
     *
     * @param ch
     *            ignored by this empty body
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errAstralNonCharacter(int ch) throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a numeric character reference that
     * expands to a surrogate -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNcrSurrogate() throws SAXException {
    }

    /**
     * Pass-through hook: this implementation reports nothing and returns
     * its argument unchanged. NOTE(review): by its name, presumably the
     * place to report a numeric character reference that expands to a
     * control character -- confirm at the call sites.
     *
     * @param ch
     *            the character handed to the hook
     * @return <code>ch</code>, unchanged
     * @throws SAXException
     *             not thrown by this body
     */
    protected char errNcrControlChar(char ch) throws SAXException {
        return ch;
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a numeric character reference that
     * expands to a carriage return -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNcrCr() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a numeric character reference in the
     * C1 controls range -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNcrInC1Range() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report end of input inside a public
     * identifier -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errEofInPublicId() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report end of input inside a comment --
     * confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errEofInComment() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report end of input inside a doctype --
     * confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errEofInDoctype() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report end of input inside an attribute
     * value -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errEofInAttributeValue() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report end of input inside an attribute name
     * -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errEofInAttributeName() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report end of input before a closing
     * {@code >} -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errEofWithoutGt() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report end of input inside a tag name --
     * confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errEofInTagName() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report end of input inside an end tag --
     * confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errEofInEndTag() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report end of input right after {@code <} --
     * confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errEofAfterLt() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a numeric character reference whose
     * value is out of range -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNcrOutOfRange() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a numeric character reference to an
     * unassigned code point -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNcrUnassigned() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a duplicate attribute -- confirm at
     * the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errDuplicateAttribute() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report end of input inside a system
     * identifier -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errEofInSystemId() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report that a system identifier was expected
     * -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errExpectedSystemId() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report missing whitespace before the doctype
     * name -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report the sequence {@code --!} -- confirm
     * at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errHyphenHyphenBang() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. This is the no-argument
     * overload; an overload taking and returning a <code>char</code> is
     * declared separately in this class. NOTE(review): by its name,
     * presumably the place to report a numeric character reference that
     * expands to a control character -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNcrControlChar() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report a numeric character reference whose
     * value is zero -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNcrZero() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report missing whitespace between the
     * doctype SYSTEM keyword and the following quote -- confirm at the
     * call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
            throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report missing whitespace between the public
     * and system identifiers -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to report missing whitespace between the
     * doctype PUBLIC keyword and the following quote -- confirm at the
     * call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
            throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to take note of an attribute that has no value
     * -- confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void noteAttributeWithoutValue() throws SAXException {
    }

    /**
     * Empty hook: does nothing in this class. NOTE(review): by its name,
     * presumably the place to take note of an unquoted attribute value --
     * confirm at the call sites.
     *
     * @throws SAXException
     *             not thrown by this empty body
     */
    protected void noteUnquotedAttributeValue() throws SAXException {
    }

    /**
     * Stores the given handler in the <code>encodingDeclarationHandler</code>
     * field, replacing any previously stored one. No validation is done;
     * <code>null</code> is stored as-is.
     *
     * @param encodingDeclarationHandler
     *            the encodingDeclarationHandler to set
     */
    public void setEncodingDeclarationHandler(
            EncodingDeclarationHandler encodingDeclarationHandler) {
        this.encodingDeclarationHandler = encodingDeclarationHandler;
    }

    /**
     * Deletes the attribute holder through <code>Portability.delete</code>
     * and clears the reference. Per the in-body comment, the source
     * translator adds further code to this method.
     */
    void destructor() {
        // The translator will write refcount tracing stuff here
        Portability.delete(attributes);
        attributes = null;
    }

  7007     // [NOCPP[

    /**
     * Intended to set an offset to be added to the position reported to
     * <code>TransitionHandler</code>. In this implementation the body is
     * empty, so the argument is ignored and nothing is stored.
     *
     * @param offset the offset (unused here)
     */
    public void setTransitionBaseOffset(int offset) {

    }

  7019     // ]NOCPP]

  7021 }

The Tor Browser / file revision

parser/html/javasrc/Tokenizer.java@925c144e1f1f

parser/html/javasrc/Tokenizer.java