michael@0: /*
michael@0: * Copyright (c) 2005-2007 Henri Sivonen
michael@0: * Copyright (c) 2007-2013 Mozilla Foundation
michael@0: * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
michael@0: * Foundation, and Opera Software ASA.
michael@0: *
michael@0: * Permission is hereby granted, free of charge, to any person obtaining a
michael@0: * copy of this software and associated documentation files (the "Software"),
michael@0: * to deal in the Software without restriction, including without limitation
michael@0: * the rights to use, copy, modify, merge, publish, distribute, sublicense,
michael@0: * and/or sell copies of the Software, and to permit persons to whom the
michael@0: * Software is furnished to do so, subject to the following conditions:
michael@0: *
michael@0: * The above copyright notice and this permission notice shall be included in
michael@0: * all copies or substantial portions of the Software.
michael@0: *
michael@0: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
michael@0: * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
michael@0: * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
michael@0: * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
michael@0: * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
michael@0: * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
michael@0: * DEALINGS IN THE SOFTWARE.
michael@0: */
michael@0:
michael@0: /*
michael@0: * The comments following this one that use the same comment syntax as this
michael@0: * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
michael@0: * amended as of June 18 2008 and May 31 2010.
michael@0: * That document came with this statement:
michael@0: * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
michael@0: * Opera Software ASA. You are granted a license to use, reproduce and
michael@0: * create derivative works of this document."
michael@0: */
michael@0:
michael@0: package nu.validator.htmlparser.impl;
michael@0:
michael@0: import nu.validator.htmlparser.annotation.Auto;
michael@0: import nu.validator.htmlparser.annotation.CharacterName;
michael@0: import nu.validator.htmlparser.annotation.Const;
michael@0: import nu.validator.htmlparser.annotation.Inline;
michael@0: import nu.validator.htmlparser.annotation.Local;
michael@0: import nu.validator.htmlparser.annotation.NoLength;
michael@0: import nu.validator.htmlparser.common.EncodingDeclarationHandler;
michael@0: import nu.validator.htmlparser.common.Interner;
michael@0: import nu.validator.htmlparser.common.TokenHandler;
michael@0: import nu.validator.htmlparser.common.XmlViolationPolicy;
michael@0:
michael@0: import org.xml.sax.ErrorHandler;
michael@0: import org.xml.sax.Locator;
michael@0: import org.xml.sax.SAXException;
michael@0: import org.xml.sax.SAXParseException;
michael@0:
/**
 * An implementation of
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
 *
 * This class implements the <code>Locator</code> interface. This is not an
 * incidental implementation detail: Users of this class are encouraged to make
 * use of the <code>Locator</code> nature.
 *
 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
 * can be configured to treat these conditions as fatal or to coerce the infoset
 * to something that XML 1.0 allows.
 *
 * @version $Id$
 * @author hsivonen
 */
michael@0: public class Tokenizer implements Locator {
michael@0:
michael@0: private static final int DATA_AND_RCDATA_MASK = ~1;
michael@0:
michael@0: public static final int DATA = 0;
michael@0:
michael@0: public static final int RCDATA = 1;
michael@0:
michael@0: public static final int SCRIPT_DATA = 2;
michael@0:
michael@0: public static final int RAWTEXT = 3;
michael@0:
michael@0: public static final int SCRIPT_DATA_ESCAPED = 4;
michael@0:
michael@0: public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
michael@0:
michael@0: public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
michael@0:
michael@0: public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
michael@0:
michael@0: public static final int PLAINTEXT = 8;
michael@0:
michael@0: public static final int TAG_OPEN = 9;
michael@0:
michael@0: public static final int CLOSE_TAG_OPEN = 10;
michael@0:
michael@0: public static final int TAG_NAME = 11;
michael@0:
michael@0: public static final int BEFORE_ATTRIBUTE_NAME = 12;
michael@0:
michael@0: public static final int ATTRIBUTE_NAME = 13;
michael@0:
michael@0: public static final int AFTER_ATTRIBUTE_NAME = 14;
michael@0:
michael@0: public static final int BEFORE_ATTRIBUTE_VALUE = 15;
michael@0:
michael@0: public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
michael@0:
michael@0: public static final int BOGUS_COMMENT = 17;
michael@0:
michael@0: public static final int MARKUP_DECLARATION_OPEN = 18;
michael@0:
michael@0: public static final int DOCTYPE = 19;
michael@0:
michael@0: public static final int BEFORE_DOCTYPE_NAME = 20;
michael@0:
michael@0: public static final int DOCTYPE_NAME = 21;
michael@0:
michael@0: public static final int AFTER_DOCTYPE_NAME = 22;
michael@0:
michael@0: public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
michael@0:
michael@0: public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
michael@0:
michael@0: public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
michael@0:
michael@0: public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
michael@0:
michael@0: public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
michael@0:
michael@0: public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
michael@0:
michael@0: public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
michael@0:
michael@0: public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
michael@0:
michael@0: public static final int BOGUS_DOCTYPE = 31;
michael@0:
michael@0: public static final int COMMENT_START = 32;
michael@0:
michael@0: public static final int COMMENT_START_DASH = 33;
michael@0:
michael@0: public static final int COMMENT = 34;
michael@0:
michael@0: public static final int COMMENT_END_DASH = 35;
michael@0:
michael@0: public static final int COMMENT_END = 36;
michael@0:
michael@0: public static final int COMMENT_END_BANG = 37;
michael@0:
michael@0: public static final int NON_DATA_END_TAG_NAME = 38;
michael@0:
michael@0: public static final int MARKUP_DECLARATION_HYPHEN = 39;
michael@0:
michael@0: public static final int MARKUP_DECLARATION_OCTYPE = 40;
michael@0:
michael@0: public static final int DOCTYPE_UBLIC = 41;
michael@0:
michael@0: public static final int DOCTYPE_YSTEM = 42;
michael@0:
michael@0: public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
michael@0:
michael@0: public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
michael@0:
michael@0: public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
michael@0:
michael@0: public static final int CONSUME_CHARACTER_REFERENCE = 46;
michael@0:
michael@0: public static final int CONSUME_NCR = 47;
michael@0:
michael@0: public static final int CHARACTER_REFERENCE_TAIL = 48;
michael@0:
michael@0: public static final int HEX_NCR_LOOP = 49;
michael@0:
michael@0: public static final int DECIMAL_NRC_LOOP = 50;
michael@0:
michael@0: public static final int HANDLE_NCR_VALUE = 51;
michael@0:
michael@0: public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
michael@0:
michael@0: public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
michael@0:
michael@0: public static final int SELF_CLOSING_START_TAG = 54;
michael@0:
michael@0: public static final int CDATA_START = 55;
michael@0:
michael@0: public static final int CDATA_SECTION = 56;
michael@0:
michael@0: public static final int CDATA_RSQB = 57;
michael@0:
michael@0: public static final int CDATA_RSQB_RSQB = 58;
michael@0:
michael@0: public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
michael@0:
michael@0: public static final int SCRIPT_DATA_ESCAPE_START = 60;
michael@0:
michael@0: public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
michael@0:
michael@0: public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
michael@0:
michael@0: public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
michael@0:
michael@0: public static final int BOGUS_COMMENT_HYPHEN = 64;
michael@0:
michael@0: public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
michael@0:
michael@0: public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
michael@0:
michael@0: public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
michael@0:
michael@0: public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
michael@0:
michael@0: public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
michael@0:
michael@0: public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
michael@0:
michael@0: public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
michael@0:
michael@0: public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
michael@0:
michael@0: public static final int PROCESSING_INSTRUCTION = 73;
michael@0:
michael@0: public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
michael@0:
michael@0: /**
michael@0: * Magic value for UTF-16 operations.
michael@0: */
michael@0: private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
michael@0:
michael@0: /**
michael@0: * UTF-16 code unit array containing less than and greater than for emitting
michael@0: * those characters on certain parse errors.
michael@0: */
michael@0: private static final @NoLength char[] LT_GT = { '<', '>' };
michael@0:
michael@0: /**
michael@0: * UTF-16 code unit array containing less than and solidus for emitting
michael@0: * those characters on certain parse errors.
michael@0: */
michael@0: private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
michael@0:
michael@0: /**
michael@0: * UTF-16 code unit array containing ]] for emitting those characters on
michael@0: * state transitions.
michael@0: */
michael@0: private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
michael@0:
michael@0: /**
michael@0: * Array version of U+FFFD.
michael@0: */
michael@0: private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
michael@0:
michael@0: // [NOCPP[
michael@0:
michael@0: /**
michael@0: * Array version of space.
michael@0: */
michael@0: private static final @NoLength char[] SPACE = { ' ' };
michael@0:
michael@0: // ]NOCPP]
michael@0:
michael@0: /**
michael@0: * Array version of line feed.
michael@0: */
michael@0: private static final @NoLength char[] LF = { '\n' };
michael@0:
michael@0: /**
michael@0: * Buffer growth parameter.
michael@0: */
michael@0: private static final int BUFFER_GROW_BY = 1024;
michael@0:
michael@0: /**
michael@0: * "CDATA[" as char[]
michael@0: */
michael@0: private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T',
michael@0: 'A', '[' };
michael@0:
michael@0: /**
michael@0: * "octype" as char[]
michael@0: */
michael@0: private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p',
michael@0: 'e' };
michael@0:
michael@0: /**
michael@0: * "ublic" as char[]
michael@0: */
michael@0: private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' };
michael@0:
michael@0: /**
michael@0: * "ystem" as char[]
michael@0: */
michael@0: private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' };
michael@0:
michael@0: private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
michael@0:
michael@0: private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
michael@0:
michael@0: private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
michael@0:
michael@0: private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
michael@0: 'e', 'x', 't' };
michael@0:
michael@0: private static final char[] XMP_ARR = { 'x', 'm', 'p' };
michael@0:
michael@0: private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
michael@0: 'e', 'a' };
michael@0:
michael@0: private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
michael@0:
michael@0: private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
michael@0: 'd' };
michael@0:
michael@0: private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
michael@0: 'p', 't' };
michael@0:
michael@0: private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
michael@0: 'e', 's' };
michael@0:
michael@0: /**
michael@0: * The token handler.
michael@0: */
michael@0: protected final TokenHandler tokenHandler;
michael@0:
michael@0: protected EncodingDeclarationHandler encodingDeclarationHandler;
michael@0:
michael@0: // [NOCPP[
michael@0:
michael@0: /**
michael@0: * The error handler.
michael@0: */
michael@0: protected ErrorHandler errorHandler;
michael@0:
michael@0: // ]NOCPP]
michael@0:
michael@0: /**
michael@0: * Whether the previous char read was CR.
michael@0: */
michael@0: protected boolean lastCR;
michael@0:
michael@0: protected int stateSave;
michael@0:
michael@0: private int returnStateSave;
michael@0:
michael@0: protected int index;
michael@0:
michael@0: private boolean forceQuirks;
michael@0:
michael@0: private char additional;
michael@0:
michael@0: private int entCol;
michael@0:
michael@0: private int firstCharKey;
michael@0:
michael@0: private int lo;
michael@0:
michael@0: private int hi;
michael@0:
michael@0: private int candidate;
michael@0:
michael@0: private int strBufMark;
michael@0:
michael@0: private int prevValue;
michael@0:
michael@0: protected int value;
michael@0:
michael@0: private boolean seenDigits;
michael@0:
michael@0: protected int cstart;
michael@0:
/**
 * The SAX public id for the resource being tokenized. (Only passed back
 * as part of locator data.)
 */
michael@0: private String publicId;
michael@0:
/**
 * The SAX system id for the resource being tokenized. (Only passed back
 * as part of locator data.)
 */
michael@0: private String systemId;
michael@0:
michael@0: /**
michael@0: * Buffer for short identifiers.
michael@0: */
michael@0: private @Auto char[] strBuf;
michael@0:
/**
 * Number of significant <code>char</code>s in <code>strBuf</code>.
 */
michael@0: private int strBufLen;
michael@0:
/**
 * <code>-1</code> to indicate that <code>strBuf</code> is used or otherwise
 * an offset to the main buffer.
 */
michael@0: // private int strBufOffset = -1;
michael@0: /**
michael@0: * Buffer for long strings.
michael@0: */
michael@0: private @Auto char[] longStrBuf;
michael@0:
/**
 * Number of significant <code>char</code>s in <code>longStrBuf</code>.
 */
michael@0: private int longStrBufLen;
michael@0:
/**
 * <code>-1</code> to indicate that <code>longStrBuf</code> is used or
 * otherwise an offset to the main buffer.
 */
michael@0: // private int longStrBufOffset = -1;
michael@0:
michael@0: /**
michael@0: * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
michael@0: */
michael@0: private final @Auto char[] bmpChar;
michael@0:
michael@0: /**
michael@0: * Buffer for expanding astral NCRs.
michael@0: */
michael@0: private final @Auto char[] astralChar;
michael@0:
michael@0: /**
michael@0: * The element whose end tag closes the current CDATA or RCDATA element.
michael@0: */
michael@0: protected ElementName endTagExpectation = null;
michael@0:
michael@0: private char[] endTagExpectationAsArray; // not @Auto!
michael@0:
/**
 * <code>true</code> if tokenizing an end tag
 */
michael@0: protected boolean endTag;
michael@0:
michael@0: /**
michael@0: * The current tag token name.
michael@0: */
michael@0: private ElementName tagName = null;
michael@0:
michael@0: /**
michael@0: * The current attribute name.
michael@0: */
michael@0: protected AttributeName attributeName = null;
michael@0:
michael@0: // [NOCPP[
michael@0:
michael@0: /**
michael@0: * Whether comment tokens are emitted.
michael@0: */
michael@0: private boolean wantsComments = false;
michael@0:
/**
 * <code>true</code> when HTML4-specific additional errors are requested.
 */
michael@0: protected boolean html4;
michael@0:
michael@0: /**
michael@0: * Whether the stream is past the first 512 bytes.
michael@0: */
michael@0: private boolean metaBoundaryPassed;
michael@0:
michael@0: // ]NOCPP]
michael@0:
michael@0: /**
michael@0: * The name of the current doctype token.
michael@0: */
michael@0: private @Local String doctypeName;
michael@0:
michael@0: /**
michael@0: * The public id of the current doctype token.
michael@0: */
michael@0: private String publicIdentifier;
michael@0:
michael@0: /**
michael@0: * The system id of the current doctype token.
michael@0: */
michael@0: private String systemIdentifier;
michael@0:
michael@0: /**
michael@0: * The attribute holder.
michael@0: */
michael@0: private HtmlAttributes attributes;
michael@0:
michael@0: // [NOCPP[
michael@0:
michael@0: /**
michael@0: * The policy for vertical tab and form feed.
michael@0: */
michael@0: private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
michael@0:
michael@0: /**
michael@0: * The policy for comments.
michael@0: */
michael@0: private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
michael@0:
michael@0: private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
michael@0:
michael@0: private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
michael@0:
michael@0: private boolean html4ModeCompatibleWithXhtml1Schemata;
michael@0:
michael@0: private int mappingLangToXmlLang;
michael@0:
michael@0: // ]NOCPP]
michael@0:
michael@0: private final boolean newAttributesEachTime;
michael@0:
michael@0: private boolean shouldSuspend;
michael@0:
michael@0: protected boolean confident;
michael@0:
michael@0: private int line;
michael@0:
michael@0: private Interner interner;
michael@0:
michael@0: // CPPONLY: private boolean viewingXmlSource;
michael@0:
michael@0: // [NOCPP[
michael@0:
michael@0: protected LocatorImpl ampersandLocation;
michael@0:
/**
 * Creates a tokenizer that reports tokens to the given handler.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 * @param newAttributesEachTime
 *            when <code>true</code>, <code>emptyAttributes()</code> hands
 *            out a fresh <code>HtmlAttributes</code> on every call instead
 *            of the shared empty holder
 */
public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
    // Collaborators and configuration.
    this.newAttributesEachTime = newAttributesEachTime;
    this.tokenHandler = tokenHandler;
    this.encodingDeclarationHandler = null;
    // Scratch buffers for expanded numeric character references.
    this.astralChar = new char[2];
    this.bmpChar = new char[1];
    // No tag, attribute or doctype token is in progress yet.
    this.attributes = null;
    this.systemIdentifier = null;
    this.publicIdentifier = null;
    this.doctypeName = null;
    this.attributeName = null;
    this.tagName = null;
}
michael@0:
michael@0: // ]NOCPP]
michael@0:
/**
 * Creates a tokenizer that reports tokens to the given handler. In the Java
 * build this variant sets <code>newAttributesEachTime</code> to
 * <code>false</code>.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 */
public Tokenizer(TokenHandler tokenHandler
// CPPONLY: , boolean viewingXmlSource
) {
    this.tokenHandler = tokenHandler;
    this.encodingDeclarationHandler = null;
    // [NOCPP[
    this.newAttributesEachTime = false;
    // ]NOCPP]
    // Scratch buffers for expanded numeric character references.
    this.astralChar = new char[2];
    this.bmpChar = new char[1];
    // No tag, attribute or doctype token is in progress yet.
    this.systemIdentifier = null;
    this.publicIdentifier = null;
    this.doctypeName = null;
    this.attributeName = null;
    this.tagName = null;
    // [NOCPP[
    this.attributes = null;
    // ]NOCPP]
    // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
    // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
    // CPPONLY: this.viewingXmlSource = viewingXmlSource;
}
michael@0:
michael@0: public void setInterner(Interner interner) {
michael@0: this.interner = interner;
michael@0: }
michael@0:
michael@0: public void initLocation(String newPublicId, String newSystemId) {
michael@0: this.systemId = newSystemId;
michael@0: this.publicId = newPublicId;
michael@0:
michael@0: }
michael@0:
michael@0: // CPPONLY: boolean isViewingXmlSource() {
michael@0: // CPPONLY: return viewingXmlSource;
michael@0: // CPPONLY: }
michael@0:
michael@0: // [NOCPP[
michael@0:
michael@0: /**
michael@0: * Returns the mappingLangToXmlLang.
michael@0: *
michael@0: * @return the mappingLangToXmlLang
michael@0: */
michael@0: public boolean isMappingLangToXmlLang() {
michael@0: return mappingLangToXmlLang == AttributeName.HTML_LANG;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Sets the mappingLangToXmlLang.
michael@0: *
michael@0: * @param mappingLangToXmlLang
michael@0: * the mappingLangToXmlLang to set
michael@0: */
michael@0: public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
michael@0: this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
michael@0: : AttributeName.HTML;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Sets the error handler.
michael@0: *
michael@0: * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
michael@0: */
michael@0: public void setErrorHandler(ErrorHandler eh) {
michael@0: this.errorHandler = eh;
michael@0: }
michael@0:
michael@0: public ErrorHandler getErrorHandler() {
michael@0: return this.errorHandler;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Sets the commentPolicy.
michael@0: *
michael@0: * @param commentPolicy
michael@0: * the commentPolicy to set
michael@0: */
michael@0: public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
michael@0: this.commentPolicy = commentPolicy;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Sets the contentNonXmlCharPolicy.
michael@0: *
michael@0: * @param contentNonXmlCharPolicy
michael@0: * the contentNonXmlCharPolicy to set
michael@0: */
michael@0: public void setContentNonXmlCharPolicy(
michael@0: XmlViolationPolicy contentNonXmlCharPolicy) {
michael@0: if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
michael@0: throw new IllegalArgumentException(
michael@0: "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
michael@0: }
michael@0: }
michael@0:
michael@0: /**
michael@0: * Sets the contentSpacePolicy.
michael@0: *
michael@0: * @param contentSpacePolicy
michael@0: * the contentSpacePolicy to set
michael@0: */
michael@0: public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
michael@0: this.contentSpacePolicy = contentSpacePolicy;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Sets the xmlnsPolicy.
michael@0: *
michael@0: * @param xmlnsPolicy
michael@0: * the xmlnsPolicy to set
michael@0: */
michael@0: public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
michael@0: if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
michael@0: throw new IllegalArgumentException("Can't use FATAL here.");
michael@0: }
michael@0: this.xmlnsPolicy = xmlnsPolicy;
michael@0: }
michael@0:
michael@0: public void setNamePolicy(XmlViolationPolicy namePolicy) {
michael@0: this.namePolicy = namePolicy;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Sets the html4ModeCompatibleWithXhtml1Schemata.
michael@0: *
michael@0: * @param html4ModeCompatibleWithXhtml1Schemata
michael@0: * the html4ModeCompatibleWithXhtml1Schemata to set
michael@0: */
michael@0: public void setHtml4ModeCompatibleWithXhtml1Schemata(
michael@0: boolean html4ModeCompatibleWithXhtml1Schemata) {
michael@0: this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
michael@0: }
michael@0:
michael@0: // ]NOCPP]
michael@0:
michael@0: // For the token handler to call
michael@0: /**
michael@0: * Sets the tokenizer state and the associated element name. This should
michael@0: * only ever used to put the tokenizer into one of the states that have
michael@0: * a special end tag expectation.
michael@0: *
michael@0: * @param specialTokenizerState
michael@0: * the tokenizer state to set
michael@0: * @param endTagExpectation
michael@0: * the expected end tag for transitioning back to normal
michael@0: */
michael@0: public void setStateAndEndTagExpectation(int specialTokenizerState,
michael@0: @Local String endTagExpectation) {
michael@0: this.stateSave = specialTokenizerState;
michael@0: if (specialTokenizerState == Tokenizer.DATA) {
michael@0: return;
michael@0: }
michael@0: @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
michael@0: this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,
michael@0: asArray.length, interner);
michael@0: endTagExpectationToArray();
michael@0: }
michael@0:
michael@0: /**
michael@0: * Sets the tokenizer state and the associated element name. This should
michael@0: * only ever used to put the tokenizer into one of the states that have
michael@0: * a special end tag expectation.
michael@0: *
michael@0: * @param specialTokenizerState
michael@0: * the tokenizer state to set
michael@0: * @param endTagExpectation
michael@0: * the expected end tag for transitioning back to normal
michael@0: */
michael@0: public void setStateAndEndTagExpectation(int specialTokenizerState,
michael@0: ElementName endTagExpectation) {
michael@0: this.stateSave = specialTokenizerState;
michael@0: this.endTagExpectation = endTagExpectation;
michael@0: endTagExpectationToArray();
michael@0: }
michael@0:
michael@0: private void endTagExpectationToArray() {
michael@0: switch (endTagExpectation.getGroup()) {
michael@0: case TreeBuilder.TITLE:
michael@0: endTagExpectationAsArray = TITLE_ARR;
michael@0: return;
michael@0: case TreeBuilder.SCRIPT:
michael@0: endTagExpectationAsArray = SCRIPT_ARR;
michael@0: return;
michael@0: case TreeBuilder.STYLE:
michael@0: endTagExpectationAsArray = STYLE_ARR;
michael@0: return;
michael@0: case TreeBuilder.PLAINTEXT:
michael@0: endTagExpectationAsArray = PLAINTEXT_ARR;
michael@0: return;
michael@0: case TreeBuilder.XMP:
michael@0: endTagExpectationAsArray = XMP_ARR;
michael@0: return;
michael@0: case TreeBuilder.TEXTAREA:
michael@0: endTagExpectationAsArray = TEXTAREA_ARR;
michael@0: return;
michael@0: case TreeBuilder.IFRAME:
michael@0: endTagExpectationAsArray = IFRAME_ARR;
michael@0: return;
michael@0: case TreeBuilder.NOEMBED:
michael@0: endTagExpectationAsArray = NOEMBED_ARR;
michael@0: return;
michael@0: case TreeBuilder.NOSCRIPT:
michael@0: endTagExpectationAsArray = NOSCRIPT_ARR;
michael@0: return;
michael@0: case TreeBuilder.NOFRAMES:
michael@0: endTagExpectationAsArray = NOFRAMES_ARR;
michael@0: return;
michael@0: default:
michael@0: assert false: "Bad end tag expectation.";
michael@0: return;
michael@0: }
michael@0: }
michael@0:
michael@0: /**
michael@0: * For C++ use only.
michael@0: */
michael@0: public void setLineNumber(int line) {
michael@0: this.line = line;
michael@0: }
michael@0:
michael@0: // start Locator impl
michael@0:
michael@0: /**
michael@0: * @see org.xml.sax.Locator#getLineNumber()
michael@0: */
michael@0: @Inline public int getLineNumber() {
michael@0: return line;
michael@0: }
michael@0:
michael@0: // [NOCPP[
michael@0:
michael@0: /**
michael@0: * @see org.xml.sax.Locator#getColumnNumber()
michael@0: */
michael@0: @Inline public int getColumnNumber() {
michael@0: return -1;
michael@0: }
michael@0:
michael@0: /**
michael@0: * @see org.xml.sax.Locator#getPublicId()
michael@0: */
michael@0: public String getPublicId() {
michael@0: return publicId;
michael@0: }
michael@0:
michael@0: /**
michael@0: * @see org.xml.sax.Locator#getSystemId()
michael@0: */
michael@0: public String getSystemId() {
michael@0: return systemId;
michael@0: }
michael@0:
michael@0: // end Locator impl
michael@0:
michael@0: // end public API
michael@0:
michael@0: public void notifyAboutMetaBoundary() {
michael@0: metaBoundaryPassed = true;
michael@0: }
michael@0:
michael@0: void turnOnAdditionalHtml4Errors() {
michael@0: html4 = true;
michael@0: }
michael@0:
michael@0: // ]NOCPP]
michael@0:
michael@0: HtmlAttributes emptyAttributes() {
michael@0: // [NOCPP[
michael@0: if (newAttributesEachTime) {
michael@0: return new HtmlAttributes(mappingLangToXmlLang);
michael@0: } else {
michael@0: // ]NOCPP]
michael@0: return HtmlAttributes.EMPTY_ATTRIBUTES;
michael@0: // [NOCPP[
michael@0: }
michael@0: // ]NOCPP]
michael@0: }
michael@0:
michael@0: @Inline private void clearStrBufAndAppend(char c) {
michael@0: strBuf[0] = c;
michael@0: strBufLen = 1;
michael@0: }
michael@0:
michael@0: @Inline private void clearStrBuf() {
michael@0: strBufLen = 0;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Appends to the smaller buffer.
michael@0: *
michael@0: * @param c
michael@0: * the UTF-16 code unit to append
michael@0: */
michael@0: private void appendStrBuf(char c) {
michael@0: if (strBufLen == strBuf.length) {
michael@0: char[] newBuf = new char[strBuf.length + Tokenizer.BUFFER_GROW_BY];
michael@0: System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
michael@0: strBuf = newBuf;
michael@0: }
michael@0: strBuf[strBufLen++] = c;
michael@0: }
michael@0:
michael@0: /**
michael@0: * The smaller buffer as a String. Currently only used for error reporting.
michael@0: *
michael@0: *
michael@0: * C++ memory note: The return value must be released. michael@0: * michael@0: * @return the smaller buffer as a string michael@0: */ michael@0: protected String strBufToString() { michael@0: return Portability.newStringFromBuffer(strBuf, 0, strBufLen); michael@0: } michael@0: michael@0: /** michael@0: * Returns the short buffer as a local name. The return value is released in michael@0: * emitDoctypeToken(). michael@0: * michael@0: * @return the smaller buffer as local name michael@0: */ michael@0: private void strBufToDoctypeName() { michael@0: doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen, michael@0: interner); michael@0: } michael@0: michael@0: /** michael@0: * Emits the smaller buffer as character tokens. michael@0: * michael@0: * @throws SAXException michael@0: * if the token handler threw michael@0: */ michael@0: private void emitStrBuf() throws SAXException { michael@0: if (strBufLen > 0) { michael@0: tokenHandler.characters(strBuf, 0, strBufLen); michael@0: } michael@0: } michael@0: michael@0: @Inline private void clearLongStrBuf() { michael@0: longStrBufLen = 0; michael@0: } michael@0: michael@0: @Inline private void clearLongStrBufAndAppend(char c) { michael@0: longStrBuf[0] = c; michael@0: longStrBufLen = 1; michael@0: } michael@0: michael@0: /** michael@0: * Appends to the larger buffer. 
michael@0: * michael@0: * @param c michael@0: * the UTF-16 code unit to append michael@0: */ michael@0: private void appendLongStrBuf(char c) { michael@0: if (longStrBufLen == longStrBuf.length) { michael@0: char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)]; michael@0: System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length); michael@0: longStrBuf = newBuf; michael@0: } michael@0: longStrBuf[longStrBufLen++] = c; michael@0: } michael@0: michael@0: @Inline private void appendSecondHyphenToBogusComment() throws SAXException { michael@0: // [NOCPP[ michael@0: switch (commentPolicy) { michael@0: case ALTER_INFOSET: michael@0: // detachLongStrBuf(); michael@0: appendLongStrBuf(' '); michael@0: // FALLTHROUGH michael@0: case ALLOW: michael@0: warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); michael@0: // ]NOCPP] michael@0: appendLongStrBuf('-'); michael@0: // [NOCPP[ michael@0: break; michael@0: case FATAL: michael@0: fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); michael@0: break; michael@0: } michael@0: // ]NOCPP] michael@0: } michael@0: michael@0: // [NOCPP[ michael@0: private void maybeAppendSpaceToBogusComment() throws SAXException { michael@0: switch (commentPolicy) { michael@0: case ALTER_INFOSET: michael@0: // detachLongStrBuf(); michael@0: appendLongStrBuf(' '); michael@0: // FALLTHROUGH michael@0: case ALLOW: michael@0: warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment."); michael@0: break; michael@0: case FATAL: michael@0: fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment."); michael@0: break; michael@0: } michael@0: } michael@0: michael@0: // ]NOCPP] michael@0: michael@0: @Inline private void adjustDoubleHyphenAndAppendToLongStrBufAndErr(char c) michael@0: throws SAXException { michael@0: errConsecutiveHyphens(); michael@0: // [NOCPP[ michael@0: switch (commentPolicy) { michael@0: 
case ALTER_INFOSET: michael@0: // detachLongStrBuf(); michael@0: longStrBufLen--; michael@0: appendLongStrBuf(' '); michael@0: appendLongStrBuf('-'); michael@0: // FALLTHROUGH michael@0: case ALLOW: michael@0: warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); michael@0: // ]NOCPP] michael@0: appendLongStrBuf(c); michael@0: // [NOCPP[ michael@0: break; michael@0: case FATAL: michael@0: fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); michael@0: break; michael@0: } michael@0: // ]NOCPP] michael@0: } michael@0: michael@0: private void appendLongStrBuf(@NoLength char[] buffer, int offset, int length) { michael@0: int reqLen = longStrBufLen + length; michael@0: if (longStrBuf.length < reqLen) { michael@0: char[] newBuf = new char[reqLen + (reqLen >> 1)]; michael@0: System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length); michael@0: longStrBuf = newBuf; michael@0: } michael@0: System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length); michael@0: longStrBufLen = reqLen; michael@0: } michael@0: michael@0: /** michael@0: * Append the contents of the smaller buffer to the larger one. michael@0: */ michael@0: @Inline private void appendStrBufToLongStrBuf() { michael@0: appendLongStrBuf(strBuf, 0, strBufLen); michael@0: } michael@0: michael@0: /** michael@0: * The larger buffer as a string. michael@0: * michael@0: *
michael@0: * C++ memory note: The return value must be released.
michael@0: *
michael@0: * @return the larger buffer as a string
michael@0: */
michael@0: private String longStrBufToString() {
michael@0: return Portability.newStringFromBuffer(longStrBuf, 0, longStrBufLen);
michael@0: }
michael@0:
michael@0: /**
michael@0: * Emits the current comment token.
michael@0: *
michael@0: * @param pos
michael@0: * TODO
michael@0: *
michael@0: * @throws SAXException
michael@0: */
michael@0: private void emitComment(int provisionalHyphens, int pos)
michael@0: throws SAXException {
michael@0: // [NOCPP[
michael@0: if (wantsComments) {
michael@0: // ]NOCPP]
michael@0: // if (longStrBufOffset != -1) {
michael@0: // tokenHandler.comment(buf, longStrBufOffset, longStrBufLen
michael@0: // - provisionalHyphens);
michael@0: // } else {
michael@0: tokenHandler.comment(longStrBuf, 0, longStrBufLen
michael@0: - provisionalHyphens);
michael@0: // }
michael@0: // [NOCPP[
michael@0: }
michael@0: // ]NOCPP]
michael@0: cstart = pos + 1;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Flushes coalesced character tokens.
michael@0: *
michael@0: * @param buf
michael@0: * TODO
michael@0: * @param pos
michael@0: * TODO
michael@0: *
michael@0: * @throws SAXException
michael@0: */
michael@0: protected void flushChars(@NoLength char[] buf, int pos)
michael@0: throws SAXException {
michael@0: if (pos > cstart) {
michael@0: tokenHandler.characters(buf, cstart, pos - cstart);
michael@0: }
michael@0: cstart = Integer.MAX_VALUE;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Reports an condition that would make the infoset incompatible with XML
michael@0: * 1.0 as fatal.
michael@0: *
michael@0: * @param message
michael@0: * the message
michael@0: * @throws SAXException
michael@0: * @throws SAXParseException
michael@0: */
michael@0: public void fatal(String message) throws SAXException {
michael@0: SAXParseException spe = new SAXParseException(message, this);
michael@0: if (errorHandler != null) {
michael@0: errorHandler.fatalError(spe);
michael@0: }
michael@0: throw spe;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Reports a Parse Error.
michael@0: *
michael@0: * @param message
michael@0: * the message
michael@0: * @throws SAXException
michael@0: */
michael@0: public void err(String message) throws SAXException {
michael@0: if (errorHandler == null) {
michael@0: return;
michael@0: }
michael@0: SAXParseException spe = new SAXParseException(message, this);
michael@0: errorHandler.error(spe);
michael@0: }
michael@0:
michael@0: public void errTreeBuilder(String message) throws SAXException {
michael@0: ErrorHandler eh = null;
michael@0: if (tokenHandler instanceof TreeBuilder>) {
michael@0: TreeBuilder> treeBuilder = (TreeBuilder>) tokenHandler;
michael@0: eh = treeBuilder.getErrorHandler();
michael@0: }
michael@0: if (eh == null) {
michael@0: eh = errorHandler;
michael@0: }
michael@0: if (eh == null) {
michael@0: return;
michael@0: }
michael@0: SAXParseException spe = new SAXParseException(message, this);
michael@0: eh.error(spe);
michael@0: }
michael@0:
michael@0: /**
michael@0: * Reports a warning
michael@0: *
michael@0: * @param message
michael@0: * the message
michael@0: * @throws SAXException
michael@0: */
michael@0: public void warn(String message) throws SAXException {
michael@0: if (errorHandler == null) {
michael@0: return;
michael@0: }
michael@0: SAXParseException spe = new SAXParseException(message, this);
michael@0: errorHandler.warning(spe);
michael@0: }
michael@0:
michael@0: private void strBufToElementNameString() {
michael@0: // if (strBufOffset != -1) {
michael@0: // return ElementName.elementNameByBuffer(buf, strBufOffset, strBufLen);
michael@0: // } else {
michael@0: tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
michael@0: interner);
michael@0: // }
michael@0: }
michael@0:
michael@0: private int emitCurrentTagToken(boolean selfClosing, int pos)
michael@0: throws SAXException {
michael@0: cstart = pos + 1;
michael@0: maybeErrSlashInEndTag(selfClosing);
michael@0: stateSave = Tokenizer.DATA;
michael@0: HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
michael@0: : attributes);
michael@0: if (endTag) {
michael@0: /*
michael@0: * When an end tag token is emitted, the content model flag must be
michael@0: * switched to the PCDATA state.
michael@0: */
michael@0: maybeErrAttributesOnEndTag(attrs);
michael@0: // CPPONLY: if (!viewingXmlSource) {
michael@0: tokenHandler.endTag(tagName);
michael@0: // CPPONLY: }
michael@0: // CPPONLY: if (newAttributesEachTime) {
michael@0: // CPPONLY: Portability.delete(attributes);
michael@0: // CPPONLY: attributes = null;
michael@0: // CPPONLY: }
michael@0: } else {
michael@0: // CPPONLY: if (viewingXmlSource) {
michael@0: // CPPONLY: assert newAttributesEachTime;
michael@0: // CPPONLY: Portability.delete(attributes);
michael@0: // CPPONLY: attributes = null;
michael@0: // CPPONLY: } else {
michael@0: tokenHandler.startTag(tagName, attrs, selfClosing);
michael@0: // CPPONLY: }
michael@0: }
michael@0: tagName.release();
michael@0: tagName = null;
michael@0: if (newAttributesEachTime) {
michael@0: attributes = null;
michael@0: } else {
michael@0: attributes.clear(mappingLangToXmlLang);
michael@0: }
michael@0: /*
michael@0: * The token handler may have called setStateAndEndTagExpectation
michael@0: * and changed stateSave since the start of this method.
michael@0: */
michael@0: return stateSave;
michael@0: }
michael@0:
michael@0: private void attributeNameComplete() throws SAXException {
michael@0: // if (strBufOffset != -1) {
michael@0: // attributeName = AttributeName.nameByBuffer(buf, strBufOffset,
michael@0: // strBufLen, namePolicy != XmlViolationPolicy.ALLOW);
michael@0: // } else {
michael@0: attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
michael@0: // [NOCPP[
michael@0: , namePolicy != XmlViolationPolicy.ALLOW
michael@0: // ]NOCPP]
michael@0: , interner);
michael@0: // }
michael@0:
michael@0: if (attributes == null) {
michael@0: attributes = new HtmlAttributes(mappingLangToXmlLang);
michael@0: }
michael@0:
michael@0: /*
michael@0: * When the user agent leaves the attribute name state (and before
michael@0: * emitting the tag token, if appropriate), the complete attribute's
michael@0: * name must be compared to the other attributes on the same token; if
michael@0: * there is already an attribute on the token with the exact same name,
michael@0: * then this is a parse error and the new attribute must be dropped,
michael@0: * along with the value that gets associated with it (if any).
michael@0: */
michael@0: if (attributes.contains(attributeName)) {
michael@0: errDuplicateAttribute();
michael@0: attributeName.release();
michael@0: attributeName = null;
michael@0: }
michael@0: }
michael@0:
michael@0: private void addAttributeWithoutValue() throws SAXException {
michael@0: noteAttributeWithoutValue();
michael@0:
michael@0: // [NOCPP[
michael@0: if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
michael@0: && ElementName.META == tagName) {
michael@0: err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
michael@0: }
michael@0: // ]NOCPP]
michael@0: if (attributeName != null) {
michael@0: // [NOCPP[
michael@0: if (html4) {
michael@0: if (attributeName.isBoolean()) {
michael@0: if (html4ModeCompatibleWithXhtml1Schemata) {
michael@0: attributes.addAttribute(attributeName,
michael@0: attributeName.getLocal(AttributeName.HTML),
michael@0: xmlnsPolicy);
michael@0: } else {
michael@0: attributes.addAttribute(attributeName, "", xmlnsPolicy);
michael@0: }
michael@0: } else {
michael@0: if (AttributeName.BORDER != attributeName) {
michael@0: err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
michael@0: attributes.addAttribute(attributeName, "", xmlnsPolicy);
michael@0: }
michael@0: }
michael@0: } else {
michael@0: if (AttributeName.SRC == attributeName
michael@0: || AttributeName.HREF == attributeName) {
michael@0: warn("Attribute \u201C"
michael@0: + attributeName.getLocal(AttributeName.HTML)
michael@0: + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
michael@0: }
michael@0: // ]NOCPP]
michael@0: attributes.addAttribute(attributeName,
michael@0: Portability.newEmptyString()
michael@0: // [NOCPP[
michael@0: , xmlnsPolicy
michael@0: // ]NOCPP]
michael@0: );
michael@0: // [NOCPP[
michael@0: }
michael@0: // ]NOCPP]
michael@0: attributeName = null; // attributeName has been adopted by the
michael@0: // |attributes| object
michael@0: }
michael@0: }
michael@0:
michael@0: private void addAttributeWithValue() throws SAXException {
michael@0: // [NOCPP[
michael@0: if (metaBoundaryPassed && ElementName.META == tagName
michael@0: && AttributeName.CHARSET == attributeName) {
michael@0: err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
michael@0: }
michael@0: // ]NOCPP]
michael@0: if (attributeName != null) {
michael@0: String val = longStrBufToString(); // Ownership transferred to
michael@0: // HtmlAttributes
michael@0: // CPPONLY: if (mViewSource) {
michael@0: // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
michael@0: // CPPONLY: }
michael@0: // [NOCPP[
michael@0: if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
michael@0: && attributeName.isCaseFolded()) {
michael@0: val = newAsciiLowerCaseStringFromString(val);
michael@0: }
michael@0: // ]NOCPP]
michael@0: attributes.addAttribute(attributeName, val
michael@0: // [NOCPP[
michael@0: , xmlnsPolicy
michael@0: // ]NOCPP]
michael@0: );
michael@0: attributeName = null; // attributeName has been adopted by the
michael@0: // |attributes| object
michael@0: }
michael@0: }
michael@0:
michael@0: // [NOCPP[
michael@0:
michael@0: private static String newAsciiLowerCaseStringFromString(String str) {
michael@0: if (str == null) {
michael@0: return null;
michael@0: }
michael@0: char[] buf = new char[str.length()];
michael@0: for (int i = 0; i < str.length(); i++) {
michael@0: char c = str.charAt(i);
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: c += 0x20;
michael@0: }
michael@0: buf[i] = c;
michael@0: }
michael@0: return new String(buf);
michael@0: }
michael@0:
michael@0: protected void startErrorReporting() throws SAXException {
michael@0:
michael@0: }
michael@0:
michael@0: // ]NOCPP]
michael@0:
michael@0: public void start() throws SAXException {
michael@0: initializeWithoutStarting();
michael@0: tokenHandler.startTokenization(this);
michael@0: // [NOCPP[
michael@0: startErrorReporting();
michael@0: // ]NOCPP]
michael@0: }
michael@0:
michael@0: public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
michael@0: int state = stateSave;
michael@0: int returnState = returnStateSave;
michael@0: char c = '\u0000';
michael@0: shouldSuspend = false;
michael@0: lastCR = false;
michael@0:
michael@0: int start = buffer.getStart();
michael@0: /**
michael@0: * The index of the last char
read from buf
.
michael@0: */
michael@0: int pos = start - 1;
michael@0:
michael@0: /**
michael@0: * The index of the first char
in buf
that is
michael@0: * part of a coalesced run of character tokens or
michael@0: * Integer.MAX_VALUE
if there is not a current run being
michael@0: * coalesced.
michael@0: */
michael@0: switch (state) {
michael@0: case DATA:
michael@0: case RCDATA:
michael@0: case SCRIPT_DATA:
michael@0: case PLAINTEXT:
michael@0: case RAWTEXT:
michael@0: case CDATA_SECTION:
michael@0: case SCRIPT_DATA_ESCAPED:
michael@0: case SCRIPT_DATA_ESCAPE_START:
michael@0: case SCRIPT_DATA_ESCAPE_START_DASH:
michael@0: case SCRIPT_DATA_ESCAPED_DASH:
michael@0: case SCRIPT_DATA_ESCAPED_DASH_DASH:
michael@0: case SCRIPT_DATA_DOUBLE_ESCAPE_START:
michael@0: case SCRIPT_DATA_DOUBLE_ESCAPED:
michael@0: case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
michael@0: case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
michael@0: case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
michael@0: case SCRIPT_DATA_DOUBLE_ESCAPE_END:
michael@0: cstart = start;
michael@0: break;
michael@0: default:
michael@0: cstart = Integer.MAX_VALUE;
michael@0: break;
michael@0: }
michael@0:
michael@0: /**
michael@0: * The number of char
s in buf
that have
michael@0: * meaning. (The rest of the array is garbage and should not be
michael@0: * examined.)
michael@0: */
michael@0: // CPPONLY: if (mViewSource) {
michael@0: // CPPONLY: mViewSource.SetBuffer(buffer);
michael@0: // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
michael@0: // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
michael@0: // CPPONLY: } else {
michael@0: // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
michael@0: // CPPONLY: }
michael@0: // [NOCPP[
michael@0: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
michael@0: buffer.getEnd());
michael@0: // ]NOCPP]
michael@0: if (pos == buffer.getEnd()) {
michael@0: // exiting due to end of buffer
michael@0: buffer.setStart(pos);
michael@0: } else {
michael@0: buffer.setStart(pos + 1);
michael@0: }
michael@0: return lastCR;
michael@0: }
michael@0:
michael@0: @SuppressWarnings("unused") private int stateLoop(int state, char c,
michael@0: int pos, @NoLength char[] buf, boolean reconsume, int returnState,
michael@0: int endPos) throws SAXException {
michael@0: /*
michael@0: * Idioms used in this code:
michael@0: *
michael@0: *
michael@0: * Consuming the next input character
michael@0: *
michael@0: * To consume the next input character, the code does this: if (++pos ==
michael@0: * endPos) { break stateloop; } c = checkChar(buf, pos);
michael@0: *
michael@0: *
michael@0: * Staying in a state
michael@0: *
michael@0: * When there's a state that the tokenizer may stay in over multiple
michael@0: * input characters, the state has a wrapper |for(;;)| loop and staying
michael@0: * in the state continues the loop.
michael@0: *
michael@0: *
michael@0: * Switching to another state
michael@0: *
michael@0: * To switch to another state, the code sets the state variable to the
michael@0: * magic number of the new state. Then it either continues stateloop or
michael@0: * breaks out of the state's own wrapper loop if the target state is
michael@0: * right after the current state in source order. (This is a partial
michael@0: * workaround for Java's lack of goto.)
michael@0: *
michael@0: *
michael@0: * Reconsume support
michael@0: *
michael@0: * The spec sometimes says that an input character is reconsumed in
michael@0: * another state. If a state can ever be entered so that an input
michael@0: * character can be reconsumed in it, the state's code starts with an
michael@0: * |if (reconsume)| that sets reconsume to false and skips over the
michael@0: * normal code for consuming a new character.
michael@0: *
michael@0: * To reconsume the current character in another state, the code sets
michael@0: * |reconsume| to true and then switches to the other state.
michael@0: *
michael@0: *
michael@0: * Emitting character tokens
michael@0: *
michael@0: * This method emits character tokens lazily. Whenever a new range of
michael@0: * character tokens starts, the field cstart must be set to the start
michael@0: * index of the range. The flushChars() method must be called at the end
michael@0: * of a range to flush it.
michael@0: *
michael@0: *
michael@0: * U+0000 handling
michael@0: *
michael@0: * The various states have to handle the replacement of U+0000 with
michael@0: * U+FFFD. However, if U+0000 would be reconsumed in another state, the
michael@0: * replacement doesn't need to happen, because it's handled by the
michael@0: * reconsuming state.
michael@0: *
michael@0: *
michael@0: * LF handling
michael@0: *
michael@0: * Every state needs to increment the line number upon LF unless the LF
michael@0: * gets reconsumed by another state which increments the line number.
michael@0: *
michael@0: *
michael@0: * CR handling
michael@0: *
michael@0: * Every state needs to handle CR unless the CR gets reconsumed and is
michael@0: * handled by the reconsuming state. The CR needs to be handled as if it
michael@0: * were an LF, the lastCR field must be set to true and then this
michael@0: * method must return. The IO driver will then swallow the next
michael@0: * character if it is an LF to coalesce CRLF.
michael@0: */
michael@0: stateloop: for (;;) {
michael@0: switch (state) {
michael@0: case DATA:
michael@0: dataloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: switch (c) {
michael@0: case '&':
michael@0: /*
michael@0: * U+0026 AMPERSAND (&) Switch to the character
michael@0: * reference in data state.
michael@0: */
michael@0: flushChars(buf, pos);
michael@0: clearStrBufAndAppend(c);
michael@0: setAdditionalAndRememberAmpersandLocation('\u0000');
michael@0: returnState = state;
michael@0: state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '<':
michael@0: /*
michael@0: * U+003C LESS-THAN SIGN (<) Switch to the tag
michael@0: * open state.
michael@0: */
michael@0: flushChars(buf, pos);
michael@0:
michael@0: state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
michael@0: break dataloop; // FALL THROUGH continue
michael@0: // stateloop;
michael@0: case '\u0000':
michael@0: emitReplacementCharacter(buf, pos);
michael@0: continue;
michael@0: case '\r':
michael@0: emitCarriageReturn(buf, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: default:
michael@0: /*
michael@0: * Anything else Emit the input character as a
michael@0: * character token.
michael@0: *
michael@0: * Stay in the data state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case TAG_OPEN:
michael@0: tagopenloop: for (;;) {
michael@0: /*
michael@0: * The behavior of this state depends on the content
michael@0: * model flag.
michael@0: */
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * If the content model flag is set to the PCDATA state
michael@0: * Consume the next input character:
michael@0: */
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: /*
michael@0: * U+0041 LATIN CAPITAL LETTER A through to U+005A
michael@0: * LATIN CAPITAL LETTER Z Create a new start tag
michael@0: * token,
michael@0: */
michael@0: endTag = false;
michael@0: /*
michael@0: * set its tag name to the lowercase version of the
michael@0: * input character (add 0x0020 to the character's
michael@0: * code point),
michael@0: */
michael@0: clearStrBufAndAppend((char) (c + 0x20));
michael@0: /* then switch to the tag name state. */
michael@0: state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
michael@0: /*
michael@0: * (Don't emit the token yet; further details will
michael@0: * be filled in before it is emitted.)
michael@0: */
michael@0: break tagopenloop;
michael@0: // continue stateloop;
michael@0: } else if (c >= 'a' && c <= 'z') {
michael@0: /*
michael@0: * U+0061 LATIN SMALL LETTER A through to U+007A
michael@0: * LATIN SMALL LETTER Z Create a new start tag
michael@0: * token,
michael@0: */
michael@0: endTag = false;
michael@0: /*
michael@0: * set its tag name to the input character,
michael@0: */
michael@0: clearStrBufAndAppend(c);
michael@0: /* then switch to the tag name state. */
michael@0: state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
michael@0: /*
michael@0: * (Don't emit the token yet; further details will
michael@0: * be filled in before it is emitted.)
michael@0: */
michael@0: break tagopenloop;
michael@0: // continue stateloop;
michael@0: }
michael@0: switch (c) {
michael@0: case '!':
michael@0: /*
michael@0: * U+0021 EXCLAMATION MARK (!) Switch to the
michael@0: * markup declaration open state.
michael@0: */
michael@0: state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '/':
michael@0: /*
michael@0: * U+002F SOLIDUS (/) Switch to the close tag
michael@0: * open state.
michael@0: */
michael@0: state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '?':
michael@0: // CPPONLY: if (viewingXmlSource) {
michael@0: // CPPONLY: state = transition(state,
michael@0: // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION,
michael@0: // CPPONLY: reconsume,
michael@0: // CPPONLY: pos);
michael@0: // CPPONLY: continue stateloop;
michael@0: // CPPONLY: }
michael@0: /*
michael@0: * U+003F QUESTION MARK (?) Parse error.
michael@0: */
michael@0: errProcessingInstruction();
michael@0: /*
michael@0: * Switch to the bogus comment state.
michael@0: */
michael@0: clearLongStrBufAndAppend(c);
michael@0: state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Parse error.
michael@0: */
michael@0: errLtGt();
michael@0: /*
michael@0: * Emit a U+003C LESS-THAN SIGN character token
michael@0: * and a U+003E GREATER-THAN SIGN character
michael@0: * token.
michael@0: */
michael@0: tokenHandler.characters(Tokenizer.LT_GT, 0, 2);
michael@0: /* Switch to the data state. */
michael@0: cstart = pos + 1;
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: default:
michael@0: /*
michael@0: * Anything else Parse error.
michael@0: */
michael@0: errBadCharAfterLt(c);
michael@0: /*
michael@0: * Emit a U+003C LESS-THAN SIGN character token
michael@0: */
michael@0: tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0: /*
michael@0: * and reconsume the current input character in
michael@0: * the data state.
michael@0: */
michael@0: cstart = pos;
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALL THROUGH DON'T REORDER
michael@0: case TAG_NAME:
michael@0: tagnameloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: strBufToElementNameString();
michael@0: state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0: * Switch to the before attribute name state.
michael@0: */
michael@0: strBufToElementNameString();
michael@0: state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0: break tagnameloop;
michael@0: // continue stateloop;
michael@0: case '/':
michael@0: /*
michael@0: * U+002F SOLIDUS (/) Switch to the self-closing
michael@0: * start tag state.
michael@0: */
michael@0: strBufToElementNameString();
michael@0: state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0: * tag token.
michael@0: */
michael@0: strBufToElementNameString();
michael@0: state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0: if (shouldSuspend) {
michael@0: break stateloop;
michael@0: }
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: /*
michael@0: * U+0041 LATIN CAPITAL LETTER A through to
michael@0: * U+005A LATIN CAPITAL LETTER Z Append the
michael@0: * lowercase version of the current input
michael@0: * character (add 0x0020 to the character's
michael@0: * code point) to the current tag token's
michael@0: * tag name.
michael@0: */
michael@0: c += 0x20;
michael@0: }
michael@0: /*
michael@0: * Anything else Append the current input
michael@0: * character to the current tag token's tag
michael@0: * name.
michael@0: */
michael@0: appendStrBuf(c);
michael@0: /*
michael@0: * Stay in the tag name state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case BEFORE_ATTRIBUTE_NAME:
michael@0: beforeattributenameloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0: * in the before attribute name state.
michael@0: */
michael@0: continue;
michael@0: case '/':
michael@0: /*
michael@0: * U+002F SOLIDUS (/) Switch to the self-closing
michael@0: * start tag state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0: * tag token.
michael@0: */
michael@0: state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0: if (shouldSuspend) {
michael@0: break stateloop;
michael@0: }
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: case '\"':
michael@0: case '\'':
michael@0: case '<':
michael@0: case '=':
michael@0: /*
michael@0: * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
michael@0: * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
michael@0: * SIGN (=) Parse error.
michael@0: */
michael@0: errBadCharBeforeAttributeNameOrNull(c);
michael@0: /*
michael@0: * Treat it as per the "anything else" entry
michael@0: * below.
michael@0: */
michael@0: default:
michael@0: /*
michael@0: * Anything else Start a new attribute in the
michael@0: * current tag token.
michael@0: */
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: /*
michael@0: * U+0041 LATIN CAPITAL LETTER A through to
michael@0: * U+005A LATIN CAPITAL LETTER Z Set that
michael@0: * attribute's name to the lowercase version
michael@0: * of the current input character (add
michael@0: * 0x0020 to the character's code point)
michael@0: */
michael@0: c += 0x20;
michael@0: }
michael@0: /*
michael@0: * Set that attribute's name to the current
michael@0: * input character,
michael@0: */
michael@0: clearStrBufAndAppend(c);
michael@0: /*
michael@0: * and its value to the empty string.
michael@0: */
michael@0: // Will do later.
michael@0: /*
michael@0: * Switch to the attribute name state.
michael@0: */
michael@0: state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
michael@0: break beforeattributenameloop;
michael@0: // continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case ATTRIBUTE_NAME:
michael@0: attributenameloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: attributeNameComplete();
michael@0: state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0: * Switch to the after attribute name state.
michael@0: */
michael@0: attributeNameComplete();
michael@0: state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '/':
michael@0: /*
michael@0: * U+002F SOLIDUS (/) Switch to the self-closing
michael@0: * start tag state.
michael@0: */
michael@0: attributeNameComplete();
michael@0: addAttributeWithoutValue();
michael@0: state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '=':
michael@0: /*
michael@0: * U+003D EQUALS SIGN (=) Switch to the before
michael@0: * attribute value state.
michael@0: */
michael@0: attributeNameComplete();
michael@0: state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
michael@0: break attributenameloop;
michael@0: // continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0: * tag token.
michael@0: */
michael@0: attributeNameComplete();
michael@0: addAttributeWithoutValue();
michael@0: state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0: if (shouldSuspend) {
michael@0: break stateloop;
michael@0: }
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: case '\"':
michael@0: case '\'':
michael@0: case '<':
michael@0: /*
michael@0: * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
michael@0: * (') U+003C LESS-THAN SIGN (<) Parse error.
michael@0: */
michael@0: errQuoteOrLtInAttributeNameOrNull(c);
michael@0: /*
michael@0: * Treat it as per the "anything else" entry
michael@0: * below.
michael@0: */
michael@0: default:
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: /*
michael@0: * U+0041 LATIN CAPITAL LETTER A through to
michael@0: * U+005A LATIN CAPITAL LETTER Z Append the
michael@0: * lowercase version of the current input
michael@0: * character (add 0x0020 to the character's
michael@0: * code point) to the current attribute's
michael@0: * name.
michael@0: */
michael@0: c += 0x20;
michael@0: }
michael@0: /*
michael@0: * Anything else Append the current input
michael@0: * character to the current attribute's name.
michael@0: */
michael@0: appendStrBuf(c);
michael@0: /*
michael@0: * Stay in the attribute name state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case BEFORE_ATTRIBUTE_VALUE:
michael@0: beforeattributevalueloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0: * in the before attribute value state.
michael@0: */
michael@0: continue;
michael@0: case '"':
michael@0: /*
michael@0: * U+0022 QUOTATION MARK (") Switch to the
michael@0: * attribute value (double-quoted) state.
michael@0: */
michael@0: clearLongStrBuf();
michael@0: state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
michael@0: break beforeattributevalueloop;
michael@0: // continue stateloop;
michael@0: case '&':
michael@0: /*
michael@0: * U+0026 AMPERSAND (&) Switch to the attribute
michael@0: * value (unquoted) state and reconsume this
michael@0: * input character.
michael@0: */
michael@0: clearLongStrBuf();
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
michael@0: noteUnquotedAttributeValue();
michael@0: continue stateloop;
michael@0: case '\'':
michael@0: /*
michael@0: * U+0027 APOSTROPHE (') Switch to the attribute
michael@0: * value (single-quoted) state.
michael@0: */
michael@0: clearLongStrBuf();
michael@0: state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Parse error.
michael@0: */
michael@0: errAttributeValueMissing();
michael@0: /*
michael@0: * Emit the current tag token.
michael@0: */
michael@0: addAttributeWithoutValue();
michael@0: state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0: if (shouldSuspend) {
michael@0: break stateloop;
michael@0: }
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: case '<':
michael@0: case '=':
michael@0: case '`':
michael@0: /*
michael@0: * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
michael@0: * (=) U+0060 GRAVE ACCENT (`)
michael@0: */
michael@0: errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
michael@0: /*
michael@0: * Treat it as per the "anything else" entry
michael@0: * below.
michael@0: */
michael@0: default:
michael@0: // [NOCPP[
michael@0: errHtml4NonNameInUnquotedAttribute(c);
michael@0: // ]NOCPP]
michael@0: /*
michael@0: * Anything else Append the current input
michael@0: * character to the current attribute's value.
michael@0: */
michael@0: clearLongStrBufAndAppend(c);
michael@0: /*
michael@0: * Switch to the attribute value (unquoted)
michael@0: * state.
michael@0: */
michael@0:
michael@0: state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
michael@0: noteUnquotedAttributeValue();
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
michael@0: attributevaluedoublequotedloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '"':
michael@0: /*
michael@0: * U+0022 QUOTATION MARK (") Switch to the after
michael@0: * attribute value (quoted) state.
michael@0: */
michael@0: addAttributeWithValue();
michael@0:
michael@0: state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
michael@0: break attributevaluedoublequotedloop;
michael@0: // continue stateloop;
michael@0: case '&':
michael@0: /*
michael@0: * U+0026 AMPERSAND (&) Switch to the character
michael@0: * reference in attribute value state, with the
michael@0: * additional allowed character being U+0022
michael@0: * QUOTATION MARK (").
michael@0: */
michael@0: clearStrBufAndAppend(c);
michael@0: setAdditionalAndRememberAmpersandLocation('\"');
michael@0: returnState = state;
michael@0: state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\r':
michael@0: appendLongStrBufCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: appendLongStrBufLineFeed();
michael@0: continue;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: /*
michael@0: * Anything else Append the current input
michael@0: * character to the current attribute's value.
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: /*
michael@0: * Stay in the attribute value (double-quoted)
michael@0: * state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case AFTER_ATTRIBUTE_VALUE_QUOTED:
michael@0: afterattributevaluequotedloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0: * Switch to the before attribute name state.
michael@0: */
michael@0: state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '/':
michael@0: /*
michael@0: * U+002F SOLIDUS (/) Switch to the self-closing
michael@0: * start tag state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
michael@0: break afterattributevaluequotedloop;
michael@0: // continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0: * tag token.
michael@0: */
michael@0: state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0: if (shouldSuspend) {
michael@0: break stateloop;
michael@0: }
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: continue stateloop;
michael@0: default:
michael@0: /*
michael@0: * Anything else Parse error.
michael@0: */
michael@0: errNoSpaceBetweenAttributes();
michael@0: /*
michael@0: * Reconsume the character in the before
michael@0: * attribute name state.
michael@0: */
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case SELF_CLOSING_START_TAG:
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Set the self-closing
michael@0: * flag of the current tag token. Emit the current
michael@0: * tag token.
michael@0: */
michael@0: // [NOCPP[
michael@0: errHtml4XmlVoidSyntax();
michael@0: // ]NOCPP]
michael@0: state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);
michael@0: if (shouldSuspend) {
michael@0: break stateloop;
michael@0: }
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: continue stateloop;
michael@0: default:
michael@0: /* Anything else Parse error. */
michael@0: errSlashNotFollowedByGt();
michael@0: /*
michael@0: * Reconsume the character in the before attribute
michael@0: * name state.
michael@0: */
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: // XXX reorder point
michael@0: case ATTRIBUTE_VALUE_UNQUOTED:
michael@0: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: addAttributeWithValue();
michael@0: state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0: * Switch to the before attribute name state.
michael@0: */
michael@0: addAttributeWithValue();
michael@0: state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '&':
michael@0: /*
michael@0: * U+0026 AMPERSAND (&) Switch to the character
michael@0: * reference in attribute value state, with the
michael@0: * additional allowed character being U+003E
michael@0: * GREATER-THAN SIGN (>)
michael@0: */
michael@0: clearStrBufAndAppend(c);
michael@0: setAdditionalAndRememberAmpersandLocation('>');
michael@0: returnState = state;
michael@0: state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0: * tag token.
michael@0: */
michael@0: addAttributeWithValue();
michael@0: state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0: if (shouldSuspend) {
michael@0: break stateloop;
michael@0: }
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: case '<':
michael@0: case '\"':
michael@0: case '\'':
michael@0: case '=':
michael@0: case '`':
michael@0: /*
michael@0: * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
michael@0: * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
michael@0: * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
michael@0: */
michael@0: errUnquotedAttributeValOrNull(c);
michael@0: /*
michael@0: * Treat it as per the "anything else" entry
michael@0: * below.
michael@0: */
michael@0: // fall through
michael@0: default:
michael@0: // [NOCPP[
michael@0: errHtml4NonNameInUnquotedAttribute(c);
michael@0: // ]NOCPP]
michael@0: /*
michael@0: * Anything else Append the current input
michael@0: * character to the current attribute's value.
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: /*
michael@0: * Stay in the attribute value (unquoted) state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case AFTER_ATTRIBUTE_NAME:
michael@0: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0: * in the after attribute name state.
michael@0: */
michael@0: continue;
michael@0: case '/':
michael@0: /*
michael@0: * U+002F SOLIDUS (/) Switch to the self-closing
michael@0: * start tag state.
michael@0: */
michael@0: addAttributeWithoutValue();
michael@0: state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '=':
michael@0: /*
michael@0: * U+003D EQUALS SIGN (=) Switch to the before
michael@0: * attribute value state.
michael@0: */
michael@0: state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0: * tag token.
michael@0: */
michael@0: addAttributeWithoutValue();
michael@0: state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0: if (shouldSuspend) {
michael@0: break stateloop;
michael@0: }
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: case '\"':
michael@0: case '\'':
michael@0: case '<':
michael@0: errQuoteOrLtInAttributeNameOrNull(c);
michael@0: /*
michael@0: * Treat it as per the "anything else" entry
michael@0: * below.
michael@0: */
michael@0: default:
michael@0: addAttributeWithoutValue();
michael@0: /*
michael@0: * Anything else Start a new attribute in the
michael@0: * current tag token.
michael@0: */
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: /*
michael@0: * U+0041 LATIN CAPITAL LETTER A through to
michael@0: * U+005A LATIN CAPITAL LETTER Z Set that
michael@0: * attribute's name to the lowercase version
michael@0: * of the current input character (add
michael@0: * 0x0020 to the character's code point)
michael@0: */
michael@0: c += 0x20;
michael@0: }
michael@0: /*
michael@0: * Set that attribute's name to the current
michael@0: * input character,
michael@0: */
michael@0: clearStrBufAndAppend(c);
michael@0: /*
michael@0: * and its value to the empty string.
michael@0: */
michael@0: // Will do later.
michael@0: /*
michael@0: * Switch to the attribute name state.
michael@0: */
michael@0: state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case MARKUP_DECLARATION_OPEN:
michael@0: markupdeclarationopenloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * If the next two characters are both U+002D
michael@0: * HYPHEN-MINUS characters (-), consume those two
michael@0: * characters, create a comment token whose data is the
michael@0: * empty string, and switch to the comment start state.
michael@0: *
michael@0: * Otherwise, if the next seven characters are an ASCII
michael@0: * case-insensitive match for the word "DOCTYPE", then
michael@0: * consume those characters and switch to the DOCTYPE
michael@0: * state.
michael@0: *
michael@0: * Otherwise, if the insertion mode is
michael@0: * "in foreign content" and the current node is not an
michael@0: * element in the HTML namespace and the next seven
michael@0: * characters are a case-sensitive match for the string
michael@0: * "[CDATA[" (the five uppercase letters "CDATA" with a
michael@0: * U+005B LEFT SQUARE BRACKET character before and
michael@0: * after), then consume those characters and switch to
michael@0: * the CDATA section state.
michael@0: *
michael@0: * Otherwise, this is a parse error. Switch to the bogus
michael@0: * comment state. The next character that is consumed,
michael@0: * if any, is the first character that will be in the
michael@0: * comment.
michael@0: */
michael@0: switch (c) {
michael@0: case '-':
michael@0: clearLongStrBufAndAppend(c);
michael@0: state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
michael@0: break markupdeclarationopenloop;
michael@0: // continue stateloop;
michael@0: case 'd':
michael@0: case 'D':
michael@0: clearLongStrBufAndAppend(c);
michael@0: index = 0;
michael@0: state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '[':
michael@0: if (tokenHandler.cdataSectionAllowed()) {
michael@0: clearLongStrBufAndAppend(c);
michael@0: index = 0;
michael@0: state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: // else fall through
michael@0: default:
michael@0: errBogusComment();
michael@0: clearLongStrBuf();
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case MARKUP_DECLARATION_HYPHEN:
michael@0: markupdeclarationhyphenloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: switch (c) {
michael@0: case '\u0000':
michael@0: break stateloop;
michael@0: case '-':
michael@0: clearLongStrBuf();
michael@0: state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
michael@0: break markupdeclarationhyphenloop;
michael@0: // continue stateloop;
michael@0: default:
michael@0: errBogusComment();
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case COMMENT_START:
michael@0: commentstartloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Comment start state
michael@0: *
michael@0: *
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '-':
michael@0: /*
michael@0: * U+002D HYPHEN-MINUS (-) Switch to the comment
michael@0: * start dash state.
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Parse error.
michael@0: */
michael@0: errPrematureEndOfComment();
michael@0: /* Emit the comment token. */
michael@0: emitComment(0, pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\r':
michael@0: appendLongStrBufCarriageReturn();
michael@0: state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: appendLongStrBufLineFeed();
michael@0: state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0: break commentstartloop;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: /*
michael@0: * Anything else Append the input character to
michael@0: * the comment token's data.
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: /*
michael@0: * Switch to the comment state.
michael@0: */
michael@0: state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0: break commentstartloop;
michael@0: // continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case COMMENT:
michael@0: commentloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Comment state Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '-':
michael@0: /*
michael@0: * U+002D HYPHEN-MINUS (-) Switch to the comment
michael@0: * end dash state
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
michael@0: break commentloop;
michael@0: // continue stateloop;
michael@0: case '\r':
michael@0: appendLongStrBufCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: appendLongStrBufLineFeed();
michael@0: continue;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: /*
michael@0: * Anything else Append the input character to
michael@0: * the comment token's data.
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: /*
michael@0: * Stay in the comment state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case COMMENT_END_DASH:
michael@0: commentenddashloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Comment end dash state Consume the next input
michael@0: * character:
michael@0: */
michael@0: switch (c) {
michael@0: case '-':
michael@0: /*
michael@0: * U+002D HYPHEN-MINUS (-) Switch to the comment
michael@0: * end state
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
michael@0: break commentenddashloop;
michael@0: // continue stateloop;
michael@0: case '\r':
michael@0: appendLongStrBufCarriageReturn();
michael@0: state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: appendLongStrBufLineFeed();
michael@0: state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: /*
michael@0: * Anything else Append a U+002D HYPHEN-MINUS
michael@0: * (-) character and the input character to the
michael@0: * comment token's data.
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: /*
michael@0: * Switch to the comment state.
michael@0: */
michael@0: state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case COMMENT_END:
michael@0: commentendloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Comment end state Consume the next input
michael@0: * character:
michael@0: */
michael@0: switch (c) {
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit the comment
michael@0: * token.
michael@0: */
michael@0: emitComment(2, pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '-':
michael@0: /* U+002D HYPHEN-MINUS (-) Parse error. */
michael@0: /*
michael@0: * Append a U+002D HYPHEN-MINUS (-) character to
michael@0: * the comment token's data.
michael@0: */
michael@0: adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
michael@0: /*
michael@0: * Stay in the comment end state.
michael@0: */
michael@0: continue;
michael@0: case '\r':
michael@0: adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn();
michael@0: state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: adjustDoubleHyphenAndAppendToLongStrBufLineFeed();
michael@0: state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '!':
michael@0: errHyphenHyphenBang();
michael@0: appendLongStrBuf(c);
michael@0: state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: /*
michael@0: * Append two U+002D HYPHEN-MINUS (-) characters
michael@0: * and the input character to the comment
michael@0: * token's data.
michael@0: */
michael@0: adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
michael@0: /*
michael@0: * Switch to the comment state.
michael@0: */
michael@0: state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case COMMENT_END_BANG:
michael@0: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Comment end bang state
michael@0: *
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit the comment
michael@0: * token.
michael@0: */
michael@0: emitComment(3, pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '-':
michael@0: /*
michael@0: * Append two U+002D HYPHEN-MINUS (-) characters
michael@0: * and a U+0021 EXCLAMATION MARK (!) character
michael@0: * to the comment token's data.
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: /*
michael@0: * Switch to the comment end dash state.
michael@0: */
michael@0: state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\r':
michael@0: appendLongStrBufCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: appendLongStrBufLineFeed();
michael@0: continue;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: /*
michael@0: * Anything else Append two U+002D HYPHEN-MINUS
michael@0: * (-) characters, a U+0021 EXCLAMATION MARK (!)
michael@0: * character, and the input character to the
michael@0: * comment token's data. Switch to the comment
michael@0: * state.
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: /*
michael@0: * Switch to the comment state.
michael@0: */
michael@0: state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case COMMENT_START_DASH:
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Comment start dash state
michael@0: *
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '-':
michael@0: /*
michael@0: * U+002D HYPHEN-MINUS (-) Switch to the comment end
michael@0: * state
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: errPrematureEndOfComment();
michael@0: /* Emit the comment token. */
michael@0: emitComment(1, pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\r':
michael@0: appendLongStrBufCarriageReturn();
michael@0: state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: appendLongStrBufLineFeed();
michael@0: state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: /*
michael@0: * Append a U+002D HYPHEN-MINUS character (-) and
michael@0: * the current input character to the comment
michael@0: * token's data.
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: /*
michael@0: * Switch to the comment state.
michael@0: */
michael@0: state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: // XXX reorder point
michael@0: case CDATA_START:
michael@0: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: if (index < 6) { // CDATA_LSQB.length
michael@0: if (c == Tokenizer.CDATA_LSQB[index]) {
michael@0: appendLongStrBuf(c);
michael@0: } else {
michael@0: errBogusComment();
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: index++;
michael@0: continue;
michael@0: } else {
michael@0: cstart = pos; // start coalescing
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
michael@0: break; // FALL THROUGH continue stateloop;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case CDATA_SECTION:
michael@0: cdatasectionloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: switch (c) {
michael@0: case ']':
michael@0: flushChars(buf, pos);
michael@0: state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
michael@0: break cdatasectionloop; // FALL THROUGH
michael@0: case '\u0000':
michael@0: emitReplacementCharacter(buf, pos);
michael@0: continue;
michael@0: case '\r':
michael@0: emitCarriageReturn(buf, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: default:
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case CDATA_RSQB:
michael@0: cdatarsqb: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: switch (c) {
michael@0: case ']':
michael@0: state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);
michael@0: break cdatarsqb;
michael@0: default:
michael@0: tokenHandler.characters(Tokenizer.RSQB_RSQB, 0,
michael@0: 1);
michael@0: cstart = pos;
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case CDATA_RSQB_RSQB:
michael@0: cdatarsqbrsqb: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: switch (c) {
michael@0: case ']':
michael@0: // Saw a third ]. Emit one ] (logically the
michael@0: // first one) and stay in this state to
michael@0: // remember that the last two characters seen
michael@0: // have been ]].
michael@0: tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
michael@0: continue;
michael@0: case '>':
michael@0: cstart = pos + 1;
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: default:
michael@0: tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
michael@0: cstart = pos;
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case ATTRIBUTE_VALUE_SINGLE_QUOTED:
michael@0: attributevaluesinglequotedloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\'':
michael@0: /*
michael@0: * U+0027 APOSTROPHE (') Switch to the after
michael@0: * attribute value (quoted) state.
michael@0: */
michael@0: addAttributeWithValue();
michael@0:
michael@0: state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '&':
michael@0: /*
michael@0: * U+0026 AMPERSAND (&) Switch to the character
michael@0: * reference in attribute value state, with the
michael@0: * additional allowed character being U+0027
michael@0: * APOSTROPHE (').
michael@0: */
michael@0: clearStrBufAndAppend(c);
michael@0: setAdditionalAndRememberAmpersandLocation('\'');
michael@0: returnState = state;
michael@0: state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
michael@0: break attributevaluesinglequotedloop;
michael@0: // continue stateloop;
michael@0: case '\r':
michael@0: appendLongStrBufCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: appendLongStrBufLineFeed();
michael@0: continue;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: /*
michael@0: * Anything else Append the current input
michael@0: * character to the current attribute's value.
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: /*
michael@0: * Stay in the attribute value (single-quoted)
michael@0: * state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case CONSUME_CHARACTER_REFERENCE:
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: if (c == '\u0000') {
michael@0: break stateloop;
michael@0: }
michael@0: /*
michael@0: * Unlike the definition in the spec, this state does not
michael@0: * return a value and never requires the caller to
michael@0: * backtrack. This state takes care of emitting characters
michael@0: * or appending to the current attribute value. It also
michael@0: * takes care of that in the case when consuming the
michael@0: * character reference fails.
michael@0: */
michael@0: /*
michael@0: * This section defines how to consume a character
michael@0: * reference. This definition is used when parsing character
michael@0: * references in text and in attributes.
michael@0: *
michael@0: * The behavior depends on the identity of the next
michael@0: * character (the one immediately after the U+0026 AMPERSAND
michael@0: * character):
michael@0: */
michael@0: switch (c) {
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\n':
michael@0: case '\r': // we'll reconsume!
michael@0: case '\u000C':
michael@0: case '<':
michael@0: case '&':
michael@0: emitOrAppendStrBuf(returnState);
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: cstart = pos;
michael@0: }
michael@0: reconsume = true;
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '#':
michael@0: /*
michael@0: * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
michael@0: * SIGN.
michael@0: */
michael@0: appendStrBuf('#');
michael@0: state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);
michael@0: continue stateloop;
michael@0: default:
michael@0: if (c == additional) {
michael@0: emitOrAppendStrBuf(returnState);
michael@0: reconsume = true;
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: if (c >= 'a' && c <= 'z') {
michael@0: firstCharKey = c - 'a' + 26;
michael@0: } else if (c >= 'A' && c <= 'Z') {
michael@0: firstCharKey = c - 'A';
michael@0: } else {
michael@0: // No match
michael@0: /*
michael@0: * If no match can be made, then this is a parse
michael@0: * error.
michael@0: */
michael@0: errNoNamedCharacterMatch();
michael@0: emitOrAppendStrBuf(returnState);
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: cstart = pos;
michael@0: }
michael@0: reconsume = true;
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: // Didn't fail yet
michael@0: appendStrBuf(c);
michael@0: state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);
michael@0: // FALL THROUGH continue stateloop;
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case CHARACTER_REFERENCE_HILO_LOOKUP:
michael@0: {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: if (c == '\u0000') {
michael@0: break stateloop;
michael@0: }
michael@0: /*
michael@0: * The data structure is as follows:
michael@0: *
michael@0: * HILO_ACCEL is a two-dimensional int array whose major
michael@0: * index corresponds to the second character of the
michael@0: * character reference (code point as index) and the
michael@0: * minor index corresponds to the first character of the
michael@0: * character reference (packed so that A-Z runs from 0
michael@0: * to 25 and a-z runs from 26 to 51). This layout makes
michael@0: * it easier to use the sparseness of the data structure
michael@0: * to omit parts of it: The second dimension of the
michael@0: * table is null when no character reference starts with
michael@0: * the character corresponding to that row.
michael@0: *
michael@0: * The int value in HILO_ACCEL (at these indices) is zero
michael@0: * if there exists no character reference starting with
michael@0: * that two-letter prefix. Otherwise, the value is an
michael@0: * int that packs two shorts so that the higher short is
michael@0: * the index of the highest character reference name
michael@0: * with that prefix in NAMES and the lower short
michael@0: * corresponds to the index of the lowest character
michael@0: * reference name with that prefix. (It happens that the
michael@0: * first two character reference names share their
michael@0: * prefix so the packed int cannot be 0 by packing the
michael@0: * two shorts.)
michael@0: *
michael@0: * NAMES is an array of byte arrays where each byte
michael@0: * array encodes the name of a character reference as
michael@0: * ASCII. The names omit the first two letters of the
michael@0: * name. (Since storing the first two letters would be
michael@0: * redundant with the data contained in HILO_ACCEL.) The
michael@0: * entries are lexically sorted.
michael@0: *
michael@0: * For a given index in NAMES, the same index in VALUES
michael@0: * contains the corresponding expansion as an array of
michael@0: * two UTF-16 code units (either the character and
michael@0: * U+0000 or a surrogate pair).
michael@0: */
michael@0: int hilo = 0;
michael@0: if (c <= 'z') {
michael@0: @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];
michael@0: if (row != null) {
michael@0: hilo = row[firstCharKey];
michael@0: }
michael@0: }
michael@0: if (hilo == 0) {
michael@0: /*
michael@0: * If no match can be made, then this is a parse
michael@0: * error.
michael@0: */
michael@0: errNoNamedCharacterMatch();
michael@0: emitOrAppendStrBuf(returnState);
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: cstart = pos;
michael@0: }
michael@0: reconsume = true;
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: // Didn't fail yet
michael@0: appendStrBuf(c);
michael@0: lo = hilo & 0xFFFF;
michael@0: hi = hilo >> 16;
michael@0: entCol = -1;
michael@0: candidate = -1;
michael@0: strBufMark = 0;
michael@0: state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);
michael@0: // FALL THROUGH continue stateloop;
michael@0: }
michael@0: case CHARACTER_REFERENCE_TAIL:
michael@0: outer: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: if (c == '\u0000') {
michael@0: break stateloop;
michael@0: }
michael@0: entCol++;
michael@0: /*
michael@0: * Consume the maximum number of characters possible,
michael@0: * with the consumed characters matching one of the
michael@0: * identifiers in the first column of the named
michael@0: * character references table (in a case-sensitive
michael@0: * manner).
michael@0: */
michael@0: loloop: for (;;) {
michael@0: if (hi < lo) {
michael@0: break outer;
michael@0: }
michael@0: if (entCol == NamedCharacters.NAMES[lo].length()) {
michael@0: candidate = lo;
michael@0: strBufMark = strBufLen;
michael@0: lo++;
michael@0: } else if (entCol > NamedCharacters.NAMES[lo].length()) {
michael@0: break outer;
michael@0: } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
michael@0: lo++;
michael@0: } else {
michael@0: break loloop;
michael@0: }
michael@0: }
michael@0:
michael@0: hiloop: for (;;) {
michael@0: if (hi < lo) {
michael@0: break outer;
michael@0: }
michael@0: if (entCol == NamedCharacters.NAMES[hi].length()) {
michael@0: break hiloop;
michael@0: }
michael@0: if (entCol > NamedCharacters.NAMES[hi].length()) {
michael@0: break outer;
michael@0: } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
michael@0: hi--;
michael@0: } else {
michael@0: break hiloop;
michael@0: }
michael@0: }
michael@0:
michael@0: if (c == ';') {
michael@0: // If we see a semicolon, there cannot be a
michael@0: // longer match. Break the loop. However, before
michael@0: // breaking, take the longest match so far as the
michael@0: // candidate, if we are just about to complete a
michael@0: // match.
michael@0: if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {
michael@0: candidate = lo;
michael@0: strBufMark = strBufLen;
michael@0: }
michael@0: break outer;
michael@0: }
michael@0:
michael@0: if (hi < lo) {
michael@0: break outer;
michael@0: }
michael@0: appendStrBuf(c);
michael@0: continue;
michael@0: }
michael@0:
michael@0: if (candidate == -1) {
michael@0: // reconsume deals with CR, LF or nul
michael@0: /*
michael@0: * If no match can be made, then this is a parse error.
michael@0: */
michael@0: errNoNamedCharacterMatch();
michael@0: emitOrAppendStrBuf(returnState);
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: cstart = pos;
michael@0: }
michael@0: reconsume = true;
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: } else {
michael@0: // c can't be CR, LF or nul if we got here
michael@0: @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
michael@0: if (candidateName.length() == 0
michael@0: || candidateName.charAt(candidateName.length() - 1) != ';') {
michael@0: /*
michael@0: * If the last character matched is not a U+003B
michael@0: * SEMICOLON (;), there is a parse error.
michael@0: */
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0: /*
michael@0: * If the entity is being consumed as part of an
michael@0: * attribute, and the last character matched is
michael@0: * not a U+003B SEMICOLON (;),
michael@0: */
michael@0: char ch;
michael@0: if (strBufMark == strBufLen) {
michael@0: ch = c;
michael@0: } else {
michael@0: // if (strBufOffset != -1) {
michael@0: // ch = buf[strBufOffset + strBufMark];
michael@0: // } else {
michael@0: ch = strBuf[strBufMark];
michael@0: // }
michael@0: }
michael@0: if (ch == '=' || (ch >= '0' && ch <= '9')
michael@0: || (ch >= 'A' && ch <= 'Z')
michael@0: || (ch >= 'a' && ch <= 'z')) {
michael@0: /*
michael@0: * and the next character is either a U+003D
michael@0: * EQUALS SIGN character (=) or in the range
michael@0: * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
michael@0: * U+0041 LATIN CAPITAL LETTER A to U+005A
michael@0: * LATIN CAPITAL LETTER Z, or U+0061 LATIN
michael@0: * SMALL LETTER A to U+007A LATIN SMALL
michael@0: * LETTER Z, then, for historical reasons,
michael@0: * all the characters that were matched
michael@0: * after the U+0026 AMPERSAND (&) must be
michael@0: * unconsumed, and nothing is returned.
michael@0: */
michael@0: errNoNamedCharacterMatch();
michael@0: appendStrBufToLongStrBuf();
michael@0: reconsume = true;
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0: errUnescapedAmpersandInterpretedAsCharacterReference();
michael@0: } else {
michael@0: errNotSemicolonTerminated();
michael@0: }
michael@0: }
michael@0:
michael@0: /*
michael@0: * Otherwise, return a character token for the character
michael@0: * corresponding to the entity name (as given by the
michael@0: * second column of the named character references
michael@0: * table).
michael@0: */
michael@0: // CPPONLY: completedNamedCharacterReference();
michael@0: @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
michael@0: if (
michael@0: // [NOCPP[
michael@0: val.length == 1
michael@0: // ]NOCPP]
michael@0: // CPPONLY: val[1] == 0
michael@0: ) {
michael@0: emitOrAppendOne(val, returnState);
michael@0: } else {
michael@0: emitOrAppendTwo(val, returnState);
michael@0: }
michael@0: // this is so complicated!
michael@0: if (strBufMark < strBufLen) {
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0: for (int i = strBufMark; i < strBufLen; i++) {
michael@0: appendLongStrBuf(strBuf[i]);
michael@0: }
michael@0: } else {
michael@0: tokenHandler.characters(strBuf, strBufMark,
michael@0: strBufLen - strBufMark);
michael@0: }
michael@0: }
michael@0: // Check if we broke out early with c being the last
michael@0: // character that matched as opposed to being the
michael@0: // first one that didn't match. In the case of an
michael@0: // early break, the next run on text should start
michael@0: // *after* the current character and the current
michael@0: // character shouldn't be reconsumed.
michael@0: boolean earlyBreak = (c == ';' && strBufMark == strBufLen);
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: cstart = earlyBreak ? pos + 1 : pos;
michael@0: }
michael@0: reconsume = !earlyBreak;
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: /*
michael@0: * If the markup contains I'm &notit; I tell you, the
michael@0: * entity is parsed as "not", as in, I'm ¬it; I tell
michael@0: * you. But if the markup was I'm &notin; I tell you,
michael@0: * the entity would be parsed as "notin;", resulting in
michael@0: * I'm ∉ I tell you.
michael@0: */
michael@0: }
michael@0: // XXX reorder point
michael@0: case CONSUME_NCR:
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: prevValue = -1;
michael@0: value = 0;
michael@0: seenDigits = false;
michael@0: /*
michael@0: * The behavior further depends on the character after the
michael@0: * U+0023 NUMBER SIGN:
michael@0: */
michael@0: switch (c) {
michael@0: case 'x':
michael@0: case 'X':
michael@0:
michael@0: /*
michael@0: * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
michael@0: * LETTER X Consume the X.
michael@0: *
michael@0: * Follow the steps below, but using the range of
michael@0: * characters U+0030 DIGIT ZERO through to U+0039
michael@0: * DIGIT NINE, U+0061 LATIN SMALL LETTER A through
michael@0: * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
michael@0: * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
michael@0: * LETTER F (in other words, 0-9, A-F, a-f).
michael@0: *
michael@0: * When it comes to interpreting the number,
michael@0: * interpret it as a hexadecimal number.
michael@0: */
michael@0: appendStrBuf(c);
michael@0: state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);
michael@0: continue stateloop;
michael@0: default:
michael@0: /*
michael@0: * Anything else Follow the steps below, but using
michael@0: * the range of characters U+0030 DIGIT ZERO through
michael@0: * to U+0039 DIGIT NINE (i.e. just 0-9).
michael@0: *
michael@0: * When it comes to interpreting the number,
michael@0: * interpret it as a decimal number.
michael@0: */
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);
michael@0: // FALL THROUGH continue stateloop;
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case DECIMAL_NRC_LOOP:
michael@0: decimalloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: // Deal with overflow gracefully
michael@0: if (value < prevValue) {
michael@0: value = 0x110000; // Value above Unicode range but
michael@0: // within int
michael@0: // range
michael@0: }
michael@0: prevValue = value;
michael@0: /*
michael@0: * Consume as many characters as match the range of
michael@0: * characters given above.
michael@0: */
michael@0: if (c >= '0' && c <= '9') {
michael@0: seenDigits = true;
michael@0: value *= 10;
michael@0: value += c - '0';
michael@0: continue;
michael@0: } else if (c == ';') {
michael@0: if (seenDigits) {
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: cstart = pos + 1;
michael@0: }
michael@0: state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
michael@0: // FALL THROUGH continue stateloop;
michael@0: break decimalloop;
michael@0: } else {
michael@0: errNoDigitsInNCR();
michael@0: appendStrBuf(';');
michael@0: emitOrAppendStrBuf(returnState);
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: cstart = pos + 1;
michael@0: }
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: } else {
michael@0: /*
michael@0: * If no characters match the range, then don't
michael@0: * consume any characters (and unconsume the U+0023
michael@0: * NUMBER SIGN character and, if appropriate, the X
michael@0: * character). This is a parse error; nothing is
michael@0: * returned.
michael@0: *
michael@0: * Otherwise, if the next character is a U+003B
michael@0: * SEMICOLON, consume that too. If it isn't, there
michael@0: * is a parse error.
michael@0: */
michael@0: if (!seenDigits) {
michael@0: errNoDigitsInNCR();
michael@0: emitOrAppendStrBuf(returnState);
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: cstart = pos;
michael@0: }
michael@0: reconsume = true;
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: } else {
michael@0: errCharRefLacksSemicolon();
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: cstart = pos;
michael@0: }
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
michael@0: // FALL THROUGH continue stateloop;
michael@0: break decimalloop;
michael@0: }
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case HANDLE_NCR_VALUE:
michael@0: // WARNING previous state sets reconsume
michael@0: // XXX inline this case if the method size can take it
michael@0: handleNcrValue(returnState);
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: // XXX reorder point
michael@0: case HEX_NCR_LOOP:
michael@0: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: // Deal with overflow gracefully
michael@0: if (value < prevValue) {
michael@0: value = 0x110000; // Value above Unicode range but
michael@0: // within int
michael@0: // range
michael@0: }
michael@0: prevValue = value;
michael@0: /*
michael@0: * Consume as many characters as match the range of
michael@0: * characters given above.
michael@0: */
michael@0: if (c >= '0' && c <= '9') {
michael@0: seenDigits = true;
michael@0: value *= 16;
michael@0: value += c - '0';
michael@0: continue;
michael@0: } else if (c >= 'A' && c <= 'F') {
michael@0: seenDigits = true;
michael@0: value *= 16;
michael@0: value += c - 'A' + 10;
michael@0: continue;
michael@0: } else if (c >= 'a' && c <= 'f') {
michael@0: seenDigits = true;
michael@0: value *= 16;
michael@0: value += c - 'a' + 10;
michael@0: continue;
michael@0: } else if (c == ';') {
michael@0: if (seenDigits) {
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: cstart = pos + 1;
michael@0: }
michael@0: state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
michael@0: continue stateloop;
michael@0: } else {
michael@0: errNoDigitsInNCR();
michael@0: appendStrBuf(';');
michael@0: emitOrAppendStrBuf(returnState);
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: cstart = pos + 1;
michael@0: }
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: } else {
michael@0: /*
michael@0: * If no characters match the range, then don't
michael@0: * consume any characters (and unconsume the U+0023
michael@0: * NUMBER SIGN character and, if appropriate, the X
michael@0: * character). This is a parse error; nothing is
michael@0: * returned.
michael@0: *
michael@0: * Otherwise, if the next character is a U+003B
michael@0: * SEMICOLON, consume that too. If it isn't, there
michael@0: * is a parse error.
michael@0: */
michael@0: if (!seenDigits) {
michael@0: errNoDigitsInNCR();
michael@0: emitOrAppendStrBuf(returnState);
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: cstart = pos;
michael@0: }
michael@0: reconsume = true;
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: } else {
michael@0: errCharRefLacksSemicolon();
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: cstart = pos;
michael@0: }
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case PLAINTEXT:
michael@0: plaintextloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: switch (c) {
michael@0: case '\u0000':
michael@0: emitPlaintextReplacementCharacter(buf, pos);
michael@0: continue;
michael@0: case '\r':
michael@0: emitCarriageReturn(buf, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: default:
michael@0: /*
michael@0: * Anything else Emit the current input
michael@0: * character as a character token. Stay in the
michael@0: * PLAINTEXT state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case CLOSE_TAG_OPEN:
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Otherwise, if the content model flag is set to the PCDATA
michael@0: * state, or if the next few characters do match that tag
michael@0: * name, consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '>':
michael@0: /* U+003E GREATER-THAN SIGN (>) Parse error. */
michael@0: errLtSlashGt();
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: cstart = pos + 1;
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: /* Anything else Parse error. */
michael@0: errGarbageAfterLtSlash();
michael@0: /*
michael@0: * Switch to the bogus comment state.
michael@0: */
michael@0: clearLongStrBufAndAppend('\n');
michael@0: state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: /* Anything else Parse error. */
michael@0: errGarbageAfterLtSlash();
michael@0: /*
michael@0: * Switch to the bogus comment state.
michael@0: */
michael@0: clearLongStrBufAndAppend('\n');
michael@0: state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: c += 0x20;
michael@0: }
michael@0: if (c >= 'a' && c <= 'z') {
michael@0: /*
michael@0: * U+0061 LATIN SMALL LETTER A through to U+007A
michael@0: * LATIN SMALL LETTER Z Create a new end tag
michael@0: * token,
michael@0: */
michael@0: endTag = true;
michael@0: /*
michael@0: * set its tag name to the input character,
michael@0: */
michael@0: clearStrBufAndAppend(c);
michael@0: /*
michael@0: * then switch to the tag name state. (Don't
michael@0: * emit the token yet; further details will be
michael@0: * filled in before it is emitted.)
michael@0: */
michael@0: state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
michael@0: continue stateloop;
michael@0: } else {
michael@0: /* Anything else Parse error. */
michael@0: errGarbageAfterLtSlash();
michael@0: /*
michael@0: * Switch to the bogus comment state.
michael@0: */
michael@0: clearLongStrBufAndAppend(c);
michael@0: state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case RCDATA:
michael@0: rcdataloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: switch (c) {
michael@0: case '&':
michael@0: /*
michael@0: * U+0026 AMPERSAND (&) Switch to the character
michael@0: * reference in RCDATA state.
michael@0: */
michael@0: flushChars(buf, pos);
michael@0: clearStrBufAndAppend(c);
michael@0: additional = '\u0000';
michael@0: returnState = state;
michael@0: state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '<':
michael@0: /*
michael@0: * U+003C LESS-THAN SIGN (<) Switch to the
michael@0: * RCDATA less-than sign state.
michael@0: */
michael@0: flushChars(buf, pos);
michael@0:
michael@0: returnState = state;
michael@0: state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: emitReplacementCharacter(buf, pos);
michael@0: continue;
michael@0: case '\r':
michael@0: emitCarriageReturn(buf, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: default:
michael@0: /*
michael@0: * Emit the current input character as a
michael@0: * character token. Stay in the RCDATA state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case RAWTEXT:
michael@0: rawtextloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: switch (c) {
michael@0: case '<':
michael@0: /*
michael@0: * U+003C LESS-THAN SIGN (<) Switch to the
michael@0: * RAWTEXT less-than sign state.
michael@0: */
michael@0: flushChars(buf, pos);
michael@0:
michael@0: returnState = state;
michael@0: state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
michael@0: break rawtextloop;
michael@0: // FALL THRU continue stateloop;
michael@0: case '\u0000':
michael@0: emitReplacementCharacter(buf, pos);
michael@0: continue;
michael@0: case '\r':
michael@0: emitCarriageReturn(buf, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: default:
michael@0: /*
michael@0: * Emit the current input character as a
michael@0: * character token. Stay in the RAWTEXT state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // XXX fallthru don't reorder
michael@0: case RAWTEXT_RCDATA_LESS_THAN_SIGN:
michael@0: rawtextrcdatalessthansignloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: switch (c) {
michael@0: case '/':
michael@0: /*
michael@0: * U+002F SOLIDUS (/) Set the temporary buffer
michael@0: * to the empty string. Switch to the script
michael@0: * data end tag open state.
michael@0: */
michael@0: index = 0;
michael@0: clearStrBuf();
michael@0: state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
michael@0: break rawtextrcdatalessthansignloop;
michael@0: // FALL THRU continue stateloop;
michael@0: default:
michael@0: /*
michael@0: * Otherwise, emit a U+003C LESS-THAN SIGN
michael@0: * character token
michael@0: */
michael@0: tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0: /*
michael@0: * and reconsume the current input character in
michael@0: * the state recorded in returnState.
michael@0: */
michael@0: cstart = pos;
michael@0: reconsume = true;
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // XXX fall thru. don't reorder.
michael@0: case NON_DATA_END_TAG_NAME:
michael@0: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * ASSERT! when entering this state, set index to 0 and
michael@0: * call clearStrBuf() assert (contentModelElement !=
michael@0: * null); Let's implement the above without lookahead.
michael@0: * strBuf is the 'temporary buffer'.
michael@0: */
michael@0: if (index < endTagExpectationAsArray.length) {
michael@0: char e = endTagExpectationAsArray[index];
michael@0: char folded = c;
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: folded += 0x20;
michael@0: }
michael@0: if (folded != e) {
michael@0: // [NOCPP[
michael@0: errHtml4LtSlashInRcdata(folded);
michael@0: // ]NOCPP]
michael@0: tokenHandler.characters(Tokenizer.LT_SOLIDUS,
michael@0: 0, 2);
michael@0: emitStrBuf();
michael@0: cstart = pos;
michael@0: reconsume = true;
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: appendStrBuf(c);
michael@0: index++;
michael@0: continue;
michael@0: } else {
michael@0: endTag = true;
michael@0: // XXX replace contentModelElement with different
michael@0: // type
michael@0: tagName = endTagExpectation;
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE
michael@0: * FEED (LF) U+000C FORM FEED (FF) U+0020
michael@0: * SPACE If the current end tag token is an
michael@0: * appropriate end tag token, then switch to
michael@0: * the before attribute name state.
michael@0: */
michael@0: state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '/':
michael@0: /*
michael@0: * U+002F SOLIDUS (/) If the current end tag
michael@0: * token is an appropriate end tag token,
michael@0: * then switch to the self-closing start tag
michael@0: * state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) If the
michael@0: * current end tag token is an appropriate
michael@0: * end tag token, then emit the current tag
michael@0: * token and switch to the data state.
michael@0: */
michael@0: state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0: if (shouldSuspend) {
michael@0: break stateloop;
michael@0: }
michael@0: continue stateloop;
michael@0: default:
michael@0: /*
michael@0: * Emit a U+003C LESS-THAN SIGN character
michael@0: * token, a U+002F SOLIDUS character token,
michael@0: * a character token for each of the
michael@0: * characters in the temporary buffer (in
michael@0: * the order they were added to the buffer),
michael@0: * and reconsume the current input character
michael@0: * in the RAWTEXT state.
michael@0: */
michael@0: // [NOCPP[
michael@0: errWarnLtSlashInRcdata();
michael@0: // ]NOCPP]
michael@0: tokenHandler.characters(
michael@0: Tokenizer.LT_SOLIDUS, 0, 2);
michael@0: emitStrBuf();
michael@0: if (c == '\u0000') {
michael@0: emitReplacementCharacter(buf, pos);
michael@0: } else {
michael@0: cstart = pos; // don't drop the
michael@0: // character
michael@0: }
michael@0: state = transition(state, returnState, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: // BEGIN HOTSPOT WORKAROUND
michael@0: case BOGUS_COMMENT:
michael@0: boguscommentloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: /*
michael@0: * Consume every character up to and including the first
michael@0: * U+003E GREATER-THAN SIGN character (>) or the end of
michael@0: * the file (EOF), whichever comes first. Emit a comment
michael@0: * token whose data is the concatenation of all the
michael@0: * characters starting from and including the character
michael@0: * that caused the state machine to switch into the
michael@0: * bogus comment state, up to and including the
michael@0: * character immediately before the last consumed
michael@0: * character (i.e. up to the character just before the
michael@0: * U+003E or EOF character). (If the comment was started
michael@0: * by the end of the file (EOF), the token is empty.)
michael@0: *
michael@0: * Switch to the data state.
michael@0: *
michael@0: * If the end of the file was reached, reconsume the EOF
michael@0: * character.
michael@0: */
michael@0: switch (c) {
michael@0: case '>':
michael@0: emitComment(0, pos);
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '-':
michael@0: appendLongStrBuf(c);
michael@0: state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);
michael@0: break boguscommentloop;
michael@0: case '\r':
michael@0: appendLongStrBufCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: appendLongStrBufLineFeed();
michael@0: continue;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: appendLongStrBuf(c);
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case BOGUS_COMMENT_HYPHEN:
michael@0: boguscommenthyphenloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: switch (c) {
michael@0: case '>':
michael@0: // [NOCPP[
michael@0: maybeAppendSpaceToBogusComment();
michael@0: // ]NOCPP]
michael@0: emitComment(0, pos);
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '-':
michael@0: appendSecondHyphenToBogusComment();
michael@0: continue boguscommenthyphenloop;
michael@0: case '\r':
michael@0: appendLongStrBufCarriageReturn();
michael@0: state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: appendLongStrBufLineFeed();
michael@0: state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: appendLongStrBuf(c);
michael@0: state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case SCRIPT_DATA:
michael@0: scriptdataloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: switch (c) {
michael@0: case '<':
michael@0: /*
michael@0: * U+003C LESS-THAN SIGN (<) Switch to the
michael@0: * script data less-than sign state.
michael@0: */
michael@0: flushChars(buf, pos);
michael@0: returnState = state;
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);
michael@0: break scriptdataloop; // FALL THRU continue
michael@0: // stateloop;
michael@0: case '\u0000':
michael@0: emitReplacementCharacter(buf, pos);
michael@0: continue;
michael@0: case '\r':
michael@0: emitCarriageReturn(buf, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: default:
michael@0: /*
michael@0: * Anything else Emit the current input
michael@0: * character as a character token. Stay in the
michael@0: * script data state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case SCRIPT_DATA_LESS_THAN_SIGN:
michael@0: scriptdatalessthansignloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: switch (c) {
michael@0: case '/':
michael@0: /*
michael@0: * U+002F SOLIDUS (/) Set the temporary buffer
michael@0: * to the empty string. Switch to the script
michael@0: * data end tag open state.
michael@0: */
michael@0: index = 0;
michael@0: clearStrBuf();
michael@0: state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '!':
michael@0: tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0: cstart = pos;
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);
michael@0: break scriptdatalessthansignloop; // FALL THRU
michael@0: // continue
michael@0: // stateloop;
michael@0: default:
michael@0: /*
michael@0: * Otherwise, emit a U+003C LESS-THAN SIGN
michael@0: * character token
michael@0: */
michael@0: tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0: /*
michael@0: * and reconsume the current input character in
michael@0: * the script data state.
michael@0: */
michael@0: cstart = pos;
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case SCRIPT_DATA_ESCAPE_START:
michael@0: scriptdataescapestartloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '-':
michael@0: /*
michael@0: * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0: * HYPHEN-MINUS character token. Switch to the
michael@0: * script data escape start dash state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos);
michael@0: break scriptdataescapestartloop; // FALL THRU
michael@0: // continue
michael@0: // stateloop;
michael@0: default:
michael@0: /*
michael@0: * Anything else Reconsume the current input
michael@0: * character in the script data state.
michael@0: */
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case SCRIPT_DATA_ESCAPE_START_DASH:
michael@0: scriptdataescapestartdashloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '-':
michael@0: /*
michael@0: * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0: * HYPHEN-MINUS character token. Switch to the
michael@0: * script data escaped dash dash state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
michael@0: break scriptdataescapestartdashloop;
michael@0: // continue stateloop;
michael@0: default:
michael@0: /*
michael@0: * Anything else Reconsume the current input
michael@0: * character in the script data state.
michael@0: */
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case SCRIPT_DATA_ESCAPED_DASH_DASH:
michael@0: scriptdataescapeddashdashloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '-':
michael@0: /*
michael@0: * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0: * HYPHEN-MINUS character token. Stay in the
michael@0: * script data escaped dash dash state.
michael@0: */
michael@0: continue;
michael@0: case '<':
michael@0: /*
michael@0: * U+003C LESS-THAN SIGN (<) Switch to the
michael@0: * script data escaped less-than sign state.
michael@0: */
michael@0: flushChars(buf, pos);
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit a U+003E
michael@0: * GREATER-THAN SIGN character token. Switch to
michael@0: * the script data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: emitReplacementCharacter(buf, pos);
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0: break scriptdataescapeddashdashloop;
michael@0: case '\r':
michael@0: emitCarriageReturn(buf, pos);
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: default:
michael@0: /*
michael@0: * Anything else Emit the current input
michael@0: * character as a character token. Switch to the
michael@0: * script data escaped state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0: break scriptdataescapeddashdashloop;
michael@0: // continue stateloop;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case SCRIPT_DATA_ESCAPED:
michael@0: scriptdataescapedloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '-':
michael@0: /*
michael@0: * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0: * HYPHEN-MINUS character token. Switch to the
michael@0: * script data escaped dash state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);
michael@0: break scriptdataescapedloop; // FALL THRU
michael@0: // continue
michael@0: // stateloop;
michael@0: case '<':
michael@0: /*
michael@0: * U+003C LESS-THAN SIGN (<) Switch to the
michael@0: * script data escaped less-than sign state.
michael@0: */
michael@0: flushChars(buf, pos);
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: emitReplacementCharacter(buf, pos);
michael@0: continue;
michael@0: case '\r':
michael@0: emitCarriageReturn(buf, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: default:
michael@0: /*
michael@0: * Anything else Emit the current input
michael@0: * character as a character token. Stay in the
michael@0: * script data escaped state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case SCRIPT_DATA_ESCAPED_DASH:
michael@0: scriptdataescapeddashloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '-':
michael@0: /*
michael@0: * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0: * HYPHEN-MINUS character token. Switch to the
michael@0: * script data escaped dash dash state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '<':
michael@0: /*
michael@0: * U+003C LESS-THAN SIGN (<) Switch to the
michael@0: * script data escaped less-than sign state.
michael@0: */
michael@0: flushChars(buf, pos);
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
michael@0: break scriptdataescapeddashloop;
michael@0: // continue stateloop;
michael@0: case '\u0000':
michael@0: emitReplacementCharacter(buf, pos);
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\r':
michael@0: emitCarriageReturn(buf, pos);
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: default:
michael@0: /*
michael@0: * Anything else Emit the current input
michael@0: * character as a character token. Switch to the
michael@0: * script data escaped state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
michael@0: scriptdataescapedlessthanloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '/':
michael@0: /*
michael@0: * U+002F SOLIDUS (/) Set the temporary buffer
michael@0: * to the empty string. Switch to the script
michael@0: * data escaped end tag open state.
michael@0: */
michael@0: index = 0;
michael@0: clearStrBuf();
michael@0: returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
michael@0: state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
michael@0: continue stateloop;
michael@0: case 'S':
michael@0: case 's':
michael@0: /*
michael@0: * U+0041 LATIN CAPITAL LETTER A through to
michael@0: * U+005A LATIN CAPITAL LETTER Z Emit a U+003C
michael@0: * LESS-THAN SIGN character token and the
michael@0: * current input character as a character token.
michael@0: */
michael@0: tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0: cstart = pos;
michael@0: index = 1;
michael@0: /*
michael@0: * Set the temporary buffer to the empty string.
michael@0: * Append the lowercase version of the current
michael@0: * input character (add 0x0020 to the
michael@0: * character's code point) to the temporary
michael@0: * buffer. Switch to the script data double
michael@0: * escape start state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);
michael@0: break scriptdataescapedlessthanloop;
michael@0: // continue stateloop;
michael@0: default:
michael@0: /*
michael@0: * Anything else Emit a U+003C LESS-THAN SIGN
michael@0: * character token and reconsume the current
michael@0: * input character in the script data escaped
michael@0: * state.
michael@0: */
michael@0: tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0: cstart = pos;
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case SCRIPT_DATA_DOUBLE_ESCAPE_START:
michael@0: scriptdatadoubleescapestartloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: assert index > 0;
michael@0: if (index < 6) { // SCRIPT_ARR.length
michael@0: char folded = c;
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: folded += 0x20;
michael@0: }
michael@0: if (folded != Tokenizer.SCRIPT_ARR[index]) {
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: index++;
michael@0: continue;
michael@0: }
michael@0: switch (c) {
michael@0: case '\r':
michael@0: emitCarriageReturn(buf, pos);
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: case '/':
michael@0: case '>':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0: * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
michael@0: * (>) Emit the current input character as a
michael@0: * character token. If the temporary buffer is
michael@0: * the string "script", then switch to the
michael@0: * script data double escaped state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0: break scriptdatadoubleescapestartloop;
michael@0: // continue stateloop;
michael@0: default:
michael@0: /*
michael@0: * Anything else Reconsume the current input
michael@0: * character in the script data escaped state.
michael@0: */
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case SCRIPT_DATA_DOUBLE_ESCAPED:
michael@0: scriptdatadoubleescapedloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '-':
michael@0: /*
michael@0: * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0: * HYPHEN-MINUS character token. Switch to the
michael@0: * script data double escaped dash state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);
michael@0: break scriptdatadoubleescapedloop; // FALL THRU
michael@0: // continue
michael@0: // stateloop;
michael@0: case '<':
michael@0: /*
michael@0: * U+003C LESS-THAN SIGN (<) Emit a U+003C
michael@0: * LESS-THAN SIGN character token. Switch to the
michael@0: * script data double escaped less-than sign
michael@0: * state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: emitReplacementCharacter(buf, pos);
michael@0: continue;
michael@0: case '\r':
michael@0: emitCarriageReturn(buf, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: default:
michael@0: /*
michael@0: * Anything else Emit the current input
michael@0: * character as a character token. Stay in the
michael@0: * script data double escaped state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
michael@0: scriptdatadoubleescapeddashloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '-':
michael@0: /*
michael@0: * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0: * HYPHEN-MINUS character token. Switch to the
michael@0: * script data double escaped dash dash state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);
michael@0: break scriptdatadoubleescapeddashloop;
michael@0: // continue stateloop;
michael@0: case '<':
michael@0: /*
michael@0: * U+003C LESS-THAN SIGN (<) Emit a U+003C
michael@0: * LESS-THAN SIGN character token. Switch to the
michael@0: * script data double escaped less-than sign
michael@0: * state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: emitReplacementCharacter(buf, pos);
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\r':
michael@0: emitCarriageReturn(buf, pos);
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: default:
michael@0: /*
michael@0: * Anything else Emit the current input
michael@0: * character as a character token. Switch to the
michael@0: * script data double escaped state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
michael@0: scriptdatadoubleescapeddashdashloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '-':
michael@0: /*
michael@0: * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0: * HYPHEN-MINUS character token. Stay in the
michael@0: * script data double escaped dash dash state.
michael@0: */
michael@0: continue;
michael@0: case '<':
michael@0: /*
michael@0: * U+003C LESS-THAN SIGN (<) Emit a U+003C
michael@0: * LESS-THAN SIGN character token. Switch to the
michael@0: * script data double escaped less-than sign
michael@0: * state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
michael@0: break scriptdatadoubleescapeddashdashloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit a U+003E
michael@0: * GREATER-THAN SIGN character token. Switch to
michael@0: * the script data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: emitReplacementCharacter(buf, pos);
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\r':
michael@0: emitCarriageReturn(buf, pos);
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: default:
michael@0: /*
michael@0: * Anything else Emit the current input
michael@0: * character as a character token. Switch to the
michael@0: * script data double escaped state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
michael@0: scriptdatadoubleescapedlessthanloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '/':
michael@0: /*
michael@0: * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS
michael@0: * character token. Set the temporary buffer to
michael@0: * the empty string. Switch to the script data
michael@0: * double escape end state.
michael@0: */
michael@0: index = 0;
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);
michael@0: break scriptdatadoubleescapedlessthanloop;
michael@0: default:
michael@0: /*
michael@0: * Anything else Reconsume the current input
michael@0: * character in the script data double escaped
michael@0: * state.
michael@0: */
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case SCRIPT_DATA_DOUBLE_ESCAPE_END:
michael@0: scriptdatadoubleescapeendloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: if (index < 6) { // SCRIPT_ARR.length
michael@0: char folded = c;
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: folded += 0x20;
michael@0: }
michael@0: if (folded != Tokenizer.SCRIPT_ARR[index]) {
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: index++;
michael@0: continue;
michael@0: }
michael@0: switch (c) {
michael@0: case '\r':
michael@0: emitCarriageReturn(buf, pos);
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: case '/':
michael@0: case '>':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0: * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
michael@0: * (>) Emit the current input character as a
michael@0: * character token. If the temporary buffer is
michael@0: * the string "script", then switch to the
michael@0: * script data escaped state.
michael@0: */
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0: continue stateloop;
michael@0: default:
michael@0: /*
michael@0: * Reconsume the current input character in the
michael@0: * script data double escaped state.
michael@0: */
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case MARKUP_DECLARATION_OCTYPE:
michael@0: markupdeclarationdoctypeloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: if (index < 6) { // OCTYPE.length
michael@0: char folded = c;
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: folded += 0x20;
michael@0: }
michael@0: if (folded == Tokenizer.OCTYPE[index]) {
michael@0: appendLongStrBuf(c);
michael@0: } else {
michael@0: errBogusComment();
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: index++;
michael@0: continue;
michael@0: } else {
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);
michael@0: break markupdeclarationdoctypeloop;
michael@0: // continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case DOCTYPE:
michael@0: doctypeloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: initDoctypeFields();
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0: * Switch to the before DOCTYPE name state.
michael@0: */
michael@0: state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
michael@0: break doctypeloop;
michael@0: // continue stateloop;
michael@0: default:
michael@0: /*
michael@0: * Anything else Parse error.
michael@0: */
michael@0: errMissingSpaceBeforeDoctypeName();
michael@0: /*
michael@0: * Reconsume the current character in the before
michael@0: * DOCTYPE name state.
michael@0: */
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
michael@0: break doctypeloop;
michael@0: // continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case BEFORE_DOCTYPE_NAME:
michael@0: beforedoctypenameloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0: * in the before DOCTYPE name state.
michael@0: */
michael@0: continue;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Parse error.
michael@0: */
michael@0: errNamelessDoctype();
michael@0: /*
michael@0: * Create a new DOCTYPE token. Set its
michael@0: * force-quirks flag to on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit the token.
michael@0: */
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: /*
michael@0: * U+0041 LATIN CAPITAL LETTER A through to
michael@0: * U+005A LATIN CAPITAL LETTER Z Create a
michael@0: * new DOCTYPE token. Set the token's name
michael@0: * to the lowercase version of the input
michael@0: * character (add 0x0020 to the character's
michael@0: * code point).
michael@0: */
michael@0: c += 0x20;
michael@0: }
michael@0: /* Anything else Create a new DOCTYPE token. */
michael@0: /*
michael@0: * Set the token's name to the current
michael@0: * input character.
michael@0: */
michael@0: clearStrBufAndAppend(c);
michael@0: /*
michael@0: * Switch to the DOCTYPE name state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);
michael@0: break beforedoctypenameloop;
michael@0: // continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case DOCTYPE_NAME:
michael@0: doctypenameloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: strBufToDoctypeName();
michael@0: state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0: * Switch to the after DOCTYPE name state.
michael@0: */
michael@0: strBufToDoctypeName();
michael@0: state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
michael@0: break doctypenameloop;
michael@0: // continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0: * DOCTYPE token.
michael@0: */
michael@0: strBufToDoctypeName();
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: /*
michael@0: * U+0041 LATIN CAPITAL LETTER A through to
michael@0: * U+005A LATIN CAPITAL LETTER Z Append the
michael@0: * lowercase version of the input character (add
michael@0: * 0x0020 to the character's code point) to the
michael@0: * current DOCTYPE token's name.
michael@0: */
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: c += 0x0020;
michael@0: }
michael@0: /*
michael@0: * Anything else Append the current input
michael@0: * character to the current DOCTYPE token's
michael@0: * name.
michael@0: */
michael@0: appendStrBuf(c);
michael@0: /*
michael@0: * Stay in the DOCTYPE name state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case AFTER_DOCTYPE_NAME:
michael@0: afterdoctypenameloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0: * in the after DOCTYPE name state.
michael@0: */
michael@0: continue;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0: * DOCTYPE token.
michael@0: */
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case 'p':
michael@0: case 'P':
michael@0: index = 0;
michael@0: state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);
michael@0: break afterdoctypenameloop;
michael@0: // continue stateloop;
michael@0: case 's':
michael@0: case 'S':
michael@0: index = 0;
michael@0: state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);
michael@0: continue stateloop;
michael@0: default:
michael@0: /*
michael@0: * Otherwise, this is the parse error.
michael@0: */
michael@0: bogusDoctype();
michael@0:
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: // done by bogusDoctype();
michael@0: /*
michael@0: * Switch to the bogus DOCTYPE state.
michael@0: */
michael@0: state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case DOCTYPE_UBLIC:
michael@0: doctypeublicloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * If the six characters starting from the current input
michael@0: * character are an ASCII case-insensitive match for the
michael@0: * word "PUBLIC", then consume those characters and
michael@0: * switch to the before DOCTYPE public identifier state.
michael@0: */
michael@0: if (index < 5) { // UBLIC.length
michael@0: char folded = c;
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: folded += 0x20;
michael@0: }
michael@0: if (folded != Tokenizer.UBLIC[index]) {
michael@0: bogusDoctype();
michael@0: // forceQuirks = true;
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: index++;
michael@0: continue;
michael@0: } else {
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);
michael@0: break doctypeublicloop;
michael@0: // continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case AFTER_DOCTYPE_PUBLIC_KEYWORD:
michael@0: afterdoctypepublickeywordloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0: * Switch to the before DOCTYPE public
michael@0: * identifier state.
michael@0: */
michael@0: state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
michael@0: break afterdoctypepublickeywordloop;
michael@0: // FALL THROUGH continue stateloop
michael@0: case '"':
michael@0: /*
michael@0: * U+0022 QUOTATION MARK (") Parse Error.
michael@0: */
michael@0: errNoSpaceBetweenDoctypePublicKeywordAndQuote();
michael@0: /*
michael@0: * Set the DOCTYPE token's public identifier to
michael@0: * the empty string (not missing),
michael@0: */
michael@0: clearLongStrBuf();
michael@0: /*
michael@0: * then switch to the DOCTYPE public identifier
michael@0: * (double-quoted) state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\'':
michael@0: /*
michael@0: * U+0027 APOSTROPHE (') Parse Error.
michael@0: */
michael@0: errNoSpaceBetweenDoctypePublicKeywordAndQuote();
michael@0: /*
michael@0: * Set the DOCTYPE token's public identifier to
michael@0: * the empty string (not missing),
michael@0: */
michael@0: clearLongStrBuf();
michael@0: /*
michael@0: * then switch to the DOCTYPE public identifier
michael@0: * (single-quoted) state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: /* U+003E GREATER-THAN SIGN (>) Parse error. */
michael@0: errExpectedPublicId();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: default:
michael@0: bogusDoctype();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: // done by bogusDoctype();
michael@0: /*
michael@0: * Switch to the bogus DOCTYPE state.
michael@0: */
michael@0: state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
michael@0: beforedoctypepublicidentifierloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0: * in the before DOCTYPE public identifier
michael@0: * state.
michael@0: */
michael@0: continue;
michael@0: case '"':
michael@0: /*
michael@0: * U+0022 QUOTATION MARK (") Set the DOCTYPE
michael@0: * token's public identifier to the empty string
michael@0: * (not missing),
michael@0: */
michael@0: clearLongStrBuf();
michael@0: /*
michael@0: * then switch to the DOCTYPE public identifier
michael@0: * (double-quoted) state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
michael@0: break beforedoctypepublicidentifierloop;
michael@0: // continue stateloop;
michael@0: case '\'':
michael@0: /*
michael@0: * U+0027 APOSTROPHE (') Set the DOCTYPE token's
michael@0: * public identifier to the empty string (not
michael@0: * missing),
michael@0: */
michael@0: clearLongStrBuf();
michael@0: /*
michael@0: * then switch to the DOCTYPE public identifier
michael@0: * (single-quoted) state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: /* U+003E GREATER-THAN SIGN (>) Parse error. */
michael@0: errExpectedPublicId();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: default:
michael@0: bogusDoctype();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: // done by bogusDoctype();
michael@0: /*
michael@0: * Switch to the bogus DOCTYPE state.
michael@0: */
michael@0: state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
michael@0: doctypepublicidentifierdoublequotedloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '"':
michael@0: /*
michael@0: * U+0022 QUOTATION MARK (") Switch to the after
michael@0: * DOCTYPE public identifier state.
michael@0: */
michael@0: publicIdentifier = longStrBufToString();
michael@0: state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
michael@0: break doctypepublicidentifierdoublequotedloop;
michael@0: // continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Parse error.
michael@0: */
michael@0: errGtInPublicId();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: publicIdentifier = longStrBufToString();
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\r':
michael@0: appendLongStrBufCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: appendLongStrBufLineFeed();
michael@0: continue;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: /*
michael@0: * Anything else Append the current input
michael@0: * character to the current DOCTYPE token's
michael@0: * public identifier.
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: /*
michael@0: * Stay in the DOCTYPE public identifier
michael@0: * (double-quoted) state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
michael@0: afterdoctypepublicidentifierloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0: * Switch to the between DOCTYPE public and
michael@0: * system identifiers state.
michael@0: */
michael@0: state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
michael@0: break afterdoctypepublicidentifierloop;
michael@0: // continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0: * DOCTYPE token.
michael@0: */
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '"':
michael@0: /*
michael@0: * U+0022 QUOTATION MARK (") Parse error.
michael@0: */
michael@0: errNoSpaceBetweenPublicAndSystemIds();
michael@0: /*
michael@0: * Set the DOCTYPE token's system identifier to
michael@0: * the empty string (not missing),
michael@0: */
michael@0: clearLongStrBuf();
michael@0: /*
michael@0: * then switch to the DOCTYPE system identifier
michael@0: * (double-quoted) state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\'':
michael@0: /*
michael@0: * U+0027 APOSTROPHE (') Parse error.
michael@0: */
michael@0: errNoSpaceBetweenPublicAndSystemIds();
michael@0: /*
michael@0: * Set the DOCTYPE token's system identifier to
michael@0: * the empty string (not missing),
michael@0: */
michael@0: clearLongStrBuf();
michael@0: /*
michael@0: * then switch to the DOCTYPE system identifier
michael@0: * (single-quoted) state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
michael@0: continue stateloop;
michael@0: default:
michael@0: bogusDoctype();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: // done by bogusDoctype();
michael@0: /*
michael@0: * Switch to the bogus DOCTYPE state.
michael@0: */
michael@0: state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
michael@0: betweendoctypepublicandsystemidentifiersloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0: * in the between DOCTYPE public and system
michael@0: * identifiers state.
michael@0: */
michael@0: continue;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0: * DOCTYPE token.
michael@0: */
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '"':
michael@0: /*
michael@0: * U+0022 QUOTATION MARK (") Set the DOCTYPE
michael@0: * token's system identifier to the empty string
michael@0: * (not missing),
michael@0: */
michael@0: clearLongStrBuf();
michael@0: /*
michael@0: * then switch to the DOCTYPE system identifier
michael@0: * (double-quoted) state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
michael@0: break betweendoctypepublicandsystemidentifiersloop;
michael@0: // continue stateloop;
michael@0: case '\'':
michael@0: /*
michael@0: * U+0027 APOSTROPHE (') Set the DOCTYPE token's
michael@0: * system identifier to the empty string (not
michael@0: * missing),
michael@0: */
michael@0: clearLongStrBuf();
michael@0: /*
michael@0: * then switch to the DOCTYPE system identifier
michael@0: * (single-quoted) state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
michael@0: continue stateloop;
michael@0: default:
michael@0: bogusDoctype();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: // done by bogusDoctype();
michael@0: /*
michael@0: * Switch to the bogus DOCTYPE state.
michael@0: */
michael@0: state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
michael@0: doctypesystemidentifierdoublequotedloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '"':
michael@0: /*
michael@0: * U+0022 QUOTATION MARK (") Switch to the after
michael@0: * DOCTYPE system identifier state.
michael@0: */
michael@0: systemIdentifier = longStrBufToString();
michael@0: state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Parse error.
michael@0: */
michael@0: errGtInSystemId();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: systemIdentifier = longStrBufToString();
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\r':
michael@0: appendLongStrBufCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: appendLongStrBufLineFeed();
michael@0: continue;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: /*
michael@0: * Anything else Append the current input
michael@0: * character to the current DOCTYPE token's
michael@0: * system identifier.
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: /*
michael@0: * Stay in the DOCTYPE system identifier
michael@0: * (double-quoted) state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
michael@0: afterdoctypesystemidentifierloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0: * in the after DOCTYPE system identifier state.
michael@0: */
michael@0: continue;
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0: * DOCTYPE token.
michael@0: */
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: default:
michael@0: /*
michael@0: * Switch to the bogus DOCTYPE state. (This does
michael@0: * not set the DOCTYPE token's force-quirks flag
michael@0: * to on.)
michael@0: */
michael@0: bogusDoctypeWithoutQuirks();
michael@0: state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0: break afterdoctypesystemidentifierloop;
michael@0: // continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case BOGUS_DOCTYPE:
michael@0: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '>':
michael@0: /*
michael@0: * U+003E GREATER-THAN SIGN (>) Emit that
michael@0: * DOCTYPE token.
michael@0: */
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: default:
michael@0: /*
michael@0: * Anything else Stay in the bogus DOCTYPE
michael@0: * state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case DOCTYPE_YSTEM:
michael@0: doctypeystemloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Otherwise, if the six characters starting from the
michael@0: * current input character are an ASCII case-insensitive
michael@0: * match for the word "SYSTEM", then consume those
michael@0: * characters and switch to the before DOCTYPE system
michael@0: * identifier state.
michael@0: */
michael@0: if (index < 5) { // YSTEM.length
michael@0: char folded = c;
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: folded += 0x20;
michael@0: }
michael@0: if (folded != Tokenizer.YSTEM[index]) {
michael@0: bogusDoctype();
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: index++;
michael@0: continue stateloop;
michael@0: } else {
michael@0: reconsume = true;
michael@0: state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);
michael@0: break doctypeystemloop;
michael@0: // continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case AFTER_DOCTYPE_SYSTEM_KEYWORD:
michael@0: afterdoctypesystemkeywordloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: }
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0: * Switch to the before DOCTYPE system
michael@0: * identifier state.
michael@0: */
michael@0: state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
michael@0: break afterdoctypesystemkeywordloop;
michael@0: // FALL THROUGH continue stateloop
michael@0: case '"':
michael@0: /*
michael@0: * U+0022 QUOTATION MARK (") Parse Error.
michael@0: */
michael@0: errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
michael@0: /*
michael@0: * Set the DOCTYPE token's system identifier to
michael@0: * the empty string (not missing),
michael@0: */
michael@0: clearLongStrBuf();
michael@0: /*
michael@0: * then switch to the DOCTYPE system identifier
michael@0: * (double-quoted) state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\'':
michael@0: /*
michael@0: * U+0027 APOSTROPHE (') Parse Error.
michael@0: */
michael@0: errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
michael@0: /*
michael@0: * Set the DOCTYPE token's system identifier to
michael@0: * the empty string (not missing),
michael@0: */
michael@0: clearLongStrBuf();
michael@0: /*
michael@0: * then switch to the DOCTYPE system identifier
michael@0: * (single-quoted) state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: /* U+003E GREATER-THAN SIGN (>) Parse error. */
michael@0: errExpectedPublicId();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: default:
michael@0: bogusDoctype();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: // done by bogusDoctype();
michael@0: /*
michael@0: * Switch to the bogus DOCTYPE state.
michael@0: */
michael@0: state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
michael@0: beforedoctypesystemidentifierloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\r':
michael@0: silentCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: silentLineFeed();
michael@0: // fall thru
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\u000C':
michael@0: /*
michael@0: * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0: * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0: * in the before DOCTYPE system identifier
michael@0: * state.
michael@0: */
michael@0: continue;
michael@0: case '"':
michael@0: /*
michael@0: * U+0022 QUOTATION MARK (") Set the DOCTYPE
michael@0: * token's system identifier to the empty string
michael@0: * (not missing),
michael@0: */
michael@0: clearLongStrBuf();
michael@0: /*
michael@0: * then switch to the DOCTYPE system identifier
michael@0: * (double-quoted) state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\'':
michael@0: /*
michael@0: * U+0027 APOSTROPHE (') Set the DOCTYPE token's
michael@0: * system identifier to the empty string (not
michael@0: * missing),
michael@0: */
michael@0: clearLongStrBuf();
michael@0: /*
michael@0: * then switch to the DOCTYPE system identifier
michael@0: * (single-quoted) state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
michael@0: break beforedoctypesystemidentifierloop;
michael@0: // continue stateloop;
michael@0: case '>':
michael@0: /* U+003E GREATER-THAN SIGN (>) Parse error. */
michael@0: errExpectedSystemId();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: default:
michael@0: bogusDoctype();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: // done by bogusDoctype();
michael@0: /*
michael@0: * Switch to the bogus DOCTYPE state.
michael@0: */
michael@0: state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
michael@0: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\'':
michael@0: /*
michael@0: * U+0027 APOSTROPHE (') Switch to the after
michael@0: * DOCTYPE system identifier state.
michael@0: */
michael@0: systemIdentifier = longStrBufToString();
michael@0: state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: errGtInSystemId();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: systemIdentifier = longStrBufToString();
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\r':
michael@0: appendLongStrBufCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: appendLongStrBufLineFeed();
michael@0: continue;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: /*
michael@0: * Anything else Append the current input
michael@0: * character to the current DOCTYPE token's
michael@0: * system identifier.
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: /*
michael@0: * Stay in the DOCTYPE system identifier
michael@0: * (single-quoted) state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
michael@0: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case '\'':
michael@0: /*
michael@0: * U+0027 APOSTROPHE (') Switch to the after
michael@0: * DOCTYPE public identifier state.
michael@0: */
michael@0: publicIdentifier = longStrBufToString();
michael@0: state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '>':
michael@0: errGtInPublicId();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: publicIdentifier = longStrBufToString();
michael@0: emitDoctypeToken(pos);
michael@0: /*
michael@0: * Switch to the data state.
michael@0: */
michael@0: state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0: continue stateloop;
michael@0: case '\r':
michael@0: appendLongStrBufCarriageReturn();
michael@0: break stateloop;
michael@0: case '\n':
michael@0: appendLongStrBufLineFeed();
michael@0: continue;
michael@0: case '\u0000':
michael@0: c = '\uFFFD';
michael@0: // fall thru
michael@0: default:
michael@0: /*
michael@0: * Anything else Append the current input
michael@0: * character to the current DOCTYPE token's
michael@0: * public identifier.
michael@0: */
michael@0: appendLongStrBuf(c);
michael@0: /*
michael@0: * Stay in the DOCTYPE public identifier
michael@0: * (single-quoted) state.
michael@0: */
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case PROCESSING_INSTRUCTION:
michael@0: processinginstructionloop: for (;;) {
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: switch (c) {
michael@0: case '?':
michael@0: state = transition(
michael@0: state,
michael@0: Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,
michael@0: reconsume, pos);
michael@0: break processinginstructionloop;
michael@0: // continue stateloop;
michael@0: default:
michael@0: continue;
michael@0: }
michael@0: }
michael@0: case PROCESSING_INSTRUCTION_QUESTION_MARK:
michael@0: if (++pos == endPos) {
michael@0: break stateloop;
michael@0: }
michael@0: c = checkChar(buf, pos);
michael@0: switch (c) {
michael@0: case '>':
michael@0: state = transition(state, Tokenizer.DATA,
michael@0: reconsume, pos);
michael@0: continue stateloop;
michael@0: default:
michael@0: state = transition(state,
michael@0: Tokenizer.PROCESSING_INSTRUCTION,
michael@0: reconsume, pos);
michael@0: continue stateloop;
michael@0: }
michael@0: // END HOTSPOT WORKAROUND
michael@0: }
michael@0: }
michael@0: flushChars(buf, pos);
michael@0: /*
michael@0: * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
michael@0: */
michael@0: // Save locals
michael@0: stateSave = state;
michael@0: returnStateSave = returnState;
michael@0: return pos;
michael@0: }
michael@0:
michael@0: // HOTSPOT WORKAROUND INSERTION POINT
michael@0:
michael@0: // [NOCPP[
michael@0:
michael@0: /**
michael@0: * Yields the state to enter next. This implementation ignores
michael@0: * <code>from</code>, <code>reconsume</code> and <code>pos</code> and hands
michael@0: * back <code>to</code> untouched. NOTE(review): being <code>protected</code>,
michael@0: * this is presumably a hook for subclasses that observe transitions --
michael@0: * confirm against the subclasses.
michael@0: *
michael@0: * @param from the state being left (unused here)
michael@0: * @param to the state being entered
michael@0: * @param reconsume the caller's reconsume flag (unused here)
michael@0: * @param pos the caller's buffer position (unused here)
michael@0: * @return <code>to</code>, unchanged
michael@0: * @throws SAXException declared for overriders; nothing here throws it
michael@0: */
michael@0: protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
michael@0: return to;
michael@0: }
michael@0:
michael@0: // ]NOCPP]
michael@0:
michael@0: /**
michael@0: * Puts the DOCTYPE-related fields back to their starting values: the
michael@0: * force-quirks flag is cleared, the name becomes the empty string, and
michael@0: * any public or system identifier still held is released through
michael@0: * <code>Portability.releaseString</code> and then nulled.
michael@0: */
michael@0: private void initDoctypeFields() {
michael@0: forceQuirks = false;
michael@0: doctypeName = "";
michael@0: if (publicIdentifier != null) {
michael@0: Portability.releaseString(publicIdentifier);
michael@0: publicIdentifier = null;
michael@0: }
michael@0: if (systemIdentifier != null) {
michael@0: Portability.releaseString(systemIdentifier);
michael@0: systemIdentifier = null;
michael@0: }
michael@0: }
michael@0:
michael@0: /**
michael@0: * Handles a carriage return for callers that use the
michael@0: * double-hyphen-adjusting append: does the
michael@0: * <code>silentCarriageReturn()</code> bookkeeping first, then passes a line
michael@0: * feed (not the carriage return itself) to
michael@0: * <code>adjustDoubleHyphenAndAppendToLongStrBufAndErr</code>.
michael@0: *
michael@0: * @throws SAXException propagated from the append helper
michael@0: */
michael@0: @Inline private void adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn()
michael@0: throws SAXException {
michael@0: silentCarriageReturn();
michael@0: adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
michael@0: }
michael@0:
michael@0: /**
michael@0: * Handles a line feed for callers that use the double-hyphen-adjusting
michael@0: * append: does the <code>silentLineFeed()</code> bookkeeping first, then
michael@0: * passes the line feed to
michael@0: * <code>adjustDoubleHyphenAndAppendToLongStrBufAndErr</code>.
michael@0: *
michael@0: * @throws SAXException propagated from the append helper
michael@0: */
michael@0: @Inline private void adjustDoubleHyphenAndAppendToLongStrBufLineFeed()
michael@0: throws SAXException {
michael@0: silentLineFeed();
michael@0: adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
michael@0: }
michael@0:
michael@0: /**
michael@0: * Does the <code>silentLineFeed()</code> bookkeeping and then appends a
michael@0: * line feed to the long string buffer.
michael@0: */
michael@0: @Inline private void appendLongStrBufLineFeed() {
michael@0: silentLineFeed();
michael@0: appendLongStrBuf('\n');
michael@0: }
michael@0:
michael@0: /**
michael@0: * Does the <code>silentCarriageReturn()</code> bookkeeping and then appends
michael@0: * a line feed -- not the carriage return itself -- to the long string
michael@0: * buffer.
michael@0: */
michael@0: @Inline private void appendLongStrBufCarriageReturn() {
michael@0: silentCarriageReturn();
michael@0: appendLongStrBuf('\n');
michael@0: }
michael@0:
michael@0: /**
michael@0: * Records a carriage return without emitting anything: sets
michael@0: * <code>lastCR</code> and advances <code>line</code> by one.
michael@0: */
michael@0: @Inline protected void silentCarriageReturn() {
michael@0: lastCR = true;
michael@0: line++;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Records a line feed without emitting anything: advances
michael@0: * <code>line</code> by one.
michael@0: */
michael@0: @Inline protected void silentLineFeed() {
michael@0: line++;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Reports a carriage return to the token handler as a single line feed.
michael@0: * The steps, in order: <code>silentCarriageReturn()</code> bookkeeping,
michael@0: * <code>flushChars(buf, pos)</code>, one character of
michael@0: * <code>Tokenizer.LF</code> to the handler, and finally
michael@0: * <code>cstart</code> is set to <code>Integer.MAX_VALUE</code>.
michael@0: *
michael@0: * @param buf the buffer being tokenized
michael@0: * @param pos the position of the carriage return in <code>buf</code>
michael@0: * @throws SAXException propagated from flushing or from the token handler
michael@0: */
michael@0: private void emitCarriageReturn(@NoLength char[] buf, int pos)
michael@0: throws SAXException {
michael@0: silentCarriageReturn();
michael@0: flushChars(buf, pos);
michael@0: tokenHandler.characters(Tokenizer.LF, 0, 1);
michael@0: // NOTE(review): MAX_VALUE presumably marks "no open character run"
michael@0: // until the caller assigns a real start -- confirm in flushChars.
michael@0: cstart = Integer.MAX_VALUE;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Flushes via <code>flushChars(buf, pos)</code>, asks the token handler for
michael@0: * a zero-originating replacement character, and restarts the character
michael@0: * run just past <code>pos</code>.
michael@0: *
michael@0: * @param buf the buffer being tokenized
michael@0: * @param pos the position of the character being replaced
michael@0: * @throws SAXException propagated from flushing or from the token handler
michael@0: */
michael@0: private void emitReplacementCharacter(@NoLength char[] buf, int pos)
michael@0: throws SAXException {
michael@0: flushChars(buf, pos);
michael@0: tokenHandler.zeroOriginatingReplacementCharacter();
michael@0: cstart = pos + 1;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Flushes via <code>flushChars(buf, pos)</code>, sends one character of
michael@0: * <code>REPLACEMENT_CHARACTER</code> to the token handler as ordinary
michael@0: * character data, and restarts the character run just past
michael@0: * <code>pos</code>.
michael@0: *
michael@0: * @param buf the buffer being tokenized
michael@0: * @param pos the position of the character being replaced
michael@0: * @throws SAXException propagated from flushing or from the token handler
michael@0: */
michael@0: private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
michael@0: throws SAXException {
michael@0: flushChars(buf, pos);
michael@0: tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
michael@0: cstart = pos + 1;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Stores <code>add</code> in the <code>additional</code> field and, in the
michael@0: * Java-only (NOCPP) section, snapshots the current location into
michael@0: * <code>ampersandLocation</code>.
michael@0: *
michael@0: * @param add the value to store in <code>additional</code>
michael@0: */
michael@0: private void setAdditionalAndRememberAmpersandLocation(char add) {
michael@0: additional = add;
michael@0: // [NOCPP[
michael@0: ampersandLocation = new LocatorImpl(this);
michael@0: // ]NOCPP]
michael@0: }
michael@0:
michael@0: /**
michael@0: * Reports a bogus DOCTYPE through <code>errBogusDoctype()</code> and then
michael@0: * turns the force-quirks flag on.
michael@0: *
michael@0: * @throws SAXException propagated from the error report
michael@0: */
michael@0: private void bogusDoctype() throws SAXException {
michael@0: errBogusDoctype();
michael@0: forceQuirks = true;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Reports a bogus DOCTYPE through <code>errBogusDoctype()</code> and then
michael@0: * turns the force-quirks flag off.
michael@0: *
michael@0: * @throws SAXException propagated from the error report
michael@0: */
michael@0: private void bogusDoctypeWithoutQuirks() throws SAXException {
michael@0: errBogusDoctype();
michael@0: forceQuirks = false;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Disposes of the contents of the short string buffer according to
michael@0: * <code>returnState</code>: when none of the
michael@0: * <code>DATA_AND_RCDATA_MASK</code> bits are set the buffer is emitted via
michael@0: * <code>emitStrBuf()</code>; otherwise it is appended to the long string
michael@0: * buffer.
michael@0: *
michael@0: * @param returnState the state whose mask bits select the destination
michael@0: * @throws SAXException propagated from the calls made here
michael@0: */
michael@0: private void emitOrAppendStrBuf(int returnState) throws SAXException {
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: emitStrBuf();
michael@0: return;
michael@0: }
michael@0: appendStrBufToLongStrBuf();
michael@0: }
michael@0:
michael@0: private void handleNcrValue(int returnState) throws SAXException {
michael@0: /*
michael@0: * Delivers the character(s) for the numeric character reference whose
michael@0: * accumulated number is held in the value field. Delivery always goes
michael@0: * through emitOrAppendOne / emitOrAppendTwo with the given returnState.
michael@0: * The sections bracketed by NOCPP markers are Java-only.
michael@0: *
michael@0: * If one or more characters match the range, then take them all and
michael@0: * interpret the string of characters as a number (either hexadecimal or
michael@0: * decimal as appropriate).
michael@0: */
michael@0: if (value <= 0xFFFF) {
michael@0: if (value >= 0x80 && value <= 0x9f) {
michael@0: /*
michael@0: * If that number is one of the numbers in the first column of
michael@0: * the following table, then this is a parse error.
michael@0: */
michael@0: errNcrInC1Range();
michael@0: /*
michael@0: * Find the row with that number in the first column, and return
michael@0: * a character token for the Unicode character given in the
michael@0: * second column of that row.
michael@0: */
michael@0: @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
michael@0: emitOrAppendOne(val, returnState);
michael@0: // [NOCPP[
michael@0: } else if (value == 0xC
michael@0: && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
michael@0: if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
michael@0: emitOrAppendOne(Tokenizer.SPACE, returnState);
michael@0: } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
michael@0: fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
michael@0: }
michael@0: // ]NOCPP]
michael@0: } else if (value == 0x0) {
michael@0: errNcrZero();
michael@0: emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
michael@0: } else if ((value & 0xF800) == 0xD800) { // U+D800..U+DFFF, the surrogate range
michael@0: errNcrSurrogate();
michael@0: emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
michael@0: } else {
michael@0: /*
michael@0: * Otherwise, return a character token for the Unicode character
michael@0: * whose code point is that number.
michael@0: */
michael@0: char ch = (char) value;
michael@0: // [NOCPP[
michael@0: if (value == 0x0D) {
michael@0: errNcrCr();
michael@0: } else if ((value <= 0x0008) || (value == 0x000B)
michael@0: || (value >= 0x000E && value <= 0x001F)) {
michael@0: ch = errNcrControlChar(ch);
michael@0: } else if (value >= 0xFDD0 && value <= 0xFDEF) {
michael@0: errNcrUnassigned();
michael@0: } else if ((value & 0xFFFE) == 0xFFFE) { // U+FFFE or U+FFFF in this BMP branch
michael@0: ch = errNcrNonCharacter(ch);
michael@0: } else if (value >= 0x007F && value <= 0x009F) { // only 0x7F gets here; 0x80..0x9F was handled above
michael@0: errNcrControlChar();
michael@0: } else {
michael@0: maybeWarnPrivateUse(ch);
michael@0: }
michael@0: // ]NOCPP]
michael@0: bmpChar[0] = ch;
michael@0: emitOrAppendOne(bmpChar, returnState);
michael@0: }
michael@0: } else if (value <= 0x10FFFF) {
michael@0: // [NOCPP[
michael@0: maybeWarnPrivateUseAstral();
michael@0: if ((value & 0xFFFE) == 0xFFFE) { // low 16 bits are FFFE or FFFF
michael@0: errAstralNonCharacter(value);
michael@0: }
michael@0: // ]NOCPP]
michael@0: astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10)); // first code unit of the pair; NOTE(review): assumes LEAD_OFFSET is the UTF-16 lead surrogate offset -- confirm
michael@0: astralChar[1] = (char) (0xDC00 + (value & 0x3FF)); // second code unit: 0xDC00 plus the low ten bits
michael@0: emitOrAppendTwo(astralChar, returnState);
michael@0: } else {
michael@0: errNcrOutOfRange();
michael@0: emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
michael@0: }
michael@0: }
michael@0:
michael@0: public void eof() throws SAXException {
michael@0: int state = stateSave;
michael@0: int returnState = returnStateSave;
michael@0: /*
michael@0: * Handles end of input. Starting from the state saved in stateSave,
michael@0: * each case below reports the applicable EOF error and flushes whatever
michael@0: * token or buffered text is pending. Most cases then leave the loop;
michael@0: * the character reference cases instead set state to returnState and
michael@0: * go around again so that the return state gets its own EOF handling.
michael@0: * After the loop, tokenHandler.eof() is called.
michael@0: */
michael@0: eofloop: for (;;) {
michael@0: switch (state) {
michael@0: case SCRIPT_DATA_LESS_THAN_SIGN:
michael@0: case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
michael@0: /*
michael@0: * Otherwise, emit a U+003C LESS-THAN SIGN character token
michael@0: */
michael@0: tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0: /*
michael@0: * and reconsume the current input character in the data
michael@0: * state.
michael@0: */
michael@0: break eofloop;
michael@0: case TAG_OPEN:
michael@0: /*
michael@0: * The behavior of this state depends on the content model
michael@0: * flag.
michael@0: */
michael@0: /*
michael@0: * Anything else Parse error.
michael@0: */
michael@0: errEofAfterLt();
michael@0: /*
michael@0: * Emit a U+003C LESS-THAN SIGN character token
michael@0: */
michael@0: tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0: /*
michael@0: * and reconsume the current input character in the data
michael@0: * state.
michael@0: */
michael@0: break eofloop;
michael@0: case RAWTEXT_RCDATA_LESS_THAN_SIGN:
michael@0: /*
michael@0: * Emit a U+003C LESS-THAN SIGN character token
michael@0: */
michael@0: tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0: /*
michael@0: * and reconsume the current input character in the RCDATA
michael@0: * state.
michael@0: */
michael@0: break eofloop;
michael@0: case NON_DATA_END_TAG_NAME:
michael@0: /*
michael@0: * Emit a U+003C LESS-THAN SIGN character token, a U+002F
michael@0: * SOLIDUS character token,
michael@0: */
michael@0: tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
michael@0: /*
michael@0: * a character token for each of the characters in the
michael@0: * temporary buffer (in the order they were added to the
michael@0: * buffer),
michael@0: */
michael@0: emitStrBuf();
michael@0: /*
michael@0: * and reconsume the current input character in the RCDATA
michael@0: * state.
michael@0: */
michael@0: break eofloop;
michael@0: case CLOSE_TAG_OPEN:
michael@0: /* EOF Parse error. */
michael@0: errEofAfterLt();
michael@0: /*
michael@0: * Emit a U+003C LESS-THAN SIGN character token and a U+002F
michael@0: * SOLIDUS character token.
michael@0: */
michael@0: tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case TAG_NAME:
michael@0: /*
michael@0: * EOF Parse error.
michael@0: */
michael@0: errEofInTagName();
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case BEFORE_ATTRIBUTE_NAME:
michael@0: case AFTER_ATTRIBUTE_VALUE_QUOTED:
michael@0: case SELF_CLOSING_START_TAG:
michael@0: /* EOF Parse error. */
michael@0: errEofWithoutGt();
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case ATTRIBUTE_NAME:
michael@0: /*
michael@0: * EOF Parse error.
michael@0: */
michael@0: errEofInAttributeName();
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case AFTER_ATTRIBUTE_NAME:
michael@0: case BEFORE_ATTRIBUTE_VALUE:
michael@0: /* EOF Parse error. */
michael@0: errEofWithoutGt();
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
michael@0: case ATTRIBUTE_VALUE_SINGLE_QUOTED:
michael@0: case ATTRIBUTE_VALUE_UNQUOTED:
michael@0: /* EOF Parse error. */
michael@0: errEofInAttributeValue();
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case BOGUS_COMMENT:
michael@0: emitComment(0, 0);
michael@0: break eofloop;
michael@0: case BOGUS_COMMENT_HYPHEN:
michael@0: // [NOCPP[
michael@0: maybeAppendSpaceToBogusComment();
michael@0: // ]NOCPP]
michael@0: emitComment(0, 0);
michael@0: break eofloop;
michael@0: case MARKUP_DECLARATION_OPEN:
michael@0: errBogusComment();
michael@0: clearLongStrBuf();
michael@0: emitComment(0, 0);
michael@0: break eofloop;
michael@0: case MARKUP_DECLARATION_HYPHEN:
michael@0: errBogusComment();
michael@0: emitComment(0, 0);
michael@0: break eofloop;
michael@0: case MARKUP_DECLARATION_OCTYPE:
michael@0: if (index < 6) { // NOTE(review): index presumably counts matched "octype" characters, so < 6 means the keyword was incomplete -- confirm
michael@0: errBogusComment();
michael@0: emitComment(0, 0);
michael@0: } else {
michael@0: /* EOF Parse error. */
michael@0: errEofInDoctype();
michael@0: /*
michael@0: * Create a new DOCTYPE token. Set its force-quirks flag
michael@0: * to on.
michael@0: *
michael@0: * The field resets below mirror initDoctypeFields(),
michael@0: * except that forceQuirks ends up true.
michael@0: */
michael@0: doctypeName = "";
michael@0: if (systemIdentifier != null) {
michael@0: Portability.releaseString(systemIdentifier);
michael@0: systemIdentifier = null;
michael@0: }
michael@0: if (publicIdentifier != null) {
michael@0: Portability.releaseString(publicIdentifier);
michael@0: publicIdentifier = null;
michael@0: }
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit the token.
michael@0: */
michael@0: emitDoctypeToken(0);
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: }
michael@0: break eofloop;
michael@0: case COMMENT_START:
michael@0: case COMMENT:
michael@0: /*
michael@0: * EOF Parse error.
michael@0: */
michael@0: errEofInComment();
michael@0: /* Emit the comment token. */
michael@0: emitComment(0, 0);
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case COMMENT_END:
michael@0: errEofInComment();
michael@0: /* Emit the comment token. */
michael@0: emitComment(2, 0);
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case COMMENT_END_DASH:
michael@0: case COMMENT_START_DASH:
michael@0: errEofInComment();
michael@0: /* Emit the comment token. */
michael@0: emitComment(1, 0);
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case COMMENT_END_BANG:
michael@0: errEofInComment();
michael@0: /* Emit the comment token. */
michael@0: emitComment(3, 0);
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case DOCTYPE:
michael@0: case BEFORE_DOCTYPE_NAME:
michael@0: errEofInDoctype();
michael@0: /*
michael@0: * Create a new DOCTYPE token. Set its force-quirks flag to
michael@0: * on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit the token.
michael@0: */
michael@0: emitDoctypeToken(0);
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case DOCTYPE_NAME:
michael@0: errEofInDoctype();
michael@0: strBufToDoctypeName();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: emitDoctypeToken(0);
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case DOCTYPE_UBLIC:
michael@0: case DOCTYPE_YSTEM:
michael@0: case AFTER_DOCTYPE_NAME:
michael@0: case AFTER_DOCTYPE_PUBLIC_KEYWORD:
michael@0: case AFTER_DOCTYPE_SYSTEM_KEYWORD:
michael@0: case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
michael@0: errEofInDoctype();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: emitDoctypeToken(0);
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
michael@0: case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
michael@0: /* EOF Parse error. */
michael@0: errEofInPublicId();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: publicIdentifier = longStrBufToString();
michael@0: emitDoctypeToken(0);
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
michael@0: case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
michael@0: case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
michael@0: errEofInDoctype();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: emitDoctypeToken(0);
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
michael@0: case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
michael@0: /* EOF Parse error. */
michael@0: errEofInSystemId();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: systemIdentifier = longStrBufToString();
michael@0: emitDoctypeToken(0);
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
michael@0: errEofInDoctype();
michael@0: /*
michael@0: * Set the DOCTYPE token's force-quirks flag to on.
michael@0: */
michael@0: forceQuirks = true;
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: emitDoctypeToken(0);
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case BOGUS_DOCTYPE:
michael@0: /*
michael@0: * Emit that DOCTYPE token.
michael@0: */
michael@0: emitDoctypeToken(0);
michael@0: /*
michael@0: * Reconsume the EOF character in the data state.
michael@0: */
michael@0: break eofloop;
michael@0: case CONSUME_CHARACTER_REFERENCE:
michael@0: /*
michael@0: * Unlike the definition in the spec, this state does not
michael@0: * return a value and never requires the caller to
michael@0: * backtrack. This state takes care of emitting characters
michael@0: * or appending to the current attribute value. It also
michael@0: * takes care of that in the case when consuming the entity
michael@0: * fails.
michael@0: */
michael@0: /*
michael@0: * This section defines how to consume an entity. This
michael@0: * definition is used when parsing entities in text and in
michael@0: * attributes.
michael@0: *
michael@0: * The behavior depends on the identity of the next
michael@0: * character (the one immediately after the U+0026 AMPERSAND
michael@0: * character):
michael@0: */
michael@0:
michael@0: emitOrAppendStrBuf(returnState);
michael@0: state = returnState;
michael@0: continue; // go around eofloop again, now dispatching on the return state
michael@0: case CHARACTER_REFERENCE_HILO_LOOKUP:
michael@0: errNoNamedCharacterMatch();
michael@0: emitOrAppendStrBuf(returnState);
michael@0: state = returnState;
michael@0: continue;
michael@0: case CHARACTER_REFERENCE_TAIL:
michael@0: outer: for (;;) {
michael@0: char c = '\u0000'; // never reassigned: at EOF there is no further input character to compare
michael@0: entCol++;
michael@0: /*
michael@0: * Consume the maximum number of characters possible,
michael@0: * with the consumed characters matching one of the
michael@0: * identifiers in the first column of the named
michael@0: * character references table (in a case-sensitive
michael@0: * manner).
michael@0: */
michael@0: hiloop: for (;;) {
michael@0: if (hi == -1) {
michael@0: break hiloop;
michael@0: }
michael@0: if (entCol == NamedCharacters.NAMES[hi].length()) {
michael@0: break hiloop;
michael@0: }
michael@0: if (entCol > NamedCharacters.NAMES[hi].length()) {
michael@0: break outer;
michael@0: } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
michael@0: hi--;
michael@0: } else {
michael@0: break hiloop;
michael@0: }
michael@0: }
michael@0:
michael@0: loloop: for (;;) {
michael@0: if (hi < lo) {
michael@0: break outer;
michael@0: }
michael@0: if (entCol == NamedCharacters.NAMES[lo].length()) {
michael@0: candidate = lo; // the name at lo has been matched in full; remember it and how much of strBuf it covers
michael@0: strBufMark = strBufLen;
michael@0: lo++;
michael@0: } else if (entCol > NamedCharacters.NAMES[lo].length()) {
michael@0: break outer;
michael@0: } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
michael@0: lo++;
michael@0: } else {
michael@0: break loloop;
michael@0: }
michael@0: }
michael@0: if (hi < lo) {
michael@0: break outer;
michael@0: }
michael@0: continue; // next iteration of the outer loop (entCol advances again)
michael@0: }
michael@0:
michael@0: if (candidate == -1) {
michael@0: /*
michael@0: * If no match can be made, then this is a parse error.
michael@0: */
michael@0: errNoNamedCharacterMatch();
michael@0: emitOrAppendStrBuf(returnState);
michael@0: state = returnState;
michael@0: continue eofloop;
michael@0: } else {
michael@0: @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
michael@0: if (candidateName.length() == 0
michael@0: || candidateName.charAt(candidateName.length() - 1) != ';') {
michael@0: /*
michael@0: * If the last character matched is not a U+003B
michael@0: * SEMICOLON (;), there is a parse error.
michael@0: */
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0: /*
michael@0: * If the entity is being consumed as part of an
michael@0: * attribute, and the last character matched is
michael@0: * not a U+003B SEMICOLON (;),
michael@0: */
michael@0: char ch;
michael@0: if (strBufMark == strBufLen) {
michael@0: ch = '\u0000';
michael@0: } else {
michael@0: ch = strBuf[strBufMark];
michael@0: }
michael@0: if ((ch >= '0' && ch <= '9')
michael@0: || (ch >= 'A' && ch <= 'Z')
michael@0: || (ch >= 'a' && ch <= 'z')) {
michael@0: /*
michael@0: * and the next character is in the range
michael@0: * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
michael@0: * U+0041 LATIN CAPITAL LETTER A to U+005A
michael@0: * LATIN CAPITAL LETTER Z, or U+0061 LATIN
michael@0: * SMALL LETTER A to U+007A LATIN SMALL
michael@0: * LETTER Z, then, for historical reasons,
michael@0: * all the characters that were matched
michael@0: * after the U+0026 AMPERSAND (&) must be
michael@0: * unconsumed, and nothing is returned.
michael@0: */
michael@0: errNoNamedCharacterMatch();
michael@0: appendStrBufToLongStrBuf();
michael@0: state = returnState;
michael@0: continue eofloop;
michael@0: }
michael@0: }
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0: errUnescapedAmpersandInterpretedAsCharacterReference();
michael@0: } else {
michael@0: errNotSemicolonTerminated();
michael@0: }
michael@0: }
michael@0:
michael@0: /*
michael@0: * Otherwise, return a character token for the character
michael@0: * corresponding to the entity name (as given by the
michael@0: * second column of the named character references
michael@0: * table).
michael@0: */
michael@0: @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
michael@0: if (
michael@0: // [NOCPP[
michael@0: val.length == 1
michael@0: // ]NOCPP]
michael@0: // CPPONLY: val[1] == 0
michael@0: ) {
michael@0: emitOrAppendOne(val, returnState);
michael@0: } else {
michael@0: emitOrAppendTwo(val, returnState);
michael@0: }
michael@0: // this is so complicated!
michael@0: if (strBufMark < strBufLen) { // characters buffered beyond the matched name are passed through as-is
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0: for (int i = strBufMark; i < strBufLen; i++) {
michael@0: appendLongStrBuf(strBuf[i]);
michael@0: }
michael@0: } else {
michael@0: tokenHandler.characters(strBuf, strBufMark,
michael@0: strBufLen - strBufMark);
michael@0: }
michael@0: }
michael@0: state = returnState;
michael@0: continue eofloop;
michael@0: /*
michael@0: * If the markup contains I'm &notit; I tell you, the
michael@0: * entity is parsed as "not", as in, I'm ¬it; I tell
michael@0: * you. But if the markup was I'm &notin; I tell you,
michael@0: * the entity would be parsed as "notin;", resulting in
michael@0: * I'm ∉ I tell you.
michael@0: */
michael@0: }
michael@0: case CONSUME_NCR:
michael@0: case DECIMAL_NRC_LOOP:
michael@0: case HEX_NCR_LOOP:
michael@0: /*
michael@0: * If no characters match the range, then don't consume any
michael@0: * characters (and unconsume the U+0023 NUMBER SIGN
michael@0: * character and, if appropriate, the X character). This is
michael@0: * a parse error; nothing is returned.
michael@0: *
michael@0: * Otherwise, if the next character is a U+003B SEMICOLON,
michael@0: * consume that too. If it isn't, there is a parse error.
michael@0: */
michael@0: if (!seenDigits) {
michael@0: errNoDigitsInNCR();
michael@0: emitOrAppendStrBuf(returnState);
michael@0: state = returnState;
michael@0: continue;
michael@0: } else {
michael@0: errCharRefLacksSemicolon();
michael@0: }
michael@0: // WARNING previous state sets reconsume
michael@0: handleNcrValue(returnState);
michael@0: state = returnState;
michael@0: continue;
michael@0: case CDATA_RSQB:
michael@0: tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
michael@0: break eofloop;
michael@0: case CDATA_RSQB_RSQB:
michael@0: tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
michael@0: break eofloop;
michael@0: case DATA:
michael@0: default:
michael@0: break eofloop;
michael@0: }
michael@0: }
michael@0: // case DATA:
michael@0: /*
michael@0: * EOF Emit an end-of-file token.
michael@0: */
michael@0: tokenHandler.eof();
michael@0: return;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Hands the accumulated DOCTYPE (name, public identifier, system
michael@0: * identifier, force-quirks flag) to the token handler and then drops this
michael@0: * tokenizer's references to the three strings. Before the hand-off,
michael@0: * <code>cstart</code> is moved to just past <code>pos</code>.
michael@0: *
michael@0: * @param pos the buffer position the DOCTYPE ends at
michael@0: * @throws SAXException propagated from the token handler
michael@0: */
michael@0: private void emitDoctypeToken(int pos) throws SAXException {
michael@0: cstart = pos + 1;
michael@0: tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
michael@0: forceQuirks);
michael@0: // Releasing here is both safe and enough: every path that leaves
michael@0: // the doctype states goes through this method.
michael@0: doctypeName = null;
michael@0: Portability.releaseString(publicIdentifier);
michael@0: publicIdentifier = null;
michael@0: Portability.releaseString(systemIdentifier);
michael@0: systemIdentifier = null;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Fetches the character at <code>pos</code>. This implementation performs
michael@0: * no checking at all; it only reads the array element.
michael@0: *
michael@0: * @param buf the buffer being tokenized
michael@0: * @param pos the index to read
michael@0: * @return <code>buf[pos]</code>
michael@0: * @throws SAXException declared for overriders; nothing here throws it
michael@0: */
michael@0: @Inline protected char checkChar(@NoLength char[] buf, int pos)
michael@0: throws SAXException {
michael@0: return buf[pos];
michael@0: }
michael@0:
michael@0: /**
michael@0: * Forwards an internal encoding declaration to the registered
michael@0: * <code>encodingDeclarationHandler</code>, if there is one.
michael@0: *
michael@0: * @param internalCharset the charset name that was declared
michael@0: * @return whatever the handler returns, or <code>false</code> when no
michael@0: * handler is registered
michael@0: * @throws SAXException propagated from the handler
michael@0: */
michael@0: public boolean internalEncodingDeclaration(String internalCharset)
michael@0: throws SAXException {
michael@0: if (encodingDeclarationHandler == null) {
michael@0: return false;
michael@0: }
michael@0: return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
michael@0: }
michael@0:
michael@0: /**
michael@0: * Delivers the first two elements of <code>val</code>. When none of the
michael@0: * <code>DATA_AND_RCDATA_MASK</code> bits are set in
michael@0: * <code>returnState</code> they go to the token handler as character
michael@0: * data; otherwise they are appended, in order, to the long string buffer.
michael@0: *
michael@0: * @param val array whose elements 0 and 1 are delivered
michael@0: * @param returnState the state whose mask bits select the destination
michael@0: * @throws SAXException propagated from the calls made here
michael@0: */
michael@0: private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
michael@0: throws SAXException {
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: tokenHandler.characters(val, 0, 2);
michael@0: return;
michael@0: }
michael@0: appendLongStrBuf(val[0]);
michael@0: appendLongStrBuf(val[1]);
michael@0: }
michael@0:
michael@0: /**
michael@0: * Delivers the first element of <code>val</code>. When none of the
michael@0: * <code>DATA_AND_RCDATA_MASK</code> bits are set in
michael@0: * <code>returnState</code> it goes to the token handler as character
michael@0: * data; otherwise it is appended to the long string buffer.
michael@0: *
michael@0: * @param val array whose element 0 is delivered
michael@0: * @param returnState the state whose mask bits select the destination
michael@0: * @throws SAXException propagated from the calls made here
michael@0: */
michael@0: private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
michael@0: throws SAXException {
michael@0: if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0: tokenHandler.characters(val, 0, 1);
michael@0: return;
michael@0: }
michael@0: appendLongStrBuf(val[0]);
michael@0: }
michael@0:
michael@0: /**
michael@0: * Tears the tokenizer down: drops both string buffers and the DOCTYPE
michael@0: * name, releases any identifiers, tag name and attribute name still held,
michael@0: * tells the token handler that tokenization has ended, and finally
michael@0: * disposes of the attribute holder (nulled in Java; cleared in the C++
michael@0: * translation, per the CPPONLY line).
michael@0: *
michael@0: * @throws SAXException propagated from the token handler
michael@0: */
michael@0: public void end() throws SAXException {
michael@0: // Buffers and plain references.
michael@0: strBuf = null;
michael@0: longStrBuf = null;
michael@0: doctypeName = null;
michael@0: // Strings that go through Portability for release.
michael@0: if (systemIdentifier != null) {
michael@0: Portability.releaseString(systemIdentifier);
michael@0: systemIdentifier = null;
michael@0: }
michael@0: if (publicIdentifier != null) {
michael@0: Portability.releaseString(publicIdentifier);
michael@0: publicIdentifier = null;
michael@0: }
michael@0: // Name objects that release themselves.
michael@0: if (tagName != null) {
michael@0: tagName.release();
michael@0: tagName = null;
michael@0: }
michael@0: if (attributeName != null) {
michael@0: attributeName.release();
michael@0: attributeName = null;
michael@0: }
michael@0: // The handler is notified before the attribute holder is let go.
michael@0: tokenHandler.endTokenization();
michael@0: if (attributes != null) {
michael@0: // [NOCPP[
michael@0: attributes = null;
michael@0: // ]NOCPP]
michael@0: // CPPONLY: attributes.clear(mappingLangToXmlLang);
michael@0: }
michael@0: }
michael@0:
michael@0: /**
michael@0: * Raises the <code>shouldSuspend</code> flag. This method only sets the
michael@0: * flag; acting on it happens elsewhere.
michael@0: */
michael@0: public void requestSuspension() {
michael@0: shouldSuspend = true;
michael@0: }
michael@0:
michael@0: // [NOCPP[
michael@0:
michael@0: /**
michael@0: * Raises the <code>confident</code> flag.
michael@0: */
michael@0: public void becomeConfident() {
michael@0: confident = true;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Reports whether the next character starts a new line. This
michael@0: * implementation does not track that and always answers
michael@0: * <code>false</code>.
michael@0: *
michael@0: * @return <code>false</code>, always
michael@0: */
michael@0: public boolean isNextCharOnNewLine() {
michael@0: return false;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Exposes the <code>lastCR</code> flag.
michael@0: *
michael@0: * @return the current value of <code>lastCR</code>
michael@0: */
michael@0: public boolean isPrevCR() {
michael@0: return lastCR;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Reports the current line number. This implementation does not expose
michael@0: * one and always answers <code>-1</code>.
michael@0: *
michael@0: * @return <code>-1</code>, always
michael@0: */
michael@0: public int getLine() {
michael@0: return -1;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Reports the current column number. This implementation does not expose
michael@0: * one and always answers <code>-1</code>.
michael@0: *
michael@0: * @return <code>-1</code>, always
michael@0: */
michael@0: public int getCol() {
michael@0: return -1;
michael@0: }
michael@0:
michael@0: // ]NOCPP]
michael@0:
michael@0: /**
michael@0: * Tells whether the saved tokenizer state is the data state.
michael@0: *
michael@0: * @return <code>true</code> when <code>stateSave</code> equals
michael@0: * <code>DATA</code>
michael@0: */
michael@0: public boolean isInDataState() {
michael@0: return stateSave == DATA;
michael@0: }
michael@0:
michael@0: public void resetToDataState() {
michael@0: strBufLen = 0;
michael@0: longStrBufLen = 0;
michael@0: stateSave = Tokenizer.DATA;
michael@0: // line = 1; XXX line numbers
michael@0: lastCR = false;
michael@0: index = 0;
michael@0: forceQuirks = false;
michael@0: additional = '\u0000';
michael@0: entCol = -1;
michael@0: firstCharKey = -1;
michael@0: lo = 0;
michael@0: hi = 0; // will always be overwritten before use anyway
michael@0: candidate = -1;
michael@0: strBufMark = 0;
michael@0: prevValue = -1;
michael@0: value = 0;
michael@0: seenDigits = false;
michael@0: endTag = false;
michael@0: shouldSuspend = false;
michael@0: initDoctypeFields();
michael@0: if (tagName != null) {
michael@0: tagName.release();
michael@0: tagName = null;
michael@0: }
michael@0: if (attributeName != null) {
michael@0: attributeName.release();
michael@0: attributeName = null;
michael@0: }
michael@0: if (newAttributesEachTime) {
michael@0: if (attributes != null) {
michael@0: Portability.delete(attributes);
michael@0: attributes = null;
michael@0: }
michael@0: }
michael@0: }
michael@0:
michael@0: public void loadState(Tokenizer other) throws SAXException {
michael@0: strBufLen = other.strBufLen;
michael@0: if (strBufLen > strBuf.length) {
michael@0: strBuf = new char[strBufLen];
michael@0: }
michael@0: System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);
michael@0:
michael@0: longStrBufLen = other.longStrBufLen;
michael@0: if (longStrBufLen > longStrBuf.length) {
michael@0: longStrBuf = new char[longStrBufLen];
michael@0: }
michael@0: System.arraycopy(other.longStrBuf, 0, longStrBuf, 0, longStrBufLen);
michael@0:
michael@0: stateSave = other.stateSave;
michael@0: returnStateSave = other.returnStateSave;
michael@0: endTagExpectation = other.endTagExpectation;
michael@0: endTagExpectationAsArray = other.endTagExpectationAsArray;
michael@0: // line = 1; XXX line numbers
michael@0: lastCR = other.lastCR;
michael@0: index = other.index;
michael@0: forceQuirks = other.forceQuirks;
michael@0: additional = other.additional;
michael@0: entCol = other.entCol;
michael@0: firstCharKey = other.firstCharKey;
michael@0: lo = other.lo;
michael@0: hi = other.hi;
michael@0: candidate = other.candidate;
michael@0: strBufMark = other.strBufMark;
michael@0: prevValue = other.prevValue;
michael@0: value = other.value;
michael@0: seenDigits = other.seenDigits;
michael@0: endTag = other.endTag;
michael@0: shouldSuspend = false;
michael@0:
michael@0: if (other.doctypeName == null) {
michael@0: doctypeName = null;
michael@0: } else {
michael@0: doctypeName = Portability.newLocalFromLocal(other.doctypeName,
michael@0: interner);
michael@0: }
michael@0:
michael@0: Portability.releaseString(systemIdentifier);
michael@0: if (other.systemIdentifier == null) {
michael@0: systemIdentifier = null;
michael@0: } else {
michael@0: systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
michael@0: }
michael@0:
michael@0: Portability.releaseString(publicIdentifier);
michael@0: if (other.publicIdentifier == null) {
michael@0: publicIdentifier = null;
michael@0: } else {
michael@0: publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
michael@0: }
michael@0:
michael@0: if (tagName != null) {
michael@0: tagName.release();
michael@0: }
michael@0: if (other.tagName == null) {
michael@0: tagName = null;
michael@0: } else {
michael@0: tagName = other.tagName.cloneElementName(interner);
michael@0: }
michael@0:
michael@0: if (attributeName != null) {
michael@0: attributeName.release();
michael@0: }
michael@0: if (other.attributeName == null) {
michael@0: attributeName = null;
michael@0: } else {
michael@0: attributeName = other.attributeName.cloneAttributeName(interner);
michael@0: }
michael@0:
michael@0: Portability.delete(attributes);
michael@0: if (other.attributes == null) {
michael@0: attributes = null;
michael@0: } else {
michael@0: attributes = other.attributes.cloneAttributes(interner);
michael@0: }
michael@0: }
michael@0:
michael@0: public void initializeWithoutStarting() throws SAXException {
michael@0: confident = false;
michael@0: strBuf = new char[64];
michael@0: longStrBuf = new char[1024];
michael@0: line = 1;
michael@0: // [NOCPP[
michael@0: html4 = false;
michael@0: metaBoundaryPassed = false;
michael@0: wantsComments = tokenHandler.wantsComments();
michael@0: if (!newAttributesEachTime) {
michael@0: attributes = new HtmlAttributes(mappingLangToXmlLang);
michael@0: }
michael@0: // ]NOCPP]
michael@0: resetToDataState();
michael@0: }
michael@0:
michael@0: protected void errGarbageAfterLtSlash() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errLtSlashGt() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errWarnLtSlashInRcdata() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errCharRefLacksSemicolon() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errNoDigitsInNCR() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errGtInSystemId() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errGtInPublicId() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errNamelessDoctype() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errConsecutiveHyphens() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errPrematureEndOfComment() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errBogusComment() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errSlashNotFollowedByGt() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errHtml4XmlVoidSyntax() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errNoSpaceBetweenAttributes() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errHtml4NonNameInUnquotedAttribute(char c)
michael@0: throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
michael@0: throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errAttributeValueMissing() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errBadCharBeforeAttributeNameOrNull(char c)
michael@0: throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errEqualsSignBeforeAttributeName() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errBadCharAfterLt(char c) throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errLtGt() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errProcessingInstruction() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errUnescapedAmpersandInterpretedAsCharacterReference()
michael@0: throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errNotSemicolonTerminated() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errNoNamedCharacterMatch() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errQuoteBeforeAttributeName(char c) throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errQuoteOrLtInAttributeNameOrNull(char c)
michael@0: throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errExpectedPublicId() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errBogusDoctype() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void maybeWarnPrivateUseAstral() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void maybeWarnPrivateUse(char ch) throws SAXException {
michael@0: }
michael@0:
michael@0: protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
michael@0: throws SAXException {
michael@0: }
michael@0:
michael@0: protected void maybeErrSlashInEndTag(boolean selfClosing)
michael@0: throws SAXException {
michael@0: }
michael@0:
michael@0: protected char errNcrNonCharacter(char ch) throws SAXException {
michael@0: return ch;
michael@0: }
michael@0:
michael@0: protected void errAstralNonCharacter(int ch) throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errNcrSurrogate() throws SAXException {
michael@0: }
michael@0:
michael@0: protected char errNcrControlChar(char ch) throws SAXException {
michael@0: return ch;
michael@0: }
michael@0:
michael@0: protected void errNcrCr() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errNcrInC1Range() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errEofInPublicId() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errEofInComment() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errEofInDoctype() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errEofInAttributeValue() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errEofInAttributeName() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errEofWithoutGt() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errEofInTagName() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errEofInEndTag() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errEofAfterLt() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errNcrOutOfRange() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errNcrUnassigned() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errDuplicateAttribute() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errEofInSystemId() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errExpectedSystemId() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errHyphenHyphenBang() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errNcrControlChar() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errNcrZero() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
michael@0: throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
michael@0: throws SAXException {
michael@0: }
michael@0:
michael@0: protected void noteAttributeWithoutValue() throws SAXException {
michael@0: }
michael@0:
michael@0: protected void noteUnquotedAttributeValue() throws SAXException {
michael@0: }
michael@0:
michael@0: /**
michael@0: * Sets the encodingDeclarationHandler.
michael@0: *
michael@0: * @param encodingDeclarationHandler
michael@0: * the encodingDeclarationHandler to set
michael@0: */
michael@0: public void setEncodingDeclarationHandler(
michael@0: EncodingDeclarationHandler encodingDeclarationHandler) {
michael@0: this.encodingDeclarationHandler = encodingDeclarationHandler;
michael@0: }
michael@0:
michael@0: void destructor() {
michael@0: // The translator will write refcount tracing stuff here
michael@0: Portability.delete(attributes);
michael@0: attributes = null;
michael@0: }
michael@0:
michael@0: // [NOCPP[
michael@0:
michael@0: /**
michael@0: * Sets an offset to be added to the position reported to
michael@0: * TransitionHandler
.
michael@0: *
michael@0: * @param offset the offset
michael@0: */
michael@0: public void setTransitionBaseOffset(int offset) {
michael@0:
michael@0: }
michael@0:
michael@0: // ]NOCPP]
michael@0:
michael@0: }