1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/parser/html/javasrc/Tokenizer.java Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,7021 @@ 1.4 +/* 1.5 + * Copyright (c) 2005-2007 Henri Sivonen 1.6 + * Copyright (c) 2007-2013 Mozilla Foundation 1.7 + * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla 1.8 + * Foundation, and Opera Software ASA. 1.9 + * 1.10 + * Permission is hereby granted, free of charge, to any person obtaining a 1.11 + * copy of this software and associated documentation files (the "Software"), 1.12 + * to deal in the Software without restriction, including without limitation 1.13 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 1.14 + * and/or sell copies of the Software, and to permit persons to whom the 1.15 + * Software is furnished to do so, subject to the following conditions: 1.16 + * 1.17 + * The above copyright notice and this permission notice shall be included in 1.18 + * all copies or substantial portions of the Software. 1.19 + * 1.20 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1.21 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1.22 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1.23 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1.24 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 1.25 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 1.26 + * DEALINGS IN THE SOFTWARE. 1.27 + */ 1.28 + 1.29 +/* 1.30 + * The comments following this one that use the same comment syntax as this 1.31 + * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 1.32 + * amended as of June 18 2008 and May 31 2010. 1.33 + * That document came with this statement: 1.34 + * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and 1.35 + * Opera Software ASA. 
You are granted a license to use, reproduce and 1.36 + * create derivative works of this document." 1.37 + */ 1.38 + 1.39 +package nu.validator.htmlparser.impl; 1.40 + 1.41 +import nu.validator.htmlparser.annotation.Auto; 1.42 +import nu.validator.htmlparser.annotation.CharacterName; 1.43 +import nu.validator.htmlparser.annotation.Const; 1.44 +import nu.validator.htmlparser.annotation.Inline; 1.45 +import nu.validator.htmlparser.annotation.Local; 1.46 +import nu.validator.htmlparser.annotation.NoLength; 1.47 +import nu.validator.htmlparser.common.EncodingDeclarationHandler; 1.48 +import nu.validator.htmlparser.common.Interner; 1.49 +import nu.validator.htmlparser.common.TokenHandler; 1.50 +import nu.validator.htmlparser.common.XmlViolationPolicy; 1.51 + 1.52 +import org.xml.sax.ErrorHandler; 1.53 +import org.xml.sax.Locator; 1.54 +import org.xml.sax.SAXException; 1.55 +import org.xml.sax.SAXParseException; 1.56 + 1.57 +/** 1.58 + * An implementation of 1.59 + * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html 1.60 + * 1.61 + * This class implements the <code>Locator</code> interface. This is not an 1.62 + * incidental implementation detail: Users of this class are encouraged to make 1.63 + * use of the <code>Locator</code> nature. 1.64 + * 1.65 + * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer 1.66 + * can be configured to treat these conditions as fatal or to coerce the infoset 1.67 + * to something that XML 1.0 allows. 
1.68 + * 1.69 + * @version $Id$ 1.70 + * @author hsivonen 1.71 + */ 1.72 +public class Tokenizer implements Locator { 1.73 + 1.74 + private static final int DATA_AND_RCDATA_MASK = ~1; 1.75 + 1.76 + public static final int DATA = 0; 1.77 + 1.78 + public static final int RCDATA = 1; 1.79 + 1.80 + public static final int SCRIPT_DATA = 2; 1.81 + 1.82 + public static final int RAWTEXT = 3; 1.83 + 1.84 + public static final int SCRIPT_DATA_ESCAPED = 4; 1.85 + 1.86 + public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5; 1.87 + 1.88 + public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6; 1.89 + 1.90 + public static final int ATTRIBUTE_VALUE_UNQUOTED = 7; 1.91 + 1.92 + public static final int PLAINTEXT = 8; 1.93 + 1.94 + public static final int TAG_OPEN = 9; 1.95 + 1.96 + public static final int CLOSE_TAG_OPEN = 10; 1.97 + 1.98 + public static final int TAG_NAME = 11; 1.99 + 1.100 + public static final int BEFORE_ATTRIBUTE_NAME = 12; 1.101 + 1.102 + public static final int ATTRIBUTE_NAME = 13; 1.103 + 1.104 + public static final int AFTER_ATTRIBUTE_NAME = 14; 1.105 + 1.106 + public static final int BEFORE_ATTRIBUTE_VALUE = 15; 1.107 + 1.108 + public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16; 1.109 + 1.110 + public static final int BOGUS_COMMENT = 17; 1.111 + 1.112 + public static final int MARKUP_DECLARATION_OPEN = 18; 1.113 + 1.114 + public static final int DOCTYPE = 19; 1.115 + 1.116 + public static final int BEFORE_DOCTYPE_NAME = 20; 1.117 + 1.118 + public static final int DOCTYPE_NAME = 21; 1.119 + 1.120 + public static final int AFTER_DOCTYPE_NAME = 22; 1.121 + 1.122 + public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23; 1.123 + 1.124 + public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24; 1.125 + 1.126 + public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25; 1.127 + 1.128 + public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26; 1.129 + 1.130 + public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 
27; 1.131 + 1.132 + public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28; 1.133 + 1.134 + public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29; 1.135 + 1.136 + public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30; 1.137 + 1.138 + public static final int BOGUS_DOCTYPE = 31; 1.139 + 1.140 + public static final int COMMENT_START = 32; 1.141 + 1.142 + public static final int COMMENT_START_DASH = 33; 1.143 + 1.144 + public static final int COMMENT = 34; 1.145 + 1.146 + public static final int COMMENT_END_DASH = 35; 1.147 + 1.148 + public static final int COMMENT_END = 36; 1.149 + 1.150 + public static final int COMMENT_END_BANG = 37; 1.151 + 1.152 + public static final int NON_DATA_END_TAG_NAME = 38; 1.153 + 1.154 + public static final int MARKUP_DECLARATION_HYPHEN = 39; 1.155 + 1.156 + public static final int MARKUP_DECLARATION_OCTYPE = 40; 1.157 + 1.158 + public static final int DOCTYPE_UBLIC = 41; 1.159 + 1.160 + public static final int DOCTYPE_YSTEM = 42; 1.161 + 1.162 + public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43; 1.163 + 1.164 + public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44; 1.165 + 1.166 + public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45; 1.167 + 1.168 + public static final int CONSUME_CHARACTER_REFERENCE = 46; 1.169 + 1.170 + public static final int CONSUME_NCR = 47; 1.171 + 1.172 + public static final int CHARACTER_REFERENCE_TAIL = 48; 1.173 + 1.174 + public static final int HEX_NCR_LOOP = 49; 1.175 + 1.176 + public static final int DECIMAL_NRC_LOOP = 50; 1.177 + 1.178 + public static final int HANDLE_NCR_VALUE = 51; 1.179 + 1.180 + public static final int HANDLE_NCR_VALUE_RECONSUME = 52; 1.181 + 1.182 + public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53; 1.183 + 1.184 + public static final int SELF_CLOSING_START_TAG = 54; 1.185 + 1.186 + public static final int CDATA_START = 55; 1.187 + 1.188 + public static final int CDATA_SECTION = 56; 1.189 + 1.190 + public 
static final int CDATA_RSQB = 57; 1.191 + 1.192 + public static final int CDATA_RSQB_RSQB = 58; 1.193 + 1.194 + public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59; 1.195 + 1.196 + public static final int SCRIPT_DATA_ESCAPE_START = 60; 1.197 + 1.198 + public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61; 1.199 + 1.200 + public static final int SCRIPT_DATA_ESCAPED_DASH = 62; 1.201 + 1.202 + public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63; 1.203 + 1.204 + public static final int BOGUS_COMMENT_HYPHEN = 64; 1.205 + 1.206 + public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65; 1.207 + 1.208 + public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66; 1.209 + 1.210 + public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67; 1.211 + 1.212 + public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68; 1.213 + 1.214 + public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69; 1.215 + 1.216 + public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70; 1.217 + 1.218 + public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71; 1.219 + 1.220 + public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72; 1.221 + 1.222 + public static final int PROCESSING_INSTRUCTION = 73; 1.223 + 1.224 + public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74; 1.225 + 1.226 + /** 1.227 + * Magic value for UTF-16 operations. 1.228 + */ 1.229 + private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10)); 1.230 + 1.231 + /** 1.232 + * UTF-16 code unit array containing less than and greater than for emitting 1.233 + * those characters on certain parse errors. 1.234 + */ 1.235 + private static final @NoLength char[] LT_GT = { '<', '>' }; 1.236 + 1.237 + /** 1.238 + * UTF-16 code unit array containing less than and solidus for emitting 1.239 + * those characters on certain parse errors. 
1.240 + */ 1.241 + private static final @NoLength char[] LT_SOLIDUS = { '<', '/' }; 1.242 + 1.243 + /** 1.244 + * UTF-16 code unit array containing ]] for emitting those characters on 1.245 + * state transitions. 1.246 + */ 1.247 + private static final @NoLength char[] RSQB_RSQB = { ']', ']' }; 1.248 + 1.249 + /** 1.250 + * Array version of U+FFFD. 1.251 + */ 1.252 + private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' }; 1.253 + 1.254 + // [NOCPP[ 1.255 + 1.256 + /** 1.257 + * Array version of space. 1.258 + */ 1.259 + private static final @NoLength char[] SPACE = { ' ' }; 1.260 + 1.261 + // ]NOCPP] 1.262 + 1.263 + /** 1.264 + * Array version of line feed. 1.265 + */ 1.266 + private static final @NoLength char[] LF = { '\n' }; 1.267 + 1.268 + /** 1.269 + * Buffer growth parameter. 1.270 + */ 1.271 + private static final int BUFFER_GROW_BY = 1024; 1.272 + 1.273 + /** 1.274 + * "CDATA[" as <code>char[]</code> 1.275 + */ 1.276 + private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T', 1.277 + 'A', '[' }; 1.278 + 1.279 + /** 1.280 + * "octype" as <code>char[]</code> 1.281 + */ 1.282 + private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p', 1.283 + 'e' }; 1.284 + 1.285 + /** 1.286 + * "ublic" as <code>char[]</code> 1.287 + */ 1.288 + private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' }; 1.289 + 1.290 + /** 1.291 + * "ystem" as <code>char[]</code> 1.292 + */ 1.293 + private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' }; 1.294 + 1.295 + private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' }; 1.296 + 1.297 + private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' }; 1.298 + 1.299 + private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' }; 1.300 + 1.301 + private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't', 1.302 + 'e', 'x', 't' }; 1.303 + 1.304 + private static final char[] XMP_ARR = { 'x', 'm', 'p' }; 1.305 + 1.306 
+ private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r', 1.307 + 'e', 'a' }; 1.308 + 1.309 + private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' }; 1.310 + 1.311 + private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e', 1.312 + 'd' }; 1.313 + 1.314 + private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i', 1.315 + 'p', 't' }; 1.316 + 1.317 + private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm', 1.318 + 'e', 's' }; 1.319 + 1.320 + /** 1.321 + * The token handler. 1.322 + */ 1.323 + protected final TokenHandler tokenHandler; 1.324 + 1.325 + protected EncodingDeclarationHandler encodingDeclarationHandler; 1.326 + 1.327 + // [NOCPP[ 1.328 + 1.329 + /** 1.330 + * The error handler. 1.331 + */ 1.332 + protected ErrorHandler errorHandler; 1.333 + 1.334 + // ]NOCPP] 1.335 + 1.336 + /** 1.337 + * Whether the previous char read was CR. 1.338 + */ 1.339 + protected boolean lastCR; 1.340 + 1.341 + protected int stateSave; 1.342 + 1.343 + private int returnStateSave; 1.344 + 1.345 + protected int index; 1.346 + 1.347 + private boolean forceQuirks; 1.348 + 1.349 + private char additional; 1.350 + 1.351 + private int entCol; 1.352 + 1.353 + private int firstCharKey; 1.354 + 1.355 + private int lo; 1.356 + 1.357 + private int hi; 1.358 + 1.359 + private int candidate; 1.360 + 1.361 + private int strBufMark; 1.362 + 1.363 + private int prevValue; 1.364 + 1.365 + protected int value; 1.366 + 1.367 + private boolean seenDigits; 1.368 + 1.369 + protected int cstart; 1.370 + 1.371 + /** 1.372 + * The SAX public id for the resource being tokenized. (Only passed to back 1.373 + * as part of locator data.) 1.374 + */ 1.375 + private String publicId; 1.376 + 1.377 + /** 1.378 + * The SAX system id for the resource being tokenized. (Only passed to back 1.379 + * as part of locator data.) 1.380 + */ 1.381 + private String systemId; 1.382 + 1.383 + /** 1.384 + * Buffer for short identifiers. 
1.385 + */ 1.386 + private @Auto char[] strBuf; 1.387 + 1.388 + /** 1.389 + * Number of significant <code>char</code>s in <code>strBuf</code>. 1.390 + */ 1.391 + private int strBufLen; 1.392 + 1.393 + /** 1.394 + * <code>-1</code> to indicate that <code>strBuf</code> is used or otherwise 1.395 + * an offset to the main buffer. 1.396 + */ 1.397 + // private int strBufOffset = -1; 1.398 + /** 1.399 + * Buffer for long strings. 1.400 + */ 1.401 + private @Auto char[] longStrBuf; 1.402 + 1.403 + /** 1.404 + * Number of significant <code>char</code>s in <code>longStrBuf</code>. 1.405 + */ 1.406 + private int longStrBufLen; 1.407 + 1.408 + /** 1.409 + * <code>-1</code> to indicate that <code>longStrBuf</code> is used or 1.410 + * otherwise an offset to the main buffer. 1.411 + */ 1.412 + // private int longStrBufOffset = -1; 1.413 + 1.414 + /** 1.415 + * Buffer for expanding NCRs falling into the Basic Multilingual Plane. 1.416 + */ 1.417 + private final @Auto char[] bmpChar; 1.418 + 1.419 + /** 1.420 + * Buffer for expanding astral NCRs. 1.421 + */ 1.422 + private final @Auto char[] astralChar; 1.423 + 1.424 + /** 1.425 + * The element whose end tag closes the current CDATA or RCDATA element. 1.426 + */ 1.427 + protected ElementName endTagExpectation = null; 1.428 + 1.429 + private char[] endTagExpectationAsArray; // not @Auto! 1.430 + 1.431 + /** 1.432 + * <code>true</code> if tokenizing an end tag 1.433 + */ 1.434 + protected boolean endTag; 1.435 + 1.436 + /** 1.437 + * The current tag token name. 1.438 + */ 1.439 + private ElementName tagName = null; 1.440 + 1.441 + /** 1.442 + * The current attribute name. 1.443 + */ 1.444 + protected AttributeName attributeName = null; 1.445 + 1.446 + // [NOCPP[ 1.447 + 1.448 + /** 1.449 + * Whether comment tokens are emitted. 1.450 + */ 1.451 + private boolean wantsComments = false; 1.452 + 1.453 + /** 1.454 + * <code>true</code> when HTML4-specific additional errors are requested. 
*/
protected boolean html4;

/**
 * Whether the stream is past the first 512 bytes.
 */
private boolean metaBoundaryPassed;

// ]NOCPP]

/**
 * The name of the current doctype token.
 */
private @Local String doctypeName;

/**
 * The public id of the current doctype token.
 */
private String publicIdentifier;

/**
 * The system id of the current doctype token.
 */
private String systemIdentifier;

/**
 * The attribute holder.
 */
private HtmlAttributes attributes;

// [NOCPP[

/**
 * The policy for vertical tab and form feed.
 */
private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;

/**
 * The policy for comments.
 */
private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;

/**
 * The xmlns policy. <code>setXmlnsPolicy</code> refuses
 * <code>XmlViolationPolicy.FATAL</code> for this field.
 */
private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;

/**
 * The name policy, as set through <code>setNamePolicy</code>.
 */
private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;

private boolean html4ModeCompatibleWithXhtml1Schemata;

/**
 * Mode value handed to <code>HtmlAttributes</code>.
 * <code>setMappingLangToXmlLang</code> stores either
 * <code>AttributeName.HTML_LANG</code> or <code>AttributeName.HTML</code>
 * here.
 */
private int mappingLangToXmlLang;

// ]NOCPP]

/**
 * When <code>true</code>, <code>emptyAttributes()</code> allocates a new
 * <code>HtmlAttributes</code> on every call instead of returning the
 * shared <code>HtmlAttributes.EMPTY_ATTRIBUTES</code>.
 */
private final boolean newAttributesEachTime;

private boolean shouldSuspend;

protected boolean confident;

/**
 * The value reported by <code>getLineNumber()</code>; can be overwritten
 * through <code>setLineNumber(int)</code>.
 */
private int line;

/**
 * The interner passed along when element names and local names are
 * created from the buffers.
 */
private Interner interner;

// CPPONLY: private boolean viewingXmlSource;

// [NOCPP[

protected LocatorImpl ampersandLocation;

/**
 * Constructor that lets the caller choose the attribute allocation
 * strategy. It sits inside a NOCPP region (presumably meaning it is
 * left out of the C++ translation -- confirm against the translator).
 *
 * Allocates the one-element and two-element scratch arrays used for
 * expanding numeric character references and starts with no current tag
 * name, attribute name, doctype data or attribute holder.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 * @param newAttributesEachTime
 *            whether <code>emptyAttributes()</code> should allocate a new
 *            holder on every call
 */
public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
    this.tokenHandler = tokenHandler;
    this.encodingDeclarationHandler = null;
    this.newAttributesEachTime = newAttributesEachTime;
    this.bmpChar = new char[1];
    this.astralChar = new char[2];
    this.tagName = null;
    this.attributeName = null;
    this.doctypeName = null;
    this.publicIdentifier = null;
    this.systemIdentifier = null;
    this.attributes = null;
}

// ]NOCPP]

/**
 * The constructor. In the Java form of this source it behaves like the
 * two-argument constructor with <code>newAttributesEachTime</code> set to
 * <code>false</code>; the CPPONLY lines describe what the other build does
 * instead and must stay exactly where they are.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 */
public Tokenizer(TokenHandler tokenHandler
// CPPONLY: , boolean viewingXmlSource
) {
    this.tokenHandler = tokenHandler;
    this.encodingDeclarationHandler = null;
    // [NOCPP[
    this.newAttributesEachTime = false;
    // ]NOCPP]
    this.bmpChar = new char[1];
    this.astralChar = new char[2];
    this.tagName = null;
    this.attributeName = null;
    this.doctypeName = null;
    this.publicIdentifier = null;
    this.systemIdentifier = null;
    // [NOCPP[
    this.attributes = null;
    // ]NOCPP]
    // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
    // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
    // CPPONLY: this.viewingXmlSource = viewingXmlSource;
}

/**
 * Sets the interner that is passed along when names are created from the
 * buffers.
 *
 * @param interner
 *            the interner to use
 */
public void setInterner(Interner interner) {
    this.interner = interner;
}

/**
 * Stores the identifiers that <code>getPublicId()</code> and
 * <code>getSystemId()</code> report as locator data.
 *
 * @param newPublicId
 *            the SAX public id of the resource being tokenized
 * @param newSystemId
 *            the SAX system id of the resource being tokenized
 */
public void initLocation(String newPublicId, String newSystemId) {
    this.systemId = newSystemId;
    this.publicId = newPublicId;

}

// CPPONLY: boolean isViewingXmlSource() {
// CPPONLY: return viewingXmlSource;
// CPPONLY: }

// [NOCPP[

/**
 * Returns the mappingLangToXmlLang.
1.586 + * 1.587 + * @return the mappingLangToXmlLang 1.588 + */ 1.589 + public boolean isMappingLangToXmlLang() { 1.590 + return mappingLangToXmlLang == AttributeName.HTML_LANG; 1.591 + } 1.592 + 1.593 + /** 1.594 + * Sets the mappingLangToXmlLang. 1.595 + * 1.596 + * @param mappingLangToXmlLang 1.597 + * the mappingLangToXmlLang to set 1.598 + */ 1.599 + public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) { 1.600 + this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG 1.601 + : AttributeName.HTML; 1.602 + } 1.603 + 1.604 + /** 1.605 + * Sets the error handler. 1.606 + * 1.607 + * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler) 1.608 + */ 1.609 + public void setErrorHandler(ErrorHandler eh) { 1.610 + this.errorHandler = eh; 1.611 + } 1.612 + 1.613 + public ErrorHandler getErrorHandler() { 1.614 + return this.errorHandler; 1.615 + } 1.616 + 1.617 + /** 1.618 + * Sets the commentPolicy. 1.619 + * 1.620 + * @param commentPolicy 1.621 + * the commentPolicy to set 1.622 + */ 1.623 + public void setCommentPolicy(XmlViolationPolicy commentPolicy) { 1.624 + this.commentPolicy = commentPolicy; 1.625 + } 1.626 + 1.627 + /** 1.628 + * Sets the contentNonXmlCharPolicy. 1.629 + * 1.630 + * @param contentNonXmlCharPolicy 1.631 + * the contentNonXmlCharPolicy to set 1.632 + */ 1.633 + public void setContentNonXmlCharPolicy( 1.634 + XmlViolationPolicy contentNonXmlCharPolicy) { 1.635 + if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) { 1.636 + throw new IllegalArgumentException( 1.637 + "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW."); 1.638 + } 1.639 + } 1.640 + 1.641 + /** 1.642 + * Sets the contentSpacePolicy. 
1.643 + * 1.644 + * @param contentSpacePolicy 1.645 + * the contentSpacePolicy to set 1.646 + */ 1.647 + public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) { 1.648 + this.contentSpacePolicy = contentSpacePolicy; 1.649 + } 1.650 + 1.651 + /** 1.652 + * Sets the xmlnsPolicy. 1.653 + * 1.654 + * @param xmlnsPolicy 1.655 + * the xmlnsPolicy to set 1.656 + */ 1.657 + public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) { 1.658 + if (xmlnsPolicy == XmlViolationPolicy.FATAL) { 1.659 + throw new IllegalArgumentException("Can't use FATAL here."); 1.660 + } 1.661 + this.xmlnsPolicy = xmlnsPolicy; 1.662 + } 1.663 + 1.664 + public void setNamePolicy(XmlViolationPolicy namePolicy) { 1.665 + this.namePolicy = namePolicy; 1.666 + } 1.667 + 1.668 + /** 1.669 + * Sets the html4ModeCompatibleWithXhtml1Schemata. 1.670 + * 1.671 + * @param html4ModeCompatibleWithXhtml1Schemata 1.672 + * the html4ModeCompatibleWithXhtml1Schemata to set 1.673 + */ 1.674 + public void setHtml4ModeCompatibleWithXhtml1Schemata( 1.675 + boolean html4ModeCompatibleWithXhtml1Schemata) { 1.676 + this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata; 1.677 + } 1.678 + 1.679 + // ]NOCPP] 1.680 + 1.681 + // For the token handler to call 1.682 + /** 1.683 + * Sets the tokenizer state and the associated element name. This should 1.684 + * only ever used to put the tokenizer into one of the states that have 1.685 + * a special end tag expectation. 
*
* <p>
* When <code>specialTokenizerState</code> is <code>Tokenizer.DATA</code>
* only the state is stored; the end tag expectation is left as it was.
*
* @param specialTokenizerState
*            the tokenizer state to set
* @param endTagExpectation
*            the expected end tag for transitioning back to normal
*/
public void setStateAndEndTagExpectation(int specialTokenizerState,
        @Local String endTagExpectation) {
    this.stateSave = specialTokenizerState;
    if (specialTokenizerState == Tokenizer.DATA) {
        return;
    }
    @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
    this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,
            asArray.length, interner);
    endTagExpectationToArray();
}

/**
 * Sets the tokenizer state and the associated element name. This should
 * only ever be used to put the tokenizer into one of the states that have
 * a special end tag expectation.
 *
 * <p>
 * Unlike the <code>String</code> overload, this one has no early return
 * for <code>Tokenizer.DATA</code>: the expectation is always stored and
 * converted.
 *
 * @param specialTokenizerState
 *            the tokenizer state to set
 * @param endTagExpectation
 *            the expected end tag for transitioning back to normal
 */
public void setStateAndEndTagExpectation(int specialTokenizerState,
        ElementName endTagExpectation) {
    this.stateSave = specialTokenizerState;
    this.endTagExpectation = endTagExpectation;
    endTagExpectationToArray();
}

/**
 * Points <code>endTagExpectationAsArray</code> at the shared
 * <code>char[]</code> constant that spells the name of the current
 * <code>endTagExpectation</code>, selected by its group. Any other group
 * trips the assertion and leaves <code>endTagExpectationAsArray</code>
 * unchanged.
 */
private void endTagExpectationToArray() {
    switch (endTagExpectation.getGroup()) {
        case TreeBuilder.TITLE:
            endTagExpectationAsArray = TITLE_ARR;
            return;
        case TreeBuilder.SCRIPT:
            endTagExpectationAsArray = SCRIPT_ARR;
            return;
        case TreeBuilder.STYLE:
            endTagExpectationAsArray = STYLE_ARR;
            return;
        case TreeBuilder.PLAINTEXT:
            endTagExpectationAsArray = PLAINTEXT_ARR;
            return;
        case TreeBuilder.XMP:
            endTagExpectationAsArray = XMP_ARR;
            return;
        case TreeBuilder.TEXTAREA:
            endTagExpectationAsArray = TEXTAREA_ARR;
            return;
        case TreeBuilder.IFRAME:
            endTagExpectationAsArray = IFRAME_ARR;
            return;
        case TreeBuilder.NOEMBED:
            endTagExpectationAsArray = NOEMBED_ARR;
            return;
        case TreeBuilder.NOSCRIPT:
            endTagExpectationAsArray = NOSCRIPT_ARR;
            return;
        case TreeBuilder.NOFRAMES:
            endTagExpectationAsArray = NOFRAMES_ARR;
            return;
        default:
            assert false: "Bad end tag expectation.";
            return;
    }
}

/**
 * For C++ use only.
 *
 * @param line
 *            the line number that <code>getLineNumber()</code> should
 *            report
 */
public void setLineNumber(int line) {
    this.line = line;
}

// start Locator impl

/**
 * @see org.xml.sax.Locator#getLineNumber()
 */
@Inline public int getLineNumber() {
    return line;
}

// [NOCPP[

/**
 * Always returns <code>-1</code>; this class does not report a column.
 *
 * @see org.xml.sax.Locator#getColumnNumber()
 */
@Inline public int getColumnNumber() {
    return -1;
}

/**
 * @see org.xml.sax.Locator#getPublicId()
 */
public String getPublicId() {
    return publicId;
}

/**
 * @see org.xml.sax.Locator#getSystemId()
 */
public String getSystemId() {
    return systemId;
}

// end Locator impl

// end public API

/**
 * Records that the meta boundary has been passed (the stream is past the
 * first 512 bytes, per the documentation of
 * <code>metaBoundaryPassed</code>).
 */
public void notifyAboutMetaBoundary() {
    metaBoundaryPassed = true;
}

/**
 * Switches on the HTML4-specific additional errors.
 */
void turnOnAdditionalHtml4Errors() {
    html4 = true;
}

// ]NOCPP]

/**
 * Returns an attribute holder for a tag without attributes: a new
 * <code>HtmlAttributes</code> when <code>newAttributesEachTime</code> is
 * set, otherwise the shared <code>HtmlAttributes.EMPTY_ATTRIBUTES</code>.
 * The marker comments inside the body must keep their exact positions.
 *
 * @return the attribute holder to use
 */
HtmlAttributes emptyAttributes() {
    // [NOCPP[
    if (newAttributesEachTime) {
        return new HtmlAttributes(mappingLangToXmlLang);
    } else {
        // ]NOCPP]
        return HtmlAttributes.EMPTY_ATTRIBUTES;
        // [NOCPP[
    }
    // ]NOCPP]
}

/**
 * Resets the smaller buffer so that it holds only <code>c</code>. Index 0
 * is written directly, without a capacity check.
 *
 * @param c
 *            the UTF-16 code unit that becomes the sole content
 */
@Inline private void clearStrBufAndAppend(char c) {
    strBuf[0] = c;
    strBufLen = 1;
}

/**
 * Empties the smaller buffer by resetting its length; the backing array
 * is kept.
 */
@Inline private void clearStrBuf() {
    strBufLen = 0;
}

/**
 * Appends to the smaller buffer, first enlarging the backing array by
 * <code>BUFFER_GROW_BY</code> elements when it is full.
 *
 * @param c
 *            the UTF-16 code unit to append
 */
private void appendStrBuf(char c) {
    if (strBufLen == strBuf.length) {
        char[] newBuf = new char[strBuf.length + Tokenizer.BUFFER_GROW_BY];
        System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
        strBuf = newBuf;
    }
    strBuf[strBufLen++] = c;
}

/**
 * The smaller buffer as a String. Currently only used for error reporting.
 *
 * <p>
 * C++ memory note: The return value must be released.
 *
 * @return the smaller buffer as a string
 */
protected String strBufToString() {
    return Portability.newStringFromBuffer(strBuf, 0, strBufLen);
}

/**
 * Converts the smaller buffer to a local name and stores it in
 * <code>doctypeName</code>; nothing is returned. According to the original
 * note the stored value is released in emitDoctypeToken() (not visible in
 * this part of the file).
 */
private void strBufToDoctypeName() {
    doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
            interner);
}

/**
 * Emits the smaller buffer as character tokens. Nothing is emitted when
 * the buffer is empty.
 *
 * @throws SAXException
 *             if the token handler threw
 */
private void emitStrBuf() throws SAXException {
    if (strBufLen > 0) {
        tokenHandler.characters(strBuf, 0, strBufLen);
    }
}

/**
 * Empties the larger buffer by resetting its length; the backing array is
 * kept.
 */
@Inline private void clearLongStrBuf() {
    longStrBufLen = 0;
}

/**
 * Resets the larger buffer so that it holds only <code>c</code>. Index 0
 * is written directly, without a capacity check.
 *
 * @param c
 *            the UTF-16 code unit that becomes the sole content
 */
@Inline private void clearLongStrBufAndAppend(char c) {
    longStrBuf[0] = c;
    longStrBufLen = 1;
}

/**
 * Appends to the larger buffer.
1.894 + * 1.895 + * @param c 1.896 + * the UTF-16 code unit to append 1.897 + */ 1.898 + private void appendLongStrBuf(char c) { 1.899 + if (longStrBufLen == longStrBuf.length) { 1.900 + char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)]; 1.901 + System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length); 1.902 + longStrBuf = newBuf; 1.903 + } 1.904 + longStrBuf[longStrBufLen++] = c; 1.905 + } 1.906 + 1.907 + @Inline private void appendSecondHyphenToBogusComment() throws SAXException { 1.908 + // [NOCPP[ 1.909 + switch (commentPolicy) { 1.910 + case ALTER_INFOSET: 1.911 + // detachLongStrBuf(); 1.912 + appendLongStrBuf(' '); 1.913 + // FALLTHROUGH 1.914 + case ALLOW: 1.915 + warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 1.916 + // ]NOCPP] 1.917 + appendLongStrBuf('-'); 1.918 + // [NOCPP[ 1.919 + break; 1.920 + case FATAL: 1.921 + fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 1.922 + break; 1.923 + } 1.924 + // ]NOCPP] 1.925 + } 1.926 + 1.927 + // [NOCPP[ 1.928 + private void maybeAppendSpaceToBogusComment() throws SAXException { 1.929 + switch (commentPolicy) { 1.930 + case ALTER_INFOSET: 1.931 + // detachLongStrBuf(); 1.932 + appendLongStrBuf(' '); 1.933 + // FALLTHROUGH 1.934 + case ALLOW: 1.935 + warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment."); 1.936 + break; 1.937 + case FATAL: 1.938 + fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment."); 1.939 + break; 1.940 + } 1.941 + } 1.942 + 1.943 + // ]NOCPP] 1.944 + 1.945 + @Inline private void adjustDoubleHyphenAndAppendToLongStrBufAndErr(char c) 1.946 + throws SAXException { 1.947 + errConsecutiveHyphens(); 1.948 + // [NOCPP[ 1.949 + switch (commentPolicy) { 1.950 + case ALTER_INFOSET: 1.951 + // detachLongStrBuf(); 1.952 + longStrBufLen--; 1.953 + appendLongStrBuf(' '); 1.954 + appendLongStrBuf('-'); 1.955 + // FALLTHROUGH 1.956 + 
case ALLOW: 1.957 + warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 1.958 + // ]NOCPP] 1.959 + appendLongStrBuf(c); 1.960 + // [NOCPP[ 1.961 + break; 1.962 + case FATAL: 1.963 + fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); 1.964 + break; 1.965 + } 1.966 + // ]NOCPP] 1.967 + } 1.968 + 1.969 + private void appendLongStrBuf(@NoLength char[] buffer, int offset, int length) { 1.970 + int reqLen = longStrBufLen + length; 1.971 + if (longStrBuf.length < reqLen) { 1.972 + char[] newBuf = new char[reqLen + (reqLen >> 1)]; 1.973 + System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length); 1.974 + longStrBuf = newBuf; 1.975 + } 1.976 + System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length); 1.977 + longStrBufLen = reqLen; 1.978 + } 1.979 + 1.980 + /** 1.981 + * Append the contents of the smaller buffer to the larger one. 1.982 + */ 1.983 + @Inline private void appendStrBufToLongStrBuf() { 1.984 + appendLongStrBuf(strBuf, 0, strBufLen); 1.985 + } 1.986 + 1.987 + /** 1.988 + * The larger buffer as a string. 1.989 + * 1.990 + * <p> 1.991 + * C++ memory note: The return value must be released. 1.992 + * 1.993 + * @return the larger buffer as a string 1.994 + */ 1.995 + private String longStrBufToString() { 1.996 + return Portability.newStringFromBuffer(longStrBuf, 0, longStrBufLen); 1.997 + } 1.998 + 1.999 + /** 1.1000 + * Emits the current comment token. 
*
* <p>
* The comment text is the content of the larger buffer minus its last
* <code>provisionalHyphens</code> code units. It is passed to the token
* handler only when comments are wanted; <code>cstart</code> is moved to
* <code>pos + 1</code> in either case. The marker comments inside the body
* must keep their exact positions.
*
* @param provisionalHyphens
*            number of trailing code units of the larger buffer to leave
*            out of the emitted comment
* @param pos
*            position after which the next run of characters starts
*            (presumably the index of the current character in the input
*            buffer -- confirm against the callers)
*
* @throws SAXException
*             if the token handler threw
*/
private void emitComment(int provisionalHyphens, int pos)
        throws SAXException {
    // [NOCPP[
    if (wantsComments) {
        // ]NOCPP]
        // if (longStrBufOffset != -1) {
        // tokenHandler.comment(buf, longStrBufOffset, longStrBufLen
        // - provisionalHyphens);
        // } else {
        tokenHandler.comment(longStrBuf, 0, longStrBufLen
                - provisionalHyphens);
        // }
        // [NOCPP[
    }
    // ]NOCPP]
    cstart = pos + 1;
}

/**
 * Flushes coalesced character tokens: the code units from
 * <code>cstart</code> up to, but not including, <code>pos</code> are
 * passed to the token handler when that range is non-empty. Afterwards
 * <code>cstart</code> is set to <code>Integer.MAX_VALUE</code>, so a
 * further flush emits nothing until <code>cstart</code> is assigned again.
 *
 * @param buf
 *            the buffer the characters are taken from
 * @param pos
 *            exclusive end index of the run to flush
 *
 * @throws SAXException
 *             if the token handler threw
 */
protected void flushChars(@NoLength char[] buf, int pos)
        throws SAXException {
    if (pos > cstart) {
        tokenHandler.characters(buf, cstart, pos - cstart);
    }
    cstart = Integer.MAX_VALUE;
}

/**
 * Reports a condition that would make the infoset incompatible with XML
 * 1.0 as fatal. The exception is located at this tokenizer, is passed to
 * the error handler's <code>fatalError</code> when a handler is set, and
 * is then thrown in every case.
 *
 * @param message
 *            the message
 * @throws SAXException
 *             always; the thrown object is a
 *             <code>SAXParseException</code> unless the error handler
 *             threw first
 * @throws SAXParseException
 */
public void fatal(String message) throws SAXException {
    SAXParseException spe = new SAXParseException(message, this);
    if (errorHandler != null) {
        errorHandler.fatalError(spe);
    }
    throw spe;
}

/**
 * Reports a Parse Error.
*
     * <p>
     * Does nothing when no error handler is set. Otherwise the message is
     * wrapped in a <code>SAXParseException</code> that uses this tokenizer as
     * its locator and is passed to <code>ErrorHandler.error</code>.
     *
     * @param message
     *            the message
     * @throws SAXException
     *             if the error handler throws it
     */
    public void err(String message) throws SAXException {
        if (errorHandler == null) {
            return;
        }
        SAXParseException spe = new SAXParseException(message, this);
        errorHandler.error(spe);
    }

    /**
     * Reports an error, preferring the error handler of the tree builder.
     *
     * <p>
     * If the token handler is a <code>TreeBuilder</code>, its error handler is
     * used. If that is <code>null</code> (or the token handler is not a tree
     * builder), the tokenizer's own error handler is used. If neither is
     * available, the message is dropped.
     *
     * @param message
     *            the message
     * @throws SAXException
     *             if the chosen error handler throws it
     */
    public void errTreeBuilder(String message) throws SAXException {
        ErrorHandler eh = null;
        if (tokenHandler instanceof TreeBuilder<?>) {
            TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
            eh = treeBuilder.getErrorHandler();
        }
        if (eh == null) {
            // Fall back to the tokenizer's own handler.
            eh = errorHandler;
        }
        if (eh == null) {
            return;
        }
        SAXParseException spe = new SAXParseException(message, this);
        eh.error(spe);
    }

    /**
     * Reports a warning.
     *
     * <p>
     * Does nothing when no error handler is set. Otherwise the message is
     * wrapped in a <code>SAXParseException</code> that uses this tokenizer as
     * its locator and is passed to <code>ErrorHandler.warning</code>.
     *
     * @param message
     *            the message
     * @throws SAXException
     *             if the error handler throws it
     */
    public void warn(String message) throws SAXException {
        if (errorHandler == null) {
            return;
        }
        SAXParseException spe = new SAXParseException(message, this);
        errorHandler.warning(spe);
    }

    /**
     * Resolves the first <code>strBufLen</code> chars of the smaller buffer
     * to an <code>ElementName</code> (using the interner) and stores it in
     * <code>tagName</code>.
     */
    private void strBufToElementNameString() {
        // if (strBufOffset != -1) {
        // return ElementName.elementNameByBuffer(buf, strBufOffset, strBufLen);
        // } else {
        tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
                interner);
        // }
    }

    /**
     * Emits the current start or end tag token to the token handler and
     * resets the per-tag state.
     *
     * <p>
     * <code>cstart</code> is moved to the char after <code>pos</code> and
     * <code>stateSave</code> is set to the data state <em>before</em> the
     * token handler is called, so that a state change made by the handler
     * during the callback is what this method returns. After the callback
     * <code>tagName</code> is released and cleared, and the attribute holder
     * is either dropped (<code>newAttributesEachTime</code>) or cleared for
     * reuse.
     *
     * @param selfClosing
     *            whether the tag was written with a trailing solidus
     * @param pos
     *            index of the current char in the input buffer
     * @return the value of <code>stateSave</code> after the token handler has
     *         run, i.e. the state the tokenizer should continue in
     * @throws SAXException
     *             propagated from the error reporting helpers or the token
     *             handler
     */
    private int emitCurrentTagToken(boolean selfClosing, int pos)
            throws SAXException {
        cstart = pos + 1;
        maybeErrSlashInEndTag(selfClosing);
        stateSave = Tokenizer.DATA;
        // A tag without attributes is reported with the shared empty holder.
        HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
                : attributes);
        if (endTag) {
            /*
             * When an end tag token is emitted, the content model flag must be
             * switched to the PCDATA state.
             */
            maybeErrAttributesOnEndTag(attrs);
            // CPPONLY: if (!viewingXmlSource) {
            tokenHandler.endTag(tagName);
            // CPPONLY: }
            // CPPONLY: if (newAttributesEachTime) {
            // CPPONLY: Portability.delete(attributes);
            // CPPONLY: attributes = null;
            // CPPONLY: }
        } else {
            // CPPONLY: if (viewingXmlSource) {
            // CPPONLY: assert newAttributesEachTime;
            // CPPONLY: Portability.delete(attributes);
            // CPPONLY: attributes = null;
            // CPPONLY: } else {
            tokenHandler.startTag(tagName, attrs, selfClosing);
            // CPPONLY: }
        }
        tagName.release();
        tagName = null;
        if (newAttributesEachTime) {
            attributes = null;
        } else {
            // NOTE(review): attributes is dereferenced without a null check
            // here although it was tested for null above; this relies on the
            // holder always existing when newAttributesEachTime is false.
            // Confirm that the initialization code guarantees it.
            attributes.clear(mappingLangToXmlLang);
        }
        /*
         * The token handler may have called setStateAndEndTagExpectation
         * and changed stateSave since the start of this method.
         */
        return stateSave;
    }

    /**
     * Finishes the attribute name collected in the smaller buffer.
     *
     * <p>
     * The first <code>strBufLen</code> chars of the smaller buffer are
     * resolved to an <code>AttributeName</code>, the attribute holder is
     * created if it does not exist yet, and a name that is already present on
     * the current tag is reported as a duplicate and dropped by setting
     * <code>attributeName</code> to <code>null</code>.
     *
     * @throws SAXException
     *             propagated from the duplicate-attribute error report
     */
    private void attributeNameComplete() throws SAXException {
        // if (strBufOffset != -1) {
        // attributeName = AttributeName.nameByBuffer(buf, strBufOffset,
        // strBufLen, namePolicy != XmlViolationPolicy.ALLOW);
        // } else {
        attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
        // [NOCPP[
                , namePolicy != XmlViolationPolicy.ALLOW
                // ]NOCPP]
                , interner);
        // }

        if (attributes == null) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }

        /*
         * When the user agent leaves the attribute name state (and before
         * emitting the tag token, if appropriate), the complete attribute's
         * name must be compared to the other attributes on the same token; if
         * there is already an attribute on the token with the exact same name,
         * then this is a parse error and the new attribute must be dropped,
         * along with the value that gets associated with it (if any).
*/
        // A null attributeName tells the add-attribute helpers to skip the
        // dropped duplicate.
        if (attributes.contains(attributeName)) {
            errDuplicateAttribute();
            attributeName.release();
            attributeName = null;
        }
    }

    /**
     * Adds the current attribute to the attribute holder with an empty value
     * (or, for HTML4 boolean attributes in XHTML-1-schema-compatible mode,
     * with the attribute's own local name as the value).
     *
     * <p>
     * Nothing is added when <code>attributeName</code> is <code>null</code>,
     * which is how a dropped duplicate is represented. After the attribute
     * has been handled, <code>attributeName</code> is reset to
     * <code>null</code>.
     *
     * @throws SAXException
     *             propagated from the error and warning reports
     */
    private void addAttributeWithoutValue() throws SAXException {
        noteAttributeWithoutValue();

        // [NOCPP[
        if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
                && ElementName.META == tagName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            // [NOCPP[
            if (html4) {
                if (attributeName.isBoolean()) {
                    if (html4ModeCompatibleWithXhtml1Schemata) {
                        // Minimized boolean attribute: use the name as the
                        // value.
                        attributes.addAttribute(attributeName,
                                attributeName.getLocal(AttributeName.HTML),
                                xmlnsPolicy);
                    } else {
                        attributes.addAttribute(attributeName, "", xmlnsPolicy);
                    }
                } else {
                    // NOTE(review): a value-less "border" attribute takes
                    // neither the error nor the addAttribute call, so it is
                    // silently left off the tag in HTML4 mode. Confirm that
                    // this is intended.
                    if (AttributeName.BORDER != attributeName) {
                        err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
                        attributes.addAttribute(attributeName, "", xmlnsPolicy);
                    }
                }
            } else {
                if (AttributeName.SRC == attributeName
                        || AttributeName.HREF == attributeName) {
                    warn("Attribute \u201C"
                            + attributeName.getLocal(AttributeName.HTML)
                            + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
                }
                // ]NOCPP]
                attributes.addAttribute(attributeName,
                        Portability.newEmptyString()
                        // [NOCPP[
                        , xmlnsPolicy
                // ]NOCPP]
                );
                // [NOCPP[
            }
            // ]NOCPP]
            attributeName = null; // attributeName has been adopted by the
            // |attributes| object
        }
    }

    /**
     * Adds the current attribute to the attribute holder, taking its value
     * from the larger buffer.
     *
     * <p>
     * Nothing is added when <code>attributeName</code> is <code>null</code>,
     * which is how a dropped duplicate is represented. In HTML4 mode with
     * XHTML-1-schema compatibility, the value of a case-folded attribute on a
     * start tag is ASCII-lowercased first. After the attribute has been
     * added, <code>attributeName</code> is reset to <code>null</code>.
     *
     * @throws SAXException
     *             propagated from the error report or the attribute holder
     */
    private void addAttributeWithValue() throws SAXException {
        // [NOCPP[
        if (metaBoundaryPassed && ElementName.META == tagName
                && AttributeName.CHARSET == attributeName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            String val = longStrBufToString(); // Ownership transferred to
            // HtmlAttributes
            // CPPONLY: if (mViewSource) {
            // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
            // CPPONLY: }
            // [NOCPP[
            if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
                    && attributeName.isCaseFolded()) {
                val = newAsciiLowerCaseStringFromString(val);
            }
            // ]NOCPP]
            attributes.addAttribute(attributeName, val
            // [NOCPP[
                    , xmlnsPolicy
            // ]NOCPP]
            );
            attributeName = null; // attributeName has been adopted by the
            // |attributes| object
        }
    }

    // [NOCPP[

    /**
     * Returns a copy of the string in which the letters <code>A</code> to
     * <code>Z</code> are replaced by <code>a</code> to <code>z</code>; every
     * other char is copied unchanged.
     *
     * @param str
     *            the string to convert; may be <code>null</code>
     * @return the lowercased copy, or <code>null</code> if <code>str</code>
     *         is <code>null</code>
     */
    private static String newAsciiLowerCaseStringFromString(String str) {
        if (str == null) {
            return null;
        }
        char[] buf = new char[str.length()];
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c >= 'A' && c <= 'Z') {
                // Upper- and lowercase ASCII letters are 0x20 apart.
                c += 0x20;
            }
            buf[i] = c;
        }
        return new String(buf);
    }

    /**
     * Hook called by <code>start()</code> after the token handler has been
     * told that tokenization starts. It does nothing in this class.
     *
     * @throws SAXException
     *             declared for overriding implementations
     */
    protected
void startErrorReporting() throws SAXException {

    }

    // ]NOCPP]

    /**
     * Starts tokenization: initializes the tokenizer, notifies the token
     * handler and then calls the error-reporting start hook.
     *
     * @throws SAXException
     *             propagated from initialization, the token handler or the
     *             hook
     */
    public void start() throws SAXException {
        initializeWithoutStarting();
        tokenHandler.startTokenization(this);
        // [NOCPP[
        startErrorReporting();
        // ]NOCPP]
    }

    /**
     * Tokenizes the unread part of a buffer, resuming from the state saved by
     * the previous call.
     *
     * <p>
     * On return the start of the buffer has been advanced past the consumed
     * chars: to the end of the buffer when the state loop ran out of input,
     * otherwise to the char after the last one read.
     *
     * @param buffer
     *            the buffer to read from; its start index is updated
     * @return the value of <code>lastCR</code> after the state loop, i.e.
     *         whether the loop stopped right after handling a carriage
     *         return, in which case the caller is expected to swallow a
     *         directly following line feed
     * @throws SAXException
     *             propagated from the state loop
     */
    public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
        int state = stateSave;
        int returnState = returnStateSave;
        char c = '\u0000';
        shouldSuspend = false;
        lastCR = false;

        int start = buffer.getStart();
        /**
         * The index of the last <code>char</code> read from <code>buf</code>.
         * It starts one before the first unread char because the state loop
         * pre-increments it before each read.
         */
        int pos = start - 1;

        /**
         * Initializes the field <code>cstart</code>: the index of the first
         * <code>char</code> in <code>buf</code> that is part of a coalesced
         * run of character tokens or <code>Integer.MAX_VALUE</code> if there
         * is not a current run being coalesced. In the states listed below a
         * run begins at the start of the buffer.
         */
        switch (state) {
            case DATA:
            case RCDATA:
            case SCRIPT_DATA:
            case PLAINTEXT:
            case RAWTEXT:
            case CDATA_SECTION:
            case SCRIPT_DATA_ESCAPED:
            case SCRIPT_DATA_ESCAPE_START:
            case SCRIPT_DATA_ESCAPE_START_DASH:
            case SCRIPT_DATA_ESCAPED_DASH:
            case SCRIPT_DATA_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_START:
            case SCRIPT_DATA_DOUBLE_ESCAPED:
            case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_END:
                cstart = start;
                break;
            default:
                cstart = Integer.MAX_VALUE;
                break;
        }

        /**
         * The number of <code>char</code>s in <code>buf</code> that have
         * meaning. (The rest of the array is garbage and should not be
         * examined.)
         *
         * NOTE(review): no local is declared here; this comment appears to
         * describe the <code>buffer.getEnd()</code> argument passed to the
         * state loop below. Confirm.
         */
        // CPPONLY: if (mViewSource) {
        // CPPONLY: mViewSource.SetBuffer(buffer);
        // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
        // CPPONLY: } else {
        // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: }
        // [NOCPP[
        pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
                buffer.getEnd());
        // ]NOCPP]
        if (pos == buffer.getEnd()) {
            // exiting due to end of buffer
            buffer.setStart(pos);
        } else {
            // pos is the last char that was read, so the next unread char
            // is the one after it.
            buffer.setStart(pos + 1);
        }
        return lastCR;
    }

    @SuppressWarnings("unused") private int stateLoop(int state, char c,
            int pos, @NoLength char[] buf, boolean reconsume, int returnState,
            int endPos) throws SAXException {
        /*
         * Idioms used in this code:
         *
         *
         * Consuming the next input character
         *
         * To consume the next input character, the code does this: if (++pos ==
         * endPos) { break stateloop; } c = checkChar(buf, pos);
         *
         *
         * Staying in a state
         *
         * When there's a state that the tokenizer may stay in over multiple
         * input characters, the state has a wrapper |for(;;)| loop and staying
         * in the state continues the loop.
         *
         *
         * Switching to another state
         *
         * To switch to another state, the code sets the state variable to the
         * magic number of the new state.
Then it either continues stateloop or 1.1390 + * breaks out of the state's own wrapper loop if the target state is 1.1391 + * right after the current state in source order. (This is a partial 1.1392 + * workaround for Java's lack of goto.) 1.1393 + * 1.1394 + * 1.1395 + * Reconsume support 1.1396 + * 1.1397 + * The spec sometimes says that an input character is reconsumed in 1.1398 + * another state. If a state can ever be entered so that an input 1.1399 + * character can be reconsumed in it, the state's code starts with an 1.1400 + * |if (reconsume)| that sets reconsume to false and skips over the 1.1401 + * normal code for consuming a new character. 1.1402 + * 1.1403 + * To reconsume the current character in another state, the code sets 1.1404 + * |reconsume| to true and then switches to the other state. 1.1405 + * 1.1406 + * 1.1407 + * Emitting character tokens 1.1408 + * 1.1409 + * This method emits character tokens lazily. Whenever a new range of 1.1410 + * character tokens starts, the field cstart must be set to the start 1.1411 + * index of the range. The flushChars() method must be called at the end 1.1412 + * of a range to flush it. 1.1413 + * 1.1414 + * 1.1415 + * U+0000 handling 1.1416 + * 1.1417 + * The various states have to handle the replacement of U+0000 with 1.1418 + * U+FFFD. However, if U+0000 would be reconsumed in another state, the 1.1419 + * replacement doesn't need to happen, because it's handled by the 1.1420 + * reconsuming state. 1.1421 + * 1.1422 + * 1.1423 + * LF handling 1.1424 + * 1.1425 + * Every state needs to increment the line number upon LF unless the LF 1.1426 + * gets reconsumed by another state which increments the line number. 1.1427 + * 1.1428 + * 1.1429 + * CR handling 1.1430 + * 1.1431 + * Every state needs to handle CR unless the CR gets reconsumed and is 1.1432 + * handled by the reconsuming state. 
The CR needs to be handled as if it 1.1433 + * were and LF, the lastCR field must be set to true and then this 1.1434 + * method must return. The IO driver will then swallow the next 1.1435 + * character if it is an LF to coalesce CRLF. 1.1436 + */ 1.1437 + stateloop: for (;;) { 1.1438 + switch (state) { 1.1439 + case DATA: 1.1440 + dataloop: for (;;) { 1.1441 + if (reconsume) { 1.1442 + reconsume = false; 1.1443 + } else { 1.1444 + if (++pos == endPos) { 1.1445 + break stateloop; 1.1446 + } 1.1447 + c = checkChar(buf, pos); 1.1448 + } 1.1449 + switch (c) { 1.1450 + case '&': 1.1451 + /* 1.1452 + * U+0026 AMPERSAND (&) Switch to the character 1.1453 + * reference in data state. 1.1454 + */ 1.1455 + flushChars(buf, pos); 1.1456 + clearStrBufAndAppend(c); 1.1457 + setAdditionalAndRememberAmpersandLocation('\u0000'); 1.1458 + returnState = state; 1.1459 + state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 1.1460 + continue stateloop; 1.1461 + case '<': 1.1462 + /* 1.1463 + * U+003C LESS-THAN SIGN (<) Switch to the tag 1.1464 + * open state. 1.1465 + */ 1.1466 + flushChars(buf, pos); 1.1467 + 1.1468 + state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos); 1.1469 + break dataloop; // FALL THROUGH continue 1.1470 + // stateloop; 1.1471 + case '\u0000': 1.1472 + emitReplacementCharacter(buf, pos); 1.1473 + continue; 1.1474 + case '\r': 1.1475 + emitCarriageReturn(buf, pos); 1.1476 + break stateloop; 1.1477 + case '\n': 1.1478 + silentLineFeed(); 1.1479 + default: 1.1480 + /* 1.1481 + * Anything else Emit the input character as a 1.1482 + * character token. 1.1483 + * 1.1484 + * Stay in the data state. 1.1485 + */ 1.1486 + continue; 1.1487 + } 1.1488 + } 1.1489 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.1490 + case TAG_OPEN: 1.1491 + tagopenloop: for (;;) { 1.1492 + /* 1.1493 + * The behavior of this state depends on the content 1.1494 + * model flag. 
1.1495 + */ 1.1496 + if (++pos == endPos) { 1.1497 + break stateloop; 1.1498 + } 1.1499 + c = checkChar(buf, pos); 1.1500 + /* 1.1501 + * If the content model flag is set to the PCDATA state 1.1502 + * Consume the next input character: 1.1503 + */ 1.1504 + if (c >= 'A' && c <= 'Z') { 1.1505 + /* 1.1506 + * U+0041 LATIN CAPITAL LETTER A through to U+005A 1.1507 + * LATIN CAPITAL LETTER Z Create a new start tag 1.1508 + * token, 1.1509 + */ 1.1510 + endTag = false; 1.1511 + /* 1.1512 + * set its tag name to the lowercase version of the 1.1513 + * input character (add 0x0020 to the character's 1.1514 + * code point), 1.1515 + */ 1.1516 + clearStrBufAndAppend((char) (c + 0x20)); 1.1517 + /* then switch to the tag name state. */ 1.1518 + state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 1.1519 + /* 1.1520 + * (Don't emit the token yet; further details will 1.1521 + * be filled in before it is emitted.) 1.1522 + */ 1.1523 + break tagopenloop; 1.1524 + // continue stateloop; 1.1525 + } else if (c >= 'a' && c <= 'z') { 1.1526 + /* 1.1527 + * U+0061 LATIN SMALL LETTER A through to U+007A 1.1528 + * LATIN SMALL LETTER Z Create a new start tag 1.1529 + * token, 1.1530 + */ 1.1531 + endTag = false; 1.1532 + /* 1.1533 + * set its tag name to the input character, 1.1534 + */ 1.1535 + clearStrBufAndAppend(c); 1.1536 + /* then switch to the tag name state. */ 1.1537 + state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 1.1538 + /* 1.1539 + * (Don't emit the token yet; further details will 1.1540 + * be filled in before it is emitted.) 1.1541 + */ 1.1542 + break tagopenloop; 1.1543 + // continue stateloop; 1.1544 + } 1.1545 + switch (c) { 1.1546 + case '!': 1.1547 + /* 1.1548 + * U+0021 EXCLAMATION MARK (!) Switch to the 1.1549 + * markup declaration open state. 
1.1550 + */ 1.1551 + state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos); 1.1552 + continue stateloop; 1.1553 + case '/': 1.1554 + /* 1.1555 + * U+002F SOLIDUS (/) Switch to the close tag 1.1556 + * open state. 1.1557 + */ 1.1558 + state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos); 1.1559 + continue stateloop; 1.1560 + case '?': 1.1561 + // CPPONLY: if (viewingXmlSource) { 1.1562 + // CPPONLY: state = transition(state, 1.1563 + // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION, 1.1564 + // CPPONLY: reconsume, 1.1565 + // CPPONLY: pos); 1.1566 + // CPPONLY: continue stateloop; 1.1567 + // CPPONLY: } 1.1568 + /* 1.1569 + * U+003F QUESTION MARK (?) Parse error. 1.1570 + */ 1.1571 + errProcessingInstruction(); 1.1572 + /* 1.1573 + * Switch to the bogus comment state. 1.1574 + */ 1.1575 + clearLongStrBufAndAppend(c); 1.1576 + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1.1577 + continue stateloop; 1.1578 + case '>': 1.1579 + /* 1.1580 + * U+003E GREATER-THAN SIGN (>) Parse error. 1.1581 + */ 1.1582 + errLtGt(); 1.1583 + /* 1.1584 + * Emit a U+003C LESS-THAN SIGN character token 1.1585 + * and a U+003E GREATER-THAN SIGN character 1.1586 + * token. 1.1587 + */ 1.1588 + tokenHandler.characters(Tokenizer.LT_GT, 0, 2); 1.1589 + /* Switch to the data state. */ 1.1590 + cstart = pos + 1; 1.1591 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.1592 + continue stateloop; 1.1593 + default: 1.1594 + /* 1.1595 + * Anything else Parse error. 1.1596 + */ 1.1597 + errBadCharAfterLt(c); 1.1598 + /* 1.1599 + * Emit a U+003C LESS-THAN SIGN character token 1.1600 + */ 1.1601 + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 1.1602 + /* 1.1603 + * and reconsume the current input character in 1.1604 + * the data state. 
1.1605 + */ 1.1606 + cstart = pos; 1.1607 + reconsume = true; 1.1608 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.1609 + continue stateloop; 1.1610 + } 1.1611 + } 1.1612 + // FALL THROUGH DON'T REORDER 1.1613 + case TAG_NAME: 1.1614 + tagnameloop: for (;;) { 1.1615 + if (++pos == endPos) { 1.1616 + break stateloop; 1.1617 + } 1.1618 + c = checkChar(buf, pos); 1.1619 + /* 1.1620 + * Consume the next input character: 1.1621 + */ 1.1622 + switch (c) { 1.1623 + case '\r': 1.1624 + silentCarriageReturn(); 1.1625 + strBufToElementNameString(); 1.1626 + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1.1627 + break stateloop; 1.1628 + case '\n': 1.1629 + silentLineFeed(); 1.1630 + case ' ': 1.1631 + case '\t': 1.1632 + case '\u000C': 1.1633 + /* 1.1634 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.1635 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1.1636 + * Switch to the before attribute name state. 1.1637 + */ 1.1638 + strBufToElementNameString(); 1.1639 + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1.1640 + break tagnameloop; 1.1641 + // continue stateloop; 1.1642 + case '/': 1.1643 + /* 1.1644 + * U+002F SOLIDUS (/) Switch to the self-closing 1.1645 + * start tag state. 1.1646 + */ 1.1647 + strBufToElementNameString(); 1.1648 + state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1.1649 + continue stateloop; 1.1650 + case '>': 1.1651 + /* 1.1652 + * U+003E GREATER-THAN SIGN (>) Emit the current 1.1653 + * tag token. 1.1654 + */ 1.1655 + strBufToElementNameString(); 1.1656 + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1.1657 + if (shouldSuspend) { 1.1658 + break stateloop; 1.1659 + } 1.1660 + /* 1.1661 + * Switch to the data state. 
1.1662 + */ 1.1663 + continue stateloop; 1.1664 + case '\u0000': 1.1665 + c = '\uFFFD'; 1.1666 + // fall thru 1.1667 + default: 1.1668 + if (c >= 'A' && c <= 'Z') { 1.1669 + /* 1.1670 + * U+0041 LATIN CAPITAL LETTER A through to 1.1671 + * U+005A LATIN CAPITAL LETTER Z Append the 1.1672 + * lowercase version of the current input 1.1673 + * character (add 0x0020 to the character's 1.1674 + * code point) to the current tag token's 1.1675 + * tag name. 1.1676 + */ 1.1677 + c += 0x20; 1.1678 + } 1.1679 + /* 1.1680 + * Anything else Append the current input 1.1681 + * character to the current tag token's tag 1.1682 + * name. 1.1683 + */ 1.1684 + appendStrBuf(c); 1.1685 + /* 1.1686 + * Stay in the tag name state. 1.1687 + */ 1.1688 + continue; 1.1689 + } 1.1690 + } 1.1691 + // FALLTHRU DON'T REORDER 1.1692 + case BEFORE_ATTRIBUTE_NAME: 1.1693 + beforeattributenameloop: for (;;) { 1.1694 + if (reconsume) { 1.1695 + reconsume = false; 1.1696 + } else { 1.1697 + if (++pos == endPos) { 1.1698 + break stateloop; 1.1699 + } 1.1700 + c = checkChar(buf, pos); 1.1701 + } 1.1702 + /* 1.1703 + * Consume the next input character: 1.1704 + */ 1.1705 + switch (c) { 1.1706 + case '\r': 1.1707 + silentCarriageReturn(); 1.1708 + break stateloop; 1.1709 + case '\n': 1.1710 + silentLineFeed(); 1.1711 + // fall thru 1.1712 + case ' ': 1.1713 + case '\t': 1.1714 + case '\u000C': 1.1715 + /* 1.1716 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.1717 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1.1718 + * in the before attribute name state. 1.1719 + */ 1.1720 + continue; 1.1721 + case '/': 1.1722 + /* 1.1723 + * U+002F SOLIDUS (/) Switch to the self-closing 1.1724 + * start tag state. 1.1725 + */ 1.1726 + state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1.1727 + continue stateloop; 1.1728 + case '>': 1.1729 + /* 1.1730 + * U+003E GREATER-THAN SIGN (>) Emit the current 1.1731 + * tag token. 
1.1732 + */ 1.1733 + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1.1734 + if (shouldSuspend) { 1.1735 + break stateloop; 1.1736 + } 1.1737 + /* 1.1738 + * Switch to the data state. 1.1739 + */ 1.1740 + continue stateloop; 1.1741 + case '\u0000': 1.1742 + c = '\uFFFD'; 1.1743 + // fall thru 1.1744 + case '\"': 1.1745 + case '\'': 1.1746 + case '<': 1.1747 + case '=': 1.1748 + /* 1.1749 + * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 1.1750 + * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS 1.1751 + * SIGN (=) Parse error. 1.1752 + */ 1.1753 + errBadCharBeforeAttributeNameOrNull(c); 1.1754 + /* 1.1755 + * Treat it as per the "anything else" entry 1.1756 + * below. 1.1757 + */ 1.1758 + default: 1.1759 + /* 1.1760 + * Anything else Start a new attribute in the 1.1761 + * current tag token. 1.1762 + */ 1.1763 + if (c >= 'A' && c <= 'Z') { 1.1764 + /* 1.1765 + * U+0041 LATIN CAPITAL LETTER A through to 1.1766 + * U+005A LATIN CAPITAL LETTER Z Set that 1.1767 + * attribute's name to the lowercase version 1.1768 + * of the current input character (add 1.1769 + * 0x0020 to the character's code point) 1.1770 + */ 1.1771 + c += 0x20; 1.1772 + } 1.1773 + /* 1.1774 + * Set that attribute's name to the current 1.1775 + * input character, 1.1776 + */ 1.1777 + clearStrBufAndAppend(c); 1.1778 + /* 1.1779 + * and its value to the empty string. 1.1780 + */ 1.1781 + // Will do later. 1.1782 + /* 1.1783 + * Switch to the attribute name state. 
1.1784 + */ 1.1785 + state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); 1.1786 + break beforeattributenameloop; 1.1787 + // continue stateloop; 1.1788 + } 1.1789 + } 1.1790 + // FALLTHRU DON'T REORDER 1.1791 + case ATTRIBUTE_NAME: 1.1792 + attributenameloop: for (;;) { 1.1793 + if (++pos == endPos) { 1.1794 + break stateloop; 1.1795 + } 1.1796 + c = checkChar(buf, pos); 1.1797 + /* 1.1798 + * Consume the next input character: 1.1799 + */ 1.1800 + switch (c) { 1.1801 + case '\r': 1.1802 + silentCarriageReturn(); 1.1803 + attributeNameComplete(); 1.1804 + state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); 1.1805 + break stateloop; 1.1806 + case '\n': 1.1807 + silentLineFeed(); 1.1808 + // fall thru 1.1809 + case ' ': 1.1810 + case '\t': 1.1811 + case '\u000C': 1.1812 + /* 1.1813 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.1814 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1.1815 + * Switch to the after attribute name state. 1.1816 + */ 1.1817 + attributeNameComplete(); 1.1818 + state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); 1.1819 + continue stateloop; 1.1820 + case '/': 1.1821 + /* 1.1822 + * U+002F SOLIDUS (/) Switch to the self-closing 1.1823 + * start tag state. 1.1824 + */ 1.1825 + attributeNameComplete(); 1.1826 + addAttributeWithoutValue(); 1.1827 + state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1.1828 + continue stateloop; 1.1829 + case '=': 1.1830 + /* 1.1831 + * U+003D EQUALS SIGN (=) Switch to the before 1.1832 + * attribute value state. 1.1833 + */ 1.1834 + attributeNameComplete(); 1.1835 + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); 1.1836 + break attributenameloop; 1.1837 + // continue stateloop; 1.1838 + case '>': 1.1839 + /* 1.1840 + * U+003E GREATER-THAN SIGN (>) Emit the current 1.1841 + * tag token. 
1.1842 + */ 1.1843 + attributeNameComplete(); 1.1844 + addAttributeWithoutValue(); 1.1845 + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1.1846 + if (shouldSuspend) { 1.1847 + break stateloop; 1.1848 + } 1.1849 + /* 1.1850 + * Switch to the data state. 1.1851 + */ 1.1852 + continue stateloop; 1.1853 + case '\u0000': 1.1854 + c = '\uFFFD'; 1.1855 + // fall thru 1.1856 + case '\"': 1.1857 + case '\'': 1.1858 + case '<': 1.1859 + /* 1.1860 + * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 1.1861 + * (') U+003C LESS-THAN SIGN (<) Parse error. 1.1862 + */ 1.1863 + errQuoteOrLtInAttributeNameOrNull(c); 1.1864 + /* 1.1865 + * Treat it as per the "anything else" entry 1.1866 + * below. 1.1867 + */ 1.1868 + default: 1.1869 + if (c >= 'A' && c <= 'Z') { 1.1870 + /* 1.1871 + * U+0041 LATIN CAPITAL LETTER A through to 1.1872 + * U+005A LATIN CAPITAL LETTER Z Append the 1.1873 + * lowercase version of the current input 1.1874 + * character (add 0x0020 to the character's 1.1875 + * code point) to the current attribute's 1.1876 + * name. 1.1877 + */ 1.1878 + c += 0x20; 1.1879 + } 1.1880 + /* 1.1881 + * Anything else Append the current input 1.1882 + * character to the current attribute's name. 1.1883 + */ 1.1884 + appendStrBuf(c); 1.1885 + /* 1.1886 + * Stay in the attribute name state. 
1.1887 + */ 1.1888 + continue; 1.1889 + } 1.1890 + } 1.1891 + // FALLTHRU DON'T REORDER 1.1892 + case BEFORE_ATTRIBUTE_VALUE: 1.1893 + beforeattributevalueloop: for (;;) { 1.1894 + if (++pos == endPos) { 1.1895 + break stateloop; 1.1896 + } 1.1897 + c = checkChar(buf, pos); 1.1898 + /* 1.1899 + * Consume the next input character: 1.1900 + */ 1.1901 + switch (c) { 1.1902 + case '\r': 1.1903 + silentCarriageReturn(); 1.1904 + break stateloop; 1.1905 + case '\n': 1.1906 + silentLineFeed(); 1.1907 + // fall thru 1.1908 + case ' ': 1.1909 + case '\t': 1.1910 + case '\u000C': 1.1911 + /* 1.1912 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.1913 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1.1914 + * in the before attribute value state. 1.1915 + */ 1.1916 + continue; 1.1917 + case '"': 1.1918 + /* 1.1919 + * U+0022 QUOTATION MARK (") Switch to the 1.1920 + * attribute value (double-quoted) state. 1.1921 + */ 1.1922 + clearLongStrBuf(); 1.1923 + state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos); 1.1924 + break beforeattributevalueloop; 1.1925 + // continue stateloop; 1.1926 + case '&': 1.1927 + /* 1.1928 + * U+0026 AMPERSAND (&) Switch to the attribute 1.1929 + * value (unquoted) state and reconsume this 1.1930 + * input character. 1.1931 + */ 1.1932 + clearLongStrBuf(); 1.1933 + reconsume = true; 1.1934 + state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); 1.1935 + noteUnquotedAttributeValue(); 1.1936 + continue stateloop; 1.1937 + case '\'': 1.1938 + /* 1.1939 + * U+0027 APOSTROPHE (') Switch to the attribute 1.1940 + * value (single-quoted) state. 1.1941 + */ 1.1942 + clearLongStrBuf(); 1.1943 + state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos); 1.1944 + continue stateloop; 1.1945 + case '>': 1.1946 + /* 1.1947 + * U+003E GREATER-THAN SIGN (>) Parse error. 1.1948 + */ 1.1949 + errAttributeValueMissing(); 1.1950 + /* 1.1951 + * Emit the current tag token. 
1.1952 + */ 1.1953 + addAttributeWithoutValue(); 1.1954 + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1.1955 + if (shouldSuspend) { 1.1956 + break stateloop; 1.1957 + } 1.1958 + /* 1.1959 + * Switch to the data state. 1.1960 + */ 1.1961 + continue stateloop; 1.1962 + case '\u0000': 1.1963 + c = '\uFFFD'; 1.1964 + // fall thru 1.1965 + case '<': 1.1966 + case '=': 1.1967 + case '`': 1.1968 + /* 1.1969 + * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN 1.1970 + * (=) U+0060 GRAVE ACCENT (`) 1.1971 + */ 1.1972 + errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c); 1.1973 + /* 1.1974 + * Treat it as per the "anything else" entry 1.1975 + * below. 1.1976 + */ 1.1977 + default: 1.1978 + // [NOCPP[ 1.1979 + errHtml4NonNameInUnquotedAttribute(c); 1.1980 + // ]NOCPP] 1.1981 + /* 1.1982 + * Anything else Append the current input 1.1983 + * character to the current attribute's value. 1.1984 + */ 1.1985 + clearLongStrBufAndAppend(c); 1.1986 + /* 1.1987 + * Switch to the attribute value (unquoted) 1.1988 + * state. 1.1989 + */ 1.1990 + 1.1991 + state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); 1.1992 + noteUnquotedAttributeValue(); 1.1993 + continue stateloop; 1.1994 + } 1.1995 + } 1.1996 + // FALLTHRU DON'T REORDER 1.1997 + case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 1.1998 + attributevaluedoublequotedloop: for (;;) { 1.1999 + if (reconsume) { 1.2000 + reconsume = false; 1.2001 + } else { 1.2002 + if (++pos == endPos) { 1.2003 + break stateloop; 1.2004 + } 1.2005 + c = checkChar(buf, pos); 1.2006 + } 1.2007 + /* 1.2008 + * Consume the next input character: 1.2009 + */ 1.2010 + switch (c) { 1.2011 + case '"': 1.2012 + /* 1.2013 + * U+0022 QUOTATION MARK (") Switch to the after 1.2014 + * attribute value (quoted) state. 
1.2015 + */ 1.2016 + addAttributeWithValue(); 1.2017 + 1.2018 + state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); 1.2019 + break attributevaluedoublequotedloop; 1.2020 + // continue stateloop; 1.2021 + case '&': 1.2022 + /* 1.2023 + * U+0026 AMPERSAND (&) Switch to the character 1.2024 + * reference in attribute value state, with the 1.2025 + * additional allowed character being U+0022 1.2026 + * QUOTATION MARK ("). 1.2027 + */ 1.2028 + clearStrBufAndAppend(c); 1.2029 + setAdditionalAndRememberAmpersandLocation('\"'); 1.2030 + returnState = state; 1.2031 + state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 1.2032 + continue stateloop; 1.2033 + case '\r': 1.2034 + appendLongStrBufCarriageReturn(); 1.2035 + break stateloop; 1.2036 + case '\n': 1.2037 + appendLongStrBufLineFeed(); 1.2038 + continue; 1.2039 + case '\u0000': 1.2040 + c = '\uFFFD'; 1.2041 + // fall thru 1.2042 + default: 1.2043 + /* 1.2044 + * Anything else Append the current input 1.2045 + * character to the current attribute's value. 1.2046 + */ 1.2047 + appendLongStrBuf(c); 1.2048 + /* 1.2049 + * Stay in the attribute value (double-quoted) 1.2050 + * state. 
1.2051 + */ 1.2052 + continue; 1.2053 + } 1.2054 + } 1.2055 + // FALLTHRU DON'T REORDER 1.2056 + case AFTER_ATTRIBUTE_VALUE_QUOTED: 1.2057 + afterattributevaluequotedloop: for (;;) { 1.2058 + if (++pos == endPos) { 1.2059 + break stateloop; 1.2060 + } 1.2061 + c = checkChar(buf, pos); 1.2062 + /* 1.2063 + * Consume the next input character: 1.2064 + */ 1.2065 + switch (c) { 1.2066 + case '\r': 1.2067 + silentCarriageReturn(); 1.2068 + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1.2069 + break stateloop; 1.2070 + case '\n': 1.2071 + silentLineFeed(); 1.2072 + // fall thru 1.2073 + case ' ': 1.2074 + case '\t': 1.2075 + case '\u000C': 1.2076 + /* 1.2077 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.2078 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1.2079 + * Switch to the before attribute name state. 1.2080 + */ 1.2081 + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1.2082 + continue stateloop; 1.2083 + case '/': 1.2084 + /* 1.2085 + * U+002F SOLIDUS (/) Switch to the self-closing 1.2086 + * start tag state. 1.2087 + */ 1.2088 + state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1.2089 + break afterattributevaluequotedloop; 1.2090 + // continue stateloop; 1.2091 + case '>': 1.2092 + /* 1.2093 + * U+003E GREATER-THAN SIGN (>) Emit the current 1.2094 + * tag token. 1.2095 + */ 1.2096 + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1.2097 + if (shouldSuspend) { 1.2098 + break stateloop; 1.2099 + } 1.2100 + /* 1.2101 + * Switch to the data state. 1.2102 + */ 1.2103 + continue stateloop; 1.2104 + default: 1.2105 + /* 1.2106 + * Anything else Parse error. 1.2107 + */ 1.2108 + errNoSpaceBetweenAttributes(); 1.2109 + /* 1.2110 + * Reconsume the character in the before 1.2111 + * attribute name state. 
1.2112 + */ 1.2113 + reconsume = true; 1.2114 + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1.2115 + continue stateloop; 1.2116 + } 1.2117 + } 1.2118 + // FALLTHRU DON'T REORDER 1.2119 + case SELF_CLOSING_START_TAG: 1.2120 + if (++pos == endPos) { 1.2121 + break stateloop; 1.2122 + } 1.2123 + c = checkChar(buf, pos); 1.2124 + /* 1.2125 + * Consume the next input character: 1.2126 + */ 1.2127 + switch (c) { 1.2128 + case '>': 1.2129 + /* 1.2130 + * U+003E GREATER-THAN SIGN (>) Set the self-closing 1.2131 + * flag of the current tag token. Emit the current 1.2132 + * tag token. 1.2133 + */ 1.2134 + // [NOCPP[ 1.2135 + errHtml4XmlVoidSyntax(); 1.2136 + // ]NOCPP] 1.2137 + state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos); 1.2138 + if (shouldSuspend) { 1.2139 + break stateloop; 1.2140 + } 1.2141 + /* 1.2142 + * Switch to the data state. 1.2143 + */ 1.2144 + continue stateloop; 1.2145 + default: 1.2146 + /* Anything else Parse error. */ 1.2147 + errSlashNotFollowedByGt(); 1.2148 + /* 1.2149 + * Reconsume the character in the before attribute 1.2150 + * name state. 
1.2151 + */ 1.2152 + reconsume = true; 1.2153 + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1.2154 + continue stateloop; 1.2155 + } 1.2156 + // XXX reorder point 1.2157 + case ATTRIBUTE_VALUE_UNQUOTED: 1.2158 + for (;;) { 1.2159 + if (reconsume) { 1.2160 + reconsume = false; 1.2161 + } else { 1.2162 + if (++pos == endPos) { 1.2163 + break stateloop; 1.2164 + } 1.2165 + c = checkChar(buf, pos); 1.2166 + } 1.2167 + /* 1.2168 + * Consume the next input character: 1.2169 + */ 1.2170 + switch (c) { 1.2171 + case '\r': 1.2172 + silentCarriageReturn(); 1.2173 + addAttributeWithValue(); 1.2174 + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1.2175 + break stateloop; 1.2176 + case '\n': 1.2177 + silentLineFeed(); 1.2178 + // fall thru 1.2179 + case ' ': 1.2180 + case '\t': 1.2181 + case '\u000C': 1.2182 + /* 1.2183 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.2184 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1.2185 + * Switch to the before attribute name state. 1.2186 + */ 1.2187 + addAttributeWithValue(); 1.2188 + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1.2189 + continue stateloop; 1.2190 + case '&': 1.2191 + /* 1.2192 + * U+0026 AMPERSAND (&) Switch to the character 1.2193 + * reference in attribute value state, with the 1.2194 + * additional allowed character being U+003E 1.2195 + * GREATER-THAN SIGN (>) 1.2196 + */ 1.2197 + clearStrBufAndAppend(c); 1.2198 + setAdditionalAndRememberAmpersandLocation('>'); 1.2199 + returnState = state; 1.2200 + state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 1.2201 + continue stateloop; 1.2202 + case '>': 1.2203 + /* 1.2204 + * U+003E GREATER-THAN SIGN (>) Emit the current 1.2205 + * tag token. 
1.2206 + */ 1.2207 + addAttributeWithValue(); 1.2208 + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1.2209 + if (shouldSuspend) { 1.2210 + break stateloop; 1.2211 + } 1.2212 + /* 1.2213 + * Switch to the data state. 1.2214 + */ 1.2215 + continue stateloop; 1.2216 + case '\u0000': 1.2217 + c = '\uFFFD'; 1.2218 + // fall thru 1.2219 + case '<': 1.2220 + case '\"': 1.2221 + case '\'': 1.2222 + case '=': 1.2223 + case '`': 1.2224 + /* 1.2225 + * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE 1.2226 + * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS 1.2227 + * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error. 1.2228 + */ 1.2229 + errUnquotedAttributeValOrNull(c); 1.2230 + /* 1.2231 + * Treat it as per the "anything else" entry 1.2232 + * below. 1.2233 + */ 1.2234 + // fall through 1.2235 + default: 1.2236 + // [NOCPP] 1.2237 + errHtml4NonNameInUnquotedAttribute(c); 1.2238 + // ]NOCPP] 1.2239 + /* 1.2240 + * Anything else Append the current input 1.2241 + * character to the current attribute's value. 1.2242 + */ 1.2243 + appendLongStrBuf(c); 1.2244 + /* 1.2245 + * Stay in the attribute value (unquoted) state. 1.2246 + */ 1.2247 + continue; 1.2248 + } 1.2249 + } 1.2250 + // XXX reorder point 1.2251 + case AFTER_ATTRIBUTE_NAME: 1.2252 + for (;;) { 1.2253 + if (++pos == endPos) { 1.2254 + break stateloop; 1.2255 + } 1.2256 + c = checkChar(buf, pos); 1.2257 + /* 1.2258 + * Consume the next input character: 1.2259 + */ 1.2260 + switch (c) { 1.2261 + case '\r': 1.2262 + silentCarriageReturn(); 1.2263 + break stateloop; 1.2264 + case '\n': 1.2265 + silentLineFeed(); 1.2266 + // fall thru 1.2267 + case ' ': 1.2268 + case '\t': 1.2269 + case '\u000C': 1.2270 + /* 1.2271 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.2272 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1.2273 + * in the after attribute name state. 
1.2274 + */ 1.2275 + continue; 1.2276 + case '/': 1.2277 + /* 1.2278 + * U+002F SOLIDUS (/) Switch to the self-closing 1.2279 + * start tag state. 1.2280 + */ 1.2281 + addAttributeWithoutValue(); 1.2282 + state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1.2283 + continue stateloop; 1.2284 + case '=': 1.2285 + /* 1.2286 + * U+003D EQUALS SIGN (=) Switch to the before 1.2287 + * attribute value state. 1.2288 + */ 1.2289 + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); 1.2290 + continue stateloop; 1.2291 + case '>': 1.2292 + /* 1.2293 + * U+003E GREATER-THAN SIGN (>) Emit the current 1.2294 + * tag token. 1.2295 + */ 1.2296 + addAttributeWithoutValue(); 1.2297 + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1.2298 + if (shouldSuspend) { 1.2299 + break stateloop; 1.2300 + } 1.2301 + /* 1.2302 + * Switch to the data state. 1.2303 + */ 1.2304 + continue stateloop; 1.2305 + case '\u0000': 1.2306 + c = '\uFFFD'; 1.2307 + // fall thru 1.2308 + case '\"': 1.2309 + case '\'': 1.2310 + case '<': 1.2311 + errQuoteOrLtInAttributeNameOrNull(c); 1.2312 + /* 1.2313 + * Treat it as per the "anything else" entry 1.2314 + * below. 1.2315 + */ 1.2316 + default: 1.2317 + addAttributeWithoutValue(); 1.2318 + /* 1.2319 + * Anything else Start a new attribute in the 1.2320 + * current tag token. 1.2321 + */ 1.2322 + if (c >= 'A' && c <= 'Z') { 1.2323 + /* 1.2324 + * U+0041 LATIN CAPITAL LETTER A through to 1.2325 + * U+005A LATIN CAPITAL LETTER Z Set that 1.2326 + * attribute's name to the lowercase version 1.2327 + * of the current input character (add 1.2328 + * 0x0020 to the character's code point) 1.2329 + */ 1.2330 + c += 0x20; 1.2331 + } 1.2332 + /* 1.2333 + * Set that attribute's name to the current 1.2334 + * input character, 1.2335 + */ 1.2336 + clearStrBufAndAppend(c); 1.2337 + /* 1.2338 + * and its value to the empty string. 1.2339 + */ 1.2340 + // Will do later. 
1.2341 + /* 1.2342 + * Switch to the attribute name state. 1.2343 + */ 1.2344 + state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); 1.2345 + continue stateloop; 1.2346 + } 1.2347 + } 1.2348 + // XXX reorder point 1.2349 + case MARKUP_DECLARATION_OPEN: 1.2350 + markupdeclarationopenloop: for (;;) { 1.2351 + if (++pos == endPos) { 1.2352 + break stateloop; 1.2353 + } 1.2354 + c = checkChar(buf, pos); 1.2355 + /* 1.2356 + * If the next two characters are both U+002D 1.2357 + * HYPHEN-MINUS characters (-), consume those two 1.2358 + * characters, create a comment token whose data is the 1.2359 + * empty string, and switch to the comment start state. 1.2360 + * 1.2361 + * Otherwise, if the next seven characters are an ASCII 1.2362 + * case-insensitive match for the word "DOCTYPE", then 1.2363 + * consume those characters and switch to the DOCTYPE 1.2364 + * state. 1.2365 + * 1.2366 + * Otherwise, if the insertion mode is 1.2367 + * "in foreign content" and the current node is not an 1.2368 + * element in the HTML namespace and the next seven 1.2369 + * characters are a case-sensitive match for the string 1.2370 + * "[CDATA[" (the five uppercase letters "CDATA" with a 1.2371 + * U+005B LEFT SQUARE BRACKET character before and 1.2372 + * after), then consume those characters and switch to 1.2373 + * the CDATA section state. 1.2374 + * 1.2375 + * Otherwise, this is a parse error. Switch to the bogus 1.2376 + * comment state. The next character that is consumed, 1.2377 + * if any, is the first character that will be in the 1.2378 + * comment. 
1.2379 + */ 1.2380 + switch (c) { 1.2381 + case '-': 1.2382 + clearLongStrBufAndAppend(c); 1.2383 + state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos); 1.2384 + break markupdeclarationopenloop; 1.2385 + // continue stateloop; 1.2386 + case 'd': 1.2387 + case 'D': 1.2388 + clearLongStrBufAndAppend(c); 1.2389 + index = 0; 1.2390 + state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos); 1.2391 + continue stateloop; 1.2392 + case '[': 1.2393 + if (tokenHandler.cdataSectionAllowed()) { 1.2394 + clearLongStrBufAndAppend(c); 1.2395 + index = 0; 1.2396 + state = transition(state, Tokenizer.CDATA_START, reconsume, pos); 1.2397 + continue stateloop; 1.2398 + } 1.2399 + // else fall through 1.2400 + default: 1.2401 + errBogusComment(); 1.2402 + clearLongStrBuf(); 1.2403 + reconsume = true; 1.2404 + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1.2405 + continue stateloop; 1.2406 + } 1.2407 + } 1.2408 + // FALLTHRU DON'T REORDER 1.2409 + case MARKUP_DECLARATION_HYPHEN: 1.2410 + markupdeclarationhyphenloop: for (;;) { 1.2411 + if (++pos == endPos) { 1.2412 + break stateloop; 1.2413 + } 1.2414 + c = checkChar(buf, pos); 1.2415 + switch (c) { 1.2416 + case '\u0000': 1.2417 + break stateloop; 1.2418 + case '-': 1.2419 + clearLongStrBuf(); 1.2420 + state = transition(state, Tokenizer.COMMENT_START, reconsume, pos); 1.2421 + break markupdeclarationhyphenloop; 1.2422 + // continue stateloop; 1.2423 + default: 1.2424 + errBogusComment(); 1.2425 + reconsume = true; 1.2426 + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1.2427 + continue stateloop; 1.2428 + } 1.2429 + } 1.2430 + // FALLTHRU DON'T REORDER 1.2431 + case COMMENT_START: 1.2432 + commentstartloop: for (;;) { 1.2433 + if (++pos == endPos) { 1.2434 + break stateloop; 1.2435 + } 1.2436 + c = checkChar(buf, pos); 1.2437 + /* 1.2438 + * Comment start state 1.2439 + * 1.2440 + * 1.2441 + * Consume the next input character: 1.2442 + */ 
1.2443 + switch (c) { 1.2444 + case '-': 1.2445 + /* 1.2446 + * U+002D HYPHEN-MINUS (-) Switch to the comment 1.2447 + * start dash state. 1.2448 + */ 1.2449 + appendLongStrBuf(c); 1.2450 + state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos); 1.2451 + continue stateloop; 1.2452 + case '>': 1.2453 + /* 1.2454 + * U+003E GREATER-THAN SIGN (>) Parse error. 1.2455 + */ 1.2456 + errPrematureEndOfComment(); 1.2457 + /* Emit the comment token. */ 1.2458 + emitComment(0, pos); 1.2459 + /* 1.2460 + * Switch to the data state. 1.2461 + */ 1.2462 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.2463 + continue stateloop; 1.2464 + case '\r': 1.2465 + appendLongStrBufCarriageReturn(); 1.2466 + state = transition(state, Tokenizer.COMMENT, reconsume, pos); 1.2467 + break stateloop; 1.2468 + case '\n': 1.2469 + appendLongStrBufLineFeed(); 1.2470 + state = transition(state, Tokenizer.COMMENT, reconsume, pos); 1.2471 + break commentstartloop; 1.2472 + case '\u0000': 1.2473 + c = '\uFFFD'; 1.2474 + // fall thru 1.2475 + default: 1.2476 + /* 1.2477 + * Anything else Append the input character to 1.2478 + * the comment token's data. 1.2479 + */ 1.2480 + appendLongStrBuf(c); 1.2481 + /* 1.2482 + * Switch to the comment state. 
1.2483 + */ 1.2484 + state = transition(state, Tokenizer.COMMENT, reconsume, pos); 1.2485 + break commentstartloop; 1.2486 + // continue stateloop; 1.2487 + } 1.2488 + } 1.2489 + // FALLTHRU DON'T REORDER 1.2490 + case COMMENT: 1.2491 + commentloop: for (;;) { 1.2492 + if (++pos == endPos) { 1.2493 + break stateloop; 1.2494 + } 1.2495 + c = checkChar(buf, pos); 1.2496 + /* 1.2497 + * Comment state Consume the next input character: 1.2498 + */ 1.2499 + switch (c) { 1.2500 + case '-': 1.2501 + /* 1.2502 + * U+002D HYPHEN-MINUS (-) Switch to the comment 1.2503 + * end dash state 1.2504 + */ 1.2505 + appendLongStrBuf(c); 1.2506 + state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); 1.2507 + break commentloop; 1.2508 + // continue stateloop; 1.2509 + case '\r': 1.2510 + appendLongStrBufCarriageReturn(); 1.2511 + break stateloop; 1.2512 + case '\n': 1.2513 + appendLongStrBufLineFeed(); 1.2514 + continue; 1.2515 + case '\u0000': 1.2516 + c = '\uFFFD'; 1.2517 + // fall thru 1.2518 + default: 1.2519 + /* 1.2520 + * Anything else Append the input character to 1.2521 + * the comment token's data. 1.2522 + */ 1.2523 + appendLongStrBuf(c); 1.2524 + /* 1.2525 + * Stay in the comment state. 
1.2526 + */ 1.2527 + continue; 1.2528 + } 1.2529 + } 1.2530 + // FALLTHRU DON'T REORDER 1.2531 + case COMMENT_END_DASH: 1.2532 + commentenddashloop: for (;;) { 1.2533 + if (++pos == endPos) { 1.2534 + break stateloop; 1.2535 + } 1.2536 + c = checkChar(buf, pos); 1.2537 + /* 1.2538 + * Comment end dash state Consume the next input 1.2539 + * character: 1.2540 + */ 1.2541 + switch (c) { 1.2542 + case '-': 1.2543 + /* 1.2544 + * U+002D HYPHEN-MINUS (-) Switch to the comment 1.2545 + * end state 1.2546 + */ 1.2547 + appendLongStrBuf(c); 1.2548 + state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); 1.2549 + break commentenddashloop; 1.2550 + // continue stateloop; 1.2551 + case '\r': 1.2552 + appendLongStrBufCarriageReturn(); 1.2553 + state = transition(state, Tokenizer.COMMENT, reconsume, pos); 1.2554 + break stateloop; 1.2555 + case '\n': 1.2556 + appendLongStrBufLineFeed(); 1.2557 + state = transition(state, Tokenizer.COMMENT, reconsume, pos); 1.2558 + continue stateloop; 1.2559 + case '\u0000': 1.2560 + c = '\uFFFD'; 1.2561 + // fall thru 1.2562 + default: 1.2563 + /* 1.2564 + * Anything else Append a U+002D HYPHEN-MINUS 1.2565 + * (-) character and the input character to the 1.2566 + * comment token's data. 1.2567 + */ 1.2568 + appendLongStrBuf(c); 1.2569 + /* 1.2570 + * Switch to the comment state. 1.2571 + */ 1.2572 + state = transition(state, Tokenizer.COMMENT, reconsume, pos); 1.2573 + continue stateloop; 1.2574 + } 1.2575 + } 1.2576 + // FALLTHRU DON'T REORDER 1.2577 + case COMMENT_END: 1.2578 + commentendloop: for (;;) { 1.2579 + if (++pos == endPos) { 1.2580 + break stateloop; 1.2581 + } 1.2582 + c = checkChar(buf, pos); 1.2583 + /* 1.2584 + * Comment end dash state Consume the next input 1.2585 + * character: 1.2586 + */ 1.2587 + switch (c) { 1.2588 + case '>': 1.2589 + /* 1.2590 + * U+003E GREATER-THAN SIGN (>) Emit the comment 1.2591 + * token. 1.2592 + */ 1.2593 + emitComment(2, pos); 1.2594 + /* 1.2595 + * Switch to the data state. 
1.2596 + */ 1.2597 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.2598 + continue stateloop; 1.2599 + case '-': 1.2600 + /* U+002D HYPHEN-MINUS (-) Parse error. */ 1.2601 + /* 1.2602 + * Append a U+002D HYPHEN-MINUS (-) character to 1.2603 + * the comment token's data. 1.2604 + */ 1.2605 + adjustDoubleHyphenAndAppendToLongStrBufAndErr(c); 1.2606 + /* 1.2607 + * Stay in the comment end state. 1.2608 + */ 1.2609 + continue; 1.2610 + case '\r': 1.2611 + adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn(); 1.2612 + state = transition(state, Tokenizer.COMMENT, reconsume, pos); 1.2613 + break stateloop; 1.2614 + case '\n': 1.2615 + adjustDoubleHyphenAndAppendToLongStrBufLineFeed(); 1.2616 + state = transition(state, Tokenizer.COMMENT, reconsume, pos); 1.2617 + continue stateloop; 1.2618 + case '!': 1.2619 + errHyphenHyphenBang(); 1.2620 + appendLongStrBuf(c); 1.2621 + state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos); 1.2622 + continue stateloop; 1.2623 + case '\u0000': 1.2624 + c = '\uFFFD'; 1.2625 + // fall thru 1.2626 + default: 1.2627 + /* 1.2628 + * Append two U+002D HYPHEN-MINUS (-) characters 1.2629 + * and the input character to the comment 1.2630 + * token's data. 1.2631 + */ 1.2632 + adjustDoubleHyphenAndAppendToLongStrBufAndErr(c); 1.2633 + /* 1.2634 + * Switch to the comment state. 1.2635 + */ 1.2636 + state = transition(state, Tokenizer.COMMENT, reconsume, pos); 1.2637 + continue stateloop; 1.2638 + } 1.2639 + } 1.2640 + // XXX reorder point 1.2641 + case COMMENT_END_BANG: 1.2642 + for (;;) { 1.2643 + if (++pos == endPos) { 1.2644 + break stateloop; 1.2645 + } 1.2646 + c = checkChar(buf, pos); 1.2647 + /* 1.2648 + * Comment end bang state 1.2649 + * 1.2650 + * Consume the next input character: 1.2651 + */ 1.2652 + switch (c) { 1.2653 + case '>': 1.2654 + /* 1.2655 + * U+003E GREATER-THAN SIGN (>) Emit the comment 1.2656 + * token. 
1.2657 + */ 1.2658 + emitComment(3, pos); 1.2659 + /* 1.2660 + * Switch to the data state. 1.2661 + */ 1.2662 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.2663 + continue stateloop; 1.2664 + case '-': 1.2665 + /* 1.2666 + * Append two U+002D HYPHEN-MINUS (-) characters 1.2667 + * and a U+0021 EXCLAMATION MARK (!) character 1.2668 + * to the comment token's data. 1.2669 + */ 1.2670 + appendLongStrBuf(c); 1.2671 + /* 1.2672 + * Switch to the comment end dash state. 1.2673 + */ 1.2674 + state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); 1.2675 + continue stateloop; 1.2676 + case '\r': 1.2677 + appendLongStrBufCarriageReturn(); 1.2678 + break stateloop; 1.2679 + case '\n': 1.2680 + appendLongStrBufLineFeed(); 1.2681 + continue; 1.2682 + case '\u0000': 1.2683 + c = '\uFFFD'; 1.2684 + // fall thru 1.2685 + default: 1.2686 + /* 1.2687 + * Anything else Append two U+002D HYPHEN-MINUS 1.2688 + * (-) characters, a U+0021 EXCLAMATION MARK (!) 1.2689 + * character, and the input character to the 1.2690 + * comment token's data. Switch to the comment 1.2691 + * state. 1.2692 + */ 1.2693 + appendLongStrBuf(c); 1.2694 + /* 1.2695 + * Switch to the comment state. 1.2696 + */ 1.2697 + state = transition(state, Tokenizer.COMMENT, reconsume, pos); 1.2698 + continue stateloop; 1.2699 + } 1.2700 + } 1.2701 + // XXX reorder point 1.2702 + case COMMENT_START_DASH: 1.2703 + if (++pos == endPos) { 1.2704 + break stateloop; 1.2705 + } 1.2706 + c = checkChar(buf, pos); 1.2707 + /* 1.2708 + * Comment start dash state 1.2709 + * 1.2710 + * Consume the next input character: 1.2711 + */ 1.2712 + switch (c) { 1.2713 + case '-': 1.2714 + /* 1.2715 + * U+002D HYPHEN-MINUS (-) Switch to the comment end 1.2716 + * state 1.2717 + */ 1.2718 + appendLongStrBuf(c); 1.2719 + state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); 1.2720 + continue stateloop; 1.2721 + case '>': 1.2722 + errPrematureEndOfComment(); 1.2723 + /* Emit the comment token. 
*/ 1.2724 + emitComment(1, pos); 1.2725 + /* 1.2726 + * Switch to the data state. 1.2727 + */ 1.2728 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.2729 + continue stateloop; 1.2730 + case '\r': 1.2731 + appendLongStrBufCarriageReturn(); 1.2732 + state = transition(state, Tokenizer.COMMENT, reconsume, pos); 1.2733 + break stateloop; 1.2734 + case '\n': 1.2735 + appendLongStrBufLineFeed(); 1.2736 + state = transition(state, Tokenizer.COMMENT, reconsume, pos); 1.2737 + continue stateloop; 1.2738 + case '\u0000': 1.2739 + c = '\uFFFD'; 1.2740 + // fall thru 1.2741 + default: 1.2742 + /* 1.2743 + * Append a U+002D HYPHEN-MINUS character (-) and 1.2744 + * the current input character to the comment 1.2745 + * token's data. 1.2746 + */ 1.2747 + appendLongStrBuf(c); 1.2748 + /* 1.2749 + * Switch to the comment state. 1.2750 + */ 1.2751 + state = transition(state, Tokenizer.COMMENT, reconsume, pos); 1.2752 + continue stateloop; 1.2753 + } 1.2754 + // XXX reorder point 1.2755 + case CDATA_START: 1.2756 + for (;;) { 1.2757 + if (++pos == endPos) { 1.2758 + break stateloop; 1.2759 + } 1.2760 + c = checkChar(buf, pos); 1.2761 + if (index < 6) { // CDATA_LSQB.length 1.2762 + if (c == Tokenizer.CDATA_LSQB[index]) { 1.2763 + appendLongStrBuf(c); 1.2764 + } else { 1.2765 + errBogusComment(); 1.2766 + reconsume = true; 1.2767 + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1.2768 + continue stateloop; 1.2769 + } 1.2770 + index++; 1.2771 + continue; 1.2772 + } else { 1.2773 + cstart = pos; // start coalescing 1.2774 + reconsume = true; 1.2775 + state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 1.2776 + break; // FALL THROUGH continue stateloop; 1.2777 + } 1.2778 + } 1.2779 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.2780 + case CDATA_SECTION: 1.2781 + cdatasectionloop: for (;;) { 1.2782 + if (reconsume) { 1.2783 + reconsume = false; 1.2784 + } else { 1.2785 + if (++pos == endPos) { 1.2786 + break stateloop; 1.2787 + 
} 1.2788 + c = checkChar(buf, pos); 1.2789 + } 1.2790 + switch (c) { 1.2791 + case ']': 1.2792 + flushChars(buf, pos); 1.2793 + state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos); 1.2794 + break cdatasectionloop; // FALL THROUGH 1.2795 + case '\u0000': 1.2796 + emitReplacementCharacter(buf, pos); 1.2797 + continue; 1.2798 + case '\r': 1.2799 + emitCarriageReturn(buf, pos); 1.2800 + break stateloop; 1.2801 + case '\n': 1.2802 + silentLineFeed(); 1.2803 + // fall thru 1.2804 + default: 1.2805 + continue; 1.2806 + } 1.2807 + } 1.2808 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.2809 + case CDATA_RSQB: 1.2810 + cdatarsqb: for (;;) { 1.2811 + if (++pos == endPos) { 1.2812 + break stateloop; 1.2813 + } 1.2814 + c = checkChar(buf, pos); 1.2815 + switch (c) { 1.2816 + case ']': 1.2817 + state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos); 1.2818 + break cdatarsqb; 1.2819 + default: 1.2820 + tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1.2821 + 1); 1.2822 + cstart = pos; 1.2823 + reconsume = true; 1.2824 + state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 1.2825 + continue stateloop; 1.2826 + } 1.2827 + } 1.2828 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.2829 + case CDATA_RSQB_RSQB: 1.2830 + cdatarsqbrsqb: for (;;) { 1.2831 + if (++pos == endPos) { 1.2832 + break stateloop; 1.2833 + } 1.2834 + c = checkChar(buf, pos); 1.2835 + switch (c) { 1.2836 + case ']': 1.2837 + // Saw a third ]. Emit one ] (logically the 1.2838 + // first one) and stay in this state to 1.2839 + // remember that the last two characters seen 1.2840 + // have been ]]. 
1.2841 + tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); 1.2842 + continue; 1.2843 + case '>': 1.2844 + cstart = pos + 1; 1.2845 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.2846 + continue stateloop; 1.2847 + default: 1.2848 + tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); 1.2849 + cstart = pos; 1.2850 + reconsume = true; 1.2851 + state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); 1.2852 + continue stateloop; 1.2853 + } 1.2854 + } 1.2855 + // XXX reorder point 1.2856 + case ATTRIBUTE_VALUE_SINGLE_QUOTED: 1.2857 + attributevaluesinglequotedloop: for (;;) { 1.2858 + if (reconsume) { 1.2859 + reconsume = false; 1.2860 + } else { 1.2861 + if (++pos == endPos) { 1.2862 + break stateloop; 1.2863 + } 1.2864 + c = checkChar(buf, pos); 1.2865 + } 1.2866 + /* 1.2867 + * Consume the next input character: 1.2868 + */ 1.2869 + switch (c) { 1.2870 + case '\'': 1.2871 + /* 1.2872 + * U+0027 APOSTROPHE (') Switch to the after 1.2873 + * attribute value (quoted) state. 1.2874 + */ 1.2875 + addAttributeWithValue(); 1.2876 + 1.2877 + state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); 1.2878 + continue stateloop; 1.2879 + case '&': 1.2880 + /* 1.2881 + * U+0026 AMPERSAND (&) Switch to the character 1.2882 + * reference in attribute value state, with the 1.2883 + * + additional allowed character being U+0027 1.2884 + * APOSTROPHE ('). 
1.2885 + */ 1.2886 + clearStrBufAndAppend(c); 1.2887 + setAdditionalAndRememberAmpersandLocation('\''); 1.2888 + returnState = state; 1.2889 + state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 1.2890 + break attributevaluesinglequotedloop; 1.2891 + // continue stateloop; 1.2892 + case '\r': 1.2893 + appendLongStrBufCarriageReturn(); 1.2894 + break stateloop; 1.2895 + case '\n': 1.2896 + appendLongStrBufLineFeed(); 1.2897 + continue; 1.2898 + case '\u0000': 1.2899 + c = '\uFFFD'; 1.2900 + // fall thru 1.2901 + default: 1.2902 + /* 1.2903 + * Anything else Append the current input 1.2904 + * character to the current attribute's value. 1.2905 + */ 1.2906 + appendLongStrBuf(c); 1.2907 + /* 1.2908 + * Stay in the attribute value (double-quoted) 1.2909 + * state. 1.2910 + */ 1.2911 + continue; 1.2912 + } 1.2913 + } 1.2914 + // FALLTHRU DON'T REORDER 1.2915 + case CONSUME_CHARACTER_REFERENCE: 1.2916 + if (++pos == endPos) { 1.2917 + break stateloop; 1.2918 + } 1.2919 + c = checkChar(buf, pos); 1.2920 + if (c == '\u0000') { 1.2921 + break stateloop; 1.2922 + } 1.2923 + /* 1.2924 + * Unlike the definition is the spec, this state does not 1.2925 + * return a value and never requires the caller to 1.2926 + * backtrack. This state takes care of emitting characters 1.2927 + * or appending to the current attribute value. It also 1.2928 + * takes care of that in the case when consuming the 1.2929 + * character reference fails. 1.2930 + */ 1.2931 + /* 1.2932 + * This section defines how to consume a character 1.2933 + * reference. This definition is used when parsing character 1.2934 + * references in text and in attributes. 1.2935 + * 1.2936 + * The behavior depends on the identity of the next 1.2937 + * character (the one immediately after the U+0026 AMPERSAND 1.2938 + * character): 1.2939 + */ 1.2940 + switch (c) { 1.2941 + case ' ': 1.2942 + case '\t': 1.2943 + case '\n': 1.2944 + case '\r': // we'll reconsume! 
1.2945 + case '\u000C': 1.2946 + case '<': 1.2947 + case '&': 1.2948 + emitOrAppendStrBuf(returnState); 1.2949 + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 1.2950 + cstart = pos; 1.2951 + } 1.2952 + reconsume = true; 1.2953 + state = transition(state, returnState, reconsume, pos); 1.2954 + continue stateloop; 1.2955 + case '#': 1.2956 + /* 1.2957 + * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER 1.2958 + * SIGN. 1.2959 + */ 1.2960 + appendStrBuf('#'); 1.2961 + state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos); 1.2962 + continue stateloop; 1.2963 + default: 1.2964 + if (c == additional) { 1.2965 + emitOrAppendStrBuf(returnState); 1.2966 + reconsume = true; 1.2967 + state = transition(state, returnState, reconsume, pos); 1.2968 + continue stateloop; 1.2969 + } 1.2970 + if (c >= 'a' && c <= 'z') { 1.2971 + firstCharKey = c - 'a' + 26; 1.2972 + } else if (c >= 'A' && c <= 'Z') { 1.2973 + firstCharKey = c - 'A'; 1.2974 + } else { 1.2975 + // No match 1.2976 + /* 1.2977 + * If no match can be made, then this is a parse 1.2978 + * error. 
1.2979 + */ 1.2980 + errNoNamedCharacterMatch(); 1.2981 + emitOrAppendStrBuf(returnState); 1.2982 + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 1.2983 + cstart = pos; 1.2984 + } 1.2985 + reconsume = true; 1.2986 + state = transition(state, returnState, reconsume, pos); 1.2987 + continue stateloop; 1.2988 + } 1.2989 + // Didn't fail yet 1.2990 + appendStrBuf(c); 1.2991 + state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos); 1.2992 + // FALL THROUGH continue stateloop; 1.2993 + } 1.2994 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.2995 + case CHARACTER_REFERENCE_HILO_LOOKUP: 1.2996 + { 1.2997 + if (++pos == endPos) { 1.2998 + break stateloop; 1.2999 + } 1.3000 + c = checkChar(buf, pos); 1.3001 + if (c == '\u0000') { 1.3002 + break stateloop; 1.3003 + } 1.3004 + /* 1.3005 + * The data structure is as follows: 1.3006 + * 1.3007 + * HILO_ACCEL is a two-dimensional int array whose major 1.3008 + * index corresponds to the second character of the 1.3009 + * character reference (code point as index) and the 1.3010 + * minor index corresponds to the first character of the 1.3011 + * character reference (packed so that A-Z runs from 0 1.3012 + * to 25 and a-z runs from 26 to 51). This layout makes 1.3013 + * it easier to use the sparseness of the data structure 1.3014 + * to omit parts of it: The second dimension of the 1.3015 + * table is null when no character reference starts with 1.3016 + * the character corresponding to that row. 1.3017 + * 1.3018 + * The int value HILO_ACCEL (by these indeces) is zero 1.3019 + * if there exists no character reference starting with 1.3020 + * that two-letter prefix. Otherwise, the value is an 1.3021 + * int that packs two shorts so that the higher short is 1.3022 + * the index of the highest character reference name 1.3023 + * with that prefix in NAMES and the lower short 1.3024 + * corresponds to the index of the lowest character 1.3025 + * reference name with that prefix. 
(It happens that the 1.3026 + * first two character reference names share their 1.3027 + * prefix so the packed int cannot be 0 by packing the 1.3028 + * two shorts.) 1.3029 + * 1.3030 + * NAMES is an array of byte arrays where each byte 1.3031 + * array encodes the name of a character reference as 1.3032 + * ASCII. The names omit the first two letters of the 1.3033 + * name. (Since storing the first two letters would be 1.3034 + * redundant with the data contained in HILO_ACCEL.) The 1.3035 + * entries are lexically sorted. 1.3036 + * 1.3037 + * For a given index in NAMES, the same index in VALUES 1.3038 + * contains the corresponding expansion as an array of 1.3039 + * two UTF-16 code units (either the character and 1.3040 + * U+0000 or a surrogate pair). 1.3041 + */ 1.3042 + int hilo = 0; 1.3043 + if (c <= 'z') { 1.3044 + @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c]; 1.3045 + if (row != null) { 1.3046 + hilo = row[firstCharKey]; 1.3047 + } 1.3048 + } 1.3049 + if (hilo == 0) { 1.3050 + /* 1.3051 + * If no match can be made, then this is a parse 1.3052 + * error. 
1.3053 + */ 1.3054 + errNoNamedCharacterMatch(); 1.3055 + emitOrAppendStrBuf(returnState); 1.3056 + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 1.3057 + cstart = pos; 1.3058 + } 1.3059 + reconsume = true; 1.3060 + state = transition(state, returnState, reconsume, pos); 1.3061 + continue stateloop; 1.3062 + } 1.3063 + // Didn't fail yet 1.3064 + appendStrBuf(c); 1.3065 + lo = hilo & 0xFFFF; 1.3066 + hi = hilo >> 16; 1.3067 + entCol = -1; 1.3068 + candidate = -1; 1.3069 + strBufMark = 0; 1.3070 + state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos); 1.3071 + // FALL THROUGH continue stateloop; 1.3072 + } 1.3073 + case CHARACTER_REFERENCE_TAIL: 1.3074 + outer: for (;;) { 1.3075 + if (++pos == endPos) { 1.3076 + break stateloop; 1.3077 + } 1.3078 + c = checkChar(buf, pos); 1.3079 + if (c == '\u0000') { 1.3080 + break stateloop; 1.3081 + } 1.3082 + entCol++; 1.3083 + /* 1.3084 + * Consume the maximum number of characters possible, 1.3085 + * with the consumed characters matching one of the 1.3086 + * identifiers in the first column of the named 1.3087 + * character references table (in a case-sensitive 1.3088 + * manner). 
1.3089 + */ 1.3090 + loloop: for (;;) { 1.3091 + if (hi < lo) { 1.3092 + break outer; 1.3093 + } 1.3094 + if (entCol == NamedCharacters.NAMES[lo].length()) { 1.3095 + candidate = lo; 1.3096 + strBufMark = strBufLen; 1.3097 + lo++; 1.3098 + } else if (entCol > NamedCharacters.NAMES[lo].length()) { 1.3099 + break outer; 1.3100 + } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { 1.3101 + lo++; 1.3102 + } else { 1.3103 + break loloop; 1.3104 + } 1.3105 + } 1.3106 + 1.3107 + hiloop: for (;;) { 1.3108 + if (hi < lo) { 1.3109 + break outer; 1.3110 + } 1.3111 + if (entCol == NamedCharacters.NAMES[hi].length()) { 1.3112 + break hiloop; 1.3113 + } 1.3114 + if (entCol > NamedCharacters.NAMES[hi].length()) { 1.3115 + break outer; 1.3116 + } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { 1.3117 + hi--; 1.3118 + } else { 1.3119 + break hiloop; 1.3120 + } 1.3121 + } 1.3122 + 1.3123 + if (c == ';') { 1.3124 + // If we see a semicolon, there cannot be a 1.3125 + // longer match. Break the loop. However, before 1.3126 + // breaking, take the longest match so far as the 1.3127 + // candidate, if we are just about to complete a 1.3128 + // match. 1.3129 + if (entCol + 1 == NamedCharacters.NAMES[lo].length()) { 1.3130 + candidate = lo; 1.3131 + strBufMark = strBufLen; 1.3132 + } 1.3133 + break outer; 1.3134 + } 1.3135 + 1.3136 + if (hi < lo) { 1.3137 + break outer; 1.3138 + } 1.3139 + appendStrBuf(c); 1.3140 + continue; 1.3141 + } 1.3142 + 1.3143 + if (candidate == -1) { 1.3144 + // reconsume deals with CR, LF or nul 1.3145 + /* 1.3146 + * If no match can be made, then this is a parse error. 
1.3147 + */ 1.3148 + errNoNamedCharacterMatch(); 1.3149 + emitOrAppendStrBuf(returnState); 1.3150 + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 1.3151 + cstart = pos; 1.3152 + } 1.3153 + reconsume = true; 1.3154 + state = transition(state, returnState, reconsume, pos); 1.3155 + continue stateloop; 1.3156 + } else { 1.3157 + // c can't be CR, LF or nul if we got here 1.3158 + @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; 1.3159 + if (candidateName.length() == 0 1.3160 + || candidateName.charAt(candidateName.length() - 1) != ';') { 1.3161 + /* 1.3162 + * If the last character matched is not a U+003B 1.3163 + * SEMICOLON (;), there is a parse error. 1.3164 + */ 1.3165 + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 1.3166 + /* 1.3167 + * If the entity is being consumed as part of an 1.3168 + * attribute, and the last character matched is 1.3169 + * not a U+003B SEMICOLON (;), 1.3170 + */ 1.3171 + char ch; 1.3172 + if (strBufMark == strBufLen) { 1.3173 + ch = c; 1.3174 + } else { 1.3175 + // if (strBufOffset != -1) { 1.3176 + // ch = buf[strBufOffset + strBufMark]; 1.3177 + // } else { 1.3178 + ch = strBuf[strBufMark]; 1.3179 + // } 1.3180 + } 1.3181 + if (ch == '=' || (ch >= '0' && ch <= '9') 1.3182 + || (ch >= 'A' && ch <= 'Z') 1.3183 + || (ch >= 'a' && ch <= 'z')) { 1.3184 + /* 1.3185 + * and the next character is either a U+003D 1.3186 + * EQUALS SIGN character (=) or in the range 1.3187 + * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, 1.3188 + * U+0041 LATIN CAPITAL LETTER A to U+005A 1.3189 + * LATIN CAPITAL LETTER Z, or U+0061 LATIN 1.3190 + * SMALL LETTER A to U+007A LATIN SMALL 1.3191 + * LETTER Z, then, for historical reasons, 1.3192 + * all the characters that were matched 1.3193 + * after the U+0026 AMPERSAND (&) must be 1.3194 + * unconsumed, and nothing is returned. 
1.3195 + */ 1.3196 + errNoNamedCharacterMatch(); 1.3197 + appendStrBufToLongStrBuf(); 1.3198 + reconsume = true; 1.3199 + state = transition(state, returnState, reconsume, pos); 1.3200 + continue stateloop; 1.3201 + } 1.3202 + } 1.3203 + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 1.3204 + errUnescapedAmpersandInterpretedAsCharacterReference(); 1.3205 + } else { 1.3206 + errNotSemicolonTerminated(); 1.3207 + } 1.3208 + } 1.3209 + 1.3210 + /* 1.3211 + * Otherwise, return a character token for the character 1.3212 + * corresponding to the entity name (as given by the 1.3213 + * second column of the named character references 1.3214 + * table). 1.3215 + */ 1.3216 + // CPPONLY: completedNamedCharacterReference(); 1.3217 + @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; 1.3218 + if ( 1.3219 + // [NOCPP[ 1.3220 + val.length == 1 1.3221 + // ]NOCPP] 1.3222 + // CPPONLY: val[1] == 0 1.3223 + ) { 1.3224 + emitOrAppendOne(val, returnState); 1.3225 + } else { 1.3226 + emitOrAppendTwo(val, returnState); 1.3227 + } 1.3228 + // this is so complicated! 1.3229 + if (strBufMark < strBufLen) { 1.3230 + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 1.3231 + for (int i = strBufMark; i < strBufLen; i++) { 1.3232 + appendLongStrBuf(strBuf[i]); 1.3233 + } 1.3234 + } else { 1.3235 + tokenHandler.characters(strBuf, strBufMark, 1.3236 + strBufLen - strBufMark); 1.3237 + } 1.3238 + } 1.3239 + // Check if we broke out early with c being the last 1.3240 + // character that matched as opposed to being the 1.3241 + // first one that didn't match. In the case of an 1.3242 + // early break, the next run on text should start 1.3243 + // *after* the current character and the current 1.3244 + // character shouldn't be reconsumed. 1.3245 + boolean earlyBreak = (c == ';' && strBufMark == strBufLen); 1.3246 + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 1.3247 + cstart = earlyBreak ? 
pos + 1 : pos; 1.3248 + } 1.3249 + reconsume = !earlyBreak; 1.3250 + state = transition(state, returnState, reconsume, pos); 1.3251 + continue stateloop; 1.3252 + /* 1.3253 + * If the markup contains I'm ¬it; I tell you, the 1.3254 + * entity is parsed as "not", as in, I'm ¬it; I tell 1.3255 + * you. But if the markup was I'm ∉ I tell you, 1.3256 + * the entity would be parsed as "notin;", resulting in 1.3257 + * I'm ∉ I tell you. 1.3258 + */ 1.3259 + } 1.3260 + // XXX reorder point 1.3261 + case CONSUME_NCR: 1.3262 + if (++pos == endPos) { 1.3263 + break stateloop; 1.3264 + } 1.3265 + c = checkChar(buf, pos); 1.3266 + prevValue = -1; 1.3267 + value = 0; 1.3268 + seenDigits = false; 1.3269 + /* 1.3270 + * The behavior further depends on the character after the 1.3271 + * U+0023 NUMBER SIGN: 1.3272 + */ 1.3273 + switch (c) { 1.3274 + case 'x': 1.3275 + case 'X': 1.3276 + 1.3277 + /* 1.3278 + * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL 1.3279 + * LETTER X Consume the X. 1.3280 + * 1.3281 + * Follow the steps below, but using the range of 1.3282 + * characters U+0030 DIGIT ZERO through to U+0039 1.3283 + * DIGIT NINE, U+0061 LATIN SMALL LETTER A through 1.3284 + * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN 1.3285 + * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL 1.3286 + * LETTER F (in other words, 0-9, A-F, a-f). 1.3287 + * 1.3288 + * When it comes to interpreting the number, 1.3289 + * interpret it as a hexadecimal number. 1.3290 + */ 1.3291 + appendStrBuf(c); 1.3292 + state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos); 1.3293 + continue stateloop; 1.3294 + default: 1.3295 + /* 1.3296 + * Anything else Follow the steps below, but using 1.3297 + * the range of characters U+0030 DIGIT ZERO through 1.3298 + * to U+0039 DIGIT NINE (i.e. just 0-9). 1.3299 + * 1.3300 + * When it comes to interpreting the number, 1.3301 + * interpret it as a decimal number. 
1.3302 + */ 1.3303 + reconsume = true; 1.3304 + state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos); 1.3305 + // FALL THROUGH continue stateloop; 1.3306 + } 1.3307 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.3308 + case DECIMAL_NRC_LOOP: 1.3309 + decimalloop: for (;;) { 1.3310 + if (reconsume) { 1.3311 + reconsume = false; 1.3312 + } else { 1.3313 + if (++pos == endPos) { 1.3314 + break stateloop; 1.3315 + } 1.3316 + c = checkChar(buf, pos); 1.3317 + } 1.3318 + // Deal with overflow gracefully 1.3319 + if (value < prevValue) { 1.3320 + value = 0x110000; // Value above Unicode range but 1.3321 + // within int 1.3322 + // range 1.3323 + } 1.3324 + prevValue = value; 1.3325 + /* 1.3326 + * Consume as many characters as match the range of 1.3327 + * characters given above. 1.3328 + */ 1.3329 + if (c >= '0' && c <= '9') { 1.3330 + seenDigits = true; 1.3331 + value *= 10; 1.3332 + value += c - '0'; 1.3333 + continue; 1.3334 + } else if (c == ';') { 1.3335 + if (seenDigits) { 1.3336 + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 1.3337 + cstart = pos + 1; 1.3338 + } 1.3339 + state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 1.3340 + // FALL THROUGH continue stateloop; 1.3341 + break decimalloop; 1.3342 + } else { 1.3343 + errNoDigitsInNCR(); 1.3344 + appendStrBuf(';'); 1.3345 + emitOrAppendStrBuf(returnState); 1.3346 + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 1.3347 + cstart = pos + 1; 1.3348 + } 1.3349 + state = transition(state, returnState, reconsume, pos); 1.3350 + continue stateloop; 1.3351 + } 1.3352 + } else { 1.3353 + /* 1.3354 + * If no characters match the range, then don't 1.3355 + * consume any characters (and unconsume the U+0023 1.3356 + * NUMBER SIGN character and, if appropriate, the X 1.3357 + * character). This is a parse error; nothing is 1.3358 + * returned. 1.3359 + * 1.3360 + * Otherwise, if the next character is a U+003B 1.3361 + * SEMICOLON, consume that too. 
If it isn't, there 1.3362 + * is a parse error. 1.3363 + */ 1.3364 + if (!seenDigits) { 1.3365 + errNoDigitsInNCR(); 1.3366 + emitOrAppendStrBuf(returnState); 1.3367 + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 1.3368 + cstart = pos; 1.3369 + } 1.3370 + reconsume = true; 1.3371 + state = transition(state, returnState, reconsume, pos); 1.3372 + continue stateloop; 1.3373 + } else { 1.3374 + errCharRefLacksSemicolon(); 1.3375 + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 1.3376 + cstart = pos; 1.3377 + } 1.3378 + reconsume = true; 1.3379 + state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 1.3380 + // FALL THROUGH continue stateloop; 1.3381 + break decimalloop; 1.3382 + } 1.3383 + } 1.3384 + } 1.3385 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.3386 + case HANDLE_NCR_VALUE: 1.3387 + // WARNING previous state sets reconsume 1.3388 + // XXX inline this case if the method size can take it 1.3389 + handleNcrValue(returnState); 1.3390 + state = transition(state, returnState, reconsume, pos); 1.3391 + continue stateloop; 1.3392 + // XXX reorder point 1.3393 + case HEX_NCR_LOOP: 1.3394 + for (;;) { 1.3395 + if (++pos == endPos) { 1.3396 + break stateloop; 1.3397 + } 1.3398 + c = checkChar(buf, pos); 1.3399 + // Deal with overflow gracefully 1.3400 + if (value < prevValue) { 1.3401 + value = 0x110000; // Value above Unicode range but 1.3402 + // within int 1.3403 + // range 1.3404 + } 1.3405 + prevValue = value; 1.3406 + /* 1.3407 + * Consume as many characters as match the range of 1.3408 + * characters given above. 
1.3409 + */ 1.3410 + if (c >= '0' && c <= '9') { 1.3411 + seenDigits = true; 1.3412 + value *= 16; 1.3413 + value += c - '0'; 1.3414 + continue; 1.3415 + } else if (c >= 'A' && c <= 'F') { 1.3416 + seenDigits = true; 1.3417 + value *= 16; 1.3418 + value += c - 'A' + 10; 1.3419 + continue; 1.3420 + } else if (c >= 'a' && c <= 'f') { 1.3421 + seenDigits = true; 1.3422 + value *= 16; 1.3423 + value += c - 'a' + 10; 1.3424 + continue; 1.3425 + } else if (c == ';') { 1.3426 + if (seenDigits) { 1.3427 + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 1.3428 + cstart = pos + 1; 1.3429 + } 1.3430 + state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 1.3431 + continue stateloop; 1.3432 + } else { 1.3433 + errNoDigitsInNCR(); 1.3434 + appendStrBuf(';'); 1.3435 + emitOrAppendStrBuf(returnState); 1.3436 + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 1.3437 + cstart = pos + 1; 1.3438 + } 1.3439 + state = transition(state, returnState, reconsume, pos); 1.3440 + continue stateloop; 1.3441 + } 1.3442 + } else { 1.3443 + /* 1.3444 + * If no characters match the range, then don't 1.3445 + * consume any characters (and unconsume the U+0023 1.3446 + * NUMBER SIGN character and, if appropriate, the X 1.3447 + * character). This is a parse error; nothing is 1.3448 + * returned. 1.3449 + * 1.3450 + * Otherwise, if the next character is a U+003B 1.3451 + * SEMICOLON, consume that too. If it isn't, there 1.3452 + * is a parse error. 
1.3453 + */ 1.3454 + if (!seenDigits) { 1.3455 + errNoDigitsInNCR(); 1.3456 + emitOrAppendStrBuf(returnState); 1.3457 + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 1.3458 + cstart = pos; 1.3459 + } 1.3460 + reconsume = true; 1.3461 + state = transition(state, returnState, reconsume, pos); 1.3462 + continue stateloop; 1.3463 + } else { 1.3464 + errCharRefLacksSemicolon(); 1.3465 + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { 1.3466 + cstart = pos; 1.3467 + } 1.3468 + reconsume = true; 1.3469 + state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); 1.3470 + continue stateloop; 1.3471 + } 1.3472 + } 1.3473 + } 1.3474 + // XXX reorder point 1.3475 + case PLAINTEXT: 1.3476 + plaintextloop: for (;;) { 1.3477 + if (reconsume) { 1.3478 + reconsume = false; 1.3479 + } else { 1.3480 + if (++pos == endPos) { 1.3481 + break stateloop; 1.3482 + } 1.3483 + c = checkChar(buf, pos); 1.3484 + } 1.3485 + switch (c) { 1.3486 + case '\u0000': 1.3487 + emitPlaintextReplacementCharacter(buf, pos); 1.3488 + continue; 1.3489 + case '\r': 1.3490 + emitCarriageReturn(buf, pos); 1.3491 + break stateloop; 1.3492 + case '\n': 1.3493 + silentLineFeed(); 1.3494 + default: 1.3495 + /* 1.3496 + * Anything else Emit the current input 1.3497 + * character as a character token. Stay in the 1.3498 + * RAWTEXT state. 1.3499 + */ 1.3500 + continue; 1.3501 + } 1.3502 + } 1.3503 + // XXX reorder point 1.3504 + case CLOSE_TAG_OPEN: 1.3505 + if (++pos == endPos) { 1.3506 + break stateloop; 1.3507 + } 1.3508 + c = checkChar(buf, pos); 1.3509 + /* 1.3510 + * Otherwise, if the content model flag is set to the PCDATA 1.3511 + * state, or if the next few characters do match that tag 1.3512 + * name, consume the next input character: 1.3513 + */ 1.3514 + switch (c) { 1.3515 + case '>': 1.3516 + /* U+003E GREATER-THAN SIGN (>) Parse error. */ 1.3517 + errLtSlashGt(); 1.3518 + /* 1.3519 + * Switch to the data state. 
1.3520 + */ 1.3521 + cstart = pos + 1; 1.3522 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.3523 + continue stateloop; 1.3524 + case '\r': 1.3525 + silentCarriageReturn(); 1.3526 + /* Anything else Parse error. */ 1.3527 + errGarbageAfterLtSlash(); 1.3528 + /* 1.3529 + * Switch to the bogus comment state. 1.3530 + */ 1.3531 + clearLongStrBufAndAppend('\n'); 1.3532 + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1.3533 + break stateloop; 1.3534 + case '\n': 1.3535 + silentLineFeed(); 1.3536 + /* Anything else Parse error. */ 1.3537 + errGarbageAfterLtSlash(); 1.3538 + /* 1.3539 + * Switch to the bogus comment state. 1.3540 + */ 1.3541 + clearLongStrBufAndAppend('\n'); 1.3542 + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1.3543 + continue stateloop; 1.3544 + case '\u0000': 1.3545 + c = '\uFFFD'; 1.3546 + // fall thru 1.3547 + default: 1.3548 + if (c >= 'A' && c <= 'Z') { 1.3549 + c += 0x20; 1.3550 + } 1.3551 + if (c >= 'a' && c <= 'z') { 1.3552 + /* 1.3553 + * U+0061 LATIN SMALL LETTER A through to U+007A 1.3554 + * LATIN SMALL LETTER Z Create a new end tag 1.3555 + * token, 1.3556 + */ 1.3557 + endTag = true; 1.3558 + /* 1.3559 + * set its tag name to the input character, 1.3560 + */ 1.3561 + clearStrBufAndAppend(c); 1.3562 + /* 1.3563 + * then switch to the tag name state. (Don't 1.3564 + * emit the token yet; further details will be 1.3565 + * filled in before it is emitted.) 1.3566 + */ 1.3567 + state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); 1.3568 + continue stateloop; 1.3569 + } else { 1.3570 + /* Anything else Parse error. */ 1.3571 + errGarbageAfterLtSlash(); 1.3572 + /* 1.3573 + * Switch to the bogus comment state. 
1.3574 + */ 1.3575 + clearLongStrBufAndAppend(c); 1.3576 + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1.3577 + continue stateloop; 1.3578 + } 1.3579 + } 1.3580 + // XXX reorder point 1.3581 + case RCDATA: 1.3582 + rcdataloop: for (;;) { 1.3583 + if (reconsume) { 1.3584 + reconsume = false; 1.3585 + } else { 1.3586 + if (++pos == endPos) { 1.3587 + break stateloop; 1.3588 + } 1.3589 + c = checkChar(buf, pos); 1.3590 + } 1.3591 + switch (c) { 1.3592 + case '&': 1.3593 + /* 1.3594 + * U+0026 AMPERSAND (&) Switch to the character 1.3595 + * reference in RCDATA state. 1.3596 + */ 1.3597 + flushChars(buf, pos); 1.3598 + clearStrBufAndAppend(c); 1.3599 + additional = '\u0000'; 1.3600 + returnState = state; 1.3601 + state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); 1.3602 + continue stateloop; 1.3603 + case '<': 1.3604 + /* 1.3605 + * U+003C LESS-THAN SIGN (<) Switch to the 1.3606 + * RCDATA less-than sign state. 1.3607 + */ 1.3608 + flushChars(buf, pos); 1.3609 + 1.3610 + returnState = state; 1.3611 + state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); 1.3612 + continue stateloop; 1.3613 + case '\u0000': 1.3614 + emitReplacementCharacter(buf, pos); 1.3615 + continue; 1.3616 + case '\r': 1.3617 + emitCarriageReturn(buf, pos); 1.3618 + break stateloop; 1.3619 + case '\n': 1.3620 + silentLineFeed(); 1.3621 + default: 1.3622 + /* 1.3623 + * Emit the current input character as a 1.3624 + * character token. Stay in the RCDATA state. 
1.3625 + */ 1.3626 + continue; 1.3627 + } 1.3628 + } 1.3629 + // XXX reorder point 1.3630 + case RAWTEXT: 1.3631 + rawtextloop: for (;;) { 1.3632 + if (reconsume) { 1.3633 + reconsume = false; 1.3634 + } else { 1.3635 + if (++pos == endPos) { 1.3636 + break stateloop; 1.3637 + } 1.3638 + c = checkChar(buf, pos); 1.3639 + } 1.3640 + switch (c) { 1.3641 + case '<': 1.3642 + /* 1.3643 + * U+003C LESS-THAN SIGN (<) Switch to the 1.3644 + * RAWTEXT less-than sign state. 1.3645 + */ 1.3646 + flushChars(buf, pos); 1.3647 + 1.3648 + returnState = state; 1.3649 + state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); 1.3650 + break rawtextloop; 1.3651 + // FALL THRU continue stateloop; 1.3652 + case '\u0000': 1.3653 + emitReplacementCharacter(buf, pos); 1.3654 + continue; 1.3655 + case '\r': 1.3656 + emitCarriageReturn(buf, pos); 1.3657 + break stateloop; 1.3658 + case '\n': 1.3659 + silentLineFeed(); 1.3660 + default: 1.3661 + /* 1.3662 + * Emit the current input character as a 1.3663 + * character token. Stay in the RAWTEXT state. 1.3664 + */ 1.3665 + continue; 1.3666 + } 1.3667 + } 1.3668 + // XXX fallthru don't reorder 1.3669 + case RAWTEXT_RCDATA_LESS_THAN_SIGN: 1.3670 + rawtextrcdatalessthansignloop: for (;;) { 1.3671 + if (++pos == endPos) { 1.3672 + break stateloop; 1.3673 + } 1.3674 + c = checkChar(buf, pos); 1.3675 + switch (c) { 1.3676 + case '/': 1.3677 + /* 1.3678 + * U+002F SOLIDUS (/) Set the temporary buffer 1.3679 + * to the empty string. Switch to the script 1.3680 + * data end tag open state. 
1.3681 + */ 1.3682 + index = 0; 1.3683 + clearStrBuf(); 1.3684 + state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 1.3685 + break rawtextrcdatalessthansignloop; 1.3686 + // FALL THRU continue stateloop; 1.3687 + default: 1.3688 + /* 1.3689 + * Otherwise, emit a U+003C LESS-THAN SIGN 1.3690 + * character token 1.3691 + */ 1.3692 + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 1.3693 + /* 1.3694 + * and reconsume the current input character in 1.3695 + * the data state. 1.3696 + */ 1.3697 + cstart = pos; 1.3698 + reconsume = true; 1.3699 + state = transition(state, returnState, reconsume, pos); 1.3700 + continue stateloop; 1.3701 + } 1.3702 + } 1.3703 + // XXX fall thru. don't reorder. 1.3704 + case NON_DATA_END_TAG_NAME: 1.3705 + for (;;) { 1.3706 + if (++pos == endPos) { 1.3707 + break stateloop; 1.3708 + } 1.3709 + c = checkChar(buf, pos); 1.3710 + /* 1.3711 + * ASSERT! when entering this state, set index to 0 and 1.3712 + * call clearStrBuf() assert (contentModelElement != 1.3713 + * null); Let's implement the above without lookahead. 1.3714 + * strBuf is the 'temporary buffer'. 
1.3715 + */ 1.3716 + if (index < endTagExpectationAsArray.length) { 1.3717 + char e = endTagExpectationAsArray[index]; 1.3718 + char folded = c; 1.3719 + if (c >= 'A' && c <= 'Z') { 1.3720 + folded += 0x20; 1.3721 + } 1.3722 + if (folded != e) { 1.3723 + // [NOCPP[ 1.3724 + errHtml4LtSlashInRcdata(folded); 1.3725 + // ]NOCPP] 1.3726 + tokenHandler.characters(Tokenizer.LT_SOLIDUS, 1.3727 + 0, 2); 1.3728 + emitStrBuf(); 1.3729 + cstart = pos; 1.3730 + reconsume = true; 1.3731 + state = transition(state, returnState, reconsume, pos); 1.3732 + continue stateloop; 1.3733 + } 1.3734 + appendStrBuf(c); 1.3735 + index++; 1.3736 + continue; 1.3737 + } else { 1.3738 + endTag = true; 1.3739 + // XXX replace contentModelElement with different 1.3740 + // type 1.3741 + tagName = endTagExpectation; 1.3742 + switch (c) { 1.3743 + case '\r': 1.3744 + silentCarriageReturn(); 1.3745 + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1.3746 + break stateloop; 1.3747 + case '\n': 1.3748 + silentLineFeed(); 1.3749 + // fall thru 1.3750 + case ' ': 1.3751 + case '\t': 1.3752 + case '\u000C': 1.3753 + /* 1.3754 + * U+0009 CHARACTER TABULATION U+000A LINE 1.3755 + * FEED (LF) U+000C FORM FEED (FF) U+0020 1.3756 + * SPACE If the current end tag token is an 1.3757 + * appropriate end tag token, then switch to 1.3758 + * the before attribute name state. 1.3759 + */ 1.3760 + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); 1.3761 + continue stateloop; 1.3762 + case '/': 1.3763 + /* 1.3764 + * U+002F SOLIDUS (/) If the current end tag 1.3765 + * token is an appropriate end tag token, 1.3766 + * then switch to the self-closing start tag 1.3767 + * state. 
1.3768 + */ 1.3769 + state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); 1.3770 + continue stateloop; 1.3771 + case '>': 1.3772 + /* 1.3773 + * U+003E GREATER-THAN SIGN (>) If the 1.3774 + * current end tag token is an appropriate 1.3775 + * end tag token, then emit the current tag 1.3776 + * token and switch to the data state. 1.3777 + */ 1.3778 + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); 1.3779 + if (shouldSuspend) { 1.3780 + break stateloop; 1.3781 + } 1.3782 + continue stateloop; 1.3783 + default: 1.3784 + /* 1.3785 + * Emit a U+003C LESS-THAN SIGN character 1.3786 + * token, a U+002F SOLIDUS character token, 1.3787 + * a character token for each of the 1.3788 + * characters in the temporary buffer (in 1.3789 + * the order they were added to the buffer), 1.3790 + * and reconsume the current input character 1.3791 + * in the RAWTEXT state. 1.3792 + */ 1.3793 + // [NOCPP[ 1.3794 + errWarnLtSlashInRcdata(); 1.3795 + // ]NOCPP] 1.3796 + tokenHandler.characters( 1.3797 + Tokenizer.LT_SOLIDUS, 0, 2); 1.3798 + emitStrBuf(); 1.3799 + if (c == '\u0000') { 1.3800 + emitReplacementCharacter(buf, pos); 1.3801 + } else { 1.3802 + cstart = pos; // don't drop the 1.3803 + // character 1.3804 + } 1.3805 + state = transition(state, returnState, reconsume, pos); 1.3806 + continue stateloop; 1.3807 + } 1.3808 + } 1.3809 + } 1.3810 + // XXX reorder point 1.3811 + // BEGIN HOTSPOT WORKAROUND 1.3812 + case BOGUS_COMMENT: 1.3813 + boguscommentloop: for (;;) { 1.3814 + if (reconsume) { 1.3815 + reconsume = false; 1.3816 + } else { 1.3817 + if (++pos == endPos) { 1.3818 + break stateloop; 1.3819 + } 1.3820 + c = checkChar(buf, pos); 1.3821 + } 1.3822 + /* 1.3823 + * Consume every character up to and including the first 1.3824 + * U+003E GREATER-THAN SIGN character (>) or the end of 1.3825 + * the file (EOF), whichever comes first. 
Emit a comment 1.3826 + * token whose data is the concatenation of all the 1.3827 + * characters starting from and including the character 1.3828 + * that caused the state machine to switch into the 1.3829 + * bogus comment state, up to and including the 1.3830 + * character immediately before the last consumed 1.3831 + * character (i.e. up to the character just before the 1.3832 + * U+003E or EOF character). (If the comment was started 1.3833 + * by the end of the file (EOF), the token is empty.) 1.3834 + * 1.3835 + * Switch to the data state. 1.3836 + * 1.3837 + * If the end of the file was reached, reconsume the EOF 1.3838 + * character. 1.3839 + */ 1.3840 + switch (c) { 1.3841 + case '>': 1.3842 + emitComment(0, pos); 1.3843 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.3844 + continue stateloop; 1.3845 + case '-': 1.3846 + appendLongStrBuf(c); 1.3847 + state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos); 1.3848 + break boguscommentloop; 1.3849 + case '\r': 1.3850 + appendLongStrBufCarriageReturn(); 1.3851 + break stateloop; 1.3852 + case '\n': 1.3853 + appendLongStrBufLineFeed(); 1.3854 + continue; 1.3855 + case '\u0000': 1.3856 + c = '\uFFFD'; 1.3857 + // fall thru 1.3858 + default: 1.3859 + appendLongStrBuf(c); 1.3860 + continue; 1.3861 + } 1.3862 + } 1.3863 + // FALLTHRU DON'T REORDER 1.3864 + case BOGUS_COMMENT_HYPHEN: 1.3865 + boguscommenthyphenloop: for (;;) { 1.3866 + if (++pos == endPos) { 1.3867 + break stateloop; 1.3868 + } 1.3869 + c = checkChar(buf, pos); 1.3870 + switch (c) { 1.3871 + case '>': 1.3872 + // [NOCPP[ 1.3873 + maybeAppendSpaceToBogusComment(); 1.3874 + // ]NOCPP] 1.3875 + emitComment(0, pos); 1.3876 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.3877 + continue stateloop; 1.3878 + case '-': 1.3879 + appendSecondHyphenToBogusComment(); 1.3880 + continue boguscommenthyphenloop; 1.3881 + case '\r': 1.3882 + appendLongStrBufCarriageReturn(); 1.3883 + state = transition(state, 
Tokenizer.BOGUS_COMMENT, reconsume, pos); 1.3884 + break stateloop; 1.3885 + case '\n': 1.3886 + appendLongStrBufLineFeed(); 1.3887 + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1.3888 + continue stateloop; 1.3889 + case '\u0000': 1.3890 + c = '\uFFFD'; 1.3891 + // fall thru 1.3892 + default: 1.3893 + appendLongStrBuf(c); 1.3894 + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1.3895 + continue stateloop; 1.3896 + } 1.3897 + } 1.3898 + // XXX reorder point 1.3899 + case SCRIPT_DATA: 1.3900 + scriptdataloop: for (;;) { 1.3901 + if (reconsume) { 1.3902 + reconsume = false; 1.3903 + } else { 1.3904 + if (++pos == endPos) { 1.3905 + break stateloop; 1.3906 + } 1.3907 + c = checkChar(buf, pos); 1.3908 + } 1.3909 + switch (c) { 1.3910 + case '<': 1.3911 + /* 1.3912 + * U+003C LESS-THAN SIGN (<) Switch to the 1.3913 + * script data less-than sign state. 1.3914 + */ 1.3915 + flushChars(buf, pos); 1.3916 + returnState = state; 1.3917 + state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos); 1.3918 + break scriptdataloop; // FALL THRU continue 1.3919 + // stateloop; 1.3920 + case '\u0000': 1.3921 + emitReplacementCharacter(buf, pos); 1.3922 + continue; 1.3923 + case '\r': 1.3924 + emitCarriageReturn(buf, pos); 1.3925 + break stateloop; 1.3926 + case '\n': 1.3927 + silentLineFeed(); 1.3928 + default: 1.3929 + /* 1.3930 + * Anything else Emit the current input 1.3931 + * character as a character token. Stay in the 1.3932 + * script data state. 1.3933 + */ 1.3934 + continue; 1.3935 + } 1.3936 + } 1.3937 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.3938 + case SCRIPT_DATA_LESS_THAN_SIGN: 1.3939 + scriptdatalessthansignloop: for (;;) { 1.3940 + if (++pos == endPos) { 1.3941 + break stateloop; 1.3942 + } 1.3943 + c = checkChar(buf, pos); 1.3944 + switch (c) { 1.3945 + case '/': 1.3946 + /* 1.3947 + * U+002F SOLIDUS (/) Set the temporary buffer 1.3948 + * to the empty string. 
Switch to the script 1.3949 + * data end tag open state. 1.3950 + */ 1.3951 + index = 0; 1.3952 + clearStrBuf(); 1.3953 + state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 1.3954 + continue stateloop; 1.3955 + case '!': 1.3956 + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 1.3957 + cstart = pos; 1.3958 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos); 1.3959 + break scriptdatalessthansignloop; // FALL THRU 1.3960 + // continue 1.3961 + // stateloop; 1.3962 + default: 1.3963 + /* 1.3964 + * Otherwise, emit a U+003C LESS-THAN SIGN 1.3965 + * character token 1.3966 + */ 1.3967 + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 1.3968 + /* 1.3969 + * and reconsume the current input character in 1.3970 + * the data state. 1.3971 + */ 1.3972 + cstart = pos; 1.3973 + reconsume = true; 1.3974 + state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 1.3975 + continue stateloop; 1.3976 + } 1.3977 + } 1.3978 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.3979 + case SCRIPT_DATA_ESCAPE_START: 1.3980 + scriptdataescapestartloop: for (;;) { 1.3981 + if (++pos == endPos) { 1.3982 + break stateloop; 1.3983 + } 1.3984 + c = checkChar(buf, pos); 1.3985 + /* 1.3986 + * Consume the next input character: 1.3987 + */ 1.3988 + switch (c) { 1.3989 + case '-': 1.3990 + /* 1.3991 + * U+002D HYPHEN-MINUS (-) Emit a U+002D 1.3992 + * HYPHEN-MINUS character token. Switch to the 1.3993 + * script data escape start dash state. 1.3994 + */ 1.3995 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos); 1.3996 + break scriptdataescapestartloop; // FALL THRU 1.3997 + // continue 1.3998 + // stateloop; 1.3999 + default: 1.4000 + /* 1.4001 + * Anything else Reconsume the current input 1.4002 + * character in the script data state. 
1.4003 + */ 1.4004 + reconsume = true; 1.4005 + state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 1.4006 + continue stateloop; 1.4007 + } 1.4008 + } 1.4009 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.4010 + case SCRIPT_DATA_ESCAPE_START_DASH: 1.4011 + scriptdataescapestartdashloop: for (;;) { 1.4012 + if (++pos == endPos) { 1.4013 + break stateloop; 1.4014 + } 1.4015 + c = checkChar(buf, pos); 1.4016 + /* 1.4017 + * Consume the next input character: 1.4018 + */ 1.4019 + switch (c) { 1.4020 + case '-': 1.4021 + /* 1.4022 + * U+002D HYPHEN-MINUS (-) Emit a U+002D 1.4023 + * HYPHEN-MINUS character token. Switch to the 1.4024 + * script data escaped dash dash state. 1.4025 + */ 1.4026 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); 1.4027 + break scriptdataescapestartdashloop; 1.4028 + // continue stateloop; 1.4029 + default: 1.4030 + /* 1.4031 + * Anything else Reconsume the current input 1.4032 + * character in the script data state. 1.4033 + */ 1.4034 + reconsume = true; 1.4035 + state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 1.4036 + continue stateloop; 1.4037 + } 1.4038 + } 1.4039 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.4040 + case SCRIPT_DATA_ESCAPED_DASH_DASH: 1.4041 + scriptdataescapeddashdashloop: for (;;) { 1.4042 + if (++pos == endPos) { 1.4043 + break stateloop; 1.4044 + } 1.4045 + c = checkChar(buf, pos); 1.4046 + /* 1.4047 + * Consume the next input character: 1.4048 + */ 1.4049 + switch (c) { 1.4050 + case '-': 1.4051 + /* 1.4052 + * U+002D HYPHEN-MINUS (-) Emit a U+002D 1.4053 + * HYPHEN-MINUS character token. Stay in the 1.4054 + * script data escaped dash dash state. 1.4055 + */ 1.4056 + continue; 1.4057 + case '<': 1.4058 + /* 1.4059 + * U+003C LESS-THAN SIGN (<) Switch to the 1.4060 + * script data escaped less-than sign state. 
1.4061 + */ 1.4062 + flushChars(buf, pos); 1.4063 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 1.4064 + continue stateloop; 1.4065 + case '>': 1.4066 + /* 1.4067 + * U+003E GREATER-THAN SIGN (>) Emit a U+003E 1.4068 + * GREATER-THAN SIGN character token. Switch to 1.4069 + * the script data state. 1.4070 + */ 1.4071 + state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 1.4072 + continue stateloop; 1.4073 + case '\u0000': 1.4074 + emitReplacementCharacter(buf, pos); 1.4075 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 1.4076 + break scriptdataescapeddashdashloop; 1.4077 + case '\r': 1.4078 + emitCarriageReturn(buf, pos); 1.4079 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 1.4080 + break stateloop; 1.4081 + case '\n': 1.4082 + silentLineFeed(); 1.4083 + default: 1.4084 + /* 1.4085 + * Anything else Emit the current input 1.4086 + * character as a character token. Switch to the 1.4087 + * script data escaped state. 1.4088 + */ 1.4089 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 1.4090 + break scriptdataescapeddashdashloop; 1.4091 + // continue stateloop; 1.4092 + } 1.4093 + } 1.4094 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.4095 + case SCRIPT_DATA_ESCAPED: 1.4096 + scriptdataescapedloop: for (;;) { 1.4097 + if (reconsume) { 1.4098 + reconsume = false; 1.4099 + } else { 1.4100 + if (++pos == endPos) { 1.4101 + break stateloop; 1.4102 + } 1.4103 + c = checkChar(buf, pos); 1.4104 + } 1.4105 + /* 1.4106 + * Consume the next input character: 1.4107 + */ 1.4108 + switch (c) { 1.4109 + case '-': 1.4110 + /* 1.4111 + * U+002D HYPHEN-MINUS (-) Emit a U+002D 1.4112 + * HYPHEN-MINUS character token. Switch to the 1.4113 + * script data escaped dash state. 
1.4114 + */ 1.4115 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos); 1.4116 + break scriptdataescapedloop; // FALL THRU 1.4117 + // continue 1.4118 + // stateloop; 1.4119 + case '<': 1.4120 + /* 1.4121 + * U+003C LESS-THAN SIGN (<) Switch to the 1.4122 + * script data escaped less-than sign state. 1.4123 + */ 1.4124 + flushChars(buf, pos); 1.4125 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 1.4126 + continue stateloop; 1.4127 + case '\u0000': 1.4128 + emitReplacementCharacter(buf, pos); 1.4129 + continue; 1.4130 + case '\r': 1.4131 + emitCarriageReturn(buf, pos); 1.4132 + break stateloop; 1.4133 + case '\n': 1.4134 + silentLineFeed(); 1.4135 + default: 1.4136 + /* 1.4137 + * Anything else Emit the current input 1.4138 + * character as a character token. Stay in the 1.4139 + * script data escaped state. 1.4140 + */ 1.4141 + continue; 1.4142 + } 1.4143 + } 1.4144 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.4145 + case SCRIPT_DATA_ESCAPED_DASH: 1.4146 + scriptdataescapeddashloop: for (;;) { 1.4147 + if (++pos == endPos) { 1.4148 + break stateloop; 1.4149 + } 1.4150 + c = checkChar(buf, pos); 1.4151 + /* 1.4152 + * Consume the next input character: 1.4153 + */ 1.4154 + switch (c) { 1.4155 + case '-': 1.4156 + /* 1.4157 + * U+002D HYPHEN-MINUS (-) Emit a U+002D 1.4158 + * HYPHEN-MINUS character token. Switch to the 1.4159 + * script data escaped dash dash state. 1.4160 + */ 1.4161 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); 1.4162 + continue stateloop; 1.4163 + case '<': 1.4164 + /* 1.4165 + * U+003C LESS-THAN SIGN (<) Switch to the 1.4166 + * script data escaped less-than sign state. 
1.4167 + */ 1.4168 + flushChars(buf, pos); 1.4169 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 1.4170 + break scriptdataescapeddashloop; 1.4171 + // continue stateloop; 1.4172 + case '\u0000': 1.4173 + emitReplacementCharacter(buf, pos); 1.4174 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 1.4175 + continue stateloop; 1.4176 + case '\r': 1.4177 + emitCarriageReturn(buf, pos); 1.4178 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 1.4179 + break stateloop; 1.4180 + case '\n': 1.4181 + silentLineFeed(); 1.4182 + default: 1.4183 + /* 1.4184 + * Anything else Emit the current input 1.4185 + * character as a character token. Switch to the 1.4186 + * script data escaped state. 1.4187 + */ 1.4188 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 1.4189 + continue stateloop; 1.4190 + } 1.4191 + } 1.4192 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.4193 + case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: 1.4194 + scriptdataescapedlessthanloop: for (;;) { 1.4195 + if (++pos == endPos) { 1.4196 + break stateloop; 1.4197 + } 1.4198 + c = checkChar(buf, pos); 1.4199 + /* 1.4200 + * Consume the next input character: 1.4201 + */ 1.4202 + switch (c) { 1.4203 + case '/': 1.4204 + /* 1.4205 + * U+002F SOLIDUS (/) Set the temporary buffer 1.4206 + * to the empty string. Switch to the script 1.4207 + * data escaped end tag open state. 1.4208 + */ 1.4209 + index = 0; 1.4210 + clearStrBuf(); 1.4211 + returnState = Tokenizer.SCRIPT_DATA_ESCAPED; 1.4212 + state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); 1.4213 + continue stateloop; 1.4214 + case 'S': 1.4215 + case 's': 1.4216 + /* 1.4217 + * U+0041 LATIN CAPITAL LETTER A through to 1.4218 + * U+005A LATIN CAPITAL LETTER Z Emit a U+003C 1.4219 + * LESS-THAN SIGN character token and the 1.4220 + * current input character as a character token. 
1.4221 + */ 1.4222 + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 1.4223 + cstart = pos; 1.4224 + index = 1; 1.4225 + /* 1.4226 + * Set the temporary buffer to the empty string. 1.4227 + * Append the lowercase version of the current 1.4228 + * input character (add 0x0020 to the 1.4229 + * character's code point) to the temporary 1.4230 + * buffer. Switch to the script data double 1.4231 + * escape start state. 1.4232 + */ 1.4233 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos); 1.4234 + break scriptdataescapedlessthanloop; 1.4235 + // continue stateloop; 1.4236 + default: 1.4237 + /* 1.4238 + * Anything else Emit a U+003C LESS-THAN SIGN 1.4239 + * character token and reconsume the current 1.4240 + * input character in the script data escaped 1.4241 + * state. 1.4242 + */ 1.4243 + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 1.4244 + cstart = pos; 1.4245 + reconsume = true; 1.4246 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 1.4247 + continue stateloop; 1.4248 + } 1.4249 + } 1.4250 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.4251 + case SCRIPT_DATA_DOUBLE_ESCAPE_START: 1.4252 + scriptdatadoubleescapestartloop: for (;;) { 1.4253 + if (++pos == endPos) { 1.4254 + break stateloop; 1.4255 + } 1.4256 + c = checkChar(buf, pos); 1.4257 + assert index > 0; 1.4258 + if (index < 6) { // SCRIPT_ARR.length 1.4259 + char folded = c; 1.4260 + if (c >= 'A' && c <= 'Z') { 1.4261 + folded += 0x20; 1.4262 + } 1.4263 + if (folded != Tokenizer.SCRIPT_ARR[index]) { 1.4264 + reconsume = true; 1.4265 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 1.4266 + continue stateloop; 1.4267 + } 1.4268 + index++; 1.4269 + continue; 1.4270 + } 1.4271 + switch (c) { 1.4272 + case '\r': 1.4273 + emitCarriageReturn(buf, pos); 1.4274 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 1.4275 + break stateloop; 1.4276 + case '\n': 1.4277 + silentLineFeed(); 
1.4278 + case ' ': 1.4279 + case '\t': 1.4280 + case '\u000C': 1.4281 + case '/': 1.4282 + case '>': 1.4283 + /* 1.4284 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.4285 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1.4286 + * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN 1.4287 + * (>) Emit the current input character as a 1.4288 + * character token. If the temporary buffer is 1.4289 + * the string "script", then switch to the 1.4290 + * script data double escaped state. 1.4291 + */ 1.4292 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 1.4293 + break scriptdatadoubleescapestartloop; 1.4294 + // continue stateloop; 1.4295 + default: 1.4296 + /* 1.4297 + * Anything else Reconsume the current input 1.4298 + * character in the script data escaped state. 1.4299 + */ 1.4300 + reconsume = true; 1.4301 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 1.4302 + continue stateloop; 1.4303 + } 1.4304 + } 1.4305 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.4306 + case SCRIPT_DATA_DOUBLE_ESCAPED: 1.4307 + scriptdatadoubleescapedloop: for (;;) { 1.4308 + if (reconsume) { 1.4309 + reconsume = false; 1.4310 + } else { 1.4311 + if (++pos == endPos) { 1.4312 + break stateloop; 1.4313 + } 1.4314 + c = checkChar(buf, pos); 1.4315 + } 1.4316 + /* 1.4317 + * Consume the next input character: 1.4318 + */ 1.4319 + switch (c) { 1.4320 + case '-': 1.4321 + /* 1.4322 + * U+002D HYPHEN-MINUS (-) Emit a U+002D 1.4323 + * HYPHEN-MINUS character token. Switch to the 1.4324 + * script data double escaped dash state. 1.4325 + */ 1.4326 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos); 1.4327 + break scriptdatadoubleescapedloop; // FALL THRU 1.4328 + // continue 1.4329 + // stateloop; 1.4330 + case '<': 1.4331 + /* 1.4332 + * U+003C LESS-THAN SIGN (<) Emit a U+003C 1.4333 + * LESS-THAN SIGN character token. 
Switch to the 1.4334 + * script data double escaped less-than sign 1.4335 + * state. 1.4336 + */ 1.4337 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 1.4338 + continue stateloop; 1.4339 + case '\u0000': 1.4340 + emitReplacementCharacter(buf, pos); 1.4341 + continue; 1.4342 + case '\r': 1.4343 + emitCarriageReturn(buf, pos); 1.4344 + break stateloop; 1.4345 + case '\n': 1.4346 + silentLineFeed(); 1.4347 + default: 1.4348 + /* 1.4349 + * Anything else Emit the current input 1.4350 + * character as a character token. Stay in the 1.4351 + * script data double escaped state. 1.4352 + */ 1.4353 + continue; 1.4354 + } 1.4355 + } 1.4356 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.4357 + case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: 1.4358 + scriptdatadoubleescapeddashloop: for (;;) { 1.4359 + if (++pos == endPos) { 1.4360 + break stateloop; 1.4361 + } 1.4362 + c = checkChar(buf, pos); 1.4363 + /* 1.4364 + * Consume the next input character: 1.4365 + */ 1.4366 + switch (c) { 1.4367 + case '-': 1.4368 + /* 1.4369 + * U+002D HYPHEN-MINUS (-) Emit a U+002D 1.4370 + * HYPHEN-MINUS character token. Switch to the 1.4371 + * script data double escaped dash dash state. 1.4372 + */ 1.4373 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos); 1.4374 + break scriptdatadoubleescapeddashloop; 1.4375 + // continue stateloop; 1.4376 + case '<': 1.4377 + /* 1.4378 + * U+003C LESS-THAN SIGN (<) Emit a U+003C 1.4379 + * LESS-THAN SIGN character token. Switch to the 1.4380 + * script data double escaped less-than sign 1.4381 + * state. 
1.4382 + */ 1.4383 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 1.4384 + continue stateloop; 1.4385 + case '\u0000': 1.4386 + emitReplacementCharacter(buf, pos); 1.4387 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 1.4388 + continue stateloop; 1.4389 + case '\r': 1.4390 + emitCarriageReturn(buf, pos); 1.4391 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 1.4392 + break stateloop; 1.4393 + case '\n': 1.4394 + silentLineFeed(); 1.4395 + default: 1.4396 + /* 1.4397 + * Anything else Emit the current input 1.4398 + * character as a character token. Switch to the 1.4399 + * script data double escaped state. 1.4400 + */ 1.4401 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 1.4402 + continue stateloop; 1.4403 + } 1.4404 + } 1.4405 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.4406 + case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: 1.4407 + scriptdatadoubleescapeddashdashloop: for (;;) { 1.4408 + if (++pos == endPos) { 1.4409 + break stateloop; 1.4410 + } 1.4411 + c = checkChar(buf, pos); 1.4412 + /* 1.4413 + * Consume the next input character: 1.4414 + */ 1.4415 + switch (c) { 1.4416 + case '-': 1.4417 + /* 1.4418 + * U+002D HYPHEN-MINUS (-) Emit a U+002D 1.4419 + * HYPHEN-MINUS character token. Stay in the 1.4420 + * script data double escaped dash dash state. 1.4421 + */ 1.4422 + continue; 1.4423 + case '<': 1.4424 + /* 1.4425 + * U+003C LESS-THAN SIGN (<) Emit a U+003C 1.4426 + * LESS-THAN SIGN character token. Switch to the 1.4427 + * script data double escaped less-than sign 1.4428 + * state. 1.4429 + */ 1.4430 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); 1.4431 + break scriptdatadoubleescapeddashdashloop; 1.4432 + case '>': 1.4433 + /* 1.4434 + * U+003E GREATER-THAN SIGN (>) Emit a U+003E 1.4435 + * GREATER-THAN SIGN character token. 
Switch to 1.4436 + * the script data state. 1.4437 + */ 1.4438 + state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); 1.4439 + continue stateloop; 1.4440 + case '\u0000': 1.4441 + emitReplacementCharacter(buf, pos); 1.4442 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 1.4443 + continue stateloop; 1.4444 + case '\r': 1.4445 + emitCarriageReturn(buf, pos); 1.4446 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 1.4447 + break stateloop; 1.4448 + case '\n': 1.4449 + silentLineFeed(); 1.4450 + default: 1.4451 + /* 1.4452 + * Anything else Emit the current input 1.4453 + * character as a character token. Switch to the 1.4454 + * script data double escaped state. 1.4455 + */ 1.4456 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 1.4457 + continue stateloop; 1.4458 + } 1.4459 + } 1.4460 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.4461 + case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: 1.4462 + scriptdatadoubleescapedlessthanloop: for (;;) { 1.4463 + if (++pos == endPos) { 1.4464 + break stateloop; 1.4465 + } 1.4466 + c = checkChar(buf, pos); 1.4467 + /* 1.4468 + * Consume the next input character: 1.4469 + */ 1.4470 + switch (c) { 1.4471 + case '/': 1.4472 + /* 1.4473 + * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS 1.4474 + * character token. Set the temporary buffer to 1.4475 + * the empty string. Switch to the script data 1.4476 + * double escape end state. 1.4477 + */ 1.4478 + index = 0; 1.4479 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos); 1.4480 + break scriptdatadoubleescapedlessthanloop; 1.4481 + default: 1.4482 + /* 1.4483 + * Anything else Reconsume the current input 1.4484 + * character in the script data double escaped 1.4485 + * state. 
1.4486 + */ 1.4487 + reconsume = true; 1.4488 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 1.4489 + continue stateloop; 1.4490 + } 1.4491 + } 1.4492 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.4493 + case SCRIPT_DATA_DOUBLE_ESCAPE_END: 1.4494 + scriptdatadoubleescapeendloop: for (;;) { 1.4495 + if (++pos == endPos) { 1.4496 + break stateloop; 1.4497 + } 1.4498 + c = checkChar(buf, pos); 1.4499 + if (index < 6) { // SCRIPT_ARR.length 1.4500 + char folded = c; 1.4501 + if (c >= 'A' && c <= 'Z') { 1.4502 + folded += 0x20; 1.4503 + } 1.4504 + if (folded != Tokenizer.SCRIPT_ARR[index]) { 1.4505 + reconsume = true; 1.4506 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 1.4507 + continue stateloop; 1.4508 + } 1.4509 + index++; 1.4510 + continue; 1.4511 + } 1.4512 + switch (c) { 1.4513 + case '\r': 1.4514 + emitCarriageReturn(buf, pos); 1.4515 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 1.4516 + break stateloop; 1.4517 + case '\n': 1.4518 + silentLineFeed(); 1.4519 + case ' ': 1.4520 + case '\t': 1.4521 + case '\u000C': 1.4522 + case '/': 1.4523 + case '>': 1.4524 + /* 1.4525 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.4526 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1.4527 + * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN 1.4528 + * (>) Emit the current input character as a 1.4529 + * character token. If the temporary buffer is 1.4530 + * the string "script", then switch to the 1.4531 + * script data escaped state. 1.4532 + */ 1.4533 + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); 1.4534 + continue stateloop; 1.4535 + default: 1.4536 + /* 1.4537 + * Reconsume the current input character in the 1.4538 + * script data double escaped state. 
1.4539 + */ 1.4540 + reconsume = true; 1.4541 + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); 1.4542 + continue stateloop; 1.4543 + } 1.4544 + } 1.4545 + // XXX reorder point 1.4546 + case MARKUP_DECLARATION_OCTYPE: 1.4547 + markupdeclarationdoctypeloop: for (;;) { 1.4548 + if (++pos == endPos) { 1.4549 + break stateloop; 1.4550 + } 1.4551 + c = checkChar(buf, pos); 1.4552 + if (index < 6) { // OCTYPE.length 1.4553 + char folded = c; 1.4554 + if (c >= 'A' && c <= 'Z') { 1.4555 + folded += 0x20; 1.4556 + } 1.4557 + if (folded == Tokenizer.OCTYPE[index]) { 1.4558 + appendLongStrBuf(c); 1.4559 + } else { 1.4560 + errBogusComment(); 1.4561 + reconsume = true; 1.4562 + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); 1.4563 + continue stateloop; 1.4564 + } 1.4565 + index++; 1.4566 + continue; 1.4567 + } else { 1.4568 + reconsume = true; 1.4569 + state = transition(state, Tokenizer.DOCTYPE, reconsume, pos); 1.4570 + break markupdeclarationdoctypeloop; 1.4571 + // continue stateloop; 1.4572 + } 1.4573 + } 1.4574 + // FALLTHRU DON'T REORDER 1.4575 + case DOCTYPE: 1.4576 + doctypeloop: for (;;) { 1.4577 + if (reconsume) { 1.4578 + reconsume = false; 1.4579 + } else { 1.4580 + if (++pos == endPos) { 1.4581 + break stateloop; 1.4582 + } 1.4583 + c = checkChar(buf, pos); 1.4584 + } 1.4585 + initDoctypeFields(); 1.4586 + /* 1.4587 + * Consume the next input character: 1.4588 + */ 1.4589 + switch (c) { 1.4590 + case '\r': 1.4591 + silentCarriageReturn(); 1.4592 + state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 1.4593 + break stateloop; 1.4594 + case '\n': 1.4595 + silentLineFeed(); 1.4596 + // fall thru 1.4597 + case ' ': 1.4598 + case '\t': 1.4599 + case '\u000C': 1.4600 + /* 1.4601 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.4602 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1.4603 + * Switch to the before DOCTYPE name state. 
1.4604 + */ 1.4605 + state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 1.4606 + break doctypeloop; 1.4607 + // continue stateloop; 1.4608 + default: 1.4609 + /* 1.4610 + * Anything else Parse error. 1.4611 + */ 1.4612 + errMissingSpaceBeforeDoctypeName(); 1.4613 + /* 1.4614 + * Reconsume the current character in the before 1.4615 + * DOCTYPE name state. 1.4616 + */ 1.4617 + reconsume = true; 1.4618 + state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); 1.4619 + break doctypeloop; 1.4620 + // continue stateloop; 1.4621 + } 1.4622 + } 1.4623 + // FALLTHRU DON'T REORDER 1.4624 + case BEFORE_DOCTYPE_NAME: 1.4625 + beforedoctypenameloop: for (;;) { 1.4626 + if (reconsume) { 1.4627 + reconsume = false; 1.4628 + } else { 1.4629 + if (++pos == endPos) { 1.4630 + break stateloop; 1.4631 + } 1.4632 + c = checkChar(buf, pos); 1.4633 + } 1.4634 + /* 1.4635 + * Consume the next input character: 1.4636 + */ 1.4637 + switch (c) { 1.4638 + case '\r': 1.4639 + silentCarriageReturn(); 1.4640 + break stateloop; 1.4641 + case '\n': 1.4642 + silentLineFeed(); 1.4643 + // fall thru 1.4644 + case ' ': 1.4645 + case '\t': 1.4646 + case '\u000C': 1.4647 + /* 1.4648 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.4649 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1.4650 + * in the before DOCTYPE name state. 1.4651 + */ 1.4652 + continue; 1.4653 + case '>': 1.4654 + /* 1.4655 + * U+003E GREATER-THAN SIGN (>) Parse error. 1.4656 + */ 1.4657 + errNamelessDoctype(); 1.4658 + /* 1.4659 + * Create a new DOCTYPE token. Set its 1.4660 + * force-quirks flag to on. 1.4661 + */ 1.4662 + forceQuirks = true; 1.4663 + /* 1.4664 + * Emit the token. 1.4665 + */ 1.4666 + emitDoctypeToken(pos); 1.4667 + /* 1.4668 + * Switch to the data state. 
1.4669 + */ 1.4670 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.4671 + continue stateloop; 1.4672 + case '\u0000': 1.4673 + c = '\uFFFD'; 1.4674 + // fall thru 1.4675 + default: 1.4676 + if (c >= 'A' && c <= 'Z') { 1.4677 + /* 1.4678 + * U+0041 LATIN CAPITAL LETTER A through to 1.4679 + * U+005A LATIN CAPITAL LETTER Z Create a 1.4680 + * new DOCTYPE token. Set the token's name 1.4681 + * to the lowercase version of the input 1.4682 + * character (add 0x0020 to the character's 1.4683 + * code point). 1.4684 + */ 1.4685 + c += 0x20; 1.4686 + } 1.4687 + /* Anything else Create a new DOCTYPE token. */ 1.4688 + /* 1.4689 + * Set the token's name name to the current 1.4690 + * input character. 1.4691 + */ 1.4692 + clearStrBufAndAppend(c); 1.4693 + /* 1.4694 + * Switch to the DOCTYPE name state. 1.4695 + */ 1.4696 + state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos); 1.4697 + break beforedoctypenameloop; 1.4698 + // continue stateloop; 1.4699 + } 1.4700 + } 1.4701 + // FALLTHRU DON'T REORDER 1.4702 + case DOCTYPE_NAME: 1.4703 + doctypenameloop: for (;;) { 1.4704 + if (++pos == endPos) { 1.4705 + break stateloop; 1.4706 + } 1.4707 + c = checkChar(buf, pos); 1.4708 + /* 1.4709 + * Consume the next input character: 1.4710 + */ 1.4711 + switch (c) { 1.4712 + case '\r': 1.4713 + silentCarriageReturn(); 1.4714 + strBufToDoctypeName(); 1.4715 + state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); 1.4716 + break stateloop; 1.4717 + case '\n': 1.4718 + silentLineFeed(); 1.4719 + // fall thru 1.4720 + case ' ': 1.4721 + case '\t': 1.4722 + case '\u000C': 1.4723 + /* 1.4724 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.4725 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1.4726 + * Switch to the after DOCTYPE name state. 
1.4727 + */ 1.4728 + strBufToDoctypeName(); 1.4729 + state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); 1.4730 + break doctypenameloop; 1.4731 + // continue stateloop; 1.4732 + case '>': 1.4733 + /* 1.4734 + * U+003E GREATER-THAN SIGN (>) Emit the current 1.4735 + * DOCTYPE token. 1.4736 + */ 1.4737 + strBufToDoctypeName(); 1.4738 + emitDoctypeToken(pos); 1.4739 + /* 1.4740 + * Switch to the data state. 1.4741 + */ 1.4742 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.4743 + continue stateloop; 1.4744 + case '\u0000': 1.4745 + c = '\uFFFD'; 1.4746 + // fall thru 1.4747 + default: 1.4748 + /* 1.4749 + * U+0041 LATIN CAPITAL LETTER A through to 1.4750 + * U+005A LATIN CAPITAL LETTER Z Append the 1.4751 + * lowercase version of the input character (add 1.4752 + * 0x0020 to the character's code point) to the 1.4753 + * current DOCTYPE token's name. 1.4754 + */ 1.4755 + if (c >= 'A' && c <= 'Z') { 1.4756 + c += 0x0020; 1.4757 + } 1.4758 + /* 1.4759 + * Anything else Append the current input 1.4760 + * character to the current DOCTYPE token's 1.4761 + * name. 1.4762 + */ 1.4763 + appendStrBuf(c); 1.4764 + /* 1.4765 + * Stay in the DOCTYPE name state. 1.4766 + */ 1.4767 + continue; 1.4768 + } 1.4769 + } 1.4770 + // FALLTHRU DON'T REORDER 1.4771 + case AFTER_DOCTYPE_NAME: 1.4772 + afterdoctypenameloop: for (;;) { 1.4773 + if (++pos == endPos) { 1.4774 + break stateloop; 1.4775 + } 1.4776 + c = checkChar(buf, pos); 1.4777 + /* 1.4778 + * Consume the next input character: 1.4779 + */ 1.4780 + switch (c) { 1.4781 + case '\r': 1.4782 + silentCarriageReturn(); 1.4783 + break stateloop; 1.4784 + case '\n': 1.4785 + silentLineFeed(); 1.4786 + // fall thru 1.4787 + case ' ': 1.4788 + case '\t': 1.4789 + case '\u000C': 1.4790 + /* 1.4791 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.4792 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1.4793 + * in the after DOCTYPE name state. 
1.4794 + */ 1.4795 + continue; 1.4796 + case '>': 1.4797 + /* 1.4798 + * U+003E GREATER-THAN SIGN (>) Emit the current 1.4799 + * DOCTYPE token. 1.4800 + */ 1.4801 + emitDoctypeToken(pos); 1.4802 + /* 1.4803 + * Switch to the data state. 1.4804 + */ 1.4805 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.4806 + continue stateloop; 1.4807 + case 'p': 1.4808 + case 'P': 1.4809 + index = 0; 1.4810 + state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos); 1.4811 + break afterdoctypenameloop; 1.4812 + // continue stateloop; 1.4813 + case 's': 1.4814 + case 'S': 1.4815 + index = 0; 1.4816 + state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos); 1.4817 + continue stateloop; 1.4818 + default: 1.4819 + /* 1.4820 + * Otherwise, this is the parse error. 1.4821 + */ 1.4822 + bogusDoctype(); 1.4823 + 1.4824 + /* 1.4825 + * Set the DOCTYPE token's force-quirks flag to 1.4826 + * on. 1.4827 + */ 1.4828 + // done by bogusDoctype(); 1.4829 + /* 1.4830 + * Switch to the bogus DOCTYPE state. 1.4831 + */ 1.4832 + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 1.4833 + continue stateloop; 1.4834 + } 1.4835 + } 1.4836 + // FALLTHRU DON'T REORDER 1.4837 + case DOCTYPE_UBLIC: 1.4838 + doctypeublicloop: for (;;) { 1.4839 + if (++pos == endPos) { 1.4840 + break stateloop; 1.4841 + } 1.4842 + c = checkChar(buf, pos); 1.4843 + /* 1.4844 + * If the six characters starting from the current input 1.4845 + * character are an ASCII case-insensitive match for the 1.4846 + * word "PUBLIC", then consume those characters and 1.4847 + * switch to the before DOCTYPE public identifier state. 
1.4848 + */ 1.4849 + if (index < 5) { // UBLIC.length 1.4850 + char folded = c; 1.4851 + if (c >= 'A' && c <= 'Z') { 1.4852 + folded += 0x20; 1.4853 + } 1.4854 + if (folded != Tokenizer.UBLIC[index]) { 1.4855 + bogusDoctype(); 1.4856 + // forceQuirks = true; 1.4857 + reconsume = true; 1.4858 + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 1.4859 + continue stateloop; 1.4860 + } 1.4861 + index++; 1.4862 + continue; 1.4863 + } else { 1.4864 + reconsume = true; 1.4865 + state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos); 1.4866 + break doctypeublicloop; 1.4867 + // continue stateloop; 1.4868 + } 1.4869 + } 1.4870 + // FALLTHRU DON'T REORDER 1.4871 + case AFTER_DOCTYPE_PUBLIC_KEYWORD: 1.4872 + afterdoctypepublickeywordloop: for (;;) { 1.4873 + if (reconsume) { 1.4874 + reconsume = false; 1.4875 + } else { 1.4876 + if (++pos == endPos) { 1.4877 + break stateloop; 1.4878 + } 1.4879 + c = checkChar(buf, pos); 1.4880 + } 1.4881 + /* 1.4882 + * Consume the next input character: 1.4883 + */ 1.4884 + switch (c) { 1.4885 + case '\r': 1.4886 + silentCarriageReturn(); 1.4887 + state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 1.4888 + break stateloop; 1.4889 + case '\n': 1.4890 + silentLineFeed(); 1.4891 + // fall thru 1.4892 + case ' ': 1.4893 + case '\t': 1.4894 + case '\u000C': 1.4895 + /* 1.4896 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.4897 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1.4898 + * Switch to the before DOCTYPE public 1.4899 + * identifier state. 1.4900 + */ 1.4901 + state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 1.4902 + break afterdoctypepublickeywordloop; 1.4903 + // FALL THROUGH continue stateloop 1.4904 + case '"': 1.4905 + /* 1.4906 + * U+0022 QUOTATION MARK (") Parse Error. 
1.4907 + */ 1.4908 + errNoSpaceBetweenDoctypePublicKeywordAndQuote(); 1.4909 + /* 1.4910 + * Set the DOCTYPE token's public identifier to 1.4911 + * the empty string (not missing), 1.4912 + */ 1.4913 + clearLongStrBuf(); 1.4914 + /* 1.4915 + * then switch to the DOCTYPE public identifier 1.4916 + * (double-quoted) state. 1.4917 + */ 1.4918 + state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 1.4919 + continue stateloop; 1.4920 + case '\'': 1.4921 + /* 1.4922 + * U+0027 APOSTROPHE (') Parse Error. 1.4923 + */ 1.4924 + errNoSpaceBetweenDoctypePublicKeywordAndQuote(); 1.4925 + /* 1.4926 + * Set the DOCTYPE token's public identifier to 1.4927 + * the empty string (not missing), 1.4928 + */ 1.4929 + clearLongStrBuf(); 1.4930 + /* 1.4931 + * then switch to the DOCTYPE public identifier 1.4932 + * (single-quoted) state. 1.4933 + */ 1.4934 + state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 1.4935 + continue stateloop; 1.4936 + case '>': 1.4937 + /* U+003E GREATER-THAN SIGN (>) Parse error. */ 1.4938 + errExpectedPublicId(); 1.4939 + /* 1.4940 + * Set the DOCTYPE token's force-quirks flag to 1.4941 + * on. 1.4942 + */ 1.4943 + forceQuirks = true; 1.4944 + /* 1.4945 + * Emit that DOCTYPE token. 1.4946 + */ 1.4947 + emitDoctypeToken(pos); 1.4948 + /* 1.4949 + * Switch to the data state. 1.4950 + */ 1.4951 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.4952 + continue stateloop; 1.4953 + default: 1.4954 + bogusDoctype(); 1.4955 + /* 1.4956 + * Set the DOCTYPE token's force-quirks flag to 1.4957 + * on. 1.4958 + */ 1.4959 + // done by bogusDoctype(); 1.4960 + /* 1.4961 + * Switch to the bogus DOCTYPE state. 
1.4962 + */ 1.4963 + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 1.4964 + continue stateloop; 1.4965 + } 1.4966 + } 1.4967 + // FALLTHRU DON'T REORDER 1.4968 + case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: 1.4969 + beforedoctypepublicidentifierloop: for (;;) { 1.4970 + if (++pos == endPos) { 1.4971 + break stateloop; 1.4972 + } 1.4973 + c = checkChar(buf, pos); 1.4974 + /* 1.4975 + * Consume the next input character: 1.4976 + */ 1.4977 + switch (c) { 1.4978 + case '\r': 1.4979 + silentCarriageReturn(); 1.4980 + break stateloop; 1.4981 + case '\n': 1.4982 + silentLineFeed(); 1.4983 + // fall thru 1.4984 + case ' ': 1.4985 + case '\t': 1.4986 + case '\u000C': 1.4987 + /* 1.4988 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.4989 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1.4990 + * in the before DOCTYPE public identifier 1.4991 + * state. 1.4992 + */ 1.4993 + continue; 1.4994 + case '"': 1.4995 + /* 1.4996 + * U+0022 QUOTATION MARK (") Set the DOCTYPE 1.4997 + * token's public identifier to the empty string 1.4998 + * (not missing), 1.4999 + */ 1.5000 + clearLongStrBuf(); 1.5001 + /* 1.5002 + * then switch to the DOCTYPE public identifier 1.5003 + * (double-quoted) state. 1.5004 + */ 1.5005 + state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 1.5006 + break beforedoctypepublicidentifierloop; 1.5007 + // continue stateloop; 1.5008 + case '\'': 1.5009 + /* 1.5010 + * U+0027 APOSTROPHE (') Set the DOCTYPE token's 1.5011 + * public identifier to the empty string (not 1.5012 + * missing), 1.5013 + */ 1.5014 + clearLongStrBuf(); 1.5015 + /* 1.5016 + * then switch to the DOCTYPE public identifier 1.5017 + * (single-quoted) state. 1.5018 + */ 1.5019 + state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 1.5020 + continue stateloop; 1.5021 + case '>': 1.5022 + /* U+003E GREATER-THAN SIGN (>) Parse error. 
*/ 1.5023 + errExpectedPublicId(); 1.5024 + /* 1.5025 + * Set the DOCTYPE token's force-quirks flag to 1.5026 + * on. 1.5027 + */ 1.5028 + forceQuirks = true; 1.5029 + /* 1.5030 + * Emit that DOCTYPE token. 1.5031 + */ 1.5032 + emitDoctypeToken(pos); 1.5033 + /* 1.5034 + * Switch to the data state. 1.5035 + */ 1.5036 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.5037 + continue stateloop; 1.5038 + default: 1.5039 + bogusDoctype(); 1.5040 + /* 1.5041 + * Set the DOCTYPE token's force-quirks flag to 1.5042 + * on. 1.5043 + */ 1.5044 + // done by bogusDoctype(); 1.5045 + /* 1.5046 + * Switch to the bogus DOCTYPE state. 1.5047 + */ 1.5048 + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 1.5049 + continue stateloop; 1.5050 + } 1.5051 + } 1.5052 + // FALLTHRU DON'T REORDER 1.5053 + case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: 1.5054 + doctypepublicidentifierdoublequotedloop: for (;;) { 1.5055 + if (++pos == endPos) { 1.5056 + break stateloop; 1.5057 + } 1.5058 + c = checkChar(buf, pos); 1.5059 + /* 1.5060 + * Consume the next input character: 1.5061 + */ 1.5062 + switch (c) { 1.5063 + case '"': 1.5064 + /* 1.5065 + * U+0022 QUOTATION MARK (") Switch to the after 1.5066 + * DOCTYPE public identifier state. 1.5067 + */ 1.5068 + publicIdentifier = longStrBufToString(); 1.5069 + state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 1.5070 + break doctypepublicidentifierdoublequotedloop; 1.5071 + // continue stateloop; 1.5072 + case '>': 1.5073 + /* 1.5074 + * U+003E GREATER-THAN SIGN (>) Parse error. 1.5075 + */ 1.5076 + errGtInPublicId(); 1.5077 + /* 1.5078 + * Set the DOCTYPE token's force-quirks flag to 1.5079 + * on. 1.5080 + */ 1.5081 + forceQuirks = true; 1.5082 + /* 1.5083 + * Emit that DOCTYPE token. 1.5084 + */ 1.5085 + publicIdentifier = longStrBufToString(); 1.5086 + emitDoctypeToken(pos); 1.5087 + /* 1.5088 + * Switch to the data state. 
1.5089 + */ 1.5090 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.5091 + continue stateloop; 1.5092 + case '\r': 1.5093 + appendLongStrBufCarriageReturn(); 1.5094 + break stateloop; 1.5095 + case '\n': 1.5096 + appendLongStrBufLineFeed(); 1.5097 + continue; 1.5098 + case '\u0000': 1.5099 + c = '\uFFFD'; 1.5100 + // fall thru 1.5101 + default: 1.5102 + /* 1.5103 + * Anything else Append the current input 1.5104 + * character to the current DOCTYPE token's 1.5105 + * public identifier. 1.5106 + */ 1.5107 + appendLongStrBuf(c); 1.5108 + /* 1.5109 + * Stay in the DOCTYPE public identifier 1.5110 + * (double-quoted) state. 1.5111 + */ 1.5112 + continue; 1.5113 + } 1.5114 + } 1.5115 + // FALLTHRU DON'T REORDER 1.5116 + case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: 1.5117 + afterdoctypepublicidentifierloop: for (;;) { 1.5118 + if (++pos == endPos) { 1.5119 + break stateloop; 1.5120 + } 1.5121 + c = checkChar(buf, pos); 1.5122 + /* 1.5123 + * Consume the next input character: 1.5124 + */ 1.5125 + switch (c) { 1.5126 + case '\r': 1.5127 + silentCarriageReturn(); 1.5128 + state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); 1.5129 + break stateloop; 1.5130 + case '\n': 1.5131 + silentLineFeed(); 1.5132 + // fall thru 1.5133 + case ' ': 1.5134 + case '\t': 1.5135 + case '\u000C': 1.5136 + /* 1.5137 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.5138 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1.5139 + * Switch to the between DOCTYPE public and 1.5140 + * system identifiers state. 1.5141 + */ 1.5142 + state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); 1.5143 + break afterdoctypepublicidentifierloop; 1.5144 + // continue stateloop; 1.5145 + case '>': 1.5146 + /* 1.5147 + * U+003E GREATER-THAN SIGN (>) Emit the current 1.5148 + * DOCTYPE token. 1.5149 + */ 1.5150 + emitDoctypeToken(pos); 1.5151 + /* 1.5152 + * Switch to the data state. 
1.5153 + */ 1.5154 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.5155 + continue stateloop; 1.5156 + case '"': 1.5157 + /* 1.5158 + * U+0022 QUOTATION MARK (") Parse error. 1.5159 + */ 1.5160 + errNoSpaceBetweenPublicAndSystemIds(); 1.5161 + /* 1.5162 + * Set the DOCTYPE token's system identifier to 1.5163 + * the empty string (not missing), 1.5164 + */ 1.5165 + clearLongStrBuf(); 1.5166 + /* 1.5167 + * then switch to the DOCTYPE system identifier 1.5168 + * (double-quoted) state. 1.5169 + */ 1.5170 + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 1.5171 + continue stateloop; 1.5172 + case '\'': 1.5173 + /* 1.5174 + * U+0027 APOSTROPHE (') Parse error. 1.5175 + */ 1.5176 + errNoSpaceBetweenPublicAndSystemIds(); 1.5177 + /* 1.5178 + * Set the DOCTYPE token's system identifier to 1.5179 + * the empty string (not missing), 1.5180 + */ 1.5181 + clearLongStrBuf(); 1.5182 + /* 1.5183 + * then switch to the DOCTYPE system identifier 1.5184 + * (single-quoted) state. 1.5185 + */ 1.5186 + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 1.5187 + continue stateloop; 1.5188 + default: 1.5189 + bogusDoctype(); 1.5190 + /* 1.5191 + * Set the DOCTYPE token's force-quirks flag to 1.5192 + * on. 1.5193 + */ 1.5194 + // done by bogusDoctype(); 1.5195 + /* 1.5196 + * Switch to the bogus DOCTYPE state. 
1.5197 + */ 1.5198 + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 1.5199 + continue stateloop; 1.5200 + } 1.5201 + } 1.5202 + // FALLTHRU DON'T REORDER 1.5203 + case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: 1.5204 + betweendoctypepublicandsystemidentifiersloop: for (;;) { 1.5205 + if (++pos == endPos) { 1.5206 + break stateloop; 1.5207 + } 1.5208 + c = checkChar(buf, pos); 1.5209 + /* 1.5210 + * Consume the next input character: 1.5211 + */ 1.5212 + switch (c) { 1.5213 + case '\r': 1.5214 + silentCarriageReturn(); 1.5215 + break stateloop; 1.5216 + case '\n': 1.5217 + silentLineFeed(); 1.5218 + // fall thru 1.5219 + case ' ': 1.5220 + case '\t': 1.5221 + case '\u000C': 1.5222 + /* 1.5223 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.5224 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1.5225 + * in the between DOCTYPE public and system 1.5226 + * identifiers state. 1.5227 + */ 1.5228 + continue; 1.5229 + case '>': 1.5230 + /* 1.5231 + * U+003E GREATER-THAN SIGN (>) Emit the current 1.5232 + * DOCTYPE token. 1.5233 + */ 1.5234 + emitDoctypeToken(pos); 1.5235 + /* 1.5236 + * Switch to the data state. 1.5237 + */ 1.5238 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.5239 + continue stateloop; 1.5240 + case '"': 1.5241 + /* 1.5242 + * U+0022 QUOTATION MARK (") Set the DOCTYPE 1.5243 + * token's system identifier to the empty string 1.5244 + * (not missing), 1.5245 + */ 1.5246 + clearLongStrBuf(); 1.5247 + /* 1.5248 + * then switch to the DOCTYPE system identifier 1.5249 + * (double-quoted) state. 
1.5250 + */ 1.5251 + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 1.5252 + break betweendoctypepublicandsystemidentifiersloop; 1.5253 + // continue stateloop; 1.5254 + case '\'': 1.5255 + /* 1.5256 + * U+0027 APOSTROPHE (') Set the DOCTYPE token's 1.5257 + * system identifier to the empty string (not 1.5258 + * missing), 1.5259 + */ 1.5260 + clearLongStrBuf(); 1.5261 + /* 1.5262 + * then switch to the DOCTYPE system identifier 1.5263 + * (single-quoted) state. 1.5264 + */ 1.5265 + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 1.5266 + continue stateloop; 1.5267 + default: 1.5268 + bogusDoctype(); 1.5269 + /* 1.5270 + * Set the DOCTYPE token's force-quirks flag to 1.5271 + * on. 1.5272 + */ 1.5273 + // done by bogusDoctype(); 1.5274 + /* 1.5275 + * Switch to the bogus DOCTYPE state. 1.5276 + */ 1.5277 + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 1.5278 + continue stateloop; 1.5279 + } 1.5280 + } 1.5281 + // FALLTHRU DON'T REORDER 1.5282 + case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: 1.5283 + doctypesystemidentifierdoublequotedloop: for (;;) { 1.5284 + if (++pos == endPos) { 1.5285 + break stateloop; 1.5286 + } 1.5287 + c = checkChar(buf, pos); 1.5288 + /* 1.5289 + * Consume the next input character: 1.5290 + */ 1.5291 + switch (c) { 1.5292 + case '"': 1.5293 + /* 1.5294 + * U+0022 QUOTATION MARK (") Switch to the after 1.5295 + * DOCTYPE system identifier state. 1.5296 + */ 1.5297 + systemIdentifier = longStrBufToString(); 1.5298 + state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 1.5299 + continue stateloop; 1.5300 + case '>': 1.5301 + /* 1.5302 + * U+003E GREATER-THAN SIGN (>) Parse error. 1.5303 + */ 1.5304 + errGtInSystemId(); 1.5305 + /* 1.5306 + * Set the DOCTYPE token's force-quirks flag to 1.5307 + * on. 1.5308 + */ 1.5309 + forceQuirks = true; 1.5310 + /* 1.5311 + * Emit that DOCTYPE token. 
1.5312 + */ 1.5313 + systemIdentifier = longStrBufToString(); 1.5314 + emitDoctypeToken(pos); 1.5315 + /* 1.5316 + * Switch to the data state. 1.5317 + */ 1.5318 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.5319 + continue stateloop; 1.5320 + case '\r': 1.5321 + appendLongStrBufCarriageReturn(); 1.5322 + break stateloop; 1.5323 + case '\n': 1.5324 + appendLongStrBufLineFeed(); 1.5325 + continue; 1.5326 + case '\u0000': 1.5327 + c = '\uFFFD'; 1.5328 + // fall thru 1.5329 + default: 1.5330 + /* 1.5331 + * Anything else Append the current input 1.5332 + * character to the current DOCTYPE token's 1.5333 + * system identifier. 1.5334 + */ 1.5335 + appendLongStrBuf(c); 1.5336 + /* 1.5337 + * Stay in the DOCTYPE system identifier 1.5338 + * (double-quoted) state. 1.5339 + */ 1.5340 + continue; 1.5341 + } 1.5342 + } 1.5343 + // FALLTHRU DON'T REORDER 1.5344 + case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: 1.5345 + afterdoctypesystemidentifierloop: for (;;) { 1.5346 + if (++pos == endPos) { 1.5347 + break stateloop; 1.5348 + } 1.5349 + c = checkChar(buf, pos); 1.5350 + /* 1.5351 + * Consume the next input character: 1.5352 + */ 1.5353 + switch (c) { 1.5354 + case '\r': 1.5355 + silentCarriageReturn(); 1.5356 + break stateloop; 1.5357 + case '\n': 1.5358 + silentLineFeed(); 1.5359 + // fall thru 1.5360 + case ' ': 1.5361 + case '\t': 1.5362 + case '\u000C': 1.5363 + /* 1.5364 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.5365 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1.5366 + * in the after DOCTYPE system identifier state. 1.5367 + */ 1.5368 + continue; 1.5369 + case '>': 1.5370 + /* 1.5371 + * U+003E GREATER-THAN SIGN (>) Emit the current 1.5372 + * DOCTYPE token. 1.5373 + */ 1.5374 + emitDoctypeToken(pos); 1.5375 + /* 1.5376 + * Switch to the data state. 1.5377 + */ 1.5378 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.5379 + continue stateloop; 1.5380 + default: 1.5381 + /* 1.5382 + * Switch to the bogus DOCTYPE state. 
(This does 1.5383 + * not set the DOCTYPE token's force-quirks flag 1.5384 + * to on.) 1.5385 + */ 1.5386 + bogusDoctypeWithoutQuirks(); 1.5387 + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 1.5388 + break afterdoctypesystemidentifierloop; 1.5389 + // continue stateloop; 1.5390 + } 1.5391 + } 1.5392 + // FALLTHRU DON'T REORDER 1.5393 + case BOGUS_DOCTYPE: 1.5394 + for (;;) { 1.5395 + if (reconsume) { 1.5396 + reconsume = false; 1.5397 + } else { 1.5398 + if (++pos == endPos) { 1.5399 + break stateloop; 1.5400 + } 1.5401 + c = checkChar(buf, pos); 1.5402 + } 1.5403 + /* 1.5404 + * Consume the next input character: 1.5405 + */ 1.5406 + switch (c) { 1.5407 + case '>': 1.5408 + /* 1.5409 + * U+003E GREATER-THAN SIGN (>) Emit that 1.5410 + * DOCTYPE token. 1.5411 + */ 1.5412 + emitDoctypeToken(pos); 1.5413 + /* 1.5414 + * Switch to the data state. 1.5415 + */ 1.5416 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.5417 + continue stateloop; 1.5418 + case '\r': 1.5419 + silentCarriageReturn(); 1.5420 + break stateloop; 1.5421 + case '\n': 1.5422 + silentLineFeed(); 1.5423 + // fall thru 1.5424 + default: 1.5425 + /* 1.5426 + * Anything else Stay in the bogus DOCTYPE 1.5427 + * state. 1.5428 + */ 1.5429 + continue; 1.5430 + } 1.5431 + } 1.5432 + // XXX reorder point 1.5433 + case DOCTYPE_YSTEM: 1.5434 + doctypeystemloop: for (;;) { 1.5435 + if (++pos == endPos) { 1.5436 + break stateloop; 1.5437 + } 1.5438 + c = checkChar(buf, pos); 1.5439 + /* 1.5440 + * Otherwise, if the six characters starting from the 1.5441 + * current input character are an ASCII case-insensitive 1.5442 + * match for the word "SYSTEM", then consume those 1.5443 + * characters and switch to the before DOCTYPE system 1.5444 + * identifier state. 
1.5445 + */ 1.5446 + if (index < 5) { // YSTEM.length 1.5447 + char folded = c; 1.5448 + if (c >= 'A' && c <= 'Z') { 1.5449 + folded += 0x20; 1.5450 + } 1.5451 + if (folded != Tokenizer.YSTEM[index]) { 1.5452 + bogusDoctype(); 1.5453 + reconsume = true; 1.5454 + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 1.5455 + continue stateloop; 1.5456 + } 1.5457 + index++; 1.5458 + continue stateloop; 1.5459 + } else { 1.5460 + reconsume = true; 1.5461 + state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos); 1.5462 + break doctypeystemloop; 1.5463 + // continue stateloop; 1.5464 + } 1.5465 + } 1.5466 + // FALLTHRU DON'T REORDER 1.5467 + case AFTER_DOCTYPE_SYSTEM_KEYWORD: 1.5468 + afterdoctypesystemkeywordloop: for (;;) { 1.5469 + if (reconsume) { 1.5470 + reconsume = false; 1.5471 + } else { 1.5472 + if (++pos == endPos) { 1.5473 + break stateloop; 1.5474 + } 1.5475 + c = checkChar(buf, pos); 1.5476 + } 1.5477 + /* 1.5478 + * Consume the next input character: 1.5479 + */ 1.5480 + switch (c) { 1.5481 + case '\r': 1.5482 + silentCarriageReturn(); 1.5483 + state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 1.5484 + break stateloop; 1.5485 + case '\n': 1.5486 + silentLineFeed(); 1.5487 + // fall thru 1.5488 + case ' ': 1.5489 + case '\t': 1.5490 + case '\u000C': 1.5491 + /* 1.5492 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.5493 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE 1.5494 + * Switch to the before DOCTYPE public 1.5495 + * identifier state. 1.5496 + */ 1.5497 + state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 1.5498 + break afterdoctypesystemkeywordloop; 1.5499 + // FALL THROUGH continue stateloop 1.5500 + case '"': 1.5501 + /* 1.5502 + * U+0022 QUOTATION MARK (") Parse Error. 
1.5503 + */ 1.5504 + errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); 1.5505 + /* 1.5506 + * Set the DOCTYPE token's system identifier to 1.5507 + * the empty string (not missing), 1.5508 + */ 1.5509 + clearLongStrBuf(); 1.5510 + /* 1.5511 + * then switch to the DOCTYPE public identifier 1.5512 + * (double-quoted) state. 1.5513 + */ 1.5514 + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 1.5515 + continue stateloop; 1.5516 + case '\'': 1.5517 + /* 1.5518 + * U+0027 APOSTROPHE (') Parse Error. 1.5519 + */ 1.5520 + errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); 1.5521 + /* 1.5522 + * Set the DOCTYPE token's public identifier to 1.5523 + * the empty string (not missing), 1.5524 + */ 1.5525 + clearLongStrBuf(); 1.5526 + /* 1.5527 + * then switch to the DOCTYPE public identifier 1.5528 + * (single-quoted) state. 1.5529 + */ 1.5530 + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 1.5531 + continue stateloop; 1.5532 + case '>': 1.5533 + /* U+003E GREATER-THAN SIGN (>) Parse error. */ 1.5534 + errExpectedPublicId(); 1.5535 + /* 1.5536 + * Set the DOCTYPE token's force-quirks flag to 1.5537 + * on. 1.5538 + */ 1.5539 + forceQuirks = true; 1.5540 + /* 1.5541 + * Emit that DOCTYPE token. 1.5542 + */ 1.5543 + emitDoctypeToken(pos); 1.5544 + /* 1.5545 + * Switch to the data state. 1.5546 + */ 1.5547 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.5548 + continue stateloop; 1.5549 + default: 1.5550 + bogusDoctype(); 1.5551 + /* 1.5552 + * Set the DOCTYPE token's force-quirks flag to 1.5553 + * on. 1.5554 + */ 1.5555 + // done by bogusDoctype(); 1.5556 + /* 1.5557 + * Switch to the bogus DOCTYPE state. 
1.5558 + */ 1.5559 + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 1.5560 + continue stateloop; 1.5561 + } 1.5562 + } 1.5563 + // FALLTHRU DON'T REORDER 1.5564 + case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: 1.5565 + beforedoctypesystemidentifierloop: for (;;) { 1.5566 + if (++pos == endPos) { 1.5567 + break stateloop; 1.5568 + } 1.5569 + c = checkChar(buf, pos); 1.5570 + /* 1.5571 + * Consume the next input character: 1.5572 + */ 1.5573 + switch (c) { 1.5574 + case '\r': 1.5575 + silentCarriageReturn(); 1.5576 + break stateloop; 1.5577 + case '\n': 1.5578 + silentLineFeed(); 1.5579 + // fall thru 1.5580 + case ' ': 1.5581 + case '\t': 1.5582 + case '\u000C': 1.5583 + /* 1.5584 + * U+0009 CHARACTER TABULATION U+000A LINE FEED 1.5585 + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay 1.5586 + * in the before DOCTYPE system identifier 1.5587 + * state. 1.5588 + */ 1.5589 + continue; 1.5590 + case '"': 1.5591 + /* 1.5592 + * U+0022 QUOTATION MARK (") Set the DOCTYPE 1.5593 + * token's system identifier to the empty string 1.5594 + * (not missing), 1.5595 + */ 1.5596 + clearLongStrBuf(); 1.5597 + /* 1.5598 + * then switch to the DOCTYPE system identifier 1.5599 + * (double-quoted) state. 1.5600 + */ 1.5601 + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); 1.5602 + continue stateloop; 1.5603 + case '\'': 1.5604 + /* 1.5605 + * U+0027 APOSTROPHE (') Set the DOCTYPE token's 1.5606 + * system identifier to the empty string (not 1.5607 + * missing), 1.5608 + */ 1.5609 + clearLongStrBuf(); 1.5610 + /* 1.5611 + * then switch to the DOCTYPE system identifier 1.5612 + * (single-quoted) state. 1.5613 + */ 1.5614 + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); 1.5615 + break beforedoctypesystemidentifierloop; 1.5616 + // continue stateloop; 1.5617 + case '>': 1.5618 + /* U+003E GREATER-THAN SIGN (>) Parse error. 
*/ 1.5619 + errExpectedSystemId(); 1.5620 + /* 1.5621 + * Set the DOCTYPE token's force-quirks flag to 1.5622 + * on. 1.5623 + */ 1.5624 + forceQuirks = true; 1.5625 + /* 1.5626 + * Emit that DOCTYPE token. 1.5627 + */ 1.5628 + emitDoctypeToken(pos); 1.5629 + /* 1.5630 + * Switch to the data state. 1.5631 + */ 1.5632 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.5633 + continue stateloop; 1.5634 + default: 1.5635 + bogusDoctype(); 1.5636 + /* 1.5637 + * Set the DOCTYPE token's force-quirks flag to 1.5638 + * on. 1.5639 + */ 1.5640 + // done by bogusDoctype(); 1.5641 + /* 1.5642 + * Switch to the bogus DOCTYPE state. 1.5643 + */ 1.5644 + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); 1.5645 + continue stateloop; 1.5646 + } 1.5647 + } 1.5648 + // FALLTHRU DON'T REORDER 1.5649 + case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: 1.5650 + for (;;) { 1.5651 + if (++pos == endPos) { 1.5652 + break stateloop; 1.5653 + } 1.5654 + c = checkChar(buf, pos); 1.5655 + /* 1.5656 + * Consume the next input character: 1.5657 + */ 1.5658 + switch (c) { 1.5659 + case '\'': 1.5660 + /* 1.5661 + * U+0027 APOSTROPHE (') Switch to the after 1.5662 + * DOCTYPE system identifier state. 1.5663 + */ 1.5664 + systemIdentifier = longStrBufToString(); 1.5665 + state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); 1.5666 + continue stateloop; 1.5667 + case '>': 1.5668 + errGtInSystemId(); 1.5669 + /* 1.5670 + * Set the DOCTYPE token's force-quirks flag to 1.5671 + * on. 1.5672 + */ 1.5673 + forceQuirks = true; 1.5674 + /* 1.5675 + * Emit that DOCTYPE token. 1.5676 + */ 1.5677 + systemIdentifier = longStrBufToString(); 1.5678 + emitDoctypeToken(pos); 1.5679 + /* 1.5680 + * Switch to the data state. 
1.5681 + */ 1.5682 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.5683 + continue stateloop; 1.5684 + case '\r': 1.5685 + appendLongStrBufCarriageReturn(); 1.5686 + break stateloop; 1.5687 + case '\n': 1.5688 + appendLongStrBufLineFeed(); 1.5689 + continue; 1.5690 + case '\u0000': 1.5691 + c = '\uFFFD'; 1.5692 + // fall thru 1.5693 + default: 1.5694 + /* 1.5695 + * Anything else Append the current input 1.5696 + * character to the current DOCTYPE token's 1.5697 + * system identifier. 1.5698 + */ 1.5699 + appendLongStrBuf(c); 1.5700 + /* 1.5701 + * Stay in the DOCTYPE system identifier 1.5702 + * (double-quoted) state. 1.5703 + */ 1.5704 + continue; 1.5705 + } 1.5706 + } 1.5707 + // XXX reorder point 1.5708 + case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: 1.5709 + for (;;) { 1.5710 + if (++pos == endPos) { 1.5711 + break stateloop; 1.5712 + } 1.5713 + c = checkChar(buf, pos); 1.5714 + /* 1.5715 + * Consume the next input character: 1.5716 + */ 1.5717 + switch (c) { 1.5718 + case '\'': 1.5719 + /* 1.5720 + * U+0027 APOSTROPHE (') Switch to the after 1.5721 + * DOCTYPE public identifier state. 1.5722 + */ 1.5723 + publicIdentifier = longStrBufToString(); 1.5724 + state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); 1.5725 + continue stateloop; 1.5726 + case '>': 1.5727 + errGtInPublicId(); 1.5728 + /* 1.5729 + * Set the DOCTYPE token's force-quirks flag to 1.5730 + * on. 1.5731 + */ 1.5732 + forceQuirks = true; 1.5733 + /* 1.5734 + * Emit that DOCTYPE token. 1.5735 + */ 1.5736 + publicIdentifier = longStrBufToString(); 1.5737 + emitDoctypeToken(pos); 1.5738 + /* 1.5739 + * Switch to the data state. 
1.5740 + */ 1.5741 + state = transition(state, Tokenizer.DATA, reconsume, pos); 1.5742 + continue stateloop; 1.5743 + case '\r': 1.5744 + appendLongStrBufCarriageReturn(); 1.5745 + break stateloop; 1.5746 + case '\n': 1.5747 + appendLongStrBufLineFeed(); 1.5748 + continue; 1.5749 + case '\u0000': 1.5750 + c = '\uFFFD'; 1.5751 + // fall thru 1.5752 + default: 1.5753 + /* 1.5754 + * Anything else Append the current input 1.5755 + * character to the current DOCTYPE token's 1.5756 + * public identifier. 1.5757 + */ 1.5758 + appendLongStrBuf(c); 1.5759 + /* 1.5760 + * Stay in the DOCTYPE public identifier 1.5761 + * (single-quoted) state. 1.5762 + */ 1.5763 + continue; 1.5764 + } 1.5765 + } 1.5766 + // XXX reorder point 1.5767 + case PROCESSING_INSTRUCTION: 1.5768 + processinginstructionloop: for (;;) { 1.5769 + if (++pos == endPos) { 1.5770 + break stateloop; 1.5771 + } 1.5772 + c = checkChar(buf, pos); 1.5773 + switch (c) { 1.5774 + case '?': 1.5775 + state = transition( 1.5776 + state, 1.5777 + Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK, 1.5778 + reconsume, pos); 1.5779 + break processinginstructionloop; 1.5780 + // continue stateloop; 1.5781 + default: 1.5782 + continue; 1.5783 + } 1.5784 + } 1.5785 + case PROCESSING_INSTRUCTION_QUESTION_MARK: 1.5786 + if (++pos == endPos) { 1.5787 + break stateloop; 1.5788 + } 1.5789 + c = checkChar(buf, pos); 1.5790 + switch (c) { 1.5791 + case '>': 1.5792 + state = transition(state, Tokenizer.DATA, 1.5793 + reconsume, pos); 1.5794 + continue stateloop; 1.5795 + default: 1.5796 + state = transition(state, 1.5797 + Tokenizer.PROCESSING_INSTRUCTION, 1.5798 + reconsume, pos); 1.5799 + continue stateloop; 1.5800 + } 1.5801 + // END HOTSPOT WORKAROUND 1.5802 + } 1.5803 + } 1.5804 + flushChars(buf, pos); 1.5805 + /* 1.5806 + * if (prevCR && pos != endPos) { // why is this needed? 
pos--; col--; }
         */
        // Save locals
        stateSave = state;
        returnStateSave = returnState;
        return pos;
    }

    // HOTSPOT WORKAROUND INSERTION POINT

    // [NOCPP[

    /**
     * Returns the state that the state loop assigns after a transition.
     * This implementation returns {@code to} unchanged and does not use
     * {@code from}, {@code reconsume} or {@code pos}.
     * NOTE(review): being protected, this is presumably a hook for
     * subclasses that want to observe transitions -- confirm.
     *
     * @param from the state being left
     * @param to the state being entered
     * @param reconsume the caller's current reconsume flag
     * @param pos the caller's current buffer position
     * @return {@code to}
     * @throws SAXException declared but not thrown by this implementation
     */
    protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
        return to;
    }

    // ]NOCPP]

    /**
     * Resets the DOCTYPE-related fields: the name becomes the empty string,
     * any held system and public identifier strings are released through
     * {@code Portability.releaseString} and set to {@code null}, and the
     * force-quirks flag is cleared.
     */
    private void initDoctypeFields() {
        doctypeName = "";
        if (systemIdentifier != null) {
            Portability.releaseString(systemIdentifier);
            systemIdentifier = null;
        }
        if (publicIdentifier != null) {
            Portability.releaseString(publicIdentifier);
            publicIdentifier = null;
        }
        forceQuirks = false;
    }

    /**
     * Records a carriage return via {@code silentCarriageReturn()} and then
     * passes a line feed character to
     * {@code adjustDoubleHyphenAndAppendToLongStrBufAndErr}.
     *
     * @throws SAXException declared for the append-and-report call
     */
    @Inline private void adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn()
            throws SAXException {
        silentCarriageReturn();
        // The carriage return is handed on as '\n', not '\r'.
        adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
    }

    /**
     * Records a line feed via {@code silentLineFeed()} and then passes a
     * line feed character to
     * {@code adjustDoubleHyphenAndAppendToLongStrBufAndErr}.
     *
     * @throws SAXException declared for the append-and-report call
     */
    @Inline private void adjustDoubleHyphenAndAppendToLongStrBufLineFeed()
            throws SAXException {
        silentLineFeed();
        adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
    }

    /**
     * Records a line feed via {@code silentLineFeed()} and appends a line
     * feed character to the long string buffer.
     */
    @Inline private void appendLongStrBufLineFeed() {
        silentLineFeed();
        appendLongStrBuf('\n');
    }

    /**
     * Records a carriage return via {@code silentCarriageReturn()} and
     * appends a line feed character to the long string buffer.
     */
    @Inline private void appendLongStrBufCarriageReturn() {
        silentCarriageReturn();
        // The carriage return is stored in the buffer as '\n'.
        appendLongStrBuf('\n');
    }

    /**
     * Bookkeeping for a carriage return: increments the line counter and
     * sets {@code lastCR}. Nothing else is done here.
     */
    @Inline protected void silentCarriageReturn() {
        ++line;
        lastCR = true;
    }

    /**
     * Bookkeeping for a line feed: increments the line counter. Nothing
     * else is done here.
     */
    @Inline protected void silentLineFeed() {
        ++line;
    }

    /**
     * Handles a carriage return at {@code pos}: records it via
     * {@code silentCarriageReturn()}, calls {@code flushChars(buf, pos)},
     * reports one character taken from {@code Tokenizer.LF} to the token
     * handler, and finally sets {@code cstart} to
     * {@code Integer.MAX_VALUE}.
     *
     * @param buf the buffer handed to {@code flushChars}
     * @param pos the position handed to {@code flushChars}
     * @throws SAXException declared for the flush and token handler calls
     */
    private void emitCarriageReturn(@NoLength char[] buf, int pos)
            throws SAXException {
        silentCarriageReturn();
        flushChars(buf, pos);
        tokenHandler.characters(Tokenizer.LF, 0, 1);
        // NOTE(review): Integer.MAX_VALUE presumably tells flushChars that no
        // run of buffered characters is pending -- confirm in flushChars.
        cstart = Integer.MAX_VALUE;
    }

    /**
     * Calls {@code flushChars(buf, pos)}, asks the token handler for a
     * zero-originating replacement character, and sets {@code cstart} to
     * {@code pos + 1}.
     *
     * @param buf the buffer handed to {@code flushChars}
     * @param pos the position handed to {@code flushChars}
     * @throws SAXException declared for the flush and token handler calls
     */
    private void emitReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.zeroOriginatingReplacementCharacter();
        // NOTE(review): presumably cstart marks where the next flushed run
        // begins, so pos + 1 skips the character at pos -- confirm.
        cstart = pos + 1;
    }

    /**
     * Calls {@code flushChars(buf, pos)}, reports one character taken from
     * {@code REPLACEMENT_CHARACTER} to the token handler as ordinary
     * characters, and sets {@code cstart} to {@code pos + 1}.
     *
     * @param buf the buffer handed to {@code flushChars}
     * @param pos the position handed to {@code flushChars}
     * @throws SAXException declared for the flush and token handler calls
     */
    private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
        cstart = pos + 1;
    }

    /**
     * Stores {@code add} in the {@code additional} field and, between the
     * NOCPP markers, stores a new {@code LocatorImpl} constructed from this
     * tokenizer in {@code ampersandLocation}.
     *
     * @param add the value to store in {@code additional}
     */
    private void setAdditionalAndRememberAmpersandLocation(char add) {
        additional = add;
        // [NOCPP[
        ampersandLocation = new LocatorImpl(this);
        // ]NOCPP]
    }

    /**
     * Reports a bogus DOCTYPE via {@code errBogusDoctype()} and turns the
     * force-quirks flag on.
     *
     * @throws SAXException declared for the error-reporting call
     */
    private void bogusDoctype() throws SAXException {
        errBogusDoctype();
        forceQuirks = true;
    }

    /**
     * Reports a bogus DOCTYPE via {@code errBogusDoctype()} and turns the
     * force-quirks flag off.
     *
     * @throws SAXException declared for the error-reporting call
     */
    private void bogusDoctypeWithoutQuirks() throws SAXException {
        errBogusDoctype();
        forceQuirks = false;
    }

    /**
     * Disposes of the string buffer according to {@code returnState}: when
     * any bit of {@code DATA_AND_RCDATA_MASK} is set in it, calls
     * {@code appendStrBufToLongStrBuf()}; otherwise calls
     * {@code emitStrBuf()}.
     *
     * @param returnState the state value tested against the mask
     * @throws SAXException declared for the emit path
     */
    private void emitOrAppendStrBuf(int returnState) throws SAXException {
        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
            appendStrBufToLongStrBuf();
        } else {
            emitStrBuf();
        }
    }

    /**
     * Turns the number held in the {@code value} field into the character
     * data of a numeric character reference and hands it, together with
     * {@code returnState}, to {@code emitOrAppendOne} (one {@code char}) or
     * {@code emitOrAppendTwo} (two {@code char}s).
     *
     * Branches, in the order they are tested:
     * <ul>
     * <li>0x80..0x9F: error, then the entry
     * {@code NamedCharacters.WINDOWS_1252[value - 0x80]};</li>
     * <li>0xC while {@code contentSpacePolicy} is not {@code ALLOW}:
     * {@code Tokenizer.SPACE} for {@code ALTER_INFOSET}, {@code fatal(...)}
     * for {@code FATAL};</li>
     * <li>0x0 and 0xD800..0xDFFF: error, then
     * {@code Tokenizer.REPLACEMENT_CHARACTER};</li>
     * <li>any other value up to 0xFFFF: the value cast to {@code char},
     * after the checks between the NOCPP markers;</li>
     * <li>up to 0x10FFFF: two {@code char}s written to
     * {@code astralChar};</li>
     * <li>anything larger: error, then
     * {@code Tokenizer.REPLACEMENT_CHARACTER}.</li>
     * </ul>
     *
     * NOTE(review): the {@code [NOCPP[} / {@code ]NOCPP]} comment markers
     * presumably delimit regions that a source translator removes; one pair
     * wraps an entire {@code else if} arm. Confirm before moving code across
     * them.
     *
     * @param returnState passed through unchanged to the emit-or-append
     *        helpers
     * @throws SAXException declared for the error-reporting and emit calls
     */
    private void handleNcrValue(int returnState) throws SAXException {
        /*
         * If one or more characters match the range, then take them all and
         * interpret the string of characters as a number (either hexadecimal or
         * decimal as appropriate).
         */
        if (value <= 0xFFFF) {
            if (value >= 0x80 && value <= 0x9f) {
                /*
                 * If that number is one of the numbers in the first column of
                 * the following table, then this is a parse error.
                 */
                errNcrInC1Range();
                /*
                 * Find the row with that number in the first column, and return
                 * a character token for the Unicode character given in the
                 * second column of that row.
                 */
                @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
                emitOrAppendOne(val, returnState);
                // [NOCPP[
            } else if (value == 0xC
                    && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
                // Form feed under a non-ALLOW policy: replaced by a space or
                // reported through fatal(); no form feed is emitted here.
                if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
                    emitOrAppendOne(Tokenizer.SPACE, returnState);
                } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
                    fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
                }
                // ]NOCPP]
            } else if (value == 0x0) {
                errNcrZero();
                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
            } else if ((value & 0xF800) == 0xD800) {
                // For values in 0..0xFFFF this mask test matches exactly
                // 0xD800..0xDFFF.
                errNcrSurrogate();
                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
            } else {
                /*
                 * Otherwise, return a character token for the Unicode character
                 * whose code point is that number.
                 */
                char ch = (char) value;
                // [NOCPP[
                if (value == 0x0D) {
                    errNcrCr();
                } else if ((value <= 0x0008) || (value == 0x000B)
                        || (value >= 0x000E && value <= 0x001F)) {
                    // This overload may substitute a different character.
                    ch = errNcrControlChar(ch);
                } else if (value >= 0xFDD0 && value <= 0xFDEF) {
                    errNcrUnassigned();
                } else if ((value & 0xFFFE) == 0xFFFE) {
                    // Low 16 bits are 0xFFFE or 0xFFFF.
                    ch = errNcrNonCharacter(ch);
                } else if (value >= 0x007F && value <= 0x009F) {
                    // 0x80..0x9F was consumed by the first branch above, so
                    // only 0x7F can reach this test.
                    errNcrControlChar();
                } else {
                    maybeWarnPrivateUse(ch);
                }
                // ]NOCPP]
                bmpChar[0] = ch;
                emitOrAppendOne(bmpChar, returnState);
            }
        } else if (value <= 0x10FFFF) {
            // [NOCPP[
            maybeWarnPrivateUseAstral();
            if ((value & 0xFFFE) == 0xFFFE) {
                errAstralNonCharacter(value);
            }
            // ]NOCPP]
            // Two UTF-16 code units: the second is 0xDC00 plus the low ten
            // bits. NOTE(review): the first is presumably the matching lead
            // surrogate; Tokenizer.LEAD_OFFSET is defined elsewhere -- confirm.
            astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
            astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
            emitOrAppendTwo(astralChar, returnState);
        } else {
            errNcrOutOfRange();
            emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
        }
    }

    public void eof() throws SAXException {
        int state = stateSave;
        int returnState = returnStateSave;

        eofloop: for (;;) {
            switch (state) {
                case SCRIPT_DATA_LESS_THAN_SIGN:
                case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
                    /*
                     * Otherwise, emit a U+003C LESS-THAN SIGN character token
                     */
                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
                    /*
                     * and reconsume the current input character in the data
                     * state.
                     */
                    break eofloop;
                case TAG_OPEN:
                    /*
                     * The behavior of this state depends on the content model
                     * flag.
                     */
                    /*
                     * Anything else Parse error.
1.6015 + */ 1.6016 + errEofAfterLt(); 1.6017 + /* 1.6018 + * Emit a U+003C LESS-THAN SIGN character token 1.6019 + */ 1.6020 + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 1.6021 + /* 1.6022 + * and reconsume the current input character in the data 1.6023 + * state. 1.6024 + */ 1.6025 + break eofloop; 1.6026 + case RAWTEXT_RCDATA_LESS_THAN_SIGN: 1.6027 + /* 1.6028 + * Emit a U+003C LESS-THAN SIGN character token 1.6029 + */ 1.6030 + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); 1.6031 + /* 1.6032 + * and reconsume the current input character in the RCDATA 1.6033 + * state. 1.6034 + */ 1.6035 + break eofloop; 1.6036 + case NON_DATA_END_TAG_NAME: 1.6037 + /* 1.6038 + * Emit a U+003C LESS-THAN SIGN character token, a U+002F 1.6039 + * SOLIDUS character token, 1.6040 + */ 1.6041 + tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2); 1.6042 + /* 1.6043 + * a character token for each of the characters in the 1.6044 + * temporary buffer (in the order they were added to the 1.6045 + * buffer), 1.6046 + */ 1.6047 + emitStrBuf(); 1.6048 + /* 1.6049 + * and reconsume the current input character in the RCDATA 1.6050 + * state. 1.6051 + */ 1.6052 + break eofloop; 1.6053 + case CLOSE_TAG_OPEN: 1.6054 + /* EOF Parse error. */ 1.6055 + errEofAfterLt(); 1.6056 + /* 1.6057 + * Emit a U+003C LESS-THAN SIGN character token and a U+002F 1.6058 + * SOLIDUS character token. 1.6059 + */ 1.6060 + tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2); 1.6061 + /* 1.6062 + * Reconsume the EOF character in the data state. 1.6063 + */ 1.6064 + break eofloop; 1.6065 + case TAG_NAME: 1.6066 + /* 1.6067 + * EOF Parse error. 1.6068 + */ 1.6069 + errEofInTagName(); 1.6070 + /* 1.6071 + * Reconsume the EOF character in the data state. 1.6072 + */ 1.6073 + break eofloop; 1.6074 + case BEFORE_ATTRIBUTE_NAME: 1.6075 + case AFTER_ATTRIBUTE_VALUE_QUOTED: 1.6076 + case SELF_CLOSING_START_TAG: 1.6077 + /* EOF Parse error. 
*/ 1.6078 + errEofWithoutGt(); 1.6079 + /* 1.6080 + * Reconsume the EOF character in the data state. 1.6081 + */ 1.6082 + break eofloop; 1.6083 + case ATTRIBUTE_NAME: 1.6084 + /* 1.6085 + * EOF Parse error. 1.6086 + */ 1.6087 + errEofInAttributeName(); 1.6088 + /* 1.6089 + * Reconsume the EOF character in the data state. 1.6090 + */ 1.6091 + break eofloop; 1.6092 + case AFTER_ATTRIBUTE_NAME: 1.6093 + case BEFORE_ATTRIBUTE_VALUE: 1.6094 + /* EOF Parse error. */ 1.6095 + errEofWithoutGt(); 1.6096 + /* 1.6097 + * Reconsume the EOF character in the data state. 1.6098 + */ 1.6099 + break eofloop; 1.6100 + case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 1.6101 + case ATTRIBUTE_VALUE_SINGLE_QUOTED: 1.6102 + case ATTRIBUTE_VALUE_UNQUOTED: 1.6103 + /* EOF Parse error. */ 1.6104 + errEofInAttributeValue(); 1.6105 + /* 1.6106 + * Reconsume the EOF character in the data state. 1.6107 + */ 1.6108 + break eofloop; 1.6109 + case BOGUS_COMMENT: 1.6110 + emitComment(0, 0); 1.6111 + break eofloop; 1.6112 + case BOGUS_COMMENT_HYPHEN: 1.6113 + // [NOCPP[ 1.6114 + maybeAppendSpaceToBogusComment(); 1.6115 + // ]NOCPP] 1.6116 + emitComment(0, 0); 1.6117 + break eofloop; 1.6118 + case MARKUP_DECLARATION_OPEN: 1.6119 + errBogusComment(); 1.6120 + clearLongStrBuf(); 1.6121 + emitComment(0, 0); 1.6122 + break eofloop; 1.6123 + case MARKUP_DECLARATION_HYPHEN: 1.6124 + errBogusComment(); 1.6125 + emitComment(0, 0); 1.6126 + break eofloop; 1.6127 + case MARKUP_DECLARATION_OCTYPE: 1.6128 + if (index < 6) { 1.6129 + errBogusComment(); 1.6130 + emitComment(0, 0); 1.6131 + } else { 1.6132 + /* EOF Parse error. */ 1.6133 + errEofInDoctype(); 1.6134 + /* 1.6135 + * Create a new DOCTYPE token. Set its force-quirks flag 1.6136 + * to on. 
1.6137 + */ 1.6138 + doctypeName = ""; 1.6139 + if (systemIdentifier != null) { 1.6140 + Portability.releaseString(systemIdentifier); 1.6141 + systemIdentifier = null; 1.6142 + } 1.6143 + if (publicIdentifier != null) { 1.6144 + Portability.releaseString(publicIdentifier); 1.6145 + publicIdentifier = null; 1.6146 + } 1.6147 + forceQuirks = true; 1.6148 + /* 1.6149 + * Emit the token. 1.6150 + */ 1.6151 + emitDoctypeToken(0); 1.6152 + /* 1.6153 + * Reconsume the EOF character in the data state. 1.6154 + */ 1.6155 + break eofloop; 1.6156 + } 1.6157 + break eofloop; 1.6158 + case COMMENT_START: 1.6159 + case COMMENT: 1.6160 + /* 1.6161 + * EOF Parse error. 1.6162 + */ 1.6163 + errEofInComment(); 1.6164 + /* Emit the comment token. */ 1.6165 + emitComment(0, 0); 1.6166 + /* 1.6167 + * Reconsume the EOF character in the data state. 1.6168 + */ 1.6169 + break eofloop; 1.6170 + case COMMENT_END: 1.6171 + errEofInComment(); 1.6172 + /* Emit the comment token. */ 1.6173 + emitComment(2, 0); 1.6174 + /* 1.6175 + * Reconsume the EOF character in the data state. 1.6176 + */ 1.6177 + break eofloop; 1.6178 + case COMMENT_END_DASH: 1.6179 + case COMMENT_START_DASH: 1.6180 + errEofInComment(); 1.6181 + /* Emit the comment token. */ 1.6182 + emitComment(1, 0); 1.6183 + /* 1.6184 + * Reconsume the EOF character in the data state. 1.6185 + */ 1.6186 + break eofloop; 1.6187 + case COMMENT_END_BANG: 1.6188 + errEofInComment(); 1.6189 + /* Emit the comment token. */ 1.6190 + emitComment(3, 0); 1.6191 + /* 1.6192 + * Reconsume the EOF character in the data state. 1.6193 + */ 1.6194 + break eofloop; 1.6195 + case DOCTYPE: 1.6196 + case BEFORE_DOCTYPE_NAME: 1.6197 + errEofInDoctype(); 1.6198 + /* 1.6199 + * Create a new DOCTYPE token. Set its force-quirks flag to 1.6200 + * on. 1.6201 + */ 1.6202 + forceQuirks = true; 1.6203 + /* 1.6204 + * Emit the token. 1.6205 + */ 1.6206 + emitDoctypeToken(0); 1.6207 + /* 1.6208 + * Reconsume the EOF character in the data state. 
1.6209 + */ 1.6210 + break eofloop; 1.6211 + case DOCTYPE_NAME: 1.6212 + errEofInDoctype(); 1.6213 + strBufToDoctypeName(); 1.6214 + /* 1.6215 + * Set the DOCTYPE token's force-quirks flag to on. 1.6216 + */ 1.6217 + forceQuirks = true; 1.6218 + /* 1.6219 + * Emit that DOCTYPE token. 1.6220 + */ 1.6221 + emitDoctypeToken(0); 1.6222 + /* 1.6223 + * Reconsume the EOF character in the data state. 1.6224 + */ 1.6225 + break eofloop; 1.6226 + case DOCTYPE_UBLIC: 1.6227 + case DOCTYPE_YSTEM: 1.6228 + case AFTER_DOCTYPE_NAME: 1.6229 + case AFTER_DOCTYPE_PUBLIC_KEYWORD: 1.6230 + case AFTER_DOCTYPE_SYSTEM_KEYWORD: 1.6231 + case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: 1.6232 + errEofInDoctype(); 1.6233 + /* 1.6234 + * Set the DOCTYPE token's force-quirks flag to on. 1.6235 + */ 1.6236 + forceQuirks = true; 1.6237 + /* 1.6238 + * Emit that DOCTYPE token. 1.6239 + */ 1.6240 + emitDoctypeToken(0); 1.6241 + /* 1.6242 + * Reconsume the EOF character in the data state. 1.6243 + */ 1.6244 + break eofloop; 1.6245 + case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: 1.6246 + case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: 1.6247 + /* EOF Parse error. */ 1.6248 + errEofInPublicId(); 1.6249 + /* 1.6250 + * Set the DOCTYPE token's force-quirks flag to on. 1.6251 + */ 1.6252 + forceQuirks = true; 1.6253 + /* 1.6254 + * Emit that DOCTYPE token. 1.6255 + */ 1.6256 + publicIdentifier = longStrBufToString(); 1.6257 + emitDoctypeToken(0); 1.6258 + /* 1.6259 + * Reconsume the EOF character in the data state. 1.6260 + */ 1.6261 + break eofloop; 1.6262 + case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: 1.6263 + case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: 1.6264 + case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: 1.6265 + errEofInDoctype(); 1.6266 + /* 1.6267 + * Set the DOCTYPE token's force-quirks flag to on. 1.6268 + */ 1.6269 + forceQuirks = true; 1.6270 + /* 1.6271 + * Emit that DOCTYPE token. 1.6272 + */ 1.6273 + emitDoctypeToken(0); 1.6274 + /* 1.6275 + * Reconsume the EOF character in the data state. 
1.6276 + */ 1.6277 + break eofloop; 1.6278 + case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: 1.6279 + case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: 1.6280 + /* EOF Parse error. */ 1.6281 + errEofInSystemId(); 1.6282 + /* 1.6283 + * Set the DOCTYPE token's force-quirks flag to on. 1.6284 + */ 1.6285 + forceQuirks = true; 1.6286 + /* 1.6287 + * Emit that DOCTYPE token. 1.6288 + */ 1.6289 + systemIdentifier = longStrBufToString(); 1.6290 + emitDoctypeToken(0); 1.6291 + /* 1.6292 + * Reconsume the EOF character in the data state. 1.6293 + */ 1.6294 + break eofloop; 1.6295 + case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: 1.6296 + errEofInDoctype(); 1.6297 + /* 1.6298 + * Set the DOCTYPE token's force-quirks flag to on. 1.6299 + */ 1.6300 + forceQuirks = true; 1.6301 + /* 1.6302 + * Emit that DOCTYPE token. 1.6303 + */ 1.6304 + emitDoctypeToken(0); 1.6305 + /* 1.6306 + * Reconsume the EOF character in the data state. 1.6307 + */ 1.6308 + break eofloop; 1.6309 + case BOGUS_DOCTYPE: 1.6310 + /* 1.6311 + * Emit that DOCTYPE token. 1.6312 + */ 1.6313 + emitDoctypeToken(0); 1.6314 + /* 1.6315 + * Reconsume the EOF character in the data state. 1.6316 + */ 1.6317 + break eofloop; 1.6318 + case CONSUME_CHARACTER_REFERENCE: 1.6319 + /* 1.6320 + * Unlike the definition is the spec, this state does not 1.6321 + * return a value and never requires the caller to 1.6322 + * backtrack. This state takes care of emitting characters 1.6323 + * or appending to the current attribute value. It also 1.6324 + * takes care of that in the case when consuming the entity 1.6325 + * fails. 1.6326 + */ 1.6327 + /* 1.6328 + * This section defines how to consume an entity. This 1.6329 + * definition is used when parsing entities in text and in 1.6330 + * attributes. 
1.6331 + * 1.6332 + * The behavior depends on the identity of the next 1.6333 + * character (the one immediately after the U+0026 AMPERSAND 1.6334 + * character): 1.6335 + */ 1.6336 + 1.6337 + emitOrAppendStrBuf(returnState); 1.6338 + state = returnState; 1.6339 + continue; 1.6340 + case CHARACTER_REFERENCE_HILO_LOOKUP: 1.6341 + errNoNamedCharacterMatch(); 1.6342 + emitOrAppendStrBuf(returnState); 1.6343 + state = returnState; 1.6344 + continue; 1.6345 + case CHARACTER_REFERENCE_TAIL: 1.6346 + outer: for (;;) { 1.6347 + char c = '\u0000'; 1.6348 + entCol++; 1.6349 + /* 1.6350 + * Consume the maximum number of characters possible, 1.6351 + * with the consumed characters matching one of the 1.6352 + * identifiers in the first column of the named 1.6353 + * character references table (in a case-sensitive 1.6354 + * manner). 1.6355 + */ 1.6356 + hiloop: for (;;) { 1.6357 + if (hi == -1) { 1.6358 + break hiloop; 1.6359 + } 1.6360 + if (entCol == NamedCharacters.NAMES[hi].length()) { 1.6361 + break hiloop; 1.6362 + } 1.6363 + if (entCol > NamedCharacters.NAMES[hi].length()) { 1.6364 + break outer; 1.6365 + } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { 1.6366 + hi--; 1.6367 + } else { 1.6368 + break hiloop; 1.6369 + } 1.6370 + } 1.6371 + 1.6372 + loloop: for (;;) { 1.6373 + if (hi < lo) { 1.6374 + break outer; 1.6375 + } 1.6376 + if (entCol == NamedCharacters.NAMES[lo].length()) { 1.6377 + candidate = lo; 1.6378 + strBufMark = strBufLen; 1.6379 + lo++; 1.6380 + } else if (entCol > NamedCharacters.NAMES[lo].length()) { 1.6381 + break outer; 1.6382 + } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { 1.6383 + lo++; 1.6384 + } else { 1.6385 + break loloop; 1.6386 + } 1.6387 + } 1.6388 + if (hi < lo) { 1.6389 + break outer; 1.6390 + } 1.6391 + continue; 1.6392 + } 1.6393 + 1.6394 + if (candidate == -1) { 1.6395 + /* 1.6396 + * If no match can be made, then this is a parse error. 
1.6397 + */ 1.6398 + errNoNamedCharacterMatch(); 1.6399 + emitOrAppendStrBuf(returnState); 1.6400 + state = returnState; 1.6401 + continue eofloop; 1.6402 + } else { 1.6403 + @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; 1.6404 + if (candidateName.length() == 0 1.6405 + || candidateName.charAt(candidateName.length() - 1) != ';') { 1.6406 + /* 1.6407 + * If the last character matched is not a U+003B 1.6408 + * SEMICOLON (;), there is a parse error. 1.6409 + */ 1.6410 + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 1.6411 + /* 1.6412 + * If the entity is being consumed as part of an 1.6413 + * attribute, and the last character matched is 1.6414 + * not a U+003B SEMICOLON (;), 1.6415 + */ 1.6416 + char ch; 1.6417 + if (strBufMark == strBufLen) { 1.6418 + ch = '\u0000'; 1.6419 + } else { 1.6420 + ch = strBuf[strBufMark]; 1.6421 + } 1.6422 + if ((ch >= '0' && ch <= '9') 1.6423 + || (ch >= 'A' && ch <= 'Z') 1.6424 + || (ch >= 'a' && ch <= 'z')) { 1.6425 + /* 1.6426 + * and the next character is in the range 1.6427 + * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, 1.6428 + * U+0041 LATIN CAPITAL LETTER A to U+005A 1.6429 + * LATIN CAPITAL LETTER Z, or U+0061 LATIN 1.6430 + * SMALL LETTER A to U+007A LATIN SMALL 1.6431 + * LETTER Z, then, for historical reasons, 1.6432 + * all the characters that were matched 1.6433 + * after the U+0026 AMPERSAND (&) must be 1.6434 + * unconsumed, and nothing is returned. 
1.6435 + */ 1.6436 + errNoNamedCharacterMatch(); 1.6437 + appendStrBufToLongStrBuf(); 1.6438 + state = returnState; 1.6439 + continue eofloop; 1.6440 + } 1.6441 + } 1.6442 + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 1.6443 + errUnescapedAmpersandInterpretedAsCharacterReference(); 1.6444 + } else { 1.6445 + errNotSemicolonTerminated(); 1.6446 + } 1.6447 + } 1.6448 + 1.6449 + /* 1.6450 + * Otherwise, return a character token for the character 1.6451 + * corresponding to the entity name (as given by the 1.6452 + * second column of the named character references 1.6453 + * table). 1.6454 + */ 1.6455 + @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; 1.6456 + if ( 1.6457 + // [NOCPP[ 1.6458 + val.length == 1 1.6459 + // ]NOCPP] 1.6460 + // CPPONLY: val[1] == 0 1.6461 + ) { 1.6462 + emitOrAppendOne(val, returnState); 1.6463 + } else { 1.6464 + emitOrAppendTwo(val, returnState); 1.6465 + } 1.6466 + // this is so complicated! 1.6467 + if (strBufMark < strBufLen) { 1.6468 + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 1.6469 + for (int i = strBufMark; i < strBufLen; i++) { 1.6470 + appendLongStrBuf(strBuf[i]); 1.6471 + } 1.6472 + } else { 1.6473 + tokenHandler.characters(strBuf, strBufMark, 1.6474 + strBufLen - strBufMark); 1.6475 + } 1.6476 + } 1.6477 + state = returnState; 1.6478 + continue eofloop; 1.6479 + /* 1.6480 + * If the markup contains I'm ¬it; I tell you, the 1.6481 + * entity is parsed as "not", as in, I'm ¬it; I tell 1.6482 + * you. But if the markup was I'm ∉ I tell you, 1.6483 + * the entity would be parsed as "notin;", resulting in 1.6484 + * I'm ∉ I tell you. 1.6485 + */ 1.6486 + } 1.6487 + case CONSUME_NCR: 1.6488 + case DECIMAL_NRC_LOOP: 1.6489 + case HEX_NCR_LOOP: 1.6490 + /* 1.6491 + * If no characters match the range, then don't consume any 1.6492 + * characters (and unconsume the U+0023 NUMBER SIGN 1.6493 + * character and, if appropriate, the X character). This is 1.6494 + * a parse error; nothing is returned. 
1.6495 + * 1.6496 + * Otherwise, if the next character is a U+003B SEMICOLON, 1.6497 + * consume that too. If it isn't, there is a parse error. 1.6498 + */ 1.6499 + if (!seenDigits) { 1.6500 + errNoDigitsInNCR(); 1.6501 + emitOrAppendStrBuf(returnState); 1.6502 + state = returnState; 1.6503 + continue; 1.6504 + } else { 1.6505 + errCharRefLacksSemicolon(); 1.6506 + } 1.6507 + // WARNING previous state sets reconsume 1.6508 + handleNcrValue(returnState); 1.6509 + state = returnState; 1.6510 + continue; 1.6511 + case CDATA_RSQB: 1.6512 + tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); 1.6513 + break eofloop; 1.6514 + case CDATA_RSQB_RSQB: 1.6515 + tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); 1.6516 + break eofloop; 1.6517 + case DATA: 1.6518 + default: 1.6519 + break eofloop; 1.6520 + } 1.6521 + } 1.6522 + // case DATA: 1.6523 + /* 1.6524 + * EOF Emit an end-of-file token. 1.6525 + */ 1.6526 + tokenHandler.eof(); 1.6527 + return; 1.6528 + } 1.6529 + 1.6530 + private void emitDoctypeToken(int pos) throws SAXException { 1.6531 + cstart = pos + 1; 1.6532 + tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier, 1.6533 + forceQuirks); 1.6534 + // It is OK and sufficient to release these here, since 1.6535 + // there's no way out of the doctype states than through paths 1.6536 + // that call this method. 
1.6537 + doctypeName = null; 1.6538 + Portability.releaseString(publicIdentifier); 1.6539 + publicIdentifier = null; 1.6540 + Portability.releaseString(systemIdentifier); 1.6541 + systemIdentifier = null; 1.6542 + } 1.6543 + 1.6544 + @Inline protected char checkChar(@NoLength char[] buf, int pos) 1.6545 + throws SAXException { 1.6546 + return buf[pos]; 1.6547 + } 1.6548 + 1.6549 + public boolean internalEncodingDeclaration(String internalCharset) 1.6550 + throws SAXException { 1.6551 + if (encodingDeclarationHandler != null) { 1.6552 + return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset); 1.6553 + } 1.6554 + return false; 1.6555 + } 1.6556 + 1.6557 + /** 1.6558 + * @param val 1.6559 + * @throws SAXException 1.6560 + */ 1.6561 + private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState) 1.6562 + throws SAXException { 1.6563 + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 1.6564 + appendLongStrBuf(val[0]); 1.6565 + appendLongStrBuf(val[1]); 1.6566 + } else { 1.6567 + tokenHandler.characters(val, 0, 2); 1.6568 + } 1.6569 + } 1.6570 + 1.6571 + private void emitOrAppendOne(@Const @NoLength char[] val, int returnState) 1.6572 + throws SAXException { 1.6573 + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 1.6574 + appendLongStrBuf(val[0]); 1.6575 + } else { 1.6576 + tokenHandler.characters(val, 0, 1); 1.6577 + } 1.6578 + } 1.6579 + 1.6580 + public void end() throws SAXException { 1.6581 + strBuf = null; 1.6582 + longStrBuf = null; 1.6583 + doctypeName = null; 1.6584 + if (systemIdentifier != null) { 1.6585 + Portability.releaseString(systemIdentifier); 1.6586 + systemIdentifier = null; 1.6587 + } 1.6588 + if (publicIdentifier != null) { 1.6589 + Portability.releaseString(publicIdentifier); 1.6590 + publicIdentifier = null; 1.6591 + } 1.6592 + if (tagName != null) { 1.6593 + tagName.release(); 1.6594 + tagName = null; 1.6595 + } 1.6596 + if (attributeName != null) { 1.6597 + attributeName.release(); 1.6598 + attributeName = 
null; 1.6599 + } 1.6600 + tokenHandler.endTokenization(); 1.6601 + if (attributes != null) { 1.6602 + // [NOCPP[ 1.6603 + attributes = null; 1.6604 + // ]NOCPP] 1.6605 + // CPPONLY: attributes.clear(mappingLangToXmlLang); 1.6606 + } 1.6607 + } 1.6608 + 1.6609 + public void requestSuspension() { 1.6610 + shouldSuspend = true; 1.6611 + } 1.6612 + 1.6613 + // [NOCPP[ 1.6614 + 1.6615 + public void becomeConfident() { 1.6616 + confident = true; 1.6617 + } 1.6618 + 1.6619 + /** 1.6620 + * Returns the nextCharOnNewLine. 1.6621 + * 1.6622 + * @return the nextCharOnNewLine 1.6623 + */ 1.6624 + public boolean isNextCharOnNewLine() { 1.6625 + return false; 1.6626 + } 1.6627 + 1.6628 + public boolean isPrevCR() { 1.6629 + return lastCR; 1.6630 + } 1.6631 + 1.6632 + /** 1.6633 + * Returns the line. 1.6634 + * 1.6635 + * @return the line 1.6636 + */ 1.6637 + public int getLine() { 1.6638 + return -1; 1.6639 + } 1.6640 + 1.6641 + /** 1.6642 + * Returns the col. 1.6643 + * 1.6644 + * @return the col 1.6645 + */ 1.6646 + public int getCol() { 1.6647 + return -1; 1.6648 + } 1.6649 + 1.6650 + // ]NOCPP] 1.6651 + 1.6652 + public boolean isInDataState() { 1.6653 + return (stateSave == DATA); 1.6654 + } 1.6655 + 1.6656 + public void resetToDataState() { 1.6657 + strBufLen = 0; 1.6658 + longStrBufLen = 0; 1.6659 + stateSave = Tokenizer.DATA; 1.6660 + // line = 1; XXX line numbers 1.6661 + lastCR = false; 1.6662 + index = 0; 1.6663 + forceQuirks = false; 1.6664 + additional = '\u0000'; 1.6665 + entCol = -1; 1.6666 + firstCharKey = -1; 1.6667 + lo = 0; 1.6668 + hi = 0; // will always be overwritten before use anyway 1.6669 + candidate = -1; 1.6670 + strBufMark = 0; 1.6671 + prevValue = -1; 1.6672 + value = 0; 1.6673 + seenDigits = false; 1.6674 + endTag = false; 1.6675 + shouldSuspend = false; 1.6676 + initDoctypeFields(); 1.6677 + if (tagName != null) { 1.6678 + tagName.release(); 1.6679 + tagName = null; 1.6680 + } 1.6681 + if (attributeName != null) { 1.6682 + attributeName.release(); 
1.6683 + attributeName = null; 1.6684 + } 1.6685 + if (newAttributesEachTime) { 1.6686 + if (attributes != null) { 1.6687 + Portability.delete(attributes); 1.6688 + attributes = null; 1.6689 + } 1.6690 + } 1.6691 + } 1.6692 + 1.6693 + public void loadState(Tokenizer other) throws SAXException { 1.6694 + strBufLen = other.strBufLen; 1.6695 + if (strBufLen > strBuf.length) { 1.6696 + strBuf = new char[strBufLen]; 1.6697 + } 1.6698 + System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen); 1.6699 + 1.6700 + longStrBufLen = other.longStrBufLen; 1.6701 + if (longStrBufLen > longStrBuf.length) { 1.6702 + longStrBuf = new char[longStrBufLen]; 1.6703 + } 1.6704 + System.arraycopy(other.longStrBuf, 0, longStrBuf, 0, longStrBufLen); 1.6705 + 1.6706 + stateSave = other.stateSave; 1.6707 + returnStateSave = other.returnStateSave; 1.6708 + endTagExpectation = other.endTagExpectation; 1.6709 + endTagExpectationAsArray = other.endTagExpectationAsArray; 1.6710 + // line = 1; XXX line numbers 1.6711 + lastCR = other.lastCR; 1.6712 + index = other.index; 1.6713 + forceQuirks = other.forceQuirks; 1.6714 + additional = other.additional; 1.6715 + entCol = other.entCol; 1.6716 + firstCharKey = other.firstCharKey; 1.6717 + lo = other.lo; 1.6718 + hi = other.hi; 1.6719 + candidate = other.candidate; 1.6720 + strBufMark = other.strBufMark; 1.6721 + prevValue = other.prevValue; 1.6722 + value = other.value; 1.6723 + seenDigits = other.seenDigits; 1.6724 + endTag = other.endTag; 1.6725 + shouldSuspend = false; 1.6726 + 1.6727 + if (other.doctypeName == null) { 1.6728 + doctypeName = null; 1.6729 + } else { 1.6730 + doctypeName = Portability.newLocalFromLocal(other.doctypeName, 1.6731 + interner); 1.6732 + } 1.6733 + 1.6734 + Portability.releaseString(systemIdentifier); 1.6735 + if (other.systemIdentifier == null) { 1.6736 + systemIdentifier = null; 1.6737 + } else { 1.6738 + systemIdentifier = Portability.newStringFromString(other.systemIdentifier); 1.6739 + } 1.6740 + 1.6741 + 
Portability.releaseString(publicIdentifier); 1.6742 + if (other.publicIdentifier == null) { 1.6743 + publicIdentifier = null; 1.6744 + } else { 1.6745 + publicIdentifier = Portability.newStringFromString(other.publicIdentifier); 1.6746 + } 1.6747 + 1.6748 + if (tagName != null) { 1.6749 + tagName.release(); 1.6750 + } 1.6751 + if (other.tagName == null) { 1.6752 + tagName = null; 1.6753 + } else { 1.6754 + tagName = other.tagName.cloneElementName(interner); 1.6755 + } 1.6756 + 1.6757 + if (attributeName != null) { 1.6758 + attributeName.release(); 1.6759 + } 1.6760 + if (other.attributeName == null) { 1.6761 + attributeName = null; 1.6762 + } else { 1.6763 + attributeName = other.attributeName.cloneAttributeName(interner); 1.6764 + } 1.6765 + 1.6766 + Portability.delete(attributes); 1.6767 + if (other.attributes == null) { 1.6768 + attributes = null; 1.6769 + } else { 1.6770 + attributes = other.attributes.cloneAttributes(interner); 1.6771 + } 1.6772 + } 1.6773 + 1.6774 + public void initializeWithoutStarting() throws SAXException { 1.6775 + confident = false; 1.6776 + strBuf = new char[64]; 1.6777 + longStrBuf = new char[1024]; 1.6778 + line = 1; 1.6779 + // [NOCPP[ 1.6780 + html4 = false; 1.6781 + metaBoundaryPassed = false; 1.6782 + wantsComments = tokenHandler.wantsComments(); 1.6783 + if (!newAttributesEachTime) { 1.6784 + attributes = new HtmlAttributes(mappingLangToXmlLang); 1.6785 + } 1.6786 + // ]NOCPP] 1.6787 + resetToDataState(); 1.6788 + } 1.6789 + 1.6790 + protected void errGarbageAfterLtSlash() throws SAXException { 1.6791 + } 1.6792 + 1.6793 + protected void errLtSlashGt() throws SAXException { 1.6794 + } 1.6795 + 1.6796 + protected void errWarnLtSlashInRcdata() throws SAXException { 1.6797 + } 1.6798 + 1.6799 + protected void errHtml4LtSlashInRcdata(char folded) throws SAXException { 1.6800 + } 1.6801 + 1.6802 + protected void errCharRefLacksSemicolon() throws SAXException { 1.6803 + } 1.6804 + 1.6805 + protected void errNoDigitsInNCR() throws 
SAXException { 1.6806 + } 1.6807 + 1.6808 + protected void errGtInSystemId() throws SAXException { 1.6809 + } 1.6810 + 1.6811 + protected void errGtInPublicId() throws SAXException { 1.6812 + } 1.6813 + 1.6814 + protected void errNamelessDoctype() throws SAXException { 1.6815 + } 1.6816 + 1.6817 + protected void errConsecutiveHyphens() throws SAXException { 1.6818 + } 1.6819 + 1.6820 + protected void errPrematureEndOfComment() throws SAXException { 1.6821 + } 1.6822 + 1.6823 + protected void errBogusComment() throws SAXException { 1.6824 + } 1.6825 + 1.6826 + protected void errUnquotedAttributeValOrNull(char c) throws SAXException { 1.6827 + } 1.6828 + 1.6829 + protected void errSlashNotFollowedByGt() throws SAXException { 1.6830 + } 1.6831 + 1.6832 + protected void errHtml4XmlVoidSyntax() throws SAXException { 1.6833 + } 1.6834 + 1.6835 + protected void errNoSpaceBetweenAttributes() throws SAXException { 1.6836 + } 1.6837 + 1.6838 + protected void errHtml4NonNameInUnquotedAttribute(char c) 1.6839 + throws SAXException { 1.6840 + } 1.6841 + 1.6842 + protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c) 1.6843 + throws SAXException { 1.6844 + } 1.6845 + 1.6846 + protected void errAttributeValueMissing() throws SAXException { 1.6847 + } 1.6848 + 1.6849 + protected void errBadCharBeforeAttributeNameOrNull(char c) 1.6850 + throws SAXException { 1.6851 + } 1.6852 + 1.6853 + protected void errEqualsSignBeforeAttributeName() throws SAXException { 1.6854 + } 1.6855 + 1.6856 + protected void errBadCharAfterLt(char c) throws SAXException { 1.6857 + } 1.6858 + 1.6859 + protected void errLtGt() throws SAXException { 1.6860 + } 1.6861 + 1.6862 + protected void errProcessingInstruction() throws SAXException { 1.6863 + } 1.6864 + 1.6865 + protected void errUnescapedAmpersandInterpretedAsCharacterReference() 1.6866 + throws SAXException { 1.6867 + } 1.6868 + 1.6869 + protected void errNotSemicolonTerminated() throws SAXException { 1.6870 + } 1.6871 + 1.6872 + 
protected void errNoNamedCharacterMatch() throws SAXException { 1.6873 + } 1.6874 + 1.6875 + protected void errQuoteBeforeAttributeName(char c) throws SAXException { 1.6876 + } 1.6877 + 1.6878 + protected void errQuoteOrLtInAttributeNameOrNull(char c) 1.6879 + throws SAXException { 1.6880 + } 1.6881 + 1.6882 + protected void errExpectedPublicId() throws SAXException { 1.6883 + } 1.6884 + 1.6885 + protected void errBogusDoctype() throws SAXException { 1.6886 + } 1.6887 + 1.6888 + protected void maybeWarnPrivateUseAstral() throws SAXException { 1.6889 + } 1.6890 + 1.6891 + protected void maybeWarnPrivateUse(char ch) throws SAXException { 1.6892 + } 1.6893 + 1.6894 + protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs) 1.6895 + throws SAXException { 1.6896 + } 1.6897 + 1.6898 + protected void maybeErrSlashInEndTag(boolean selfClosing) 1.6899 + throws SAXException { 1.6900 + } 1.6901 + 1.6902 + protected char errNcrNonCharacter(char ch) throws SAXException { 1.6903 + return ch; 1.6904 + } 1.6905 + 1.6906 + protected void errAstralNonCharacter(int ch) throws SAXException { 1.6907 + } 1.6908 + 1.6909 + protected void errNcrSurrogate() throws SAXException { 1.6910 + } 1.6911 + 1.6912 + protected char errNcrControlChar(char ch) throws SAXException { 1.6913 + return ch; 1.6914 + } 1.6915 + 1.6916 + protected void errNcrCr() throws SAXException { 1.6917 + } 1.6918 + 1.6919 + protected void errNcrInC1Range() throws SAXException { 1.6920 + } 1.6921 + 1.6922 + protected void errEofInPublicId() throws SAXException { 1.6923 + } 1.6924 + 1.6925 + protected void errEofInComment() throws SAXException { 1.6926 + } 1.6927 + 1.6928 + protected void errEofInDoctype() throws SAXException { 1.6929 + } 1.6930 + 1.6931 + protected void errEofInAttributeValue() throws SAXException { 1.6932 + } 1.6933 + 1.6934 + protected void errEofInAttributeName() throws SAXException { 1.6935 + } 1.6936 + 1.6937 + protected void errEofWithoutGt() throws SAXException { 1.6938 + } 1.6939 + 1.6940 
/*
 * More reporting hooks with empty bodies. NOTE(review): they look like
 * extension points for an error-reporting subclass -- confirm against the
 * subclasses. Hooks without a description have no call site in this part
 * of the file; the method name is the only available hint.
 */

protected void errEofInTagName() throws SAXException {
}

protected void errEofInEndTag() throws SAXException {
}

protected void errEofAfterLt() throws SAXException {
}

protected void errNcrOutOfRange() throws SAXException {
}

protected void errNcrUnassigned() throws SAXException {
}

protected void errDuplicateAttribute() throws SAXException {
}

/**
 * Called by the end-of-file handling when the input ends inside a quoted
 * DOCTYPE system identifier.
 */
protected void errEofInSystemId() throws SAXException {
}

protected void errExpectedSystemId() throws SAXException {
}

protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
}

protected void errHyphenHyphenBang() throws SAXException {
}

/**
 * No-argument overload; an overload taking and returning a
 * <code>char</code> also exists in this class.
 */
protected void errNcrControlChar() throws SAXException {
}

protected void errNcrZero() throws SAXException {
}

protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
        throws SAXException {
}

protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
}

protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
        throws SAXException {
}

protected void noteAttributeWithoutValue() throws SAXException {
}

protected void noteUnquotedAttributeValue() throws SAXException {
}

/**
 * Registers the handler that <code>internalEncodingDeclaration()</code>
 * forwards to. Passing <code>null</code> removes the handler, after which
 * <code>internalEncodingDeclaration()</code> returns <code>false</code>.
 *
 * @param encodingDeclarationHandler
 *            the handler to install, or <code>null</code> for none
 */
public void setEncodingDeclarationHandler(
        EncodingDeclarationHandler encodingDeclarationHandler) {
    this.encodingDeclarationHandler = encodingDeclarationHandler;
}

/**
 * Deletes the attribute holder and clears the reference to it.
 */
void destructor() {
    // The translator will write refcount tracing stuff here
    Portability.delete(attributes);
    attributes = null;
}

// [NOCPP[

/**
 * Sets an offset to be added to the position reported to
 * <code>TransitionHandler</code>.
 *
 * This implementation has an empty body and therefore ignores the offset.
 *
 * @param offset
 *            the offset
 */
public void setTransitionBaseOffset(int offset) {

}

// ]NOCPP]

}