parser/html/javasrc/Tokenizer.java

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 * Copyright (c) 2005-2007 Henri Sivonen
michael@0 3 * Copyright (c) 2007-2013 Mozilla Foundation
michael@0 4 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
michael@0 5 * Foundation, and Opera Software ASA.
michael@0 6 *
michael@0 7 * Permission is hereby granted, free of charge, to any person obtaining a
michael@0 8 * copy of this software and associated documentation files (the "Software"),
michael@0 9 * to deal in the Software without restriction, including without limitation
michael@0 10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
michael@0 11 * and/or sell copies of the Software, and to permit persons to whom the
michael@0 12 * Software is furnished to do so, subject to the following conditions:
michael@0 13 *
michael@0 14 * The above copyright notice and this permission notice shall be included in
michael@0 15 * all copies or substantial portions of the Software.
michael@0 16 *
michael@0 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
michael@0 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
michael@0 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
michael@0 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
michael@0 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
michael@0 22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
michael@0 23 * DEALINGS IN THE SOFTWARE.
michael@0 24 */
michael@0 25
michael@0 26 /*
michael@0 27 * The comments following this one that use the same comment syntax as this
michael@0 28 * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
michael@0 29 * amended as of June 18 2008 and May 31 2010.
michael@0 30 * That document came with this statement:
michael@0 31 * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
michael@0 32 * Opera Software ASA. You are granted a license to use, reproduce and
michael@0 33 * create derivative works of this document."
michael@0 34 */
michael@0 35
michael@0 36 package nu.validator.htmlparser.impl;
michael@0 37
michael@0 38 import nu.validator.htmlparser.annotation.Auto;
michael@0 39 import nu.validator.htmlparser.annotation.CharacterName;
michael@0 40 import nu.validator.htmlparser.annotation.Const;
michael@0 41 import nu.validator.htmlparser.annotation.Inline;
michael@0 42 import nu.validator.htmlparser.annotation.Local;
michael@0 43 import nu.validator.htmlparser.annotation.NoLength;
michael@0 44 import nu.validator.htmlparser.common.EncodingDeclarationHandler;
michael@0 45 import nu.validator.htmlparser.common.Interner;
michael@0 46 import nu.validator.htmlparser.common.TokenHandler;
michael@0 47 import nu.validator.htmlparser.common.XmlViolationPolicy;
michael@0 48
michael@0 49 import org.xml.sax.ErrorHandler;
michael@0 50 import org.xml.sax.Locator;
michael@0 51 import org.xml.sax.SAXException;
michael@0 52 import org.xml.sax.SAXParseException;
michael@0 53
michael@0 54 /**
michael@0 55 * An implementation of
michael@0 56 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
michael@0 57 *
michael@0 58 * This class implements the <code>Locator</code> interface. This is not an
michael@0 59 * incidental implementation detail: Users of this class are encouraged to make
michael@0 60 * use of the <code>Locator</code> nature.
michael@0 61 *
michael@0 62 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
michael@0 63 * can be configured to treat these conditions as fatal or to coerce the infoset
michael@0 64 * to something that XML 1.0 allows.
michael@0 65 *
michael@0 66 * @version $Id$
michael@0 67 * @author hsivonen
michael@0 68 */
michael@0 69 public class Tokenizer implements Locator {
michael@0 70
michael@0 71 private static final int DATA_AND_RCDATA_MASK = ~1;
michael@0 72
michael@0 73 public static final int DATA = 0;
michael@0 74
michael@0 75 public static final int RCDATA = 1;
michael@0 76
michael@0 77 public static final int SCRIPT_DATA = 2;
michael@0 78
michael@0 79 public static final int RAWTEXT = 3;
michael@0 80
michael@0 81 public static final int SCRIPT_DATA_ESCAPED = 4;
michael@0 82
michael@0 83 public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
michael@0 84
michael@0 85 public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
michael@0 86
michael@0 87 public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
michael@0 88
michael@0 89 public static final int PLAINTEXT = 8;
michael@0 90
michael@0 91 public static final int TAG_OPEN = 9;
michael@0 92
michael@0 93 public static final int CLOSE_TAG_OPEN = 10;
michael@0 94
michael@0 95 public static final int TAG_NAME = 11;
michael@0 96
michael@0 97 public static final int BEFORE_ATTRIBUTE_NAME = 12;
michael@0 98
michael@0 99 public static final int ATTRIBUTE_NAME = 13;
michael@0 100
michael@0 101 public static final int AFTER_ATTRIBUTE_NAME = 14;
michael@0 102
michael@0 103 public static final int BEFORE_ATTRIBUTE_VALUE = 15;
michael@0 104
michael@0 105 public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
michael@0 106
michael@0 107 public static final int BOGUS_COMMENT = 17;
michael@0 108
michael@0 109 public static final int MARKUP_DECLARATION_OPEN = 18;
michael@0 110
michael@0 111 public static final int DOCTYPE = 19;
michael@0 112
michael@0 113 public static final int BEFORE_DOCTYPE_NAME = 20;
michael@0 114
michael@0 115 public static final int DOCTYPE_NAME = 21;
michael@0 116
michael@0 117 public static final int AFTER_DOCTYPE_NAME = 22;
michael@0 118
michael@0 119 public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
michael@0 120
michael@0 121 public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
michael@0 122
michael@0 123 public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
michael@0 124
michael@0 125 public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
michael@0 126
michael@0 127 public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
michael@0 128
michael@0 129 public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
michael@0 130
michael@0 131 public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
michael@0 132
michael@0 133 public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
michael@0 134
michael@0 135 public static final int BOGUS_DOCTYPE = 31;
michael@0 136
michael@0 137 public static final int COMMENT_START = 32;
michael@0 138
michael@0 139 public static final int COMMENT_START_DASH = 33;
michael@0 140
michael@0 141 public static final int COMMENT = 34;
michael@0 142
michael@0 143 public static final int COMMENT_END_DASH = 35;
michael@0 144
michael@0 145 public static final int COMMENT_END = 36;
michael@0 146
michael@0 147 public static final int COMMENT_END_BANG = 37;
michael@0 148
michael@0 149 public static final int NON_DATA_END_TAG_NAME = 38;
michael@0 150
michael@0 151 public static final int MARKUP_DECLARATION_HYPHEN = 39;
michael@0 152
michael@0 153 public static final int MARKUP_DECLARATION_OCTYPE = 40;
michael@0 154
michael@0 155 public static final int DOCTYPE_UBLIC = 41;
michael@0 156
michael@0 157 public static final int DOCTYPE_YSTEM = 42;
michael@0 158
michael@0 159 public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
michael@0 160
michael@0 161 public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
michael@0 162
michael@0 163 public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
michael@0 164
michael@0 165 public static final int CONSUME_CHARACTER_REFERENCE = 46;
michael@0 166
michael@0 167 public static final int CONSUME_NCR = 47;
michael@0 168
michael@0 169 public static final int CHARACTER_REFERENCE_TAIL = 48;
michael@0 170
michael@0 171 public static final int HEX_NCR_LOOP = 49;
michael@0 172
michael@0 173 public static final int DECIMAL_NRC_LOOP = 50;
michael@0 174
michael@0 175 public static final int HANDLE_NCR_VALUE = 51;
michael@0 176
michael@0 177 public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
michael@0 178
michael@0 179 public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
michael@0 180
michael@0 181 public static final int SELF_CLOSING_START_TAG = 54;
michael@0 182
michael@0 183 public static final int CDATA_START = 55;
michael@0 184
michael@0 185 public static final int CDATA_SECTION = 56;
michael@0 186
michael@0 187 public static final int CDATA_RSQB = 57;
michael@0 188
michael@0 189 public static final int CDATA_RSQB_RSQB = 58;
michael@0 190
michael@0 191 public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
michael@0 192
michael@0 193 public static final int SCRIPT_DATA_ESCAPE_START = 60;
michael@0 194
michael@0 195 public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
michael@0 196
michael@0 197 public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
michael@0 198
michael@0 199 public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
michael@0 200
michael@0 201 public static final int BOGUS_COMMENT_HYPHEN = 64;
michael@0 202
michael@0 203 public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
michael@0 204
michael@0 205 public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
michael@0 206
michael@0 207 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
michael@0 208
michael@0 209 public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
michael@0 210
michael@0 211 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
michael@0 212
michael@0 213 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
michael@0 214
michael@0 215 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
michael@0 216
michael@0 217 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
michael@0 218
michael@0 219 public static final int PROCESSING_INSTRUCTION = 73;
michael@0 220
michael@0 221 public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
michael@0 222
michael@0 223 /**
michael@0 224 * Magic value for UTF-16 operations.
michael@0 225 */
michael@0 226 private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
michael@0 227
michael@0 228 /**
michael@0 229 * UTF-16 code unit array containing less than and greater than for emitting
michael@0 230 * those characters on certain parse errors.
michael@0 231 */
michael@0 232 private static final @NoLength char[] LT_GT = { '<', '>' };
michael@0 233
michael@0 234 /**
michael@0 235 * UTF-16 code unit array containing less than and solidus for emitting
michael@0 236 * those characters on certain parse errors.
michael@0 237 */
michael@0 238 private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
michael@0 239
michael@0 240 /**
michael@0 241 * UTF-16 code unit array containing ]] for emitting those characters on
michael@0 242 * state transitions.
michael@0 243 */
michael@0 244 private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
michael@0 245
michael@0 246 /**
michael@0 247 * Array version of U+FFFD.
michael@0 248 */
michael@0 249 private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
michael@0 250
michael@0 251 // [NOCPP[
michael@0 252
michael@0 253 /**
michael@0 254 * Array version of space.
michael@0 255 */
michael@0 256 private static final @NoLength char[] SPACE = { ' ' };
michael@0 257
michael@0 258 // ]NOCPP]
michael@0 259
michael@0 260 /**
michael@0 261 * Array version of line feed.
michael@0 262 */
michael@0 263 private static final @NoLength char[] LF = { '\n' };
michael@0 264
michael@0 265 /**
michael@0 266 * Buffer growth parameter.
michael@0 267 */
michael@0 268 private static final int BUFFER_GROW_BY = 1024;
michael@0 269
michael@0 270 /**
michael@0 271 * "CDATA[" as <code>char[]</code>
michael@0 272 */
michael@0 273 private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T',
michael@0 274 'A', '[' };
michael@0 275
michael@0 276 /**
michael@0 277 * "octype" as <code>char[]</code>
michael@0 278 */
michael@0 279 private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p',
michael@0 280 'e' };
michael@0 281
michael@0 282 /**
michael@0 283 * "ublic" as <code>char[]</code>
michael@0 284 */
michael@0 285 private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' };
michael@0 286
michael@0 287 /**
michael@0 288 * "ystem" as <code>char[]</code>
michael@0 289 */
michael@0 290 private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' };
michael@0 291
michael@0 292 private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
michael@0 293
michael@0 294 private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
michael@0 295
michael@0 296 private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
michael@0 297
michael@0 298 private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
michael@0 299 'e', 'x', 't' };
michael@0 300
michael@0 301 private static final char[] XMP_ARR = { 'x', 'm', 'p' };
michael@0 302
michael@0 303 private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
michael@0 304 'e', 'a' };
michael@0 305
michael@0 306 private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
michael@0 307
michael@0 308 private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
michael@0 309 'd' };
michael@0 310
michael@0 311 private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
michael@0 312 'p', 't' };
michael@0 313
michael@0 314 private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
michael@0 315 'e', 's' };
michael@0 316
michael@0 317 /**
michael@0 318 * The token handler.
michael@0 319 */
michael@0 320 protected final TokenHandler tokenHandler;
michael@0 321
michael@0 322 protected EncodingDeclarationHandler encodingDeclarationHandler;
michael@0 323
michael@0 324 // [NOCPP[
michael@0 325
michael@0 326 /**
michael@0 327 * The error handler.
michael@0 328 */
michael@0 329 protected ErrorHandler errorHandler;
michael@0 330
michael@0 331 // ]NOCPP]
michael@0 332
michael@0 333 /**
michael@0 334 * Whether the previous char read was CR.
michael@0 335 */
michael@0 336 protected boolean lastCR;
michael@0 337
michael@0 338 protected int stateSave;
michael@0 339
michael@0 340 private int returnStateSave;
michael@0 341
michael@0 342 protected int index;
michael@0 343
michael@0 344 private boolean forceQuirks;
michael@0 345
michael@0 346 private char additional;
michael@0 347
michael@0 348 private int entCol;
michael@0 349
michael@0 350 private int firstCharKey;
michael@0 351
michael@0 352 private int lo;
michael@0 353
michael@0 354 private int hi;
michael@0 355
michael@0 356 private int candidate;
michael@0 357
michael@0 358 private int strBufMark;
michael@0 359
michael@0 360 private int prevValue;
michael@0 361
michael@0 362 protected int value;
michael@0 363
michael@0 364 private boolean seenDigits;
michael@0 365
michael@0 366 protected int cstart;
michael@0 367
michael@0 368 /**
michael@0 369 * The SAX public id for the resource being tokenized. (Only passed back
michael@0 370 * as part of locator data.)
michael@0 371 */
michael@0 372 private String publicId;
michael@0 373
michael@0 374 /**
michael@0 375 * The SAX system id for the resource being tokenized. (Only passed back
michael@0 376 * as part of locator data.)
michael@0 377 */
michael@0 378 private String systemId;
michael@0 379
michael@0 380 /**
michael@0 381 * Buffer for short identifiers.
michael@0 382 */
michael@0 383 private @Auto char[] strBuf;
michael@0 384
michael@0 385 /**
michael@0 386 * Number of significant <code>char</code>s in <code>strBuf</code>.
michael@0 387 */
michael@0 388 private int strBufLen;
michael@0 389
michael@0 390 /**
michael@0 391 * <code>-1</code> to indicate that <code>strBuf</code> is used or otherwise
michael@0 392 * an offset to the main buffer.
michael@0 393 */
michael@0 394 // private int strBufOffset = -1;
michael@0 395 /**
michael@0 396 * Buffer for long strings.
michael@0 397 */
michael@0 398 private @Auto char[] longStrBuf;
michael@0 399
michael@0 400 /**
michael@0 401 * Number of significant <code>char</code>s in <code>longStrBuf</code>.
michael@0 402 */
michael@0 403 private int longStrBufLen;
michael@0 404
michael@0 405 /**
michael@0 406 * <code>-1</code> to indicate that <code>longStrBuf</code> is used or
michael@0 407 * otherwise an offset to the main buffer.
michael@0 408 */
michael@0 409 // private int longStrBufOffset = -1;
michael@0 410
michael@0 411 /**
michael@0 412 * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
michael@0 413 */
michael@0 414 private final @Auto char[] bmpChar;
michael@0 415
michael@0 416 /**
michael@0 417 * Buffer for expanding astral NCRs.
michael@0 418 */
michael@0 419 private final @Auto char[] astralChar;
michael@0 420
michael@0 421 /**
michael@0 422 * The element whose end tag closes the current CDATA or RCDATA element.
michael@0 423 */
michael@0 424 protected ElementName endTagExpectation = null;
michael@0 425
michael@0 426 private char[] endTagExpectationAsArray; // not @Auto!
michael@0 427
michael@0 428 /**
michael@0 429 * <code>true</code> if tokenizing an end tag
michael@0 430 */
michael@0 431 protected boolean endTag;
michael@0 432
michael@0 433 /**
michael@0 434 * The current tag token name.
michael@0 435 */
michael@0 436 private ElementName tagName = null;
michael@0 437
michael@0 438 /**
michael@0 439 * The current attribute name.
michael@0 440 */
michael@0 441 protected AttributeName attributeName = null;
michael@0 442
michael@0 443 // [NOCPP[
michael@0 444
michael@0 445 /**
michael@0 446 * Whether comment tokens are emitted.
michael@0 447 */
michael@0 448 private boolean wantsComments = false;
michael@0 449
michael@0 450 /**
michael@0 451 * <code>true</code> when HTML4-specific additional errors are requested.
michael@0 452 */
michael@0 453 protected boolean html4;
michael@0 454
michael@0 455 /**
michael@0 456 * Whether the stream is past the first 512 bytes.
michael@0 457 */
michael@0 458 private boolean metaBoundaryPassed;
michael@0 459
michael@0 460 // ]NOCPP]
michael@0 461
michael@0 462 /**
michael@0 463 * The name of the current doctype token.
michael@0 464 */
michael@0 465 private @Local String doctypeName;
michael@0 466
michael@0 467 /**
michael@0 468 * The public id of the current doctype token.
michael@0 469 */
michael@0 470 private String publicIdentifier;
michael@0 471
michael@0 472 /**
michael@0 473 * The system id of the current doctype token.
michael@0 474 */
michael@0 475 private String systemIdentifier;
michael@0 476
michael@0 477 /**
michael@0 478 * The attribute holder.
michael@0 479 */
michael@0 480 private HtmlAttributes attributes;
michael@0 481
michael@0 482 // [NOCPP[
michael@0 483
michael@0 484 /**
michael@0 485 * The policy for vertical tab and form feed.
michael@0 486 */
michael@0 487 private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
michael@0 488
michael@0 489 /**
michael@0 490 * The policy for comments.
michael@0 491 */
michael@0 492 private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
michael@0 493
michael@0 494 private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
michael@0 495
michael@0 496 private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
michael@0 497
michael@0 498 private boolean html4ModeCompatibleWithXhtml1Schemata;
michael@0 499
michael@0 500 private int mappingLangToXmlLang;
michael@0 501
michael@0 502 // ]NOCPP]
michael@0 503
michael@0 504 private final boolean newAttributesEachTime;
michael@0 505
michael@0 506 private boolean shouldSuspend;
michael@0 507
michael@0 508 protected boolean confident;
michael@0 509
michael@0 510 private int line;
michael@0 511
michael@0 512 private Interner interner;
michael@0 513
michael@0 514 // CPPONLY: private boolean viewingXmlSource;
michael@0 515
michael@0 516 // [NOCPP[
michael@0 517
michael@0 518 protected LocatorImpl ampersandLocation;
michael@0 519
/**
 * Creates a tokenizer with an explicit attribute-allocation mode.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 * @param newAttributesEachTime
 *            stored as-is; when <code>true</code>,
 *            <code>emptyAttributes()</code> hands out a new
 *            <code>HtmlAttributes</code> on every call
 */
public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
    // Collaborators and configuration.
    this.tokenHandler = tokenHandler;
    this.newAttributesEachTime = newAttributesEachTime;
    this.encodingDeclarationHandler = null;

    // Scratch buffers for expanding numeric character references.
    this.astralChar = new char[2];
    this.bmpChar = new char[1];

    // No token is being built yet.
    this.attributes = null;
    this.attributeName = null;
    this.tagName = null;
    this.systemIdentifier = null;
    this.publicIdentifier = null;
    this.doctypeName = null;
}
michael@0 533
michael@0 534 // ]NOCPP]
michael@0 535
/**
 * Creates a tokenizer that reports to the given handler. In the Java build
 * <code>newAttributesEachTime</code> is fixed to <code>false</code> and no
 * attribute holder is allocated up front.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 */
public Tokenizer(TokenHandler tokenHandler
// CPPONLY: , boolean viewingXmlSource
) {
    this.tokenHandler = tokenHandler;
    this.encodingDeclarationHandler = null;
    // [NOCPP[
    this.newAttributesEachTime = false;
    // ]NOCPP]
    // Scratch buffers for expanding numeric character references.
    this.astralChar = new char[2];
    this.bmpChar = new char[1];
    // No token is being built yet.
    this.systemIdentifier = null;
    this.publicIdentifier = null;
    this.doctypeName = null;
    this.attributeName = null;
    this.tagName = null;
    // [NOCPP[
    this.attributes = null;
    // ]NOCPP]
    // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
    // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
    // CPPONLY: this.viewingXmlSource = viewingXmlSource;
}
michael@0 564
michael@0 565 public void setInterner(Interner interner) {
michael@0 566 this.interner = interner;
michael@0 567 }
michael@0 568
michael@0 569 public void initLocation(String newPublicId, String newSystemId) {
michael@0 570 this.systemId = newSystemId;
michael@0 571 this.publicId = newPublicId;
michael@0 572
michael@0 573 }
michael@0 574
michael@0 575 // CPPONLY: boolean isViewingXmlSource() {
michael@0 576 // CPPONLY: return viewingXmlSource;
michael@0 577 // CPPONLY: }
michael@0 578
michael@0 579 // [NOCPP[
michael@0 580
michael@0 581 /**
michael@0 582 * Returns the mappingLangToXmlLang.
michael@0 583 *
michael@0 584 * @return the mappingLangToXmlLang
michael@0 585 */
michael@0 586 public boolean isMappingLangToXmlLang() {
michael@0 587 return mappingLangToXmlLang == AttributeName.HTML_LANG;
michael@0 588 }
michael@0 589
michael@0 590 /**
michael@0 591 * Sets the mappingLangToXmlLang.
michael@0 592 *
michael@0 593 * @param mappingLangToXmlLang
michael@0 594 * the mappingLangToXmlLang to set
michael@0 595 */
michael@0 596 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
michael@0 597 this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
michael@0 598 : AttributeName.HTML;
michael@0 599 }
michael@0 600
michael@0 601 /**
michael@0 602 * Sets the error handler.
michael@0 603 *
michael@0 604 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
michael@0 605 */
michael@0 606 public void setErrorHandler(ErrorHandler eh) {
michael@0 607 this.errorHandler = eh;
michael@0 608 }
michael@0 609
michael@0 610 public ErrorHandler getErrorHandler() {
michael@0 611 return this.errorHandler;
michael@0 612 }
michael@0 613
michael@0 614 /**
michael@0 615 * Sets the commentPolicy.
michael@0 616 *
michael@0 617 * @param commentPolicy
michael@0 618 * the commentPolicy to set
michael@0 619 */
michael@0 620 public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
michael@0 621 this.commentPolicy = commentPolicy;
michael@0 622 }
michael@0 623
michael@0 624 /**
michael@0 625 * Sets the contentNonXmlCharPolicy.
michael@0 626 *
michael@0 627 * @param contentNonXmlCharPolicy
michael@0 628 * the contentNonXmlCharPolicy to set
michael@0 629 */
michael@0 630 public void setContentNonXmlCharPolicy(
michael@0 631 XmlViolationPolicy contentNonXmlCharPolicy) {
michael@0 632 if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
michael@0 633 throw new IllegalArgumentException(
michael@0 634 "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
michael@0 635 }
michael@0 636 }
michael@0 637
michael@0 638 /**
michael@0 639 * Sets the contentSpacePolicy.
michael@0 640 *
michael@0 641 * @param contentSpacePolicy
michael@0 642 * the contentSpacePolicy to set
michael@0 643 */
michael@0 644 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
michael@0 645 this.contentSpacePolicy = contentSpacePolicy;
michael@0 646 }
michael@0 647
michael@0 648 /**
michael@0 649 * Sets the xmlnsPolicy.
michael@0 650 *
michael@0 651 * @param xmlnsPolicy
michael@0 652 * the xmlnsPolicy to set
michael@0 653 */
michael@0 654 public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
michael@0 655 if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
michael@0 656 throw new IllegalArgumentException("Can't use FATAL here.");
michael@0 657 }
michael@0 658 this.xmlnsPolicy = xmlnsPolicy;
michael@0 659 }
michael@0 660
michael@0 661 public void setNamePolicy(XmlViolationPolicy namePolicy) {
michael@0 662 this.namePolicy = namePolicy;
michael@0 663 }
michael@0 664
michael@0 665 /**
michael@0 666 * Sets the html4ModeCompatibleWithXhtml1Schemata.
michael@0 667 *
michael@0 668 * @param html4ModeCompatibleWithXhtml1Schemata
michael@0 669 * the html4ModeCompatibleWithXhtml1Schemata to set
michael@0 670 */
michael@0 671 public void setHtml4ModeCompatibleWithXhtml1Schemata(
michael@0 672 boolean html4ModeCompatibleWithXhtml1Schemata) {
michael@0 673 this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
michael@0 674 }
michael@0 675
michael@0 676 // ]NOCPP]
michael@0 677
michael@0 678 // For the token handler to call
michael@0 679 /**
michael@0 680 * Sets the tokenizer state and the associated element name. This should
michael@0 681 * only ever used to put the tokenizer into one of the states that have
michael@0 682 * a special end tag expectation.
michael@0 683 *
michael@0 684 * @param specialTokenizerState
michael@0 685 * the tokenizer state to set
michael@0 686 * @param endTagExpectation
michael@0 687 * the expected end tag for transitioning back to normal
michael@0 688 */
michael@0 689 public void setStateAndEndTagExpectation(int specialTokenizerState,
michael@0 690 @Local String endTagExpectation) {
michael@0 691 this.stateSave = specialTokenizerState;
michael@0 692 if (specialTokenizerState == Tokenizer.DATA) {
michael@0 693 return;
michael@0 694 }
michael@0 695 @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
michael@0 696 this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,
michael@0 697 asArray.length, interner);
michael@0 698 endTagExpectationToArray();
michael@0 699 }
michael@0 700
michael@0 701 /**
michael@0 702 * Sets the tokenizer state and the associated element name. This should
michael@0 703 * only ever used to put the tokenizer into one of the states that have
michael@0 704 * a special end tag expectation.
michael@0 705 *
michael@0 706 * @param specialTokenizerState
michael@0 707 * the tokenizer state to set
michael@0 708 * @param endTagExpectation
michael@0 709 * the expected end tag for transitioning back to normal
michael@0 710 */
michael@0 711 public void setStateAndEndTagExpectation(int specialTokenizerState,
michael@0 712 ElementName endTagExpectation) {
michael@0 713 this.stateSave = specialTokenizerState;
michael@0 714 this.endTagExpectation = endTagExpectation;
michael@0 715 endTagExpectationToArray();
michael@0 716 }
michael@0 717
michael@0 718 private void endTagExpectationToArray() {
michael@0 719 switch (endTagExpectation.getGroup()) {
michael@0 720 case TreeBuilder.TITLE:
michael@0 721 endTagExpectationAsArray = TITLE_ARR;
michael@0 722 return;
michael@0 723 case TreeBuilder.SCRIPT:
michael@0 724 endTagExpectationAsArray = SCRIPT_ARR;
michael@0 725 return;
michael@0 726 case TreeBuilder.STYLE:
michael@0 727 endTagExpectationAsArray = STYLE_ARR;
michael@0 728 return;
michael@0 729 case TreeBuilder.PLAINTEXT:
michael@0 730 endTagExpectationAsArray = PLAINTEXT_ARR;
michael@0 731 return;
michael@0 732 case TreeBuilder.XMP:
michael@0 733 endTagExpectationAsArray = XMP_ARR;
michael@0 734 return;
michael@0 735 case TreeBuilder.TEXTAREA:
michael@0 736 endTagExpectationAsArray = TEXTAREA_ARR;
michael@0 737 return;
michael@0 738 case TreeBuilder.IFRAME:
michael@0 739 endTagExpectationAsArray = IFRAME_ARR;
michael@0 740 return;
michael@0 741 case TreeBuilder.NOEMBED:
michael@0 742 endTagExpectationAsArray = NOEMBED_ARR;
michael@0 743 return;
michael@0 744 case TreeBuilder.NOSCRIPT:
michael@0 745 endTagExpectationAsArray = NOSCRIPT_ARR;
michael@0 746 return;
michael@0 747 case TreeBuilder.NOFRAMES:
michael@0 748 endTagExpectationAsArray = NOFRAMES_ARR;
michael@0 749 return;
michael@0 750 default:
michael@0 751 assert false: "Bad end tag expectation.";
michael@0 752 return;
michael@0 753 }
michael@0 754 }
michael@0 755
michael@0 756 /**
michael@0 757 * For C++ use only.
michael@0 758 */
michael@0 759 public void setLineNumber(int line) {
michael@0 760 this.line = line;
michael@0 761 }
michael@0 762
michael@0 763 // start Locator impl
michael@0 764
michael@0 765 /**
michael@0 766 * @see org.xml.sax.Locator#getLineNumber()
michael@0 767 */
michael@0 768 @Inline public int getLineNumber() {
michael@0 769 return line;
michael@0 770 }
michael@0 771
michael@0 772 // [NOCPP[
michael@0 773
michael@0 774 /**
michael@0 775 * @see org.xml.sax.Locator#getColumnNumber()
michael@0 776 */
michael@0 777 @Inline public int getColumnNumber() {
michael@0 778 return -1;
michael@0 779 }
michael@0 780
michael@0 781 /**
michael@0 782 * @see org.xml.sax.Locator#getPublicId()
michael@0 783 */
michael@0 784 public String getPublicId() {
michael@0 785 return publicId;
michael@0 786 }
michael@0 787
michael@0 788 /**
michael@0 789 * @see org.xml.sax.Locator#getSystemId()
michael@0 790 */
michael@0 791 public String getSystemId() {
michael@0 792 return systemId;
michael@0 793 }
michael@0 794
michael@0 795 // end Locator impl
michael@0 796
michael@0 797 // end public API
michael@0 798
michael@0 799 public void notifyAboutMetaBoundary() {
michael@0 800 metaBoundaryPassed = true;
michael@0 801 }
michael@0 802
michael@0 803 void turnOnAdditionalHtml4Errors() {
michael@0 804 html4 = true;
michael@0 805 }
michael@0 806
michael@0 807 // ]NOCPP]
michael@0 808
michael@0 809 HtmlAttributes emptyAttributes() {
michael@0 810 // [NOCPP[
michael@0 811 if (newAttributesEachTime) {
michael@0 812 return new HtmlAttributes(mappingLangToXmlLang);
michael@0 813 } else {
michael@0 814 // ]NOCPP]
michael@0 815 return HtmlAttributes.EMPTY_ATTRIBUTES;
michael@0 816 // [NOCPP[
michael@0 817 }
michael@0 818 // ]NOCPP]
michael@0 819 }
michael@0 820
michael@0 821 @Inline private void clearStrBufAndAppend(char c) {
michael@0 822 strBuf[0] = c;
michael@0 823 strBufLen = 1;
michael@0 824 }
michael@0 825
michael@0 826 @Inline private void clearStrBuf() {
michael@0 827 strBufLen = 0;
michael@0 828 }
michael@0 829
michael@0 830 /**
michael@0 831 * Appends to the smaller buffer.
michael@0 832 *
michael@0 833 * @param c
michael@0 834 * the UTF-16 code unit to append
michael@0 835 */
michael@0 836 private void appendStrBuf(char c) {
michael@0 837 if (strBufLen == strBuf.length) {
michael@0 838 char[] newBuf = new char[strBuf.length + Tokenizer.BUFFER_GROW_BY];
michael@0 839 System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
michael@0 840 strBuf = newBuf;
michael@0 841 }
michael@0 842 strBuf[strBufLen++] = c;
michael@0 843 }
michael@0 844
michael@0 845 /**
michael@0 846 * The smaller buffer as a String. Currently only used for error reporting.
michael@0 847 *
michael@0 848 * <p>
michael@0 849 * C++ memory note: The return value must be released.
michael@0 850 *
michael@0 851 * @return the smaller buffer as a string
michael@0 852 */
michael@0 853 protected String strBufToString() {
michael@0 854 return Portability.newStringFromBuffer(strBuf, 0, strBufLen);
michael@0 855 }
michael@0 856
michael@0 857 /**
michael@0 858 * Returns the short buffer as a local name. The return value is released in
michael@0 859 * emitDoctypeToken().
michael@0 860 *
michael@0 861 * @return the smaller buffer as local name
michael@0 862 */
michael@0 863 private void strBufToDoctypeName() {
michael@0 864 doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
michael@0 865 interner);
michael@0 866 }
michael@0 867
michael@0 868 /**
michael@0 869 * Emits the smaller buffer as character tokens.
michael@0 870 *
michael@0 871 * @throws SAXException
michael@0 872 * if the token handler threw
michael@0 873 */
michael@0 874 private void emitStrBuf() throws SAXException {
michael@0 875 if (strBufLen > 0) {
michael@0 876 tokenHandler.characters(strBuf, 0, strBufLen);
michael@0 877 }
michael@0 878 }
michael@0 879
michael@0 880 @Inline private void clearLongStrBuf() {
michael@0 881 longStrBufLen = 0;
michael@0 882 }
michael@0 883
michael@0 884 @Inline private void clearLongStrBufAndAppend(char c) {
michael@0 885 longStrBuf[0] = c;
michael@0 886 longStrBufLen = 1;
michael@0 887 }
michael@0 888
michael@0 889 /**
michael@0 890 * Appends to the larger buffer.
michael@0 891 *
michael@0 892 * @param c
michael@0 893 * the UTF-16 code unit to append
michael@0 894 */
michael@0 895 private void appendLongStrBuf(char c) {
michael@0 896 if (longStrBufLen == longStrBuf.length) {
michael@0 897 char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)];
michael@0 898 System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
michael@0 899 longStrBuf = newBuf;
michael@0 900 }
michael@0 901 longStrBuf[longStrBufLen++] = c;
michael@0 902 }
michael@0 903
michael@0 904 @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
michael@0 905 // [NOCPP[
michael@0 906 switch (commentPolicy) {
michael@0 907 case ALTER_INFOSET:
michael@0 908 // detachLongStrBuf();
michael@0 909 appendLongStrBuf(' ');
michael@0 910 // FALLTHROUGH
michael@0 911 case ALLOW:
michael@0 912 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
michael@0 913 // ]NOCPP]
michael@0 914 appendLongStrBuf('-');
michael@0 915 // [NOCPP[
michael@0 916 break;
michael@0 917 case FATAL:
michael@0 918 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
michael@0 919 break;
michael@0 920 }
michael@0 921 // ]NOCPP]
michael@0 922 }
michael@0 923
michael@0 924 // [NOCPP[
michael@0 925 private void maybeAppendSpaceToBogusComment() throws SAXException {
michael@0 926 switch (commentPolicy) {
michael@0 927 case ALTER_INFOSET:
michael@0 928 // detachLongStrBuf();
michael@0 929 appendLongStrBuf(' ');
michael@0 930 // FALLTHROUGH
michael@0 931 case ALLOW:
michael@0 932 warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
michael@0 933 break;
michael@0 934 case FATAL:
michael@0 935 fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
michael@0 936 break;
michael@0 937 }
michael@0 938 }
michael@0 939
michael@0 940 // ]NOCPP]
michael@0 941
michael@0 942 @Inline private void adjustDoubleHyphenAndAppendToLongStrBufAndErr(char c)
michael@0 943 throws SAXException {
michael@0 944 errConsecutiveHyphens();
michael@0 945 // [NOCPP[
michael@0 946 switch (commentPolicy) {
michael@0 947 case ALTER_INFOSET:
michael@0 948 // detachLongStrBuf();
michael@0 949 longStrBufLen--;
michael@0 950 appendLongStrBuf(' ');
michael@0 951 appendLongStrBuf('-');
michael@0 952 // FALLTHROUGH
michael@0 953 case ALLOW:
michael@0 954 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
michael@0 955 // ]NOCPP]
michael@0 956 appendLongStrBuf(c);
michael@0 957 // [NOCPP[
michael@0 958 break;
michael@0 959 case FATAL:
michael@0 960 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
michael@0 961 break;
michael@0 962 }
michael@0 963 // ]NOCPP]
michael@0 964 }
michael@0 965
michael@0 966 private void appendLongStrBuf(@NoLength char[] buffer, int offset, int length) {
michael@0 967 int reqLen = longStrBufLen + length;
michael@0 968 if (longStrBuf.length < reqLen) {
michael@0 969 char[] newBuf = new char[reqLen + (reqLen >> 1)];
michael@0 970 System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
michael@0 971 longStrBuf = newBuf;
michael@0 972 }
michael@0 973 System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length);
michael@0 974 longStrBufLen = reqLen;
michael@0 975 }
michael@0 976
michael@0 977 /**
michael@0 978 * Append the contents of the smaller buffer to the larger one.
michael@0 979 */
michael@0 980 @Inline private void appendStrBufToLongStrBuf() {
michael@0 981 appendLongStrBuf(strBuf, 0, strBufLen);
michael@0 982 }
michael@0 983
michael@0 984 /**
michael@0 985 * The larger buffer as a string.
michael@0 986 *
michael@0 987 * <p>
michael@0 988 * C++ memory note: The return value must be released.
michael@0 989 *
michael@0 990 * @return the larger buffer as a string
michael@0 991 */
michael@0 992 private String longStrBufToString() {
michael@0 993 return Portability.newStringFromBuffer(longStrBuf, 0, longStrBufLen);
michael@0 994 }
michael@0 995
michael@0 996 /**
michael@0 997 * Emits the current comment token.
michael@0 998 *
michael@0 999 * @param pos
michael@0 1000 * TODO
michael@0 1001 *
michael@0 1002 * @throws SAXException
michael@0 1003 */
michael@0 1004 private void emitComment(int provisionalHyphens, int pos)
michael@0 1005 throws SAXException {
michael@0 1006 // [NOCPP[
michael@0 1007 if (wantsComments) {
michael@0 1008 // ]NOCPP]
michael@0 1009 // if (longStrBufOffset != -1) {
michael@0 1010 // tokenHandler.comment(buf, longStrBufOffset, longStrBufLen
michael@0 1011 // - provisionalHyphens);
michael@0 1012 // } else {
michael@0 1013 tokenHandler.comment(longStrBuf, 0, longStrBufLen
michael@0 1014 - provisionalHyphens);
michael@0 1015 // }
michael@0 1016 // [NOCPP[
michael@0 1017 }
michael@0 1018 // ]NOCPP]
michael@0 1019 cstart = pos + 1;
michael@0 1020 }
michael@0 1021
michael@0 1022 /**
michael@0 1023 * Flushes coalesced character tokens.
michael@0 1024 *
michael@0 1025 * @param buf
michael@0 1026 * TODO
michael@0 1027 * @param pos
michael@0 1028 * TODO
michael@0 1029 *
michael@0 1030 * @throws SAXException
michael@0 1031 */
michael@0 1032 protected void flushChars(@NoLength char[] buf, int pos)
michael@0 1033 throws SAXException {
michael@0 1034 if (pos > cstart) {
michael@0 1035 tokenHandler.characters(buf, cstart, pos - cstart);
michael@0 1036 }
michael@0 1037 cstart = Integer.MAX_VALUE;
michael@0 1038 }
michael@0 1039
michael@0 1040 /**
michael@0 1041 * Reports an condition that would make the infoset incompatible with XML
michael@0 1042 * 1.0 as fatal.
michael@0 1043 *
michael@0 1044 * @param message
michael@0 1045 * the message
michael@0 1046 * @throws SAXException
michael@0 1047 * @throws SAXParseException
michael@0 1048 */
michael@0 1049 public void fatal(String message) throws SAXException {
michael@0 1050 SAXParseException spe = new SAXParseException(message, this);
michael@0 1051 if (errorHandler != null) {
michael@0 1052 errorHandler.fatalError(spe);
michael@0 1053 }
michael@0 1054 throw spe;
michael@0 1055 }
michael@0 1056
michael@0 1057 /**
michael@0 1058 * Reports a Parse Error.
michael@0 1059 *
michael@0 1060 * @param message
michael@0 1061 * the message
michael@0 1062 * @throws SAXException
michael@0 1063 */
michael@0 1064 public void err(String message) throws SAXException {
michael@0 1065 if (errorHandler == null) {
michael@0 1066 return;
michael@0 1067 }
michael@0 1068 SAXParseException spe = new SAXParseException(message, this);
michael@0 1069 errorHandler.error(spe);
michael@0 1070 }
michael@0 1071
michael@0 1072 public void errTreeBuilder(String message) throws SAXException {
michael@0 1073 ErrorHandler eh = null;
michael@0 1074 if (tokenHandler instanceof TreeBuilder<?>) {
michael@0 1075 TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
michael@0 1076 eh = treeBuilder.getErrorHandler();
michael@0 1077 }
michael@0 1078 if (eh == null) {
michael@0 1079 eh = errorHandler;
michael@0 1080 }
michael@0 1081 if (eh == null) {
michael@0 1082 return;
michael@0 1083 }
michael@0 1084 SAXParseException spe = new SAXParseException(message, this);
michael@0 1085 eh.error(spe);
michael@0 1086 }
michael@0 1087
michael@0 1088 /**
michael@0 1089 * Reports a warning
michael@0 1090 *
michael@0 1091 * @param message
michael@0 1092 * the message
michael@0 1093 * @throws SAXException
michael@0 1094 */
michael@0 1095 public void warn(String message) throws SAXException {
michael@0 1096 if (errorHandler == null) {
michael@0 1097 return;
michael@0 1098 }
michael@0 1099 SAXParseException spe = new SAXParseException(message, this);
michael@0 1100 errorHandler.warning(spe);
michael@0 1101 }
michael@0 1102
michael@0 1103 private void strBufToElementNameString() {
michael@0 1104 // if (strBufOffset != -1) {
michael@0 1105 // return ElementName.elementNameByBuffer(buf, strBufOffset, strBufLen);
michael@0 1106 // } else {
michael@0 1107 tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
michael@0 1108 interner);
michael@0 1109 // }
michael@0 1110 }
michael@0 1111
michael@0 1112 private int emitCurrentTagToken(boolean selfClosing, int pos)
michael@0 1113 throws SAXException {
michael@0 1114 cstart = pos + 1;
michael@0 1115 maybeErrSlashInEndTag(selfClosing);
michael@0 1116 stateSave = Tokenizer.DATA;
michael@0 1117 HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
michael@0 1118 : attributes);
michael@0 1119 if (endTag) {
michael@0 1120 /*
michael@0 1121 * When an end tag token is emitted, the content model flag must be
michael@0 1122 * switched to the PCDATA state.
michael@0 1123 */
michael@0 1124 maybeErrAttributesOnEndTag(attrs);
michael@0 1125 // CPPONLY: if (!viewingXmlSource) {
michael@0 1126 tokenHandler.endTag(tagName);
michael@0 1127 // CPPONLY: }
michael@0 1128 // CPPONLY: if (newAttributesEachTime) {
michael@0 1129 // CPPONLY: Portability.delete(attributes);
michael@0 1130 // CPPONLY: attributes = null;
michael@0 1131 // CPPONLY: }
michael@0 1132 } else {
michael@0 1133 // CPPONLY: if (viewingXmlSource) {
michael@0 1134 // CPPONLY: assert newAttributesEachTime;
michael@0 1135 // CPPONLY: Portability.delete(attributes);
michael@0 1136 // CPPONLY: attributes = null;
michael@0 1137 // CPPONLY: } else {
michael@0 1138 tokenHandler.startTag(tagName, attrs, selfClosing);
michael@0 1139 // CPPONLY: }
michael@0 1140 }
michael@0 1141 tagName.release();
michael@0 1142 tagName = null;
michael@0 1143 if (newAttributesEachTime) {
michael@0 1144 attributes = null;
michael@0 1145 } else {
michael@0 1146 attributes.clear(mappingLangToXmlLang);
michael@0 1147 }
michael@0 1148 /*
michael@0 1149 * The token handler may have called setStateAndEndTagExpectation
michael@0 1150 * and changed stateSave since the start of this method.
michael@0 1151 */
michael@0 1152 return stateSave;
michael@0 1153 }
michael@0 1154
michael@0 1155 private void attributeNameComplete() throws SAXException {
michael@0 1156 // if (strBufOffset != -1) {
michael@0 1157 // attributeName = AttributeName.nameByBuffer(buf, strBufOffset,
michael@0 1158 // strBufLen, namePolicy != XmlViolationPolicy.ALLOW);
michael@0 1159 // } else {
michael@0 1160 attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
michael@0 1161 // [NOCPP[
michael@0 1162 , namePolicy != XmlViolationPolicy.ALLOW
michael@0 1163 // ]NOCPP]
michael@0 1164 , interner);
michael@0 1165 // }
michael@0 1166
michael@0 1167 if (attributes == null) {
michael@0 1168 attributes = new HtmlAttributes(mappingLangToXmlLang);
michael@0 1169 }
michael@0 1170
michael@0 1171 /*
michael@0 1172 * When the user agent leaves the attribute name state (and before
michael@0 1173 * emitting the tag token, if appropriate), the complete attribute's
michael@0 1174 * name must be compared to the other attributes on the same token; if
michael@0 1175 * there is already an attribute on the token with the exact same name,
michael@0 1176 * then this is a parse error and the new attribute must be dropped,
michael@0 1177 * along with the value that gets associated with it (if any).
michael@0 1178 */
michael@0 1179 if (attributes.contains(attributeName)) {
michael@0 1180 errDuplicateAttribute();
michael@0 1181 attributeName.release();
michael@0 1182 attributeName = null;
michael@0 1183 }
michael@0 1184 }
michael@0 1185
michael@0 1186 private void addAttributeWithoutValue() throws SAXException {
michael@0 1187 noteAttributeWithoutValue();
michael@0 1188
michael@0 1189 // [NOCPP[
michael@0 1190 if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
michael@0 1191 && ElementName.META == tagName) {
michael@0 1192 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
michael@0 1193 }
michael@0 1194 // ]NOCPP]
michael@0 1195 if (attributeName != null) {
michael@0 1196 // [NOCPP[
michael@0 1197 if (html4) {
michael@0 1198 if (attributeName.isBoolean()) {
michael@0 1199 if (html4ModeCompatibleWithXhtml1Schemata) {
michael@0 1200 attributes.addAttribute(attributeName,
michael@0 1201 attributeName.getLocal(AttributeName.HTML),
michael@0 1202 xmlnsPolicy);
michael@0 1203 } else {
michael@0 1204 attributes.addAttribute(attributeName, "", xmlnsPolicy);
michael@0 1205 }
michael@0 1206 } else {
michael@0 1207 if (AttributeName.BORDER != attributeName) {
michael@0 1208 err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
michael@0 1209 attributes.addAttribute(attributeName, "", xmlnsPolicy);
michael@0 1210 }
michael@0 1211 }
michael@0 1212 } else {
michael@0 1213 if (AttributeName.SRC == attributeName
michael@0 1214 || AttributeName.HREF == attributeName) {
michael@0 1215 warn("Attribute \u201C"
michael@0 1216 + attributeName.getLocal(AttributeName.HTML)
michael@0 1217 + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
michael@0 1218 }
michael@0 1219 // ]NOCPP]
michael@0 1220 attributes.addAttribute(attributeName,
michael@0 1221 Portability.newEmptyString()
michael@0 1222 // [NOCPP[
michael@0 1223 , xmlnsPolicy
michael@0 1224 // ]NOCPP]
michael@0 1225 );
michael@0 1226 // [NOCPP[
michael@0 1227 }
michael@0 1228 // ]NOCPP]
michael@0 1229 attributeName = null; // attributeName has been adopted by the
michael@0 1230 // |attributes| object
michael@0 1231 }
michael@0 1232 }
michael@0 1233
michael@0 1234 private void addAttributeWithValue() throws SAXException {
michael@0 1235 // [NOCPP[
michael@0 1236 if (metaBoundaryPassed && ElementName.META == tagName
michael@0 1237 && AttributeName.CHARSET == attributeName) {
michael@0 1238 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
michael@0 1239 }
michael@0 1240 // ]NOCPP]
michael@0 1241 if (attributeName != null) {
michael@0 1242 String val = longStrBufToString(); // Ownership transferred to
michael@0 1243 // HtmlAttributes
michael@0 1244 // CPPONLY: if (mViewSource) {
michael@0 1245 // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
michael@0 1246 // CPPONLY: }
michael@0 1247 // [NOCPP[
michael@0 1248 if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
michael@0 1249 && attributeName.isCaseFolded()) {
michael@0 1250 val = newAsciiLowerCaseStringFromString(val);
michael@0 1251 }
michael@0 1252 // ]NOCPP]
michael@0 1253 attributes.addAttribute(attributeName, val
michael@0 1254 // [NOCPP[
michael@0 1255 , xmlnsPolicy
michael@0 1256 // ]NOCPP]
michael@0 1257 );
michael@0 1258 attributeName = null; // attributeName has been adopted by the
michael@0 1259 // |attributes| object
michael@0 1260 }
michael@0 1261 }
michael@0 1262
michael@0 1263 // [NOCPP[
michael@0 1264
michael@0 1265 private static String newAsciiLowerCaseStringFromString(String str) {
michael@0 1266 if (str == null) {
michael@0 1267 return null;
michael@0 1268 }
michael@0 1269 char[] buf = new char[str.length()];
michael@0 1270 for (int i = 0; i < str.length(); i++) {
michael@0 1271 char c = str.charAt(i);
michael@0 1272 if (c >= 'A' && c <= 'Z') {
michael@0 1273 c += 0x20;
michael@0 1274 }
michael@0 1275 buf[i] = c;
michael@0 1276 }
michael@0 1277 return new String(buf);
michael@0 1278 }
michael@0 1279
michael@0 1280 protected void startErrorReporting() throws SAXException {
michael@0 1281
michael@0 1282 }
michael@0 1283
michael@0 1284 // ]NOCPP]
michael@0 1285
michael@0 1286 public void start() throws SAXException {
michael@0 1287 initializeWithoutStarting();
michael@0 1288 tokenHandler.startTokenization(this);
michael@0 1289 // [NOCPP[
michael@0 1290 startErrorReporting();
michael@0 1291 // ]NOCPP]
michael@0 1292 }
michael@0 1293
michael@0 1294 public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
michael@0 1295 int state = stateSave;
michael@0 1296 int returnState = returnStateSave;
michael@0 1297 char c = '\u0000';
michael@0 1298 shouldSuspend = false;
michael@0 1299 lastCR = false;
michael@0 1300
michael@0 1301 int start = buffer.getStart();
michael@0 1302 /**
michael@0 1303 * The index of the last <code>char</code> read from <code>buf</code>.
michael@0 1304 */
michael@0 1305 int pos = start - 1;
michael@0 1306
michael@0 1307 /**
michael@0 1308 * The index of the first <code>char</code> in <code>buf</code> that is
michael@0 1309 * part of a coalesced run of character tokens or
michael@0 1310 * <code>Integer.MAX_VALUE</code> if there is not a current run being
michael@0 1311 * coalesced.
michael@0 1312 */
michael@0 1313 switch (state) {
michael@0 1314 case DATA:
michael@0 1315 case RCDATA:
michael@0 1316 case SCRIPT_DATA:
michael@0 1317 case PLAINTEXT:
michael@0 1318 case RAWTEXT:
michael@0 1319 case CDATA_SECTION:
michael@0 1320 case SCRIPT_DATA_ESCAPED:
michael@0 1321 case SCRIPT_DATA_ESCAPE_START:
michael@0 1322 case SCRIPT_DATA_ESCAPE_START_DASH:
michael@0 1323 case SCRIPT_DATA_ESCAPED_DASH:
michael@0 1324 case SCRIPT_DATA_ESCAPED_DASH_DASH:
michael@0 1325 case SCRIPT_DATA_DOUBLE_ESCAPE_START:
michael@0 1326 case SCRIPT_DATA_DOUBLE_ESCAPED:
michael@0 1327 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
michael@0 1328 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
michael@0 1329 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
michael@0 1330 case SCRIPT_DATA_DOUBLE_ESCAPE_END:
michael@0 1331 cstart = start;
michael@0 1332 break;
michael@0 1333 default:
michael@0 1334 cstart = Integer.MAX_VALUE;
michael@0 1335 break;
michael@0 1336 }
michael@0 1337
michael@0 1338 /**
michael@0 1339 * The number of <code>char</code>s in <code>buf</code> that have
michael@0 1340 * meaning. (The rest of the array is garbage and should not be
michael@0 1341 * examined.)
michael@0 1342 */
michael@0 1343 // CPPONLY: if (mViewSource) {
michael@0 1344 // CPPONLY: mViewSource.SetBuffer(buffer);
michael@0 1345 // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
michael@0 1346 // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
michael@0 1347 // CPPONLY: } else {
michael@0 1348 // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
michael@0 1349 // CPPONLY: }
michael@0 1350 // [NOCPP[
michael@0 1351 pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
michael@0 1352 buffer.getEnd());
michael@0 1353 // ]NOCPP]
michael@0 1354 if (pos == buffer.getEnd()) {
michael@0 1355 // exiting due to end of buffer
michael@0 1356 buffer.setStart(pos);
michael@0 1357 } else {
michael@0 1358 buffer.setStart(pos + 1);
michael@0 1359 }
michael@0 1360 return lastCR;
michael@0 1361 }
michael@0 1362
michael@0 1363 @SuppressWarnings("unused") private int stateLoop(int state, char c,
michael@0 1364 int pos, @NoLength char[] buf, boolean reconsume, int returnState,
michael@0 1365 int endPos) throws SAXException {
michael@0 1366 /*
michael@0 1367 * Idioms used in this code:
michael@0 1368 *
michael@0 1369 *
michael@0 1370 * Consuming the next input character
michael@0 1371 *
michael@0 1372 * To consume the next input character, the code does this: if (++pos ==
michael@0 1373 * endPos) { break stateloop; } c = checkChar(buf, pos);
michael@0 1374 *
michael@0 1375 *
michael@0 1376 * Staying in a state
michael@0 1377 *
michael@0 1378 * When there's a state that the tokenizer may stay in over multiple
michael@0 1379 * input characters, the state has a wrapper |for(;;)| loop and staying
michael@0 1380 * in the state continues the loop.
michael@0 1381 *
michael@0 1382 *
michael@0 1383 * Switching to another state
michael@0 1384 *
michael@0 1385 * To switch to another state, the code sets the state variable to the
michael@0 1386 * magic number of the new state. Then it either continues stateloop or
michael@0 1387 * breaks out of the state's own wrapper loop if the target state is
michael@0 1388 * right after the current state in source order. (This is a partial
michael@0 1389 * workaround for Java's lack of goto.)
michael@0 1390 *
michael@0 1391 *
michael@0 1392 * Reconsume support
michael@0 1393 *
michael@0 1394 * The spec sometimes says that an input character is reconsumed in
michael@0 1395 * another state. If a state can ever be entered so that an input
michael@0 1396 * character can be reconsumed in it, the state's code starts with an
michael@0 1397 * |if (reconsume)| that sets reconsume to false and skips over the
michael@0 1398 * normal code for consuming a new character.
michael@0 1399 *
michael@0 1400 * To reconsume the current character in another state, the code sets
michael@0 1401 * |reconsume| to true and then switches to the other state.
michael@0 1402 *
michael@0 1403 *
michael@0 1404 * Emitting character tokens
michael@0 1405 *
michael@0 1406 * This method emits character tokens lazily. Whenever a new range of
michael@0 1407 * character tokens starts, the field cstart must be set to the start
michael@0 1408 * index of the range. The flushChars() method must be called at the end
michael@0 1409 * of a range to flush it.
michael@0 1410 *
michael@0 1411 *
michael@0 1412 * U+0000 handling
michael@0 1413 *
michael@0 1414 * The various states have to handle the replacement of U+0000 with
michael@0 1415 * U+FFFD. However, if U+0000 would be reconsumed in another state, the
michael@0 1416 * replacement doesn't need to happen, because it's handled by the
michael@0 1417 * reconsuming state.
michael@0 1418 *
michael@0 1419 *
michael@0 1420 * LF handling
michael@0 1421 *
michael@0 1422 * Every state needs to increment the line number upon LF unless the LF
michael@0 1423 * gets reconsumed by another state which increments the line number.
michael@0 1424 *
michael@0 1425 *
michael@0 1426 * CR handling
michael@0 1427 *
michael@0 1428 * Every state needs to handle CR unless the CR gets reconsumed and is
michael@0 1429 * handled by the reconsuming state. The CR needs to be handled as if it
michael@0 1430 * were an LF, the lastCR field must be set to true and then this
michael@0 1431 * method must return. The IO driver will then swallow the next
michael@0 1432 * character if it is an LF to coalesce CRLF.
michael@0 1433 */
michael@0 1434 stateloop: for (;;) {
michael@0 1435 switch (state) {
michael@0 1436 case DATA:
michael@0 1437 dataloop: for (;;) {
michael@0 1438 if (reconsume) {
michael@0 1439 reconsume = false;
michael@0 1440 } else {
michael@0 1441 if (++pos == endPos) {
michael@0 1442 break stateloop;
michael@0 1443 }
michael@0 1444 c = checkChar(buf, pos);
michael@0 1445 }
michael@0 1446 switch (c) {
michael@0 1447 case '&':
michael@0 1448 /*
michael@0 1449 * U+0026 AMPERSAND (&) Switch to the character
michael@0 1450 * reference in data state.
michael@0 1451 */
michael@0 1452 flushChars(buf, pos);
michael@0 1453 clearStrBufAndAppend(c);
michael@0 1454 setAdditionalAndRememberAmpersandLocation('\u0000');
michael@0 1455 returnState = state;
michael@0 1456 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
michael@0 1457 continue stateloop;
michael@0 1458 case '<':
michael@0 1459 /*
michael@0 1460 * U+003C LESS-THAN SIGN (<) Switch to the tag
michael@0 1461 * open state.
michael@0 1462 */
michael@0 1463 flushChars(buf, pos);
michael@0 1464
michael@0 1465 state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
michael@0 1466 break dataloop; // FALL THROUGH continue
michael@0 1467 // stateloop;
michael@0 1468 case '\u0000':
michael@0 1469 emitReplacementCharacter(buf, pos);
michael@0 1470 continue;
michael@0 1471 case '\r':
michael@0 1472 emitCarriageReturn(buf, pos);
michael@0 1473 break stateloop;
michael@0 1474 case '\n':
michael@0 1475 silentLineFeed();
michael@0 1476 default:
michael@0 1477 /*
michael@0 1478 * Anything else Emit the input character as a
michael@0 1479 * character token.
michael@0 1480 *
michael@0 1481 * Stay in the data state.
michael@0 1482 */
michael@0 1483 continue;
michael@0 1484 }
michael@0 1485 }
michael@0 1486 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 1487 case TAG_OPEN:
michael@0 1488 tagopenloop: for (;;) {
michael@0 1489 /*
michael@0 1490 * The behavior of this state depends on the content
michael@0 1491 * model flag.
michael@0 1492 */
michael@0 1493 if (++pos == endPos) {
michael@0 1494 break stateloop;
michael@0 1495 }
michael@0 1496 c = checkChar(buf, pos);
michael@0 1497 /*
michael@0 1498 * If the content model flag is set to the PCDATA state
michael@0 1499 * Consume the next input character:
michael@0 1500 */
michael@0 1501 if (c >= 'A' && c <= 'Z') {
michael@0 1502 /*
michael@0 1503 * U+0041 LATIN CAPITAL LETTER A through to U+005A
michael@0 1504 * LATIN CAPITAL LETTER Z Create a new start tag
michael@0 1505 * token,
michael@0 1506 */
michael@0 1507 endTag = false;
michael@0 1508 /*
michael@0 1509 * set its tag name to the lowercase version of the
michael@0 1510 * input character (add 0x0020 to the character's
michael@0 1511 * code point),
michael@0 1512 */
michael@0 1513 clearStrBufAndAppend((char) (c + 0x20));
michael@0 1514 /* then switch to the tag name state. */
michael@0 1515 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
michael@0 1516 /*
michael@0 1517 * (Don't emit the token yet; further details will
michael@0 1518 * be filled in before it is emitted.)
michael@0 1519 */
michael@0 1520 break tagopenloop;
michael@0 1521 // continue stateloop;
michael@0 1522 } else if (c >= 'a' && c <= 'z') {
michael@0 1523 /*
michael@0 1524 * U+0061 LATIN SMALL LETTER A through to U+007A
michael@0 1525 * LATIN SMALL LETTER Z Create a new start tag
michael@0 1526 * token,
michael@0 1527 */
michael@0 1528 endTag = false;
michael@0 1529 /*
michael@0 1530 * set its tag name to the input character,
michael@0 1531 */
michael@0 1532 clearStrBufAndAppend(c);
michael@0 1533 /* then switch to the tag name state. */
michael@0 1534 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
michael@0 1535 /*
michael@0 1536 * (Don't emit the token yet; further details will
michael@0 1537 * be filled in before it is emitted.)
michael@0 1538 */
michael@0 1539 break tagopenloop;
michael@0 1540 // continue stateloop;
michael@0 1541 }
michael@0 1542 switch (c) {
michael@0 1543 case '!':
michael@0 1544 /*
michael@0 1545 * U+0021 EXCLAMATION MARK (!) Switch to the
michael@0 1546 * markup declaration open state.
michael@0 1547 */
michael@0 1548 state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
michael@0 1549 continue stateloop;
michael@0 1550 case '/':
michael@0 1551 /*
michael@0 1552 * U+002F SOLIDUS (/) Switch to the close tag
michael@0 1553 * open state.
michael@0 1554 */
michael@0 1555 state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
michael@0 1556 continue stateloop;
michael@0 1557 case '?':
michael@0 1558 // CPPONLY: if (viewingXmlSource) {
michael@0 1559 // CPPONLY: state = transition(state,
michael@0 1560 // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION,
michael@0 1561 // CPPONLY: reconsume,
michael@0 1562 // CPPONLY: pos);
michael@0 1563 // CPPONLY: continue stateloop;
michael@0 1564 // CPPONLY: }
michael@0 1565 /*
michael@0 1566 * U+003F QUESTION MARK (?) Parse error.
michael@0 1567 */
michael@0 1568 errProcessingInstruction();
michael@0 1569 /*
michael@0 1570 * Switch to the bogus comment state.
michael@0 1571 */
michael@0 1572 clearLongStrBufAndAppend(c);
michael@0 1573 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0 1574 continue stateloop;
michael@0 1575 case '>':
michael@0 1576 /*
michael@0 1577 * U+003E GREATER-THAN SIGN (>) Parse error.
michael@0 1578 */
michael@0 1579 errLtGt();
michael@0 1580 /*
michael@0 1581 * Emit a U+003C LESS-THAN SIGN character token
michael@0 1582 * and a U+003E GREATER-THAN SIGN character
michael@0 1583 * token.
michael@0 1584 */
michael@0 1585 tokenHandler.characters(Tokenizer.LT_GT, 0, 2);
michael@0 1586 /* Switch to the data state. */
michael@0 1587 cstart = pos + 1;
michael@0 1588 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 1589 continue stateloop;
michael@0 1590 default:
michael@0 1591 /*
michael@0 1592 * Anything else Parse error.
michael@0 1593 */
michael@0 1594 errBadCharAfterLt(c);
michael@0 1595 /*
michael@0 1596 * Emit a U+003C LESS-THAN SIGN character token
michael@0 1597 */
michael@0 1598 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0 1599 /*
michael@0 1600 * and reconsume the current input character in
michael@0 1601 * the data state.
michael@0 1602 */
michael@0 1603 cstart = pos;
michael@0 1604 reconsume = true;
michael@0 1605 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 1606 continue stateloop;
michael@0 1607 }
michael@0 1608 }
michael@0 1609 // FALL THROUGH DON'T REORDER
michael@0 1610 case TAG_NAME:
michael@0 1611 tagnameloop: for (;;) {
michael@0 1612 if (++pos == endPos) {
michael@0 1613 break stateloop;
michael@0 1614 }
michael@0 1615 c = checkChar(buf, pos);
michael@0 1616 /*
michael@0 1617 * Consume the next input character:
michael@0 1618 */
michael@0 1619 switch (c) {
michael@0 1620 case '\r':
michael@0 1621 silentCarriageReturn();
michael@0 1622 strBufToElementNameString();
michael@0 1623 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0 1624 break stateloop;
michael@0 1625 case '\n':
michael@0 1626 silentLineFeed();
michael@0 1627 case ' ':
michael@0 1628 case '\t':
michael@0 1629 case '\u000C':
michael@0 1630 /*
michael@0 1631 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 1632 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0 1633 * Switch to the before attribute name state.
michael@0 1634 */
michael@0 1635 strBufToElementNameString();
michael@0 1636 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0 1637 break tagnameloop;
michael@0 1638 // continue stateloop;
michael@0 1639 case '/':
michael@0 1640 /*
michael@0 1641 * U+002F SOLIDUS (/) Switch to the self-closing
michael@0 1642 * start tag state.
michael@0 1643 */
michael@0 1644 strBufToElementNameString();
michael@0 1645 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
michael@0 1646 continue stateloop;
michael@0 1647 case '>':
michael@0 1648 /*
michael@0 1649 * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0 1650 * tag token.
michael@0 1651 */
michael@0 1652 strBufToElementNameString();
michael@0 1653 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0 1654 if (shouldSuspend) {
michael@0 1655 break stateloop;
michael@0 1656 }
michael@0 1657 /*
michael@0 1658 * Switch to the data state.
michael@0 1659 */
michael@0 1660 continue stateloop;
michael@0 1661 case '\u0000':
michael@0 1662 c = '\uFFFD';
michael@0 1663 // fall thru
michael@0 1664 default:
michael@0 1665 if (c >= 'A' && c <= 'Z') {
michael@0 1666 /*
michael@0 1667 * U+0041 LATIN CAPITAL LETTER A through to
michael@0 1668 * U+005A LATIN CAPITAL LETTER Z Append the
michael@0 1669 * lowercase version of the current input
michael@0 1670 * character (add 0x0020 to the character's
michael@0 1671 * code point) to the current tag token's
michael@0 1672 * tag name.
michael@0 1673 */
michael@0 1674 c += 0x20;
michael@0 1675 }
michael@0 1676 /*
michael@0 1677 * Anything else Append the current input
michael@0 1678 * character to the current tag token's tag
michael@0 1679 * name.
michael@0 1680 */
michael@0 1681 appendStrBuf(c);
michael@0 1682 /*
michael@0 1683 * Stay in the tag name state.
michael@0 1684 */
michael@0 1685 continue;
michael@0 1686 }
michael@0 1687 }
michael@0 1688 // FALLTHRU DON'T REORDER
michael@0 1689 case BEFORE_ATTRIBUTE_NAME:
michael@0 1690 beforeattributenameloop: for (;;) {
michael@0 1691 if (reconsume) {
michael@0 1692 reconsume = false;
michael@0 1693 } else {
michael@0 1694 if (++pos == endPos) {
michael@0 1695 break stateloop;
michael@0 1696 }
michael@0 1697 c = checkChar(buf, pos);
michael@0 1698 }
michael@0 1699 /*
michael@0 1700 * Consume the next input character:
michael@0 1701 */
michael@0 1702 switch (c) {
michael@0 1703 case '\r':
michael@0 1704 silentCarriageReturn();
michael@0 1705 break stateloop;
michael@0 1706 case '\n':
michael@0 1707 silentLineFeed();
michael@0 1708 // fall thru
michael@0 1709 case ' ':
michael@0 1710 case '\t':
michael@0 1711 case '\u000C':
michael@0 1712 /*
michael@0 1713 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 1714 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0 1715 * in the before attribute name state.
michael@0 1716 */
michael@0 1717 continue;
michael@0 1718 case '/':
michael@0 1719 /*
michael@0 1720 * U+002F SOLIDUS (/) Switch to the self-closing
michael@0 1721 * start tag state.
michael@0 1722 */
michael@0 1723 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
michael@0 1724 continue stateloop;
michael@0 1725 case '>':
michael@0 1726 /*
michael@0 1727 * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0 1728 * tag token.
michael@0 1729 */
michael@0 1730 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0 1731 if (shouldSuspend) {
michael@0 1732 break stateloop;
michael@0 1733 }
michael@0 1734 /*
michael@0 1735 * Switch to the data state.
michael@0 1736 */
michael@0 1737 continue stateloop;
michael@0 1738 case '\u0000':
michael@0 1739 c = '\uFFFD';
michael@0 1740 // fall thru
michael@0 1741 case '\"':
michael@0 1742 case '\'':
michael@0 1743 case '<':
michael@0 1744 case '=':
michael@0 1745 /*
michael@0 1746 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
michael@0 1747 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
michael@0 1748 * SIGN (=) Parse error.
michael@0 1749 */
michael@0 1750 errBadCharBeforeAttributeNameOrNull(c);
michael@0 1751 /*
michael@0 1752 * Treat it as per the "anything else" entry
michael@0 1753 * below.
michael@0 1754 */
michael@0 1755 default:
michael@0 1756 /*
michael@0 1757 * Anything else Start a new attribute in the
michael@0 1758 * current tag token.
michael@0 1759 */
michael@0 1760 if (c >= 'A' && c <= 'Z') {
michael@0 1761 /*
michael@0 1762 * U+0041 LATIN CAPITAL LETTER A through to
michael@0 1763 * U+005A LATIN CAPITAL LETTER Z Set that
michael@0 1764 * attribute's name to the lowercase version
michael@0 1765 * of the current input character (add
michael@0 1766 * 0x0020 to the character's code point)
michael@0 1767 */
michael@0 1768 c += 0x20;
michael@0 1769 }
michael@0 1770 /*
michael@0 1771 * Set that attribute's name to the current
michael@0 1772 * input character,
michael@0 1773 */
michael@0 1774 clearStrBufAndAppend(c);
michael@0 1775 /*
michael@0 1776 * and its value to the empty string.
michael@0 1777 */
michael@0 1778 // Will do later.
michael@0 1779 /*
michael@0 1780 * Switch to the attribute name state.
michael@0 1781 */
michael@0 1782 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
michael@0 1783 break beforeattributenameloop;
michael@0 1784 // continue stateloop;
michael@0 1785 }
michael@0 1786 }
michael@0 1787 // FALLTHRU DON'T REORDER
michael@0 1788 case ATTRIBUTE_NAME:
michael@0 1789 attributenameloop: for (;;) {
michael@0 1790 if (++pos == endPos) {
michael@0 1791 break stateloop;
michael@0 1792 }
michael@0 1793 c = checkChar(buf, pos);
michael@0 1794 /*
michael@0 1795 * Consume the next input character:
michael@0 1796 */
michael@0 1797 switch (c) {
michael@0 1798 case '\r':
michael@0 1799 silentCarriageReturn();
michael@0 1800 attributeNameComplete();
michael@0 1801 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
michael@0 1802 break stateloop;
michael@0 1803 case '\n':
michael@0 1804 silentLineFeed();
michael@0 1805 // fall thru
michael@0 1806 case ' ':
michael@0 1807 case '\t':
michael@0 1808 case '\u000C':
michael@0 1809 /*
michael@0 1810 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 1811 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0 1812 * Switch to the after attribute name state.
michael@0 1813 */
michael@0 1814 attributeNameComplete();
michael@0 1815 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
michael@0 1816 continue stateloop;
michael@0 1817 case '/':
michael@0 1818 /*
michael@0 1819 * U+002F SOLIDUS (/) Switch to the self-closing
michael@0 1820 * start tag state.
michael@0 1821 */
michael@0 1822 attributeNameComplete();
michael@0 1823 addAttributeWithoutValue();
michael@0 1824 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
michael@0 1825 continue stateloop;
michael@0 1826 case '=':
michael@0 1827 /*
michael@0 1828 * U+003D EQUALS SIGN (=) Switch to the before
michael@0 1829 * attribute value state.
michael@0 1830 */
michael@0 1831 attributeNameComplete();
michael@0 1832 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
michael@0 1833 break attributenameloop;
michael@0 1834 // continue stateloop;
michael@0 1835 case '>':
michael@0 1836 /*
michael@0 1837 * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0 1838 * tag token.
michael@0 1839 */
michael@0 1840 attributeNameComplete();
michael@0 1841 addAttributeWithoutValue();
michael@0 1842 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0 1843 if (shouldSuspend) {
michael@0 1844 break stateloop;
michael@0 1845 }
michael@0 1846 /*
michael@0 1847 * Switch to the data state.
michael@0 1848 */
michael@0 1849 continue stateloop;
michael@0 1850 case '\u0000':
michael@0 1851 c = '\uFFFD';
michael@0 1852 // fall thru
michael@0 1853 case '\"':
michael@0 1854 case '\'':
michael@0 1855 case '<':
michael@0 1856 /*
michael@0 1857 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
michael@0 1858 * (') U+003C LESS-THAN SIGN (<) Parse error.
michael@0 1859 */
michael@0 1860 errQuoteOrLtInAttributeNameOrNull(c);
michael@0 1861 /*
michael@0 1862 * Treat it as per the "anything else" entry
michael@0 1863 * below.
michael@0 1864 */
michael@0 1865 default:
michael@0 1866 if (c >= 'A' && c <= 'Z') {
michael@0 1867 /*
michael@0 1868 * U+0041 LATIN CAPITAL LETTER A through to
michael@0 1869 * U+005A LATIN CAPITAL LETTER Z Append the
michael@0 1870 * lowercase version of the current input
michael@0 1871 * character (add 0x0020 to the character's
michael@0 1872 * code point) to the current attribute's
michael@0 1873 * name.
michael@0 1874 */
michael@0 1875 c += 0x20;
michael@0 1876 }
michael@0 1877 /*
michael@0 1878 * Anything else Append the current input
michael@0 1879 * character to the current attribute's name.
michael@0 1880 */
michael@0 1881 appendStrBuf(c);
michael@0 1882 /*
michael@0 1883 * Stay in the attribute name state.
michael@0 1884 */
michael@0 1885 continue;
michael@0 1886 }
michael@0 1887 }
michael@0 1888 // FALLTHRU DON'T REORDER
michael@0 1889 case BEFORE_ATTRIBUTE_VALUE:
michael@0 1890 beforeattributevalueloop: for (;;) {
michael@0 1891 if (++pos == endPos) {
michael@0 1892 break stateloop;
michael@0 1893 }
michael@0 1894 c = checkChar(buf, pos);
michael@0 1895 /*
michael@0 1896 * Consume the next input character:
michael@0 1897 */
michael@0 1898 switch (c) {
michael@0 1899 case '\r':
michael@0 1900 silentCarriageReturn();
michael@0 1901 break stateloop;
michael@0 1902 case '\n':
michael@0 1903 silentLineFeed();
michael@0 1904 // fall thru
michael@0 1905 case ' ':
michael@0 1906 case '\t':
michael@0 1907 case '\u000C':
michael@0 1908 /*
michael@0 1909 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 1910 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0 1911 * in the before attribute value state.
michael@0 1912 */
michael@0 1913 continue;
michael@0 1914 case '"':
michael@0 1915 /*
michael@0 1916 * U+0022 QUOTATION MARK (") Switch to the
michael@0 1917 * attribute value (double-quoted) state.
michael@0 1918 */
michael@0 1919 clearLongStrBuf();
michael@0 1920 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
michael@0 1921 break beforeattributevalueloop;
michael@0 1922 // continue stateloop;
michael@0 1923 case '&':
michael@0 1924 /*
michael@0 1925 * U+0026 AMPERSAND (&) Switch to the attribute
michael@0 1926 * value (unquoted) state and reconsume this
michael@0 1927 * input character.
michael@0 1928 */
michael@0 1929 clearLongStrBuf();
michael@0 1930 reconsume = true;
michael@0 1931 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
michael@0 1932 noteUnquotedAttributeValue();
michael@0 1933 continue stateloop;
michael@0 1934 case '\'':
michael@0 1935 /*
michael@0 1936 * U+0027 APOSTROPHE (') Switch to the attribute
michael@0 1937 * value (single-quoted) state.
michael@0 1938 */
michael@0 1939 clearLongStrBuf();
michael@0 1940 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
michael@0 1941 continue stateloop;
michael@0 1942 case '>':
michael@0 1943 /*
michael@0 1944 * U+003E GREATER-THAN SIGN (>) Parse error.
michael@0 1945 */
michael@0 1946 errAttributeValueMissing();
michael@0 1947 /*
michael@0 1948 * Emit the current tag token.
michael@0 1949 */
michael@0 1950 addAttributeWithoutValue();
michael@0 1951 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0 1952 if (shouldSuspend) {
michael@0 1953 break stateloop;
michael@0 1954 }
michael@0 1955 /*
michael@0 1956 * Switch to the data state.
michael@0 1957 */
michael@0 1958 continue stateloop;
michael@0 1959 case '\u0000':
michael@0 1960 c = '\uFFFD';
michael@0 1961 // fall thru
michael@0 1962 case '<':
michael@0 1963 case '=':
michael@0 1964 case '`':
michael@0 1965 /*
michael@0 1966 * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
michael@0 1967 * (=) U+0060 GRAVE ACCENT (`)
michael@0 1968 */
michael@0 1969 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
michael@0 1970 /*
michael@0 1971 * Treat it as per the "anything else" entry
michael@0 1972 * below.
michael@0 1973 */
michael@0 1974 default:
michael@0 1975 // [NOCPP[
michael@0 1976 errHtml4NonNameInUnquotedAttribute(c);
michael@0 1977 // ]NOCPP]
michael@0 1978 /*
michael@0 1979 * Anything else Append the current input
michael@0 1980 * character to the current attribute's value.
michael@0 1981 */
michael@0 1982 clearLongStrBufAndAppend(c);
michael@0 1983 /*
michael@0 1984 * Switch to the attribute value (unquoted)
michael@0 1985 * state.
michael@0 1986 */
michael@0 1987
michael@0 1988 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
michael@0 1989 noteUnquotedAttributeValue();
michael@0 1990 continue stateloop;
michael@0 1991 }
michael@0 1992 }
michael@0 1993 // FALLTHRU DON'T REORDER
michael@0 1994 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
michael@0 1995 attributevaluedoublequotedloop: for (;;) {
michael@0 1996 if (reconsume) {
michael@0 1997 reconsume = false;
michael@0 1998 } else {
michael@0 1999 if (++pos == endPos) {
michael@0 2000 break stateloop;
michael@0 2001 }
michael@0 2002 c = checkChar(buf, pos);
michael@0 2003 }
michael@0 2004 /*
michael@0 2005 * Consume the next input character:
michael@0 2006 */
michael@0 2007 switch (c) {
michael@0 2008 case '"':
michael@0 2009 /*
michael@0 2010 * U+0022 QUOTATION MARK (") Switch to the after
michael@0 2011 * attribute value (quoted) state.
michael@0 2012 */
michael@0 2013 addAttributeWithValue();
michael@0 2014
michael@0 2015 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
michael@0 2016 break attributevaluedoublequotedloop;
michael@0 2017 // continue stateloop;
michael@0 2018 case '&':
michael@0 2019 /*
michael@0 2020 * U+0026 AMPERSAND (&) Switch to the character
michael@0 2021 * reference in attribute value state, with the
michael@0 2022 * additional allowed character being U+0022
michael@0 2023 * QUOTATION MARK (").
michael@0 2024 */
michael@0 2025 clearStrBufAndAppend(c);
michael@0 2026 setAdditionalAndRememberAmpersandLocation('\"');
michael@0 2027 returnState = state;
michael@0 2028 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
michael@0 2029 continue stateloop;
michael@0 2030 case '\r':
michael@0 2031 appendLongStrBufCarriageReturn();
michael@0 2032 break stateloop;
michael@0 2033 case '\n':
michael@0 2034 appendLongStrBufLineFeed();
michael@0 2035 continue;
michael@0 2036 case '\u0000':
michael@0 2037 c = '\uFFFD';
michael@0 2038 // fall thru
michael@0 2039 default:
michael@0 2040 /*
michael@0 2041 * Anything else Append the current input
michael@0 2042 * character to the current attribute's value.
michael@0 2043 */
michael@0 2044 appendLongStrBuf(c);
michael@0 2045 /*
michael@0 2046 * Stay in the attribute value (double-quoted)
michael@0 2047 * state.
michael@0 2048 */
michael@0 2049 continue;
michael@0 2050 }
michael@0 2051 }
michael@0 2052 // FALLTHRU DON'T REORDER
michael@0 2053 case AFTER_ATTRIBUTE_VALUE_QUOTED:
michael@0 2054 afterattributevaluequotedloop: for (;;) {
michael@0 2055 if (++pos == endPos) {
michael@0 2056 break stateloop;
michael@0 2057 }
michael@0 2058 c = checkChar(buf, pos);
michael@0 2059 /*
michael@0 2060 * Consume the next input character:
michael@0 2061 */
michael@0 2062 switch (c) {
michael@0 2063 case '\r':
michael@0 2064 silentCarriageReturn();
michael@0 2065 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0 2066 break stateloop;
michael@0 2067 case '\n':
michael@0 2068 silentLineFeed();
michael@0 2069 // fall thru
michael@0 2070 case ' ':
michael@0 2071 case '\t':
michael@0 2072 case '\u000C':
michael@0 2073 /*
michael@0 2074 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 2075 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0 2076 * Switch to the before attribute name state.
michael@0 2077 */
michael@0 2078 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0 2079 continue stateloop;
michael@0 2080 case '/':
michael@0 2081 /*
michael@0 2082 * U+002F SOLIDUS (/) Switch to the self-closing
michael@0 2083 * start tag state.
michael@0 2084 */
michael@0 2085 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
michael@0 2086 break afterattributevaluequotedloop;
michael@0 2087 // continue stateloop;
michael@0 2088 case '>':
michael@0 2089 /*
michael@0 2090 * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0 2091 * tag token.
michael@0 2092 */
michael@0 2093 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0 2094 if (shouldSuspend) {
michael@0 2095 break stateloop;
michael@0 2096 }
michael@0 2097 /*
michael@0 2098 * Switch to the data state.
michael@0 2099 */
michael@0 2100 continue stateloop;
michael@0 2101 default:
michael@0 2102 /*
michael@0 2103 * Anything else Parse error.
michael@0 2104 */
michael@0 2105 errNoSpaceBetweenAttributes();
michael@0 2106 /*
michael@0 2107 * Reconsume the character in the before
michael@0 2108 * attribute name state.
michael@0 2109 */
michael@0 2110 reconsume = true;
michael@0 2111 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0 2112 continue stateloop;
michael@0 2113 }
michael@0 2114 }
michael@0 2115 // FALLTHRU DON'T REORDER
michael@0 2116 case SELF_CLOSING_START_TAG:
michael@0 2117 if (++pos == endPos) {
michael@0 2118 break stateloop;
michael@0 2119 }
michael@0 2120 c = checkChar(buf, pos);
michael@0 2121 /*
michael@0 2122 * Consume the next input character:
michael@0 2123 */
michael@0 2124 switch (c) {
michael@0 2125 case '>':
michael@0 2126 /*
michael@0 2127 * U+003E GREATER-THAN SIGN (>) Set the self-closing
michael@0 2128 * flag of the current tag token. Emit the current
michael@0 2129 * tag token.
michael@0 2130 */
michael@0 2131 // [NOCPP[
michael@0 2132 errHtml4XmlVoidSyntax();
michael@0 2133 // ]NOCPP]
michael@0 2134 state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);
michael@0 2135 if (shouldSuspend) {
michael@0 2136 break stateloop;
michael@0 2137 }
michael@0 2138 /*
michael@0 2139 * Switch to the data state.
michael@0 2140 */
michael@0 2141 continue stateloop;
michael@0 2142 default:
michael@0 2143 /* Anything else Parse error. */
michael@0 2144 errSlashNotFollowedByGt();
michael@0 2145 /*
michael@0 2146 * Reconsume the character in the before attribute
michael@0 2147 * name state.
michael@0 2148 */
michael@0 2149 reconsume = true;
michael@0 2150 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0 2151 continue stateloop;
michael@0 2152 }
michael@0 2153 // XXX reorder point
michael@0 2154 case ATTRIBUTE_VALUE_UNQUOTED:
michael@0 2155 for (;;) {
michael@0 2156 if (reconsume) {
michael@0 2157 reconsume = false;
michael@0 2158 } else {
michael@0 2159 if (++pos == endPos) {
michael@0 2160 break stateloop;
michael@0 2161 }
michael@0 2162 c = checkChar(buf, pos);
michael@0 2163 }
michael@0 2164 /*
michael@0 2165 * Consume the next input character:
michael@0 2166 */
michael@0 2167 switch (c) {
michael@0 2168 case '\r':
michael@0 2169 silentCarriageReturn();
michael@0 2170 addAttributeWithValue();
michael@0 2171 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0 2172 break stateloop;
michael@0 2173 case '\n':
michael@0 2174 silentLineFeed();
michael@0 2175 // fall thru
michael@0 2176 case ' ':
michael@0 2177 case '\t':
michael@0 2178 case '\u000C':
michael@0 2179 /*
michael@0 2180 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 2181 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0 2182 * Switch to the before attribute name state.
michael@0 2183 */
michael@0 2184 addAttributeWithValue();
michael@0 2185 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0 2186 continue stateloop;
michael@0 2187 case '&':
michael@0 2188 /*
michael@0 2189 * U+0026 AMPERSAND (&) Switch to the character
michael@0 2190 * reference in attribute value state, with the
michael@0 2191 * additional allowed character being U+003E
michael@0 2192 * GREATER-THAN SIGN (>)
michael@0 2193 */
michael@0 2194 clearStrBufAndAppend(c);
michael@0 2195 setAdditionalAndRememberAmpersandLocation('>');
michael@0 2196 returnState = state;
michael@0 2197 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
michael@0 2198 continue stateloop;
michael@0 2199 case '>':
michael@0 2200 /*
michael@0 2201 * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0 2202 * tag token.
michael@0 2203 */
michael@0 2204 addAttributeWithValue();
michael@0 2205 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0 2206 if (shouldSuspend) {
michael@0 2207 break stateloop;
michael@0 2208 }
michael@0 2209 /*
michael@0 2210 * Switch to the data state.
michael@0 2211 */
michael@0 2212 continue stateloop;
michael@0 2213 case '\u0000':
michael@0 2214 c = '\uFFFD';
michael@0 2215 // fall thru
michael@0 2216 case '<':
michael@0 2217 case '\"':
michael@0 2218 case '\'':
michael@0 2219 case '=':
michael@0 2220 case '`':
michael@0 2221 /*
michael@0 2222 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
michael@0 2223 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
michael@0 2224 * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
michael@0 2225 */
michael@0 2226 errUnquotedAttributeValOrNull(c);
michael@0 2227 /*
michael@0 2228 * Treat it as per the "anything else" entry
michael@0 2229 * below.
michael@0 2230 */
michael@0 2231 // fall through
michael@0 2232 default:
michael@0 2233 // [NOCPP[
michael@0 2234 errHtml4NonNameInUnquotedAttribute(c);
michael@0 2235 // ]NOCPP]
michael@0 2236 /*
michael@0 2237 * Anything else Append the current input
michael@0 2238 * character to the current attribute's value.
michael@0 2239 */
michael@0 2240 appendLongStrBuf(c);
michael@0 2241 /*
michael@0 2242 * Stay in the attribute value (unquoted) state.
michael@0 2243 */
michael@0 2244 continue;
michael@0 2245 }
michael@0 2246 }
michael@0 2247 // XXX reorder point
michael@0 2248 case AFTER_ATTRIBUTE_NAME:
michael@0 2249 for (;;) {
michael@0 2250 if (++pos == endPos) {
michael@0 2251 break stateloop;
michael@0 2252 }
michael@0 2253 c = checkChar(buf, pos);
michael@0 2254 /*
michael@0 2255 * Consume the next input character:
michael@0 2256 */
michael@0 2257 switch (c) {
michael@0 2258 case '\r':
michael@0 2259 silentCarriageReturn();
michael@0 2260 break stateloop;
michael@0 2261 case '\n':
michael@0 2262 silentLineFeed();
michael@0 2263 // fall thru
michael@0 2264 case ' ':
michael@0 2265 case '\t':
michael@0 2266 case '\u000C':
michael@0 2267 /*
michael@0 2268 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 2269 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0 2270 * in the after attribute name state.
michael@0 2271 */
michael@0 2272 continue;
michael@0 2273 case '/':
michael@0 2274 /*
michael@0 2275 * U+002F SOLIDUS (/) Switch to the self-closing
michael@0 2276 * start tag state.
michael@0 2277 */
michael@0 2278 addAttributeWithoutValue();
michael@0 2279 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
michael@0 2280 continue stateloop;
michael@0 2281 case '=':
michael@0 2282 /*
michael@0 2283 * U+003D EQUALS SIGN (=) Switch to the before
michael@0 2284 * attribute value state.
michael@0 2285 */
michael@0 2286 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
michael@0 2287 continue stateloop;
michael@0 2288 case '>':
michael@0 2289 /*
michael@0 2290 * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0 2291 * tag token.
michael@0 2292 */
michael@0 2293 addAttributeWithoutValue();
michael@0 2294 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0 2295 if (shouldSuspend) {
michael@0 2296 break stateloop;
michael@0 2297 }
michael@0 2298 /*
michael@0 2299 * Switch to the data state.
michael@0 2300 */
michael@0 2301 continue stateloop;
michael@0 2302 case '\u0000':
michael@0 2303 c = '\uFFFD';
michael@0 2304 // fall thru
michael@0 2305 case '\"':
michael@0 2306 case '\'':
michael@0 2307 case '<':
michael@0 2308 errQuoteOrLtInAttributeNameOrNull(c);
michael@0 2309 /*
michael@0 2310 * Treat it as per the "anything else" entry
michael@0 2311 * below.
michael@0 2312 */
michael@0 2313 default:
michael@0 2314 addAttributeWithoutValue();
michael@0 2315 /*
michael@0 2316 * Anything else Start a new attribute in the
michael@0 2317 * current tag token.
michael@0 2318 */
michael@0 2319 if (c >= 'A' && c <= 'Z') {
michael@0 2320 /*
michael@0 2321 * U+0041 LATIN CAPITAL LETTER A through to
michael@0 2322 * U+005A LATIN CAPITAL LETTER Z Set that
michael@0 2323 * attribute's name to the lowercase version
michael@0 2324 * of the current input character (add
michael@0 2325 * 0x0020 to the character's code point)
michael@0 2326 */
michael@0 2327 c += 0x20;
michael@0 2328 }
michael@0 2329 /*
michael@0 2330 * Set that attribute's name to the current
michael@0 2331 * input character,
michael@0 2332 */
michael@0 2333 clearStrBufAndAppend(c);
michael@0 2334 /*
michael@0 2335 * and its value to the empty string.
michael@0 2336 */
michael@0 2337 // Will do later.
michael@0 2338 /*
michael@0 2339 * Switch to the attribute name state.
michael@0 2340 */
michael@0 2341 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
michael@0 2342 continue stateloop;
michael@0 2343 }
michael@0 2344 }
michael@0 2345 // XXX reorder point
michael@0 2346 case MARKUP_DECLARATION_OPEN:
michael@0 2347 markupdeclarationopenloop: for (;;) {
michael@0 2348 if (++pos == endPos) {
michael@0 2349 break stateloop;
michael@0 2350 }
michael@0 2351 c = checkChar(buf, pos);
michael@0 2352 /*
michael@0 2353 * If the next two characters are both U+002D
michael@0 2354 * HYPHEN-MINUS characters (-), consume those two
michael@0 2355 * characters, create a comment token whose data is the
michael@0 2356 * empty string, and switch to the comment start state.
michael@0 2357 *
michael@0 2358 * Otherwise, if the next seven characters are an ASCII
michael@0 2359 * case-insensitive match for the word "DOCTYPE", then
michael@0 2360 * consume those characters and switch to the DOCTYPE
michael@0 2361 * state.
michael@0 2362 *
michael@0 2363 * Otherwise, if the insertion mode is
michael@0 2364 * "in foreign content" and the current node is not an
michael@0 2365 * element in the HTML namespace and the next seven
michael@0 2366 * characters are a case-sensitive match for the string
michael@0 2367 * "[CDATA[" (the five uppercase letters "CDATA" with a
michael@0 2368 * U+005B LEFT SQUARE BRACKET character before and
michael@0 2369 * after), then consume those characters and switch to
michael@0 2370 * the CDATA section state.
michael@0 2371 *
michael@0 2372 * Otherwise, it is a parse error. Switch to the bogus
michael@0 2373 * comment state. The next character that is consumed,
michael@0 2374 * if any, is the first character that will be in the
michael@0 2375 * comment.
michael@0 2376 */
michael@0 2377 switch (c) {
michael@0 2378 case '-':
michael@0 2379 clearLongStrBufAndAppend(c);
michael@0 2380 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
michael@0 2381 break markupdeclarationopenloop;
michael@0 2382 // continue stateloop;
michael@0 2383 case 'd':
michael@0 2384 case 'D':
michael@0 2385 clearLongStrBufAndAppend(c);
michael@0 2386 index = 0;
michael@0 2387 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
michael@0 2388 continue stateloop;
michael@0 2389 case '[':
michael@0 2390 if (tokenHandler.cdataSectionAllowed()) {
michael@0 2391 clearLongStrBufAndAppend(c);
michael@0 2392 index = 0;
michael@0 2393 state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
michael@0 2394 continue stateloop;
michael@0 2395 }
michael@0 2396 // else fall through
michael@0 2397 default:
michael@0 2398 errBogusComment();
michael@0 2399 clearLongStrBuf();
michael@0 2400 reconsume = true;
michael@0 2401 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0 2402 continue stateloop;
michael@0 2403 }
michael@0 2404 }
michael@0 2405 // FALLTHRU DON'T REORDER
michael@0 2406 case MARKUP_DECLARATION_HYPHEN:
michael@0 2407 markupdeclarationhyphenloop: for (;;) {
michael@0 2408 if (++pos == endPos) {
michael@0 2409 break stateloop;
michael@0 2410 }
michael@0 2411 c = checkChar(buf, pos);
michael@0 2412 switch (c) {
michael@0 2413 case '\u0000':
michael@0 2414 break stateloop;
michael@0 2415 case '-':
michael@0 2416 clearLongStrBuf();
michael@0 2417 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
michael@0 2418 break markupdeclarationhyphenloop;
michael@0 2419 // continue stateloop;
michael@0 2420 default:
michael@0 2421 errBogusComment();
michael@0 2422 reconsume = true;
michael@0 2423 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0 2424 continue stateloop;
michael@0 2425 }
michael@0 2426 }
michael@0 2427 // FALLTHRU DON'T REORDER
michael@0 2428 case COMMENT_START:
michael@0 2429 commentstartloop: for (;;) {
michael@0 2430 if (++pos == endPos) {
michael@0 2431 break stateloop;
michael@0 2432 }
michael@0 2433 c = checkChar(buf, pos);
michael@0 2434 /*
michael@0 2435 * Comment start state
michael@0 2436 *
michael@0 2437 *
michael@0 2438 * Consume the next input character:
michael@0 2439 */
michael@0 2440 switch (c) {
michael@0 2441 case '-':
michael@0 2442 /*
michael@0 2443 * U+002D HYPHEN-MINUS (-) Switch to the comment
michael@0 2444 * start dash state.
michael@0 2445 */
michael@0 2446 appendLongStrBuf(c);
michael@0 2447 state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);
michael@0 2448 continue stateloop;
michael@0 2449 case '>':
michael@0 2450 /*
michael@0 2451 * U+003E GREATER-THAN SIGN (>) Parse error.
michael@0 2452 */
michael@0 2453 errPrematureEndOfComment();
michael@0 2454 /* Emit the comment token. */
michael@0 2455 emitComment(0, pos);
michael@0 2456 /*
michael@0 2457 * Switch to the data state.
michael@0 2458 */
michael@0 2459 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 2460 continue stateloop;
michael@0 2461 case '\r':
michael@0 2462 appendLongStrBufCarriageReturn();
michael@0 2463 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0 2464 break stateloop;
michael@0 2465 case '\n':
michael@0 2466 appendLongStrBufLineFeed();
michael@0 2467 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0 2468 break commentstartloop;
michael@0 2469 case '\u0000':
michael@0 2470 c = '\uFFFD';
michael@0 2471 // fall thru
michael@0 2472 default:
michael@0 2473 /*
michael@0 2474 * Anything else Append the input character to
michael@0 2475 * the comment token's data.
michael@0 2476 */
michael@0 2477 appendLongStrBuf(c);
michael@0 2478 /*
michael@0 2479 * Switch to the comment state.
michael@0 2480 */
michael@0 2481 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0 2482 break commentstartloop;
michael@0 2483 // continue stateloop;
michael@0 2484 }
michael@0 2485 }
michael@0 2486 // FALLTHRU DON'T REORDER
michael@0 2487 case COMMENT:
michael@0 2488 commentloop: for (;;) {
michael@0 2489 if (++pos == endPos) {
michael@0 2490 break stateloop;
michael@0 2491 }
michael@0 2492 c = checkChar(buf, pos);
michael@0 2493 /*
michael@0 2494 * Comment state Consume the next input character:
michael@0 2495 */
michael@0 2496 switch (c) {
michael@0 2497 case '-':
michael@0 2498 /*
michael@0 2499 * U+002D HYPHEN-MINUS (-) Switch to the comment
michael@0 2500 * end dash state
michael@0 2501 */
michael@0 2502 appendLongStrBuf(c);
michael@0 2503 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
michael@0 2504 break commentloop;
michael@0 2505 // continue stateloop;
michael@0 2506 case '\r':
michael@0 2507 appendLongStrBufCarriageReturn();
michael@0 2508 break stateloop;
michael@0 2509 case '\n':
michael@0 2510 appendLongStrBufLineFeed();
michael@0 2511 continue;
michael@0 2512 case '\u0000':
michael@0 2513 c = '\uFFFD';
michael@0 2514 // fall thru
michael@0 2515 default:
michael@0 2516 /*
michael@0 2517 * Anything else Append the input character to
michael@0 2518 * the comment token's data.
michael@0 2519 */
michael@0 2520 appendLongStrBuf(c);
michael@0 2521 /*
michael@0 2522 * Stay in the comment state.
michael@0 2523 */
michael@0 2524 continue;
michael@0 2525 }
michael@0 2526 }
michael@0 2527 // FALLTHRU DON'T REORDER
michael@0 2528 case COMMENT_END_DASH:
michael@0 2529 commentenddashloop: for (;;) {
michael@0 2530 if (++pos == endPos) {
michael@0 2531 break stateloop;
michael@0 2532 }
michael@0 2533 c = checkChar(buf, pos);
michael@0 2534 /*
michael@0 2535 * Comment end dash state Consume the next input
michael@0 2536 * character:
michael@0 2537 */
michael@0 2538 switch (c) {
michael@0 2539 case '-':
michael@0 2540 /*
michael@0 2541 * U+002D HYPHEN-MINUS (-) Switch to the comment
michael@0 2542 * end state
michael@0 2543 */
michael@0 2544 appendLongStrBuf(c);
michael@0 2545 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
michael@0 2546 break commentenddashloop;
michael@0 2547 // continue stateloop;
michael@0 2548 case '\r':
michael@0 2549 appendLongStrBufCarriageReturn();
michael@0 2550 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0 2551 break stateloop;
michael@0 2552 case '\n':
michael@0 2553 appendLongStrBufLineFeed();
michael@0 2554 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0 2555 continue stateloop;
michael@0 2556 case '\u0000':
michael@0 2557 c = '\uFFFD';
michael@0 2558 // fall thru
michael@0 2559 default:
michael@0 2560 /*
michael@0 2561 * Anything else Append a U+002D HYPHEN-MINUS
michael@0 2562 * (-) character and the input character to the
michael@0 2563 * comment token's data.
michael@0 2564 */
michael@0 2565 appendLongStrBuf(c);
michael@0 2566 /*
michael@0 2567 * Switch to the comment state.
michael@0 2568 */
michael@0 2569 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0 2570 continue stateloop;
michael@0 2571 }
michael@0 2572 }
michael@0 2573 // FALLTHRU DON'T REORDER
michael@0 2574 case COMMENT_END:
michael@0 2575 commentendloop: for (;;) {
michael@0 2576 if (++pos == endPos) {
michael@0 2577 break stateloop;
michael@0 2578 }
michael@0 2579 c = checkChar(buf, pos);
michael@0 2580 /*
michael@0 2581 * Comment end state Consume the next input
michael@0 2582 * character:
michael@0 2583 */
michael@0 2584 switch (c) {
michael@0 2585 case '>':
michael@0 2586 /*
michael@0 2587 * U+003E GREATER-THAN SIGN (>) Emit the comment
michael@0 2588 * token.
michael@0 2589 */
michael@0 2590 emitComment(2, pos);
michael@0 2591 /*
michael@0 2592 * Switch to the data state.
michael@0 2593 */
michael@0 2594 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 2595 continue stateloop;
michael@0 2596 case '-':
michael@0 2597 /* U+002D HYPHEN-MINUS (-) Parse error. */
michael@0 2598 /*
michael@0 2599 * Append a U+002D HYPHEN-MINUS (-) character to
michael@0 2600 * the comment token's data.
michael@0 2601 */
michael@0 2602 adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
michael@0 2603 /*
michael@0 2604 * Stay in the comment end state.
michael@0 2605 */
michael@0 2606 continue;
michael@0 2607 case '\r':
michael@0 2608 adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn();
michael@0 2609 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0 2610 break stateloop;
michael@0 2611 case '\n':
michael@0 2612 adjustDoubleHyphenAndAppendToLongStrBufLineFeed();
michael@0 2613 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0 2614 continue stateloop;
michael@0 2615 case '!':
michael@0 2616 errHyphenHyphenBang();
michael@0 2617 appendLongStrBuf(c);
michael@0 2618 state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
michael@0 2619 continue stateloop;
michael@0 2620 case '\u0000':
michael@0 2621 c = '\uFFFD';
michael@0 2622 // fall thru
michael@0 2623 default:
michael@0 2624 /*
michael@0 2625 * Append two U+002D HYPHEN-MINUS (-) characters
michael@0 2626 * and the input character to the comment
michael@0 2627 * token's data.
michael@0 2628 */
michael@0 2629 adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
michael@0 2630 /*
michael@0 2631 * Switch to the comment state.
michael@0 2632 */
michael@0 2633 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0 2634 continue stateloop;
michael@0 2635 }
michael@0 2636 }
michael@0 2637 // XXX reorder point
michael@0 2638 case COMMENT_END_BANG:
michael@0 2639 for (;;) {
michael@0 2640 if (++pos == endPos) {
michael@0 2641 break stateloop;
michael@0 2642 }
michael@0 2643 c = checkChar(buf, pos);
michael@0 2644 /*
michael@0 2645 * Comment end bang state
michael@0 2646 *
michael@0 2647 * Consume the next input character:
michael@0 2648 */
michael@0 2649 switch (c) {
michael@0 2650 case '>':
michael@0 2651 /*
michael@0 2652 * U+003E GREATER-THAN SIGN (>) Emit the comment
michael@0 2653 * token.
michael@0 2654 */
michael@0 2655 emitComment(3, pos);
michael@0 2656 /*
michael@0 2657 * Switch to the data state.
michael@0 2658 */
michael@0 2659 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 2660 continue stateloop;
michael@0 2661 case '-':
michael@0 2662 /*
michael@0 2663 * Append two U+002D HYPHEN-MINUS (-) characters
michael@0 2664 * and a U+0021 EXCLAMATION MARK (!) character
michael@0 2665 * to the comment token's data.
michael@0 2666 */
michael@0 2667 appendLongStrBuf(c);
michael@0 2668 /*
michael@0 2669 * Switch to the comment end dash state.
michael@0 2670 */
michael@0 2671 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
michael@0 2672 continue stateloop;
michael@0 2673 case '\r':
michael@0 2674 appendLongStrBufCarriageReturn();
michael@0 2675 break stateloop;
michael@0 2676 case '\n':
michael@0 2677 appendLongStrBufLineFeed();
michael@0 2678 continue;
michael@0 2679 case '\u0000':
michael@0 2680 c = '\uFFFD';
michael@0 2681 // fall thru
michael@0 2682 default:
michael@0 2683 /*
michael@0 2684 * Anything else Append two U+002D HYPHEN-MINUS
michael@0 2685 * (-) characters, a U+0021 EXCLAMATION MARK (!)
michael@0 2686 * character, and the input character to the
michael@0 2687 * comment token's data. Switch to the comment
michael@0 2688 * state.
michael@0 2689 */
michael@0 2690 appendLongStrBuf(c);
michael@0 2691 /*
michael@0 2692 * Switch to the comment state.
michael@0 2693 */
michael@0 2694 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0 2695 continue stateloop;
michael@0 2696 }
michael@0 2697 }
michael@0 2698 // XXX reorder point
michael@0 2699 case COMMENT_START_DASH:
michael@0 2700 if (++pos == endPos) {
michael@0 2701 break stateloop;
michael@0 2702 }
michael@0 2703 c = checkChar(buf, pos);
michael@0 2704 /*
michael@0 2705 * Comment start dash state
michael@0 2706 *
michael@0 2707 * Consume the next input character:
michael@0 2708 */
michael@0 2709 switch (c) {
michael@0 2710 case '-':
michael@0 2711 /*
michael@0 2712 * U+002D HYPHEN-MINUS (-) Switch to the comment end
michael@0 2713 * state
michael@0 2714 */
michael@0 2715 appendLongStrBuf(c);
michael@0 2716 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
michael@0 2717 continue stateloop;
michael@0 2718 case '>':
michael@0 2719 errPrematureEndOfComment();
michael@0 2720 /* Emit the comment token. */
michael@0 2721 emitComment(1, pos);
michael@0 2722 /*
michael@0 2723 * Switch to the data state.
michael@0 2724 */
michael@0 2725 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 2726 continue stateloop;
michael@0 2727 case '\r':
michael@0 2728 appendLongStrBufCarriageReturn();
michael@0 2729 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0 2730 break stateloop;
michael@0 2731 case '\n':
michael@0 2732 appendLongStrBufLineFeed();
michael@0 2733 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0 2734 continue stateloop;
michael@0 2735 case '\u0000':
michael@0 2736 c = '\uFFFD';
michael@0 2737 // fall thru
michael@0 2738 default:
michael@0 2739 /*
michael@0 2740 * Append a U+002D HYPHEN-MINUS character (-) and
michael@0 2741 * the current input character to the comment
michael@0 2742 * token's data.
michael@0 2743 */
michael@0 2744 appendLongStrBuf(c);
michael@0 2745 /*
michael@0 2746 * Switch to the comment state.
michael@0 2747 */
michael@0 2748 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
michael@0 2749 continue stateloop;
michael@0 2750 }
michael@0 2751 // XXX reorder point
michael@0 2752 case CDATA_START:
michael@0 2753 for (;;) {
michael@0 2754 if (++pos == endPos) {
michael@0 2755 break stateloop;
michael@0 2756 }
michael@0 2757 c = checkChar(buf, pos);
michael@0 2758 if (index < 6) { // CDATA_LSQB.length
michael@0 2759 if (c == Tokenizer.CDATA_LSQB[index]) {
michael@0 2760 appendLongStrBuf(c);
michael@0 2761 } else {
michael@0 2762 errBogusComment();
michael@0 2763 reconsume = true;
michael@0 2764 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0 2765 continue stateloop;
michael@0 2766 }
michael@0 2767 index++;
michael@0 2768 continue;
michael@0 2769 } else {
michael@0 2770 cstart = pos; // start coalescing
michael@0 2771 reconsume = true;
michael@0 2772 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
michael@0 2773 break; // FALL THROUGH continue stateloop;
michael@0 2774 }
michael@0 2775 }
michael@0 2776 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 2777 case CDATA_SECTION:
michael@0 2778 cdatasectionloop: for (;;) {
michael@0 2779 if (reconsume) {
michael@0 2780 reconsume = false;
michael@0 2781 } else {
michael@0 2782 if (++pos == endPos) {
michael@0 2783 break stateloop;
michael@0 2784 }
michael@0 2785 c = checkChar(buf, pos);
michael@0 2786 }
michael@0 2787 switch (c) {
michael@0 2788 case ']':
michael@0 2789 flushChars(buf, pos);
michael@0 2790 state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
michael@0 2791 break cdatasectionloop; // FALL THROUGH
michael@0 2792 case '\u0000':
michael@0 2793 emitReplacementCharacter(buf, pos);
michael@0 2794 continue;
michael@0 2795 case '\r':
michael@0 2796 emitCarriageReturn(buf, pos);
michael@0 2797 break stateloop;
michael@0 2798 case '\n':
michael@0 2799 silentLineFeed();
michael@0 2800 // fall thru
michael@0 2801 default:
michael@0 2802 continue;
michael@0 2803 }
michael@0 2804 }
michael@0 2805 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 2806 case CDATA_RSQB:
michael@0 2807 cdatarsqb: for (;;) {
michael@0 2808 if (++pos == endPos) {
michael@0 2809 break stateloop;
michael@0 2810 }
michael@0 2811 c = checkChar(buf, pos);
michael@0 2812 switch (c) {
michael@0 2813 case ']':
michael@0 2814 state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);
michael@0 2815 break cdatarsqb;
michael@0 2816 default:
michael@0 2817 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0,
michael@0 2818 1);
michael@0 2819 cstart = pos;
michael@0 2820 reconsume = true;
michael@0 2821 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
michael@0 2822 continue stateloop;
michael@0 2823 }
michael@0 2824 }
michael@0 2825 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 2826 case CDATA_RSQB_RSQB:
michael@0 2827 cdatarsqbrsqb: for (;;) {
michael@0 2828 if (++pos == endPos) {
michael@0 2829 break stateloop;
michael@0 2830 }
michael@0 2831 c = checkChar(buf, pos);
michael@0 2832 switch (c) {
michael@0 2833 case ']':
michael@0 2834 // Saw a third ]. Emit one ] (logically the
michael@0 2835 // first one) and stay in this state to
michael@0 2836 // remember that the last two characters seen
michael@0 2837 // have been ]].
michael@0 2838 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
michael@0 2839 continue;
michael@0 2840 case '>':
michael@0 2841 cstart = pos + 1;
michael@0 2842 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 2843 continue stateloop;
michael@0 2844 default:
michael@0 2845 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
michael@0 2846 cstart = pos;
michael@0 2847 reconsume = true;
michael@0 2848 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
michael@0 2849 continue stateloop;
michael@0 2850 }
michael@0 2851 }
michael@0 2852 // XXX reorder point
michael@0 2853 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
michael@0 2854 attributevaluesinglequotedloop: for (;;) {
michael@0 2855 if (reconsume) {
michael@0 2856 reconsume = false;
michael@0 2857 } else {
michael@0 2858 if (++pos == endPos) {
michael@0 2859 break stateloop;
michael@0 2860 }
michael@0 2861 c = checkChar(buf, pos);
michael@0 2862 }
michael@0 2863 /*
michael@0 2864 * Consume the next input character:
michael@0 2865 */
michael@0 2866 switch (c) {
michael@0 2867 case '\'':
michael@0 2868 /*
michael@0 2869 * U+0027 APOSTROPHE (') Switch to the after
michael@0 2870 * attribute value (quoted) state.
michael@0 2871 */
michael@0 2872 addAttributeWithValue();
michael@0 2873
michael@0 2874 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
michael@0 2875 continue stateloop;
michael@0 2876 case '&':
michael@0 2877 /*
michael@0 2878 * U+0026 AMPERSAND (&) Switch to the character
michael@0 2879 * reference in attribute value state, with the
michael@0 2880 * additional allowed character being U+0027
michael@0 2881 * APOSTROPHE (').
michael@0 2882 */
michael@0 2883 clearStrBufAndAppend(c);
michael@0 2884 setAdditionalAndRememberAmpersandLocation('\'');
michael@0 2885 returnState = state;
michael@0 2886 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
michael@0 2887 break attributevaluesinglequotedloop;
michael@0 2888 // continue stateloop;
michael@0 2889 case '\r':
michael@0 2890 appendLongStrBufCarriageReturn();
michael@0 2891 break stateloop;
michael@0 2892 case '\n':
michael@0 2893 appendLongStrBufLineFeed();
michael@0 2894 continue;
michael@0 2895 case '\u0000':
michael@0 2896 c = '\uFFFD';
michael@0 2897 // fall thru
michael@0 2898 default:
michael@0 2899 /*
michael@0 2900 * Anything else Append the current input
michael@0 2901 * character to the current attribute's value.
michael@0 2902 */
michael@0 2903 appendLongStrBuf(c);
michael@0 2904 /*
michael@0 2905 * Stay in the attribute value (single-quoted)
michael@0 2906 * state.
michael@0 2907 */
michael@0 2908 continue;
michael@0 2909 }
michael@0 2910 }
michael@0 2911 // FALLTHRU DON'T REORDER
michael@0 2912 case CONSUME_CHARACTER_REFERENCE:
michael@0 2913 if (++pos == endPos) {
michael@0 2914 break stateloop;
michael@0 2915 }
michael@0 2916 c = checkChar(buf, pos);
michael@0 2917 if (c == '\u0000') {
michael@0 2918 break stateloop;
michael@0 2919 }
michael@0 2920 /*
michael@0 2921 * Unlike the definition in the spec, this state does not
michael@0 2922 * return a value and never requires the caller to
michael@0 2923 * backtrack. This state takes care of emitting characters
michael@0 2924 * or appending to the current attribute value. It also
michael@0 2925 * takes care of that in the case when consuming the
michael@0 2926 * character reference fails.
michael@0 2927 */
michael@0 2928 /*
michael@0 2929 * This section defines how to consume a character
michael@0 2930 * reference. This definition is used when parsing character
michael@0 2931 * references in text and in attributes.
michael@0 2932 *
michael@0 2933 * The behavior depends on the identity of the next
michael@0 2934 * character (the one immediately after the U+0026 AMPERSAND
michael@0 2935 * character):
michael@0 2936 */
michael@0 2937 switch (c) {
michael@0 2938 case ' ':
michael@0 2939 case '\t':
michael@0 2940 case '\n':
michael@0 2941 case '\r': // we'll reconsume!
michael@0 2942 case '\u000C':
michael@0 2943 case '<':
michael@0 2944 case '&':
michael@0 2945 emitOrAppendStrBuf(returnState);
michael@0 2946 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0 2947 cstart = pos;
michael@0 2948 }
michael@0 2949 reconsume = true;
michael@0 2950 state = transition(state, returnState, reconsume, pos);
michael@0 2951 continue stateloop;
michael@0 2952 case '#':
michael@0 2953 /*
michael@0 2954 * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
michael@0 2955 * SIGN.
michael@0 2956 */
michael@0 2957 appendStrBuf('#');
michael@0 2958 state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);
michael@0 2959 continue stateloop;
michael@0 2960 default:
michael@0 2961 if (c == additional) {
michael@0 2962 emitOrAppendStrBuf(returnState);
michael@0 2963 reconsume = true;
michael@0 2964 state = transition(state, returnState, reconsume, pos);
michael@0 2965 continue stateloop;
michael@0 2966 }
michael@0 2967 if (c >= 'a' && c <= 'z') {
michael@0 2968 firstCharKey = c - 'a' + 26;
michael@0 2969 } else if (c >= 'A' && c <= 'Z') {
michael@0 2970 firstCharKey = c - 'A';
michael@0 2971 } else {
michael@0 2972 // No match
michael@0 2973 /*
michael@0 2974 * If no match can be made, then this is a parse
michael@0 2975 * error.
michael@0 2976 */
michael@0 2977 errNoNamedCharacterMatch();
michael@0 2978 emitOrAppendStrBuf(returnState);
michael@0 2979 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0 2980 cstart = pos;
michael@0 2981 }
michael@0 2982 reconsume = true;
michael@0 2983 state = transition(state, returnState, reconsume, pos);
michael@0 2984 continue stateloop;
michael@0 2985 }
michael@0 2986 // Didn't fail yet
michael@0 2987 appendStrBuf(c);
michael@0 2988 state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);
michael@0 2989 // FALL THROUGH continue stateloop;
michael@0 2990 }
michael@0 2991 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 2992 case CHARACTER_REFERENCE_HILO_LOOKUP:
michael@0 2993 {
michael@0 2994 if (++pos == endPos) {
michael@0 2995 break stateloop;
michael@0 2996 }
michael@0 2997 c = checkChar(buf, pos);
michael@0 2998 if (c == '\u0000') {
michael@0 2999 break stateloop;
michael@0 3000 }
michael@0 3001 /*
michael@0 3002 * The data structure is as follows:
michael@0 3003 *
michael@0 3004 * HILO_ACCEL is a two-dimensional int array whose major
michael@0 3005 * index corresponds to the second character of the
michael@0 3006 * character reference (code point as index) and the
michael@0 3007 * minor index corresponds to the first character of the
michael@0 3008 * character reference (packed so that A-Z runs from 0
michael@0 3009 * to 25 and a-z runs from 26 to 51). This layout makes
michael@0 3010 * it easier to use the sparseness of the data structure
michael@0 3011 * to omit parts of it: The second dimension of the
michael@0 3012 * table is null when no character reference starts with
michael@0 3013 * the character corresponding to that row.
michael@0 3014 *
michael@0 3015 * The int value HILO_ACCEL (by these indices) is zero
michael@0 3016 * if there exists no character reference starting with
michael@0 3017 * that two-letter prefix. Otherwise, the value is an
michael@0 3018 * int that packs two shorts so that the higher short is
michael@0 3019 * the index of the highest character reference name
michael@0 3020 * with that prefix in NAMES and the lower short
michael@0 3021 * corresponds to the index of the lowest character
michael@0 3022 * reference name with that prefix. (It happens that the
michael@0 3023 * first two character reference names share their
michael@0 3024 * prefix so the packed int cannot be 0 by packing the
michael@0 3025 * two shorts.)
michael@0 3026 *
michael@0 3027 * NAMES is an array of byte arrays where each byte
michael@0 3028 * array encodes the name of a character reference as
michael@0 3029 * ASCII. The names omit the first two letters of the
michael@0 3030 * name. (Since storing the first two letters would be
michael@0 3031 * redundant with the data contained in HILO_ACCEL.) The
michael@0 3032 * entries are lexically sorted.
michael@0 3033 *
michael@0 3034 * For a given index in NAMES, the same index in VALUES
michael@0 3035 * contains the corresponding expansion as an array of
michael@0 3036 * two UTF-16 code units (either the character and
michael@0 3037 * U+0000 or a surrogate pair).
michael@0 3038 */
michael@0 3039 int hilo = 0;
michael@0 3040 if (c <= 'z') {
michael@0 3041 @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];
michael@0 3042 if (row != null) {
michael@0 3043 hilo = row[firstCharKey];
michael@0 3044 }
michael@0 3045 }
michael@0 3046 if (hilo == 0) {
michael@0 3047 /*
michael@0 3048 * If no match can be made, then this is a parse
michael@0 3049 * error.
michael@0 3050 */
michael@0 3051 errNoNamedCharacterMatch();
michael@0 3052 emitOrAppendStrBuf(returnState);
michael@0 3053 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0 3054 cstart = pos;
michael@0 3055 }
michael@0 3056 reconsume = true;
michael@0 3057 state = transition(state, returnState, reconsume, pos);
michael@0 3058 continue stateloop;
michael@0 3059 }
michael@0 3060 // Didn't fail yet
michael@0 3061 appendStrBuf(c);
michael@0 3062 lo = hilo & 0xFFFF;
michael@0 3063 hi = hilo >> 16;
michael@0 3064 entCol = -1;
michael@0 3065 candidate = -1;
michael@0 3066 strBufMark = 0;
michael@0 3067 state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);
michael@0 3068 // FALL THROUGH continue stateloop;
michael@0 3069 }
michael@0 3070 case CHARACTER_REFERENCE_TAIL:
michael@0 3071 outer: for (;;) {
michael@0 3072 if (++pos == endPos) {
michael@0 3073 break stateloop;
michael@0 3074 }
michael@0 3075 c = checkChar(buf, pos);
michael@0 3076 if (c == '\u0000') {
michael@0 3077 break stateloop;
michael@0 3078 }
michael@0 3079 entCol++;
michael@0 3080 /*
michael@0 3081 * Consume the maximum number of characters possible,
michael@0 3082 * with the consumed characters matching one of the
michael@0 3083 * identifiers in the first column of the named
michael@0 3084 * character references table (in a case-sensitive
michael@0 3085 * manner).
michael@0 3086 */
michael@0 3087 loloop: for (;;) {
michael@0 3088 if (hi < lo) {
michael@0 3089 break outer;
michael@0 3090 }
michael@0 3091 if (entCol == NamedCharacters.NAMES[lo].length()) {
michael@0 3092 candidate = lo;
michael@0 3093 strBufMark = strBufLen;
michael@0 3094 lo++;
michael@0 3095 } else if (entCol > NamedCharacters.NAMES[lo].length()) {
michael@0 3096 break outer;
michael@0 3097 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
michael@0 3098 lo++;
michael@0 3099 } else {
michael@0 3100 break loloop;
michael@0 3101 }
michael@0 3102 }
michael@0 3103
michael@0 3104 hiloop: for (;;) {
michael@0 3105 if (hi < lo) {
michael@0 3106 break outer;
michael@0 3107 }
michael@0 3108 if (entCol == NamedCharacters.NAMES[hi].length()) {
michael@0 3109 break hiloop;
michael@0 3110 }
michael@0 3111 if (entCol > NamedCharacters.NAMES[hi].length()) {
michael@0 3112 break outer;
michael@0 3113 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
michael@0 3114 hi--;
michael@0 3115 } else {
michael@0 3116 break hiloop;
michael@0 3117 }
michael@0 3118 }
michael@0 3119
michael@0 3120 if (c == ';') {
michael@0 3121 // If we see a semicolon, there cannot be a
michael@0 3122 // longer match. Break the loop. However, before
michael@0 3123 // breaking, take the longest match so far as the
michael@0 3124 // candidate, if we are just about to complete a
michael@0 3125 // match.
michael@0 3126 if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {
michael@0 3127 candidate = lo;
michael@0 3128 strBufMark = strBufLen;
michael@0 3129 }
michael@0 3130 break outer;
michael@0 3131 }
michael@0 3132
michael@0 3133 if (hi < lo) {
michael@0 3134 break outer;
michael@0 3135 }
michael@0 3136 appendStrBuf(c);
michael@0 3137 continue;
michael@0 3138 }
michael@0 3139
michael@0 3140 if (candidate == -1) {
michael@0 3141 // reconsume deals with CR, LF or nul
michael@0 3142 /*
michael@0 3143 * If no match can be made, then this is a parse error.
michael@0 3144 */
michael@0 3145 errNoNamedCharacterMatch();
michael@0 3146 emitOrAppendStrBuf(returnState);
michael@0 3147 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0 3148 cstart = pos;
michael@0 3149 }
michael@0 3150 reconsume = true;
michael@0 3151 state = transition(state, returnState, reconsume, pos);
michael@0 3152 continue stateloop;
michael@0 3153 } else {
michael@0 3154 // c can't be CR, LF or nul if we got here
michael@0 3155 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
michael@0 3156 if (candidateName.length() == 0
michael@0 3157 || candidateName.charAt(candidateName.length() - 1) != ';') {
michael@0 3158 /*
michael@0 3159 * If the last character matched is not a U+003B
michael@0 3160 * SEMICOLON (;), there is a parse error.
michael@0 3161 */
michael@0 3162 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0 3163 /*
michael@0 3164 * If the entity is being consumed as part of an
michael@0 3165 * attribute, and the last character matched is
michael@0 3166 * not a U+003B SEMICOLON (;),
michael@0 3167 */
michael@0 3168 char ch;
michael@0 3169 if (strBufMark == strBufLen) {
michael@0 3170 ch = c;
michael@0 3171 } else {
michael@0 3172 // if (strBufOffset != -1) {
michael@0 3173 // ch = buf[strBufOffset + strBufMark];
michael@0 3174 // } else {
michael@0 3175 ch = strBuf[strBufMark];
michael@0 3176 // }
michael@0 3177 }
michael@0 3178 if (ch == '=' || (ch >= '0' && ch <= '9')
michael@0 3179 || (ch >= 'A' && ch <= 'Z')
michael@0 3180 || (ch >= 'a' && ch <= 'z')) {
michael@0 3181 /*
michael@0 3182 * and the next character is either a U+003D
michael@0 3183 * EQUALS SIGN character (=) or in the range
michael@0 3184 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
michael@0 3185 * U+0041 LATIN CAPITAL LETTER A to U+005A
michael@0 3186 * LATIN CAPITAL LETTER Z, or U+0061 LATIN
michael@0 3187 * SMALL LETTER A to U+007A LATIN SMALL
michael@0 3188 * LETTER Z, then, for historical reasons,
michael@0 3189 * all the characters that were matched
michael@0 3190 * after the U+0026 AMPERSAND (&) must be
michael@0 3191 * unconsumed, and nothing is returned.
michael@0 3192 */
michael@0 3193 errNoNamedCharacterMatch();
michael@0 3194 appendStrBufToLongStrBuf();
michael@0 3195 reconsume = true;
michael@0 3196 state = transition(state, returnState, reconsume, pos);
michael@0 3197 continue stateloop;
michael@0 3198 }
michael@0 3199 }
michael@0 3200 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0 3201 errUnescapedAmpersandInterpretedAsCharacterReference();
michael@0 3202 } else {
michael@0 3203 errNotSemicolonTerminated();
michael@0 3204 }
michael@0 3205 }
michael@0 3206
michael@0 3207 /*
michael@0 3208 * Otherwise, return a character token for the character
michael@0 3209 * corresponding to the entity name (as given by the
michael@0 3210 * second column of the named character references
michael@0 3211 * table).
michael@0 3212 */
michael@0 3213 // CPPONLY: completedNamedCharacterReference();
michael@0 3214 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
michael@0 3215 if (
michael@0 3216 // [NOCPP[
michael@0 3217 val.length == 1
michael@0 3218 // ]NOCPP]
michael@0 3219 // CPPONLY: val[1] == 0
michael@0 3220 ) {
michael@0 3221 emitOrAppendOne(val, returnState);
michael@0 3222 } else {
michael@0 3223 emitOrAppendTwo(val, returnState);
michael@0 3224 }
michael@0 3225 // this is so complicated!
michael@0 3226 if (strBufMark < strBufLen) {
michael@0 3227 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0 3228 for (int i = strBufMark; i < strBufLen; i++) {
michael@0 3229 appendLongStrBuf(strBuf[i]);
michael@0 3230 }
michael@0 3231 } else {
michael@0 3232 tokenHandler.characters(strBuf, strBufMark,
michael@0 3233 strBufLen - strBufMark);
michael@0 3234 }
michael@0 3235 }
michael@0 3236 // Check if we broke out early with c being the last
michael@0 3237 // character that matched as opposed to being the
michael@0 3238 // first one that didn't match. In the case of an
michael@0 3239 // early break, the next run on text should start
michael@0 3240 // *after* the current character and the current
michael@0 3241 // character shouldn't be reconsumed.
michael@0 3242 boolean earlyBreak = (c == ';' && strBufMark == strBufLen);
michael@0 3243 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0 3244 cstart = earlyBreak ? pos + 1 : pos;
michael@0 3245 }
michael@0 3246 reconsume = !earlyBreak;
michael@0 3247 state = transition(state, returnState, reconsume, pos);
michael@0 3248 continue stateloop;
michael@0 3249 /*
michael@0 3250 * If the markup contains I'm &notit; I tell you, the
michael@0 3251 * entity is parsed as "not", as in, I'm ¬it; I tell
michael@0 3252 * you. But if the markup was I'm &notin; I tell you,
michael@0 3253 * the entity would be parsed as "notin;", resulting in
michael@0 3254 * I'm ∉ I tell you.
michael@0 3255 */
michael@0 3256 }
michael@0 3257 // XXX reorder point
michael@0 3258 case CONSUME_NCR:
michael@0 3259 if (++pos == endPos) {
michael@0 3260 break stateloop;
michael@0 3261 }
michael@0 3262 c = checkChar(buf, pos);
michael@0 3263 prevValue = -1;
michael@0 3264 value = 0;
michael@0 3265 seenDigits = false;
michael@0 3266 /*
michael@0 3267 * The behavior further depends on the character after the
michael@0 3268 * U+0023 NUMBER SIGN:
michael@0 3269 */
michael@0 3270 switch (c) {
michael@0 3271 case 'x':
michael@0 3272 case 'X':
michael@0 3273
michael@0 3274 /*
michael@0 3275 * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
michael@0 3276 * LETTER X Consume the X.
michael@0 3277 *
michael@0 3278 * Follow the steps below, but using the range of
michael@0 3279 * characters U+0030 DIGIT ZERO through to U+0039
michael@0 3280 * DIGIT NINE, U+0061 LATIN SMALL LETTER A through
michael@0 3281 * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
michael@0 3282 * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
michael@0 3283 * LETTER F (in other words, 0-9, A-F, a-f).
michael@0 3284 *
michael@0 3285 * When it comes to interpreting the number,
michael@0 3286 * interpret it as a hexadecimal number.
michael@0 3287 */
michael@0 3288 appendStrBuf(c);
michael@0 3289 state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);
michael@0 3290 continue stateloop;
michael@0 3291 default:
michael@0 3292 /*
michael@0 3293 * Anything else Follow the steps below, but using
michael@0 3294 * the range of characters U+0030 DIGIT ZERO through
michael@0 3295 * to U+0039 DIGIT NINE (i.e. just 0-9).
michael@0 3296 *
michael@0 3297 * When it comes to interpreting the number,
michael@0 3298 * interpret it as a decimal number.
michael@0 3299 */
michael@0 3300 reconsume = true;
michael@0 3301 state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);
michael@0 3302 // FALL THROUGH continue stateloop;
michael@0 3303 }
michael@0 3304 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 3305 case DECIMAL_NRC_LOOP:
michael@0 3306 decimalloop: for (;;) {
michael@0 3307 if (reconsume) {
michael@0 3308 reconsume = false;
michael@0 3309 } else {
michael@0 3310 if (++pos == endPos) {
michael@0 3311 break stateloop;
michael@0 3312 }
michael@0 3313 c = checkChar(buf, pos);
michael@0 3314 }
michael@0 3315 // Deal with overflow gracefully
michael@0 3316 if (value < prevValue) {
michael@0 3317 value = 0x110000; // Value above Unicode range but
michael@0 3318 // within int
michael@0 3319 // range
michael@0 3320 }
michael@0 3321 prevValue = value;
michael@0 3322 /*
michael@0 3323 * Consume as many characters as match the range of
michael@0 3324 * characters given above.
michael@0 3325 */
michael@0 3326 if (c >= '0' && c <= '9') {
michael@0 3327 seenDigits = true;
michael@0 3328 value *= 10;
michael@0 3329 value += c - '0';
michael@0 3330 continue;
michael@0 3331 } else if (c == ';') {
michael@0 3332 if (seenDigits) {
michael@0 3333 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0 3334 cstart = pos + 1;
michael@0 3335 }
michael@0 3336 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
michael@0 3337 // FALL THROUGH continue stateloop;
michael@0 3338 break decimalloop;
michael@0 3339 } else {
michael@0 3340 errNoDigitsInNCR();
michael@0 3341 appendStrBuf(';');
michael@0 3342 emitOrAppendStrBuf(returnState);
michael@0 3343 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0 3344 cstart = pos + 1;
michael@0 3345 }
michael@0 3346 state = transition(state, returnState, reconsume, pos);
michael@0 3347 continue stateloop;
michael@0 3348 }
michael@0 3349 } else {
michael@0 3350 /*
michael@0 3351 * If no characters match the range, then don't
michael@0 3352 * consume any characters (and unconsume the U+0023
michael@0 3353 * NUMBER SIGN character and, if appropriate, the X
michael@0 3354 * character). This is a parse error; nothing is
michael@0 3355 * returned.
michael@0 3356 *
michael@0 3357 * Otherwise, if the next character is a U+003B
michael@0 3358 * SEMICOLON, consume that too. If it isn't, there
michael@0 3359 * is a parse error.
michael@0 3360 */
michael@0 3361 if (!seenDigits) {
michael@0 3362 errNoDigitsInNCR();
michael@0 3363 emitOrAppendStrBuf(returnState);
michael@0 3364 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0 3365 cstart = pos;
michael@0 3366 }
michael@0 3367 reconsume = true;
michael@0 3368 state = transition(state, returnState, reconsume, pos);
michael@0 3369 continue stateloop;
michael@0 3370 } else {
michael@0 3371 errCharRefLacksSemicolon();
michael@0 3372 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0 3373 cstart = pos;
michael@0 3374 }
michael@0 3375 reconsume = true;
michael@0 3376 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
michael@0 3377 // FALL THROUGH continue stateloop;
michael@0 3378 break decimalloop;
michael@0 3379 }
michael@0 3380 }
michael@0 3381 }
michael@0 3382 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 3383 case HANDLE_NCR_VALUE: // passes returnState to handleNcrValue(), then resumes in returnState
michael@0 3384 // WARNING: a preceding state may have set reconsume; this case hands it on unchanged
michael@0 3385 // XXX inline this case if the method size can take it
michael@0 3386 handleNcrValue(returnState);
michael@0 3387 state = transition(state, returnState, reconsume, pos);
michael@0 3388 continue stateloop;
michael@0 3389 // XXX reorder point
michael@0 3390 case HEX_NCR_LOOP: // accumulates the hexadecimal digits of a numeric character reference into value
michael@0 3391 for (;;) {
michael@0 3392 if (++pos == endPos) {
michael@0 3393 break stateloop;
michael@0 3394 }
michael@0 3395 c = checkChar(buf, pos);
michael@0 3396 // Deal with overflow gracefully: a value below its previous reading is treated as wrapped
michael@0 3397 if (value < prevValue) {
michael@0 3398 value = 0x110000; // Value above Unicode range but
michael@0 3399 // within int
michael@0 3400 // range
michael@0 3401 }
michael@0 3402 prevValue = value;
michael@0 3403 /*
michael@0 3404 * Consume as many characters as are hexadecimal digits
michael@0 3405 * (0-9, A-F, a-f), shifting each one into value.
michael@0 3406 */
michael@0 3407 if (c >= '0' && c <= '9') {
michael@0 3408 seenDigits = true;
michael@0 3409 value *= 16;
michael@0 3410 value += c - '0';
michael@0 3411 continue;
michael@0 3412 } else if (c >= 'A' && c <= 'F') {
michael@0 3413 seenDigits = true;
michael@0 3414 value *= 16;
michael@0 3415 value += c - 'A' + 10;
michael@0 3416 continue;
michael@0 3417 } else if (c >= 'a' && c <= 'f') {
michael@0 3418 seenDigits = true;
michael@0 3419 value *= 16;
michael@0 3420 value += c - 'a' + 10;
michael@0 3421 continue;
michael@0 3422 } else if (c == ';') {
michael@0 3423 if (seenDigits) { // digits followed by ';': go on to HANDLE_NCR_VALUE
michael@0 3424 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0 3425 cstart = pos + 1;
michael@0 3426 }
michael@0 3427 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
michael@0 3428 continue stateloop;
michael@0 3429 } else { // ';' with no digits: report it and pass the buffered text, ';' included, to emitOrAppendStrBuf
michael@0 3430 errNoDigitsInNCR();
michael@0 3431 appendStrBuf(';');
michael@0 3432 emitOrAppendStrBuf(returnState);
michael@0 3433 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0 3434 cstart = pos + 1;
michael@0 3435 }
michael@0 3436 state = transition(state, returnState, reconsume, pos);
michael@0 3437 continue stateloop;
michael@0 3438 }
michael@0 3439 } else {
michael@0 3440 /*
michael@0 3441 * If no characters match the range, then don't
michael@0 3442 * consume any characters (and unconsume the U+0023
michael@0 3443 * NUMBER SIGN character and, if appropriate, the X
michael@0 3444 * character). This is a parse error; nothing is
michael@0 3445 * returned.
michael@0 3446 *
michael@0 3447 * Otherwise, if the next character is a U+003B
michael@0 3448 * SEMICOLON, consume that too. If it isn't, there
michael@0 3449 * is a parse error.
michael@0 3450 */
michael@0 3451 if (!seenDigits) {
michael@0 3452 errNoDigitsInNCR();
michael@0 3453 emitOrAppendStrBuf(returnState);
michael@0 3454 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0 3455 cstart = pos;
michael@0 3456 }
michael@0 3457 reconsume = true; // the current character is looked at again by returnState
michael@0 3458 state = transition(state, returnState, reconsume, pos);
michael@0 3459 continue stateloop;
michael@0 3460 } else {
michael@0 3461 errCharRefLacksSemicolon();
michael@0 3462 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
michael@0 3463 cstart = pos;
michael@0 3464 }
michael@0 3465 reconsume = true; // stays set through HANDLE_NCR_VALUE, which does not clear it
michael@0 3466 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
michael@0 3467 continue stateloop;
michael@0 3468 }
michael@0 3469 }
michael@0 3470 }
michael@0 3471 // XXX reorder point
michael@0 3472 case PLAINTEXT: // no branch below transitions to another state; the loop is left only via break stateloop
michael@0 3473 plaintextloop: for (;;) {
michael@0 3474 if (reconsume) {
michael@0 3475 reconsume = false;
michael@0 3476 } else {
michael@0 3477 if (++pos == endPos) {
michael@0 3478 break stateloop;
michael@0 3479 }
michael@0 3480 c = checkChar(buf, pos);
michael@0 3481 }
michael@0 3482 switch (c) {
michael@0 3483 case '\u0000':
michael@0 3484 emitPlaintextReplacementCharacter(buf, pos);
michael@0 3485 continue;
michael@0 3486 case '\r':
michael@0 3487 emitCarriageReturn(buf, pos);
michael@0 3488 break stateloop; // NOTE(review): CR leaves stateloop instead of continuing - presumably so the caller can deal with a following LF; confirm
michael@0 3489 case '\n':
michael@0 3490 silentLineFeed(); // fall thru
michael@0 3491 default:
michael@0 3492 /*
michael@0 3493 * Anything else Emit the current input
michael@0 3494 * character as a character token. Stay in the
michael@0 3495 * PLAINTEXT state.
michael@0 3496 */
michael@0 3497 continue;
michael@0 3498 }
michael@0 3499 }
michael@0 3500 // XXX reorder point
michael@0 3501 case CLOSE_TAG_OPEN:
michael@0 3502 if (++pos == endPos) {
michael@0 3503 break stateloop;
michael@0 3504 }
michael@0 3505 c = checkChar(buf, pos);
michael@0 3506 /*
michael@0 3507 * The character just read decides the outcome: '>' is
michael@0 3508 * an error and returns to DATA, an ASCII letter starts
michael@0 3509 * an end tag name, anything else starts a bogus comment:
michael@0 3510 */
michael@0 3511 switch (c) {
michael@0 3512 case '>':
michael@0 3513 /* U+003E GREATER-THAN SIGN (>) Parse error. */
michael@0 3514 errLtSlashGt();
michael@0 3515 /*
michael@0 3516 * Switch to the data state.
michael@0 3517 */
michael@0 3518 cstart = pos + 1;
michael@0 3519 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 3520 continue stateloop;
michael@0 3521 case '\r':
michael@0 3522 silentCarriageReturn();
michael@0 3523 /* Anything else Parse error. */
michael@0 3524 errGarbageAfterLtSlash();
michael@0 3525 /*
michael@0 3526 * Switch to the bogus comment state.
michael@0 3527 */
michael@0 3528 clearLongStrBufAndAppend('\n'); // the CR is stored as a line feed
michael@0 3529 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0 3530 break stateloop; // NOTE(review): CR leaves stateloop instead of continuing - presumably so the caller can deal with a following LF; confirm
michael@0 3531 case '\n':
michael@0 3532 silentLineFeed();
michael@0 3533 /* Anything else Parse error. */
michael@0 3534 errGarbageAfterLtSlash();
michael@0 3535 /*
michael@0 3536 * Switch to the bogus comment state.
michael@0 3537 */
michael@0 3538 clearLongStrBufAndAppend('\n');
michael@0 3539 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0 3540 continue stateloop;
michael@0 3541 case '\u0000':
michael@0 3542 c = '\uFFFD'; // NUL is replaced by U+FFFD before it reaches the default branch
michael@0 3543 // fall thru
michael@0 3544 default:
michael@0 3545 if (c >= 'A' && c <= 'Z') {
michael@0 3546 c += 0x20; // fold ASCII upper case to lower case
michael@0 3547 }
michael@0 3548 if (c >= 'a' && c <= 'z') {
michael@0 3549 /*
michael@0 3550 * U+0061 LATIN SMALL LETTER A through to U+007A
michael@0 3551 * LATIN SMALL LETTER Z Create a new end tag
michael@0 3552 * token,
michael@0 3553 */
michael@0 3554 endTag = true;
michael@0 3555 /*
michael@0 3556 * set its tag name to the input character,
michael@0 3557 */
michael@0 3558 clearStrBufAndAppend(c);
michael@0 3559 /*
michael@0 3560 * then switch to the tag name state. (Don't
michael@0 3561 * emit the token yet; further details will be
michael@0 3562 * filled in before it is emitted.)
michael@0 3563 */
michael@0 3564 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
michael@0 3565 continue stateloop;
michael@0 3566 } else {
michael@0 3567 /* Anything else Parse error. */
michael@0 3568 errGarbageAfterLtSlash();
michael@0 3569 /*
michael@0 3570 * Switch to the bogus comment state.
michael@0 3571 */
michael@0 3572 clearLongStrBufAndAppend(c);
michael@0 3573 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0 3574 continue stateloop;
michael@0 3575 }
michael@0 3576 }
michael@0 3577 // XXX reorder point
michael@0 3578 case RCDATA: // only '&' and '<' leave this state; every other character keeps the loop running
michael@0 3579 rcdataloop: for (;;) {
michael@0 3580 if (reconsume) {
michael@0 3581 reconsume = false;
michael@0 3582 } else {
michael@0 3583 if (++pos == endPos) {
michael@0 3584 break stateloop;
michael@0 3585 }
michael@0 3586 c = checkChar(buf, pos);
michael@0 3587 }
michael@0 3588 switch (c) {
michael@0 3589 case '&':
michael@0 3590 /*
michael@0 3591 * U+0026 AMPERSAND (&) Switch to the character
michael@0 3592 * reference in RCDATA state.
michael@0 3593 */
michael@0 3594 flushChars(buf, pos);
michael@0 3595 clearStrBufAndAppend(c);
michael@0 3596 additional = '\u0000';
michael@0 3597 returnState = state; // remember the current state as the one to come back to
michael@0 3598 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
michael@0 3599 continue stateloop;
michael@0 3600 case '<':
michael@0 3601 /*
michael@0 3602 * U+003C LESS-THAN SIGN (<) Switch to the
michael@0 3603 * RCDATA less-than sign state.
michael@0 3604 */
michael@0 3605 flushChars(buf, pos);
michael@0 3606
michael@0 3607 returnState = state; // remember the current state as the one to come back to
michael@0 3608 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
michael@0 3609 continue stateloop;
michael@0 3610 case '\u0000':
michael@0 3611 emitReplacementCharacter(buf, pos);
michael@0 3612 continue;
michael@0 3613 case '\r':
michael@0 3614 emitCarriageReturn(buf, pos);
michael@0 3615 break stateloop; // NOTE(review): CR leaves stateloop instead of continuing - presumably so the caller can deal with a following LF; confirm
michael@0 3616 case '\n':
michael@0 3617 silentLineFeed(); // fall thru
michael@0 3618 default:
michael@0 3619 /*
michael@0 3620 * Emit the current input character as a
michael@0 3621 * character token. Stay in the RCDATA state.
michael@0 3622 */
michael@0 3623 continue;
michael@0 3624 }
michael@0 3625 }
michael@0 3626 // XXX reorder point
michael@0 3627 case RAWTEXT: // only '<' leaves this state, by falling through to the case that follows
michael@0 3628 rawtextloop: for (;;) {
michael@0 3629 if (reconsume) {
michael@0 3630 reconsume = false;
michael@0 3631 } else {
michael@0 3632 if (++pos == endPos) {
michael@0 3633 break stateloop;
michael@0 3634 }
michael@0 3635 c = checkChar(buf, pos);
michael@0 3636 }
michael@0 3637 switch (c) {
michael@0 3638 case '<':
michael@0 3639 /*
michael@0 3640 * U+003C LESS-THAN SIGN (<) Switch to the
michael@0 3641 * RAWTEXT less-than sign state.
michael@0 3642 */
michael@0 3643 flushChars(buf, pos);
michael@0 3644
michael@0 3645 returnState = state; // remember the current state as the one to come back to
michael@0 3646 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
michael@0 3647 break rawtextloop;
michael@0 3648 // FALL THRU into RAWTEXT_RCDATA_LESS_THAN_SIGN, equivalent to: continue stateloop;
michael@0 3649 case '\u0000':
michael@0 3650 emitReplacementCharacter(buf, pos);
michael@0 3651 continue;
michael@0 3652 case '\r':
michael@0 3653 emitCarriageReturn(buf, pos);
michael@0 3654 break stateloop; // NOTE(review): CR leaves stateloop instead of continuing - presumably so the caller can deal with a following LF; confirm
michael@0 3655 case '\n':
michael@0 3656 silentLineFeed(); // fall thru
michael@0 3657 default:
michael@0 3658 /*
michael@0 3659 * Emit the current input character as a
michael@0 3660 * character token. Stay in the RAWTEXT state.
michael@0 3661 */
michael@0 3662 continue;
michael@0 3663 }
michael@0 3664 }
michael@0 3665 // XXX fallthru don't reorder
michael@0 3666 case RAWTEXT_RCDATA_LESS_THAN_SIGN: // entered from RCDATA and RAWTEXT, which both store their own state in returnState first
michael@0 3667 rawtextrcdatalessthansignloop: for (;;) {
michael@0 3668 if (++pos == endPos) {
michael@0 3669 break stateloop;
michael@0 3670 }
michael@0 3671 c = checkChar(buf, pos);
michael@0 3672 switch (c) {
michael@0 3673 case '/':
michael@0 3674 /*
michael@0 3675 * U+002F SOLIDUS (/) Set the temporary buffer
michael@0 3676 * to the empty string and reset index. Switch
michael@0 3677 * to the NON_DATA_END_TAG_NAME state.
michael@0 3678 */
michael@0 3679 index = 0;
michael@0 3680 clearStrBuf();
michael@0 3681 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
michael@0 3682 break rawtextrcdatalessthansignloop;
michael@0 3683 // FALL THRU into NON_DATA_END_TAG_NAME, equivalent to: continue stateloop;
michael@0 3684 default:
michael@0 3685 /*
michael@0 3686 * Otherwise, emit a U+003C LESS-THAN SIGN
michael@0 3687 * character token
michael@0 3688 */
michael@0 3689 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0 3690 /*
michael@0 3691 * and reconsume the current input character in
michael@0 3692 * the state saved in returnState.
michael@0 3693 */
michael@0 3694 cstart = pos;
michael@0 3695 reconsume = true;
michael@0 3696 state = transition(state, returnState, reconsume, pos);
michael@0 3697 continue stateloop;
michael@0 3698 }
michael@0 3699 }
michael@0 3700 // XXX fall thru. don't reorder.
michael@0 3701 case NON_DATA_END_TAG_NAME: // matches the input against endTagExpectationAsArray one character per iteration
michael@0 3702 for (;;) {
michael@0 3703 if (++pos == endPos) {
michael@0 3704 break stateloop;
michael@0 3705 }
michael@0 3706 c = checkChar(buf, pos);
michael@0 3707 /*
michael@0 3708 * Precondition: whoever switches to this state sets index
michael@0 3709 * to 0 and calls clearStrBuf() first. The end tag name is
michael@0 3710 * matched one character at a time, without lookahead;
michael@0 3711 * strBuf is the 'temporary buffer'.
michael@0 3712 */
michael@0 3713 if (index < endTagExpectationAsArray.length) {
michael@0 3714 char e = endTagExpectationAsArray[index];
michael@0 3715 char folded = c;
michael@0 3716 if (c >= 'A' && c <= 'Z') {
michael@0 3717 folded += 0x20; // ASCII lower-case fold used for the comparison only; c itself is appended unchanged
michael@0 3718 }
michael@0 3719 if (folded != e) { // mismatch: emit "</" plus what was buffered and go back to returnState
michael@0 3720 // [NOCPP[
michael@0 3721 errHtml4LtSlashInRcdata(folded);
michael@0 3722 // ]NOCPP]
michael@0 3723 tokenHandler.characters(Tokenizer.LT_SOLIDUS,
michael@0 3724 0, 2);
michael@0 3725 emitStrBuf();
michael@0 3726 cstart = pos;
michael@0 3727 reconsume = true;
michael@0 3728 state = transition(state, returnState, reconsume, pos);
michael@0 3729 continue stateloop;
michael@0 3730 }
michael@0 3731 appendStrBuf(c);
michael@0 3732 index++;
michael@0 3733 continue;
michael@0 3734 } else { // every expected character matched; c is the first character after the name
michael@0 3735 endTag = true;
michael@0 3736 // XXX replace contentModelElement with different
michael@0 3737 // type
michael@0 3738 tagName = endTagExpectation;
michael@0 3739 switch (c) {
michael@0 3740 case '\r':
michael@0 3741 silentCarriageReturn();
michael@0 3742 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0 3743 break stateloop; // NOTE(review): CR leaves stateloop instead of continuing - presumably so the caller can deal with a following LF; confirm
michael@0 3744 case '\n':
michael@0 3745 silentLineFeed();
michael@0 3746 // fall thru
michael@0 3747 case ' ':
michael@0 3748 case '\t':
michael@0 3749 case '\u000C':
michael@0 3750 /*
michael@0 3751 * U+0009 CHARACTER TABULATION U+000A LINE
michael@0 3752 * FEED (LF) U+000C FORM FEED (FF) U+0020
michael@0 3753 * SPACE If the current end tag token is an
michael@0 3754 * appropriate end tag token, then switch to
michael@0 3755 * the before attribute name state.
michael@0 3756 */
michael@0 3757 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
michael@0 3758 continue stateloop;
michael@0 3759 case '/':
michael@0 3760 /*
michael@0 3761 * U+002F SOLIDUS (/) If the current end tag
michael@0 3762 * token is an appropriate end tag token,
michael@0 3763 * then switch to the self-closing start tag
michael@0 3764 * state.
michael@0 3765 */
michael@0 3766 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
michael@0 3767 continue stateloop;
michael@0 3768 case '>':
michael@0 3769 /*
michael@0 3770 * U+003E GREATER-THAN SIGN (>) If the
michael@0 3771 * current end tag token is an appropriate
michael@0 3772 * end tag token, then emit the current tag
michael@0 3773 * token and switch to the state that
michael@0 3774 * emitCurrentTagToken() returns.
michael@0 3775 */
michael@0 3776 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
michael@0 3777 if (shouldSuspend) {
michael@0 3778 break stateloop;
michael@0 3779 }
michael@0 3780 continue stateloop;
michael@0 3781 default:
michael@0 3782 /*
michael@0 3783 * Emit a U+003C LESS-THAN SIGN character
michael@0 3784 * token, a U+002F SOLIDUS character token,
michael@0 3785 * a character token for each of the
michael@0 3786 * characters in the temporary buffer (in
michael@0 3787 * the order they were added to the buffer),
michael@0 3788 * then continue in the state saved in
michael@0 3789 * returnState (reconsume is not set here).
michael@0 3790 */
michael@0 3791 // [NOCPP[
michael@0 3792 errWarnLtSlashInRcdata();
michael@0 3793 // ]NOCPP]
michael@0 3794 tokenHandler.characters(
michael@0 3795 Tokenizer.LT_SOLIDUS, 0, 2);
michael@0 3796 emitStrBuf();
michael@0 3797 if (c == '\u0000') {
michael@0 3798 emitReplacementCharacter(buf, pos);
michael@0 3799 } else {
michael@0 3800 cstart = pos; // don't drop the
michael@0 3801 // character
michael@0 3802 }
michael@0 3803 state = transition(state, returnState, reconsume, pos);
michael@0 3804 continue stateloop;
michael@0 3805 }
michael@0 3806 }
michael@0 3807 }
michael@0 3807 // XXX reorder point
michael@0 3808 // BEGIN HOTSPOT WORKAROUND
michael@0 3809 case BOGUS_COMMENT: // collects characters with appendLongStrBuf*() until '>' ends the comment
michael@0 3810 boguscommentloop: for (;;) {
michael@0 3811 if (reconsume) {
michael@0 3812 reconsume = false;
michael@0 3813 } else {
michael@0 3814 if (++pos == endPos) {
michael@0 3815 break stateloop;
michael@0 3816 }
michael@0 3817 c = checkChar(buf, pos);
michael@0 3818 }
michael@0 3819 /*
michael@0 3820 * Consume every character up to and including the first
michael@0 3821 * U+003E GREATER-THAN SIGN character (>) or the end of
michael@0 3822 * the file (EOF), whichever comes first. Emit a comment
michael@0 3823 * token whose data is the concatenation of all the
michael@0 3824 * characters starting from and including the character
michael@0 3825 * that caused the state machine to switch into the
michael@0 3826 * bogus comment state, up to and including the
michael@0 3827 * character immediately before the last consumed
michael@0 3828 * character (i.e. up to the character just before the
michael@0 3829 * U+003E or EOF character). (If the comment was started
michael@0 3830 * by the end of the file (EOF), the token is empty.)
michael@0 3831 *
michael@0 3832 * Switch to the data state.
michael@0 3833 *
michael@0 3834 * If the end of the file was reached, reconsume the EOF
michael@0 3835 * character.
michael@0 3836 */
michael@0 3837 switch (c) {
michael@0 3838 case '>':
michael@0 3839 emitComment(0, pos);
michael@0 3840 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 3841 continue stateloop;
michael@0 3842 case '-': // hyphens are tracked by a separate state, BOGUS_COMMENT_HYPHEN
michael@0 3843 appendLongStrBuf(c);
michael@0 3844 state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);
michael@0 3845 break boguscommentloop; // falls through to BOGUS_COMMENT_HYPHEN, the case that follows
michael@0 3846 case '\r':
michael@0 3847 appendLongStrBufCarriageReturn();
michael@0 3848 break stateloop; // NOTE(review): CR leaves stateloop instead of continuing - presumably so the caller can deal with a following LF; confirm
michael@0 3849 case '\n':
michael@0 3850 appendLongStrBufLineFeed();
michael@0 3851 continue;
michael@0 3852 case '\u0000':
michael@0 3853 c = '\uFFFD'; // NUL is stored as U+FFFD
michael@0 3854 // fall thru
michael@0 3855 default:
michael@0 3856 appendLongStrBuf(c);
michael@0 3857 continue;
michael@0 3858 }
michael@0 3859 }
michael@0 3860 // FALLTHRU DON'T REORDER
michael@0 3861 case BOGUS_COMMENT_HYPHEN: // entered after BOGUS_COMMENT has appended a '-'
michael@0 3862 boguscommenthyphenloop: for (;;) {
michael@0 3863 if (++pos == endPos) {
michael@0 3864 break stateloop;
michael@0 3865 }
michael@0 3866 c = checkChar(buf, pos);
michael@0 3867 switch (c) {
michael@0 3868 case '>':
michael@0 3869 // [NOCPP[
michael@0 3870 maybeAppendSpaceToBogusComment();
michael@0 3871 // ]NOCPP]
michael@0 3872 emitComment(0, pos);
michael@0 3873 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 3874 continue stateloop;
michael@0 3875 case '-': // another hyphen: stay in this state
michael@0 3876 appendSecondHyphenToBogusComment();
michael@0 3877 continue boguscommenthyphenloop;
michael@0 3878 case '\r':
michael@0 3879 appendLongStrBufCarriageReturn();
michael@0 3880 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0 3881 break stateloop; // NOTE(review): CR leaves stateloop instead of continuing - presumably so the caller can deal with a following LF; confirm
michael@0 3882 case '\n':
michael@0 3883 appendLongStrBufLineFeed();
michael@0 3884 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0 3885 continue stateloop;
michael@0 3886 case '\u0000':
michael@0 3887 c = '\uFFFD'; // NUL is stored as U+FFFD
michael@0 3888 // fall thru
michael@0 3889 default: // any other character goes into the comment and BOGUS_COMMENT takes over again
michael@0 3890 appendLongStrBuf(c);
michael@0 3891 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0 3892 continue stateloop;
michael@0 3893 }
michael@0 3894 }
michael@0 3895 // XXX reorder point
michael@0 3896 case SCRIPT_DATA: // only '<' leaves this state, by falling through to the case that follows
michael@0 3897 scriptdataloop: for (;;) {
michael@0 3898 if (reconsume) {
michael@0 3899 reconsume = false;
michael@0 3900 } else {
michael@0 3901 if (++pos == endPos) {
michael@0 3902 break stateloop;
michael@0 3903 }
michael@0 3904 c = checkChar(buf, pos);
michael@0 3905 }
michael@0 3906 switch (c) {
michael@0 3907 case '<':
michael@0 3908 /*
michael@0 3909 * U+003C LESS-THAN SIGN (<) Switch to the
michael@0 3910 * script data less-than sign state.
michael@0 3911 */
michael@0 3912 flushChars(buf, pos);
michael@0 3913 returnState = state; // remember the current state as the one to come back to
michael@0 3914 state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);
michael@0 3915 break scriptdataloop; // FALL THRU continue
michael@0 3916 // stateloop;
michael@0 3917 case '\u0000':
michael@0 3918 emitReplacementCharacter(buf, pos);
michael@0 3919 continue;
michael@0 3920 case '\r':
michael@0 3921 emitCarriageReturn(buf, pos);
michael@0 3922 break stateloop; // NOTE(review): CR leaves stateloop instead of continuing - presumably so the caller can deal with a following LF; confirm
michael@0 3923 case '\n':
michael@0 3924 silentLineFeed(); // fall thru
michael@0 3925 default:
michael@0 3926 /*
michael@0 3927 * Anything else Emit the current input
michael@0 3928 * character as a character token. Stay in the
michael@0 3929 * script data state.
michael@0 3930 */
michael@0 3931 continue;
michael@0 3932 }
michael@0 3933 }
michael@0 3934 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 3935 case SCRIPT_DATA_LESS_THAN_SIGN: // entered from SCRIPT_DATA, which stores its own state in returnState first
michael@0 3936 scriptdatalessthansignloop: for (;;) {
michael@0 3937 if (++pos == endPos) {
michael@0 3938 break stateloop;
michael@0 3939 }
michael@0 3940 c = checkChar(buf, pos);
michael@0 3941 switch (c) {
michael@0 3942 case '/':
michael@0 3943 /*
michael@0 3944 * U+002F SOLIDUS (/) Set the temporary buffer
michael@0 3945 * to the empty string and reset index. Switch
michael@0 3946 * to the NON_DATA_END_TAG_NAME state.
michael@0 3947 */
michael@0 3948 index = 0;
michael@0 3949 clearStrBuf();
michael@0 3950 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
michael@0 3951 continue stateloop;
michael@0 3952 case '!': // "<!": emit the '<' and go on to SCRIPT_DATA_ESCAPE_START
michael@0 3953 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0 3954 cstart = pos;
michael@0 3955 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);
michael@0 3956 break scriptdatalessthansignloop; // FALL THRU
michael@0 3957 // continue
michael@0 3958 // stateloop;
michael@0 3959 default:
michael@0 3960 /*
michael@0 3961 * Otherwise, emit a U+003C LESS-THAN SIGN
michael@0 3962 * character token
michael@0 3963 */
michael@0 3964 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0 3965 /*
michael@0 3966 * and reconsume the current input character in
michael@0 3967 * the script data state.
michael@0 3968 */
michael@0 3969 cstart = pos;
michael@0 3970 reconsume = true;
michael@0 3971 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
michael@0 3972 continue stateloop;
michael@0 3973 }
michael@0 3974 }
michael@0 3975 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 3976 case SCRIPT_DATA_ESCAPE_START: // entered after "<!" in script data; a '-' continues towards the escaped states
michael@0 3977 scriptdataescapestartloop: for (;;) {
michael@0 3978 if (++pos == endPos) {
michael@0 3979 break stateloop;
michael@0 3980 }
michael@0 3981 c = checkChar(buf, pos);
michael@0 3982 /*
michael@0 3983 * Consume the next input character:
michael@0 3984 */
michael@0 3985 switch (c) {
michael@0 3986 case '-':
michael@0 3987 /*
michael@0 3988 * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0 3989 * HYPHEN-MINUS character token. Switch to the
michael@0 3990 * script data escape start dash state.
michael@0 3991 */
michael@0 3992 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos); // NOTE(review): no explicit emit call here - presumably the '-' stays in the pending run that starts at cstart; confirm
michael@0 3993 break scriptdataescapestartloop; // FALL THRU
michael@0 3994 // continue
michael@0 3995 // stateloop;
michael@0 3996 default:
michael@0 3997 /*
michael@0 3998 * Anything else Reconsume the current input
michael@0 3999 * character in the script data state.
michael@0 4000 */
michael@0 4001 reconsume = true;
michael@0 4002 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
michael@0 4003 continue stateloop;
michael@0 4004 }
michael@0 4005 }
michael@0 4006 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 4007 case SCRIPT_DATA_ESCAPE_START_DASH: // entered after "<!-" in script data; a second '-' enters the escaped states
michael@0 4008 scriptdataescapestartdashloop: for (;;) {
michael@0 4009 if (++pos == endPos) {
michael@0 4010 break stateloop;
michael@0 4011 }
michael@0 4012 c = checkChar(buf, pos);
michael@0 4013 /*
michael@0 4014 * Consume the next input character:
michael@0 4015 */
michael@0 4016 switch (c) {
michael@0 4017 case '-':
michael@0 4018 /*
michael@0 4019 * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0 4020 * HYPHEN-MINUS character token. Switch to the
michael@0 4021 * script data escaped dash dash state.
michael@0 4022 */
michael@0 4023 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
michael@0 4024 break scriptdataescapestartdashloop;
michael@0 4025 // FALL THRU into SCRIPT_DATA_ESCAPED_DASH_DASH, equivalent to: continue stateloop;
michael@0 4026 default:
michael@0 4027 /*
michael@0 4028 * Anything else Reconsume the current input
michael@0 4029 * character in the script data state.
michael@0 4030 */
michael@0 4031 reconsume = true;
michael@0 4032 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
michael@0 4033 continue stateloop;
michael@0 4034 }
michael@0 4035 }
michael@0 4036 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 4037 case SCRIPT_DATA_ESCAPED_DASH_DASH: // further '-' stay here, '>' returns to SCRIPT_DATA, most other input moves to SCRIPT_DATA_ESCAPED
michael@0 4038 scriptdataescapeddashdashloop: for (;;) {
michael@0 4039 if (++pos == endPos) {
michael@0 4040 break stateloop;
michael@0 4041 }
michael@0 4042 c = checkChar(buf, pos);
michael@0 4043 /*
michael@0 4044 * Consume the next input character:
michael@0 4045 */
michael@0 4046 switch (c) {
michael@0 4047 case '-':
michael@0 4048 /*
michael@0 4049 * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0 4050 * HYPHEN-MINUS character token. Stay in the
michael@0 4051 * script data escaped dash dash state.
michael@0 4052 */
michael@0 4053 continue;
michael@0 4054 case '<':
michael@0 4055 /*
michael@0 4056 * U+003C LESS-THAN SIGN (<) Switch to the
michael@0 4057 * script data escaped less-than sign state.
michael@0 4058 */
michael@0 4059 flushChars(buf, pos);
michael@0 4060 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
michael@0 4061 continue stateloop;
michael@0 4062 case '>':
michael@0 4063 /*
michael@0 4064 * U+003E GREATER-THAN SIGN (>) Emit a U+003E
michael@0 4065 * GREATER-THAN SIGN character token. Switch to
michael@0 4066 * the script data state.
michael@0 4067 */
michael@0 4068 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
michael@0 4069 continue stateloop;
michael@0 4070 case '\u0000':
michael@0 4071 emitReplacementCharacter(buf, pos);
michael@0 4072 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0 4073 break scriptdataescapeddashdashloop; // falls through to SCRIPT_DATA_ESCAPED, the case that follows
michael@0 4074 case '\r':
michael@0 4075 emitCarriageReturn(buf, pos);
michael@0 4076 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0 4077 break stateloop; // NOTE(review): CR leaves stateloop instead of continuing - presumably so the caller can deal with a following LF; confirm
michael@0 4078 case '\n':
michael@0 4079 silentLineFeed(); // fall thru
michael@0 4080 default:
michael@0 4081 /*
michael@0 4082 * Anything else Emit the current input
michael@0 4083 * character as a character token. Switch to the
michael@0 4084 * script data escaped state.
michael@0 4085 */
michael@0 4086 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0 4087 break scriptdataescapeddashdashloop;
michael@0 4088 // FALL THRU into SCRIPT_DATA_ESCAPED, equivalent to: continue stateloop;
michael@0 4089 }
michael@0 4090 }
michael@0 4091 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 4092 case SCRIPT_DATA_ESCAPED: // only '-' and '<' leave this state; every other character keeps the loop running
michael@0 4093 scriptdataescapedloop: for (;;) {
michael@0 4094 if (reconsume) {
michael@0 4095 reconsume = false;
michael@0 4096 } else {
michael@0 4097 if (++pos == endPos) {
michael@0 4098 break stateloop;
michael@0 4099 }
michael@0 4100 c = checkChar(buf, pos);
michael@0 4101 }
michael@0 4102 /*
michael@0 4103 * Consume the next input character:
michael@0 4104 */
michael@0 4105 switch (c) {
michael@0 4106 case '-':
michael@0 4107 /*
michael@0 4108 * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0 4109 * HYPHEN-MINUS character token. Switch to the
michael@0 4110 * script data escaped dash state.
michael@0 4111 */
michael@0 4112 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);
michael@0 4113 break scriptdataescapedloop; // FALL THRU
michael@0 4114 // continue
michael@0 4115 // stateloop;
michael@0 4116 case '<':
michael@0 4117 /*
michael@0 4118 * U+003C LESS-THAN SIGN (<) Switch to the
michael@0 4119 * script data escaped less-than sign state.
michael@0 4120 */
michael@0 4121 flushChars(buf, pos);
michael@0 4122 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
michael@0 4123 continue stateloop;
michael@0 4124 case '\u0000':
michael@0 4125 emitReplacementCharacter(buf, pos);
michael@0 4126 continue;
michael@0 4127 case '\r':
michael@0 4128 emitCarriageReturn(buf, pos);
michael@0 4129 break stateloop; // NOTE(review): CR leaves stateloop instead of continuing - presumably so the caller can deal with a following LF; confirm
michael@0 4130 case '\n':
michael@0 4131 silentLineFeed(); // fall thru
michael@0 4132 default:
michael@0 4133 /*
michael@0 4134 * Anything else Emit the current input
michael@0 4135 * character as a character token. Stay in the
michael@0 4136 * script data escaped state.
michael@0 4137 */
michael@0 4138 continue;
michael@0 4139 }
michael@0 4140 }
michael@0 4141 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 4142 case SCRIPT_DATA_ESCAPED_DASH: // entered from SCRIPT_DATA_ESCAPED on a '-'; every branch below leaves this state
michael@0 4143 scriptdataescapeddashloop: for (;;) {
michael@0 4144 if (++pos == endPos) {
michael@0 4145 break stateloop;
michael@0 4146 }
michael@0 4147 c = checkChar(buf, pos);
michael@0 4148 /*
michael@0 4149 * Consume the next input character:
michael@0 4150 */
michael@0 4151 switch (c) {
michael@0 4152 case '-':
michael@0 4153 /*
michael@0 4154 * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0 4155 * HYPHEN-MINUS character token. Switch to the
michael@0 4156 * script data escaped dash dash state.
michael@0 4157 */
michael@0 4158 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
michael@0 4159 continue stateloop;
michael@0 4160 case '<':
michael@0 4161 /*
michael@0 4162 * U+003C LESS-THAN SIGN (<) Switch to the
michael@0 4163 * script data escaped less-than sign state.
michael@0 4164 */
michael@0 4165 flushChars(buf, pos);
michael@0 4166 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
michael@0 4167 break scriptdataescapeddashloop;
michael@0 4168 // FALL THRU into SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, equivalent to: continue stateloop;
michael@0 4169 case '\u0000':
michael@0 4170 emitReplacementCharacter(buf, pos);
michael@0 4171 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0 4172 continue stateloop;
michael@0 4173 case '\r':
michael@0 4174 emitCarriageReturn(buf, pos);
michael@0 4175 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0 4176 break stateloop; // NOTE(review): CR leaves stateloop instead of continuing - presumably so the caller can deal with a following LF; confirm
michael@0 4177 case '\n':
michael@0 4178 silentLineFeed(); // fall thru
michael@0 4179 default:
michael@0 4180 /*
michael@0 4181 * Anything else Emit the current input
michael@0 4182 * character as a character token. Switch to the
michael@0 4183 * script data escaped state.
michael@0 4184 */
michael@0 4185 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0 4186 continue stateloop;
michael@0 4187 }
michael@0 4188 }
michael@0 4189 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 4190 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
michael@0 4191 scriptdataescapedlessthanloop: for (;;) {
michael@0 4192 if (++pos == endPos) {
michael@0 4193 break stateloop;
michael@0 4194 }
michael@0 4195 c = checkChar(buf, pos);
michael@0 4196 /*
michael@0 4197 * Consume the next input character:
michael@0 4198 */
michael@0 4199 switch (c) {
michael@0 4200 case '/':
michael@0 4201 /*
michael@0 4202 * U+002F SOLIDUS (/) Set the temporary buffer
michael@0 4203 * to the empty string. Switch to the script
michael@0 4204 * data escaped end tag open state.
michael@0 4205 */
michael@0 4206 index = 0;
michael@0 4207 clearStrBuf();
michael@0 4208 returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
michael@0 4209 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
michael@0 4210 continue stateloop;
michael@0 4211 case 'S':
michael@0 4212 case 's':
michael@0 4213 /*
michael@0 4214 * U+0041 LATIN CAPITAL LETTER A through to
michael@0 4215 * U+005A LATIN CAPITAL LETTER Z Emit a U+003C
michael@0 4216 * LESS-THAN SIGN character token and the
michael@0 4217 * current input character as a character token.
michael@0 4218 */
michael@0 4219 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0 4220 cstart = pos;
michael@0 4221 index = 1;
michael@0 4222 /*
michael@0 4223 * Set the temporary buffer to the empty string.
michael@0 4224 * Append the lowercase version of the current
michael@0 4225 * input character (add 0x0020 to the
michael@0 4226 * character's code point) to the temporary
michael@0 4227 * buffer. Switch to the script data double
michael@0 4228 * escape start state.
michael@0 4229 */
michael@0 4230 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);
michael@0 4231 break scriptdataescapedlessthanloop;
michael@0 4232 // continue stateloop;
michael@0 4233 default:
michael@0 4234 /*
michael@0 4235 * Anything else Emit a U+003C LESS-THAN SIGN
michael@0 4236 * character token and reconsume the current
michael@0 4237 * input character in the script data escaped
michael@0 4238 * state.
michael@0 4239 */
michael@0 4240 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0 4241 cstart = pos;
michael@0 4242 reconsume = true;
michael@0 4243 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0 4244 continue stateloop;
michael@0 4245 }
michael@0 4246 }
michael@0 4247 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 4248 case SCRIPT_DATA_DOUBLE_ESCAPE_START:
michael@0 4249 scriptdatadoubleescapestartloop: for (;;) {
michael@0 4250 if (++pos == endPos) {
michael@0 4251 break stateloop;
michael@0 4252 }
michael@0 4253 c = checkChar(buf, pos);
michael@0 4254 assert index > 0;
michael@0 4255 if (index < 6) { // SCRIPT_ARR.length
michael@0 4256 char folded = c;
michael@0 4257 if (c >= 'A' && c <= 'Z') {
michael@0 4258 folded += 0x20;
michael@0 4259 }
michael@0 4260 if (folded != Tokenizer.SCRIPT_ARR[index]) {
michael@0 4261 reconsume = true;
michael@0 4262 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0 4263 continue stateloop;
michael@0 4264 }
michael@0 4265 index++;
michael@0 4266 continue;
michael@0 4267 }
michael@0 4268 switch (c) {
michael@0 4269 case '\r':
michael@0 4270 emitCarriageReturn(buf, pos);
michael@0 4271 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0 4272 break stateloop;
michael@0 4273 case '\n':
michael@0 4274 silentLineFeed();
michael@0 4275 case ' ':
michael@0 4276 case '\t':
michael@0 4277 case '\u000C':
michael@0 4278 case '/':
michael@0 4279 case '>':
michael@0 4280 /*
michael@0 4281 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 4282 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0 4283 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
michael@0 4284 * (>) Emit the current input character as a
michael@0 4285 * character token. If the temporary buffer is
michael@0 4286 * the string "script", then switch to the
michael@0 4287 * script data double escaped state.
michael@0 4288 */
michael@0 4289 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0 4290 break scriptdatadoubleescapestartloop;
michael@0 4291 // continue stateloop;
michael@0 4292 default:
michael@0 4293 /*
michael@0 4294 * Anything else Reconsume the current input
michael@0 4295 * character in the script data escaped state.
michael@0 4296 */
michael@0 4297 reconsume = true;
michael@0 4298 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0 4299 continue stateloop;
michael@0 4300 }
michael@0 4301 }
michael@0 4302 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 4303 case SCRIPT_DATA_DOUBLE_ESCAPED:
michael@0 4304 scriptdatadoubleescapedloop: for (;;) {
michael@0 4305 if (reconsume) {
michael@0 4306 reconsume = false;
michael@0 4307 } else {
michael@0 4308 if (++pos == endPos) {
michael@0 4309 break stateloop;
michael@0 4310 }
michael@0 4311 c = checkChar(buf, pos);
michael@0 4312 }
michael@0 4313 /*
michael@0 4314 * Consume the next input character:
michael@0 4315 */
michael@0 4316 switch (c) {
michael@0 4317 case '-':
michael@0 4318 /*
michael@0 4319 * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0 4320 * HYPHEN-MINUS character token. Switch to the
michael@0 4321 * script data double escaped dash state.
michael@0 4322 */
michael@0 4323 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);
michael@0 4324 break scriptdatadoubleescapedloop; // FALL THRU
michael@0 4325 // continue
michael@0 4326 // stateloop;
michael@0 4327 case '<':
michael@0 4328 /*
michael@0 4329 * U+003C LESS-THAN SIGN (<) Emit a U+003C
michael@0 4330 * LESS-THAN SIGN character token. Switch to the
michael@0 4331 * script data double escaped less-than sign
michael@0 4332 * state.
michael@0 4333 */
michael@0 4334 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
michael@0 4335 continue stateloop;
michael@0 4336 case '\u0000':
michael@0 4337 emitReplacementCharacter(buf, pos);
michael@0 4338 continue;
michael@0 4339 case '\r':
michael@0 4340 emitCarriageReturn(buf, pos);
michael@0 4341 break stateloop;
michael@0 4342 case '\n':
michael@0 4343 silentLineFeed();
michael@0 4344 default:
michael@0 4345 /*
michael@0 4346 * Anything else Emit the current input
michael@0 4347 * character as a character token. Stay in the
michael@0 4348 * script data double escaped state.
michael@0 4349 */
michael@0 4350 continue;
michael@0 4351 }
michael@0 4352 }
michael@0 4353 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 4354 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
michael@0 4355 scriptdatadoubleescapeddashloop: for (;;) {
michael@0 4356 if (++pos == endPos) {
michael@0 4357 break stateloop;
michael@0 4358 }
michael@0 4359 c = checkChar(buf, pos);
michael@0 4360 /*
michael@0 4361 * Consume the next input character:
michael@0 4362 */
michael@0 4363 switch (c) {
michael@0 4364 case '-':
michael@0 4365 /*
michael@0 4366 * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0 4367 * HYPHEN-MINUS character token. Switch to the
michael@0 4368 * script data double escaped dash dash state.
michael@0 4369 */
michael@0 4370 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);
michael@0 4371 break scriptdatadoubleescapeddashloop;
michael@0 4372 // continue stateloop;
michael@0 4373 case '<':
michael@0 4374 /*
michael@0 4375 * U+003C LESS-THAN SIGN (<) Emit a U+003C
michael@0 4376 * LESS-THAN SIGN character token. Switch to the
michael@0 4377 * script data double escaped less-than sign
michael@0 4378 * state.
michael@0 4379 */
michael@0 4380 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
michael@0 4381 continue stateloop;
michael@0 4382 case '\u0000':
michael@0 4383 emitReplacementCharacter(buf, pos);
michael@0 4384 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0 4385 continue stateloop;
michael@0 4386 case '\r':
michael@0 4387 emitCarriageReturn(buf, pos);
michael@0 4388 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0 4389 break stateloop;
michael@0 4390 case '\n':
michael@0 4391 silentLineFeed();
michael@0 4392 default:
michael@0 4393 /*
michael@0 4394 * Anything else Emit the current input
michael@0 4395 * character as a character token. Switch to the
michael@0 4396 * script data double escaped state.
michael@0 4397 */
michael@0 4398 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0 4399 continue stateloop;
michael@0 4400 }
michael@0 4401 }
michael@0 4402 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 4403 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
michael@0 4404 scriptdatadoubleescapeddashdashloop: for (;;) {
michael@0 4405 if (++pos == endPos) {
michael@0 4406 break stateloop;
michael@0 4407 }
michael@0 4408 c = checkChar(buf, pos);
michael@0 4409 /*
michael@0 4410 * Consume the next input character:
michael@0 4411 */
michael@0 4412 switch (c) {
michael@0 4413 case '-':
michael@0 4414 /*
michael@0 4415 * U+002D HYPHEN-MINUS (-) Emit a U+002D
michael@0 4416 * HYPHEN-MINUS character token. Stay in the
michael@0 4417 * script data double escaped dash dash state.
michael@0 4418 */
michael@0 4419 continue;
michael@0 4420 case '<':
michael@0 4421 /*
michael@0 4422 * U+003C LESS-THAN SIGN (<) Emit a U+003C
michael@0 4423 * LESS-THAN SIGN character token. Switch to the
michael@0 4424 * script data double escaped less-than sign
michael@0 4425 * state.
michael@0 4426 */
michael@0 4427 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
michael@0 4428 break scriptdatadoubleescapeddashdashloop;
michael@0 4429 case '>':
michael@0 4430 /*
michael@0 4431 * U+003E GREATER-THAN SIGN (>) Emit a U+003E
michael@0 4432 * GREATER-THAN SIGN character token. Switch to
michael@0 4433 * the script data state.
michael@0 4434 */
michael@0 4435 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
michael@0 4436 continue stateloop;
michael@0 4437 case '\u0000':
michael@0 4438 emitReplacementCharacter(buf, pos);
michael@0 4439 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0 4440 continue stateloop;
michael@0 4441 case '\r':
michael@0 4442 emitCarriageReturn(buf, pos);
michael@0 4443 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0 4444 break stateloop;
michael@0 4445 case '\n':
michael@0 4446 silentLineFeed();
michael@0 4447 default:
michael@0 4448 /*
michael@0 4449 * Anything else Emit the current input
michael@0 4450 * character as a character token. Switch to the
michael@0 4451 * script data double escaped state.
michael@0 4452 */
michael@0 4453 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0 4454 continue stateloop;
michael@0 4455 }
michael@0 4456 }
michael@0 4457 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 4458 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
michael@0 4459 scriptdatadoubleescapedlessthanloop: for (;;) {
michael@0 4460 if (++pos == endPos) {
michael@0 4461 break stateloop;
michael@0 4462 }
michael@0 4463 c = checkChar(buf, pos);
michael@0 4464 /*
michael@0 4465 * Consume the next input character:
michael@0 4466 */
michael@0 4467 switch (c) {
michael@0 4468 case '/':
michael@0 4469 /*
michael@0 4470 * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS
michael@0 4471 * character token. Set the temporary buffer to
michael@0 4472 * the empty string. Switch to the script data
michael@0 4473 * double escape end state.
michael@0 4474 */
michael@0 4475 index = 0;
michael@0 4476 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);
michael@0 4477 break scriptdatadoubleescapedlessthanloop;
michael@0 4478 default:
michael@0 4479 /*
michael@0 4480 * Anything else Reconsume the current input
michael@0 4481 * character in the script data double escaped
michael@0 4482 * state.
michael@0 4483 */
michael@0 4484 reconsume = true;
michael@0 4485 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0 4486 continue stateloop;
michael@0 4487 }
michael@0 4488 }
michael@0 4489 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 4490 case SCRIPT_DATA_DOUBLE_ESCAPE_END:
michael@0 4491 scriptdatadoubleescapeendloop: for (;;) {
michael@0 4492 if (++pos == endPos) {
michael@0 4493 break stateloop;
michael@0 4494 }
michael@0 4495 c = checkChar(buf, pos);
michael@0 4496 if (index < 6) { // SCRIPT_ARR.length
michael@0 4497 char folded = c;
michael@0 4498 if (c >= 'A' && c <= 'Z') {
michael@0 4499 folded += 0x20;
michael@0 4500 }
michael@0 4501 if (folded != Tokenizer.SCRIPT_ARR[index]) {
michael@0 4502 reconsume = true;
michael@0 4503 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0 4504 continue stateloop;
michael@0 4505 }
michael@0 4506 index++;
michael@0 4507 continue;
michael@0 4508 }
michael@0 4509 switch (c) {
michael@0 4510 case '\r':
michael@0 4511 emitCarriageReturn(buf, pos);
michael@0 4512 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0 4513 break stateloop;
michael@0 4514 case '\n':
michael@0 4515 silentLineFeed();
michael@0 4516 case ' ':
michael@0 4517 case '\t':
michael@0 4518 case '\u000C':
michael@0 4519 case '/':
michael@0 4520 case '>':
michael@0 4521 /*
michael@0 4522 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 4523 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0 4524 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
michael@0 4525 * (>) Emit the current input character as a
michael@0 4526 * character token. If the temporary buffer is
michael@0 4527 * the string "script", then switch to the
michael@0 4528 * script data escaped state.
michael@0 4529 */
michael@0 4530 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
michael@0 4531 continue stateloop;
michael@0 4532 default:
michael@0 4533 /*
michael@0 4534 * Reconsume the current input character in the
michael@0 4535 * script data double escaped state.
michael@0 4536 */
michael@0 4537 reconsume = true;
michael@0 4538 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
michael@0 4539 continue stateloop;
michael@0 4540 }
michael@0 4541 }
michael@0 4542 // XXX reorder point
michael@0 4543 case MARKUP_DECLARATION_OCTYPE:
michael@0 4544 markupdeclarationdoctypeloop: for (;;) {
michael@0 4545 if (++pos == endPos) {
michael@0 4546 break stateloop;
michael@0 4547 }
michael@0 4548 c = checkChar(buf, pos);
michael@0 4549 if (index < 6) { // OCTYPE.length
michael@0 4550 char folded = c;
michael@0 4551 if (c >= 'A' && c <= 'Z') {
michael@0 4552 folded += 0x20;
michael@0 4553 }
michael@0 4554 if (folded == Tokenizer.OCTYPE[index]) {
michael@0 4555 appendLongStrBuf(c);
michael@0 4556 } else {
michael@0 4557 errBogusComment();
michael@0 4558 reconsume = true;
michael@0 4559 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
michael@0 4560 continue stateloop;
michael@0 4561 }
michael@0 4562 index++;
michael@0 4563 continue;
michael@0 4564 } else {
michael@0 4565 reconsume = true;
michael@0 4566 state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);
michael@0 4567 break markupdeclarationdoctypeloop;
michael@0 4568 // continue stateloop;
michael@0 4569 }
michael@0 4570 }
michael@0 4571 // FALLTHRU DON'T REORDER
michael@0 4572 case DOCTYPE:
michael@0 4573 doctypeloop: for (;;) {
michael@0 4574 if (reconsume) {
michael@0 4575 reconsume = false;
michael@0 4576 } else {
michael@0 4577 if (++pos == endPos) {
michael@0 4578 break stateloop;
michael@0 4579 }
michael@0 4580 c = checkChar(buf, pos);
michael@0 4581 }
michael@0 4582 initDoctypeFields();
michael@0 4583 /*
michael@0 4584 * Consume the next input character:
michael@0 4585 */
michael@0 4586 switch (c) {
michael@0 4587 case '\r':
michael@0 4588 silentCarriageReturn();
michael@0 4589 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
michael@0 4590 break stateloop;
michael@0 4591 case '\n':
michael@0 4592 silentLineFeed();
michael@0 4593 // fall thru
michael@0 4594 case ' ':
michael@0 4595 case '\t':
michael@0 4596 case '\u000C':
michael@0 4597 /*
michael@0 4598 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 4599 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0 4600 * Switch to the before DOCTYPE name state.
michael@0 4601 */
michael@0 4602 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
michael@0 4603 break doctypeloop;
michael@0 4604 // continue stateloop;
michael@0 4605 default:
michael@0 4606 /*
michael@0 4607 * Anything else Parse error.
michael@0 4608 */
michael@0 4609 errMissingSpaceBeforeDoctypeName();
michael@0 4610 /*
michael@0 4611 * Reconsume the current character in the before
michael@0 4612 * DOCTYPE name state.
michael@0 4613 */
michael@0 4614 reconsume = true;
michael@0 4615 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
michael@0 4616 break doctypeloop;
michael@0 4617 // continue stateloop;
michael@0 4618 }
michael@0 4619 }
michael@0 4620 // FALLTHRU DON'T REORDER
michael@0 4621 case BEFORE_DOCTYPE_NAME:
michael@0 4622 beforedoctypenameloop: for (;;) {
michael@0 4623 if (reconsume) {
michael@0 4624 reconsume = false;
michael@0 4625 } else {
michael@0 4626 if (++pos == endPos) {
michael@0 4627 break stateloop;
michael@0 4628 }
michael@0 4629 c = checkChar(buf, pos);
michael@0 4630 }
michael@0 4631 /*
michael@0 4632 * Consume the next input character:
michael@0 4633 */
michael@0 4634 switch (c) {
michael@0 4635 case '\r':
michael@0 4636 silentCarriageReturn();
michael@0 4637 break stateloop;
michael@0 4638 case '\n':
michael@0 4639 silentLineFeed();
michael@0 4640 // fall thru
michael@0 4641 case ' ':
michael@0 4642 case '\t':
michael@0 4643 case '\u000C':
michael@0 4644 /*
michael@0 4645 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 4646 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0 4647 * in the before DOCTYPE name state.
michael@0 4648 */
michael@0 4649 continue;
michael@0 4650 case '>':
michael@0 4651 /*
michael@0 4652 * U+003E GREATER-THAN SIGN (>) Parse error.
michael@0 4653 */
michael@0 4654 errNamelessDoctype();
michael@0 4655 /*
michael@0 4656 * Create a new DOCTYPE token. Set its
michael@0 4657 * force-quirks flag to on.
michael@0 4658 */
michael@0 4659 forceQuirks = true;
michael@0 4660 /*
michael@0 4661 * Emit the token.
michael@0 4662 */
michael@0 4663 emitDoctypeToken(pos);
michael@0 4664 /*
michael@0 4665 * Switch to the data state.
michael@0 4666 */
michael@0 4667 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 4668 continue stateloop;
michael@0 4669 case '\u0000':
michael@0 4670 c = '\uFFFD';
michael@0 4671 // fall thru
michael@0 4672 default:
michael@0 4673 if (c >= 'A' && c <= 'Z') {
michael@0 4674 /*
michael@0 4675 * U+0041 LATIN CAPITAL LETTER A through to
michael@0 4676 * U+005A LATIN CAPITAL LETTER Z Create a
michael@0 4677 * new DOCTYPE token. Set the token's name
michael@0 4678 * to the lowercase version of the input
michael@0 4679 * character (add 0x0020 to the character's
michael@0 4680 * code point).
michael@0 4681 */
michael@0 4682 c += 0x20;
michael@0 4683 }
michael@0 4684 /* Anything else Create a new DOCTYPE token. */
michael@0 4685 /*
michael@0 4686 * Set the token's name to the current
michael@0 4687 * input character.
michael@0 4688 */
michael@0 4689 clearStrBufAndAppend(c);
michael@0 4690 /*
michael@0 4691 * Switch to the DOCTYPE name state.
michael@0 4692 */
michael@0 4693 state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);
michael@0 4694 break beforedoctypenameloop;
michael@0 4695 // continue stateloop;
michael@0 4696 }
michael@0 4697 }
michael@0 4698 // FALLTHRU DON'T REORDER
michael@0 4699 case DOCTYPE_NAME:
michael@0 4700 doctypenameloop: for (;;) {
michael@0 4701 if (++pos == endPos) {
michael@0 4702 break stateloop;
michael@0 4703 }
michael@0 4704 c = checkChar(buf, pos);
michael@0 4705 /*
michael@0 4706 * Consume the next input character:
michael@0 4707 */
michael@0 4708 switch (c) {
michael@0 4709 case '\r':
michael@0 4710 silentCarriageReturn();
michael@0 4711 strBufToDoctypeName();
michael@0 4712 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
michael@0 4713 break stateloop;
michael@0 4714 case '\n':
michael@0 4715 silentLineFeed();
michael@0 4716 // fall thru
michael@0 4717 case ' ':
michael@0 4718 case '\t':
michael@0 4719 case '\u000C':
michael@0 4720 /*
michael@0 4721 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 4722 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0 4723 * Switch to the after DOCTYPE name state.
michael@0 4724 */
michael@0 4725 strBufToDoctypeName();
michael@0 4726 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
michael@0 4727 break doctypenameloop;
michael@0 4728 // continue stateloop;
michael@0 4729 case '>':
michael@0 4730 /*
michael@0 4731 * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0 4732 * DOCTYPE token.
michael@0 4733 */
michael@0 4734 strBufToDoctypeName();
michael@0 4735 emitDoctypeToken(pos);
michael@0 4736 /*
michael@0 4737 * Switch to the data state.
michael@0 4738 */
michael@0 4739 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 4740 continue stateloop;
michael@0 4741 case '\u0000':
michael@0 4742 c = '\uFFFD';
michael@0 4743 // fall thru
michael@0 4744 default:
michael@0 4745 /*
michael@0 4746 * U+0041 LATIN CAPITAL LETTER A through to
michael@0 4747 * U+005A LATIN CAPITAL LETTER Z Append the
michael@0 4748 * lowercase version of the input character (add
michael@0 4749 * 0x0020 to the character's code point) to the
michael@0 4750 * current DOCTYPE token's name.
michael@0 4751 */
michael@0 4752 if (c >= 'A' && c <= 'Z') {
michael@0 4753 c += 0x0020;
michael@0 4754 }
michael@0 4755 /*
michael@0 4756 * Anything else Append the current input
michael@0 4757 * character to the current DOCTYPE token's
michael@0 4758 * name.
michael@0 4759 */
michael@0 4760 appendStrBuf(c);
michael@0 4761 /*
michael@0 4762 * Stay in the DOCTYPE name state.
michael@0 4763 */
michael@0 4764 continue;
michael@0 4765 }
michael@0 4766 }
michael@0 4767 // FALLTHRU DON'T REORDER
michael@0 4768 case AFTER_DOCTYPE_NAME:
michael@0 4769 afterdoctypenameloop: for (;;) {
michael@0 4770 if (++pos == endPos) {
michael@0 4771 break stateloop;
michael@0 4772 }
michael@0 4773 c = checkChar(buf, pos);
michael@0 4774 /*
michael@0 4775 * Consume the next input character:
michael@0 4776 */
michael@0 4777 switch (c) {
michael@0 4778 case '\r':
michael@0 4779 silentCarriageReturn();
michael@0 4780 break stateloop;
michael@0 4781 case '\n':
michael@0 4782 silentLineFeed();
michael@0 4783 // fall thru
michael@0 4784 case ' ':
michael@0 4785 case '\t':
michael@0 4786 case '\u000C':
michael@0 4787 /*
michael@0 4788 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 4789 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0 4790 * in the after DOCTYPE name state.
michael@0 4791 */
michael@0 4792 continue;
michael@0 4793 case '>':
michael@0 4794 /*
michael@0 4795 * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0 4796 * DOCTYPE token.
michael@0 4797 */
michael@0 4798 emitDoctypeToken(pos);
michael@0 4799 /*
michael@0 4800 * Switch to the data state.
michael@0 4801 */
michael@0 4802 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 4803 continue stateloop;
michael@0 4804 case 'p':
michael@0 4805 case 'P':
michael@0 4806 index = 0;
michael@0 4807 state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);
michael@0 4808 break afterdoctypenameloop;
michael@0 4809 // continue stateloop;
michael@0 4810 case 's':
michael@0 4811 case 'S':
michael@0 4812 index = 0;
michael@0 4813 state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);
michael@0 4814 continue stateloop;
michael@0 4815 default:
michael@0 4816 /*
michael@0 4817 * Otherwise, this is a parse error.
michael@0 4818 */
michael@0 4819 bogusDoctype();
michael@0 4820
michael@0 4821 /*
michael@0 4822 * Set the DOCTYPE token's force-quirks flag to
michael@0 4823 * on.
michael@0 4824 */
michael@0 4825 // done by bogusDoctype();
michael@0 4826 /*
michael@0 4827 * Switch to the bogus DOCTYPE state.
michael@0 4828 */
michael@0 4829 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0 4830 continue stateloop;
michael@0 4831 }
michael@0 4832 }
michael@0 4833 // FALLTHRU DON'T REORDER
michael@0 4834 case DOCTYPE_UBLIC:
michael@0 4835 doctypeublicloop: for (;;) {
michael@0 4836 if (++pos == endPos) {
michael@0 4837 break stateloop;
michael@0 4838 }
michael@0 4839 c = checkChar(buf, pos);
michael@0 4840 /*
michael@0 4841 * If the six characters starting from the current input
michael@0 4842 * character are an ASCII case-insensitive match for the
michael@0 4843 * word "PUBLIC", then consume those characters and
michael@0 4844 * switch to the before DOCTYPE public identifier state.
michael@0 4845 */
michael@0 4846 if (index < 5) { // UBLIC.length
michael@0 4847 char folded = c;
michael@0 4848 if (c >= 'A' && c <= 'Z') {
michael@0 4849 folded += 0x20;
michael@0 4850 }
michael@0 4851 if (folded != Tokenizer.UBLIC[index]) {
michael@0 4852 bogusDoctype();
michael@0 4853 // forceQuirks = true;
michael@0 4854 reconsume = true;
michael@0 4855 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0 4856 continue stateloop;
michael@0 4857 }
michael@0 4858 index++;
michael@0 4859 continue;
michael@0 4860 } else {
michael@0 4861 reconsume = true;
michael@0 4862 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);
michael@0 4863 break doctypeublicloop;
michael@0 4864 // continue stateloop;
michael@0 4865 }
michael@0 4866 }
michael@0 4867 // FALLTHRU DON'T REORDER
michael@0 4868 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
michael@0 4869 afterdoctypepublickeywordloop: for (;;) {
michael@0 4870 if (reconsume) {
michael@0 4871 reconsume = false;
michael@0 4872 } else {
michael@0 4873 if (++pos == endPos) {
michael@0 4874 break stateloop;
michael@0 4875 }
michael@0 4876 c = checkChar(buf, pos);
michael@0 4877 }
michael@0 4878 /*
michael@0 4879 * Consume the next input character:
michael@0 4880 */
michael@0 4881 switch (c) {
michael@0 4882 case '\r':
michael@0 4883 silentCarriageReturn();
michael@0 4884 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
michael@0 4885 break stateloop;
michael@0 4886 case '\n':
michael@0 4887 silentLineFeed();
michael@0 4888 // fall thru
michael@0 4889 case ' ':
michael@0 4890 case '\t':
michael@0 4891 case '\u000C':
michael@0 4892 /*
michael@0 4893 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 4894 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0 4895 * Switch to the before DOCTYPE public
michael@0 4896 * identifier state.
michael@0 4897 */
michael@0 4898 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
michael@0 4899 break afterdoctypepublickeywordloop;
michael@0 4900 // FALL THROUGH continue stateloop
michael@0 4901 case '"':
michael@0 4902 /*
michael@0 4903 * U+0022 QUOTATION MARK (") Parse Error.
michael@0 4904 */
michael@0 4905 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
michael@0 4906 /*
michael@0 4907 * Set the DOCTYPE token's public identifier to
michael@0 4908 * the empty string (not missing),
michael@0 4909 */
michael@0 4910 clearLongStrBuf();
michael@0 4911 /*
michael@0 4912 * then switch to the DOCTYPE public identifier
michael@0 4913 * (double-quoted) state.
michael@0 4914 */
michael@0 4915 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
michael@0 4916 continue stateloop;
michael@0 4917 case '\'':
michael@0 4918 /*
michael@0 4919 * U+0027 APOSTROPHE (') Parse Error.
michael@0 4920 */
michael@0 4921 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
michael@0 4922 /*
michael@0 4923 * Set the DOCTYPE token's public identifier to
michael@0 4924 * the empty string (not missing),
michael@0 4925 */
michael@0 4926 clearLongStrBuf();
michael@0 4927 /*
michael@0 4928 * then switch to the DOCTYPE public identifier
michael@0 4929 * (single-quoted) state.
michael@0 4930 */
michael@0 4931 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
michael@0 4932 continue stateloop;
michael@0 4933 case '>':
michael@0 4934 /* U+003E GREATER-THAN SIGN (>) Parse error. */
michael@0 4935 errExpectedPublicId();
michael@0 4936 /*
michael@0 4937 * Set the DOCTYPE token's force-quirks flag to
michael@0 4938 * on.
michael@0 4939 */
michael@0 4940 forceQuirks = true;
michael@0 4941 /*
michael@0 4942 * Emit that DOCTYPE token.
michael@0 4943 */
michael@0 4944 emitDoctypeToken(pos);
michael@0 4945 /*
michael@0 4946 * Switch to the data state.
michael@0 4947 */
michael@0 4948 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 4949 continue stateloop;
michael@0 4950 default:
michael@0 4951 bogusDoctype();
michael@0 4952 /*
michael@0 4953 * Set the DOCTYPE token's force-quirks flag to
michael@0 4954 * on.
michael@0 4955 */
michael@0 4956 // done by bogusDoctype();
michael@0 4957 /*
michael@0 4958 * Switch to the bogus DOCTYPE state.
michael@0 4959 */
michael@0 4960 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0 4961 continue stateloop;
michael@0 4962 }
michael@0 4963 }
michael@0 4964 // FALLTHRU DON'T REORDER
michael@0 4965 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
michael@0 4966 beforedoctypepublicidentifierloop: for (;;) {
michael@0 4967 if (++pos == endPos) {
michael@0 4968 break stateloop;
michael@0 4969 }
michael@0 4970 c = checkChar(buf, pos);
michael@0 4971 /*
michael@0 4972 * Consume the next input character:
michael@0 4973 */
michael@0 4974 switch (c) {
michael@0 4975 case '\r':
michael@0 4976 silentCarriageReturn();
michael@0 4977 break stateloop;
michael@0 4978 case '\n':
michael@0 4979 silentLineFeed();
michael@0 4980 // fall thru
michael@0 4981 case ' ':
michael@0 4982 case '\t':
michael@0 4983 case '\u000C':
michael@0 4984 /*
michael@0 4985 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 4986 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0 4987 * in the before DOCTYPE public identifier
michael@0 4988 * state.
michael@0 4989 */
michael@0 4990 continue;
michael@0 4991 case '"':
michael@0 4992 /*
michael@0 4993 * U+0022 QUOTATION MARK (") Set the DOCTYPE
michael@0 4994 * token's public identifier to the empty string
michael@0 4995 * (not missing),
michael@0 4996 */
michael@0 4997 clearLongStrBuf();
michael@0 4998 /*
michael@0 4999 * then switch to the DOCTYPE public identifier
michael@0 5000 * (double-quoted) state.
michael@0 5001 */
michael@0 5002 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
michael@0 5003 break beforedoctypepublicidentifierloop;
michael@0 5004 // continue stateloop;
michael@0 5005 case '\'':
michael@0 5006 /*
michael@0 5007 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
michael@0 5008 * public identifier to the empty string (not
michael@0 5009 * missing),
michael@0 5010 */
michael@0 5011 clearLongStrBuf();
michael@0 5012 /*
michael@0 5013 * then switch to the DOCTYPE public identifier
michael@0 5014 * (single-quoted) state.
michael@0 5015 */
michael@0 5016 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
michael@0 5017 continue stateloop;
michael@0 5018 case '>':
michael@0 5019 /* U+003E GREATER-THAN SIGN (>) Parse error. */
michael@0 5020 errExpectedPublicId();
michael@0 5021 /*
michael@0 5022 * Set the DOCTYPE token's force-quirks flag to
michael@0 5023 * on.
michael@0 5024 */
michael@0 5025 forceQuirks = true;
michael@0 5026 /*
michael@0 5027 * Emit that DOCTYPE token.
michael@0 5028 */
michael@0 5029 emitDoctypeToken(pos);
michael@0 5030 /*
michael@0 5031 * Switch to the data state.
michael@0 5032 */
michael@0 5033 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 5034 continue stateloop;
michael@0 5035 default:
michael@0 5036 bogusDoctype();
michael@0 5037 /*
michael@0 5038 * Set the DOCTYPE token's force-quirks flag to
michael@0 5039 * on.
michael@0 5040 */
michael@0 5041 // done by bogusDoctype();
michael@0 5042 /*
michael@0 5043 * Switch to the bogus DOCTYPE state.
michael@0 5044 */
michael@0 5045 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0 5046 continue stateloop;
michael@0 5047 }
michael@0 5048 }
michael@0 5049 // FALLTHRU DON'T REORDER
michael@0 5050 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
michael@0 5051 doctypepublicidentifierdoublequotedloop: for (;;) {
michael@0 5052 if (++pos == endPos) {
michael@0 5053 break stateloop;
michael@0 5054 }
michael@0 5055 c = checkChar(buf, pos);
michael@0 5056 /*
michael@0 5057 * Consume the next input character:
michael@0 5058 */
michael@0 5059 switch (c) {
michael@0 5060 case '"':
michael@0 5061 /*
michael@0 5062 * U+0022 QUOTATION MARK (") Switch to the after
michael@0 5063 * DOCTYPE public identifier state.
michael@0 5064 */
michael@0 5065 publicIdentifier = longStrBufToString();
michael@0 5066 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
michael@0 5067 break doctypepublicidentifierdoublequotedloop;
michael@0 5068 // continue stateloop;
michael@0 5069 case '>':
michael@0 5070 /*
michael@0 5071 * U+003E GREATER-THAN SIGN (>) Parse error.
michael@0 5072 */
michael@0 5073 errGtInPublicId();
michael@0 5074 /*
michael@0 5075 * Set the DOCTYPE token's force-quirks flag to
michael@0 5076 * on.
michael@0 5077 */
michael@0 5078 forceQuirks = true;
michael@0 5079 /*
michael@0 5080 * Emit that DOCTYPE token.
michael@0 5081 */
michael@0 5082 publicIdentifier = longStrBufToString();
michael@0 5083 emitDoctypeToken(pos);
michael@0 5084 /*
michael@0 5085 * Switch to the data state.
michael@0 5086 */
michael@0 5087 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 5088 continue stateloop;
michael@0 5089 case '\r':
michael@0 5090 appendLongStrBufCarriageReturn();
michael@0 5091 break stateloop;
michael@0 5092 case '\n':
michael@0 5093 appendLongStrBufLineFeed();
michael@0 5094 continue;
michael@0 5095 case '\u0000':
michael@0 5096 c = '\uFFFD';
michael@0 5097 // fall thru
michael@0 5098 default:
michael@0 5099 /*
michael@0 5100 * Anything else Append the current input
michael@0 5101 * character to the current DOCTYPE token's
michael@0 5102 * public identifier.
michael@0 5103 */
michael@0 5104 appendLongStrBuf(c);
michael@0 5105 /*
michael@0 5106 * Stay in the DOCTYPE public identifier
michael@0 5107 * (double-quoted) state.
michael@0 5108 */
michael@0 5109 continue;
michael@0 5110 }
michael@0 5111 }
michael@0 5112 // FALLTHRU DON'T REORDER
michael@0 5113 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
michael@0 5114 afterdoctypepublicidentifierloop: for (;;) {
michael@0 5115 if (++pos == endPos) {
michael@0 5116 break stateloop;
michael@0 5117 }
michael@0 5118 c = checkChar(buf, pos);
michael@0 5119 /*
michael@0 5120 * Consume the next input character:
michael@0 5121 */
michael@0 5122 switch (c) {
michael@0 5123 case '\r':
michael@0 5124 silentCarriageReturn();
michael@0 5125 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
michael@0 5126 break stateloop;
michael@0 5127 case '\n':
michael@0 5128 silentLineFeed();
michael@0 5129 // fall thru
michael@0 5130 case ' ':
michael@0 5131 case '\t':
michael@0 5132 case '\u000C':
michael@0 5133 /*
michael@0 5134 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 5135 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0 5136 * Switch to the between DOCTYPE public and
michael@0 5137 * system identifiers state.
michael@0 5138 */
michael@0 5139 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
michael@0 5140 break afterdoctypepublicidentifierloop;
michael@0 5141 // continue stateloop;
michael@0 5142 case '>':
michael@0 5143 /*
michael@0 5144 * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0 5145 * DOCTYPE token.
michael@0 5146 */
michael@0 5147 emitDoctypeToken(pos);
michael@0 5148 /*
michael@0 5149 * Switch to the data state.
michael@0 5150 */
michael@0 5151 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 5152 continue stateloop;
michael@0 5153 case '"':
michael@0 5154 /*
michael@0 5155 * U+0022 QUOTATION MARK (") Parse error.
michael@0 5156 */
michael@0 5157 errNoSpaceBetweenPublicAndSystemIds();
michael@0 5158 /*
michael@0 5159 * Set the DOCTYPE token's system identifier to
michael@0 5160 * the empty string (not missing),
michael@0 5161 */
michael@0 5162 clearLongStrBuf();
michael@0 5163 /*
michael@0 5164 * then switch to the DOCTYPE system identifier
michael@0 5165 * (double-quoted) state.
michael@0 5166 */
michael@0 5167 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
michael@0 5168 continue stateloop;
michael@0 5169 case '\'':
michael@0 5170 /*
michael@0 5171 * U+0027 APOSTROPHE (') Parse error.
michael@0 5172 */
michael@0 5173 errNoSpaceBetweenPublicAndSystemIds();
michael@0 5174 /*
michael@0 5175 * Set the DOCTYPE token's system identifier to
michael@0 5176 * the empty string (not missing),
michael@0 5177 */
michael@0 5178 clearLongStrBuf();
michael@0 5179 /*
michael@0 5180 * then switch to the DOCTYPE system identifier
michael@0 5181 * (single-quoted) state.
michael@0 5182 */
michael@0 5183 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
michael@0 5184 continue stateloop;
michael@0 5185 default:
michael@0 5186 bogusDoctype();
michael@0 5187 /*
michael@0 5188 * Set the DOCTYPE token's force-quirks flag to
michael@0 5189 * on.
michael@0 5190 */
michael@0 5191 // done by bogusDoctype();
michael@0 5192 /*
michael@0 5193 * Switch to the bogus DOCTYPE state.
michael@0 5194 */
michael@0 5195 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0 5196 continue stateloop;
michael@0 5197 }
michael@0 5198 }
michael@0 5199 // FALLTHRU DON'T REORDER
michael@0 5200 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
michael@0 5201 betweendoctypepublicandsystemidentifiersloop: for (;;) {
michael@0 5202 if (++pos == endPos) {
michael@0 5203 break stateloop;
michael@0 5204 }
michael@0 5205 c = checkChar(buf, pos);
michael@0 5206 /*
michael@0 5207 * Consume the next input character:
michael@0 5208 */
michael@0 5209 switch (c) {
michael@0 5210 case '\r':
michael@0 5211 silentCarriageReturn();
michael@0 5212 break stateloop;
michael@0 5213 case '\n':
michael@0 5214 silentLineFeed();
michael@0 5215 // fall thru
michael@0 5216 case ' ':
michael@0 5217 case '\t':
michael@0 5218 case '\u000C':
michael@0 5219 /*
michael@0 5220 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 5221 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0 5222 * in the between DOCTYPE public and system
michael@0 5223 * identifiers state.
michael@0 5224 */
michael@0 5225 continue;
michael@0 5226 case '>':
michael@0 5227 /*
michael@0 5228 * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0 5229 * DOCTYPE token.
michael@0 5230 */
michael@0 5231 emitDoctypeToken(pos);
michael@0 5232 /*
michael@0 5233 * Switch to the data state.
michael@0 5234 */
michael@0 5235 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 5236 continue stateloop;
michael@0 5237 case '"':
michael@0 5238 /*
michael@0 5239 * U+0022 QUOTATION MARK (") Set the DOCTYPE
michael@0 5240 * token's system identifier to the empty string
michael@0 5241 * (not missing),
michael@0 5242 */
michael@0 5243 clearLongStrBuf();
michael@0 5244 /*
michael@0 5245 * then switch to the DOCTYPE system identifier
michael@0 5246 * (double-quoted) state.
michael@0 5247 */
michael@0 5248 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
michael@0 5249 break betweendoctypepublicandsystemidentifiersloop;
michael@0 5250 // continue stateloop;
michael@0 5251 case '\'':
michael@0 5252 /*
michael@0 5253 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
michael@0 5254 * system identifier to the empty string (not
michael@0 5255 * missing),
michael@0 5256 */
michael@0 5257 clearLongStrBuf();
michael@0 5258 /*
michael@0 5259 * then switch to the DOCTYPE system identifier
michael@0 5260 * (single-quoted) state.
michael@0 5261 */
michael@0 5262 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
michael@0 5263 continue stateloop;
michael@0 5264 default:
michael@0 5265 bogusDoctype();
michael@0 5266 /*
michael@0 5267 * Set the DOCTYPE token's force-quirks flag to
michael@0 5268 * on.
michael@0 5269 */
michael@0 5270 // done by bogusDoctype();
michael@0 5271 /*
michael@0 5272 * Switch to the bogus DOCTYPE state.
michael@0 5273 */
michael@0 5274 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0 5275 continue stateloop;
michael@0 5276 }
michael@0 5277 }
michael@0 5278 // FALLTHRU DON'T REORDER
michael@0 5279 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
michael@0 5280 doctypesystemidentifierdoublequotedloop: for (;;) {
michael@0 5281 if (++pos == endPos) {
michael@0 5282 break stateloop;
michael@0 5283 }
michael@0 5284 c = checkChar(buf, pos);
michael@0 5285 /*
michael@0 5286 * Consume the next input character:
michael@0 5287 */
michael@0 5288 switch (c) {
michael@0 5289 case '"':
michael@0 5290 /*
michael@0 5291 * U+0022 QUOTATION MARK (") Switch to the after
michael@0 5292 * DOCTYPE system identifier state.
michael@0 5293 */
michael@0 5294 systemIdentifier = longStrBufToString();
michael@0 5295 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
michael@0 5296 continue stateloop;
michael@0 5297 case '>':
michael@0 5298 /*
michael@0 5299 * U+003E GREATER-THAN SIGN (>) Parse error.
michael@0 5300 */
michael@0 5301 errGtInSystemId();
michael@0 5302 /*
michael@0 5303 * Set the DOCTYPE token's force-quirks flag to
michael@0 5304 * on.
michael@0 5305 */
michael@0 5306 forceQuirks = true;
michael@0 5307 /*
michael@0 5308 * Emit that DOCTYPE token.
michael@0 5309 */
michael@0 5310 systemIdentifier = longStrBufToString();
michael@0 5311 emitDoctypeToken(pos);
michael@0 5312 /*
michael@0 5313 * Switch to the data state.
michael@0 5314 */
michael@0 5315 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 5316 continue stateloop;
michael@0 5317 case '\r':
michael@0 5318 appendLongStrBufCarriageReturn();
michael@0 5319 break stateloop;
michael@0 5320 case '\n':
michael@0 5321 appendLongStrBufLineFeed();
michael@0 5322 continue;
michael@0 5323 case '\u0000':
michael@0 5324 c = '\uFFFD';
michael@0 5325 // fall thru
michael@0 5326 default:
michael@0 5327 /*
michael@0 5328 * Anything else Append the current input
michael@0 5329 * character to the current DOCTYPE token's
michael@0 5330 * system identifier.
michael@0 5331 */
michael@0 5332 appendLongStrBuf(c);
michael@0 5333 /*
michael@0 5334 * Stay in the DOCTYPE system identifier
michael@0 5335 * (double-quoted) state.
michael@0 5336 */
michael@0 5337 continue;
michael@0 5338 }
michael@0 5339 }
michael@0 5340 // FALLTHRU DON'T REORDER
michael@0 5341 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
michael@0 5342 afterdoctypesystemidentifierloop: for (;;) {
michael@0 5343 if (++pos == endPos) {
michael@0 5344 break stateloop;
michael@0 5345 }
michael@0 5346 c = checkChar(buf, pos);
michael@0 5347 /*
michael@0 5348 * Consume the next input character:
michael@0 5349 */
michael@0 5350 switch (c) {
michael@0 5351 case '\r':
michael@0 5352 silentCarriageReturn();
michael@0 5353 break stateloop;
michael@0 5354 case '\n':
michael@0 5355 silentLineFeed();
michael@0 5356 // fall thru
michael@0 5357 case ' ':
michael@0 5358 case '\t':
michael@0 5359 case '\u000C':
michael@0 5360 /*
michael@0 5361 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 5362 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0 5363 * in the after DOCTYPE system identifier state.
michael@0 5364 */
michael@0 5365 continue;
michael@0 5366 case '>':
michael@0 5367 /*
michael@0 5368 * U+003E GREATER-THAN SIGN (>) Emit the current
michael@0 5369 * DOCTYPE token.
michael@0 5370 */
michael@0 5371 emitDoctypeToken(pos);
michael@0 5372 /*
michael@0 5373 * Switch to the data state.
michael@0 5374 */
michael@0 5375 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 5376 continue stateloop;
michael@0 5377 default:
michael@0 5378 /*
michael@0 5379 * Switch to the bogus DOCTYPE state. (This does
michael@0 5380 * not set the DOCTYPE token's force-quirks flag
michael@0 5381 * to on.)
michael@0 5382 */
michael@0 5383 bogusDoctypeWithoutQuirks();
michael@0 5384 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0 5385 break afterdoctypesystemidentifierloop;
michael@0 5386 // continue stateloop;
michael@0 5387 }
michael@0 5388 }
michael@0 5389 // FALLTHRU DON'T REORDER
michael@0 5390 case BOGUS_DOCTYPE:
michael@0 5391 for (;;) {
michael@0 5392 if (reconsume) {
michael@0 5393 reconsume = false;
michael@0 5394 } else {
michael@0 5395 if (++pos == endPos) {
michael@0 5396 break stateloop;
michael@0 5397 }
michael@0 5398 c = checkChar(buf, pos);
michael@0 5399 }
michael@0 5400 /*
michael@0 5401 * Consume the next input character:
michael@0 5402 */
michael@0 5403 switch (c) {
michael@0 5404 case '>':
michael@0 5405 /*
michael@0 5406 * U+003E GREATER-THAN SIGN (>) Emit that
michael@0 5407 * DOCTYPE token.
michael@0 5408 */
michael@0 5409 emitDoctypeToken(pos);
michael@0 5410 /*
michael@0 5411 * Switch to the data state.
michael@0 5412 */
michael@0 5413 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 5414 continue stateloop;
michael@0 5415 case '\r':
michael@0 5416 silentCarriageReturn();
michael@0 5417 break stateloop;
michael@0 5418 case '\n':
michael@0 5419 silentLineFeed();
michael@0 5420 // fall thru
michael@0 5421 default:
michael@0 5422 /*
michael@0 5423 * Anything else Stay in the bogus DOCTYPE
michael@0 5424 * state.
michael@0 5425 */
michael@0 5426 continue;
michael@0 5427 }
michael@0 5428 }
michael@0 5429 // XXX reorder point
michael@0 5430 case DOCTYPE_YSTEM:
michael@0 5431 doctypeystemloop: for (;;) {
michael@0 5432 if (++pos == endPos) {
michael@0 5433 break stateloop;
michael@0 5434 }
michael@0 5435 c = checkChar(buf, pos);
michael@0 5436 /*
michael@0 5437 * Otherwise, if the six characters starting from the
michael@0 5438 * current input character are an ASCII case-insensitive
michael@0 5439 * match for the word "SYSTEM", then consume those
michael@0 5440 * characters and switch to the before DOCTYPE system
michael@0 5441 * identifier state.
michael@0 5442 */
michael@0 5443 if (index < 5) { // YSTEM.length
michael@0 5444 char folded = c;
michael@0 5445 if (c >= 'A' && c <= 'Z') {
michael@0 5446 folded += 0x20;
michael@0 5447 }
michael@0 5448 if (folded != Tokenizer.YSTEM[index]) {
michael@0 5449 bogusDoctype();
michael@0 5450 reconsume = true;
michael@0 5451 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0 5452 continue stateloop;
michael@0 5453 }
michael@0 5454 index++;
michael@0 5455 continue stateloop;
michael@0 5456 } else {
michael@0 5457 reconsume = true;
michael@0 5458 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);
michael@0 5459 break doctypeystemloop;
michael@0 5460 // continue stateloop;
michael@0 5461 }
michael@0 5462 }
michael@0 5463 // FALLTHRU DON'T REORDER
michael@0 5464 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
michael@0 5465 afterdoctypesystemkeywordloop: for (;;) {
michael@0 5466 if (reconsume) {
michael@0 5467 reconsume = false;
michael@0 5468 } else {
michael@0 5469 if (++pos == endPos) {
michael@0 5470 break stateloop;
michael@0 5471 }
michael@0 5472 c = checkChar(buf, pos);
michael@0 5473 }
michael@0 5474 /*
michael@0 5475 * Consume the next input character:
michael@0 5476 */
michael@0 5477 switch (c) {
michael@0 5478 case '\r':
michael@0 5479 silentCarriageReturn();
michael@0 5480 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
michael@0 5481 break stateloop;
michael@0 5482 case '\n':
michael@0 5483 silentLineFeed();
michael@0 5484 // fall thru
michael@0 5485 case ' ':
michael@0 5486 case '\t':
michael@0 5487 case '\u000C':
michael@0 5488 /*
michael@0 5489 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 5490 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
michael@0 5491 * Switch to the before DOCTYPE system
michael@0 5492 * identifier state.
michael@0 5493 */
michael@0 5494 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
michael@0 5495 break afterdoctypesystemkeywordloop;
michael@0 5496 // FALL THROUGH continue stateloop
michael@0 5497 case '"':
michael@0 5498 /*
michael@0 5499 * U+0022 QUOTATION MARK (") Parse Error.
michael@0 5500 */
michael@0 5501 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
michael@0 5502 /*
michael@0 5503 * Set the DOCTYPE token's system identifier to
michael@0 5504 * the empty string (not missing),
michael@0 5505 */
michael@0 5506 clearLongStrBuf();
michael@0 5507 /*
michael@0 5508 * then switch to the DOCTYPE system identifier
michael@0 5509 * (double-quoted) state.
michael@0 5510 */
michael@0 5511 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
michael@0 5512 continue stateloop;
michael@0 5513 case '\'':
michael@0 5514 /*
michael@0 5515 * U+0027 APOSTROPHE (') Parse Error.
michael@0 5516 */
michael@0 5517 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
michael@0 5518 /*
michael@0 5519 * Set the DOCTYPE token's system identifier to
michael@0 5520 * the empty string (not missing),
michael@0 5521 */
michael@0 5522 clearLongStrBuf();
michael@0 5523 /*
michael@0 5524 * then switch to the DOCTYPE system identifier
michael@0 5525 * (single-quoted) state.
michael@0 5526 */
michael@0 5527 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
michael@0 5528 continue stateloop;
michael@0 5529 case '>':
michael@0 5530 /* U+003E GREATER-THAN SIGN (>) Parse error. */
michael@0 5531 errExpectedPublicId();
michael@0 5532 /*
michael@0 5533 * Set the DOCTYPE token's force-quirks flag to
michael@0 5534 * on.
michael@0 5535 */
michael@0 5536 forceQuirks = true;
michael@0 5537 /*
michael@0 5538 * Emit that DOCTYPE token.
michael@0 5539 */
michael@0 5540 emitDoctypeToken(pos);
michael@0 5541 /*
michael@0 5542 * Switch to the data state.
michael@0 5543 */
michael@0 5544 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 5545 continue stateloop;
michael@0 5546 default:
michael@0 5547 bogusDoctype();
michael@0 5548 /*
michael@0 5549 * Set the DOCTYPE token's force-quirks flag to
michael@0 5550 * on.
michael@0 5551 */
michael@0 5552 // done by bogusDoctype();
michael@0 5553 /*
michael@0 5554 * Switch to the bogus DOCTYPE state.
michael@0 5555 */
michael@0 5556 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0 5557 continue stateloop;
michael@0 5558 }
michael@0 5559 }
michael@0 5560 // FALLTHRU DON'T REORDER
michael@0 5561 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
michael@0 5562 beforedoctypesystemidentifierloop: for (;;) {
michael@0 5563 if (++pos == endPos) {
michael@0 5564 break stateloop;
michael@0 5565 }
michael@0 5566 c = checkChar(buf, pos);
michael@0 5567 /*
michael@0 5568 * Consume the next input character:
michael@0 5569 */
michael@0 5570 switch (c) {
michael@0 5571 case '\r':
michael@0 5572 silentCarriageReturn();
michael@0 5573 break stateloop;
michael@0 5574 case '\n':
michael@0 5575 silentLineFeed();
michael@0 5576 // fall thru
michael@0 5577 case ' ':
michael@0 5578 case '\t':
michael@0 5579 case '\u000C':
michael@0 5580 /*
michael@0 5581 * U+0009 CHARACTER TABULATION U+000A LINE FEED
michael@0 5582 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
michael@0 5583 * in the before DOCTYPE system identifier
michael@0 5584 * state.
michael@0 5585 */
michael@0 5586 continue;
michael@0 5587 case '"':
michael@0 5588 /*
michael@0 5589 * U+0022 QUOTATION MARK (") Set the DOCTYPE
michael@0 5590 * token's system identifier to the empty string
michael@0 5591 * (not missing),
michael@0 5592 */
michael@0 5593 clearLongStrBuf();
michael@0 5594 /*
michael@0 5595 * then switch to the DOCTYPE system identifier
michael@0 5596 * (double-quoted) state.
michael@0 5597 */
michael@0 5598 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
michael@0 5599 continue stateloop;
michael@0 5600 case '\'':
michael@0 5601 /*
michael@0 5602 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
michael@0 5603 * system identifier to the empty string (not
michael@0 5604 * missing),
michael@0 5605 */
michael@0 5606 clearLongStrBuf();
michael@0 5607 /*
michael@0 5608 * then switch to the DOCTYPE system identifier
michael@0 5609 * (single-quoted) state.
michael@0 5610 */
michael@0 5611 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
michael@0 5612 break beforedoctypesystemidentifierloop;
michael@0 5613 // continue stateloop;
michael@0 5614 case '>':
michael@0 5615 /* U+003E GREATER-THAN SIGN (>) Parse error. */
michael@0 5616 errExpectedSystemId();
michael@0 5617 /*
michael@0 5618 * Set the DOCTYPE token's force-quirks flag to
michael@0 5619 * on.
michael@0 5620 */
michael@0 5621 forceQuirks = true;
michael@0 5622 /*
michael@0 5623 * Emit that DOCTYPE token.
michael@0 5624 */
michael@0 5625 emitDoctypeToken(pos);
michael@0 5626 /*
michael@0 5627 * Switch to the data state.
michael@0 5628 */
michael@0 5629 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 5630 continue stateloop;
michael@0 5631 default:
michael@0 5632 bogusDoctype();
michael@0 5633 /*
michael@0 5634 * Set the DOCTYPE token's force-quirks flag to
michael@0 5635 * on.
michael@0 5636 */
michael@0 5637 // done by bogusDoctype();
michael@0 5638 /*
michael@0 5639 * Switch to the bogus DOCTYPE state.
michael@0 5640 */
michael@0 5641 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
michael@0 5642 continue stateloop;
michael@0 5643 }
michael@0 5644 }
michael@0 5645 // FALLTHRU DON'T REORDER
michael@0 5646 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
michael@0 5647 for (;;) {
michael@0 5648 if (++pos == endPos) {
michael@0 5649 break stateloop;
michael@0 5650 }
michael@0 5651 c = checkChar(buf, pos);
michael@0 5652 /*
michael@0 5653 * Consume the next input character:
michael@0 5654 */
michael@0 5655 switch (c) {
michael@0 5656 case '\'':
michael@0 5657 /*
michael@0 5658 * U+0027 APOSTROPHE (') Switch to the after
michael@0 5659 * DOCTYPE system identifier state.
michael@0 5660 */
michael@0 5661 systemIdentifier = longStrBufToString();
michael@0 5662 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
michael@0 5663 continue stateloop;
michael@0 5664 case '>':
michael@0 5665 errGtInSystemId();
michael@0 5666 /*
michael@0 5667 * Set the DOCTYPE token's force-quirks flag to
michael@0 5668 * on.
michael@0 5669 */
michael@0 5670 forceQuirks = true;
michael@0 5671 /*
michael@0 5672 * Emit that DOCTYPE token.
michael@0 5673 */
michael@0 5674 systemIdentifier = longStrBufToString();
michael@0 5675 emitDoctypeToken(pos);
michael@0 5676 /*
michael@0 5677 * Switch to the data state.
michael@0 5678 */
michael@0 5679 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 5680 continue stateloop;
michael@0 5681 case '\r':
michael@0 5682 appendLongStrBufCarriageReturn();
michael@0 5683 break stateloop;
michael@0 5684 case '\n':
michael@0 5685 appendLongStrBufLineFeed();
michael@0 5686 continue;
michael@0 5687 case '\u0000':
michael@0 5688 c = '\uFFFD';
michael@0 5689 // fall thru
michael@0 5690 default:
michael@0 5691 /*
michael@0 5692 * Anything else Append the current input
michael@0 5693 * character to the current DOCTYPE token's
michael@0 5694 * system identifier.
michael@0 5695 */
michael@0 5696 appendLongStrBuf(c);
michael@0 5697 /*
michael@0 5698 * Stay in the DOCTYPE system identifier
michael@0 5699 * (single-quoted) state.
michael@0 5700 */
michael@0 5701 continue;
michael@0 5702 }
michael@0 5703 }
michael@0 5704 // XXX reorder point
michael@0 5705 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
michael@0 5706 for (;;) {
michael@0 5707 if (++pos == endPos) {
michael@0 5708 break stateloop;
michael@0 5709 }
michael@0 5710 c = checkChar(buf, pos);
michael@0 5711 /*
michael@0 5712 * Consume the next input character:
michael@0 5713 */
michael@0 5714 switch (c) {
michael@0 5715 case '\'':
michael@0 5716 /*
michael@0 5717 * U+0027 APOSTROPHE (') Switch to the after
michael@0 5718 * DOCTYPE public identifier state.
michael@0 5719 */
michael@0 5720 publicIdentifier = longStrBufToString();
michael@0 5721 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
michael@0 5722 continue stateloop;
michael@0 5723 case '>':
michael@0 5724 errGtInPublicId();
michael@0 5725 /*
michael@0 5726 * Set the DOCTYPE token's force-quirks flag to
michael@0 5727 * on.
michael@0 5728 */
michael@0 5729 forceQuirks = true;
michael@0 5730 /*
michael@0 5731 * Emit that DOCTYPE token.
michael@0 5732 */
michael@0 5733 publicIdentifier = longStrBufToString();
michael@0 5734 emitDoctypeToken(pos);
michael@0 5735 /*
michael@0 5736 * Switch to the data state.
michael@0 5737 */
michael@0 5738 state = transition(state, Tokenizer.DATA, reconsume, pos);
michael@0 5739 continue stateloop;
michael@0 5740 case '\r':
michael@0 5741 appendLongStrBufCarriageReturn();
michael@0 5742 break stateloop;
michael@0 5743 case '\n':
michael@0 5744 appendLongStrBufLineFeed();
michael@0 5745 continue;
michael@0 5746 case '\u0000':
michael@0 5747 c = '\uFFFD';
michael@0 5748 // fall thru
michael@0 5749 default:
michael@0 5750 /*
michael@0 5751 * Anything else Append the current input
michael@0 5752 * character to the current DOCTYPE token's
michael@0 5753 * public identifier.
michael@0 5754 */
michael@0 5755 appendLongStrBuf(c);
michael@0 5756 /*
michael@0 5757 * Stay in the DOCTYPE public identifier
michael@0 5758 * (single-quoted) state.
michael@0 5759 */
michael@0 5760 continue;
michael@0 5761 }
michael@0 5762 }
michael@0 5763 // XXX reorder point
michael@0 5764 case PROCESSING_INSTRUCTION:
michael@0 5765 processinginstructionloop: for (;;) {
michael@0 5766 if (++pos == endPos) {
michael@0 5767 break stateloop;
michael@0 5768 }
michael@0 5769 c = checkChar(buf, pos);
michael@0 5770 switch (c) {
michael@0 5771 case '?':
michael@0 5772 state = transition(
michael@0 5773 state,
michael@0 5774 Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,
michael@0 5775 reconsume, pos);
michael@0 5776 break processinginstructionloop;
michael@0 5777 // continue stateloop;
michael@0 5778 default:
michael@0 5779 continue;
michael@0 5780 }
michael@0 5781 }
michael@0 5782 case PROCESSING_INSTRUCTION_QUESTION_MARK:
michael@0 5783 if (++pos == endPos) {
michael@0 5784 break stateloop;
michael@0 5785 }
michael@0 5786 c = checkChar(buf, pos);
michael@0 5787 switch (c) {
michael@0 5788 case '>':
michael@0 5789 state = transition(state, Tokenizer.DATA,
michael@0 5790 reconsume, pos);
michael@0 5791 continue stateloop;
michael@0 5792 default:
michael@0 5793 state = transition(state,
michael@0 5794 Tokenizer.PROCESSING_INSTRUCTION,
michael@0 5795 reconsume, pos);
michael@0 5796 continue stateloop;
michael@0 5797 }
michael@0 5798 // END HOTSPOT WORKAROUND
michael@0 5799 }
michael@0 5800 }
michael@0 5801 flushChars(buf, pos);
michael@0 5802 /*
michael@0 5803 * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
michael@0 5804 */
michael@0 5805 // Save locals
michael@0 5806 stateSave = state;
michael@0 5807 returnStateSave = returnState;
michael@0 5808 return pos;
michael@0 5809 }
michael@0 5810
michael@0 5811 // HOTSPOT WORKAROUND INSERTION POINT
michael@0 5812
michael@0 5813 // [NOCPP[
michael@0 5814
michael@0 5815 protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
michael@0 5816 return to;
michael@0 5817 }
michael@0 5818
michael@0 5819 // ]NOCPP]
michael@0 5820
michael@0 5821 private void initDoctypeFields() {
michael@0 5822 doctypeName = "";
michael@0 5823 if (systemIdentifier != null) {
michael@0 5824 Portability.releaseString(systemIdentifier);
michael@0 5825 systemIdentifier = null;
michael@0 5826 }
michael@0 5827 if (publicIdentifier != null) {
michael@0 5828 Portability.releaseString(publicIdentifier);
michael@0 5829 publicIdentifier = null;
michael@0 5830 }
michael@0 5831 forceQuirks = false;
michael@0 5832 }
michael@0 5833
michael@0 5834 @Inline private void adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn()
michael@0 5835 throws SAXException {
michael@0 5836 silentCarriageReturn();
michael@0 5837 adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
michael@0 5838 }
michael@0 5839
michael@0 5840 @Inline private void adjustDoubleHyphenAndAppendToLongStrBufLineFeed()
michael@0 5841 throws SAXException {
michael@0 5842 silentLineFeed();
michael@0 5843 adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
michael@0 5844 }
michael@0 5845
michael@0 5846 @Inline private void appendLongStrBufLineFeed() {
michael@0 5847 silentLineFeed();
michael@0 5848 appendLongStrBuf('\n');
michael@0 5849 }
michael@0 5850
michael@0 5851 @Inline private void appendLongStrBufCarriageReturn() {
michael@0 5852 silentCarriageReturn();
michael@0 5853 appendLongStrBuf('\n');
michael@0 5854 }
michael@0 5855
michael@0 5856 @Inline protected void silentCarriageReturn() {
michael@0 5857 ++line;
michael@0 5858 lastCR = true;
michael@0 5859 }
michael@0 5860
michael@0 5861 @Inline protected void silentLineFeed() {
michael@0 5862 ++line;
michael@0 5863 }
michael@0 5864
michael@0 5865 private void emitCarriageReturn(@NoLength char[] buf, int pos)
michael@0 5866 throws SAXException {
michael@0 5867 silentCarriageReturn();
michael@0 5868 flushChars(buf, pos);
michael@0 5869 tokenHandler.characters(Tokenizer.LF, 0, 1);
michael@0 5870 cstart = Integer.MAX_VALUE;
michael@0 5871 }
michael@0 5872
michael@0 5873 private void emitReplacementCharacter(@NoLength char[] buf, int pos)
michael@0 5874 throws SAXException {
michael@0 5875 flushChars(buf, pos);
michael@0 5876 tokenHandler.zeroOriginatingReplacementCharacter();
michael@0 5877 cstart = pos + 1;
michael@0 5878 }
michael@0 5879
michael@0 5880 private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
michael@0 5881 throws SAXException {
michael@0 5882 flushChars(buf, pos);
michael@0 5883 tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
michael@0 5884 cstart = pos + 1;
michael@0 5885 }
michael@0 5886
michael@0 5887 private void setAdditionalAndRememberAmpersandLocation(char add) {
michael@0 5888 additional = add;
michael@0 5889 // [NOCPP[
michael@0 5890 ampersandLocation = new LocatorImpl(this);
michael@0 5891 // ]NOCPP]
michael@0 5892 }
michael@0 5893
michael@0 5894 private void bogusDoctype() throws SAXException {
michael@0 5895 errBogusDoctype();
michael@0 5896 forceQuirks = true;
michael@0 5897 }
michael@0 5898
michael@0 5899 private void bogusDoctypeWithoutQuirks() throws SAXException {
michael@0 5900 errBogusDoctype();
michael@0 5901 forceQuirks = false;
michael@0 5902 }
michael@0 5903
michael@0 5904 private void emitOrAppendStrBuf(int returnState) throws SAXException {
michael@0 5905 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0 5906 appendStrBufToLongStrBuf();
michael@0 5907 } else {
michael@0 5908 emitStrBuf();
michael@0 5909 }
michael@0 5910 }
michael@0 5911
michael@0 5912 private void handleNcrValue(int returnState) throws SAXException {
michael@0 5913 /*
michael@0 5914 * If one or more characters match the range, then take them all and
michael@0 5915 * interpret the string of characters as a number (either hexadecimal or
michael@0 5916 * decimal as appropriate).
michael@0 5917 */
michael@0 5918 if (value <= 0xFFFF) {
michael@0 5919 if (value >= 0x80 && value <= 0x9f) {
michael@0 5920 /*
michael@0 5921 * If that number is one of the numbers in the first column of
michael@0 5922 * the following table, then this is a parse error.
michael@0 5923 */
michael@0 5924 errNcrInC1Range();
michael@0 5925 /*
michael@0 5926 * Find the row with that number in the first column, and return
michael@0 5927 * a character token for the Unicode character given in the
michael@0 5928 * second column of that row.
michael@0 5929 */
michael@0 5930 @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
michael@0 5931 emitOrAppendOne(val, returnState);
michael@0 5932 // [NOCPP[
michael@0 5933 } else if (value == 0xC
michael@0 5934 && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
michael@0 5935 if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
michael@0 5936 emitOrAppendOne(Tokenizer.SPACE, returnState);
michael@0 5937 } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
michael@0 5938 fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
michael@0 5939 }
michael@0 5940 // ]NOCPP]
michael@0 5941 } else if (value == 0x0) {
michael@0 5942 errNcrZero();
michael@0 5943 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
michael@0 5944 } else if ((value & 0xF800) == 0xD800) {
michael@0 5945 errNcrSurrogate();
michael@0 5946 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
michael@0 5947 } else {
michael@0 5948 /*
michael@0 5949 * Otherwise, return a character token for the Unicode character
michael@0 5950 * whose code point is that number.
michael@0 5951 */
michael@0 5952 char ch = (char) value;
michael@0 5953 // [NOCPP[
michael@0 5954 if (value == 0x0D) {
michael@0 5955 errNcrCr();
michael@0 5956 } else if ((value <= 0x0008) || (value == 0x000B)
michael@0 5957 || (value >= 0x000E && value <= 0x001F)) {
michael@0 5958 ch = errNcrControlChar(ch);
michael@0 5959 } else if (value >= 0xFDD0 && value <= 0xFDEF) {
michael@0 5960 errNcrUnassigned();
michael@0 5961 } else if ((value & 0xFFFE) == 0xFFFE) {
michael@0 5962 ch = errNcrNonCharacter(ch);
michael@0 5963 } else if (value >= 0x007F && value <= 0x009F) {
michael@0 5964 errNcrControlChar();
michael@0 5965 } else {
michael@0 5966 maybeWarnPrivateUse(ch);
michael@0 5967 }
michael@0 5968 // ]NOCPP]
michael@0 5969 bmpChar[0] = ch;
michael@0 5970 emitOrAppendOne(bmpChar, returnState);
michael@0 5971 }
michael@0 5972 } else if (value <= 0x10FFFF) {
michael@0 5973 // [NOCPP[
michael@0 5974 maybeWarnPrivateUseAstral();
michael@0 5975 if ((value & 0xFFFE) == 0xFFFE) {
michael@0 5976 errAstralNonCharacter(value);
michael@0 5977 }
michael@0 5978 // ]NOCPP]
michael@0 5979 astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
michael@0 5980 astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
michael@0 5981 emitOrAppendTwo(astralChar, returnState);
michael@0 5982 } else {
michael@0 5983 errNcrOutOfRange();
michael@0 5984 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
michael@0 5985 }
michael@0 5986 }
michael@0 5987
michael@0 5988 public void eof() throws SAXException {
michael@0 5989 int state = stateSave;
michael@0 5990 int returnState = returnStateSave;
michael@0 5991
michael@0 5992 eofloop: for (;;) {
michael@0 5993 switch (state) {
michael@0 5994 case SCRIPT_DATA_LESS_THAN_SIGN:
michael@0 5995 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
michael@0 5996 /*
michael@0 5997 * Otherwise, emit a U+003C LESS-THAN SIGN character token
michael@0 5998 */
michael@0 5999 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0 6000 /*
michael@0 6001 * and reconsume the current input character in the data
michael@0 6002 * state.
michael@0 6003 */
michael@0 6004 break eofloop;
michael@0 6005 case TAG_OPEN:
michael@0 6006 /*
michael@0 6007 * The behavior of this state depends on the content model
michael@0 6008 * flag.
michael@0 6009 */
michael@0 6010 /*
michael@0 6011 * Anything else Parse error.
michael@0 6012 */
michael@0 6013 errEofAfterLt();
michael@0 6014 /*
michael@0 6015 * Emit a U+003C LESS-THAN SIGN character token
michael@0 6016 */
michael@0 6017 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0 6018 /*
michael@0 6019 * and reconsume the current input character in the data
michael@0 6020 * state.
michael@0 6021 */
michael@0 6022 break eofloop;
michael@0 6023 case RAWTEXT_RCDATA_LESS_THAN_SIGN:
michael@0 6024 /*
michael@0 6025 * Emit a U+003C LESS-THAN SIGN character token
michael@0 6026 */
michael@0 6027 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
michael@0 6028 /*
michael@0 6029 * and reconsume the current input character in the RCDATA
michael@0 6030 * state.
michael@0 6031 */
michael@0 6032 break eofloop;
michael@0 6033 case NON_DATA_END_TAG_NAME:
michael@0 6034 /*
michael@0 6035 * Emit a U+003C LESS-THAN SIGN character token, a U+002F
michael@0 6036 * SOLIDUS character token,
michael@0 6037 */
michael@0 6038 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
michael@0 6039 /*
michael@0 6040 * a character token for each of the characters in the
michael@0 6041 * temporary buffer (in the order they were added to the
michael@0 6042 * buffer),
michael@0 6043 */
michael@0 6044 emitStrBuf();
michael@0 6045 /*
michael@0 6046 * and reconsume the current input character in the RCDATA
michael@0 6047 * state.
michael@0 6048 */
michael@0 6049 break eofloop;
michael@0 6050 case CLOSE_TAG_OPEN:
michael@0 6051 /* EOF Parse error. */
michael@0 6052 errEofAfterLt();
michael@0 6053 /*
michael@0 6054 * Emit a U+003C LESS-THAN SIGN character token and a U+002F
michael@0 6055 * SOLIDUS character token.
michael@0 6056 */
michael@0 6057 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
michael@0 6058 /*
michael@0 6059 * Reconsume the EOF character in the data state.
michael@0 6060 */
michael@0 6061 break eofloop;
michael@0 6062 case TAG_NAME:
michael@0 6063 /*
michael@0 6064 * EOF Parse error.
michael@0 6065 */
michael@0 6066 errEofInTagName();
michael@0 6067 /*
michael@0 6068 * Reconsume the EOF character in the data state.
michael@0 6069 */
michael@0 6070 break eofloop;
michael@0 6071 case BEFORE_ATTRIBUTE_NAME:
michael@0 6072 case AFTER_ATTRIBUTE_VALUE_QUOTED:
michael@0 6073 case SELF_CLOSING_START_TAG:
michael@0 6074 /* EOF Parse error. */
michael@0 6075 errEofWithoutGt();
michael@0 6076 /*
michael@0 6077 * Reconsume the EOF character in the data state.
michael@0 6078 */
michael@0 6079 break eofloop;
michael@0 6080 case ATTRIBUTE_NAME:
michael@0 6081 /*
michael@0 6082 * EOF Parse error.
michael@0 6083 */
michael@0 6084 errEofInAttributeName();
michael@0 6085 /*
michael@0 6086 * Reconsume the EOF character in the data state.
michael@0 6087 */
michael@0 6088 break eofloop;
michael@0 6089 case AFTER_ATTRIBUTE_NAME:
michael@0 6090 case BEFORE_ATTRIBUTE_VALUE:
michael@0 6091 /* EOF Parse error. */
michael@0 6092 errEofWithoutGt();
michael@0 6093 /*
michael@0 6094 * Reconsume the EOF character in the data state.
michael@0 6095 */
michael@0 6096 break eofloop;
michael@0 6097 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
michael@0 6098 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
michael@0 6099 case ATTRIBUTE_VALUE_UNQUOTED:
michael@0 6100 /* EOF Parse error. */
michael@0 6101 errEofInAttributeValue();
michael@0 6102 /*
michael@0 6103 * Reconsume the EOF character in the data state.
michael@0 6104 */
michael@0 6105 break eofloop;
michael@0 6106 case BOGUS_COMMENT:
michael@0 6107 emitComment(0, 0);
michael@0 6108 break eofloop;
michael@0 6109 case BOGUS_COMMENT_HYPHEN:
michael@0 6110 // [NOCPP[
michael@0 6111 maybeAppendSpaceToBogusComment();
michael@0 6112 // ]NOCPP]
michael@0 6113 emitComment(0, 0);
michael@0 6114 break eofloop;
michael@0 6115 case MARKUP_DECLARATION_OPEN:
michael@0 6116 errBogusComment();
michael@0 6117 clearLongStrBuf();
michael@0 6118 emitComment(0, 0);
michael@0 6119 break eofloop;
michael@0 6120 case MARKUP_DECLARATION_HYPHEN:
michael@0 6121 errBogusComment();
michael@0 6122 emitComment(0, 0);
michael@0 6123 break eofloop;
michael@0 6124 case MARKUP_DECLARATION_OCTYPE:
michael@0 6125 if (index < 6) {
michael@0 6126 errBogusComment();
michael@0 6127 emitComment(0, 0);
michael@0 6128 } else {
michael@0 6129 /* EOF Parse error. */
michael@0 6130 errEofInDoctype();
michael@0 6131 /*
michael@0 6132 * Create a new DOCTYPE token. Set its force-quirks flag
michael@0 6133 * to on.
michael@0 6134 */
michael@0 6135 doctypeName = "";
michael@0 6136 if (systemIdentifier != null) {
michael@0 6137 Portability.releaseString(systemIdentifier);
michael@0 6138 systemIdentifier = null;
michael@0 6139 }
michael@0 6140 if (publicIdentifier != null) {
michael@0 6141 Portability.releaseString(publicIdentifier);
michael@0 6142 publicIdentifier = null;
michael@0 6143 }
michael@0 6144 forceQuirks = true;
michael@0 6145 /*
michael@0 6146 * Emit the token.
michael@0 6147 */
michael@0 6148 emitDoctypeToken(0);
michael@0 6149 /*
michael@0 6150 * Reconsume the EOF character in the data state.
michael@0 6151 */
michael@0 6152 break eofloop;
michael@0 6153 }
michael@0 6154 break eofloop;
michael@0 6155 case COMMENT_START:
michael@0 6156 case COMMENT:
michael@0 6157 /*
michael@0 6158 * EOF Parse error.
michael@0 6159 */
michael@0 6160 errEofInComment();
michael@0 6161 /* Emit the comment token. */
michael@0 6162 emitComment(0, 0);
michael@0 6163 /*
michael@0 6164 * Reconsume the EOF character in the data state.
michael@0 6165 */
michael@0 6166 break eofloop;
michael@0 6167 case COMMENT_END:
michael@0 6168 errEofInComment();
michael@0 6169 /* Emit the comment token. */
michael@0 6170 emitComment(2, 0);
michael@0 6171 /*
michael@0 6172 * Reconsume the EOF character in the data state.
michael@0 6173 */
michael@0 6174 break eofloop;
michael@0 6175 case COMMENT_END_DASH:
michael@0 6176 case COMMENT_START_DASH:
michael@0 6177 errEofInComment();
michael@0 6178 /* Emit the comment token. */
michael@0 6179 emitComment(1, 0);
michael@0 6180 /*
michael@0 6181 * Reconsume the EOF character in the data state.
michael@0 6182 */
michael@0 6183 break eofloop;
michael@0 6184 case COMMENT_END_BANG:
michael@0 6185 errEofInComment();
michael@0 6186 /* Emit the comment token. */
michael@0 6187 emitComment(3, 0);
michael@0 6188 /*
michael@0 6189 * Reconsume the EOF character in the data state.
michael@0 6190 */
michael@0 6191 break eofloop;
michael@0 6192 case DOCTYPE:
michael@0 6193 case BEFORE_DOCTYPE_NAME:
michael@0 6194 errEofInDoctype();
michael@0 6195 /*
michael@0 6196 * Create a new DOCTYPE token. Set its force-quirks flag to
michael@0 6197 * on.
michael@0 6198 */
michael@0 6199 forceQuirks = true;
michael@0 6200 /*
michael@0 6201 * Emit the token.
michael@0 6202 */
michael@0 6203 emitDoctypeToken(0);
michael@0 6204 /*
michael@0 6205 * Reconsume the EOF character in the data state.
michael@0 6206 */
michael@0 6207 break eofloop;
michael@0 6208 case DOCTYPE_NAME:
michael@0 6209 errEofInDoctype();
michael@0 6210 strBufToDoctypeName();
michael@0 6211 /*
michael@0 6212 * Set the DOCTYPE token's force-quirks flag to on.
michael@0 6213 */
michael@0 6214 forceQuirks = true;
michael@0 6215 /*
michael@0 6216 * Emit that DOCTYPE token.
michael@0 6217 */
michael@0 6218 emitDoctypeToken(0);
michael@0 6219 /*
michael@0 6220 * Reconsume the EOF character in the data state.
michael@0 6221 */
michael@0 6222 break eofloop;
michael@0 6223 case DOCTYPE_UBLIC:
michael@0 6224 case DOCTYPE_YSTEM:
michael@0 6225 case AFTER_DOCTYPE_NAME:
michael@0 6226 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
michael@0 6227 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
michael@0 6228 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
michael@0 6229 errEofInDoctype();
michael@0 6230 /*
michael@0 6231 * Set the DOCTYPE token's force-quirks flag to on.
michael@0 6232 */
michael@0 6233 forceQuirks = true;
michael@0 6234 /*
michael@0 6235 * Emit that DOCTYPE token.
michael@0 6236 */
michael@0 6237 emitDoctypeToken(0);
michael@0 6238 /*
michael@0 6239 * Reconsume the EOF character in the data state.
michael@0 6240 */
michael@0 6241 break eofloop;
michael@0 6242 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
michael@0 6243 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
michael@0 6244 /* EOF Parse error. */
michael@0 6245 errEofInPublicId();
michael@0 6246 /*
michael@0 6247 * Set the DOCTYPE token's force-quirks flag to on.
michael@0 6248 */
michael@0 6249 forceQuirks = true;
michael@0 6250 /*
michael@0 6251 * Emit that DOCTYPE token.
michael@0 6252 */
michael@0 6253 publicIdentifier = longStrBufToString();
michael@0 6254 emitDoctypeToken(0);
michael@0 6255 /*
michael@0 6256 * Reconsume the EOF character in the data state.
michael@0 6257 */
michael@0 6258 break eofloop;
michael@0 6259 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
michael@0 6260 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
michael@0 6261 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
michael@0 6262 errEofInDoctype();
michael@0 6263 /*
michael@0 6264 * Set the DOCTYPE token's force-quirks flag to on.
michael@0 6265 */
michael@0 6266 forceQuirks = true;
michael@0 6267 /*
michael@0 6268 * Emit that DOCTYPE token.
michael@0 6269 */
michael@0 6270 emitDoctypeToken(0);
michael@0 6271 /*
michael@0 6272 * Reconsume the EOF character in the data state.
michael@0 6273 */
michael@0 6274 break eofloop;
michael@0 6275 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
michael@0 6276 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
michael@0 6277 /* EOF Parse error. */
michael@0 6278 errEofInSystemId();
michael@0 6279 /*
michael@0 6280 * Set the DOCTYPE token's force-quirks flag to on.
michael@0 6281 */
michael@0 6282 forceQuirks = true;
michael@0 6283 /*
michael@0 6284 * Emit that DOCTYPE token.
michael@0 6285 */
michael@0 6286 systemIdentifier = longStrBufToString();
michael@0 6287 emitDoctypeToken(0);
michael@0 6288 /*
michael@0 6289 * Reconsume the EOF character in the data state.
michael@0 6290 */
michael@0 6291 break eofloop;
michael@0 6292 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
michael@0 6293 errEofInDoctype();
michael@0 6294 /*
michael@0 6295 * Set the DOCTYPE token's force-quirks flag to on.
michael@0 6296 */
michael@0 6297 forceQuirks = true;
michael@0 6298 /*
michael@0 6299 * Emit that DOCTYPE token.
michael@0 6300 */
michael@0 6301 emitDoctypeToken(0);
michael@0 6302 /*
michael@0 6303 * Reconsume the EOF character in the data state.
michael@0 6304 */
michael@0 6305 break eofloop;
michael@0 6306 case BOGUS_DOCTYPE:
michael@0 6307 /*
michael@0 6308 * Emit that DOCTYPE token.
michael@0 6309 */
michael@0 6310 emitDoctypeToken(0);
michael@0 6311 /*
michael@0 6312 * Reconsume the EOF character in the data state.
michael@0 6313 */
michael@0 6314 break eofloop;
michael@0 6315 case CONSUME_CHARACTER_REFERENCE:
michael@0 6316 /*
michael@0 6317 * Unlike the definition is the spec, this state does not
michael@0 6318 * return a value and never requires the caller to
michael@0 6319 * backtrack. This state takes care of emitting characters
michael@0 6320 * or appending to the current attribute value. It also
michael@0 6321 * takes care of that in the case when consuming the entity
michael@0 6322 * fails.
michael@0 6323 */
michael@0 6324 /*
michael@0 6325 * This section defines how to consume an entity. This
michael@0 6326 * definition is used when parsing entities in text and in
michael@0 6327 * attributes.
michael@0 6328 *
michael@0 6329 * The behavior depends on the identity of the next
michael@0 6330 * character (the one immediately after the U+0026 AMPERSAND
michael@0 6331 * character):
michael@0 6332 */
michael@0 6333
michael@0 6334 emitOrAppendStrBuf(returnState);
michael@0 6335 state = returnState;
michael@0 6336 continue;
michael@0 6337 case CHARACTER_REFERENCE_HILO_LOOKUP:
michael@0 6338 errNoNamedCharacterMatch();
michael@0 6339 emitOrAppendStrBuf(returnState);
michael@0 6340 state = returnState;
michael@0 6341 continue;
michael@0 6342 case CHARACTER_REFERENCE_TAIL:
michael@0 6343 outer: for (;;) {
michael@0 6344 char c = '\u0000';
michael@0 6345 entCol++;
michael@0 6346 /*
michael@0 6347 * Consume the maximum number of characters possible,
michael@0 6348 * with the consumed characters matching one of the
michael@0 6349 * identifiers in the first column of the named
michael@0 6350 * character references table (in a case-sensitive
michael@0 6351 * manner).
michael@0 6352 */
michael@0 6353 hiloop: for (;;) {
michael@0 6354 if (hi == -1) {
michael@0 6355 break hiloop;
michael@0 6356 }
michael@0 6357 if (entCol == NamedCharacters.NAMES[hi].length()) {
michael@0 6358 break hiloop;
michael@0 6359 }
michael@0 6360 if (entCol > NamedCharacters.NAMES[hi].length()) {
michael@0 6361 break outer;
michael@0 6362 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
michael@0 6363 hi--;
michael@0 6364 } else {
michael@0 6365 break hiloop;
michael@0 6366 }
michael@0 6367 }
michael@0 6368
michael@0 6369 loloop: for (;;) {
michael@0 6370 if (hi < lo) {
michael@0 6371 break outer;
michael@0 6372 }
michael@0 6373 if (entCol == NamedCharacters.NAMES[lo].length()) {
michael@0 6374 candidate = lo;
michael@0 6375 strBufMark = strBufLen;
michael@0 6376 lo++;
michael@0 6377 } else if (entCol > NamedCharacters.NAMES[lo].length()) {
michael@0 6378 break outer;
michael@0 6379 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
michael@0 6380 lo++;
michael@0 6381 } else {
michael@0 6382 break loloop;
michael@0 6383 }
michael@0 6384 }
michael@0 6385 if (hi < lo) {
michael@0 6386 break outer;
michael@0 6387 }
michael@0 6388 continue;
michael@0 6389 }
michael@0 6390
michael@0 6391 if (candidate == -1) {
michael@0 6392 /*
michael@0 6393 * If no match can be made, then this is a parse error.
michael@0 6394 */
michael@0 6395 errNoNamedCharacterMatch();
michael@0 6396 emitOrAppendStrBuf(returnState);
michael@0 6397 state = returnState;
michael@0 6398 continue eofloop;
michael@0 6399 } else {
michael@0 6400 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
michael@0 6401 if (candidateName.length() == 0
michael@0 6402 || candidateName.charAt(candidateName.length() - 1) != ';') {
michael@0 6403 /*
michael@0 6404 * If the last character matched is not a U+003B
michael@0 6405 * SEMICOLON (;), there is a parse error.
michael@0 6406 */
michael@0 6407 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0 6408 /*
michael@0 6409 * If the entity is being consumed as part of an
michael@0 6410 * attribute, and the last character matched is
michael@0 6411 * not a U+003B SEMICOLON (;),
michael@0 6412 */
michael@0 6413 char ch;
michael@0 6414 if (strBufMark == strBufLen) {
michael@0 6415 ch = '\u0000';
michael@0 6416 } else {
michael@0 6417 ch = strBuf[strBufMark];
michael@0 6418 }
michael@0 6419 if ((ch >= '0' && ch <= '9')
michael@0 6420 || (ch >= 'A' && ch <= 'Z')
michael@0 6421 || (ch >= 'a' && ch <= 'z')) {
michael@0 6422 /*
michael@0 6423 * and the next character is in the range
michael@0 6424 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
michael@0 6425 * U+0041 LATIN CAPITAL LETTER A to U+005A
michael@0 6426 * LATIN CAPITAL LETTER Z, or U+0061 LATIN
michael@0 6427 * SMALL LETTER A to U+007A LATIN SMALL
michael@0 6428 * LETTER Z, then, for historical reasons,
michael@0 6429 * all the characters that were matched
michael@0 6430 * after the U+0026 AMPERSAND (&) must be
michael@0 6431 * unconsumed, and nothing is returned.
michael@0 6432 */
michael@0 6433 errNoNamedCharacterMatch();
michael@0 6434 appendStrBufToLongStrBuf();
michael@0 6435 state = returnState;
michael@0 6436 continue eofloop;
michael@0 6437 }
michael@0 6438 }
michael@0 6439 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0 6440 errUnescapedAmpersandInterpretedAsCharacterReference();
michael@0 6441 } else {
michael@0 6442 errNotSemicolonTerminated();
michael@0 6443 }
michael@0 6444 }
michael@0 6445
michael@0 6446 /*
michael@0 6447 * Otherwise, return a character token for the character
michael@0 6448 * corresponding to the entity name (as given by the
michael@0 6449 * second column of the named character references
michael@0 6450 * table).
michael@0 6451 */
michael@0 6452 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
michael@0 6453 if (
michael@0 6454 // [NOCPP[
michael@0 6455 val.length == 1
michael@0 6456 // ]NOCPP]
michael@0 6457 // CPPONLY: val[1] == 0
michael@0 6458 ) {
michael@0 6459 emitOrAppendOne(val, returnState);
michael@0 6460 } else {
michael@0 6461 emitOrAppendTwo(val, returnState);
michael@0 6462 }
michael@0 6463 // this is so complicated!
michael@0 6464 if (strBufMark < strBufLen) {
michael@0 6465 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0 6466 for (int i = strBufMark; i < strBufLen; i++) {
michael@0 6467 appendLongStrBuf(strBuf[i]);
michael@0 6468 }
michael@0 6469 } else {
michael@0 6470 tokenHandler.characters(strBuf, strBufMark,
michael@0 6471 strBufLen - strBufMark);
michael@0 6472 }
michael@0 6473 }
michael@0 6474 state = returnState;
michael@0 6475 continue eofloop;
michael@0 6476 /*
michael@0 6477 * If the markup contains I'm &notit; I tell you, the
michael@0 6478 * entity is parsed as "not", as in, I'm ¬it; I tell
michael@0 6479 * you. But if the markup was I'm &notin; I tell you,
michael@0 6480 * the entity would be parsed as "notin;", resulting in
michael@0 6481 * I'm ∉ I tell you.
michael@0 6482 */
michael@0 6483 }
michael@0 6484 case CONSUME_NCR:
michael@0 6485 case DECIMAL_NRC_LOOP:
michael@0 6486 case HEX_NCR_LOOP:
michael@0 6487 /*
michael@0 6488 * If no characters match the range, then don't consume any
michael@0 6489 * characters (and unconsume the U+0023 NUMBER SIGN
michael@0 6490 * character and, if appropriate, the X character). This is
michael@0 6491 * a parse error; nothing is returned.
michael@0 6492 *
michael@0 6493 * Otherwise, if the next character is a U+003B SEMICOLON,
michael@0 6494 * consume that too. If it isn't, there is a parse error.
michael@0 6495 */
michael@0 6496 if (!seenDigits) {
michael@0 6497 errNoDigitsInNCR();
michael@0 6498 emitOrAppendStrBuf(returnState);
michael@0 6499 state = returnState;
michael@0 6500 continue;
michael@0 6501 } else {
michael@0 6502 errCharRefLacksSemicolon();
michael@0 6503 }
michael@0 6504 // WARNING previous state sets reconsume
michael@0 6505 handleNcrValue(returnState);
michael@0 6506 state = returnState;
michael@0 6507 continue;
michael@0 6508 case CDATA_RSQB:
michael@0 6509 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
michael@0 6510 break eofloop;
michael@0 6511 case CDATA_RSQB_RSQB:
michael@0 6512 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
michael@0 6513 break eofloop;
michael@0 6514 case DATA:
michael@0 6515 default:
michael@0 6516 break eofloop;
michael@0 6517 }
michael@0 6518 }
michael@0 6519 // case DATA:
michael@0 6520 /*
michael@0 6521 * EOF Emit an end-of-file token.
michael@0 6522 */
michael@0 6523 tokenHandler.eof();
michael@0 6524 return;
michael@0 6525 }
michael@0 6526
michael@0 6527 private void emitDoctypeToken(int pos) throws SAXException {
michael@0 6528 cstart = pos + 1;
michael@0 6529 tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
michael@0 6530 forceQuirks);
michael@0 6531 // It is OK and sufficient to release these here, since
michael@0 6532 // there's no way out of the doctype states than through paths
michael@0 6533 // that call this method.
michael@0 6534 doctypeName = null;
michael@0 6535 Portability.releaseString(publicIdentifier);
michael@0 6536 publicIdentifier = null;
michael@0 6537 Portability.releaseString(systemIdentifier);
michael@0 6538 systemIdentifier = null;
michael@0 6539 }
michael@0 6540
michael@0 6541 @Inline protected char checkChar(@NoLength char[] buf, int pos)
michael@0 6542 throws SAXException {
michael@0 6543 return buf[pos];
michael@0 6544 }
michael@0 6545
michael@0 6546 public boolean internalEncodingDeclaration(String internalCharset)
michael@0 6547 throws SAXException {
michael@0 6548 if (encodingDeclarationHandler != null) {
michael@0 6549 return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
michael@0 6550 }
michael@0 6551 return false;
michael@0 6552 }
michael@0 6553
michael@0 6554 /**
michael@0 6555 * @param val
michael@0 6556 * @throws SAXException
michael@0 6557 */
michael@0 6558 private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
michael@0 6559 throws SAXException {
michael@0 6560 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0 6561 appendLongStrBuf(val[0]);
michael@0 6562 appendLongStrBuf(val[1]);
michael@0 6563 } else {
michael@0 6564 tokenHandler.characters(val, 0, 2);
michael@0 6565 }
michael@0 6566 }
michael@0 6567
michael@0 6568 private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
michael@0 6569 throws SAXException {
michael@0 6570 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
michael@0 6571 appendLongStrBuf(val[0]);
michael@0 6572 } else {
michael@0 6573 tokenHandler.characters(val, 0, 1);
michael@0 6574 }
michael@0 6575 }
michael@0 6576
michael@0 6577 public void end() throws SAXException {
michael@0 6578 strBuf = null;
michael@0 6579 longStrBuf = null;
michael@0 6580 doctypeName = null;
michael@0 6581 if (systemIdentifier != null) {
michael@0 6582 Portability.releaseString(systemIdentifier);
michael@0 6583 systemIdentifier = null;
michael@0 6584 }
michael@0 6585 if (publicIdentifier != null) {
michael@0 6586 Portability.releaseString(publicIdentifier);
michael@0 6587 publicIdentifier = null;
michael@0 6588 }
michael@0 6589 if (tagName != null) {
michael@0 6590 tagName.release();
michael@0 6591 tagName = null;
michael@0 6592 }
michael@0 6593 if (attributeName != null) {
michael@0 6594 attributeName.release();
michael@0 6595 attributeName = null;
michael@0 6596 }
michael@0 6597 tokenHandler.endTokenization();
michael@0 6598 if (attributes != null) {
michael@0 6599 // [NOCPP[
michael@0 6600 attributes = null;
michael@0 6601 // ]NOCPP]
michael@0 6602 // CPPONLY: attributes.clear(mappingLangToXmlLang);
michael@0 6603 }
michael@0 6604 }
michael@0 6605
michael@0 6606 public void requestSuspension() {
michael@0 6607 shouldSuspend = true;
michael@0 6608 }
michael@0 6609
michael@0 6610 // [NOCPP[
michael@0 6611
michael@0 6612 public void becomeConfident() {
michael@0 6613 confident = true;
michael@0 6614 }
michael@0 6615
michael@0 6616 /**
michael@0 6617 * Returns the nextCharOnNewLine.
michael@0 6618 *
michael@0 6619 * @return the nextCharOnNewLine
michael@0 6620 */
michael@0 6621 public boolean isNextCharOnNewLine() {
michael@0 6622 return false;
michael@0 6623 }
michael@0 6624
michael@0 6625 public boolean isPrevCR() {
michael@0 6626 return lastCR;
michael@0 6627 }
michael@0 6628
michael@0 6629 /**
michael@0 6630 * Returns the line.
michael@0 6631 *
michael@0 6632 * @return the line
michael@0 6633 */
michael@0 6634 public int getLine() {
michael@0 6635 return -1;
michael@0 6636 }
michael@0 6637
michael@0 6638 /**
michael@0 6639 * Returns the col.
michael@0 6640 *
michael@0 6641 * @return the col
michael@0 6642 */
michael@0 6643 public int getCol() {
michael@0 6644 return -1;
michael@0 6645 }
michael@0 6646
michael@0 6647 // ]NOCPP]
michael@0 6648
michael@0 6649 public boolean isInDataState() {
michael@0 6650 return (stateSave == DATA);
michael@0 6651 }
michael@0 6652
michael@0 6653 public void resetToDataState() {
michael@0 6654 strBufLen = 0;
michael@0 6655 longStrBufLen = 0;
michael@0 6656 stateSave = Tokenizer.DATA;
michael@0 6657 // line = 1; XXX line numbers
michael@0 6658 lastCR = false;
michael@0 6659 index = 0;
michael@0 6660 forceQuirks = false;
michael@0 6661 additional = '\u0000';
michael@0 6662 entCol = -1;
michael@0 6663 firstCharKey = -1;
michael@0 6664 lo = 0;
michael@0 6665 hi = 0; // will always be overwritten before use anyway
michael@0 6666 candidate = -1;
michael@0 6667 strBufMark = 0;
michael@0 6668 prevValue = -1;
michael@0 6669 value = 0;
michael@0 6670 seenDigits = false;
michael@0 6671 endTag = false;
michael@0 6672 shouldSuspend = false;
michael@0 6673 initDoctypeFields();
michael@0 6674 if (tagName != null) {
michael@0 6675 tagName.release();
michael@0 6676 tagName = null;
michael@0 6677 }
michael@0 6678 if (attributeName != null) {
michael@0 6679 attributeName.release();
michael@0 6680 attributeName = null;
michael@0 6681 }
michael@0 6682 if (newAttributesEachTime) {
michael@0 6683 if (attributes != null) {
michael@0 6684 Portability.delete(attributes);
michael@0 6685 attributes = null;
michael@0 6686 }
michael@0 6687 }
michael@0 6688 }
michael@0 6689
michael@0 6690 public void loadState(Tokenizer other) throws SAXException {
michael@0 6691 strBufLen = other.strBufLen;
michael@0 6692 if (strBufLen > strBuf.length) {
michael@0 6693 strBuf = new char[strBufLen];
michael@0 6694 }
michael@0 6695 System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);
michael@0 6696
michael@0 6697 longStrBufLen = other.longStrBufLen;
michael@0 6698 if (longStrBufLen > longStrBuf.length) {
michael@0 6699 longStrBuf = new char[longStrBufLen];
michael@0 6700 }
michael@0 6701 System.arraycopy(other.longStrBuf, 0, longStrBuf, 0, longStrBufLen);
michael@0 6702
michael@0 6703 stateSave = other.stateSave;
michael@0 6704 returnStateSave = other.returnStateSave;
michael@0 6705 endTagExpectation = other.endTagExpectation;
michael@0 6706 endTagExpectationAsArray = other.endTagExpectationAsArray;
michael@0 6707 // line = 1; XXX line numbers
michael@0 6708 lastCR = other.lastCR;
michael@0 6709 index = other.index;
michael@0 6710 forceQuirks = other.forceQuirks;
michael@0 6711 additional = other.additional;
michael@0 6712 entCol = other.entCol;
michael@0 6713 firstCharKey = other.firstCharKey;
michael@0 6714 lo = other.lo;
michael@0 6715 hi = other.hi;
michael@0 6716 candidate = other.candidate;
michael@0 6717 strBufMark = other.strBufMark;
michael@0 6718 prevValue = other.prevValue;
michael@0 6719 value = other.value;
michael@0 6720 seenDigits = other.seenDigits;
michael@0 6721 endTag = other.endTag;
michael@0 6722 shouldSuspend = false;
michael@0 6723
michael@0 6724 if (other.doctypeName == null) {
michael@0 6725 doctypeName = null;
michael@0 6726 } else {
michael@0 6727 doctypeName = Portability.newLocalFromLocal(other.doctypeName,
michael@0 6728 interner);
michael@0 6729 }
michael@0 6730
michael@0 6731 Portability.releaseString(systemIdentifier);
michael@0 6732 if (other.systemIdentifier == null) {
michael@0 6733 systemIdentifier = null;
michael@0 6734 } else {
michael@0 6735 systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
michael@0 6736 }
michael@0 6737
michael@0 6738 Portability.releaseString(publicIdentifier);
michael@0 6739 if (other.publicIdentifier == null) {
michael@0 6740 publicIdentifier = null;
michael@0 6741 } else {
michael@0 6742 publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
michael@0 6743 }
michael@0 6744
michael@0 6745 if (tagName != null) {
michael@0 6746 tagName.release();
michael@0 6747 }
michael@0 6748 if (other.tagName == null) {
michael@0 6749 tagName = null;
michael@0 6750 } else {
michael@0 6751 tagName = other.tagName.cloneElementName(interner);
michael@0 6752 }
michael@0 6753
michael@0 6754 if (attributeName != null) {
michael@0 6755 attributeName.release();
michael@0 6756 }
michael@0 6757 if (other.attributeName == null) {
michael@0 6758 attributeName = null;
michael@0 6759 } else {
michael@0 6760 attributeName = other.attributeName.cloneAttributeName(interner);
michael@0 6761 }
michael@0 6762
michael@0 6763 Portability.delete(attributes);
michael@0 6764 if (other.attributes == null) {
michael@0 6765 attributes = null;
michael@0 6766 } else {
michael@0 6767 attributes = other.attributes.cloneAttributes(interner);
michael@0 6768 }
michael@0 6769 }
michael@0 6770
michael@0 6771 public void initializeWithoutStarting() throws SAXException {
michael@0 6772 confident = false;
michael@0 6773 strBuf = new char[64];
michael@0 6774 longStrBuf = new char[1024];
michael@0 6775 line = 1;
michael@0 6776 // [NOCPP[
michael@0 6777 html4 = false;
michael@0 6778 metaBoundaryPassed = false;
michael@0 6779 wantsComments = tokenHandler.wantsComments();
michael@0 6780 if (!newAttributesEachTime) {
michael@0 6781 attributes = new HtmlAttributes(mappingLangToXmlLang);
michael@0 6782 }
michael@0 6783 // ]NOCPP]
michael@0 6784 resetToDataState();
michael@0 6785 }
michael@0 6786
/*
 * Hook methods, errGarbageAfterLtSlash() through maybeErrSlashInEndTag(boolean).
 *
 * Every method in this run has an empty body in this class, so calling it has
 * no effect here. All are protected and non-final, so a subclass can override
 * them; all declare SAXException so that an override is allowed to throw it.
 *
 * NOTE(review): presumably the tokenizer invokes each hook at the matching
 * parse-error / warning point and an error-reporting subclass supplies the
 * actual message - confirm against the call sites and subclasses, which are
 * outside this section. The char / boolean / HtmlAttributes parameters are
 * unused here.
 */
michael@0 6787 protected void errGarbageAfterLtSlash() throws SAXException {
michael@0 6788 }
michael@0 6789
michael@0 6790 protected void errLtSlashGt() throws SAXException {
michael@0 6791 }
michael@0 6792
michael@0 6793 protected void errWarnLtSlashInRcdata() throws SAXException {
michael@0 6794 }
michael@0 6795
michael@0 6796 protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
michael@0 6797 }
michael@0 6798
michael@0 6799 protected void errCharRefLacksSemicolon() throws SAXException {
michael@0 6800 }
michael@0 6801
michael@0 6802 protected void errNoDigitsInNCR() throws SAXException {
michael@0 6803 }
michael@0 6804
michael@0 6805 protected void errGtInSystemId() throws SAXException {
michael@0 6806 }
michael@0 6807
michael@0 6808 protected void errGtInPublicId() throws SAXException {
michael@0 6809 }
michael@0 6810
michael@0 6811 protected void errNamelessDoctype() throws SAXException {
michael@0 6812 }
michael@0 6813
michael@0 6814 protected void errConsecutiveHyphens() throws SAXException {
michael@0 6815 }
michael@0 6816
michael@0 6817 protected void errPrematureEndOfComment() throws SAXException {
michael@0 6818 }
michael@0 6819
michael@0 6820 protected void errBogusComment() throws SAXException {
michael@0 6821 }
michael@0 6822
michael@0 6823 protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
michael@0 6824 }
michael@0 6825
michael@0 6826 protected void errSlashNotFollowedByGt() throws SAXException {
michael@0 6827 }
michael@0 6828
michael@0 6829 protected void errHtml4XmlVoidSyntax() throws SAXException {
michael@0 6830 }
michael@0 6831
michael@0 6832 protected void errNoSpaceBetweenAttributes() throws SAXException {
michael@0 6833 }
michael@0 6834
michael@0 6835 protected void errHtml4NonNameInUnquotedAttribute(char c)
michael@0 6836 throws SAXException {
michael@0 6837 }
michael@0 6838
michael@0 6839 protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
michael@0 6840 throws SAXException {
michael@0 6841 }
michael@0 6842
michael@0 6843 protected void errAttributeValueMissing() throws SAXException {
michael@0 6844 }
michael@0 6845
michael@0 6846 protected void errBadCharBeforeAttributeNameOrNull(char c)
michael@0 6847 throws SAXException {
michael@0 6848 }
michael@0 6849
michael@0 6850 protected void errEqualsSignBeforeAttributeName() throws SAXException {
michael@0 6851 }
michael@0 6852
michael@0 6853 protected void errBadCharAfterLt(char c) throws SAXException {
michael@0 6854 }
michael@0 6855
michael@0 6856 protected void errLtGt() throws SAXException {
michael@0 6857 }
michael@0 6858
michael@0 6859 protected void errProcessingInstruction() throws SAXException {
michael@0 6860 }
michael@0 6861
michael@0 6862 protected void errUnescapedAmpersandInterpretedAsCharacterReference()
michael@0 6863 throws SAXException {
michael@0 6864 }
michael@0 6865
michael@0 6866 protected void errNotSemicolonTerminated() throws SAXException {
michael@0 6867 }
michael@0 6868
michael@0 6869 protected void errNoNamedCharacterMatch() throws SAXException {
michael@0 6870 }
michael@0 6871
michael@0 6872 protected void errQuoteBeforeAttributeName(char c) throws SAXException {
michael@0 6873 }
michael@0 6874
michael@0 6875 protected void errQuoteOrLtInAttributeNameOrNull(char c)
michael@0 6876 throws SAXException {
michael@0 6877 }
michael@0 6878
michael@0 6879 protected void errExpectedPublicId() throws SAXException {
michael@0 6880 }
michael@0 6881
michael@0 6882 protected void errBogusDoctype() throws SAXException {
michael@0 6883 }
michael@0 6884
michael@0 6885 protected void maybeWarnPrivateUseAstral() throws SAXException {
michael@0 6886 }
michael@0 6887
michael@0 6888 protected void maybeWarnPrivateUse(char ch) throws SAXException {
michael@0 6889 }
michael@0 6890
michael@0 6891 protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
michael@0 6892 throws SAXException {
michael@0 6893 }
michael@0 6894
michael@0 6895 protected void maybeErrSlashInEndTag(boolean selfClosing)
michael@0 6896 throws SAXException {
michael@0 6897 }
michael@0 6898
/**
 * Hook that, in this class, simply returns its argument unchanged.
 * <p>
 * NOTE(review): judging by the name, this is presumably called when a
 * numeric character reference expands to a non-character, and the return
 * value lets an override substitute a different char - confirm at the call
 * site.
 *
 * @param ch
 *            the character passed in by the caller
 * @return <code>ch</code>, unmodified
 * @throws SAXException
 *             never thrown by this implementation; declared so overrides may
 *             throw it
 */
michael@0 6899 protected char errNcrNonCharacter(char ch) throws SAXException {
michael@0 6900 return ch;
michael@0 6901 }
michael@0 6902
/*
 * errAstralNonCharacter(int) and errNcrSurrogate(): both bodies are empty in
 * this class, so they do nothing here; the int argument is unused. They are
 * protected and non-final, hence overridable.
 * NOTE(review): presumably error-reporting hooks for numeric character
 * references - confirm against the call sites.
 */
michael@0 6903 protected void errAstralNonCharacter(int ch) throws SAXException {
michael@0 6904 }
michael@0 6905
michael@0 6906 protected void errNcrSurrogate() throws SAXException {
michael@0 6907 }
michael@0 6908
/**
 * Hook that, in this class, simply returns its argument unchanged.
 * <p>
 * This is the one-argument overload; the class also declares a no-argument
 * <code>errNcrControlChar()</code> with an empty body.
 * NOTE(review): presumably called when a numeric character reference expands
 * to a control character, with the return value allowing an override to
 * substitute a different char - confirm at the call site.
 *
 * @param ch
 *            the character passed in by the caller
 * @return <code>ch</code>, unmodified
 * @throws SAXException
 *             never thrown by this implementation; declared so overrides may
 *             throw it
 */
michael@0 6909 protected char errNcrControlChar(char ch) throws SAXException {
michael@0 6910 return ch;
michael@0 6911 }
michael@0 6912
/*
 * Hook methods, errNcrCr() through noteUnquotedAttributeValue().
 *
 * Every method in this run takes no arguments and has an empty body in this
 * class, so calling it has no effect here. All are protected and non-final,
 * so a subclass can override them; all declare SAXException so that an
 * override is allowed to throw it.
 *
 * NOTE(review): presumably the err* hooks are invoked at the matching
 * parse-error points (EOF inside a construct, bad numeric character
 * reference, doctype spacing, ...) and the note* hooks at informational
 * points - confirm against the call sites and subclasses, which are outside
 * this section.
 */
michael@0 6913 protected void errNcrCr() throws SAXException {
michael@0 6914 }
michael@0 6915
michael@0 6916 protected void errNcrInC1Range() throws SAXException {
michael@0 6917 }
michael@0 6918
michael@0 6919 protected void errEofInPublicId() throws SAXException {
michael@0 6920 }
michael@0 6921
michael@0 6922 protected void errEofInComment() throws SAXException {
michael@0 6923 }
michael@0 6924
michael@0 6925 protected void errEofInDoctype() throws SAXException {
michael@0 6926 }
michael@0 6927
michael@0 6928 protected void errEofInAttributeValue() throws SAXException {
michael@0 6929 }
michael@0 6930
michael@0 6931 protected void errEofInAttributeName() throws SAXException {
michael@0 6932 }
michael@0 6933
michael@0 6934 protected void errEofWithoutGt() throws SAXException {
michael@0 6935 }
michael@0 6936
michael@0 6937 protected void errEofInTagName() throws SAXException {
michael@0 6938 }
michael@0 6939
michael@0 6940 protected void errEofInEndTag() throws SAXException {
michael@0 6941 }
michael@0 6942
michael@0 6943 protected void errEofAfterLt() throws SAXException {
michael@0 6944 }
michael@0 6945
michael@0 6946 protected void errNcrOutOfRange() throws SAXException {
michael@0 6947 }
michael@0 6948
michael@0 6949 protected void errNcrUnassigned() throws SAXException {
michael@0 6950 }
michael@0 6951
michael@0 6952 protected void errDuplicateAttribute() throws SAXException {
michael@0 6953 }
michael@0 6954
michael@0 6955 protected void errEofInSystemId() throws SAXException {
michael@0 6956 }
michael@0 6957
michael@0 6958 protected void errExpectedSystemId() throws SAXException {
michael@0 6959 }
michael@0 6960
michael@0 6961 protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
michael@0 6962 }
michael@0 6963
michael@0 6964 protected void errHyphenHyphenBang() throws SAXException {
michael@0 6965 }
michael@0 6966
michael@0 6967 protected void errNcrControlChar() throws SAXException {
michael@0 6968 }
michael@0 6969
michael@0 6970 protected void errNcrZero() throws SAXException {
michael@0 6971 }
michael@0 6972
michael@0 6973 protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
michael@0 6974 throws SAXException {
michael@0 6975 }
michael@0 6976
michael@0 6977 protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
michael@0 6978 }
michael@0 6979
michael@0 6980 protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
michael@0 6981 throws SAXException {
michael@0 6982 }
michael@0 6983
michael@0 6984 protected void noteAttributeWithoutValue() throws SAXException {
michael@0 6985 }
michael@0 6986
michael@0 6987 protected void noteUnquotedAttributeValue() throws SAXException {
michael@0 6988 }
michael@0 6989
michael@0 6990 /**
michael@0 6991 * Stores the given handler in this tokenizer's <code>encodingDeclarationHandler</code> field, replacing any previous value.
michael@0 6992 *
michael@0 6993 * @param encodingDeclarationHandler
michael@0 6994 * the handler to store; it is assigned as-is, with no null check here
michael@0 6995 */
michael@0 6996 public void setEncodingDeclarationHandler(
michael@0 6997 EncodingDeclarationHandler encodingDeclarationHandler) {
michael@0 6998 this.encodingDeclarationHandler = encodingDeclarationHandler;
michael@0 6999 }
michael@0 7000
/*
 * Package-private cleanup: hands the attributes holder to
 * Portability.delete(...) and then nulls the field so no reference to the
 * deleted object is kept.
 * NOTE(review): the name and the comment below suggest this maps to a
 * destructor in the translated (non-Java) build - confirm.
 */
michael@0 7001 void destructor() {
michael@0 7002 // The translator will write refcount tracing stuff here
michael@0 7003 Portability.delete(attributes);
michael@0 7004 attributes = null;
michael@0 7005 }
michael@0 7006
michael@0 7007 // [NOCPP[
michael@0 7008
michael@0 7009 /**
michael@0 7010 * Sets an offset to be added to the position reported to
michael@0 7011 * <code>TransitionHandler</code>. The body in this class is empty, so the offset is ignored here.
michael@0 7012 *
michael@0 7013 * @param offset the offset; unused by this implementation (NOTE(review): presumably consumed by an overriding subclass - confirm)
michael@0 7014 */
michael@0 7015 public void setTransitionBaseOffset(int offset) {
michael@0 7016
michael@0 7017 }
michael@0 7018
michael@0 7019 // ]NOCPP]
michael@0 7020
michael@0 7021 }

mercurial