Fri, 16 Jan 2015 18:13:44 +0100
Integrate suggestion from review to improve consistency with existing code.
1 /*
2 * Copyright (c) 2005-2007 Henri Sivonen
3 * Copyright (c) 2007-2013 Mozilla Foundation
4 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
5 * Foundation, and Opera Software ASA.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
24 */
26 /*
27 * The comments following this one that use the same comment syntax as this
28 * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
29 * amended as of June 18 2008 and May 31 2010.
30 * That document came with this statement:
31 * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
32 * Opera Software ASA. You are granted a license to use, reproduce and
33 * create derivative works of this document."
34 */
36 package nu.validator.htmlparser.impl;
38 import nu.validator.htmlparser.annotation.Auto;
39 import nu.validator.htmlparser.annotation.CharacterName;
40 import nu.validator.htmlparser.annotation.Const;
41 import nu.validator.htmlparser.annotation.Inline;
42 import nu.validator.htmlparser.annotation.Local;
43 import nu.validator.htmlparser.annotation.NoLength;
44 import nu.validator.htmlparser.common.EncodingDeclarationHandler;
45 import nu.validator.htmlparser.common.Interner;
46 import nu.validator.htmlparser.common.TokenHandler;
47 import nu.validator.htmlparser.common.XmlViolationPolicy;
49 import org.xml.sax.ErrorHandler;
50 import org.xml.sax.Locator;
51 import org.xml.sax.SAXException;
52 import org.xml.sax.SAXParseException;
54 /**
55 * An implementation of
56 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
57 *
58 * This class implements the <code>Locator</code> interface. This is not an
59 * incidental implementation detail: Users of this class are encouraged to make
60 * use of the <code>Locator</code> nature.
61 *
62 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
63 * can be configured to treat these conditions as fatal or to coerce the infoset
64 * to something that XML 1.0 allows.
65 *
66 * @version $Id$
67 * @author hsivonen
68 */
69 public class Tokenizer implements Locator {
71 private static final int DATA_AND_RCDATA_MASK = ~1;
73 public static final int DATA = 0;
75 public static final int RCDATA = 1;
77 public static final int SCRIPT_DATA = 2;
79 public static final int RAWTEXT = 3;
81 public static final int SCRIPT_DATA_ESCAPED = 4;
83 public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
85 public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
87 public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
89 public static final int PLAINTEXT = 8;
91 public static final int TAG_OPEN = 9;
93 public static final int CLOSE_TAG_OPEN = 10;
95 public static final int TAG_NAME = 11;
97 public static final int BEFORE_ATTRIBUTE_NAME = 12;
99 public static final int ATTRIBUTE_NAME = 13;
101 public static final int AFTER_ATTRIBUTE_NAME = 14;
103 public static final int BEFORE_ATTRIBUTE_VALUE = 15;
105 public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
107 public static final int BOGUS_COMMENT = 17;
109 public static final int MARKUP_DECLARATION_OPEN = 18;
111 public static final int DOCTYPE = 19;
113 public static final int BEFORE_DOCTYPE_NAME = 20;
115 public static final int DOCTYPE_NAME = 21;
117 public static final int AFTER_DOCTYPE_NAME = 22;
119 public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
121 public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
123 public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
125 public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
127 public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
129 public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
131 public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
133 public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
135 public static final int BOGUS_DOCTYPE = 31;
137 public static final int COMMENT_START = 32;
139 public static final int COMMENT_START_DASH = 33;
141 public static final int COMMENT = 34;
143 public static final int COMMENT_END_DASH = 35;
145 public static final int COMMENT_END = 36;
147 public static final int COMMENT_END_BANG = 37;
149 public static final int NON_DATA_END_TAG_NAME = 38;
151 public static final int MARKUP_DECLARATION_HYPHEN = 39;
153 public static final int MARKUP_DECLARATION_OCTYPE = 40;
155 public static final int DOCTYPE_UBLIC = 41;
157 public static final int DOCTYPE_YSTEM = 42;
159 public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
161 public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
163 public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
165 public static final int CONSUME_CHARACTER_REFERENCE = 46;
167 public static final int CONSUME_NCR = 47;
169 public static final int CHARACTER_REFERENCE_TAIL = 48;
171 public static final int HEX_NCR_LOOP = 49;
173 public static final int DECIMAL_NRC_LOOP = 50;
175 public static final int HANDLE_NCR_VALUE = 51;
177 public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
179 public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
181 public static final int SELF_CLOSING_START_TAG = 54;
183 public static final int CDATA_START = 55;
185 public static final int CDATA_SECTION = 56;
187 public static final int CDATA_RSQB = 57;
189 public static final int CDATA_RSQB_RSQB = 58;
191 public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
193 public static final int SCRIPT_DATA_ESCAPE_START = 60;
195 public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
197 public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
199 public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
201 public static final int BOGUS_COMMENT_HYPHEN = 64;
203 public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
205 public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
207 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
209 public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
211 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
213 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
215 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
217 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
219 public static final int PROCESSING_INSTRUCTION = 73;
221 public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
223 /**
224 * Magic value for UTF-16 operations.
225 */
226 private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
228 /**
229 * UTF-16 code unit array containing less than and greater than for emitting
230 * those characters on certain parse errors.
231 */
232 private static final @NoLength char[] LT_GT = { '<', '>' };
234 /**
235 * UTF-16 code unit array containing less than and solidus for emitting
236 * those characters on certain parse errors.
237 */
238 private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
240 /**
241 * UTF-16 code unit array containing ]] for emitting those characters on
242 * state transitions.
243 */
244 private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
246 /**
247 * Array version of U+FFFD.
248 */
249 private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
251 // [NOCPP[
253 /**
254 * Array version of space.
255 */
256 private static final @NoLength char[] SPACE = { ' ' };
258 // ]NOCPP]
260 /**
261 * Array version of line feed.
262 */
263 private static final @NoLength char[] LF = { '\n' };
265 /**
266 * Buffer growth parameter.
267 */
268 private static final int BUFFER_GROW_BY = 1024;
270 /**
271 * "CDATA[" as <code>char[]</code>
272 */
273 private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T',
274 'A', '[' };
276 /**
277 * "octype" as <code>char[]</code>
278 */
279 private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p',
280 'e' };
282 /**
283 * "ublic" as <code>char[]</code>
284 */
285 private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' };
287 /**
288 * "ystem" as <code>char[]</code>
289 */
290 private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' };
292 private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
294 private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
296 private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
298 private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
299 'e', 'x', 't' };
301 private static final char[] XMP_ARR = { 'x', 'm', 'p' };
303 private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
304 'e', 'a' };
306 private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
308 private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
309 'd' };
311 private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
312 'p', 't' };
314 private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
315 'e', 's' };
317 /**
318 * The token handler.
319 */
320 protected final TokenHandler tokenHandler;
322 protected EncodingDeclarationHandler encodingDeclarationHandler;
324 // [NOCPP[
326 /**
327 * The error handler.
328 */
329 protected ErrorHandler errorHandler;
331 // ]NOCPP]
333 /**
334 * Whether the previous char read was CR.
335 */
336 protected boolean lastCR;
338 protected int stateSave;
340 private int returnStateSave;
342 protected int index;
344 private boolean forceQuirks;
346 private char additional;
348 private int entCol;
350 private int firstCharKey;
352 private int lo;
354 private int hi;
356 private int candidate;
358 private int strBufMark;
360 private int prevValue;
362 protected int value;
364 private boolean seenDigits;
366 protected int cstart;
368 /**
369 * The SAX public id for the resource being tokenized. (Only passed back
370 * as part of locator data.)
371 */
372 private String publicId;
374 /**
375 * The SAX system id for the resource being tokenized. (Only passed back
376 * as part of locator data.)
377 */
378 private String systemId;
380 /**
381 * Buffer for short identifiers.
382 */
383 private @Auto char[] strBuf;
385 /**
386 * Number of significant <code>char</code>s in <code>strBuf</code>.
387 */
388 private int strBufLen;
390 /**
391 * <code>-1</code> to indicate that <code>strBuf</code> is used or otherwise
392 * an offset to the main buffer.
393 */
394 // private int strBufOffset = -1;
395 /**
396 * Buffer for long strings.
397 */
398 private @Auto char[] longStrBuf;
400 /**
401 * Number of significant <code>char</code>s in <code>longStrBuf</code>.
402 */
403 private int longStrBufLen;
405 /**
406 * <code>-1</code> to indicate that <code>longStrBuf</code> is used or
407 * otherwise an offset to the main buffer.
408 */
409 // private int longStrBufOffset = -1;
411 /**
412 * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
413 */
414 private final @Auto char[] bmpChar;
416 /**
417 * Buffer for expanding astral NCRs.
418 */
419 private final @Auto char[] astralChar;
421 /**
422 * The element whose end tag closes the current CDATA or RCDATA element.
423 */
424 protected ElementName endTagExpectation = null;
426 private char[] endTagExpectationAsArray; // not @Auto!
428 /**
429 * <code>true</code> if tokenizing an end tag
430 */
431 protected boolean endTag;
433 /**
434 * The current tag token name.
435 */
436 private ElementName tagName = null;
438 /**
439 * The current attribute name.
440 */
441 protected AttributeName attributeName = null;
443 // [NOCPP[
445 /**
446 * Whether comment tokens are emitted.
447 */
448 private boolean wantsComments = false;
450 /**
451 * <code>true</code> when HTML4-specific additional errors are requested.
452 */
453 protected boolean html4;
455 /**
456 * Whether the stream is past the first 512 bytes.
457 */
458 private boolean metaBoundaryPassed;
460 // ]NOCPP]
462 /**
463 * The name of the current doctype token.
464 */
465 private @Local String doctypeName;
467 /**
468 * The public id of the current doctype token.
469 */
470 private String publicIdentifier;
472 /**
473 * The system id of the current doctype token.
474 */
475 private String systemIdentifier;
477 /**
478 * The attribute holder.
479 */
480 private HtmlAttributes attributes;
482 // [NOCPP[
484 /**
485 * The policy for vertical tab and form feed.
486 */
487 private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
489 /**
490 * The policy for comments.
491 */
492 private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
494 private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
496 private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
498 private boolean html4ModeCompatibleWithXhtml1Schemata;
500 private int mappingLangToXmlLang;
502 // ]NOCPP]
504 private final boolean newAttributesEachTime;
506 private boolean shouldSuspend;
508 protected boolean confident;
510 private int line;
512 private Interner interner;
514 // CPPONLY: private boolean viewingXmlSource;
516 // [NOCPP[
518 protected LocatorImpl ampersandLocation;
/**
 * Creates a tokenizer that reports to the given token handler and lets the
 * caller decide whether attribute holders are reused.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 * @param newAttributesEachTime
 *            when <code>true</code>, <code>emptyAttributes()</code> hands
 *            out a fresh <code>HtmlAttributes</code> on every call and the
 *            attribute holder is dropped after each emitted tag
 */
public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
    this.tokenHandler = tokenHandler;
    encodingDeclarationHandler = null;
    this.newAttributesEachTime = newAttributesEachTime;
    // Scratch buffers for expanding numeric character references.
    bmpChar = new char[1];
    astralChar = new char[2];
    // No token is in progress yet.
    tagName = null;
    attributeName = null;
    doctypeName = null;
    publicIdentifier = null;
    systemIdentifier = null;
    attributes = null;
}
534 // ]NOCPP]
/**
 * Creates a tokenizer that reports to the given token handler. In the Java
 * build <code>newAttributesEachTime</code> is fixed to <code>false</code>
 * and no attribute holder is allocated here.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 */
public Tokenizer(TokenHandler tokenHandler
// CPPONLY: , boolean viewingXmlSource
) {
    this.tokenHandler = tokenHandler;
    encodingDeclarationHandler = null;
    // [NOCPP[
    newAttributesEachTime = false;
    // ]NOCPP]
    // Scratch buffers for expanding numeric character references.
    bmpChar = new char[1];
    astralChar = new char[2];
    // No token is in progress yet.
    tagName = null;
    attributeName = null;
    doctypeName = null;
    publicIdentifier = null;
    systemIdentifier = null;
    // [NOCPP[
    attributes = null;
    // ]NOCPP]
    // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
    // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
    // CPPONLY: this.viewingXmlSource = viewingXmlSource;
}
565 public void setInterner(Interner interner) {
566 this.interner = interner;
567 }
569 public void initLocation(String newPublicId, String newSystemId) {
570 this.systemId = newSystemId;
571 this.publicId = newPublicId;
573 }
575 // CPPONLY: boolean isViewingXmlSource() {
576 // CPPONLY: return viewingXmlSource;
577 // CPPONLY: }
579 // [NOCPP[
581 /**
582 * Returns the mappingLangToXmlLang.
583 *
584 * @return the mappingLangToXmlLang
585 */
586 public boolean isMappingLangToXmlLang() {
587 return mappingLangToXmlLang == AttributeName.HTML_LANG;
588 }
590 /**
591 * Sets the mappingLangToXmlLang.
592 *
593 * @param mappingLangToXmlLang
594 * the mappingLangToXmlLang to set
595 */
596 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
597 this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
598 : AttributeName.HTML;
599 }
601 /**
602 * Sets the error handler.
603 *
604 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
605 */
606 public void setErrorHandler(ErrorHandler eh) {
607 this.errorHandler = eh;
608 }
610 public ErrorHandler getErrorHandler() {
611 return this.errorHandler;
612 }
614 /**
615 * Sets the commentPolicy.
616 *
617 * @param commentPolicy
618 * the commentPolicy to set
619 */
620 public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
621 this.commentPolicy = commentPolicy;
622 }
624 /**
625 * Sets the contentNonXmlCharPolicy.
626 *
627 * @param contentNonXmlCharPolicy
628 * the contentNonXmlCharPolicy to set
629 */
630 public void setContentNonXmlCharPolicy(
631 XmlViolationPolicy contentNonXmlCharPolicy) {
632 if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
633 throw new IllegalArgumentException(
634 "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
635 }
636 }
638 /**
639 * Sets the contentSpacePolicy.
640 *
641 * @param contentSpacePolicy
642 * the contentSpacePolicy to set
643 */
644 public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
645 this.contentSpacePolicy = contentSpacePolicy;
646 }
648 /**
649 * Sets the xmlnsPolicy.
650 *
651 * @param xmlnsPolicy
652 * the xmlnsPolicy to set
653 */
654 public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
655 if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
656 throw new IllegalArgumentException("Can't use FATAL here.");
657 }
658 this.xmlnsPolicy = xmlnsPolicy;
659 }
661 public void setNamePolicy(XmlViolationPolicy namePolicy) {
662 this.namePolicy = namePolicy;
663 }
665 /**
666 * Sets the html4ModeCompatibleWithXhtml1Schemata.
667 *
668 * @param html4ModeCompatibleWithXhtml1Schemata
669 * the html4ModeCompatibleWithXhtml1Schemata to set
670 */
671 public void setHtml4ModeCompatibleWithXhtml1Schemata(
672 boolean html4ModeCompatibleWithXhtml1Schemata) {
673 this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
674 }
676 // ]NOCPP]
678 // For the token handler to call
679 /**
680 * Sets the tokenizer state and the associated element name. This should
681 * only ever used to put the tokenizer into one of the states that have
682 * a special end tag expectation.
683 *
684 * @param specialTokenizerState
685 * the tokenizer state to set
686 * @param endTagExpectation
687 * the expected end tag for transitioning back to normal
688 */
689 public void setStateAndEndTagExpectation(int specialTokenizerState,
690 @Local String endTagExpectation) {
691 this.stateSave = specialTokenizerState;
692 if (specialTokenizerState == Tokenizer.DATA) {
693 return;
694 }
695 @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
696 this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,
697 asArray.length, interner);
698 endTagExpectationToArray();
699 }
701 /**
702 * Sets the tokenizer state and the associated element name. This should
703 * only ever used to put the tokenizer into one of the states that have
704 * a special end tag expectation.
705 *
706 * @param specialTokenizerState
707 * the tokenizer state to set
708 * @param endTagExpectation
709 * the expected end tag for transitioning back to normal
710 */
711 public void setStateAndEndTagExpectation(int specialTokenizerState,
712 ElementName endTagExpectation) {
713 this.stateSave = specialTokenizerState;
714 this.endTagExpectation = endTagExpectation;
715 endTagExpectationToArray();
716 }
718 private void endTagExpectationToArray() {
719 switch (endTagExpectation.getGroup()) {
720 case TreeBuilder.TITLE:
721 endTagExpectationAsArray = TITLE_ARR;
722 return;
723 case TreeBuilder.SCRIPT:
724 endTagExpectationAsArray = SCRIPT_ARR;
725 return;
726 case TreeBuilder.STYLE:
727 endTagExpectationAsArray = STYLE_ARR;
728 return;
729 case TreeBuilder.PLAINTEXT:
730 endTagExpectationAsArray = PLAINTEXT_ARR;
731 return;
732 case TreeBuilder.XMP:
733 endTagExpectationAsArray = XMP_ARR;
734 return;
735 case TreeBuilder.TEXTAREA:
736 endTagExpectationAsArray = TEXTAREA_ARR;
737 return;
738 case TreeBuilder.IFRAME:
739 endTagExpectationAsArray = IFRAME_ARR;
740 return;
741 case TreeBuilder.NOEMBED:
742 endTagExpectationAsArray = NOEMBED_ARR;
743 return;
744 case TreeBuilder.NOSCRIPT:
745 endTagExpectationAsArray = NOSCRIPT_ARR;
746 return;
747 case TreeBuilder.NOFRAMES:
748 endTagExpectationAsArray = NOFRAMES_ARR;
749 return;
750 default:
751 assert false: "Bad end tag expectation.";
752 return;
753 }
754 }
756 /**
757 * For C++ use only.
758 */
759 public void setLineNumber(int line) {
760 this.line = line;
761 }
763 // start Locator impl
765 /**
766 * @see org.xml.sax.Locator#getLineNumber()
767 */
768 @Inline public int getLineNumber() {
769 return line;
770 }
772 // [NOCPP[
774 /**
775 * @see org.xml.sax.Locator#getColumnNumber()
776 */
777 @Inline public int getColumnNumber() {
778 return -1;
779 }
781 /**
782 * @see org.xml.sax.Locator#getPublicId()
783 */
784 public String getPublicId() {
785 return publicId;
786 }
788 /**
789 * @see org.xml.sax.Locator#getSystemId()
790 */
791 public String getSystemId() {
792 return systemId;
793 }
795 // end Locator impl
797 // end public API
799 public void notifyAboutMetaBoundary() {
800 metaBoundaryPassed = true;
801 }
803 void turnOnAdditionalHtml4Errors() {
804 html4 = true;
805 }
807 // ]NOCPP]
809 HtmlAttributes emptyAttributes() {
810 // [NOCPP[
811 if (newAttributesEachTime) {
812 return new HtmlAttributes(mappingLangToXmlLang);
813 } else {
814 // ]NOCPP]
815 return HtmlAttributes.EMPTY_ATTRIBUTES;
816 // [NOCPP[
817 }
818 // ]NOCPP]
819 }
821 @Inline private void clearStrBufAndAppend(char c) {
822 strBuf[0] = c;
823 strBufLen = 1;
824 }
826 @Inline private void clearStrBuf() {
827 strBufLen = 0;
828 }
830 /**
831 * Appends to the smaller buffer.
832 *
833 * @param c
834 * the UTF-16 code unit to append
835 */
836 private void appendStrBuf(char c) {
837 if (strBufLen == strBuf.length) {
838 char[] newBuf = new char[strBuf.length + Tokenizer.BUFFER_GROW_BY];
839 System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
840 strBuf = newBuf;
841 }
842 strBuf[strBufLen++] = c;
843 }
845 /**
846 * The smaller buffer as a String. Currently only used for error reporting.
847 *
848 * <p>
849 * C++ memory note: The return value must be released.
850 *
851 * @return the smaller buffer as a string
852 */
853 protected String strBufToString() {
854 return Portability.newStringFromBuffer(strBuf, 0, strBufLen);
855 }
857 /**
858 * Returns the short buffer as a local name. The return value is released in
859 * emitDoctypeToken().
860 *
861 * @return the smaller buffer as local name
862 */
863 private void strBufToDoctypeName() {
864 doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
865 interner);
866 }
868 /**
869 * Emits the smaller buffer as character tokens.
870 *
871 * @throws SAXException
872 * if the token handler threw
873 */
874 private void emitStrBuf() throws SAXException {
875 if (strBufLen > 0) {
876 tokenHandler.characters(strBuf, 0, strBufLen);
877 }
878 }
880 @Inline private void clearLongStrBuf() {
881 longStrBufLen = 0;
882 }
884 @Inline private void clearLongStrBufAndAppend(char c) {
885 longStrBuf[0] = c;
886 longStrBufLen = 1;
887 }
889 /**
890 * Appends to the larger buffer.
891 *
892 * @param c
893 * the UTF-16 code unit to append
894 */
895 private void appendLongStrBuf(char c) {
896 if (longStrBufLen == longStrBuf.length) {
897 char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)];
898 System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
899 longStrBuf = newBuf;
900 }
901 longStrBuf[longStrBufLen++] = c;
902 }
904 @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
905 // [NOCPP[
906 switch (commentPolicy) {
907 case ALTER_INFOSET:
908 // detachLongStrBuf();
909 appendLongStrBuf(' ');
910 // FALLTHROUGH
911 case ALLOW:
912 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
913 // ]NOCPP]
914 appendLongStrBuf('-');
915 // [NOCPP[
916 break;
917 case FATAL:
918 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
919 break;
920 }
921 // ]NOCPP]
922 }
924 // [NOCPP[
925 private void maybeAppendSpaceToBogusComment() throws SAXException {
926 switch (commentPolicy) {
927 case ALTER_INFOSET:
928 // detachLongStrBuf();
929 appendLongStrBuf(' ');
930 // FALLTHROUGH
931 case ALLOW:
932 warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
933 break;
934 case FATAL:
935 fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
936 break;
937 }
938 }
940 // ]NOCPP]
942 @Inline private void adjustDoubleHyphenAndAppendToLongStrBufAndErr(char c)
943 throws SAXException {
944 errConsecutiveHyphens();
945 // [NOCPP[
946 switch (commentPolicy) {
947 case ALTER_INFOSET:
948 // detachLongStrBuf();
949 longStrBufLen--;
950 appendLongStrBuf(' ');
951 appendLongStrBuf('-');
952 // FALLTHROUGH
953 case ALLOW:
954 warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
955 // ]NOCPP]
956 appendLongStrBuf(c);
957 // [NOCPP[
958 break;
959 case FATAL:
960 fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
961 break;
962 }
963 // ]NOCPP]
964 }
966 private void appendLongStrBuf(@NoLength char[] buffer, int offset, int length) {
967 int reqLen = longStrBufLen + length;
968 if (longStrBuf.length < reqLen) {
969 char[] newBuf = new char[reqLen + (reqLen >> 1)];
970 System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
971 longStrBuf = newBuf;
972 }
973 System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length);
974 longStrBufLen = reqLen;
975 }
977 /**
978 * Append the contents of the smaller buffer to the larger one.
979 */
980 @Inline private void appendStrBufToLongStrBuf() {
981 appendLongStrBuf(strBuf, 0, strBufLen);
982 }
984 /**
985 * The larger buffer as a string.
986 *
987 * <p>
988 * C++ memory note: The return value must be released.
989 *
990 * @return the larger buffer as a string
991 */
992 private String longStrBufToString() {
993 return Portability.newStringFromBuffer(longStrBuf, 0, longStrBufLen);
994 }
996 /**
997 * Emits the current comment token.
998 *
999 * @param pos
1000 * TODO
1001 *
1002 * @throws SAXException
1003 */
1004 private void emitComment(int provisionalHyphens, int pos)
1005 throws SAXException {
1006 // [NOCPP[
1007 if (wantsComments) {
1008 // ]NOCPP]
1009 // if (longStrBufOffset != -1) {
1010 // tokenHandler.comment(buf, longStrBufOffset, longStrBufLen
1011 // - provisionalHyphens);
1012 // } else {
1013 tokenHandler.comment(longStrBuf, 0, longStrBufLen
1014 - provisionalHyphens);
1015 // }
1016 // [NOCPP[
1017 }
1018 // ]NOCPP]
1019 cstart = pos + 1;
1020 }
1022 /**
1023 * Flushes coalesced character tokens.
1024 *
1025 * @param buf
1026 * TODO
1027 * @param pos
1028 * TODO
1029 *
1030 * @throws SAXException
1031 */
1032 protected void flushChars(@NoLength char[] buf, int pos)
1033 throws SAXException {
1034 if (pos > cstart) {
1035 tokenHandler.characters(buf, cstart, pos - cstart);
1036 }
1037 cstart = Integer.MAX_VALUE;
1038 }
1040 /**
1041 * Reports an condition that would make the infoset incompatible with XML
1042 * 1.0 as fatal.
1043 *
1044 * @param message
1045 * the message
1046 * @throws SAXException
1047 * @throws SAXParseException
1048 */
1049 public void fatal(String message) throws SAXException {
1050 SAXParseException spe = new SAXParseException(message, this);
1051 if (errorHandler != null) {
1052 errorHandler.fatalError(spe);
1053 }
1054 throw spe;
1055 }
1057 /**
1058 * Reports a Parse Error.
1059 *
1060 * @param message
1061 * the message
1062 * @throws SAXException
1063 */
1064 public void err(String message) throws SAXException {
1065 if (errorHandler == null) {
1066 return;
1067 }
1068 SAXParseException spe = new SAXParseException(message, this);
1069 errorHandler.error(spe);
1070 }
1072 public void errTreeBuilder(String message) throws SAXException {
1073 ErrorHandler eh = null;
1074 if (tokenHandler instanceof TreeBuilder<?>) {
1075 TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
1076 eh = treeBuilder.getErrorHandler();
1077 }
1078 if (eh == null) {
1079 eh = errorHandler;
1080 }
1081 if (eh == null) {
1082 return;
1083 }
1084 SAXParseException spe = new SAXParseException(message, this);
1085 eh.error(spe);
1086 }
1088 /**
1089 * Reports a warning
1090 *
1091 * @param message
1092 * the message
1093 * @throws SAXException
1094 */
1095 public void warn(String message) throws SAXException {
1096 if (errorHandler == null) {
1097 return;
1098 }
1099 SAXParseException spe = new SAXParseException(message, this);
1100 errorHandler.warning(spe);
1101 }
1103 private void strBufToElementNameString() {
1104 // if (strBufOffset != -1) {
1105 // return ElementName.elementNameByBuffer(buf, strBufOffset, strBufLen);
1106 // } else {
1107 tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
1108 interner);
1109 // }
1110 }
1112 private int emitCurrentTagToken(boolean selfClosing, int pos)
1113 throws SAXException {
1114 cstart = pos + 1;
1115 maybeErrSlashInEndTag(selfClosing);
1116 stateSave = Tokenizer.DATA;
1117 HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
1118 : attributes);
1119 if (endTag) {
1120 /*
1121 * When an end tag token is emitted, the content model flag must be
1122 * switched to the PCDATA state.
1123 */
1124 maybeErrAttributesOnEndTag(attrs);
1125 // CPPONLY: if (!viewingXmlSource) {
1126 tokenHandler.endTag(tagName);
1127 // CPPONLY: }
1128 // CPPONLY: if (newAttributesEachTime) {
1129 // CPPONLY: Portability.delete(attributes);
1130 // CPPONLY: attributes = null;
1131 // CPPONLY: }
1132 } else {
1133 // CPPONLY: if (viewingXmlSource) {
1134 // CPPONLY: assert newAttributesEachTime;
1135 // CPPONLY: Portability.delete(attributes);
1136 // CPPONLY: attributes = null;
1137 // CPPONLY: } else {
1138 tokenHandler.startTag(tagName, attrs, selfClosing);
1139 // CPPONLY: }
1140 }
1141 tagName.release();
1142 tagName = null;
1143 if (newAttributesEachTime) {
1144 attributes = null;
1145 } else {
1146 attributes.clear(mappingLangToXmlLang);
1147 }
1148 /*
1149 * The token handler may have called setStateAndEndTagExpectation
1150 * and changed stateSave since the start of this method.
1151 */
1152 return stateSave;
1153 }
1155 private void attributeNameComplete() throws SAXException {
1156 // if (strBufOffset != -1) {
1157 // attributeName = AttributeName.nameByBuffer(buf, strBufOffset,
1158 // strBufLen, namePolicy != XmlViolationPolicy.ALLOW);
1159 // } else {
1160 attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
1161 // [NOCPP[
1162 , namePolicy != XmlViolationPolicy.ALLOW
1163 // ]NOCPP]
1164 , interner);
1165 // }
1167 if (attributes == null) {
1168 attributes = new HtmlAttributes(mappingLangToXmlLang);
1169 }
1171 /*
1172 * When the user agent leaves the attribute name state (and before
1173 * emitting the tag token, if appropriate), the complete attribute's
1174 * name must be compared to the other attributes on the same token; if
1175 * there is already an attribute on the token with the exact same name,
1176 * then this is a parse error and the new attribute must be dropped,
1177 * along with the value that gets associated with it (if any).
1178 */
1179 if (attributes.contains(attributeName)) {
1180 errDuplicateAttribute();
1181 attributeName.release();
1182 attributeName = null;
1183 }
1184 }
1186 private void addAttributeWithoutValue() throws SAXException {
1187 noteAttributeWithoutValue();
1189 // [NOCPP[
1190 if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
1191 && ElementName.META == tagName) {
1192 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
1193 }
1194 // ]NOCPP]
1195 if (attributeName != null) {
1196 // [NOCPP[
1197 if (html4) {
1198 if (attributeName.isBoolean()) {
1199 if (html4ModeCompatibleWithXhtml1Schemata) {
1200 attributes.addAttribute(attributeName,
1201 attributeName.getLocal(AttributeName.HTML),
1202 xmlnsPolicy);
1203 } else {
1204 attributes.addAttribute(attributeName, "", xmlnsPolicy);
1205 }
1206 } else {
1207 if (AttributeName.BORDER != attributeName) {
1208 err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
1209 attributes.addAttribute(attributeName, "", xmlnsPolicy);
1210 }
1211 }
1212 } else {
1213 if (AttributeName.SRC == attributeName
1214 || AttributeName.HREF == attributeName) {
1215 warn("Attribute \u201C"
1216 + attributeName.getLocal(AttributeName.HTML)
1217 + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
1218 }
1219 // ]NOCPP]
1220 attributes.addAttribute(attributeName,
1221 Portability.newEmptyString()
1222 // [NOCPP[
1223 , xmlnsPolicy
1224 // ]NOCPP]
1225 );
1226 // [NOCPP[
1227 }
1228 // ]NOCPP]
1229 attributeName = null; // attributeName has been adopted by the
1230 // |attributes| object
1231 }
1232 }
1234 private void addAttributeWithValue() throws SAXException {
1235 // [NOCPP[
1236 if (metaBoundaryPassed && ElementName.META == tagName
1237 && AttributeName.CHARSET == attributeName) {
1238 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
1239 }
1240 // ]NOCPP]
1241 if (attributeName != null) {
1242 String val = longStrBufToString(); // Ownership transferred to
1243 // HtmlAttributes
1244 // CPPONLY: if (mViewSource) {
1245 // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
1246 // CPPONLY: }
1247 // [NOCPP[
1248 if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
1249 && attributeName.isCaseFolded()) {
1250 val = newAsciiLowerCaseStringFromString(val);
1251 }
1252 // ]NOCPP]
1253 attributes.addAttribute(attributeName, val
1254 // [NOCPP[
1255 , xmlnsPolicy
1256 // ]NOCPP]
1257 );
1258 attributeName = null; // attributeName has been adopted by the
1259 // |attributes| object
1260 }
1261 }
1263 // [NOCPP[
1265 private static String newAsciiLowerCaseStringFromString(String str) {
1266 if (str == null) {
1267 return null;
1268 }
1269 char[] buf = new char[str.length()];
1270 for (int i = 0; i < str.length(); i++) {
1271 char c = str.charAt(i);
1272 if (c >= 'A' && c <= 'Z') {
1273 c += 0x20;
1274 }
1275 buf[i] = c;
1276 }
1277 return new String(buf);
1278 }
1280 protected void startErrorReporting() throws SAXException {
1282 }
1284 // ]NOCPP]
1286 public void start() throws SAXException {
1287 initializeWithoutStarting();
1288 tokenHandler.startTokenization(this);
1289 // [NOCPP[
1290 startErrorReporting();
1291 // ]NOCPP]
1292 }
/**
 * Runs the tokenizer over the unread part of <code>buffer</code>.
 *
 * The saved state (<code>stateSave</code>, <code>returnStateSave</code>) is
 * loaded, <code>shouldSuspend</code> and <code>lastCR</code> are reset, and
 * <code>stateLoop</code> is run from one position before
 * <code>buffer.getStart()</code> up to <code>buffer.getEnd()</code>.
 * Afterwards the start of the buffer is moved past the characters that
 * were consumed.
 *
 * @param buffer
 *            the buffer to tokenize; its start index is updated
 * @return the value of <code>lastCR</code> after the run. According to the
 *         "CR handling" note in <code>stateLoop</code>, <code>true</code>
 *         tells the IO driver to swallow an immediately following LF.
 * @throws SAXException
 *             propagated from <code>stateLoop</code>
 */
1294 public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
1295 int state = stateSave;
1296 int returnState = returnStateSave;
// No character has been read yet; c is only a placeholder for stateLoop.
1297 char c = '\u0000';
1298 shouldSuspend = false;
1299 lastCR = false;
1301 int start = buffer.getStart();
1302 /**
1303 * The index of the last <code>char</code> read from <code>buf</code>.
1304 */
// Nothing has been read yet, so pos starts one before the first unread
// char; stateLoop pre-increments pos before reading.
1305 int pos = start - 1;
1307 /**
1308 * The index of the first <code>char</code> in <code>buf</code> that is
1309 * part of a coalesced run of character tokens or
1310 * <code>Integer.MAX_VALUE</code> if there is not a current run being
1311 * coalesced.
1312 */
// NOTE(review): the comment above has no declaration attached; it
// describes the cstart field that the switch below initializes.
// Presumably the listed states are the ones in which character tokens are
// being accumulated, so a run starts at the beginning of this buffer.
1313 switch (state) {
1314 case DATA:
1315 case RCDATA:
1316 case SCRIPT_DATA:
1317 case PLAINTEXT:
1318 case RAWTEXT:
1319 case CDATA_SECTION:
1320 case SCRIPT_DATA_ESCAPED:
1321 case SCRIPT_DATA_ESCAPE_START:
1322 case SCRIPT_DATA_ESCAPE_START_DASH:
1323 case SCRIPT_DATA_ESCAPED_DASH:
1324 case SCRIPT_DATA_ESCAPED_DASH_DASH:
1325 case SCRIPT_DATA_DOUBLE_ESCAPE_START:
1326 case SCRIPT_DATA_DOUBLE_ESCAPED:
1327 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
1328 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
1329 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
1330 case SCRIPT_DATA_DOUBLE_ESCAPE_END:
1331 cstart = start;
1332 break;
1333 default:
1334 cstart = Integer.MAX_VALUE;
1335 break;
1336 }
1338 /**
1339 * The number of <code>char</code>s in <code>buf</code> that have
1340 * meaning. (The rest of the array is garbage and should not be
1341 * examined.)
1342 */
// NOTE(review): no variable is declared for the comment above; it
// presumably refers to buffer.getEnd(), which is passed to stateLoop as
// the end position.
1343 // CPPONLY: if (mViewSource) {
1344 // CPPONLY: mViewSource.SetBuffer(buffer);
1345 // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
1346 // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
1347 // CPPONLY: } else {
1348 // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
1349 // CPPONLY: }
1350 // [NOCPP[
// reconsume is false: the loop starts by reading a fresh character.
1351 pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
1352 buffer.getEnd());
1353 // ]NOCPP]
1354 if (pos == buffer.getEnd()) {
1355 // exiting due to end of buffer
1356 buffer.setStart(pos);
1357 } else {
// The loop stopped early and pos is the last char read, so the next call
// resumes at the char after it.
1358 buffer.setStart(pos + 1);
1359 }
1360 return lastCR;
1361 }
1363 @SuppressWarnings("unused") private int stateLoop(int state, char c,
1364 int pos, @NoLength char[] buf, boolean reconsume, int returnState,
1365 int endPos) throws SAXException {
1366 /*
1367 * Idioms used in this code:
1368 *
1369 *
1370 * Consuming the next input character
1371 *
1372 * To consume the next input character, the code does this: if (++pos ==
1373 * endPos) { break stateloop; } c = checkChar(buf, pos);
1374 *
1375 *
1376 * Staying in a state
1377 *
1378 * When there's a state that the tokenizer may stay in over multiple
1379 * input characters, the state has a wrapper |for(;;)| loop and staying
1380 * in the state continues the loop.
1381 *
1382 *
1383 * Switching to another state
1384 *
1385 * To switch to another state, the code sets the state variable to the
1386 * magic number of the new state. Then it either continues stateloop or
1387 * breaks out of the state's own wrapper loop if the target state is
1388 * right after the current state in source order. (This is a partial
1389 * workaround for Java's lack of goto.)
1390 *
1391 *
1392 * Reconsume support
1393 *
1394 * The spec sometimes says that an input character is reconsumed in
1395 * another state. If a state can ever be entered so that an input
1396 * character can be reconsumed in it, the state's code starts with an
1397 * |if (reconsume)| that sets reconsume to false and skips over the
1398 * normal code for consuming a new character.
1399 *
1400 * To reconsume the current character in another state, the code sets
1401 * |reconsume| to true and then switches to the other state.
1402 *
1403 *
1404 * Emitting character tokens
1405 *
1406 * This method emits character tokens lazily. Whenever a new range of
1407 * character tokens starts, the field cstart must be set to the start
1408 * index of the range. The flushChars() method must be called at the end
1409 * of a range to flush it.
1410 *
1411 *
1412 * U+0000 handling
1413 *
1414 * The various states have to handle the replacement of U+0000 with
1415 * U+FFFD. However, if U+0000 would be reconsumed in another state, the
1416 * replacement doesn't need to happen, because it's handled by the
1417 * reconsuming state.
1418 *
1419 *
1420 * LF handling
1421 *
1422 * Every state needs to increment the line number upon LF unless the LF
1423 * gets reconsumed by another state which increments the line number.
1424 *
1425 *
1426 * CR handling
1427 *
1428 * Every state needs to handle CR unless the CR gets reconsumed and is
1429 * handled by the reconsuming state. The CR needs to be handled as if it
1430 * were an LF, the lastCR field must be set to true and then this
1431 * method must return. The IO driver will then swallow the next
1432 * character if it is an LF to coalesce CRLF.
1433 */
1434 stateloop: for (;;) {
1435 switch (state) {
1436 case DATA:
1437 dataloop: for (;;) {
1438 if (reconsume) {
1439 reconsume = false;
1440 } else {
1441 if (++pos == endPos) {
1442 break stateloop;
1443 }
1444 c = checkChar(buf, pos);
1445 }
1446 switch (c) {
1447 case '&':
1448 /*
1449 * U+0026 AMPERSAND (&) Switch to the character
1450 * reference in data state.
1451 */
1452 flushChars(buf, pos);
1453 clearStrBufAndAppend(c);
1454 setAdditionalAndRememberAmpersandLocation('\u0000');
1455 returnState = state;
1456 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
1457 continue stateloop;
1458 case '<':
1459 /*
1460 * U+003C LESS-THAN SIGN (<) Switch to the tag
1461 * open state.
1462 */
1463 flushChars(buf, pos);
1465 state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
1466 break dataloop; // FALL THROUGH continue
1467 // stateloop;
1468 case '\u0000':
1469 emitReplacementCharacter(buf, pos);
1470 continue;
1471 case '\r':
1472 emitCarriageReturn(buf, pos);
1473 break stateloop;
1474 case '\n':
1475 silentLineFeed();
1476 default:
1477 /*
1478 * Anything else Emit the input character as a
1479 * character token.
1480 *
1481 * Stay in the data state.
1482 */
1483 continue;
1484 }
1485 }
1486 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
1487 case TAG_OPEN:
1488 tagopenloop: for (;;) {
1489 /*
1490 * The behavior of this state depends on the content
1491 * model flag.
1492 */
1493 if (++pos == endPos) {
1494 break stateloop;
1495 }
1496 c = checkChar(buf, pos);
1497 /*
1498 * If the content model flag is set to the PCDATA state
1499 * Consume the next input character:
1500 */
1501 if (c >= 'A' && c <= 'Z') {
1502 /*
1503 * U+0041 LATIN CAPITAL LETTER A through to U+005A
1504 * LATIN CAPITAL LETTER Z Create a new start tag
1505 * token,
1506 */
1507 endTag = false;
1508 /*
1509 * set its tag name to the lowercase version of the
1510 * input character (add 0x0020 to the character's
1511 * code point),
1512 */
1513 clearStrBufAndAppend((char) (c + 0x20));
1514 /* then switch to the tag name state. */
1515 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
1516 /*
1517 * (Don't emit the token yet; further details will
1518 * be filled in before it is emitted.)
1519 */
1520 break tagopenloop;
1521 // continue stateloop;
1522 } else if (c >= 'a' && c <= 'z') {
1523 /*
1524 * U+0061 LATIN SMALL LETTER A through to U+007A
1525 * LATIN SMALL LETTER Z Create a new start tag
1526 * token,
1527 */
1528 endTag = false;
1529 /*
1530 * set its tag name to the input character,
1531 */
1532 clearStrBufAndAppend(c);
1533 /* then switch to the tag name state. */
1534 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
1535 /*
1536 * (Don't emit the token yet; further details will
1537 * be filled in before it is emitted.)
1538 */
1539 break tagopenloop;
1540 // continue stateloop;
1541 }
1542 switch (c) {
1543 case '!':
1544 /*
1545 * U+0021 EXCLAMATION MARK (!) Switch to the
1546 * markup declaration open state.
1547 */
1548 state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
1549 continue stateloop;
1550 case '/':
1551 /*
1552 * U+002F SOLIDUS (/) Switch to the close tag
1553 * open state.
1554 */
1555 state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
1556 continue stateloop;
1557 case '?':
1558 // CPPONLY: if (viewingXmlSource) {
1559 // CPPONLY: state = transition(state,
1560 // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION,
1561 // CPPONLY: reconsume,
1562 // CPPONLY: pos);
1563 // CPPONLY: continue stateloop;
1564 // CPPONLY: }
1565 /*
1566 * U+003F QUESTION MARK (?) Parse error.
1567 */
1568 errProcessingInstruction();
1569 /*
1570 * Switch to the bogus comment state.
1571 */
1572 clearLongStrBufAndAppend(c);
1573 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
1574 continue stateloop;
1575 case '>':
1576 /*
1577 * U+003E GREATER-THAN SIGN (>) Parse error.
1578 */
1579 errLtGt();
1580 /*
1581 * Emit a U+003C LESS-THAN SIGN character token
1582 * and a U+003E GREATER-THAN SIGN character
1583 * token.
1584 */
1585 tokenHandler.characters(Tokenizer.LT_GT, 0, 2);
1586 /* Switch to the data state. */
1587 cstart = pos + 1;
1588 state = transition(state, Tokenizer.DATA, reconsume, pos);
1589 continue stateloop;
1590 default:
1591 /*
1592 * Anything else Parse error.
1593 */
1594 errBadCharAfterLt(c);
1595 /*
1596 * Emit a U+003C LESS-THAN SIGN character token
1597 */
1598 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
1599 /*
1600 * and reconsume the current input character in
1601 * the data state.
1602 */
1603 cstart = pos;
1604 reconsume = true;
1605 state = transition(state, Tokenizer.DATA, reconsume, pos);
1606 continue stateloop;
1607 }
1608 }
1609 // FALL THROUGH DON'T REORDER
1610 case TAG_NAME:
1611 tagnameloop: for (;;) {
1612 if (++pos == endPos) {
1613 break stateloop;
1614 }
1615 c = checkChar(buf, pos);
1616 /*
1617 * Consume the next input character:
1618 */
1619 switch (c) {
1620 case '\r':
1621 silentCarriageReturn();
1622 strBufToElementNameString();
1623 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
1624 break stateloop;
1625 case '\n':
1626 silentLineFeed();
1627 case ' ':
1628 case '\t':
1629 case '\u000C':
1630 /*
1631 * U+0009 CHARACTER TABULATION U+000A LINE FEED
1632 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
1633 * Switch to the before attribute name state.
1634 */
1635 strBufToElementNameString();
1636 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
1637 break tagnameloop;
1638 // continue stateloop;
1639 case '/':
1640 /*
1641 * U+002F SOLIDUS (/) Switch to the self-closing
1642 * start tag state.
1643 */
1644 strBufToElementNameString();
1645 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1646 continue stateloop;
1647 case '>':
1648 /*
1649 * U+003E GREATER-THAN SIGN (>) Emit the current
1650 * tag token.
1651 */
1652 strBufToElementNameString();
1653 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1654 if (shouldSuspend) {
1655 break stateloop;
1656 }
1657 /*
1658 * Switch to the data state.
1659 */
1660 continue stateloop;
1661 case '\u0000':
1662 c = '\uFFFD';
1663 // fall thru
1664 default:
1665 if (c >= 'A' && c <= 'Z') {
1666 /*
1667 * U+0041 LATIN CAPITAL LETTER A through to
1668 * U+005A LATIN CAPITAL LETTER Z Append the
1669 * lowercase version of the current input
1670 * character (add 0x0020 to the character's
1671 * code point) to the current tag token's
1672 * tag name.
1673 */
1674 c += 0x20;
1675 }
1676 /*
1677 * Anything else Append the current input
1678 * character to the current tag token's tag
1679 * name.
1680 */
1681 appendStrBuf(c);
1682 /*
1683 * Stay in the tag name state.
1684 */
1685 continue;
1686 }
1687 }
1688 // FALLTHRU DON'T REORDER
1689 case BEFORE_ATTRIBUTE_NAME:
1690 beforeattributenameloop: for (;;) {
1691 if (reconsume) {
1692 reconsume = false;
1693 } else {
1694 if (++pos == endPos) {
1695 break stateloop;
1696 }
1697 c = checkChar(buf, pos);
1698 }
1699 /*
1700 * Consume the next input character:
1701 */
1702 switch (c) {
1703 case '\r':
1704 silentCarriageReturn();
1705 break stateloop;
1706 case '\n':
1707 silentLineFeed();
1708 // fall thru
1709 case ' ':
1710 case '\t':
1711 case '\u000C':
1712 /*
1713 * U+0009 CHARACTER TABULATION U+000A LINE FEED
1714 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
1715 * in the before attribute name state.
1716 */
1717 continue;
1718 case '/':
1719 /*
1720 * U+002F SOLIDUS (/) Switch to the self-closing
1721 * start tag state.
1722 */
1723 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1724 continue stateloop;
1725 case '>':
1726 /*
1727 * U+003E GREATER-THAN SIGN (>) Emit the current
1728 * tag token.
1729 */
1730 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1731 if (shouldSuspend) {
1732 break stateloop;
1733 }
1734 /*
1735 * Switch to the data state.
1736 */
1737 continue stateloop;
1738 case '\u0000':
1739 c = '\uFFFD';
1740 // fall thru
1741 case '\"':
1742 case '\'':
1743 case '<':
1744 case '=':
1745 /*
1746 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
1747 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
1748 * SIGN (=) Parse error.
1749 */
1750 errBadCharBeforeAttributeNameOrNull(c);
1751 /*
1752 * Treat it as per the "anything else" entry
1753 * below.
1754 */
1755 default:
1756 /*
1757 * Anything else Start a new attribute in the
1758 * current tag token.
1759 */
1760 if (c >= 'A' && c <= 'Z') {
1761 /*
1762 * U+0041 LATIN CAPITAL LETTER A through to
1763 * U+005A LATIN CAPITAL LETTER Z Set that
1764 * attribute's name to the lowercase version
1765 * of the current input character (add
1766 * 0x0020 to the character's code point)
1767 */
1768 c += 0x20;
1769 }
1770 /*
1771 * Set that attribute's name to the current
1772 * input character,
1773 */
1774 clearStrBufAndAppend(c);
1775 /*
1776 * and its value to the empty string.
1777 */
1778 // Will do later.
1779 /*
1780 * Switch to the attribute name state.
1781 */
1782 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
1783 break beforeattributenameloop;
1784 // continue stateloop;
1785 }
1786 }
1787 // FALLTHRU DON'T REORDER
1788 case ATTRIBUTE_NAME:
1789 attributenameloop: for (;;) {
1790 if (++pos == endPos) {
1791 break stateloop;
1792 }
1793 c = checkChar(buf, pos);
1794 /*
1795 * Consume the next input character:
1796 */
1797 switch (c) {
1798 case '\r':
1799 silentCarriageReturn();
1800 attributeNameComplete();
1801 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
1802 break stateloop;
1803 case '\n':
1804 silentLineFeed();
1805 // fall thru
1806 case ' ':
1807 case '\t':
1808 case '\u000C':
1809 /*
1810 * U+0009 CHARACTER TABULATION U+000A LINE FEED
1811 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
1812 * Switch to the after attribute name state.
1813 */
1814 attributeNameComplete();
1815 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
1816 continue stateloop;
1817 case '/':
1818 /*
1819 * U+002F SOLIDUS (/) Switch to the self-closing
1820 * start tag state.
1821 */
1822 attributeNameComplete();
1823 addAttributeWithoutValue();
1824 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1825 continue stateloop;
1826 case '=':
1827 /*
1828 * U+003D EQUALS SIGN (=) Switch to the before
1829 * attribute value state.
1830 */
1831 attributeNameComplete();
1832 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
1833 break attributenameloop;
1834 // continue stateloop;
1835 case '>':
1836 /*
1837 * U+003E GREATER-THAN SIGN (>) Emit the current
1838 * tag token.
1839 */
1840 attributeNameComplete();
1841 addAttributeWithoutValue();
1842 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1843 if (shouldSuspend) {
1844 break stateloop;
1845 }
1846 /*
1847 * Switch to the data state.
1848 */
1849 continue stateloop;
1850 case '\u0000':
1851 c = '\uFFFD';
1852 // fall thru
1853 case '\"':
1854 case '\'':
1855 case '<':
1856 /*
1857 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
1858 * (') U+003C LESS-THAN SIGN (<) Parse error.
1859 */
1860 errQuoteOrLtInAttributeNameOrNull(c);
1861 /*
1862 * Treat it as per the "anything else" entry
1863 * below.
1864 */
1865 default:
1866 if (c >= 'A' && c <= 'Z') {
1867 /*
1868 * U+0041 LATIN CAPITAL LETTER A through to
1869 * U+005A LATIN CAPITAL LETTER Z Append the
1870 * lowercase version of the current input
1871 * character (add 0x0020 to the character's
1872 * code point) to the current attribute's
1873 * name.
1874 */
1875 c += 0x20;
1876 }
1877 /*
1878 * Anything else Append the current input
1879 * character to the current attribute's name.
1880 */
1881 appendStrBuf(c);
1882 /*
1883 * Stay in the attribute name state.
1884 */
1885 continue;
1886 }
1887 }
1888 // FALLTHRU DON'T REORDER
1889 case BEFORE_ATTRIBUTE_VALUE:
1890 beforeattributevalueloop: for (;;) {
1891 if (++pos == endPos) {
1892 break stateloop;
1893 }
1894 c = checkChar(buf, pos);
1895 /*
1896 * Consume the next input character:
1897 */
1898 switch (c) {
1899 case '\r':
1900 silentCarriageReturn();
1901 break stateloop;
1902 case '\n':
1903 silentLineFeed();
1904 // fall thru
1905 case ' ':
1906 case '\t':
1907 case '\u000C':
1908 /*
1909 * U+0009 CHARACTER TABULATION U+000A LINE FEED
1910 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
1911 * in the before attribute value state.
1912 */
1913 continue;
1914 case '"':
1915 /*
1916 * U+0022 QUOTATION MARK (") Switch to the
1917 * attribute value (double-quoted) state.
1918 */
1919 clearLongStrBuf();
1920 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
1921 break beforeattributevalueloop;
1922 // continue stateloop;
1923 case '&':
1924 /*
1925 * U+0026 AMPERSAND (&) Switch to the attribute
1926 * value (unquoted) state and reconsume this
1927 * input character.
1928 */
1929 clearLongStrBuf();
1930 reconsume = true;
1931 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
1932 noteUnquotedAttributeValue();
1933 continue stateloop;
1934 case '\'':
1935 /*
1936 * U+0027 APOSTROPHE (') Switch to the attribute
1937 * value (single-quoted) state.
1938 */
1939 clearLongStrBuf();
1940 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
1941 continue stateloop;
1942 case '>':
1943 /*
1944 * U+003E GREATER-THAN SIGN (>) Parse error.
1945 */
1946 errAttributeValueMissing();
1947 /*
1948 * Emit the current tag token.
1949 */
1950 addAttributeWithoutValue();
1951 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1952 if (shouldSuspend) {
1953 break stateloop;
1954 }
1955 /*
1956 * Switch to the data state.
1957 */
1958 continue stateloop;
1959 case '\u0000':
1960 c = '\uFFFD';
1961 // fall thru
1962 case '<':
1963 case '=':
1964 case '`':
1965 /*
1966 * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
1967 * (=) U+0060 GRAVE ACCENT (`)
1968 */
1969 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
1970 /*
1971 * Treat it as per the "anything else" entry
1972 * below.
1973 */
1974 default:
1975 // [NOCPP[
1976 errHtml4NonNameInUnquotedAttribute(c);
1977 // ]NOCPP]
1978 /*
1979 * Anything else Append the current input
1980 * character to the current attribute's value.
1981 */
1982 clearLongStrBufAndAppend(c);
1983 /*
1984 * Switch to the attribute value (unquoted)
1985 * state.
1986 */
1988 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
1989 noteUnquotedAttributeValue();
1990 continue stateloop;
1991 }
1992 }
1993 // FALLTHRU DON'T REORDER
1994 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
1995 attributevaluedoublequotedloop: for (;;) {
1996 if (reconsume) {
1997 reconsume = false;
1998 } else {
1999 if (++pos == endPos) {
2000 break stateloop;
2001 }
2002 c = checkChar(buf, pos);
2003 }
2004 /*
2005 * Consume the next input character:
2006 */
2007 switch (c) {
2008 case '"':
2009 /*
2010 * U+0022 QUOTATION MARK (") Switch to the after
2011 * attribute value (quoted) state.
2012 */
2013 addAttributeWithValue();
2015 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
2016 break attributevaluedoublequotedloop;
2017 // continue stateloop;
2018 case '&':
2019 /*
2020 * U+0026 AMPERSAND (&) Switch to the character
2021 * reference in attribute value state, with the
2022 * additional allowed character being U+0022
2023 * QUOTATION MARK (").
2024 */
2025 clearStrBufAndAppend(c);
2026 setAdditionalAndRememberAmpersandLocation('\"');
2027 returnState = state;
2028 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2029 continue stateloop;
2030 case '\r':
2031 appendLongStrBufCarriageReturn();
2032 break stateloop;
2033 case '\n':
2034 appendLongStrBufLineFeed();
2035 continue;
2036 case '\u0000':
2037 c = '\uFFFD';
2038 // fall thru
2039 default:
2040 /*
2041 * Anything else Append the current input
2042 * character to the current attribute's value.
2043 */
2044 appendLongStrBuf(c);
2045 /*
2046 * Stay in the attribute value (double-quoted)
2047 * state.
2048 */
2049 continue;
2050 }
2051 }
2052 // FALLTHRU DON'T REORDER
2053 case AFTER_ATTRIBUTE_VALUE_QUOTED:
2054 afterattributevaluequotedloop: for (;;) {
2055 if (++pos == endPos) {
2056 break stateloop;
2057 }
2058 c = checkChar(buf, pos);
2059 /*
2060 * Consume the next input character:
2061 */
2062 switch (c) {
2063 case '\r':
2064 silentCarriageReturn();
2065 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2066 break stateloop;
2067 case '\n':
2068 silentLineFeed();
2069 // fall thru
2070 case ' ':
2071 case '\t':
2072 case '\u000C':
2073 /*
2074 * U+0009 CHARACTER TABULATION U+000A LINE FEED
2075 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
2076 * Switch to the before attribute name state.
2077 */
2078 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2079 continue stateloop;
2080 case '/':
2081 /*
2082 * U+002F SOLIDUS (/) Switch to the self-closing
2083 * start tag state.
2084 */
2085 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
2086 break afterattributevaluequotedloop;
2087 // continue stateloop;
2088 case '>':
2089 /*
2090 * U+003E GREATER-THAN SIGN (>) Emit the current
2091 * tag token.
2092 */
2093 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2094 if (shouldSuspend) {
2095 break stateloop;
2096 }
2097 /*
2098 * Switch to the data state.
2099 */
2100 continue stateloop;
2101 default:
2102 /*
2103 * Anything else Parse error.
2104 */
2105 errNoSpaceBetweenAttributes();
2106 /*
2107 * Reconsume the character in the before
2108 * attribute name state.
2109 */
2110 reconsume = true;
2111 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2112 continue stateloop;
2113 }
2114 }
2115 // FALLTHRU DON'T REORDER
2116 case SELF_CLOSING_START_TAG:
2117 if (++pos == endPos) {
2118 break stateloop;
2119 }
2120 c = checkChar(buf, pos);
2121 /*
2122 * Consume the next input character:
2123 */
2124 switch (c) {
2125 case '>':
2126 /*
2127 * U+003E GREATER-THAN SIGN (>) Set the self-closing
2128 * flag of the current tag token. Emit the current
2129 * tag token.
2130 */
2131 // [NOCPP[
2132 errHtml4XmlVoidSyntax();
2133 // ]NOCPP]
2134 state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);
2135 if (shouldSuspend) {
2136 break stateloop;
2137 }
2138 /*
2139 * Switch to the data state.
2140 */
2141 continue stateloop;
2142 default:
2143 /* Anything else Parse error. */
2144 errSlashNotFollowedByGt();
2145 /*
2146 * Reconsume the character in the before attribute
2147 * name state.
2148 */
2149 reconsume = true;
2150 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2151 continue stateloop;
2152 }
2153 // XXX reorder point
2154 case ATTRIBUTE_VALUE_UNQUOTED:
2155 for (;;) {
2156 if (reconsume) {
2157 reconsume = false;
2158 } else {
2159 if (++pos == endPos) {
2160 break stateloop;
2161 }
2162 c = checkChar(buf, pos);
2163 }
2164 /*
2165 * Consume the next input character:
2166 */
2167 switch (c) {
2168 case '\r':
2169 silentCarriageReturn();
2170 addAttributeWithValue();
2171 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2172 break stateloop;
2173 case '\n':
2174 silentLineFeed();
2175 // fall thru
2176 case ' ':
2177 case '\t':
2178 case '\u000C':
2179 /*
2180 * U+0009 CHARACTER TABULATION U+000A LINE FEED
2181 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
2182 * Switch to the before attribute name state.
2183 */
2184 addAttributeWithValue();
2185 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2186 continue stateloop;
2187 case '&':
2188 /*
2189 * U+0026 AMPERSAND (&) Switch to the character
2190 * reference in attribute value state, with the
2191 * additional allowed character being U+003E
2192 * GREATER-THAN SIGN (>)
2193 */
2194 clearStrBufAndAppend(c);
2195 setAdditionalAndRememberAmpersandLocation('>');
2196 returnState = state;
2197 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2198 continue stateloop;
2199 case '>':
2200 /*
2201 * U+003E GREATER-THAN SIGN (>) Emit the current
2202 * tag token.
2203 */
2204 addAttributeWithValue();
2205 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2206 if (shouldSuspend) {
2207 break stateloop;
2208 }
2209 /*
2210 * Switch to the data state.
2211 */
2212 continue stateloop;
2213 case '\u0000':
2214 c = '\uFFFD';
2215 // fall thru
2216 case '<':
2217 case '\"':
2218 case '\'':
2219 case '=':
2220 case '`':
2221 /*
2222 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
2223 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
2224 * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
2225 */
2226 errUnquotedAttributeValOrNull(c);
2227 /*
2228 * Treat it as per the "anything else" entry
2229 * below.
2230 */
2231 // fall through
2232 default:
2233 // [NOCPP]
2234 errHtml4NonNameInUnquotedAttribute(c);
2235 // ]NOCPP]
2236 /*
2237 * Anything else Append the current input
2238 * character to the current attribute's value.
2239 */
2240 appendLongStrBuf(c);
2241 /*
2242 * Stay in the attribute value (unquoted) state.
2243 */
2244 continue;
2245 }
2246 }
2247 // XXX reorder point
2248 case AFTER_ATTRIBUTE_NAME:
2249 for (;;) {
2250 if (++pos == endPos) {
2251 break stateloop;
2252 }
2253 c = checkChar(buf, pos);
2254 /*
2255 * Consume the next input character:
2256 */
2257 switch (c) {
2258 case '\r':
2259 silentCarriageReturn();
2260 break stateloop;
2261 case '\n':
2262 silentLineFeed();
2263 // fall thru
2264 case ' ':
2265 case '\t':
2266 case '\u000C':
2267 /*
2268 * U+0009 CHARACTER TABULATION U+000A LINE FEED
2269 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
2270 * in the after attribute name state.
2271 */
2272 continue;
2273 case '/':
2274 /*
2275 * U+002F SOLIDUS (/) Switch to the self-closing
2276 * start tag state.
2277 */
2278 addAttributeWithoutValue();
2279 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
2280 continue stateloop;
2281 case '=':
2282 /*
2283 * U+003D EQUALS SIGN (=) Switch to the before
2284 * attribute value state.
2285 */
2286 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
2287 continue stateloop;
2288 case '>':
2289 /*
2290 * U+003E GREATER-THAN SIGN (>) Emit the current
2291 * tag token.
2292 */
2293 addAttributeWithoutValue();
2294 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2295 if (shouldSuspend) {
2296 break stateloop;
2297 }
2298 /*
2299 * Switch to the data state.
2300 */
2301 continue stateloop;
2302 case '\u0000':
2303 c = '\uFFFD';
2304 // fall thru
2305 case '\"':
2306 case '\'':
2307 case '<':
2308 errQuoteOrLtInAttributeNameOrNull(c);
2309 /*
2310 * Treat it as per the "anything else" entry
2311 * below.
2312 */
2313 default:
2314 addAttributeWithoutValue();
2315 /*
2316 * Anything else Start a new attribute in the
2317 * current tag token.
2318 */
2319 if (c >= 'A' && c <= 'Z') {
2320 /*
2321 * U+0041 LATIN CAPITAL LETTER A through to
2322 * U+005A LATIN CAPITAL LETTER Z Set that
2323 * attribute's name to the lowercase version
2324 * of the current input character (add
2325 * 0x0020 to the character's code point)
2326 */
2327 c += 0x20;
2328 }
2329 /*
2330 * Set that attribute's name to the current
2331 * input character,
2332 */
2333 clearStrBufAndAppend(c);
2334 /*
2335 * and its value to the empty string.
2336 */
2337 // Will do later.
2338 /*
2339 * Switch to the attribute name state.
2340 */
2341 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
2342 continue stateloop;
2343 }
2344 }
2345 // XXX reorder point
2346 case MARKUP_DECLARATION_OPEN:
2347 markupdeclarationopenloop: for (;;) {
2348 if (++pos == endPos) {
2349 break stateloop;
2350 }
2351 c = checkChar(buf, pos);
2352 /*
2353 * If the next two characters are both U+002D
2354 * HYPHEN-MINUS characters (-), consume those two
2355 * characters, create a comment token whose data is the
2356 * empty string, and switch to the comment start state.
2357 *
2358 * Otherwise, if the next seven characters are an ASCII
2359 * case-insensitive match for the word "DOCTYPE", then
2360 * consume those characters and switch to the DOCTYPE
2361 * state.
2362 *
2363 * Otherwise, if the insertion mode is
2364 * "in foreign content" and the current node is not an
2365 * element in the HTML namespace and the next seven
2366 * characters are a case-sensitive match for the string
2367 * "[CDATA[" (the five uppercase letters "CDATA" with a
2368 * U+005B LEFT SQUARE BRACKET character before and
2369 * after), then consume those characters and switch to
2370 * the CDATA section state.
2371 *
2372 * Otherwise, this is a parse error. Switch to the bogus
2373 * comment state. The next character that is consumed,
2374 * if any, is the first character that will be in the
2375 * comment.
2376 */
2377 switch (c) {
2378 case '-':
2379 clearLongStrBufAndAppend(c);
2380 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
2381 break markupdeclarationopenloop;
2382 // continue stateloop;
2383 case 'd':
2384 case 'D':
2385 clearLongStrBufAndAppend(c);
2386 index = 0;
2387 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
2388 continue stateloop;
2389 case '[':
2390 if (tokenHandler.cdataSectionAllowed()) {
2391 clearLongStrBufAndAppend(c);
2392 index = 0;
2393 state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
2394 continue stateloop;
2395 }
2396 // else fall through
2397 default:
2398 errBogusComment();
2399 clearLongStrBuf();
2400 reconsume = true;
2401 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2402 continue stateloop;
2403 }
2404 }
2405 // FALLTHRU DON'T REORDER
2406 case MARKUP_DECLARATION_HYPHEN:
2407 markupdeclarationhyphenloop: for (;;) {
2408 if (++pos == endPos) {
2409 break stateloop;
2410 }
2411 c = checkChar(buf, pos);
2412 switch (c) {
2413 case '\u0000':
2414 break stateloop;
2415 case '-':
2416 clearLongStrBuf();
2417 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
2418 break markupdeclarationhyphenloop;
2419 // continue stateloop;
2420 default:
2421 errBogusComment();
2422 reconsume = true;
2423 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2424 continue stateloop;
2425 }
2426 }
2427 // FALLTHRU DON'T REORDER
2428 case COMMENT_START:
2429 commentstartloop: for (;;) {
2430 if (++pos == endPos) {
2431 break stateloop;
2432 }
2433 c = checkChar(buf, pos);
2434 /*
2435 * Comment start state
2436 *
2437 *
2438 * Consume the next input character:
2439 */
2440 switch (c) {
2441 case '-':
2442 /*
2443 * U+002D HYPHEN-MINUS (-) Switch to the comment
2444 * start dash state.
2445 */
2446 appendLongStrBuf(c);
2447 state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);
2448 continue stateloop;
2449 case '>':
2450 /*
2451 * U+003E GREATER-THAN SIGN (>) Parse error.
2452 */
2453 errPrematureEndOfComment();
2454 /* Emit the comment token. */
2455 emitComment(0, pos);
2456 /*
2457 * Switch to the data state.
2458 */
2459 state = transition(state, Tokenizer.DATA, reconsume, pos);
2460 continue stateloop;
2461 case '\r':
2462 appendLongStrBufCarriageReturn();
2463 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2464 break stateloop;
2465 case '\n':
2466 appendLongStrBufLineFeed();
2467 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2468 break commentstartloop;
2469 case '\u0000':
2470 c = '\uFFFD';
2471 // fall thru
2472 default:
2473 /*
2474 * Anything else Append the input character to
2475 * the comment token's data.
2476 */
2477 appendLongStrBuf(c);
2478 /*
2479 * Switch to the comment state.
2480 */
2481 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2482 break commentstartloop;
2483 // continue stateloop;
2484 }
2485 }
2486 // FALLTHRU DON'T REORDER
2487 case COMMENT:
2488 commentloop: for (;;) {
2489 if (++pos == endPos) {
2490 break stateloop;
2491 }
2492 c = checkChar(buf, pos);
2493 /*
2494 * Comment state Consume the next input character:
2495 */
2496 switch (c) {
2497 case '-':
2498 /*
2499 * U+002D HYPHEN-MINUS (-) Switch to the comment
2500 * end dash state
2501 */
2502 appendLongStrBuf(c);
2503 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2504 break commentloop;
2505 // continue stateloop;
2506 case '\r':
2507 appendLongStrBufCarriageReturn();
2508 break stateloop;
2509 case '\n':
2510 appendLongStrBufLineFeed();
2511 continue;
2512 case '\u0000':
2513 c = '\uFFFD';
2514 // fall thru
2515 default:
2516 /*
2517 * Anything else Append the input character to
2518 * the comment token's data.
2519 */
2520 appendLongStrBuf(c);
2521 /*
2522 * Stay in the comment state.
2523 */
2524 continue;
2525 }
2526 }
2527 // FALLTHRU DON'T REORDER
2528 case COMMENT_END_DASH:
2529 commentenddashloop: for (;;) {
2530 if (++pos == endPos) {
2531 break stateloop;
2532 }
2533 c = checkChar(buf, pos);
2534 /*
2535 * Comment end dash state Consume the next input
2536 * character:
2537 */
2538 switch (c) {
2539 case '-':
2540 /*
2541 * U+002D HYPHEN-MINUS (-) Switch to the comment
2542 * end state
2543 */
2544 appendLongStrBuf(c);
2545 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2546 break commentenddashloop;
2547 // continue stateloop;
2548 case '\r':
2549 appendLongStrBufCarriageReturn();
2550 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2551 break stateloop;
2552 case '\n':
2553 appendLongStrBufLineFeed();
2554 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2555 continue stateloop;
2556 case '\u0000':
2557 c = '\uFFFD';
2558 // fall thru
2559 default:
2560 /*
2561 * Anything else Append a U+002D HYPHEN-MINUS
2562 * (-) character and the input character to the
2563 * comment token's data.
2564 */
2565 appendLongStrBuf(c);
2566 /*
2567 * Switch to the comment state.
2568 */
2569 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2570 continue stateloop;
2571 }
2572 }
2573 // FALLTHRU DON'T REORDER
2574 case COMMENT_END:
2575 commentendloop: for (;;) {
2576 if (++pos == endPos) {
2577 break stateloop;
2578 }
2579 c = checkChar(buf, pos);
2580 /*
2581 * Comment end state Consume the next input
2582 * character:
2583 */
2584 switch (c) {
2585 case '>':
2586 /*
2587 * U+003E GREATER-THAN SIGN (>) Emit the comment
2588 * token.
2589 */
2590 emitComment(2, pos);
2591 /*
2592 * Switch to the data state.
2593 */
2594 state = transition(state, Tokenizer.DATA, reconsume, pos);
2595 continue stateloop;
2596 case '-':
2597 /* U+002D HYPHEN-MINUS (-) Parse error. */
2598 /*
2599 * Append a U+002D HYPHEN-MINUS (-) character to
2600 * the comment token's data.
2601 */
2602 adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
2603 /*
2604 * Stay in the comment end state.
2605 */
2606 continue;
2607 case '\r':
2608 adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn();
2609 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2610 break stateloop;
2611 case '\n':
2612 adjustDoubleHyphenAndAppendToLongStrBufLineFeed();
2613 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2614 continue stateloop;
2615 case '!':
2616 errHyphenHyphenBang();
2617 appendLongStrBuf(c);
2618 state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
2619 continue stateloop;
2620 case '\u0000':
2621 c = '\uFFFD';
2622 // fall thru
2623 default:
2624 /*
2625 * Append two U+002D HYPHEN-MINUS (-) characters
2626 * and the input character to the comment
2627 * token's data.
2628 */
2629 adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
2630 /*
2631 * Switch to the comment state.
2632 */
2633 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2634 continue stateloop;
2635 }
2636 }
2637 // XXX reorder point
2638 case COMMENT_END_BANG:
2639 for (;;) {
2640 if (++pos == endPos) {
2641 break stateloop;
2642 }
2643 c = checkChar(buf, pos);
2644 /*
2645 * Comment end bang state
2646 *
2647 * Consume the next input character:
2648 */
2649 switch (c) {
2650 case '>':
2651 /*
2652 * U+003E GREATER-THAN SIGN (>) Emit the comment
2653 * token.
2654 */
2655 emitComment(3, pos);
2656 /*
2657 * Switch to the data state.
2658 */
2659 state = transition(state, Tokenizer.DATA, reconsume, pos);
2660 continue stateloop;
2661 case '-':
2662 /*
2663 * Append two U+002D HYPHEN-MINUS (-) characters
2664 * and a U+0021 EXCLAMATION MARK (!) character
2665 * to the comment token's data.
2666 */
2667 appendLongStrBuf(c);
2668 /*
2669 * Switch to the comment end dash state.
2670 */
2671 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2672 continue stateloop;
2673 case '\r':
2674 appendLongStrBufCarriageReturn();
2675 break stateloop;
2676 case '\n':
2677 appendLongStrBufLineFeed();
2678 continue;
2679 case '\u0000':
2680 c = '\uFFFD';
2681 // fall thru
2682 default:
2683 /*
2684 * Anything else Append two U+002D HYPHEN-MINUS
2685 * (-) characters, a U+0021 EXCLAMATION MARK (!)
2686 * character, and the input character to the
2687 * comment token's data. Switch to the comment
2688 * state.
2689 */
2690 appendLongStrBuf(c);
2691 /*
2692 * Switch to the comment state.
2693 */
2694 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2695 continue stateloop;
2696 }
2697 }
2698 // XXX reorder point
2699 case COMMENT_START_DASH:
2700 if (++pos == endPos) {
2701 break stateloop;
2702 }
2703 c = checkChar(buf, pos);
2704 /*
2705 * Comment start dash state
2706 *
2707 * Consume the next input character:
2708 */
2709 switch (c) {
2710 case '-':
2711 /*
2712 * U+002D HYPHEN-MINUS (-) Switch to the comment end
2713 * state
2714 */
2715 appendLongStrBuf(c);
2716 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2717 continue stateloop;
2718 case '>':
2719 errPrematureEndOfComment();
2720 /* Emit the comment token. */
2721 emitComment(1, pos);
2722 /*
2723 * Switch to the data state.
2724 */
2725 state = transition(state, Tokenizer.DATA, reconsume, pos);
2726 continue stateloop;
2727 case '\r':
2728 appendLongStrBufCarriageReturn();
2729 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2730 break stateloop;
2731 case '\n':
2732 appendLongStrBufLineFeed();
2733 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2734 continue stateloop;
2735 case '\u0000':
2736 c = '\uFFFD';
2737 // fall thru
2738 default:
2739 /*
2740 * Append a U+002D HYPHEN-MINUS character (-) and
2741 * the current input character to the comment
2742 * token's data.
2743 */
2744 appendLongStrBuf(c);
2745 /*
2746 * Switch to the comment state.
2747 */
2748 state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2749 continue stateloop;
2750 }
2751 // XXX reorder point
2752 case CDATA_START:
2753 for (;;) {
2754 if (++pos == endPos) {
2755 break stateloop;
2756 }
2757 c = checkChar(buf, pos);
2758 if (index < 6) { // CDATA_LSQB.length
2759 if (c == Tokenizer.CDATA_LSQB[index]) {
2760 appendLongStrBuf(c);
2761 } else {
2762 errBogusComment();
2763 reconsume = true;
2764 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2765 continue stateloop;
2766 }
2767 index++;
2768 continue;
2769 } else {
2770 cstart = pos; // start coalescing
2771 reconsume = true;
2772 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2773 break; // FALL THROUGH continue stateloop;
2774 }
2775 }
2776 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2777 case CDATA_SECTION:
2778 cdatasectionloop: for (;;) {
2779 if (reconsume) {
2780 reconsume = false;
2781 } else {
2782 if (++pos == endPos) {
2783 break stateloop;
2784 }
2785 c = checkChar(buf, pos);
2786 }
2787 switch (c) {
2788 case ']':
2789 flushChars(buf, pos);
2790 state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
2791 break cdatasectionloop; // FALL THROUGH
2792 case '\u0000':
2793 emitReplacementCharacter(buf, pos);
2794 continue;
2795 case '\r':
2796 emitCarriageReturn(buf, pos);
2797 break stateloop;
2798 case '\n':
2799 silentLineFeed();
2800 // fall thru
2801 default:
2802 continue;
2803 }
2804 }
2805 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2806 case CDATA_RSQB:
2807 cdatarsqb: for (;;) {
2808 if (++pos == endPos) {
2809 break stateloop;
2810 }
2811 c = checkChar(buf, pos);
2812 switch (c) {
2813 case ']':
2814 state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);
2815 break cdatarsqb;
2816 default:
2817 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0,
2818 1);
2819 cstart = pos;
2820 reconsume = true;
2821 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2822 continue stateloop;
2823 }
2824 }
2825 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2826 case CDATA_RSQB_RSQB:
2827 cdatarsqbrsqb: for (;;) {
2828 if (++pos == endPos) {
2829 break stateloop;
2830 }
2831 c = checkChar(buf, pos);
2832 switch (c) {
2833 case ']':
2834 // Saw a third ]. Emit one ] (logically the
2835 // first one) and stay in this state to
2836 // remember that the last two characters seen
2837 // have been ]].
2838 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
2839 continue;
2840 case '>':
2841 cstart = pos + 1;
2842 state = transition(state, Tokenizer.DATA, reconsume, pos);
2843 continue stateloop;
2844 default:
2845 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
2846 cstart = pos;
2847 reconsume = true;
2848 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2849 continue stateloop;
2850 }
2851 }
2852 // XXX reorder point
2853 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
2854 attributevaluesinglequotedloop: for (;;) {
2855 if (reconsume) {
2856 reconsume = false;
2857 } else {
2858 if (++pos == endPos) {
2859 break stateloop;
2860 }
2861 c = checkChar(buf, pos);
2862 }
2863 /*
2864 * Consume the next input character:
2865 */
2866 switch (c) {
2867 case '\'':
2868 /*
2869 * U+0027 APOSTROPHE (') Switch to the after
2870 * attribute value (quoted) state.
2871 */
2872 addAttributeWithValue();
2874 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
2875 continue stateloop;
2876 case '&':
2877 /*
2878 * U+0026 AMPERSAND (&) Switch to the character
2879 * reference in attribute value state, with the
2880 * additional allowed character being U+0027
2881 * APOSTROPHE (').
2882 */
2883 clearStrBufAndAppend(c);
2884 setAdditionalAndRememberAmpersandLocation('\'');
2885 returnState = state;
2886 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2887 break attributevaluesinglequotedloop;
2888 // continue stateloop;
2889 case '\r':
2890 appendLongStrBufCarriageReturn();
2891 break stateloop;
2892 case '\n':
2893 appendLongStrBufLineFeed();
2894 continue;
2895 case '\u0000':
2896 c = '\uFFFD';
2897 // fall thru
2898 default:
2899 /*
2900 * Anything else Append the current input
2901 * character to the current attribute's value.
2902 */
2903 appendLongStrBuf(c);
2904 /*
2905 * Stay in the attribute value (single-quoted)
2906 * state.
2907 */
2908 continue;
2909 }
2910 }
2911 // FALLTHRU DON'T REORDER
2912 case CONSUME_CHARACTER_REFERENCE:
2913 if (++pos == endPos) {
2914 break stateloop;
2915 }
2916 c = checkChar(buf, pos);
2917 if (c == '\u0000') {
2918 break stateloop;
2919 }
2920 /*
2921 * Unlike the definition in the spec, this state does not
2922 * return a value and never requires the caller to
2923 * backtrack. This state takes care of emitting characters
2924 * or appending to the current attribute value. It also
2925 * takes care of that in the case when consuming the
2926 * character reference fails.
2927 */
2928 /*
2929 * This section defines how to consume a character
2930 * reference. This definition is used when parsing character
2931 * references in text and in attributes.
2932 *
2933 * The behavior depends on the identity of the next
2934 * character (the one immediately after the U+0026 AMPERSAND
2935 * character):
2936 */
2937 switch (c) {
2938 case ' ':
2939 case '\t':
2940 case '\n':
2941 case '\r': // we'll reconsume!
2942 case '\u000C':
2943 case '<':
2944 case '&':
2945 emitOrAppendStrBuf(returnState);
2946 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
2947 cstart = pos;
2948 }
2949 reconsume = true;
2950 state = transition(state, returnState, reconsume, pos);
2951 continue stateloop;
2952 case '#':
2953 /*
2954 * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
2955 * SIGN.
2956 */
2957 appendStrBuf('#');
2958 state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);
2959 continue stateloop;
2960 default:
2961 if (c == additional) {
2962 emitOrAppendStrBuf(returnState);
2963 reconsume = true;
2964 state = transition(state, returnState, reconsume, pos);
2965 continue stateloop;
2966 }
2967 if (c >= 'a' && c <= 'z') {
2968 firstCharKey = c - 'a' + 26;
2969 } else if (c >= 'A' && c <= 'Z') {
2970 firstCharKey = c - 'A';
2971 } else {
2972 // No match
2973 /*
2974 * If no match can be made, then this is a parse
2975 * error.
2976 */
2977 errNoNamedCharacterMatch();
2978 emitOrAppendStrBuf(returnState);
2979 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
2980 cstart = pos;
2981 }
2982 reconsume = true;
2983 state = transition(state, returnState, reconsume, pos);
2984 continue stateloop;
2985 }
2986 // Didn't fail yet
2987 appendStrBuf(c);
2988 state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);
2989 // FALL THROUGH continue stateloop;
2990 }
2991 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2992 case CHARACTER_REFERENCE_HILO_LOOKUP:
2993 {
2994 if (++pos == endPos) {
2995 break stateloop;
2996 }
2997 c = checkChar(buf, pos);
2998 if (c == '\u0000') {
2999 break stateloop;
3000 }
3001 /*
3002 * The data structure is as follows:
3003 *
3004 * HILO_ACCEL is a two-dimensional int array whose major
3005 * index corresponds to the second character of the
3006 * character reference (code point as index) and the
3007 * minor index corresponds to the first character of the
3008 * character reference (packed so that A-Z runs from 0
3009 * to 25 and a-z runs from 26 to 51). This layout makes
3010 * it easier to use the sparseness of the data structure
3011 * to omit parts of it: The second dimension of the
3012 * table is null when no character reference starts with
3013 * the character corresponding to that row.
3014 *
3015 * The int value HILO_ACCEL (by these indices) is zero
3016 * if there exists no character reference starting with
3017 * that two-letter prefix. Otherwise, the value is an
3018 * int that packs two shorts so that the higher short is
3019 * the index of the highest character reference name
3020 * with that prefix in NAMES and the lower short
3021 * corresponds to the index of the lowest character
3022 * reference name with that prefix. (It happens that the
3023 * first two character reference names share their
3024 * prefix so the packed int cannot be 0 by packing the
3025 * two shorts.)
3026 *
3027 * NAMES is an array of byte arrays where each byte
3028 * array encodes the name of a character reference as
3029 * ASCII. The names omit the first two letters of the
3030 * name. (Since storing the first two letters would be
3031 * redundant with the data contained in HILO_ACCEL.) The
3032 * entries are lexically sorted.
3033 *
3034 * For a given index in NAMES, the same index in VALUES
3035 * contains the corresponding expansion as an array of
3036 * two UTF-16 code units (either the character and
3037 * U+0000 or a surrogate pair).
3038 */
3039 int hilo = 0;
3040 if (c <= 'z') {
3041 @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];
3042 if (row != null) {
3043 hilo = row[firstCharKey];
3044 }
3045 }
3046 if (hilo == 0) {
3047 /*
3048 * If no match can be made, then this is a parse
3049 * error.
3050 */
3051 errNoNamedCharacterMatch();
3052 emitOrAppendStrBuf(returnState);
3053 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3054 cstart = pos;
3055 }
3056 reconsume = true;
3057 state = transition(state, returnState, reconsume, pos);
3058 continue stateloop;
3059 }
3060 // Didn't fail yet
3061 appendStrBuf(c);
3062 lo = hilo & 0xFFFF;
3063 hi = hilo >> 16;
3064 entCol = -1;
3065 candidate = -1;
3066 strBufMark = 0;
3067 state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);
3068 // FALL THROUGH continue stateloop;
3069 }
3070 case CHARACTER_REFERENCE_TAIL:
3071 outer: for (;;) {
3072 if (++pos == endPos) {
3073 break stateloop;
3074 }
3075 c = checkChar(buf, pos);
3076 if (c == '\u0000') {
3077 break stateloop;
3078 }
3079 entCol++;
3080 /*
3081 * Consume the maximum number of characters possible,
3082 * with the consumed characters matching one of the
3083 * identifiers in the first column of the named
3084 * character references table (in a case-sensitive
3085 * manner).
3086 */
3087 loloop: for (;;) {
3088 if (hi < lo) {
3089 break outer;
3090 }
3091 if (entCol == NamedCharacters.NAMES[lo].length()) {
3092 candidate = lo;
3093 strBufMark = strBufLen;
3094 lo++;
3095 } else if (entCol > NamedCharacters.NAMES[lo].length()) {
3096 break outer;
3097 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
3098 lo++;
3099 } else {
3100 break loloop;
3101 }
3102 }
3104 hiloop: for (;;) {
3105 if (hi < lo) {
3106 break outer;
3107 }
3108 if (entCol == NamedCharacters.NAMES[hi].length()) {
3109 break hiloop;
3110 }
3111 if (entCol > NamedCharacters.NAMES[hi].length()) {
3112 break outer;
3113 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
3114 hi--;
3115 } else {
3116 break hiloop;
3117 }
3118 }
3120 if (c == ';') {
3121 // If we see a semicolon, there cannot be a
3122 // longer match. Break the loop. However, before
3123 // breaking, take the longest match so far as the
3124 // candidate, if we are just about to complete a
3125 // match.
3126 if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {
3127 candidate = lo;
3128 strBufMark = strBufLen;
3129 }
3130 break outer;
3131 }
3133 if (hi < lo) {
3134 break outer;
3135 }
3136 appendStrBuf(c);
3137 continue;
3138 }
3140 if (candidate == -1) {
3141 // reconsume deals with CR, LF or nul
3142 /*
3143 * If no match can be made, then this is a parse error.
3144 */
3145 errNoNamedCharacterMatch();
3146 emitOrAppendStrBuf(returnState);
3147 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3148 cstart = pos;
3149 }
3150 reconsume = true;
3151 state = transition(state, returnState, reconsume, pos);
3152 continue stateloop;
3153 } else {
3154 // c can't be CR, LF or nul if we got here
3155 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
3156 if (candidateName.length() == 0
3157 || candidateName.charAt(candidateName.length() - 1) != ';') {
3158 /*
3159 * If the last character matched is not a U+003B
3160 * SEMICOLON (;), there is a parse error.
3161 */
3162 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3163 /*
3164 * If the entity is being consumed as part of an
3165 * attribute, and the last character matched is
3166 * not a U+003B SEMICOLON (;),
3167 */
3168 char ch;
3169 if (strBufMark == strBufLen) {
3170 ch = c;
3171 } else {
3172 // if (strBufOffset != -1) {
3173 // ch = buf[strBufOffset + strBufMark];
3174 // } else {
3175 ch = strBuf[strBufMark];
3176 // }
3177 }
3178 if (ch == '=' || (ch >= '0' && ch <= '9')
3179 || (ch >= 'A' && ch <= 'Z')
3180 || (ch >= 'a' && ch <= 'z')) {
3181 /*
3182 * and the next character is either a U+003D
3183 * EQUALS SIGN character (=) or in the range
3184 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
3185 * U+0041 LATIN CAPITAL LETTER A to U+005A
3186 * LATIN CAPITAL LETTER Z, or U+0061 LATIN
3187 * SMALL LETTER A to U+007A LATIN SMALL
3188 * LETTER Z, then, for historical reasons,
3189 * all the characters that were matched
3190 * after the U+0026 AMPERSAND (&) must be
3191 * unconsumed, and nothing is returned.
3192 */
3193 errNoNamedCharacterMatch();
3194 appendStrBufToLongStrBuf();
3195 reconsume = true;
3196 state = transition(state, returnState, reconsume, pos);
3197 continue stateloop;
3198 }
3199 }
3200 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3201 errUnescapedAmpersandInterpretedAsCharacterReference();
3202 } else {
3203 errNotSemicolonTerminated();
3204 }
3205 }
3207 /*
3208 * Otherwise, return a character token for the character
3209 * corresponding to the entity name (as given by the
3210 * second column of the named character references
3211 * table).
3212 */
3213 // CPPONLY: completedNamedCharacterReference();
3214 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
3215 if (
3216 // [NOCPP[
3217 val.length == 1
3218 // ]NOCPP]
3219 // CPPONLY: val[1] == 0
3220 ) {
3221 emitOrAppendOne(val, returnState);
3222 } else {
3223 emitOrAppendTwo(val, returnState);
3224 }
3225 // this is so complicated!
3226 if (strBufMark < strBufLen) {
3227 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3228 for (int i = strBufMark; i < strBufLen; i++) {
3229 appendLongStrBuf(strBuf[i]);
3230 }
3231 } else {
3232 tokenHandler.characters(strBuf, strBufMark,
3233 strBufLen - strBufMark);
3234 }
3235 }
3236 // Check if we broke out early with c being the last
3237 // character that matched as opposed to being the
3238 // first one that didn't match. In the case of an
3239 // early break, the next run on text should start
3240 // *after* the current character and the current
3241 // character shouldn't be reconsumed.
3242 boolean earlyBreak = (c == ';' && strBufMark == strBufLen);
3243 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3244 cstart = earlyBreak ? pos + 1 : pos;
3245 }
3246 reconsume = !earlyBreak;
3247 state = transition(state, returnState, reconsume, pos);
3248 continue stateloop;
3249 /*
3250 * If the markup contains I'm ¬it; I tell you, the
3251 * entity is parsed as "not", as in, I'm ¬it; I tell
3252 * you. But if the markup was I'm ∉ I tell you,
3253 * the entity would be parsed as "notin;", resulting in
3254 * I'm ∉ I tell you.
3255 */
3256 }
3257 // XXX reorder point
3258 case CONSUME_NCR:
3259 if (++pos == endPos) {
3260 break stateloop;
3261 }
3262 c = checkChar(buf, pos);
3263 prevValue = -1;
3264 value = 0;
3265 seenDigits = false;
3266 /*
3267 * The behavior further depends on the character after the
3268 * U+0023 NUMBER SIGN:
3269 */
3270 switch (c) {
3271 case 'x':
3272 case 'X':
3274 /*
3275 * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
3276 * LETTER X Consume the X.
3277 *
3278 * Follow the steps below, but using the range of
3279 * characters U+0030 DIGIT ZERO through to U+0039
3280 * DIGIT NINE, U+0061 LATIN SMALL LETTER A through
3281 * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
3282 * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
3283 * LETTER F (in other words, 0-9, A-F, a-f).
3284 *
3285 * When it comes to interpreting the number,
3286 * interpret it as a hexadecimal number.
3287 */
3288 appendStrBuf(c);
3289 state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);
3290 continue stateloop;
3291 default:
3292 /*
3293 * Anything else Follow the steps below, but using
3294 * the range of characters U+0030 DIGIT ZERO through
3295 * to U+0039 DIGIT NINE (i.e. just 0-9).
3296 *
3297 * When it comes to interpreting the number,
3298 * interpret it as a decimal number.
3299 */
3300 reconsume = true;
3301 state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);
3302 // FALL THROUGH continue stateloop;
3303 }
3304 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3305 case DECIMAL_NRC_LOOP:
3306 decimalloop: for (;;) {
3307 if (reconsume) {
3308 reconsume = false;
3309 } else {
3310 if (++pos == endPos) {
3311 break stateloop;
3312 }
3313 c = checkChar(buf, pos);
3314 }
3315 // Deal with overflow gracefully
3316 if (value < prevValue) {
3317 value = 0x110000; // Value above Unicode range but
3318 // within int
3319 // range
3320 }
3321 prevValue = value;
3322 /*
3323 * Consume as many characters as match the range of
3324 * characters given above.
3325 */
3326 if (c >= '0' && c <= '9') {
3327 seenDigits = true;
3328 value *= 10;
3329 value += c - '0';
3330 continue;
3331 } else if (c == ';') {
3332 if (seenDigits) {
3333 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3334 cstart = pos + 1;
3335 }
3336 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3337 // FALL THROUGH continue stateloop;
3338 break decimalloop;
3339 } else {
3340 errNoDigitsInNCR();
3341 appendStrBuf(';');
3342 emitOrAppendStrBuf(returnState);
3343 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3344 cstart = pos + 1;
3345 }
3346 state = transition(state, returnState, reconsume, pos);
3347 continue stateloop;
3348 }
3349 } else {
3350 /*
3351 * If no characters match the range, then don't
3352 * consume any characters (and unconsume the U+0023
3353 * NUMBER SIGN character and, if appropriate, the X
3354 * character). This is a parse error; nothing is
3355 * returned.
3356 *
3357 * Otherwise, if the next character is a U+003B
3358 * SEMICOLON, consume that too. If it isn't, there
3359 * is a parse error.
3360 */
3361 if (!seenDigits) {
3362 errNoDigitsInNCR();
3363 emitOrAppendStrBuf(returnState);
3364 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3365 cstart = pos;
3366 }
3367 reconsume = true;
3368 state = transition(state, returnState, reconsume, pos);
3369 continue stateloop;
3370 } else {
3371 errCharRefLacksSemicolon();
3372 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3373 cstart = pos;
3374 }
3375 reconsume = true;
3376 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3377 // FALL THROUGH continue stateloop;
3378 break decimalloop;
3379 }
3380 }
3381 }
3382 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3383 case HANDLE_NCR_VALUE:
3384 // WARNING previous state sets reconsume
3385 // XXX inline this case if the method size can take it
3386 handleNcrValue(returnState);
3387 state = transition(state, returnState, reconsume, pos);
3388 continue stateloop;
3389 // XXX reorder point
3390 case HEX_NCR_LOOP:
3391 for (;;) {
3392 if (++pos == endPos) {
3393 break stateloop;
3394 }
3395 c = checkChar(buf, pos);
3396 // Deal with overflow gracefully
3397 if (value < prevValue) {
3398 value = 0x110000; // Value above Unicode range but
3399 // within int
3400 // range
3401 }
3402 prevValue = value;
3403 /*
3404 * Consume as many characters as match the range of
3405 * characters given above.
3406 */
3407 if (c >= '0' && c <= '9') {
3408 seenDigits = true;
3409 value *= 16;
3410 value += c - '0';
3411 continue;
3412 } else if (c >= 'A' && c <= 'F') {
3413 seenDigits = true;
3414 value *= 16;
3415 value += c - 'A' + 10;
3416 continue;
3417 } else if (c >= 'a' && c <= 'f') {
3418 seenDigits = true;
3419 value *= 16;
3420 value += c - 'a' + 10;
3421 continue;
3422 } else if (c == ';') {
3423 if (seenDigits) {
3424 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3425 cstart = pos + 1;
3426 }
3427 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3428 continue stateloop;
3429 } else {
3430 errNoDigitsInNCR();
3431 appendStrBuf(';');
3432 emitOrAppendStrBuf(returnState);
3433 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3434 cstart = pos + 1;
3435 }
3436 state = transition(state, returnState, reconsume, pos);
3437 continue stateloop;
3438 }
3439 } else {
3440 /*
3441 * If no characters match the range, then don't
3442 * consume any characters (and unconsume the U+0023
3443 * NUMBER SIGN character and, if appropriate, the X
3444 * character). This is a parse error; nothing is
3445 * returned.
3446 *
3447 * Otherwise, if the next character is a U+003B
3448 * SEMICOLON, consume that too. If it isn't, there
3449 * is a parse error.
3450 */
3451 if (!seenDigits) {
3452 errNoDigitsInNCR();
3453 emitOrAppendStrBuf(returnState);
3454 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3455 cstart = pos;
3456 }
3457 reconsume = true;
3458 state = transition(state, returnState, reconsume, pos);
3459 continue stateloop;
3460 } else {
3461 errCharRefLacksSemicolon();
3462 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3463 cstart = pos;
3464 }
3465 reconsume = true;
3466 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3467 continue stateloop;
3468 }
3469 }
3470 }
3471 // XXX reorder point
3472 case PLAINTEXT:
3473 plaintextloop: for (;;) {
3474 if (reconsume) {
3475 reconsume = false;
3476 } else {
3477 if (++pos == endPos) {
3478 break stateloop;
3479 }
3480 c = checkChar(buf, pos);
3481 }
3482 switch (c) {
3483 case '\u0000':
3484 emitPlaintextReplacementCharacter(buf, pos);
3485 continue;
3486 case '\r':
3487 emitCarriageReturn(buf, pos);
3488 break stateloop;
3489 case '\n':
3490 silentLineFeed();
3491 default:
3492 /*
3493 * Anything else Emit the current input
3494 * character as a character token. Stay in the
3495 * PLAINTEXT state.
3496 */
3497 continue;
3498 }
3499 }
3500 // XXX reorder point
3501 case CLOSE_TAG_OPEN:
3502 if (++pos == endPos) {
3503 break stateloop;
3504 }
3505 c = checkChar(buf, pos);
3506 /*
3507 * Otherwise, if the content model flag is set to the PCDATA
3508 * state, or if the next few characters do match that tag
3509 * name, consume the next input character:
3510 */
3511 switch (c) {
3512 case '>':
3513 /* U+003E GREATER-THAN SIGN (>) Parse error. */
3514 errLtSlashGt();
3515 /*
3516 * Switch to the data state.
3517 */
3518 cstart = pos + 1;
3519 state = transition(state, Tokenizer.DATA, reconsume, pos);
3520 continue stateloop;
3521 case '\r':
3522 silentCarriageReturn();
3523 /* Anything else Parse error. */
3524 errGarbageAfterLtSlash();
3525 /*
3526 * Switch to the bogus comment state.
3527 */
3528 clearLongStrBufAndAppend('\n');
3529 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3530 break stateloop;
3531 case '\n':
3532 silentLineFeed();
3533 /* Anything else Parse error. */
3534 errGarbageAfterLtSlash();
3535 /*
3536 * Switch to the bogus comment state.
3537 */
3538 clearLongStrBufAndAppend('\n');
3539 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3540 continue stateloop;
3541 case '\u0000':
3542 c = '\uFFFD';
3543 // fall thru
3544 default:
3545 if (c >= 'A' && c <= 'Z') {
3546 c += 0x20;
3547 }
3548 if (c >= 'a' && c <= 'z') {
3549 /*
3550 * U+0061 LATIN SMALL LETTER A through to U+007A
3551 * LATIN SMALL LETTER Z Create a new end tag
3552 * token,
3553 */
3554 endTag = true;
3555 /*
3556 * set its tag name to the input character,
3557 */
3558 clearStrBufAndAppend(c);
3559 /*
3560 * then switch to the tag name state. (Don't
3561 * emit the token yet; further details will be
3562 * filled in before it is emitted.)
3563 */
3564 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
3565 continue stateloop;
3566 } else {
3567 /* Anything else Parse error. */
3568 errGarbageAfterLtSlash();
3569 /*
3570 * Switch to the bogus comment state.
3571 */
3572 clearLongStrBufAndAppend(c);
3573 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3574 continue stateloop;
3575 }
3576 }
3577 // XXX reorder point
3578 case RCDATA:
3579 rcdataloop: for (;;) {
3580 if (reconsume) {
3581 reconsume = false;
3582 } else {
3583 if (++pos == endPos) {
3584 break stateloop;
3585 }
3586 c = checkChar(buf, pos);
3587 }
3588 switch (c) {
3589 case '&':
3590 /*
3591 * U+0026 AMPERSAND (&) Switch to the character
3592 * reference in RCDATA state.
3593 */
3594 flushChars(buf, pos);
3595 clearStrBufAndAppend(c);
3596 additional = '\u0000';
3597 returnState = state;
3598 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
3599 continue stateloop;
3600 case '<':
3601 /*
3602 * U+003C LESS-THAN SIGN (<) Switch to the
3603 * RCDATA less-than sign state.
3604 */
3605 flushChars(buf, pos);
3607 returnState = state;
3608 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
3609 continue stateloop;
3610 case '\u0000':
3611 emitReplacementCharacter(buf, pos);
3612 continue;
3613 case '\r':
3614 emitCarriageReturn(buf, pos);
3615 break stateloop;
3616 case '\n':
3617 silentLineFeed();
3618 default:
3619 /*
3620 * Emit the current input character as a
3621 * character token. Stay in the RCDATA state.
3622 */
3623 continue;
3624 }
3625 }
3626 // XXX reorder point
3627 case RAWTEXT:
3628 rawtextloop: for (;;) {
3629 if (reconsume) {
3630 reconsume = false;
3631 } else {
3632 if (++pos == endPos) {
3633 break stateloop;
3634 }
3635 c = checkChar(buf, pos);
3636 }
3637 switch (c) {
3638 case '<':
3639 /*
3640 * U+003C LESS-THAN SIGN (<) Switch to the
3641 * RAWTEXT less-than sign state.
3642 */
3643 flushChars(buf, pos);
3645 returnState = state;
3646 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
3647 break rawtextloop;
3648 // FALL THRU continue stateloop;
3649 case '\u0000':
3650 emitReplacementCharacter(buf, pos);
3651 continue;
3652 case '\r':
3653 emitCarriageReturn(buf, pos);
3654 break stateloop;
3655 case '\n':
3656 silentLineFeed();
3657 default:
3658 /*
3659 * Emit the current input character as a
3660 * character token. Stay in the RAWTEXT state.
3661 */
3662 continue;
3663 }
3664 }
3665 // XXX fallthru don't reorder
3666 case RAWTEXT_RCDATA_LESS_THAN_SIGN:
3667 rawtextrcdatalessthansignloop: for (;;) {
3668 if (++pos == endPos) {
3669 break stateloop;
3670 }
3671 c = checkChar(buf, pos);
3672 switch (c) {
3673 case '/':
3674 /*
3675 * U+002F SOLIDUS (/) Set the temporary buffer
3676 * to the empty string. Switch to the RCDATA or
3677 * RAWTEXT end tag open state.
3678 */
3679 index = 0;
3680 clearStrBuf();
3681 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
3682 break rawtextrcdatalessthansignloop;
3683 // FALL THRU continue stateloop;
3684 default:
3685 /*
3686 * Otherwise, emit a U+003C LESS-THAN SIGN
3687 * character token
3688 */
3689 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
3690 /*
3691 * and reconsume the current input character in
3692 * the return state (RCDATA or RAWTEXT).
3693 */
3694 cstart = pos;
3695 reconsume = true;
3696 state = transition(state, returnState, reconsume, pos);
3697 continue stateloop;
3698 }
3699 }
3700 // XXX fall thru. don't reorder.
3701 case NON_DATA_END_TAG_NAME:
3702 for (;;) {
3703 if (++pos == endPos) {
3704 break stateloop;
3705 }
3706 c = checkChar(buf, pos);
3707 /*
3708 * ASSERT! when entering this state, set index to 0 and
3709 * call clearStrBuf() assert (contentModelElement !=
3710 * null); Let's implement the above without lookahead.
3711 * strBuf is the 'temporary buffer'.
3712 */
3713 if (index < endTagExpectationAsArray.length) {
3714 char e = endTagExpectationAsArray[index];
3715 char folded = c;
3716 if (c >= 'A' && c <= 'Z') {
3717 folded += 0x20;
3718 }
3719 if (folded != e) {
3720 // [NOCPP[
3721 errHtml4LtSlashInRcdata(folded);
3722 // ]NOCPP]
3723 tokenHandler.characters(Tokenizer.LT_SOLIDUS,
3724 0, 2);
3725 emitStrBuf();
3726 cstart = pos;
3727 reconsume = true;
3728 state = transition(state, returnState, reconsume, pos);
3729 continue stateloop;
3730 }
3731 appendStrBuf(c);
3732 index++;
3733 continue;
3734 } else {
3735 endTag = true;
3736 // XXX replace contentModelElement with different
3737 // type
3738 tagName = endTagExpectation;
3739 switch (c) {
3740 case '\r':
3741 silentCarriageReturn();
3742 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
3743 break stateloop;
3744 case '\n':
3745 silentLineFeed();
3746 // fall thru
3747 case ' ':
3748 case '\t':
3749 case '\u000C':
3750 /*
3751 * U+0009 CHARACTER TABULATION U+000A LINE
3752 * FEED (LF) U+000C FORM FEED (FF) U+0020
3753 * SPACE If the current end tag token is an
3754 * appropriate end tag token, then switch to
3755 * the before attribute name state.
3756 */
3757 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
3758 continue stateloop;
3759 case '/':
3760 /*
3761 * U+002F SOLIDUS (/) If the current end tag
3762 * token is an appropriate end tag token,
3763 * then switch to the self-closing start tag
3764 * state.
3765 */
3766 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
3767 continue stateloop;
3768 case '>':
3769 /*
3770 * U+003E GREATER-THAN SIGN (>) If the
3771 * current end tag token is an appropriate
3772 * end tag token, then emit the current tag
3773 * token and switch to the data state.
3774 */
3775 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
3776 if (shouldSuspend) {
3777 break stateloop;
3778 }
3779 continue stateloop;
3780 default:
3781 /*
3782 * Emit a U+003C LESS-THAN SIGN character
3783 * token, a U+002F SOLIDUS character token,
3784 * a character token for each of the
3785 * characters in the temporary buffer (in
3786 * the order they were added to the buffer),
3787 * and reconsume the current input character
3788 * in the RAWTEXT state.
3789 */
3790 // [NOCPP[
3791 errWarnLtSlashInRcdata();
3792 // ]NOCPP]
3793 tokenHandler.characters(
3794 Tokenizer.LT_SOLIDUS, 0, 2);
3795 emitStrBuf();
3796 if (c == '\u0000') {
3797 emitReplacementCharacter(buf, pos);
3798 } else {
3799 cstart = pos; // don't drop the
3800 // character
3801 }
3802 state = transition(state, returnState, reconsume, pos);
3803 continue stateloop;
3804 }
3805 }
3806 }
3807 // XXX reorder point
3808 // BEGIN HOTSPOT WORKAROUND
3809 case BOGUS_COMMENT:
3810 boguscommentloop: for (;;) {
3811 if (reconsume) {
3812 reconsume = false;
3813 } else {
3814 if (++pos == endPos) {
3815 break stateloop;
3816 }
3817 c = checkChar(buf, pos);
3818 }
3819 /*
3820 * Consume every character up to and including the first
3821 * U+003E GREATER-THAN SIGN character (>) or the end of
3822 * the file (EOF), whichever comes first. Emit a comment
3823 * token whose data is the concatenation of all the
3824 * characters starting from and including the character
3825 * that caused the state machine to switch into the
3826 * bogus comment state, up to and including the
3827 * character immediately before the last consumed
3828 * character (i.e. up to the character just before the
3829 * U+003E or EOF character). (If the comment was started
3830 * by the end of the file (EOF), the token is empty.)
3831 *
3832 * Switch to the data state.
3833 *
3834 * If the end of the file was reached, reconsume the EOF
3835 * character.
3836 */
3837 switch (c) {
3838 case '>':
3839 emitComment(0, pos);
3840 state = transition(state, Tokenizer.DATA, reconsume, pos);
3841 continue stateloop;
3842 case '-':
3843 appendLongStrBuf(c);
3844 state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);
3845 break boguscommentloop;
3846 case '\r':
3847 appendLongStrBufCarriageReturn();
3848 break stateloop;
3849 case '\n':
3850 appendLongStrBufLineFeed();
3851 continue;
3852 case '\u0000':
3853 c = '\uFFFD';
3854 // fall thru
3855 default:
3856 appendLongStrBuf(c);
3857 continue;
3858 }
3859 }
3860 // FALLTHRU DON'T REORDER
3861 case BOGUS_COMMENT_HYPHEN:
3862 boguscommenthyphenloop: for (;;) {
3863 if (++pos == endPos) {
3864 break stateloop;
3865 }
3866 c = checkChar(buf, pos);
3867 switch (c) {
3868 case '>':
3869 // [NOCPP[
3870 maybeAppendSpaceToBogusComment();
3871 // ]NOCPP]
3872 emitComment(0, pos);
3873 state = transition(state, Tokenizer.DATA, reconsume, pos);
3874 continue stateloop;
3875 case '-':
3876 appendSecondHyphenToBogusComment();
3877 continue boguscommenthyphenloop;
3878 case '\r':
3879 appendLongStrBufCarriageReturn();
3880 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3881 break stateloop;
3882 case '\n':
3883 appendLongStrBufLineFeed();
3884 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3885 continue stateloop;
3886 case '\u0000':
3887 c = '\uFFFD';
3888 // fall thru
3889 default:
3890 appendLongStrBuf(c);
3891 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3892 continue stateloop;
3893 }
3894 }
3895 // XXX reorder point
3896 case SCRIPT_DATA:
3897 scriptdataloop: for (;;) {
3898 if (reconsume) {
3899 reconsume = false;
3900 } else {
3901 if (++pos == endPos) {
3902 break stateloop;
3903 }
3904 c = checkChar(buf, pos);
3905 }
3906 switch (c) {
3907 case '<':
3908 /*
3909 * U+003C LESS-THAN SIGN (<) Switch to the
3910 * script data less-than sign state.
3911 */
3912 flushChars(buf, pos);
3913 returnState = state;
3914 state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);
3915 break scriptdataloop; // FALL THRU continue
3916 // stateloop;
3917 case '\u0000':
3918 emitReplacementCharacter(buf, pos);
3919 continue;
3920 case '\r':
3921 emitCarriageReturn(buf, pos);
3922 break stateloop;
3923 case '\n':
3924 silentLineFeed();
3925 default:
3926 /*
3927 * Anything else Emit the current input
3928 * character as a character token. Stay in the
3929 * script data state.
3930 */
3931 continue;
3932 }
3933 }
3934 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3935 case SCRIPT_DATA_LESS_THAN_SIGN:
3936 scriptdatalessthansignloop: for (;;) {
3937 if (++pos == endPos) {
3938 break stateloop;
3939 }
3940 c = checkChar(buf, pos);
3941 switch (c) {
3942 case '/':
3943 /*
3944 * U+002F SOLIDUS (/) Set the temporary buffer
3945 * to the empty string. Switch to the script
3946 * data end tag open state.
3947 */
3948 index = 0;
3949 clearStrBuf();
3950 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
3951 continue stateloop;
3952 case '!':
3953 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
3954 cstart = pos;
3955 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);
3956 break scriptdatalessthansignloop; // FALL THRU
3957 // continue
3958 // stateloop;
3959 default:
3960 /*
3961 * Otherwise, emit a U+003C LESS-THAN SIGN
3962 * character token
3963 */
3964 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
3965 /*
3966 * and reconsume the current input character in
3967 * the script data state.
3968 */
3969 cstart = pos;
3970 reconsume = true;
3971 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
3972 continue stateloop;
3973 }
3974 }
3975 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3976 case SCRIPT_DATA_ESCAPE_START:
3977 scriptdataescapestartloop: for (;;) {
3978 if (++pos == endPos) {
3979 break stateloop;
3980 }
3981 c = checkChar(buf, pos);
3982 /*
3983 * Consume the next input character:
3984 */
3985 switch (c) {
3986 case '-':
3987 /*
3988 * U+002D HYPHEN-MINUS (-) Emit a U+002D
3989 * HYPHEN-MINUS character token. Switch to the
3990 * script data escape start dash state.
3991 */
3992 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos);
3993 break scriptdataescapestartloop; // FALL THRU
3994 // continue
3995 // stateloop;
3996 default:
3997 /*
3998 * Anything else Reconsume the current input
3999 * character in the script data state.
4000 */
4001 reconsume = true;
4002 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4003 continue stateloop;
4004 }
4005 }
4006 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4007 case SCRIPT_DATA_ESCAPE_START_DASH:
4008 scriptdataescapestartdashloop: for (;;) {
4009 if (++pos == endPos) {
4010 break stateloop;
4011 }
4012 c = checkChar(buf, pos);
4013 /*
4014 * Consume the next input character:
4015 */
4016 switch (c) {
4017 case '-':
4018 /*
4019 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4020 * HYPHEN-MINUS character token. Switch to the
4021 * script data escaped dash dash state.
4022 */
4023 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
4024 break scriptdataescapestartdashloop;
4025 // continue stateloop;
4026 default:
4027 /*
4028 * Anything else Reconsume the current input
4029 * character in the script data state.
4030 */
4031 reconsume = true;
4032 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4033 continue stateloop;
4034 }
4035 }
4036 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4037 case SCRIPT_DATA_ESCAPED_DASH_DASH:
4038 scriptdataescapeddashdashloop: for (;;) {
4039 if (++pos == endPos) {
4040 break stateloop;
4041 }
4042 c = checkChar(buf, pos);
4043 /*
4044 * Consume the next input character:
4045 */
4046 switch (c) {
4047 case '-':
4048 /*
4049 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4050 * HYPHEN-MINUS character token. Stay in the
4051 * script data escaped dash dash state.
4052 */
4053 continue;
4054 case '<':
4055 /*
4056 * U+003C LESS-THAN SIGN (<) Switch to the
4057 * script data escaped less-than sign state.
4058 */
4059 flushChars(buf, pos);
4060 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4061 continue stateloop;
4062 case '>':
4063 /*
4064 * U+003E GREATER-THAN SIGN (>) Emit a U+003E
4065 * GREATER-THAN SIGN character token. Switch to
4066 * the script data state.
4067 */
4068 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4069 continue stateloop;
4070 case '\u0000':
4071 emitReplacementCharacter(buf, pos);
4072 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4073 break scriptdataescapeddashdashloop;
4074 case '\r':
4075 emitCarriageReturn(buf, pos);
4076 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4077 break stateloop;
4078 case '\n':
4079 silentLineFeed();
4080 default:
4081 /*
4082 * Anything else Emit the current input
4083 * character as a character token. Switch to the
4084 * script data escaped state.
4085 */
4086 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4087 break scriptdataescapeddashdashloop;
4088 // continue stateloop;
4089 }
4090 }
4091 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4092 case SCRIPT_DATA_ESCAPED:
4093 scriptdataescapedloop: for (;;) {
4094 if (reconsume) {
4095 reconsume = false;
4096 } else {
4097 if (++pos == endPos) {
4098 break stateloop;
4099 }
4100 c = checkChar(buf, pos);
4101 }
4102 /*
4103 * Consume the next input character:
4104 */
4105 switch (c) {
4106 case '-':
4107 /*
4108 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4109 * HYPHEN-MINUS character token. Switch to the
4110 * script data escaped dash state.
4111 */
4112 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);
4113 break scriptdataescapedloop; // FALL THRU
4114 // continue
4115 // stateloop;
4116 case '<':
4117 /*
4118 * U+003C LESS-THAN SIGN (<) Switch to the
4119 * script data escaped less-than sign state.
4120 */
4121 flushChars(buf, pos);
4122 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4123 continue stateloop;
4124 case '\u0000':
4125 emitReplacementCharacter(buf, pos);
4126 continue;
4127 case '\r':
4128 emitCarriageReturn(buf, pos);
4129 break stateloop;
4130 case '\n':
4131 silentLineFeed();
4132 default:
4133 /*
4134 * Anything else Emit the current input
4135 * character as a character token. Stay in the
4136 * script data escaped state.
4137 */
4138 continue;
4139 }
4140 }
4141 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4142 case SCRIPT_DATA_ESCAPED_DASH:
4143 scriptdataescapeddashloop: for (;;) {
4144 if (++pos == endPos) {
4145 break stateloop;
4146 }
4147 c = checkChar(buf, pos);
4148 /*
4149 * Consume the next input character:
4150 */
4151 switch (c) {
4152 case '-':
4153 /*
4154 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4155 * HYPHEN-MINUS character token. Switch to the
4156 * script data escaped dash dash state.
4157 */
4158 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
4159 continue stateloop;
4160 case '<':
4161 /*
4162 * U+003C LESS-THAN SIGN (<) Switch to the
4163 * script data escaped less-than sign state.
4164 */
4165 flushChars(buf, pos);
4166 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4167 break scriptdataescapeddashloop;
4168 // continue stateloop;
4169 case '\u0000':
4170 emitReplacementCharacter(buf, pos);
4171 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4172 continue stateloop;
4173 case '\r':
4174 emitCarriageReturn(buf, pos);
4175 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4176 break stateloop;
4177 case '\n':
4178 silentLineFeed();
4179 default:
4180 /*
4181 * Anything else Emit the current input
4182 * character as a character token. Switch to the
4183 * script data escaped state.
4184 */
4185 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4186 continue stateloop;
4187 }
4188 }
4189 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4190 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
4191 scriptdataescapedlessthanloop: for (;;) {
4192 if (++pos == endPos) {
4193 break stateloop;
4194 }
4195 c = checkChar(buf, pos);
4196 /*
4197 * Consume the next input character:
4198 */
4199 switch (c) {
4200 case '/':
4201 /*
4202 * U+002F SOLIDUS (/) Set the temporary buffer
4203 * to the empty string. Switch to the script
4204 * data escaped end tag open state.
4205 */
4206 index = 0;
4207 clearStrBuf();
4208 returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
4209 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
4210 continue stateloop;
4211 case 'S':
4212 case 's':
4213 /*
4214 * U+0041 LATIN CAPITAL LETTER A through to
4215 * U+005A LATIN CAPITAL LETTER Z Emit a U+003C
4216 * LESS-THAN SIGN character token and the
4217 * current input character as a character token.
4218 */
4219 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4220 cstart = pos;
4221 index = 1;
4222 /*
4223 * Set the temporary buffer to the empty string.
4224 * Append the lowercase version of the current
4225 * input character (add 0x0020 to the
4226 * character's code point) to the temporary
4227 * buffer. Switch to the script data double
4228 * escape start state.
4229 */
4230 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);
4231 break scriptdataescapedlessthanloop;
4232 // continue stateloop;
4233 default:
4234 /*
4235 * Anything else Emit a U+003C LESS-THAN SIGN
4236 * character token and reconsume the current
4237 * input character in the script data escaped
4238 * state.
4239 */
4240 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4241 cstart = pos;
4242 reconsume = true;
4243 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4244 continue stateloop;
4245 }
4246 }
4247 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4248 case SCRIPT_DATA_DOUBLE_ESCAPE_START:
4249 scriptdatadoubleescapestartloop: for (;;) {
4250 if (++pos == endPos) {
4251 break stateloop;
4252 }
4253 c = checkChar(buf, pos);
4254 assert index > 0;
4255 if (index < 6) { // SCRIPT_ARR.length
4256 char folded = c;
4257 if (c >= 'A' && c <= 'Z') {
4258 folded += 0x20;
4259 }
4260 if (folded != Tokenizer.SCRIPT_ARR[index]) {
4261 reconsume = true;
4262 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4263 continue stateloop;
4264 }
4265 index++;
4266 continue;
4267 }
4268 switch (c) {
4269 case '\r':
4270 emitCarriageReturn(buf, pos);
4271 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4272 break stateloop;
4273 case '\n':
4274 silentLineFeed();
4275 case ' ':
4276 case '\t':
4277 case '\u000C':
4278 case '/':
4279 case '>':
4280 /*
4281 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4282 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4283 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
4284 * (>) Emit the current input character as a
4285 * character token. If the temporary buffer is
4286 * the string "script", then switch to the
4287 * script data double escaped state.
4288 */
4289 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4290 break scriptdatadoubleescapestartloop;
4291 // continue stateloop;
4292 default:
4293 /*
4294 * Anything else Reconsume the current input
4295 * character in the script data escaped state.
4296 */
4297 reconsume = true;
4298 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4299 continue stateloop;
4300 }
4301 }
4302 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4303 case SCRIPT_DATA_DOUBLE_ESCAPED:
4304 scriptdatadoubleescapedloop: for (;;) {
4305 if (reconsume) {
4306 reconsume = false;
4307 } else {
4308 if (++pos == endPos) {
4309 break stateloop;
4310 }
4311 c = checkChar(buf, pos);
4312 }
4313 /*
4314 * Consume the next input character:
4315 */
4316 switch (c) {
4317 case '-':
4318 /*
4319 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4320 * HYPHEN-MINUS character token. Switch to the
4321 * script data double escaped dash state.
4322 */
4323 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);
4324 break scriptdatadoubleescapedloop; // FALL THRU
4325 // continue
4326 // stateloop;
4327 case '<':
4328 /*
4329 * U+003C LESS-THAN SIGN (<) Emit a U+003C
4330 * LESS-THAN SIGN character token. Switch to the
4331 * script data double escaped less-than sign
4332 * state.
4333 */
4334 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4335 continue stateloop;
4336 case '\u0000':
4337 emitReplacementCharacter(buf, pos);
4338 continue;
4339 case '\r':
4340 emitCarriageReturn(buf, pos);
4341 break stateloop;
4342 case '\n':
4343 silentLineFeed();
4344 default:
4345 /*
4346 * Anything else Emit the current input
4347 * character as a character token. Stay in the
4348 * script data double escaped state.
4349 */
4350 continue;
4351 }
4352 }
4353 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4354 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
4355 scriptdatadoubleescapeddashloop: for (;;) {
4356 if (++pos == endPos) {
4357 break stateloop;
4358 }
4359 c = checkChar(buf, pos);
4360 /*
4361 * Consume the next input character:
4362 */
4363 switch (c) {
4364 case '-':
4365 /*
4366 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4367 * HYPHEN-MINUS character token. Switch to the
4368 * script data double escaped dash dash state.
4369 */
4370 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);
4371 break scriptdatadoubleescapeddashloop;
4372 // continue stateloop;
4373 case '<':
4374 /*
4375 * U+003C LESS-THAN SIGN (<) Emit a U+003C
4376 * LESS-THAN SIGN character token. Switch to the
4377 * script data double escaped less-than sign
4378 * state.
4379 */
4380 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4381 continue stateloop;
4382 case '\u0000':
4383 emitReplacementCharacter(buf, pos);
4384 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4385 continue stateloop;
4386 case '\r':
4387 emitCarriageReturn(buf, pos);
4388 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4389 break stateloop;
4390 case '\n':
4391 silentLineFeed();
4392 default:
4393 /*
4394 * Anything else Emit the current input
4395 * character as a character token. Switch to the
4396 * script data double escaped state.
4397 */
4398 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4399 continue stateloop;
4400 }
4401 }
4402 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4403 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
4404 scriptdatadoubleescapeddashdashloop: for (;;) {
4405 if (++pos == endPos) {
4406 break stateloop;
4407 }
4408 c = checkChar(buf, pos);
4409 /*
4410 * Consume the next input character:
4411 */
4412 switch (c) {
4413 case '-':
4414 /*
4415 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4416 * HYPHEN-MINUS character token. Stay in the
4417 * script data double escaped dash dash state.
4418 */
4419 continue;
4420 case '<':
4421 /*
4422 * U+003C LESS-THAN SIGN (<) Emit a U+003C
4423 * LESS-THAN SIGN character token. Switch to the
4424 * script data double escaped less-than sign
4425 * state.
4426 */
4427 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4428 break scriptdatadoubleescapeddashdashloop;
4429 case '>':
4430 /*
4431 * U+003E GREATER-THAN SIGN (>) Emit a U+003E
4432 * GREATER-THAN SIGN character token. Switch to
4433 * the script data state.
4434 */
4435 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4436 continue stateloop;
4437 case '\u0000':
4438 emitReplacementCharacter(buf, pos);
4439 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4440 continue stateloop;
4441 case '\r':
4442 emitCarriageReturn(buf, pos);
4443 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4444 break stateloop;
4445 case '\n':
4446 silentLineFeed();
4447 default:
4448 /*
4449 * Anything else Emit the current input
4450 * character as a character token. Switch to the
4451 * script data double escaped state.
4452 */
4453 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4454 continue stateloop;
4455 }
4456 }
4457 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4458 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
4459 scriptdatadoubleescapedlessthanloop: for (;;) {
4460 if (++pos == endPos) {
4461 break stateloop;
4462 }
4463 c = checkChar(buf, pos);
4464 /*
4465 * Consume the next input character:
4466 */
4467 switch (c) {
4468 case '/':
4469 /*
4470 * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS
4471 * character token. Set the temporary buffer to
4472 * the empty string. Switch to the script data
4473 * double escape end state.
4474 */
4475 index = 0;
4476 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);
4477 break scriptdatadoubleescapedlessthanloop;
4478 default:
4479 /*
4480 * Anything else Reconsume the current input
4481 * character in the script data double escaped
4482 * state.
4483 */
4484 reconsume = true;
4485 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4486 continue stateloop;
4487 }
4488 }
4489 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4490 case SCRIPT_DATA_DOUBLE_ESCAPE_END:
4491 scriptdatadoubleescapeendloop: for (;;) {
4492 if (++pos == endPos) {
4493 break stateloop;
4494 }
4495 c = checkChar(buf, pos);
4496 if (index < 6) { // SCRIPT_ARR.length
4497 char folded = c;
4498 if (c >= 'A' && c <= 'Z') {
4499 folded += 0x20;
4500 }
4501 if (folded != Tokenizer.SCRIPT_ARR[index]) {
4502 reconsume = true;
4503 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4504 continue stateloop;
4505 }
4506 index++;
4507 continue;
4508 }
4509 switch (c) {
4510 case '\r':
4511 emitCarriageReturn(buf, pos);
4512 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4513 break stateloop;
4514 case '\n':
4515 silentLineFeed();
4516 case ' ':
4517 case '\t':
4518 case '\u000C':
4519 case '/':
4520 case '>':
4521 /*
4522 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4523 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4524 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
4525 * (>) Emit the current input character as a
4526 * character token. If the temporary buffer is
4527 * the string "script", then switch to the
4528 * script data escaped state.
4529 */
4530 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4531 continue stateloop;
4532 default:
4533 /*
4534 * Reconsume the current input character in the
4535 * script data double escaped state.
4536 */
4537 reconsume = true;
4538 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4539 continue stateloop;
4540 }
4541 }
4542 // XXX reorder point
4543 case MARKUP_DECLARATION_OCTYPE:
4544 markupdeclarationdoctypeloop: for (;;) {
4545 if (++pos == endPos) {
4546 break stateloop;
4547 }
4548 c = checkChar(buf, pos);
4549 if (index < 6) { // OCTYPE.length
4550 char folded = c;
4551 if (c >= 'A' && c <= 'Z') {
4552 folded += 0x20;
4553 }
4554 if (folded == Tokenizer.OCTYPE[index]) {
4555 appendLongStrBuf(c);
4556 } else {
4557 errBogusComment();
4558 reconsume = true;
4559 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
4560 continue stateloop;
4561 }
4562 index++;
4563 continue;
4564 } else {
4565 reconsume = true;
4566 state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);
4567 break markupdeclarationdoctypeloop;
4568 // continue stateloop;
4569 }
4570 }
4571 // FALLTHRU DON'T REORDER
4572 case DOCTYPE:
4573 doctypeloop: for (;;) {
4574 if (reconsume) {
4575 reconsume = false;
4576 } else {
4577 if (++pos == endPos) {
4578 break stateloop;
4579 }
4580 c = checkChar(buf, pos);
4581 }
4582 initDoctypeFields();
4583 /*
4584 * Consume the next input character:
4585 */
4586 switch (c) {
4587 case '\r':
4588 silentCarriageReturn();
4589 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4590 break stateloop;
4591 case '\n':
4592 silentLineFeed();
4593 // fall thru
4594 case ' ':
4595 case '\t':
4596 case '\u000C':
4597 /*
4598 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4599 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4600 * Switch to the before DOCTYPE name state.
4601 */
4602 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4603 break doctypeloop;
4604 // continue stateloop;
4605 default:
4606 /*
4607 * Anything else Parse error.
4608 */
4609 errMissingSpaceBeforeDoctypeName();
4610 /*
4611 * Reconsume the current character in the before
4612 * DOCTYPE name state.
4613 */
4614 reconsume = true;
4615 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4616 break doctypeloop;
4617 // continue stateloop;
4618 }
4619 }
4620 // FALLTHRU DON'T REORDER
4621 case BEFORE_DOCTYPE_NAME:
4622 beforedoctypenameloop: for (;;) {
4623 if (reconsume) {
4624 reconsume = false;
4625 } else {
4626 if (++pos == endPos) {
4627 break stateloop;
4628 }
4629 c = checkChar(buf, pos);
4630 }
4631 /*
4632 * Consume the next input character:
4633 */
4634 switch (c) {
4635 case '\r':
4636 silentCarriageReturn();
4637 break stateloop;
4638 case '\n':
4639 silentLineFeed();
4640 // fall thru
4641 case ' ':
4642 case '\t':
4643 case '\u000C':
4644 /*
4645 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4646 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4647 * in the before DOCTYPE name state.
4648 */
4649 continue;
4650 case '>':
4651 /*
4652 * U+003E GREATER-THAN SIGN (>) Parse error.
4653 */
4654 errNamelessDoctype();
4655 /*
4656 * Create a new DOCTYPE token. Set its
4657 * force-quirks flag to on.
4658 */
4659 forceQuirks = true;
4660 /*
4661 * Emit the token.
4662 */
4663 emitDoctypeToken(pos);
4664 /*
4665 * Switch to the data state.
4666 */
4667 state = transition(state, Tokenizer.DATA, reconsume, pos);
4668 continue stateloop;
4669 case '\u0000':
4670 c = '\uFFFD';
4671 // fall thru
4672 default:
4673 if (c >= 'A' && c <= 'Z') {
4674 /*
4675 * U+0041 LATIN CAPITAL LETTER A through to
4676 * U+005A LATIN CAPITAL LETTER Z Create a
4677 * new DOCTYPE token. Set the token's name
4678 * to the lowercase version of the input
4679 * character (add 0x0020 to the character's
4680 * code point).
4681 */
4682 c += 0x20;
4683 }
4684 /* Anything else Create a new DOCTYPE token. */
4685 /*
4686 * Set the token's name to the current
4687 * input character.
4688 */
4689 clearStrBufAndAppend(c);
4690 /*
4691 * Switch to the DOCTYPE name state.
4692 */
4693 state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);
4694 break beforedoctypenameloop;
4695 // continue stateloop;
4696 }
4697 }
4698 // FALLTHRU DON'T REORDER
4699 case DOCTYPE_NAME:
4700 doctypenameloop: for (;;) {
4701 if (++pos == endPos) {
4702 break stateloop;
4703 }
4704 c = checkChar(buf, pos);
4705 /*
4706 * Consume the next input character:
4707 */
4708 switch (c) {
4709 case '\r':
4710 silentCarriageReturn();
4711 strBufToDoctypeName();
4712 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
4713 break stateloop;
4714 case '\n':
4715 silentLineFeed();
4716 // fall thru
4717 case ' ':
4718 case '\t':
4719 case '\u000C':
4720 /*
4721 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4722 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4723 * Switch to the after DOCTYPE name state.
4724 */
4725 strBufToDoctypeName();
4726 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
4727 break doctypenameloop;
4728 // continue stateloop;
4729 case '>':
4730 /*
4731 * U+003E GREATER-THAN SIGN (>) Emit the current
4732 * DOCTYPE token.
4733 */
4734 strBufToDoctypeName();
4735 emitDoctypeToken(pos);
4736 /*
4737 * Switch to the data state.
4738 */
4739 state = transition(state, Tokenizer.DATA, reconsume, pos);
4740 continue stateloop;
4741 case '\u0000':
4742 c = '\uFFFD';
4743 // fall thru
4744 default:
4745 /*
4746 * U+0041 LATIN CAPITAL LETTER A through to
4747 * U+005A LATIN CAPITAL LETTER Z Append the
4748 * lowercase version of the input character (add
4749 * 0x0020 to the character's code point) to the
4750 * current DOCTYPE token's name.
4751 */
4752 if (c >= 'A' && c <= 'Z') {
4753 c += 0x0020;
4754 }
4755 /*
4756 * Anything else Append the current input
4757 * character to the current DOCTYPE token's
4758 * name.
4759 */
4760 appendStrBuf(c);
4761 /*
4762 * Stay in the DOCTYPE name state.
4763 */
4764 continue;
4765 }
4766 }
4767 // FALLTHRU DON'T REORDER
4768 case AFTER_DOCTYPE_NAME:
4769 afterdoctypenameloop: for (;;) {
4770 if (++pos == endPos) {
4771 break stateloop;
4772 }
4773 c = checkChar(buf, pos);
4774 /*
4775 * Consume the next input character:
4776 */
4777 switch (c) {
4778 case '\r':
4779 silentCarriageReturn();
4780 break stateloop;
4781 case '\n':
4782 silentLineFeed();
4783 // fall thru
4784 case ' ':
4785 case '\t':
4786 case '\u000C':
4787 /*
4788 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4789 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4790 * in the after DOCTYPE name state.
4791 */
4792 continue;
4793 case '>':
4794 /*
4795 * U+003E GREATER-THAN SIGN (>) Emit the current
4796 * DOCTYPE token.
4797 */
4798 emitDoctypeToken(pos);
4799 /*
4800 * Switch to the data state.
4801 */
4802 state = transition(state, Tokenizer.DATA, reconsume, pos);
4803 continue stateloop;
4804 case 'p':
4805 case 'P':
4806 index = 0;
4807 state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);
4808 break afterdoctypenameloop;
4809 // continue stateloop;
4810 case 's':
4811 case 'S':
4812 index = 0;
4813 state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);
4814 continue stateloop;
4815 default:
4816 /*
4817 * Otherwise, this is a parse error.
4818 */
4819 bogusDoctype();
4821 /*
4822 * Set the DOCTYPE token's force-quirks flag to
4823 * on.
4824 */
4825 // done by bogusDoctype();
4826 /*
4827 * Switch to the bogus DOCTYPE state.
4828 */
4829 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4830 continue stateloop;
4831 }
4832 }
4833 // FALLTHRU DON'T REORDER
4834 case DOCTYPE_UBLIC:
4835 doctypeublicloop: for (;;) {
4836 if (++pos == endPos) {
4837 break stateloop;
4838 }
4839 c = checkChar(buf, pos);
4840 /*
4841 * If the six characters starting from the current input
4842 * character are an ASCII case-insensitive match for the
4843 * word "PUBLIC", then consume those characters and
4844 * switch to the before DOCTYPE public identifier state.
4845 */
4846 if (index < 5) { // UBLIC.length
4847 char folded = c;
4848 if (c >= 'A' && c <= 'Z') {
4849 folded += 0x20;
4850 }
4851 if (folded != Tokenizer.UBLIC[index]) {
4852 bogusDoctype();
4853 // forceQuirks = true;
4854 reconsume = true;
4855 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4856 continue stateloop;
4857 }
4858 index++;
4859 continue;
4860 } else {
4861 reconsume = true;
4862 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);
4863 break doctypeublicloop;
4864 // continue stateloop;
4865 }
4866 }
4867 // FALLTHRU DON'T REORDER
4868 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
4869 afterdoctypepublickeywordloop: for (;;) {
4870 if (reconsume) {
4871 reconsume = false;
4872 } else {
4873 if (++pos == endPos) {
4874 break stateloop;
4875 }
4876 c = checkChar(buf, pos);
4877 }
4878 /*
4879 * Consume the next input character:
4880 */
4881 switch (c) {
4882 case '\r':
4883 silentCarriageReturn();
4884 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
4885 break stateloop;
4886 case '\n':
4887 silentLineFeed();
4888 // fall thru
4889 case ' ':
4890 case '\t':
4891 case '\u000C':
4892 /*
4893 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4894 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4895 * Switch to the before DOCTYPE public
4896 * identifier state.
4897 */
4898 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
4899 break afterdoctypepublickeywordloop;
4900 // FALL THROUGH continue stateloop
4901 case '"':
4902 /*
4903 * U+0022 QUOTATION MARK (") Parse Error.
4904 */
4905 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
4906 /*
4907 * Set the DOCTYPE token's public identifier to
4908 * the empty string (not missing),
4909 */
4910 clearLongStrBuf();
4911 /*
4912 * then switch to the DOCTYPE public identifier
4913 * (double-quoted) state.
4914 */
4915 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
4916 continue stateloop;
4917 case '\'':
4918 /*
4919 * U+0027 APOSTROPHE (') Parse Error.
4920 */
4921 errNoSpaceBetweenDoctypePublicKeywordAndQuote();
4922 /*
4923 * Set the DOCTYPE token's public identifier to
4924 * the empty string (not missing),
4925 */
4926 clearLongStrBuf();
4927 /*
4928 * then switch to the DOCTYPE public identifier
4929 * (single-quoted) state.
4930 */
4931 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
4932 continue stateloop;
4933 case '>':
4934 /* U+003E GREATER-THAN SIGN (>) Parse error. */
4935 errExpectedPublicId();
4936 /*
4937 * Set the DOCTYPE token's force-quirks flag to
4938 * on.
4939 */
4940 forceQuirks = true;
4941 /*
4942 * Emit that DOCTYPE token.
4943 */
4944 emitDoctypeToken(pos);
4945 /*
4946 * Switch to the data state.
4947 */
4948 state = transition(state, Tokenizer.DATA, reconsume, pos);
4949 continue stateloop;
4950 default:
4951 bogusDoctype();
4952 /*
4953 * Set the DOCTYPE token's force-quirks flag to
4954 * on.
4955 */
4956 // done by bogusDoctype();
4957 /*
4958 * Switch to the bogus DOCTYPE state.
4959 */
4960 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4961 continue stateloop;
4962 }
4963 }
4964 // FALLTHRU DON'T REORDER
4965 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
4966 beforedoctypepublicidentifierloop: for (;;) {
4967 if (++pos == endPos) {
4968 break stateloop;
4969 }
4970 c = checkChar(buf, pos);
4971 /*
4972 * Consume the next input character:
4973 */
4974 switch (c) {
4975 case '\r':
4976 silentCarriageReturn();
4977 break stateloop;
4978 case '\n':
4979 silentLineFeed();
4980 // fall thru
4981 case ' ':
4982 case '\t':
4983 case '\u000C':
4984 /*
4985 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4986 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4987 * in the before DOCTYPE public identifier
4988 * state.
4989 */
4990 continue;
4991 case '"':
4992 /*
4993 * U+0022 QUOTATION MARK (") Set the DOCTYPE
4994 * token's public identifier to the empty string
4995 * (not missing),
4996 */
4997 clearLongStrBuf();
4998 /*
4999 * then switch to the DOCTYPE public identifier
5000 * (double-quoted) state.
5001 */
5002 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5003 break beforedoctypepublicidentifierloop;
5004 // continue stateloop;
5005 case '\'':
5006 /*
5007 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5008 * public identifier to the empty string (not
5009 * missing),
5010 */
5011 clearLongStrBuf();
5012 /*
5013 * then switch to the DOCTYPE public identifier
5014 * (single-quoted) state.
5015 */
5016 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5017 continue stateloop;
5018 case '>':
5019 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5020 errExpectedPublicId();
5021 /*
5022 * Set the DOCTYPE token's force-quirks flag to
5023 * on.
5024 */
5025 forceQuirks = true;
5026 /*
5027 * Emit that DOCTYPE token.
5028 */
5029 emitDoctypeToken(pos);
5030 /*
5031 * Switch to the data state.
5032 */
5033 state = transition(state, Tokenizer.DATA, reconsume, pos);
5034 continue stateloop;
5035 default:
5036 bogusDoctype();
5037 /*
5038 * Set the DOCTYPE token's force-quirks flag to
5039 * on.
5040 */
5041 // done by bogusDoctype();
5042 /*
5043 * Switch to the bogus DOCTYPE state.
5044 */
5045 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5046 continue stateloop;
5047 }
5048 }
5049 // FALLTHRU DON'T REORDER
5050 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
5051 doctypepublicidentifierdoublequotedloop: for (;;) {
5052 if (++pos == endPos) {
5053 break stateloop;
5054 }
5055 c = checkChar(buf, pos);
5056 /*
5057 * Consume the next input character:
5058 */
5059 switch (c) {
5060 case '"':
5061 /*
5062 * U+0022 QUOTATION MARK (") Switch to the after
5063 * DOCTYPE public identifier state.
5064 */
5065 publicIdentifier = longStrBufToString();
5066 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5067 break doctypepublicidentifierdoublequotedloop;
5068 // continue stateloop;
5069 case '>':
5070 /*
5071 * U+003E GREATER-THAN SIGN (>) Parse error.
5072 */
5073 errGtInPublicId();
5074 /*
5075 * Set the DOCTYPE token's force-quirks flag to
5076 * on.
5077 */
5078 forceQuirks = true;
5079 /*
5080 * Emit that DOCTYPE token.
5081 */
5082 publicIdentifier = longStrBufToString();
5083 emitDoctypeToken(pos);
5084 /*
5085 * Switch to the data state.
5086 */
5087 state = transition(state, Tokenizer.DATA, reconsume, pos);
5088 continue stateloop;
5089 case '\r':
5090 appendLongStrBufCarriageReturn();
5091 break stateloop;
5092 case '\n':
5093 appendLongStrBufLineFeed();
5094 continue;
5095 case '\u0000':
5096 c = '\uFFFD';
5097 // fall thru
5098 default:
5099 /*
5100 * Anything else Append the current input
5101 * character to the current DOCTYPE token's
5102 * public identifier.
5103 */
5104 appendLongStrBuf(c);
5105 /*
5106 * Stay in the DOCTYPE public identifier
5107 * (double-quoted) state.
5108 */
5109 continue;
5110 }
5111 }
5112 // FALLTHRU DON'T REORDER
5113 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
5114 afterdoctypepublicidentifierloop: for (;;) {
5115 if (++pos == endPos) {
5116 break stateloop;
5117 }
5118 c = checkChar(buf, pos);
5119 /*
5120 * Consume the next input character:
5121 */
5122 switch (c) {
5123 case '\r':
5124 silentCarriageReturn();
5125 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
5126 break stateloop;
5127 case '\n':
5128 silentLineFeed();
5129 // fall thru
5130 case ' ':
5131 case '\t':
5132 case '\u000C':
5133 /*
5134 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5135 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
5136 * Switch to the between DOCTYPE public and
5137 * system identifiers state.
5138 */
5139 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
5140 break afterdoctypepublicidentifierloop;
5141 // continue stateloop;
5142 case '>':
5143 /*
5144 * U+003E GREATER-THAN SIGN (>) Emit the current
5145 * DOCTYPE token.
5146 */
5147 emitDoctypeToken(pos);
5148 /*
5149 * Switch to the data state.
5150 */
5151 state = transition(state, Tokenizer.DATA, reconsume, pos);
5152 continue stateloop;
5153 case '"':
5154 /*
5155 * U+0022 QUOTATION MARK (") Parse error.
5156 */
5157 errNoSpaceBetweenPublicAndSystemIds();
5158 /*
5159 * Set the DOCTYPE token's system identifier to
5160 * the empty string (not missing),
5161 */
5162 clearLongStrBuf();
5163 /*
5164 * then switch to the DOCTYPE system identifier
5165 * (double-quoted) state.
5166 */
5167 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5168 continue stateloop;
5169 case '\'':
5170 /*
5171 * U+0027 APOSTROPHE (') Parse error.
5172 */
5173 errNoSpaceBetweenPublicAndSystemIds();
5174 /*
5175 * Set the DOCTYPE token's system identifier to
5176 * the empty string (not missing),
5177 */
5178 clearLongStrBuf();
5179 /*
5180 * then switch to the DOCTYPE system identifier
5181 * (single-quoted) state.
5182 */
5183 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5184 continue stateloop;
5185 default:
5186 bogusDoctype();
5187 /*
5188 * Set the DOCTYPE token's force-quirks flag to
5189 * on.
5190 */
5191 // done by bogusDoctype();
5192 /*
5193 * Switch to the bogus DOCTYPE state.
5194 */
5195 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5196 continue stateloop;
5197 }
5198 }
5199 // FALLTHRU DON'T REORDER
5200 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
5201 betweendoctypepublicandsystemidentifiersloop: for (;;) {
5202 if (++pos == endPos) {
5203 break stateloop;
5204 }
5205 c = checkChar(buf, pos);
5206 /*
5207 * Consume the next input character:
5208 */
5209 switch (c) {
5210 case '\r':
5211 silentCarriageReturn();
5212 break stateloop;
5213 case '\n':
5214 silentLineFeed();
5215 // fall thru
5216 case ' ':
5217 case '\t':
5218 case '\u000C':
5219 /*
5220 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5221 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5222 * in the between DOCTYPE public and system
5223 * identifiers state.
5224 */
5225 continue;
5226 case '>':
5227 /*
5228 * U+003E GREATER-THAN SIGN (>) Emit the current
5229 * DOCTYPE token.
5230 */
5231 emitDoctypeToken(pos);
5232 /*
5233 * Switch to the data state.
5234 */
5235 state = transition(state, Tokenizer.DATA, reconsume, pos);
5236 continue stateloop;
5237 case '"':
5238 /*
5239 * U+0022 QUOTATION MARK (") Set the DOCTYPE
5240 * token's system identifier to the empty string
5241 * (not missing),
5242 */
5243 clearLongStrBuf();
5244 /*
5245 * then switch to the DOCTYPE system identifier
5246 * (double-quoted) state.
5247 */
5248 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5249 break betweendoctypepublicandsystemidentifiersloop;
5250 // continue stateloop;
5251 case '\'':
5252 /*
5253 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5254 * system identifier to the empty string (not
5255 * missing),
5256 */
5257 clearLongStrBuf();
5258 /*
5259 * then switch to the DOCTYPE system identifier
5260 * (single-quoted) state.
5261 */
5262 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5263 continue stateloop;
5264 default:
5265 bogusDoctype();
5266 /*
5267 * Set the DOCTYPE token's force-quirks flag to
5268 * on.
5269 */
5270 // done by bogusDoctype();
5271 /*
5272 * Switch to the bogus DOCTYPE state.
5273 */
5274 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5275 continue stateloop;
5276 }
5277 }
5278 // FALLTHRU DON'T REORDER
5279 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
5280 doctypesystemidentifierdoublequotedloop: for (;;) {
5281 if (++pos == endPos) {
5282 break stateloop;
5283 }
5284 c = checkChar(buf, pos);
5285 /*
5286 * Consume the next input character:
5287 */
5288 switch (c) {
5289 case '"':
5290 /*
5291 * U+0022 QUOTATION MARK (") Switch to the after
5292 * DOCTYPE system identifier state.
5293 */
5294 systemIdentifier = longStrBufToString();
5295 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5296 continue stateloop;
5297 case '>':
5298 /*
5299 * U+003E GREATER-THAN SIGN (>) Parse error.
5300 */
5301 errGtInSystemId();
5302 /*
5303 * Set the DOCTYPE token's force-quirks flag to
5304 * on.
5305 */
5306 forceQuirks = true;
5307 /*
5308 * Emit that DOCTYPE token.
5309 */
5310 systemIdentifier = longStrBufToString();
5311 emitDoctypeToken(pos);
5312 /*
5313 * Switch to the data state.
5314 */
5315 state = transition(state, Tokenizer.DATA, reconsume, pos);
5316 continue stateloop;
5317 case '\r':
5318 appendLongStrBufCarriageReturn();
5319 break stateloop;
5320 case '\n':
5321 appendLongStrBufLineFeed();
5322 continue;
5323 case '\u0000':
5324 c = '\uFFFD';
5325 // fall thru
5326 default:
5327 /*
5328 * Anything else Append the current input
5329 * character to the current DOCTYPE token's
5330 * system identifier.
5331 */
5332 appendLongStrBuf(c);
5333 /*
5334 * Stay in the DOCTYPE system identifier
5335 * (double-quoted) state.
5336 */
5337 continue;
5338 }
5339 }
5340 // FALLTHRU DON'T REORDER
5341 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
5342 afterdoctypesystemidentifierloop: for (;;) {
5343 if (++pos == endPos) {
5344 break stateloop;
5345 }
5346 c = checkChar(buf, pos);
5347 /*
5348 * Consume the next input character:
5349 */
5350 switch (c) {
5351 case '\r':
5352 silentCarriageReturn();
5353 break stateloop;
5354 case '\n':
5355 silentLineFeed();
5356 // fall thru
5357 case ' ':
5358 case '\t':
5359 case '\u000C':
5360 /*
5361 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5362 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5363 * in the after DOCTYPE system identifier state.
5364 */
5365 continue;
5366 case '>':
5367 /*
5368 * U+003E GREATER-THAN SIGN (>) Emit the current
5369 * DOCTYPE token.
5370 */
5371 emitDoctypeToken(pos);
5372 /*
5373 * Switch to the data state.
5374 */
5375 state = transition(state, Tokenizer.DATA, reconsume, pos);
5376 continue stateloop;
5377 default:
5378 /*
5379 * Switch to the bogus DOCTYPE state. (This does
5380 * not set the DOCTYPE token's force-quirks flag
5381 * to on.)
5382 */
5383 bogusDoctypeWithoutQuirks();
5384 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5385 break afterdoctypesystemidentifierloop;
5386 // continue stateloop;
5387 }
5388 }
5389 // FALLTHRU DON'T REORDER
5390 case BOGUS_DOCTYPE:
5391 for (;;) {
5392 if (reconsume) {
5393 reconsume = false;
5394 } else {
5395 if (++pos == endPos) {
5396 break stateloop;
5397 }
5398 c = checkChar(buf, pos);
5399 }
5400 /*
5401 * Consume the next input character:
5402 */
5403 switch (c) {
5404 case '>':
5405 /*
5406 * U+003E GREATER-THAN SIGN (>) Emit that
5407 * DOCTYPE token.
5408 */
5409 emitDoctypeToken(pos);
5410 /*
5411 * Switch to the data state.
5412 */
5413 state = transition(state, Tokenizer.DATA, reconsume, pos);
5414 continue stateloop;
5415 case '\r':
5416 silentCarriageReturn();
5417 break stateloop;
5418 case '\n':
5419 silentLineFeed();
5420 // fall thru
5421 default:
5422 /*
5423 * Anything else Stay in the bogus DOCTYPE
5424 * state.
5425 */
5426 continue;
5427 }
5428 }
5429 // XXX reorder point
5430 case DOCTYPE_YSTEM:
5431 doctypeystemloop: for (;;) {
5432 if (++pos == endPos) {
5433 break stateloop;
5434 }
5435 c = checkChar(buf, pos);
5436 /*
5437 * Otherwise, if the six characters starting from the
5438 * current input character are an ASCII case-insensitive
5439 * match for the word "SYSTEM", then consume those
5440 * characters and switch to the before DOCTYPE system
5441 * identifier state.
5442 */
5443 if (index < 5) { // YSTEM.length
5444 char folded = c;
5445 if (c >= 'A' && c <= 'Z') {
5446 folded += 0x20;
5447 }
5448 if (folded != Tokenizer.YSTEM[index]) {
5449 bogusDoctype();
5450 reconsume = true;
5451 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5452 continue stateloop;
5453 }
5454 index++;
5455 continue stateloop;
5456 } else {
5457 reconsume = true;
5458 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);
5459 break doctypeystemloop;
5460 // continue stateloop;
5461 }
5462 }
5463 // FALLTHRU DON'T REORDER
5464 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
5465 afterdoctypesystemkeywordloop: for (;;) {
5466 if (reconsume) {
5467 reconsume = false;
5468 } else {
5469 if (++pos == endPos) {
5470 break stateloop;
5471 }
5472 c = checkChar(buf, pos);
5473 }
5474 /*
5475 * Consume the next input character:
5476 */
5477 switch (c) {
5478 case '\r':
5479 silentCarriageReturn();
5480 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5481 break stateloop;
5482 case '\n':
5483 silentLineFeed();
5484 // fall thru
5485 case ' ':
5486 case '\t':
5487 case '\u000C':
5488 /*
5489 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5490 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
5491 * Switch to the before DOCTYPE system
5492 * identifier state.
5493 */
5494 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5495 break afterdoctypesystemkeywordloop;
5496 // FALL THROUGH continue stateloop
5497 case '"':
5498 /*
5499 * U+0022 QUOTATION MARK (") Parse Error.
5500 */
5501 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
5502 /*
5503 * Set the DOCTYPE token's system identifier to
5504 * the empty string (not missing),
5505 */
5506 clearLongStrBuf();
5507 /*
5508 * then switch to the DOCTYPE system identifier
5509 * (double-quoted) state.
5510 */
5511 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5512 continue stateloop;
5513 case '\'':
5514 /*
5515 * U+0027 APOSTROPHE (') Parse Error.
5516 */
5517 errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
5518 /*
5519 * Set the DOCTYPE token's system identifier to
5520 * the empty string (not missing),
5521 */
5522 clearLongStrBuf();
5523 /*
5524 * then switch to the DOCTYPE system identifier
5525 * (single-quoted) state.
5526 */
5527 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5528 continue stateloop;
5529 case '>':
5530 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5531 errExpectedPublicId();
5532 /*
5533 * Set the DOCTYPE token's force-quirks flag to
5534 * on.
5535 */
5536 forceQuirks = true;
5537 /*
5538 * Emit that DOCTYPE token.
5539 */
5540 emitDoctypeToken(pos);
5541 /*
5542 * Switch to the data state.
5543 */
5544 state = transition(state, Tokenizer.DATA, reconsume, pos);
5545 continue stateloop;
5546 default:
5547 bogusDoctype();
5548 /*
5549 * Set the DOCTYPE token's force-quirks flag to
5550 * on.
5551 */
5552 // done by bogusDoctype();
5553 /*
5554 * Switch to the bogus DOCTYPE state.
5555 */
5556 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5557 continue stateloop;
5558 }
5559 }
5560 // FALLTHRU DON'T REORDER
5561 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
5562 beforedoctypesystemidentifierloop: for (;;) {
5563 if (++pos == endPos) {
5564 break stateloop;
5565 }
5566 c = checkChar(buf, pos);
5567 /*
5568 * Consume the next input character:
5569 */
5570 switch (c) {
5571 case '\r':
5572 silentCarriageReturn();
5573 break stateloop;
5574 case '\n':
5575 silentLineFeed();
5576 // fall thru
5577 case ' ':
5578 case '\t':
5579 case '\u000C':
5580 /*
5581 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5582 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5583 * in the before DOCTYPE system identifier
5584 * state.
5585 */
5586 continue;
5587 case '"':
5588 /*
5589 * U+0022 QUOTATION MARK (") Set the DOCTYPE
5590 * token's system identifier to the empty string
5591 * (not missing),
5592 */
5593 clearLongStrBuf();
5594 /*
5595 * then switch to the DOCTYPE system identifier
5596 * (double-quoted) state.
5597 */
5598 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5599 continue stateloop;
5600 case '\'':
5601 /*
5602 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5603 * system identifier to the empty string (not
5604 * missing),
5605 */
5606 clearLongStrBuf();
5607 /*
5608 * then switch to the DOCTYPE system identifier
5609 * (single-quoted) state.
5610 */
5611 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5612 break beforedoctypesystemidentifierloop;
5613 // continue stateloop;
5614 case '>':
5615 /* U+003E GREATER-THAN SIGN (>) Parse error. */
5616 errExpectedSystemId();
5617 /*
5618 * Set the DOCTYPE token's force-quirks flag to
5619 * on.
5620 */
5621 forceQuirks = true;
5622 /*
5623 * Emit that DOCTYPE token.
5624 */
5625 emitDoctypeToken(pos);
5626 /*
5627 * Switch to the data state.
5628 */
5629 state = transition(state, Tokenizer.DATA, reconsume, pos);
5630 continue stateloop;
5631 default:
5632 bogusDoctype();
5633 /*
5634 * Set the DOCTYPE token's force-quirks flag to
5635 * on.
5636 */
5637 // done by bogusDoctype();
5638 /*
5639 * Switch to the bogus DOCTYPE state.
5640 */
5641 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5642 continue stateloop;
5643 }
5644 }
5645 // FALLTHRU DON'T REORDER
5646 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
5647 for (;;) {
5648 if (++pos == endPos) {
5649 break stateloop;
5650 }
5651 c = checkChar(buf, pos);
5652 /*
5653 * Consume the next input character:
5654 */
5655 switch (c) {
5656 case '\'':
5657 /*
5658 * U+0027 APOSTROPHE (') Switch to the after
5659 * DOCTYPE system identifier state.
5660 */
5661 systemIdentifier = longStrBufToString();
5662 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5663 continue stateloop;
5664 case '>':
5665 errGtInSystemId();
5666 /*
5667 * Set the DOCTYPE token's force-quirks flag to
5668 * on.
5669 */
5670 forceQuirks = true;
5671 /*
5672 * Emit that DOCTYPE token.
5673 */
5674 systemIdentifier = longStrBufToString();
5675 emitDoctypeToken(pos);
5676 /*
5677 * Switch to the data state.
5678 */
5679 state = transition(state, Tokenizer.DATA, reconsume, pos);
5680 continue stateloop;
5681 case '\r':
5682 appendLongStrBufCarriageReturn();
5683 break stateloop;
5684 case '\n':
5685 appendLongStrBufLineFeed();
5686 continue;
5687 case '\u0000':
5688 c = '\uFFFD';
5689 // fall thru
5690 default:
5691 /*
5692 * Anything else Append the current input
5693 * character to the current DOCTYPE token's
5694 * system identifier.
5695 */
5696 appendLongStrBuf(c);
5697 /*
5698 * Stay in the DOCTYPE system identifier
5699 * (single-quoted) state.
5700 */
5701 continue;
5702 }
5703 }
5704 // XXX reorder point
5705 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
5706 for (;;) {
5707 if (++pos == endPos) {
5708 break stateloop;
5709 }
5710 c = checkChar(buf, pos);
5711 /*
5712 * Consume the next input character:
5713 */
5714 switch (c) {
5715 case '\'':
5716 /*
5717 * U+0027 APOSTROPHE (') Switch to the after
5718 * DOCTYPE public identifier state.
5719 */
5720 publicIdentifier = longStrBufToString();
5721 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5722 continue stateloop;
5723 case '>':
5724 errGtInPublicId();
5725 /*
5726 * Set the DOCTYPE token's force-quirks flag to
5727 * on.
5728 */
5729 forceQuirks = true;
5730 /*
5731 * Emit that DOCTYPE token.
5732 */
5733 publicIdentifier = longStrBufToString();
5734 emitDoctypeToken(pos);
5735 /*
5736 * Switch to the data state.
5737 */
5738 state = transition(state, Tokenizer.DATA, reconsume, pos);
5739 continue stateloop;
5740 case '\r':
5741 appendLongStrBufCarriageReturn();
5742 break stateloop;
5743 case '\n':
5744 appendLongStrBufLineFeed();
5745 continue;
5746 case '\u0000':
5747 c = '\uFFFD';
5748 // fall thru
5749 default:
5750 /*
5751 * Anything else Append the current input
5752 * character to the current DOCTYPE token's
5753 * public identifier.
5754 */
5755 appendLongStrBuf(c);
5756 /*
5757 * Stay in the DOCTYPE public identifier
5758 * (single-quoted) state.
5759 */
5760 continue;
5761 }
5762 }
5763 // XXX reorder point
5764 case PROCESSING_INSTRUCTION:
5765 processinginstructionloop: for (;;) {
5766 if (++pos == endPos) {
5767 break stateloop;
5768 }
5769 c = checkChar(buf, pos);
5770 switch (c) {
5771 case '?':
5772 state = transition(
5773 state,
5774 Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,
5775 reconsume, pos);
5776 break processinginstructionloop;
5777 // continue stateloop;
5778 default:
5779 continue;
5780 }
5781 }
5782 case PROCESSING_INSTRUCTION_QUESTION_MARK:
5783 if (++pos == endPos) {
5784 break stateloop;
5785 }
5786 c = checkChar(buf, pos);
5787 switch (c) {
5788 case '>':
5789 state = transition(state, Tokenizer.DATA,
5790 reconsume, pos);
5791 continue stateloop;
5792 default:
5793 state = transition(state,
5794 Tokenizer.PROCESSING_INSTRUCTION,
5795 reconsume, pos);
5796 continue stateloop;
5797 }
5798 // END HOTSPOT WORKAROUND
5799 }
5800 }
5801 flushChars(buf, pos);
5802 /*
5803 * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
5804 */
5805 // Save locals
5806 stateSave = state;
5807 returnStateSave = returnState;
5808 return pos;
5809 }
5811 // HOTSPOT WORKAROUND INSERTION POINT
5813 // [NOCPP[
5815 protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
5816 return to;
5817 }
5819 // ]NOCPP]
5821 private void initDoctypeFields() {
5822 doctypeName = "";
5823 if (systemIdentifier != null) {
5824 Portability.releaseString(systemIdentifier);
5825 systemIdentifier = null;
5826 }
5827 if (publicIdentifier != null) {
5828 Portability.releaseString(publicIdentifier);
5829 publicIdentifier = null;
5830 }
5831 forceQuirks = false;
5832 }
5834 @Inline private void adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn()
5835 throws SAXException {
5836 silentCarriageReturn();
5837 adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
5838 }
5840 @Inline private void adjustDoubleHyphenAndAppendToLongStrBufLineFeed()
5841 throws SAXException {
5842 silentLineFeed();
5843 adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
5844 }
5846 @Inline private void appendLongStrBufLineFeed() {
5847 silentLineFeed();
5848 appendLongStrBuf('\n');
5849 }
5851 @Inline private void appendLongStrBufCarriageReturn() {
5852 silentCarriageReturn();
5853 appendLongStrBuf('\n');
5854 }
5856 @Inline protected void silentCarriageReturn() {
5857 ++line;
5858 lastCR = true;
5859 }
5861 @Inline protected void silentLineFeed() {
5862 ++line;
5863 }
5865 private void emitCarriageReturn(@NoLength char[] buf, int pos)
5866 throws SAXException {
5867 silentCarriageReturn();
5868 flushChars(buf, pos);
5869 tokenHandler.characters(Tokenizer.LF, 0, 1);
5870 cstart = Integer.MAX_VALUE;
5871 }
5873 private void emitReplacementCharacter(@NoLength char[] buf, int pos)
5874 throws SAXException {
5875 flushChars(buf, pos);
5876 tokenHandler.zeroOriginatingReplacementCharacter();
5877 cstart = pos + 1;
5878 }
5880 private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
5881 throws SAXException {
5882 flushChars(buf, pos);
5883 tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
5884 cstart = pos + 1;
5885 }
5887 private void setAdditionalAndRememberAmpersandLocation(char add) {
5888 additional = add;
5889 // [NOCPP[
5890 ampersandLocation = new LocatorImpl(this);
5891 // ]NOCPP]
5892 }
5894 private void bogusDoctype() throws SAXException {
5895 errBogusDoctype();
5896 forceQuirks = true;
5897 }
5899 private void bogusDoctypeWithoutQuirks() throws SAXException {
5900 errBogusDoctype();
5901 forceQuirks = false;
5902 }
5904 private void emitOrAppendStrBuf(int returnState) throws SAXException {
5905 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
5906 appendStrBufToLongStrBuf();
5907 } else {
5908 emitStrBuf();
5909 }
5910 }
5912 private void handleNcrValue(int returnState) throws SAXException {
5913 /*
5914 * If one or more characters match the range, then take them all and
5915 * interpret the string of characters as a number (either hexadecimal or
5916 * decimal as appropriate).
5917 */
5918 if (value <= 0xFFFF) {
5919 if (value >= 0x80 && value <= 0x9f) {
5920 /*
5921 * If that number is one of the numbers in the first column of
5922 * the following table, then this is a parse error.
5923 */
5924 errNcrInC1Range();
5925 /*
5926 * Find the row with that number in the first column, and return
5927 * a character token for the Unicode character given in the
5928 * second column of that row.
5929 */
5930 @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
5931 emitOrAppendOne(val, returnState);
5932 // [NOCPP[
5933 } else if (value == 0xC
5934 && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
5935 if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
5936 emitOrAppendOne(Tokenizer.SPACE, returnState);
5937 } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
5938 fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
5939 }
5940 // ]NOCPP]
5941 } else if (value == 0x0) {
5942 errNcrZero();
5943 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
5944 } else if ((value & 0xF800) == 0xD800) {
5945 errNcrSurrogate();
5946 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
5947 } else {
5948 /*
5949 * Otherwise, return a character token for the Unicode character
5950 * whose code point is that number.
5951 */
5952 char ch = (char) value;
5953 // [NOCPP[
5954 if (value == 0x0D) {
5955 errNcrCr();
5956 } else if ((value <= 0x0008) || (value == 0x000B)
5957 || (value >= 0x000E && value <= 0x001F)) {
5958 ch = errNcrControlChar(ch);
5959 } else if (value >= 0xFDD0 && value <= 0xFDEF) {
5960 errNcrUnassigned();
5961 } else if ((value & 0xFFFE) == 0xFFFE) {
5962 ch = errNcrNonCharacter(ch);
5963 } else if (value >= 0x007F && value <= 0x009F) {
5964 errNcrControlChar();
5965 } else {
5966 maybeWarnPrivateUse(ch);
5967 }
5968 // ]NOCPP]
5969 bmpChar[0] = ch;
5970 emitOrAppendOne(bmpChar, returnState);
5971 }
5972 } else if (value <= 0x10FFFF) {
5973 // [NOCPP[
5974 maybeWarnPrivateUseAstral();
5975 if ((value & 0xFFFE) == 0xFFFE) {
5976 errAstralNonCharacter(value);
5977 }
5978 // ]NOCPP]
5979 astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
5980 astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
5981 emitOrAppendTwo(astralChar, returnState);
5982 } else {
5983 errNcrOutOfRange();
5984 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
5985 }
5986 }
5988 public void eof() throws SAXException {
5989 int state = stateSave;
5990 int returnState = returnStateSave;
5992 eofloop: for (;;) {
5993 switch (state) {
5994 case SCRIPT_DATA_LESS_THAN_SIGN:
5995 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
5996 /*
5997 * Otherwise, emit a U+003C LESS-THAN SIGN character token
5998 */
5999 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
6000 /*
6001 * and reconsume the current input character in the data
6002 * state.
6003 */
6004 break eofloop;
6005 case TAG_OPEN:
6006 /*
6007 * The behavior of this state depends on the content model
6008 * flag.
6009 */
6010 /*
6011 * Anything else Parse error.
6012 */
6013 errEofAfterLt();
6014 /*
6015 * Emit a U+003C LESS-THAN SIGN character token
6016 */
6017 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
6018 /*
6019 * and reconsume the current input character in the data
6020 * state.
6021 */
6022 break eofloop;
6023 case RAWTEXT_RCDATA_LESS_THAN_SIGN:
6024 /*
6025 * Emit a U+003C LESS-THAN SIGN character token
6026 */
6027 tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
6028 /*
6029 * and reconsume the current input character in the RCDATA
6030 * state.
6031 */
6032 break eofloop;
6033 case NON_DATA_END_TAG_NAME:
6034 /*
6035 * Emit a U+003C LESS-THAN SIGN character token, a U+002F
6036 * SOLIDUS character token,
6037 */
6038 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
6039 /*
6040 * a character token for each of the characters in the
6041 * temporary buffer (in the order they were added to the
6042 * buffer),
6043 */
6044 emitStrBuf();
6045 /*
6046 * and reconsume the current input character in the RCDATA
6047 * state.
6048 */
6049 break eofloop;
6050 case CLOSE_TAG_OPEN:
6051 /* EOF Parse error. */
6052 errEofAfterLt();
6053 /*
6054 * Emit a U+003C LESS-THAN SIGN character token and a U+002F
6055 * SOLIDUS character token.
6056 */
6057 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
6058 /*
6059 * Reconsume the EOF character in the data state.
6060 */
6061 break eofloop;
6062 case TAG_NAME:
6063 /*
6064 * EOF Parse error.
6065 */
6066 errEofInTagName();
6067 /*
6068 * Reconsume the EOF character in the data state.
6069 */
6070 break eofloop;
6071 case BEFORE_ATTRIBUTE_NAME:
6072 case AFTER_ATTRIBUTE_VALUE_QUOTED:
6073 case SELF_CLOSING_START_TAG:
6074 /* EOF Parse error. */
6075 errEofWithoutGt();
6076 /*
6077 * Reconsume the EOF character in the data state.
6078 */
6079 break eofloop;
6080 case ATTRIBUTE_NAME:
6081 /*
6082 * EOF Parse error.
6083 */
6084 errEofInAttributeName();
6085 /*
6086 * Reconsume the EOF character in the data state.
6087 */
6088 break eofloop;
6089 case AFTER_ATTRIBUTE_NAME:
6090 case BEFORE_ATTRIBUTE_VALUE:
6091 /* EOF Parse error. */
6092 errEofWithoutGt();
6093 /*
6094 * Reconsume the EOF character in the data state.
6095 */
6096 break eofloop;
6097 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
6098 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
6099 case ATTRIBUTE_VALUE_UNQUOTED:
6100 /* EOF Parse error. */
6101 errEofInAttributeValue();
6102 /*
6103 * Reconsume the EOF character in the data state.
6104 */
6105 break eofloop;
6106 case BOGUS_COMMENT:
6107 emitComment(0, 0);
6108 break eofloop;
6109 case BOGUS_COMMENT_HYPHEN:
6110 // [NOCPP[
6111 maybeAppendSpaceToBogusComment();
6112 // ]NOCPP]
6113 emitComment(0, 0);
6114 break eofloop;
6115 case MARKUP_DECLARATION_OPEN:
6116 errBogusComment();
6117 clearLongStrBuf();
6118 emitComment(0, 0);
6119 break eofloop;
6120 case MARKUP_DECLARATION_HYPHEN:
6121 errBogusComment();
6122 emitComment(0, 0);
6123 break eofloop;
6124 case MARKUP_DECLARATION_OCTYPE:
6125 if (index < 6) {
6126 errBogusComment();
6127 emitComment(0, 0);
6128 } else {
6129 /* EOF Parse error. */
6130 errEofInDoctype();
6131 /*
6132 * Create a new DOCTYPE token. Set its force-quirks flag
6133 * to on.
6134 */
6135 doctypeName = "";
6136 if (systemIdentifier != null) {
6137 Portability.releaseString(systemIdentifier);
6138 systemIdentifier = null;
6139 }
6140 if (publicIdentifier != null) {
6141 Portability.releaseString(publicIdentifier);
6142 publicIdentifier = null;
6143 }
6144 forceQuirks = true;
6145 /*
6146 * Emit the token.
6147 */
6148 emitDoctypeToken(0);
6149 /*
6150 * Reconsume the EOF character in the data state.
6151 */
6152 break eofloop;
6153 }
6154 break eofloop;
6155 case COMMENT_START:
6156 case COMMENT:
6157 /*
6158 * EOF Parse error.
6159 */
6160 errEofInComment();
6161 /* Emit the comment token. */
6162 emitComment(0, 0);
6163 /*
6164 * Reconsume the EOF character in the data state.
6165 */
6166 break eofloop;
6167 case COMMENT_END:
6168 errEofInComment();
6169 /* Emit the comment token. */
6170 emitComment(2, 0);
6171 /*
6172 * Reconsume the EOF character in the data state.
6173 */
6174 break eofloop;
6175 case COMMENT_END_DASH:
6176 case COMMENT_START_DASH:
6177 errEofInComment();
6178 /* Emit the comment token. */
6179 emitComment(1, 0);
6180 /*
6181 * Reconsume the EOF character in the data state.
6182 */
6183 break eofloop;
6184 case COMMENT_END_BANG:
6185 errEofInComment();
6186 /* Emit the comment token. */
6187 emitComment(3, 0);
6188 /*
6189 * Reconsume the EOF character in the data state.
6190 */
6191 break eofloop;
6192 case DOCTYPE:
6193 case BEFORE_DOCTYPE_NAME:
6194 errEofInDoctype();
6195 /*
6196 * Create a new DOCTYPE token. Set its force-quirks flag to
6197 * on.
6198 */
6199 forceQuirks = true;
6200 /*
6201 * Emit the token.
6202 */
6203 emitDoctypeToken(0);
6204 /*
6205 * Reconsume the EOF character in the data state.
6206 */
6207 break eofloop;
6208 case DOCTYPE_NAME:
6209 errEofInDoctype();
6210 strBufToDoctypeName();
6211 /*
6212 * Set the DOCTYPE token's force-quirks flag to on.
6213 */
6214 forceQuirks = true;
6215 /*
6216 * Emit that DOCTYPE token.
6217 */
6218 emitDoctypeToken(0);
6219 /*
6220 * Reconsume the EOF character in the data state.
6221 */
6222 break eofloop;
6223 case DOCTYPE_UBLIC:
6224 case DOCTYPE_YSTEM:
6225 case AFTER_DOCTYPE_NAME:
6226 case AFTER_DOCTYPE_PUBLIC_KEYWORD:
6227 case AFTER_DOCTYPE_SYSTEM_KEYWORD:
6228 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
6229 errEofInDoctype();
6230 /*
6231 * Set the DOCTYPE token's force-quirks flag to on.
6232 */
6233 forceQuirks = true;
6234 /*
6235 * Emit that DOCTYPE token.
6236 */
6237 emitDoctypeToken(0);
6238 /*
6239 * Reconsume the EOF character in the data state.
6240 */
6241 break eofloop;
6242 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
6243 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
6244 /* EOF Parse error. */
6245 errEofInPublicId();
6246 /*
6247 * Set the DOCTYPE token's force-quirks flag to on.
6248 */
6249 forceQuirks = true;
6250 /*
6251 * Emit that DOCTYPE token.
6252 */
6253 publicIdentifier = longStrBufToString();
6254 emitDoctypeToken(0);
6255 /*
6256 * Reconsume the EOF character in the data state.
6257 */
6258 break eofloop;
6259 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
6260 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
6261 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
6262 errEofInDoctype();
6263 /*
6264 * Set the DOCTYPE token's force-quirks flag to on.
6265 */
6266 forceQuirks = true;
6267 /*
6268 * Emit that DOCTYPE token.
6269 */
6270 emitDoctypeToken(0);
6271 /*
6272 * Reconsume the EOF character in the data state.
6273 */
6274 break eofloop;
6275 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
6276 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
6277 /* EOF Parse error. */
6278 errEofInSystemId();
6279 /*
6280 * Set the DOCTYPE token's force-quirks flag to on.
6281 */
6282 forceQuirks = true;
6283 /*
6284 * Emit that DOCTYPE token.
6285 */
6286 systemIdentifier = longStrBufToString();
6287 emitDoctypeToken(0);
6288 /*
6289 * Reconsume the EOF character in the data state.
6290 */
6291 break eofloop;
6292 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
6293 errEofInDoctype();
6294 /*
6295 * Set the DOCTYPE token's force-quirks flag to on.
6296 */
6297 forceQuirks = true;
6298 /*
6299 * Emit that DOCTYPE token.
6300 */
6301 emitDoctypeToken(0);
6302 /*
6303 * Reconsume the EOF character in the data state.
6304 */
6305 break eofloop;
6306 case BOGUS_DOCTYPE:
6307 /*
6308 * Emit that DOCTYPE token.
6309 */
6310 emitDoctypeToken(0);
6311 /*
6312 * Reconsume the EOF character in the data state.
6313 */
6314 break eofloop;
6315 case CONSUME_CHARACTER_REFERENCE:
6316 /*
6317 * Unlike the definition is the spec, this state does not
6318 * return a value and never requires the caller to
6319 * backtrack. This state takes care of emitting characters
6320 * or appending to the current attribute value. It also
6321 * takes care of that in the case when consuming the entity
6322 * fails.
6323 */
6324 /*
6325 * This section defines how to consume an entity. This
6326 * definition is used when parsing entities in text and in
6327 * attributes.
6328 *
6329 * The behavior depends on the identity of the next
6330 * character (the one immediately after the U+0026 AMPERSAND
6331 * character):
6332 */
6334 emitOrAppendStrBuf(returnState);
6335 state = returnState;
6336 continue;
6337 case CHARACTER_REFERENCE_HILO_LOOKUP:
6338 errNoNamedCharacterMatch();
6339 emitOrAppendStrBuf(returnState);
6340 state = returnState;
6341 continue;
6342 case CHARACTER_REFERENCE_TAIL:
6343 outer: for (;;) {
6344 char c = '\u0000';
6345 entCol++;
6346 /*
6347 * Consume the maximum number of characters possible,
6348 * with the consumed characters matching one of the
6349 * identifiers in the first column of the named
6350 * character references table (in a case-sensitive
6351 * manner).
6352 */
6353 hiloop: for (;;) {
6354 if (hi == -1) {
6355 break hiloop;
6356 }
6357 if (entCol == NamedCharacters.NAMES[hi].length()) {
6358 break hiloop;
6359 }
6360 if (entCol > NamedCharacters.NAMES[hi].length()) {
6361 break outer;
6362 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
6363 hi--;
6364 } else {
6365 break hiloop;
6366 }
6367 }
6369 loloop: for (;;) {
6370 if (hi < lo) {
6371 break outer;
6372 }
6373 if (entCol == NamedCharacters.NAMES[lo].length()) {
6374 candidate = lo;
6375 strBufMark = strBufLen;
6376 lo++;
6377 } else if (entCol > NamedCharacters.NAMES[lo].length()) {
6378 break outer;
6379 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
6380 lo++;
6381 } else {
6382 break loloop;
6383 }
6384 }
6385 if (hi < lo) {
6386 break outer;
6387 }
6388 continue;
6389 }
6391 if (candidate == -1) {
6392 /*
6393 * If no match can be made, then this is a parse error.
6394 */
6395 errNoNamedCharacterMatch();
6396 emitOrAppendStrBuf(returnState);
6397 state = returnState;
6398 continue eofloop;
6399 } else {
6400 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
6401 if (candidateName.length() == 0
6402 || candidateName.charAt(candidateName.length() - 1) != ';') {
6403 /*
6404 * If the last character matched is not a U+003B
6405 * SEMICOLON (;), there is a parse error.
6406 */
6407 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6408 /*
6409 * If the entity is being consumed as part of an
6410 * attribute, and the last character matched is
6411 * not a U+003B SEMICOLON (;),
6412 */
6413 char ch;
6414 if (strBufMark == strBufLen) {
6415 ch = '\u0000';
6416 } else {
6417 ch = strBuf[strBufMark];
6418 }
6419 if ((ch >= '0' && ch <= '9')
6420 || (ch >= 'A' && ch <= 'Z')
6421 || (ch >= 'a' && ch <= 'z')) {
6422 /*
6423 * and the next character is in the range
6424 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
6425 * U+0041 LATIN CAPITAL LETTER A to U+005A
6426 * LATIN CAPITAL LETTER Z, or U+0061 LATIN
6427 * SMALL LETTER A to U+007A LATIN SMALL
6428 * LETTER Z, then, for historical reasons,
6429 * all the characters that were matched
6430 * after the U+0026 AMPERSAND (&) must be
6431 * unconsumed, and nothing is returned.
6432 */
6433 errNoNamedCharacterMatch();
6434 appendStrBufToLongStrBuf();
6435 state = returnState;
6436 continue eofloop;
6437 }
6438 }
6439 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6440 errUnescapedAmpersandInterpretedAsCharacterReference();
6441 } else {
6442 errNotSemicolonTerminated();
6443 }
6444 }
6446 /*
6447 * Otherwise, return a character token for the character
6448 * corresponding to the entity name (as given by the
6449 * second column of the named character references
6450 * table).
6451 */
6452 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
6453 if (
6454 // [NOCPP[
6455 val.length == 1
6456 // ]NOCPP]
6457 // CPPONLY: val[1] == 0
6458 ) {
6459 emitOrAppendOne(val, returnState);
6460 } else {
6461 emitOrAppendTwo(val, returnState);
6462 }
6463 // this is so complicated!
6464 if (strBufMark < strBufLen) {
6465 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6466 for (int i = strBufMark; i < strBufLen; i++) {
6467 appendLongStrBuf(strBuf[i]);
6468 }
6469 } else {
6470 tokenHandler.characters(strBuf, strBufMark,
6471 strBufLen - strBufMark);
6472 }
6473 }
6474 state = returnState;
6475 continue eofloop;
6476 /*
6477 * If the markup contains I'm ¬it; I tell you, the
6478 * entity is parsed as "not", as in, I'm ¬it; I tell
6479 * you. But if the markup was I'm ∉ I tell you,
6480 * the entity would be parsed as "notin;", resulting in
6481 * I'm ∉ I tell you.
6482 */
6483 }
6484 case CONSUME_NCR:
6485 case DECIMAL_NRC_LOOP:
6486 case HEX_NCR_LOOP:
6487 /*
6488 * If no characters match the range, then don't consume any
6489 * characters (and unconsume the U+0023 NUMBER SIGN
6490 * character and, if appropriate, the X character). This is
6491 * a parse error; nothing is returned.
6492 *
6493 * Otherwise, if the next character is a U+003B SEMICOLON,
6494 * consume that too. If it isn't, there is a parse error.
6495 */
6496 if (!seenDigits) {
6497 errNoDigitsInNCR();
6498 emitOrAppendStrBuf(returnState);
6499 state = returnState;
6500 continue;
6501 } else {
6502 errCharRefLacksSemicolon();
6503 }
6504 // WARNING previous state sets reconsume
6505 handleNcrValue(returnState);
6506 state = returnState;
6507 continue;
6508 case CDATA_RSQB:
6509 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
6510 break eofloop;
6511 case CDATA_RSQB_RSQB:
6512 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
6513 break eofloop;
6514 case DATA:
6515 default:
6516 break eofloop;
6517 }
6518 }
6519 // case DATA:
6520 /*
6521 * EOF Emit an end-of-file token.
6522 */
6523 tokenHandler.eof();
6524 return;
6525 }
6527 private void emitDoctypeToken(int pos) throws SAXException {
6528 cstart = pos + 1;
6529 tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
6530 forceQuirks);
6531 // It is OK and sufficient to release these here, since
6532 // there's no way out of the doctype states than through paths
6533 // that call this method.
6534 doctypeName = null;
6535 Portability.releaseString(publicIdentifier);
6536 publicIdentifier = null;
6537 Portability.releaseString(systemIdentifier);
6538 systemIdentifier = null;
6539 }
6541 @Inline protected char checkChar(@NoLength char[] buf, int pos)
6542 throws SAXException {
6543 return buf[pos];
6544 }
6546 public boolean internalEncodingDeclaration(String internalCharset)
6547 throws SAXException {
6548 if (encodingDeclarationHandler != null) {
6549 return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
6550 }
6551 return false;
6552 }
6554 /**
6555 * @param val
6556 * @throws SAXException
6557 */
6558 private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
6559 throws SAXException {
6560 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6561 appendLongStrBuf(val[0]);
6562 appendLongStrBuf(val[1]);
6563 } else {
6564 tokenHandler.characters(val, 0, 2);
6565 }
6566 }
6568 private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
6569 throws SAXException {
6570 if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6571 appendLongStrBuf(val[0]);
6572 } else {
6573 tokenHandler.characters(val, 0, 1);
6574 }
6575 }
6577 public void end() throws SAXException {
6578 strBuf = null;
6579 longStrBuf = null;
6580 doctypeName = null;
6581 if (systemIdentifier != null) {
6582 Portability.releaseString(systemIdentifier);
6583 systemIdentifier = null;
6584 }
6585 if (publicIdentifier != null) {
6586 Portability.releaseString(publicIdentifier);
6587 publicIdentifier = null;
6588 }
6589 if (tagName != null) {
6590 tagName.release();
6591 tagName = null;
6592 }
6593 if (attributeName != null) {
6594 attributeName.release();
6595 attributeName = null;
6596 }
6597 tokenHandler.endTokenization();
6598 if (attributes != null) {
6599 // [NOCPP[
6600 attributes = null;
6601 // ]NOCPP]
6602 // CPPONLY: attributes.clear(mappingLangToXmlLang);
6603 }
6604 }
6606 public void requestSuspension() {
6607 shouldSuspend = true;
6608 }
6610 // [NOCPP[
6612 public void becomeConfident() {
6613 confident = true;
6614 }
6616 /**
6617 * Returns the nextCharOnNewLine.
6618 *
6619 * @return the nextCharOnNewLine
6620 */
6621 public boolean isNextCharOnNewLine() {
6622 return false;
6623 }
6625 public boolean isPrevCR() {
6626 return lastCR;
6627 }
6629 /**
6630 * Returns the line.
6631 *
6632 * @return the line
6633 */
6634 public int getLine() {
6635 return -1;
6636 }
6638 /**
6639 * Returns the col.
6640 *
6641 * @return the col
6642 */
6643 public int getCol() {
6644 return -1;
6645 }
6647 // ]NOCPP]
6649 public boolean isInDataState() {
6650 return (stateSave == DATA);
6651 }
6653 public void resetToDataState() {
6654 strBufLen = 0;
6655 longStrBufLen = 0;
6656 stateSave = Tokenizer.DATA;
6657 // line = 1; XXX line numbers
6658 lastCR = false;
6659 index = 0;
6660 forceQuirks = false;
6661 additional = '\u0000';
6662 entCol = -1;
6663 firstCharKey = -1;
6664 lo = 0;
6665 hi = 0; // will always be overwritten before use anyway
6666 candidate = -1;
6667 strBufMark = 0;
6668 prevValue = -1;
6669 value = 0;
6670 seenDigits = false;
6671 endTag = false;
6672 shouldSuspend = false;
6673 initDoctypeFields();
6674 if (tagName != null) {
6675 tagName.release();
6676 tagName = null;
6677 }
6678 if (attributeName != null) {
6679 attributeName.release();
6680 attributeName = null;
6681 }
6682 if (newAttributesEachTime) {
6683 if (attributes != null) {
6684 Portability.delete(attributes);
6685 attributes = null;
6686 }
6687 }
6688 }
6690 public void loadState(Tokenizer other) throws SAXException {
6691 strBufLen = other.strBufLen;
6692 if (strBufLen > strBuf.length) {
6693 strBuf = new char[strBufLen];
6694 }
6695 System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);
6697 longStrBufLen = other.longStrBufLen;
6698 if (longStrBufLen > longStrBuf.length) {
6699 longStrBuf = new char[longStrBufLen];
6700 }
6701 System.arraycopy(other.longStrBuf, 0, longStrBuf, 0, longStrBufLen);
6703 stateSave = other.stateSave;
6704 returnStateSave = other.returnStateSave;
6705 endTagExpectation = other.endTagExpectation;
6706 endTagExpectationAsArray = other.endTagExpectationAsArray;
6707 // line = 1; XXX line numbers
6708 lastCR = other.lastCR;
6709 index = other.index;
6710 forceQuirks = other.forceQuirks;
6711 additional = other.additional;
6712 entCol = other.entCol;
6713 firstCharKey = other.firstCharKey;
6714 lo = other.lo;
6715 hi = other.hi;
6716 candidate = other.candidate;
6717 strBufMark = other.strBufMark;
6718 prevValue = other.prevValue;
6719 value = other.value;
6720 seenDigits = other.seenDigits;
6721 endTag = other.endTag;
6722 shouldSuspend = false;
6724 if (other.doctypeName == null) {
6725 doctypeName = null;
6726 } else {
6727 doctypeName = Portability.newLocalFromLocal(other.doctypeName,
6728 interner);
6729 }
6731 Portability.releaseString(systemIdentifier);
6732 if (other.systemIdentifier == null) {
6733 systemIdentifier = null;
6734 } else {
6735 systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
6736 }
6738 Portability.releaseString(publicIdentifier);
6739 if (other.publicIdentifier == null) {
6740 publicIdentifier = null;
6741 } else {
6742 publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
6743 }
6745 if (tagName != null) {
6746 tagName.release();
6747 }
6748 if (other.tagName == null) {
6749 tagName = null;
6750 } else {
6751 tagName = other.tagName.cloneElementName(interner);
6752 }
6754 if (attributeName != null) {
6755 attributeName.release();
6756 }
6757 if (other.attributeName == null) {
6758 attributeName = null;
6759 } else {
6760 attributeName = other.attributeName.cloneAttributeName(interner);
6761 }
6763 Portability.delete(attributes);
6764 if (other.attributes == null) {
6765 attributes = null;
6766 } else {
6767 attributes = other.attributes.cloneAttributes(interner);
6768 }
6769 }
6771 public void initializeWithoutStarting() throws SAXException {
6772 confident = false;
6773 strBuf = new char[64];
6774 longStrBuf = new char[1024];
6775 line = 1;
6776 // [NOCPP[
6777 html4 = false;
6778 metaBoundaryPassed = false;
6779 wantsComments = tokenHandler.wantsComments();
6780 if (!newAttributesEachTime) {
6781 attributes = new HtmlAttributes(mappingLangToXmlLang);
6782 }
6783 // ]NOCPP]
6784 resetToDataState();
6785 }
6787 protected void errGarbageAfterLtSlash() throws SAXException {
6788 }
6790 protected void errLtSlashGt() throws SAXException {
6791 }
6793 protected void errWarnLtSlashInRcdata() throws SAXException {
6794 }
6796 protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
6797 }
6799 protected void errCharRefLacksSemicolon() throws SAXException {
6800 }
6802 protected void errNoDigitsInNCR() throws SAXException {
6803 }
6805 protected void errGtInSystemId() throws SAXException {
6806 }
6808 protected void errGtInPublicId() throws SAXException {
6809 }
6811 protected void errNamelessDoctype() throws SAXException {
6812 }
6814 protected void errConsecutiveHyphens() throws SAXException {
6815 }
6817 protected void errPrematureEndOfComment() throws SAXException {
6818 }
6820 protected void errBogusComment() throws SAXException {
6821 }
6823 protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
6824 }
6826 protected void errSlashNotFollowedByGt() throws SAXException {
6827 }
6829 protected void errHtml4XmlVoidSyntax() throws SAXException {
6830 }
6832 protected void errNoSpaceBetweenAttributes() throws SAXException {
6833 }
6835 protected void errHtml4NonNameInUnquotedAttribute(char c)
6836 throws SAXException {
6837 }
6839 protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
6840 throws SAXException {
6841 }
6843 protected void errAttributeValueMissing() throws SAXException {
6844 }
6846 protected void errBadCharBeforeAttributeNameOrNull(char c)
6847 throws SAXException {
6848 }
6850 protected void errEqualsSignBeforeAttributeName() throws SAXException {
6851 }
6853 protected void errBadCharAfterLt(char c) throws SAXException {
6854 }
6856 protected void errLtGt() throws SAXException {
6857 }
6859 protected void errProcessingInstruction() throws SAXException {
6860 }
6862 protected void errUnescapedAmpersandInterpretedAsCharacterReference()
6863 throws SAXException {
6864 }
6866 protected void errNotSemicolonTerminated() throws SAXException {
6867 }
6869 protected void errNoNamedCharacterMatch() throws SAXException {
6870 }
6872 protected void errQuoteBeforeAttributeName(char c) throws SAXException {
6873 }
6875 protected void errQuoteOrLtInAttributeNameOrNull(char c)
6876 throws SAXException {
6877 }
6879 protected void errExpectedPublicId() throws SAXException {
6880 }
6882 protected void errBogusDoctype() throws SAXException {
6883 }
6885 protected void maybeWarnPrivateUseAstral() throws SAXException {
6886 }
6888 protected void maybeWarnPrivateUse(char ch) throws SAXException {
6889 }
6891 protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
6892 throws SAXException {
6893 }
6895 protected void maybeErrSlashInEndTag(boolean selfClosing)
6896 throws SAXException {
6897 }
6899 protected char errNcrNonCharacter(char ch) throws SAXException {
6900 return ch;
6901 }
6903 protected void errAstralNonCharacter(int ch) throws SAXException {
6904 }
6906 protected void errNcrSurrogate() throws SAXException {
6907 }
6909 protected char errNcrControlChar(char ch) throws SAXException {
6910 return ch;
6911 }
6913 protected void errNcrCr() throws SAXException {
6914 }
6916 protected void errNcrInC1Range() throws SAXException {
6917 }
6919 protected void errEofInPublicId() throws SAXException {
6920 }
6922 protected void errEofInComment() throws SAXException {
6923 }
6925 protected void errEofInDoctype() throws SAXException {
6926 }
6928 protected void errEofInAttributeValue() throws SAXException {
6929 }
6931 protected void errEofInAttributeName() throws SAXException {
6932 }
6934 protected void errEofWithoutGt() throws SAXException {
6935 }
6937 protected void errEofInTagName() throws SAXException {
6938 }
6940 protected void errEofInEndTag() throws SAXException {
6941 }
6943 protected void errEofAfterLt() throws SAXException {
6944 }
6946 protected void errNcrOutOfRange() throws SAXException {
6947 }
6949 protected void errNcrUnassigned() throws SAXException {
6950 }
6952 protected void errDuplicateAttribute() throws SAXException {
6953 }
6955 protected void errEofInSystemId() throws SAXException {
6956 }
6958 protected void errExpectedSystemId() throws SAXException {
6959 }
6961 protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
6962 }
6964 protected void errHyphenHyphenBang() throws SAXException {
6965 }
6967 protected void errNcrControlChar() throws SAXException {
6968 }
6970 protected void errNcrZero() throws SAXException {
6971 }
6973 protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
6974 throws SAXException {
6975 }
6977 protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
6978 }
6980 protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
6981 throws SAXException {
6982 }
6984 protected void noteAttributeWithoutValue() throws SAXException {
6985 }
6987 protected void noteUnquotedAttributeValue() throws SAXException {
6988 }
6990 /**
6991 * Sets the encodingDeclarationHandler.
6992 *
6993 * @param encodingDeclarationHandler
6994 * the encodingDeclarationHandler to set
6995 */
6996 public void setEncodingDeclarationHandler(
6997 EncodingDeclarationHandler encodingDeclarationHandler) {
6998 this.encodingDeclarationHandler = encodingDeclarationHandler;
6999 }
7001 void destructor() {
7002 // The translator will write refcount tracing stuff here
7003 Portability.delete(attributes);
7004 attributes = null;
7005 }
7007 // [NOCPP[
7009 /**
7010 * Sets an offset to be added to the position reported to
7011 * <code>TransitionHandler</code>.
7012 *
7013 * @param offset the offset
7014 */
7015 public void setTransitionBaseOffset(int offset) {
7017 }
7019 // ]NOCPP]
7021 }