|
1 /* |
|
2 * Copyright (c) 2005-2007 Henri Sivonen |
|
3 * Copyright (c) 2007-2013 Mozilla Foundation |
|
4 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla |
|
5 * Foundation, and Opera Software ASA. |
|
6 * |
|
7 * Permission is hereby granted, free of charge, to any person obtaining a |
|
8 * copy of this software and associated documentation files (the "Software"), |
|
9 * to deal in the Software without restriction, including without limitation |
|
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
|
11 * and/or sell copies of the Software, and to permit persons to whom the |
|
12 * Software is furnished to do so, subject to the following conditions: |
|
13 * |
|
14 * The above copyright notice and this permission notice shall be included in |
|
15 * all copies or substantial portions of the Software. |
|
16 * |
|
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
|
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|
23 * DEALINGS IN THE SOFTWARE. |
|
24 */ |
|
25 |
|
26 /* |
|
27 * The comments following this one that use the same comment syntax as this |
|
28 * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 |
|
29 * amended as of June 18 2008 and May 31 2010. |
|
30 * That document came with this statement: |
|
31 * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and |
|
32 * Opera Software ASA. You are granted a license to use, reproduce and |
|
33 * create derivative works of this document." |
|
34 */ |
|
35 |
|
36 package nu.validator.htmlparser.impl; |
|
37 |
|
38 import nu.validator.htmlparser.annotation.Auto; |
|
39 import nu.validator.htmlparser.annotation.CharacterName; |
|
40 import nu.validator.htmlparser.annotation.Const; |
|
41 import nu.validator.htmlparser.annotation.Inline; |
|
42 import nu.validator.htmlparser.annotation.Local; |
|
43 import nu.validator.htmlparser.annotation.NoLength; |
|
44 import nu.validator.htmlparser.common.EncodingDeclarationHandler; |
|
45 import nu.validator.htmlparser.common.Interner; |
|
46 import nu.validator.htmlparser.common.TokenHandler; |
|
47 import nu.validator.htmlparser.common.XmlViolationPolicy; |
|
48 |
|
49 import org.xml.sax.ErrorHandler; |
|
50 import org.xml.sax.Locator; |
|
51 import org.xml.sax.SAXException; |
|
52 import org.xml.sax.SAXParseException; |
|
53 |
|
54 /** |
|
55 * An implementation of |
|
56 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html |
|
57 * |
|
58 * This class implements the <code>Locator</code> interface. This is not an |
|
59 * incidental implementation detail: Users of this class are encouraged to make |
|
60 * use of the <code>Locator</code> nature. |
|
61 * |
|
62 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer |
|
63 * can be configured to treat these conditions as fatal or to coerce the infoset |
|
64 * to something that XML 1.0 allows. |
|
65 * |
|
66 * @version $Id$ |
|
67 * @author hsivonen |
|
68 */ |
|
69 public class Tokenizer implements Locator { |
|
70 |
|
71 private static final int DATA_AND_RCDATA_MASK = ~1; |
|
72 |
|
73 public static final int DATA = 0; |
|
74 |
|
75 public static final int RCDATA = 1; |
|
76 |
|
77 public static final int SCRIPT_DATA = 2; |
|
78 |
|
79 public static final int RAWTEXT = 3; |
|
80 |
|
81 public static final int SCRIPT_DATA_ESCAPED = 4; |
|
82 |
|
83 public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5; |
|
84 |
|
85 public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6; |
|
86 |
|
87 public static final int ATTRIBUTE_VALUE_UNQUOTED = 7; |
|
88 |
|
89 public static final int PLAINTEXT = 8; |
|
90 |
|
91 public static final int TAG_OPEN = 9; |
|
92 |
|
93 public static final int CLOSE_TAG_OPEN = 10; |
|
94 |
|
95 public static final int TAG_NAME = 11; |
|
96 |
|
97 public static final int BEFORE_ATTRIBUTE_NAME = 12; |
|
98 |
|
99 public static final int ATTRIBUTE_NAME = 13; |
|
100 |
|
101 public static final int AFTER_ATTRIBUTE_NAME = 14; |
|
102 |
|
103 public static final int BEFORE_ATTRIBUTE_VALUE = 15; |
|
104 |
|
105 public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16; |
|
106 |
|
107 public static final int BOGUS_COMMENT = 17; |
|
108 |
|
109 public static final int MARKUP_DECLARATION_OPEN = 18; |
|
110 |
|
111 public static final int DOCTYPE = 19; |
|
112 |
|
113 public static final int BEFORE_DOCTYPE_NAME = 20; |
|
114 |
|
115 public static final int DOCTYPE_NAME = 21; |
|
116 |
|
117 public static final int AFTER_DOCTYPE_NAME = 22; |
|
118 |
|
119 public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23; |
|
120 |
|
121 public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24; |
|
122 |
|
123 public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25; |
|
124 |
|
125 public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26; |
|
126 |
|
127 public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27; |
|
128 |
|
129 public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28; |
|
130 |
|
131 public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29; |
|
132 |
|
133 public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30; |
|
134 |
|
135 public static final int BOGUS_DOCTYPE = 31; |
|
136 |
|
137 public static final int COMMENT_START = 32; |
|
138 |
|
139 public static final int COMMENT_START_DASH = 33; |
|
140 |
|
141 public static final int COMMENT = 34; |
|
142 |
|
143 public static final int COMMENT_END_DASH = 35; |
|
144 |
|
145 public static final int COMMENT_END = 36; |
|
146 |
|
147 public static final int COMMENT_END_BANG = 37; |
|
148 |
|
149 public static final int NON_DATA_END_TAG_NAME = 38; |
|
150 |
|
151 public static final int MARKUP_DECLARATION_HYPHEN = 39; |
|
152 |
|
153 public static final int MARKUP_DECLARATION_OCTYPE = 40; |
|
154 |
|
155 public static final int DOCTYPE_UBLIC = 41; |
|
156 |
|
157 public static final int DOCTYPE_YSTEM = 42; |
|
158 |
|
159 public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43; |
|
160 |
|
161 public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44; |
|
162 |
|
163 public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45; |
|
164 |
|
165 public static final int CONSUME_CHARACTER_REFERENCE = 46; |
|
166 |
|
167 public static final int CONSUME_NCR = 47; |
|
168 |
|
169 public static final int CHARACTER_REFERENCE_TAIL = 48; |
|
170 |
|
171 public static final int HEX_NCR_LOOP = 49; |
|
172 |
|
173 public static final int DECIMAL_NRC_LOOP = 50; |
|
174 |
|
175 public static final int HANDLE_NCR_VALUE = 51; |
|
176 |
|
177 public static final int HANDLE_NCR_VALUE_RECONSUME = 52; |
|
178 |
|
179 public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53; |
|
180 |
|
181 public static final int SELF_CLOSING_START_TAG = 54; |
|
182 |
|
183 public static final int CDATA_START = 55; |
|
184 |
|
185 public static final int CDATA_SECTION = 56; |
|
186 |
|
187 public static final int CDATA_RSQB = 57; |
|
188 |
|
189 public static final int CDATA_RSQB_RSQB = 58; |
|
190 |
|
191 public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59; |
|
192 |
|
193 public static final int SCRIPT_DATA_ESCAPE_START = 60; |
|
194 |
|
195 public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61; |
|
196 |
|
197 public static final int SCRIPT_DATA_ESCAPED_DASH = 62; |
|
198 |
|
199 public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63; |
|
200 |
|
201 public static final int BOGUS_COMMENT_HYPHEN = 64; |
|
202 |
|
203 public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65; |
|
204 |
|
205 public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66; |
|
206 |
|
207 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67; |
|
208 |
|
209 public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68; |
|
210 |
|
211 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69; |
|
212 |
|
213 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70; |
|
214 |
|
215 public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71; |
|
216 |
|
217 public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72; |
|
218 |
|
219 public static final int PROCESSING_INSTRUCTION = 73; |
|
220 |
|
221 public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74; |
|
222 |
|
223 /** |
|
224 * Magic value for UTF-16 operations. |
|
225 */ |
|
226 private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10)); |
|
227 |
|
228 /** |
|
229 * UTF-16 code unit array containing less than and greater than for emitting |
|
230 * those characters on certain parse errors. |
|
231 */ |
|
232 private static final @NoLength char[] LT_GT = { '<', '>' }; |
|
233 |
|
234 /** |
|
235 * UTF-16 code unit array containing less than and solidus for emitting |
|
236 * those characters on certain parse errors. |
|
237 */ |
|
238 private static final @NoLength char[] LT_SOLIDUS = { '<', '/' }; |
|
239 |
|
240 /** |
|
241 * UTF-16 code unit array containing ]] for emitting those characters on |
|
242 * state transitions. |
|
243 */ |
|
244 private static final @NoLength char[] RSQB_RSQB = { ']', ']' }; |
|
245 |
|
246 /** |
|
247 * Array version of U+FFFD. |
|
248 */ |
|
249 private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' }; |
|
250 |
|
251 // [NOCPP[ |
|
252 |
|
253 /** |
|
254 * Array version of space. |
|
255 */ |
|
256 private static final @NoLength char[] SPACE = { ' ' }; |
|
257 |
|
258 // ]NOCPP] |
|
259 |
|
260 /** |
|
261 * Array version of line feed. |
|
262 */ |
|
263 private static final @NoLength char[] LF = { '\n' }; |
|
264 |
|
265 /** |
|
266 * Buffer growth parameter. |
|
267 */ |
|
268 private static final int BUFFER_GROW_BY = 1024; |
|
269 |
|
270 /** |
|
271 * "CDATA[" as <code>char[]</code> |
|
272 */ |
|
273 private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T', |
|
274 'A', '[' }; |
|
275 |
|
276 /** |
|
277 * "octype" as <code>char[]</code> |
|
278 */ |
|
279 private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p', |
|
280 'e' }; |
|
281 |
|
282 /** |
|
283 * "ublic" as <code>char[]</code> |
|
284 */ |
|
285 private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' }; |
|
286 |
|
287 /** |
|
288 * "ystem" as <code>char[]</code> |
|
289 */ |
|
290 private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' }; |
|
291 |
|
292 private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' }; |
|
293 |
|
294 private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' }; |
|
295 |
|
296 private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' }; |
|
297 |
|
298 private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't', |
|
299 'e', 'x', 't' }; |
|
300 |
|
301 private static final char[] XMP_ARR = { 'x', 'm', 'p' }; |
|
302 |
|
303 private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r', |
|
304 'e', 'a' }; |
|
305 |
|
306 private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' }; |
|
307 |
|
308 private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e', |
|
309 'd' }; |
|
310 |
|
311 private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i', |
|
312 'p', 't' }; |
|
313 |
|
314 private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm', |
|
315 'e', 's' }; |
|
316 |
|
317 /** |
|
318 * The token handler. |
|
319 */ |
|
320 protected final TokenHandler tokenHandler; |
|
321 |
|
322 protected EncodingDeclarationHandler encodingDeclarationHandler; |
|
323 |
|
324 // [NOCPP[ |
|
325 |
|
326 /** |
|
327 * The error handler. |
|
328 */ |
|
329 protected ErrorHandler errorHandler; |
|
330 |
|
331 // ]NOCPP] |
|
332 |
|
333 /** |
|
334 * Whether the previous char read was CR. |
|
335 */ |
|
336 protected boolean lastCR; |
|
337 |
|
338 protected int stateSave; |
|
339 |
|
340 private int returnStateSave; |
|
341 |
|
342 protected int index; |
|
343 |
|
344 private boolean forceQuirks; |
|
345 |
|
346 private char additional; |
|
347 |
|
348 private int entCol; |
|
349 |
|
350 private int firstCharKey; |
|
351 |
|
352 private int lo; |
|
353 |
|
354 private int hi; |
|
355 |
|
356 private int candidate; |
|
357 |
|
358 private int strBufMark; |
|
359 |
|
360 private int prevValue; |
|
361 |
|
362 protected int value; |
|
363 |
|
364 private boolean seenDigits; |
|
365 |
|
366 protected int cstart; |
|
367 |
|
368 /** |
|
369 * The SAX public id for the resource being tokenized. (Only passed back |
|
370 * as part of locator data.) |
|
371 */ |
|
372 private String publicId; |
|
373 |
|
374 /** |
|
375 * The SAX system id for the resource being tokenized. (Only passed back |
|
376 * as part of locator data.) |
|
377 */ |
|
378 private String systemId; |
|
379 |
|
380 /** |
|
381 * Buffer for short identifiers. |
|
382 */ |
|
383 private @Auto char[] strBuf; |
|
384 |
|
385 /** |
|
386 * Number of significant <code>char</code>s in <code>strBuf</code>. |
|
387 */ |
|
388 private int strBufLen; |
|
389 |
|
390 /** |
|
391 * <code>-1</code> to indicate that <code>strBuf</code> is used or otherwise |
|
392 * an offset to the main buffer. |
|
393 */ |
|
394 // private int strBufOffset = -1; |
|
395 /** |
|
396 * Buffer for long strings. |
|
397 */ |
|
398 private @Auto char[] longStrBuf; |
|
399 |
|
400 /** |
|
401 * Number of significant <code>char</code>s in <code>longStrBuf</code>. |
|
402 */ |
|
403 private int longStrBufLen; |
|
404 |
|
405 /** |
|
406 * <code>-1</code> to indicate that <code>longStrBuf</code> is used or |
|
407 * otherwise an offset to the main buffer. |
|
408 */ |
|
409 // private int longStrBufOffset = -1; |
|
410 |
|
411 /** |
|
412 * Buffer for expanding NCRs falling into the Basic Multilingual Plane. |
|
413 */ |
|
414 private final @Auto char[] bmpChar; |
|
415 |
|
416 /** |
|
417 * Buffer for expanding astral NCRs. |
|
418 */ |
|
419 private final @Auto char[] astralChar; |
|
420 |
|
421 /** |
|
422 * The element whose end tag closes the current CDATA or RCDATA element. |
|
423 */ |
|
424 protected ElementName endTagExpectation = null; |
|
425 |
|
426 private char[] endTagExpectationAsArray; // not @Auto! |
|
427 |
|
428 /** |
|
429 * <code>true</code> if tokenizing an end tag |
|
430 */ |
|
431 protected boolean endTag; |
|
432 |
|
433 /** |
|
434 * The current tag token name. |
|
435 */ |
|
436 private ElementName tagName = null; |
|
437 |
|
438 /** |
|
439 * The current attribute name. |
|
440 */ |
|
441 protected AttributeName attributeName = null; |
|
442 |
|
443 // [NOCPP[ |
|
444 |
|
445 /** |
|
446 * Whether comment tokens are emitted. |
|
447 */ |
|
448 private boolean wantsComments = false; |
|
449 |
|
450 /** |
|
451 * <code>true</code> when HTML4-specific additional errors are requested. |
|
452 */ |
|
453 protected boolean html4; |
|
454 |
|
455 /** |
|
456 * Whether the stream is past the first 512 bytes. |
|
457 */ |
|
458 private boolean metaBoundaryPassed; |
|
459 |
|
460 // ]NOCPP] |
|
461 |
|
462 /** |
|
463 * The name of the current doctype token. |
|
464 */ |
|
465 private @Local String doctypeName; |
|
466 |
|
467 /** |
|
468 * The public id of the current doctype token. |
|
469 */ |
|
470 private String publicIdentifier; |
|
471 |
|
472 /** |
|
473 * The system id of the current doctype token. |
|
474 */ |
|
475 private String systemIdentifier; |
|
476 |
|
477 /** |
|
478 * The attribute holder. |
|
479 */ |
|
480 private HtmlAttributes attributes; |
|
481 |
|
482 // [NOCPP[ |
|
483 |
|
484 /** |
|
485 * The policy for vertical tab and form feed. |
|
486 */ |
|
487 private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET; |
|
488 |
|
489 /** |
|
490 * The policy for comments. |
|
491 */ |
|
492 private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET; |
|
493 |
|
494 private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET; |
|
495 |
|
496 private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET; |
|
497 |
|
498 private boolean html4ModeCompatibleWithXhtml1Schemata; |
|
499 |
|
500 private int mappingLangToXmlLang; |
|
501 |
|
502 // ]NOCPP] |
|
503 |
|
504 private final boolean newAttributesEachTime; |
|
505 |
|
506 private boolean shouldSuspend; |
|
507 |
|
508 protected boolean confident; |
|
509 |
|
510 private int line; |
|
511 |
|
512 private Interner interner; |
|
513 |
|
514 // CPPONLY: private boolean viewingXmlSource; |
|
515 |
|
516 // [NOCPP[ |
|
517 |
|
518 protected LocatorImpl ampersandLocation; |
|
519 |
|
/**
 * Creates a tokenizer with explicit control over how empty attribute
 * holders are handed out.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 * @param newAttributesEachTime
 *            when <code>true</code>, <code>emptyAttributes()</code> returns
 *            a new <code>HtmlAttributes</code> on every call instead of the
 *            shared <code>HtmlAttributes.EMPTY_ATTRIBUTES</code>
 */
public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
    // Values supplied by the caller.
    this.tokenHandler = tokenHandler;
    this.newAttributesEachTime = newAttributesEachTime;
    encodingDeclarationHandler = null;

    // One-unit buffer for BMP NCRs, two-unit buffer for astral NCRs.
    bmpChar = new char[1];
    astralChar = new char[2];

    // No tag, attribute or doctype token is in progress yet.
    tagName = null;
    attributeName = null;
    attributes = null;
    doctypeName = null;
    publicIdentifier = null;
    systemIdentifier = null;
}
|
533 |
|
534 // ]NOCPP] |
|
535 |
|
/**
 * The constructor. Stores the token handler, sets the encoding declaration
 * handler and the current tag, attribute, doctype and attribute holder
 * fields to <code>null</code>, and allocates the one-unit and two-unit
 * buffers used for expanding numeric character references. In the Java code
 * path <code>newAttributesEachTime</code> is set to <code>false</code>.
 *
 * <p>
 * NOTE(review): the <code>[NOCPP[</code>/<code>]NOCPP]</code> brackets and
 * the <code>CPPONLY:</code> lines presumably steer a Java-to-C++
 * translation; keep them exactly where they are.
 *
 * @param tokenHandler
 *            the handler for receiving tokens
 */
public Tokenizer(TokenHandler tokenHandler
// CPPONLY: , boolean viewingXmlSource
) {
    this.tokenHandler = tokenHandler;
    this.encodingDeclarationHandler = null;
    // [NOCPP[
    this.newAttributesEachTime = false;
    // ]NOCPP]
    this.bmpChar = new char[1];
    this.astralChar = new char[2];
    this.tagName = null;
    this.attributeName = null;
    this.doctypeName = null;
    this.publicIdentifier = null;
    this.systemIdentifier = null;
    // [NOCPP[
    this.attributes = null;
    // ]NOCPP]
    // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
    // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
    // CPPONLY: this.viewingXmlSource = viewingXmlSource;
}
|
564 |
|
/**
 * Sets the interner that this tokenizer passes along when it builds
 * element names and local names from its buffers.
 *
 * @param interner
 *            the interner to use
 */
public void setInterner(Interner interner) {
    this.interner = interner;
}
|
568 |
|
569 public void initLocation(String newPublicId, String newSystemId) { |
|
570 this.systemId = newSystemId; |
|
571 this.publicId = newPublicId; |
|
572 |
|
573 } |
|
574 |
|
575 // CPPONLY: boolean isViewingXmlSource() { |
|
576 // CPPONLY: return viewingXmlSource; |
|
577 // CPPONLY: } |
|
578 |
|
579 // [NOCPP[ |
|
580 |
|
581 /** |
|
582 * Returns the mappingLangToXmlLang. |
|
583 * |
|
584 * @return the mappingLangToXmlLang |
|
585 */ |
|
586 public boolean isMappingLangToXmlLang() { |
|
587 return mappingLangToXmlLang == AttributeName.HTML_LANG; |
|
588 } |
|
589 |
|
590 /** |
|
591 * Sets the mappingLangToXmlLang. |
|
592 * |
|
593 * @param mappingLangToXmlLang |
|
594 * the mappingLangToXmlLang to set |
|
595 */ |
|
596 public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) { |
|
597 this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG |
|
598 : AttributeName.HTML; |
|
599 } |
|
600 |
|
/**
 * Sets the error handler that <code>err()</code> and <code>fatal()</code>
 * report to. A <code>null</code> handler is accepted; both reporting
 * methods check for it.
 *
 * @param eh
 *            the error handler, or <code>null</code> for none
 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
 */
public void setErrorHandler(ErrorHandler eh) {
    this.errorHandler = eh;
}
|
609 |
|
/**
 * Returns the error handler.
 *
 * @return the handler most recently stored by
 *         <code>setErrorHandler()</code>, possibly <code>null</code>
 */
public ErrorHandler getErrorHandler() {
    return this.errorHandler;
}
|
613 |
|
/**
 * Sets the commentPolicy. The policy decides what happens when comment
 * content is not mappable to XML 1.0 (consecutive or trailing hyphens):
 * <code>ALTER_INFOSET</code> inserts a space and warns, <code>ALLOW</code>
 * only warns and <code>FATAL</code> reports a fatal error.
 *
 * @param commentPolicy
 *            the commentPolicy to set
 */
public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
    this.commentPolicy = commentPolicy;
}
|
623 |
|
624 /** |
|
625 * Sets the contentNonXmlCharPolicy. |
|
626 * |
|
627 * @param contentNonXmlCharPolicy |
|
628 * the contentNonXmlCharPolicy to set |
|
629 */ |
|
630 public void setContentNonXmlCharPolicy( |
|
631 XmlViolationPolicy contentNonXmlCharPolicy) { |
|
632 if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) { |
|
633 throw new IllegalArgumentException( |
|
634 "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW."); |
|
635 } |
|
636 } |
|
637 |
|
/**
 * Sets the contentSpacePolicy, i.e. the policy for vertical tab and form
 * feed.
 *
 * @param contentSpacePolicy
 *            the contentSpacePolicy to set
 */
public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
    this.contentSpacePolicy = contentSpacePolicy;
}
|
647 |
|
648 /** |
|
649 * Sets the xmlnsPolicy. |
|
650 * |
|
651 * @param xmlnsPolicy |
|
652 * the xmlnsPolicy to set |
|
653 */ |
|
654 public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) { |
|
655 if (xmlnsPolicy == XmlViolationPolicy.FATAL) { |
|
656 throw new IllegalArgumentException("Can't use FATAL here."); |
|
657 } |
|
658 this.xmlnsPolicy = xmlnsPolicy; |
|
659 } |
|
660 |
|
/**
 * Sets the namePolicy.
 *
 * @param namePolicy
 *            the namePolicy to set
 */
public void setNamePolicy(XmlViolationPolicy namePolicy) {
    this.namePolicy = namePolicy;
}
|
664 |
|
/**
 * Sets the html4ModeCompatibleWithXhtml1Schemata flag.
 *
 * @param html4ModeCompatibleWithXhtml1Schemata
 *            the html4ModeCompatibleWithXhtml1Schemata to set
 */
public void setHtml4ModeCompatibleWithXhtml1Schemata(
        boolean html4ModeCompatibleWithXhtml1Schemata) {
    this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
}
|
675 |
|
676 // ]NOCPP] |
|
677 |
|
678 // For the token handler to call |
|
679 /** |
|
680 * Sets the tokenizer state and the associated element name. This should |
|
681 * only ever used to put the tokenizer into one of the states that have |
|
682 * a special end tag expectation. |
|
683 * |
|
684 * @param specialTokenizerState |
|
685 * the tokenizer state to set |
|
686 * @param endTagExpectation |
|
687 * the expected end tag for transitioning back to normal |
|
688 */ |
|
689 public void setStateAndEndTagExpectation(int specialTokenizerState, |
|
690 @Local String endTagExpectation) { |
|
691 this.stateSave = specialTokenizerState; |
|
692 if (specialTokenizerState == Tokenizer.DATA) { |
|
693 return; |
|
694 } |
|
695 @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation); |
|
696 this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0, |
|
697 asArray.length, interner); |
|
698 endTagExpectationToArray(); |
|
699 } |
|
700 |
|
701 /** |
|
702 * Sets the tokenizer state and the associated element name. This should |
|
703 * only ever used to put the tokenizer into one of the states that have |
|
704 * a special end tag expectation. |
|
705 * |
|
706 * @param specialTokenizerState |
|
707 * the tokenizer state to set |
|
708 * @param endTagExpectation |
|
709 * the expected end tag for transitioning back to normal |
|
710 */ |
|
711 public void setStateAndEndTagExpectation(int specialTokenizerState, |
|
712 ElementName endTagExpectation) { |
|
713 this.stateSave = specialTokenizerState; |
|
714 this.endTagExpectation = endTagExpectation; |
|
715 endTagExpectationToArray(); |
|
716 } |
|
717 |
|
718 private void endTagExpectationToArray() { |
|
719 switch (endTagExpectation.getGroup()) { |
|
720 case TreeBuilder.TITLE: |
|
721 endTagExpectationAsArray = TITLE_ARR; |
|
722 return; |
|
723 case TreeBuilder.SCRIPT: |
|
724 endTagExpectationAsArray = SCRIPT_ARR; |
|
725 return; |
|
726 case TreeBuilder.STYLE: |
|
727 endTagExpectationAsArray = STYLE_ARR; |
|
728 return; |
|
729 case TreeBuilder.PLAINTEXT: |
|
730 endTagExpectationAsArray = PLAINTEXT_ARR; |
|
731 return; |
|
732 case TreeBuilder.XMP: |
|
733 endTagExpectationAsArray = XMP_ARR; |
|
734 return; |
|
735 case TreeBuilder.TEXTAREA: |
|
736 endTagExpectationAsArray = TEXTAREA_ARR; |
|
737 return; |
|
738 case TreeBuilder.IFRAME: |
|
739 endTagExpectationAsArray = IFRAME_ARR; |
|
740 return; |
|
741 case TreeBuilder.NOEMBED: |
|
742 endTagExpectationAsArray = NOEMBED_ARR; |
|
743 return; |
|
744 case TreeBuilder.NOSCRIPT: |
|
745 endTagExpectationAsArray = NOSCRIPT_ARR; |
|
746 return; |
|
747 case TreeBuilder.NOFRAMES: |
|
748 endTagExpectationAsArray = NOFRAMES_ARR; |
|
749 return; |
|
750 default: |
|
751 assert false: "Bad end tag expectation."; |
|
752 return; |
|
753 } |
|
754 } |
|
755 |
|
/**
 * Overwrites the line number reported by <code>getLineNumber()</code>.
 * For C++ use only.
 *
 * @param line
 *            the line number to store
 */
public void setLineNumber(int line) {
    this.line = line;
}
|
762 |
|
763 // start Locator impl |
|
764 |
|
/**
 * Returns the line number currently stored in this tokenizer.
 *
 * @return the current line number
 * @see org.xml.sax.Locator#getLineNumber()
 */
@Inline public int getLineNumber() {
    return line;
}
|
771 |
|
772 // [NOCPP[ |
|
773 |
|
/**
 * Always returns <code>-1</code>: this class does not track columns, and
 * <code>-1</code> is the <code>Locator</code> value for "not available".
 *
 * @return <code>-1</code>
 * @see org.xml.sax.Locator#getColumnNumber()
 */
@Inline public int getColumnNumber() {
    return -1;
}
|
780 |
|
/**
 * Returns the public id stored by <code>initLocation()</code>.
 *
 * @return the SAX public id of the resource being tokenized, possibly
 *         <code>null</code>
 * @see org.xml.sax.Locator#getPublicId()
 */
public String getPublicId() {
    return publicId;
}
|
787 |
|
/**
 * Returns the system id stored by <code>initLocation()</code>.
 *
 * @return the SAX system id of the resource being tokenized, possibly
 *         <code>null</code>
 * @see org.xml.sax.Locator#getSystemId()
 */
public String getSystemId() {
    return systemId;
}
|
794 |
|
795 // end Locator impl |
|
796 |
|
797 // end public API |
|
798 |
|
/**
 * Records that the meta boundary has been passed by setting
 * <code>metaBoundaryPassed</code>, the flag documented as "whether the
 * stream is past the first 512 bytes". The flag is never reset here.
 */
public void notifyAboutMetaBoundary() {
    metaBoundaryPassed = true;
}
|
802 |
|
/**
 * Requests HTML4-specific additional errors by setting the
 * <code>html4</code> flag. There is no corresponding way to turn the flag
 * off in this part of the class.
 */
void turnOnAdditionalHtml4Errors() {
    html4 = true;
}
|
806 |
|
807 // ]NOCPP] |
|
808 |
|
/**
 * Returns an attribute holder without attributes. In the Java code path a
 * new <code>HtmlAttributes</code> is created when
 * <code>newAttributesEachTime</code> is set; otherwise the shared
 * <code>HtmlAttributes.EMPTY_ATTRIBUTES</code> instance is returned.
 *
 * <p>
 * NOTE(review): only the shared-instance return sits outside the
 * <code>NOCPP</code> brackets, so that is presumably all that remains in
 * the C++ translation.
 *
 * @return an empty attribute holder
 */
HtmlAttributes emptyAttributes() {
    // [NOCPP[
    if (newAttributesEachTime) {
        return new HtmlAttributes(mappingLangToXmlLang);
    } else {
        // ]NOCPP]
        return HtmlAttributes.EMPTY_ATTRIBUTES;
        // [NOCPP[
    }
    // ]NOCPP]
}
|
820 |
|
/**
 * Resets the smaller buffer so that it holds exactly one code unit. No
 * capacity check is made, so <code>strBuf</code> must already have room for
 * at least one <code>char</code>.
 *
 * @param c
 *            the UTF-16 code unit that becomes the sole buffer content
 */
@Inline private void clearStrBufAndAppend(char c) {
    strBuf[0] = c;
    strBufLen = 1;
}
|
825 |
|
/**
 * Empties the smaller buffer by resetting its significant length; the
 * backing array is kept.
 */
@Inline private void clearStrBuf() {
    strBufLen = 0;
}
|
829 |
|
830 /** |
|
831 * Appends to the smaller buffer. |
|
832 * |
|
833 * @param c |
|
834 * the UTF-16 code unit to append |
|
835 */ |
|
836 private void appendStrBuf(char c) { |
|
837 if (strBufLen == strBuf.length) { |
|
838 char[] newBuf = new char[strBuf.length + Tokenizer.BUFFER_GROW_BY]; |
|
839 System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length); |
|
840 strBuf = newBuf; |
|
841 } |
|
842 strBuf[strBufLen++] = c; |
|
843 } |
|
844 |
|
/**
 * The smaller buffer as a String, built from its first
 * <code>strBufLen</code> code units. Currently only used for error
 * reporting.
 *
 * <p>
 * C++ memory note: The return value must be released.
 *
 * @return the smaller buffer as a string
 */
protected String strBufToString() {
    return Portability.newStringFromBuffer(strBuf, 0, strBufLen);
}
|
856 |
|
/**
 * Converts the short buffer to a local name and stores it in
 * <code>doctypeName</code>. Nothing is returned.
 *
 * <p>
 * C++ memory note (from the original comment): the stored name is released
 * in emitDoctypeToken().
 */
private void strBufToDoctypeName() {
    doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
            interner);
}
|
867 |
|
868 /** |
|
869 * Emits the smaller buffer as character tokens. |
|
870 * |
|
871 * @throws SAXException |
|
872 * if the token handler threw |
|
873 */ |
|
874 private void emitStrBuf() throws SAXException { |
|
875 if (strBufLen > 0) { |
|
876 tokenHandler.characters(strBuf, 0, strBufLen); |
|
877 } |
|
878 } |
|
879 |
|
/**
 * Empties the larger buffer by resetting its significant length; the
 * backing array is kept.
 */
@Inline private void clearLongStrBuf() {
    longStrBufLen = 0;
}
|
883 |
|
/**
 * Resets the larger buffer so that it holds exactly one code unit. No
 * capacity check is made, so <code>longStrBuf</code> must already have room
 * for at least one <code>char</code>.
 *
 * @param c
 *            the UTF-16 code unit that becomes the sole buffer content
 */
@Inline private void clearLongStrBufAndAppend(char c) {
    longStrBuf[0] = c;
    longStrBufLen = 1;
}
|
888 |
|
889 /** |
|
890 * Appends to the larger buffer. |
|
891 * |
|
892 * @param c |
|
893 * the UTF-16 code unit to append |
|
894 */ |
|
895 private void appendLongStrBuf(char c) { |
|
896 if (longStrBufLen == longStrBuf.length) { |
|
897 char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)]; |
|
898 System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length); |
|
899 longStrBuf = newBuf; |
|
900 } |
|
901 longStrBuf[longStrBufLen++] = c; |
|
902 } |
|
903 |
|
/**
 * Appends a hyphen to the larger buffer, applying the comment policy in
 * the Java code path:
 * <ul>
 * <li><code>ALTER_INFOSET</code>: appends a space first, then continues as
 * for <code>ALLOW</code>;</li>
 * <li><code>ALLOW</code>: warns and appends the hyphen;</li>
 * <li><code>FATAL</code>: reports a fatal error and appends nothing.</li>
 * </ul>
 *
 * <p>
 * NOTE(review): only <code>appendLongStrBuf('-')</code> sits outside the
 * <code>NOCPP</code> brackets, so it is presumably the only statement kept
 * in the C++ translation.
 *
 * @throws SAXException
 *             if the warning or fatal error reporting threw
 */
@Inline private void appendSecondHyphenToBogusComment() throws SAXException {
    // [NOCPP[
    switch (commentPolicy) {
        case ALTER_INFOSET:
            // detachLongStrBuf();
            appendLongStrBuf(' ');
            // FALLTHROUGH
        case ALLOW:
            warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
            // ]NOCPP]
            appendLongStrBuf('-');
            // [NOCPP[
            break;
        case FATAL:
            fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
            break;
    }
    // ]NOCPP]
}
|
923 |
|
924 // [NOCPP[ |
|
/**
 * Applies the comment policy to a comment with a trailing hyphen:
 * <ul>
 * <li><code>ALTER_INFOSET</code>: appends a space to the larger buffer,
 * then continues as for <code>ALLOW</code>;</li>
 * <li><code>ALLOW</code>: warns only;</li>
 * <li><code>FATAL</code>: reports a fatal error.</li>
 * </ul>
 *
 * @throws SAXException
 *             if the warning or fatal error reporting threw
 */
private void maybeAppendSpaceToBogusComment() throws SAXException {
    switch (commentPolicy) {
        case ALTER_INFOSET:
            // detachLongStrBuf();
            appendLongStrBuf(' ');
            // FALLTHROUGH
        case ALLOW:
            warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
            break;
        case FATAL:
            fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
            break;
    }
}
|
939 |
|
940 // ]NOCPP] |
|
941 |
|
/**
 * Reports consecutive hyphens and appends <code>c</code> to the larger
 * buffer, applying the comment policy in the Java code path:
 * <ul>
 * <li><code>ALTER_INFOSET</code>: drops the last buffered code unit
 * (presumably the second hyphen), appends a space followed by a hyphen,
 * then continues as for <code>ALLOW</code>;</li>
 * <li><code>ALLOW</code>: warns and appends <code>c</code>;</li>
 * <li><code>FATAL</code>: reports a fatal error and appends nothing.</li>
 * </ul>
 *
 * <p>
 * NOTE(review): outside the <code>NOCPP</code> brackets only
 * <code>errConsecutiveHyphens()</code> and <code>appendLongStrBuf(c)</code>
 * remain, presumably forming the C++ translation.
 *
 * @param c
 *            the UTF-16 code unit to append after the adjustment
 * @throws SAXException
 *             if error reporting threw
 */
@Inline private void adjustDoubleHyphenAndAppendToLongStrBufAndErr(char c)
        throws SAXException {
    errConsecutiveHyphens();
    // [NOCPP[
    switch (commentPolicy) {
        case ALTER_INFOSET:
            // detachLongStrBuf();
            longStrBufLen--;
            appendLongStrBuf(' ');
            appendLongStrBuf('-');
            // FALLTHROUGH
        case ALLOW:
            warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
            // ]NOCPP]
            appendLongStrBuf(c);
            // [NOCPP[
            break;
        case FATAL:
            fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
            break;
    }
    // ]NOCPP]
}
|
965 |
|
966 private void appendLongStrBuf(@NoLength char[] buffer, int offset, int length) { |
|
967 int reqLen = longStrBufLen + length; |
|
968 if (longStrBuf.length < reqLen) { |
|
969 char[] newBuf = new char[reqLen + (reqLen >> 1)]; |
|
970 System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length); |
|
971 longStrBuf = newBuf; |
|
972 } |
|
973 System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length); |
|
974 longStrBufLen = reqLen; |
|
975 } |
|
976 |
|
/**
 * Append the contents of the smaller buffer (its first
 * <code>strBufLen</code> code units) to the larger one. The smaller buffer
 * itself is left unchanged.
 */
@Inline private void appendStrBufToLongStrBuf() {
    appendLongStrBuf(strBuf, 0, strBufLen);
}
|
983 |
|
/**
 * The larger buffer as a string, built from its first
 * <code>longStrBufLen</code> code units.
 *
 * <p>
 * C++ memory note: The return value must be released.
 *
 * @return the larger buffer as a string
 */
private String longStrBufToString() {
    return Portability.newStringFromBuffer(longStrBuf, 0, longStrBufLen);
}
|
995 |
|
/**
 * Emits the current comment token from the larger buffer and restarts
 * character coalescing after the comment. In the Java code path the comment
 * is passed to the token handler only when <code>wantsComments</code> is
 * set; <code>cstart</code> is updated either way.
 *
 * @param provisionalHyphens
 *            number of code units at the end of the larger buffer that are
 *            left out of the emitted comment
 * @param pos
 *            the position after which character data resumes;
 *            <code>cstart</code> becomes <code>pos + 1</code>
 *
 * @throws SAXException
 *             if the token handler threw
 */
private void emitComment(int provisionalHyphens, int pos)
        throws SAXException {
    // [NOCPP[
    if (wantsComments) {
        // ]NOCPP]
        // if (longStrBufOffset != -1) {
        // tokenHandler.comment(buf, longStrBufOffset, longStrBufLen
        // - provisionalHyphens);
        // } else {
        tokenHandler.comment(longStrBuf, 0, longStrBufLen
                - provisionalHyphens);
        // }
        // [NOCPP[
    }
    // ]NOCPP]
    cstart = pos + 1;
}
|
1021 |
|
1022 /** |
|
1023 * Flushes coalesced character tokens. |
|
1024 * |
|
1025 * @param buf |
|
1026 * TODO |
|
1027 * @param pos |
|
1028 * TODO |
|
1029 * |
|
1030 * @throws SAXException |
|
1031 */ |
|
1032 protected void flushChars(@NoLength char[] buf, int pos) |
|
1033 throws SAXException { |
|
1034 if (pos > cstart) { |
|
1035 tokenHandler.characters(buf, cstart, pos - cstart); |
|
1036 } |
|
1037 cstart = Integer.MAX_VALUE; |
|
1038 } |
|
1039 |
|
1040 /** |
|
1041 * Reports an condition that would make the infoset incompatible with XML |
|
1042 * 1.0 as fatal. |
|
1043 * |
|
1044 * @param message |
|
1045 * the message |
|
1046 * @throws SAXException |
|
1047 * @throws SAXParseException |
|
1048 */ |
|
1049 public void fatal(String message) throws SAXException { |
|
1050 SAXParseException spe = new SAXParseException(message, this); |
|
1051 if (errorHandler != null) { |
|
1052 errorHandler.fatalError(spe); |
|
1053 } |
|
1054 throw spe; |
|
1055 } |
|
1056 |
|
1057 /** |
|
1058 * Reports a Parse Error. |
|
1059 * |
|
1060 * @param message |
|
1061 * the message |
|
1062 * @throws SAXException |
|
1063 */ |
|
1064 public void err(String message) throws SAXException { |
|
1065 if (errorHandler == null) { |
|
1066 return; |
|
1067 } |
|
1068 SAXParseException spe = new SAXParseException(message, this); |
|
1069 errorHandler.error(spe); |
|
1070 } |
|
1071 |
|
1072 public void errTreeBuilder(String message) throws SAXException { |
|
1073 ErrorHandler eh = null; |
|
1074 if (tokenHandler instanceof TreeBuilder<?>) { |
|
1075 TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler; |
|
1076 eh = treeBuilder.getErrorHandler(); |
|
1077 } |
|
1078 if (eh == null) { |
|
1079 eh = errorHandler; |
|
1080 } |
|
1081 if (eh == null) { |
|
1082 return; |
|
1083 } |
|
1084 SAXParseException spe = new SAXParseException(message, this); |
|
1085 eh.error(spe); |
|
1086 } |
|
1087 |
|
1088 /** |
|
1089 * Reports a warning |
|
1090 * |
|
1091 * @param message |
|
1092 * the message |
|
1093 * @throws SAXException |
|
1094 */ |
|
1095 public void warn(String message) throws SAXException { |
|
1096 if (errorHandler == null) { |
|
1097 return; |
|
1098 } |
|
1099 SAXParseException spe = new SAXParseException(message, this); |
|
1100 errorHandler.warning(spe); |
|
1101 } |
|
1102 |
|
1103 private void strBufToElementNameString() { |
|
1104 // if (strBufOffset != -1) { |
|
1105 // return ElementName.elementNameByBuffer(buf, strBufOffset, strBufLen); |
|
1106 // } else { |
|
1107 tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen, |
|
1108 interner); |
|
1109 // } |
|
1110 } |
|
1111 |
|
1112 private int emitCurrentTagToken(boolean selfClosing, int pos) |
|
1113 throws SAXException { |
|
1114 cstart = pos + 1; |
|
1115 maybeErrSlashInEndTag(selfClosing); |
|
1116 stateSave = Tokenizer.DATA; |
|
1117 HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES |
|
1118 : attributes); |
|
1119 if (endTag) { |
|
1120 /* |
|
1121 * When an end tag token is emitted, the content model flag must be |
|
1122 * switched to the PCDATA state. |
|
1123 */ |
|
1124 maybeErrAttributesOnEndTag(attrs); |
|
1125 // CPPONLY: if (!viewingXmlSource) { |
|
1126 tokenHandler.endTag(tagName); |
|
1127 // CPPONLY: } |
|
1128 // CPPONLY: if (newAttributesEachTime) { |
|
1129 // CPPONLY: Portability.delete(attributes); |
|
1130 // CPPONLY: attributes = null; |
|
1131 // CPPONLY: } |
|
1132 } else { |
|
1133 // CPPONLY: if (viewingXmlSource) { |
|
1134 // CPPONLY: assert newAttributesEachTime; |
|
1135 // CPPONLY: Portability.delete(attributes); |
|
1136 // CPPONLY: attributes = null; |
|
1137 // CPPONLY: } else { |
|
1138 tokenHandler.startTag(tagName, attrs, selfClosing); |
|
1139 // CPPONLY: } |
|
1140 } |
|
1141 tagName.release(); |
|
1142 tagName = null; |
|
1143 if (newAttributesEachTime) { |
|
1144 attributes = null; |
|
1145 } else { |
|
1146 attributes.clear(mappingLangToXmlLang); |
|
1147 } |
|
1148 /* |
|
1149 * The token handler may have called setStateAndEndTagExpectation |
|
1150 * and changed stateSave since the start of this method. |
|
1151 */ |
|
1152 return stateSave; |
|
1153 } |
|
1154 |
|
1155 private void attributeNameComplete() throws SAXException { |
|
1156 // if (strBufOffset != -1) { |
|
1157 // attributeName = AttributeName.nameByBuffer(buf, strBufOffset, |
|
1158 // strBufLen, namePolicy != XmlViolationPolicy.ALLOW); |
|
1159 // } else { |
|
1160 attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen |
|
1161 // [NOCPP[ |
|
1162 , namePolicy != XmlViolationPolicy.ALLOW |
|
1163 // ]NOCPP] |
|
1164 , interner); |
|
1165 // } |
|
1166 |
|
1167 if (attributes == null) { |
|
1168 attributes = new HtmlAttributes(mappingLangToXmlLang); |
|
1169 } |
|
1170 |
|
1171 /* |
|
1172 * When the user agent leaves the attribute name state (and before |
|
1173 * emitting the tag token, if appropriate), the complete attribute's |
|
1174 * name must be compared to the other attributes on the same token; if |
|
1175 * there is already an attribute on the token with the exact same name, |
|
1176 * then this is a parse error and the new attribute must be dropped, |
|
1177 * along with the value that gets associated with it (if any). |
|
1178 */ |
|
1179 if (attributes.contains(attributeName)) { |
|
1180 errDuplicateAttribute(); |
|
1181 attributeName.release(); |
|
1182 attributeName = null; |
|
1183 } |
|
1184 } |
|
1185 |
|
1186 private void addAttributeWithoutValue() throws SAXException { |
|
1187 noteAttributeWithoutValue(); |
|
1188 |
|
1189 // [NOCPP[ |
|
1190 if (metaBoundaryPassed && AttributeName.CHARSET == attributeName |
|
1191 && ElementName.META == tagName) { |
|
1192 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes."); |
|
1193 } |
|
1194 // ]NOCPP] |
|
1195 if (attributeName != null) { |
|
1196 // [NOCPP[ |
|
1197 if (html4) { |
|
1198 if (attributeName.isBoolean()) { |
|
1199 if (html4ModeCompatibleWithXhtml1Schemata) { |
|
1200 attributes.addAttribute(attributeName, |
|
1201 attributeName.getLocal(AttributeName.HTML), |
|
1202 xmlnsPolicy); |
|
1203 } else { |
|
1204 attributes.addAttribute(attributeName, "", xmlnsPolicy); |
|
1205 } |
|
1206 } else { |
|
1207 if (AttributeName.BORDER != attributeName) { |
|
1208 err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)"); |
|
1209 attributes.addAttribute(attributeName, "", xmlnsPolicy); |
|
1210 } |
|
1211 } |
|
1212 } else { |
|
1213 if (AttributeName.SRC == attributeName |
|
1214 || AttributeName.HREF == attributeName) { |
|
1215 warn("Attribute \u201C" |
|
1216 + attributeName.getLocal(AttributeName.HTML) |
|
1217 + "\u201D without an explicit value seen. The attribute may be dropped by IE7."); |
|
1218 } |
|
1219 // ]NOCPP] |
|
1220 attributes.addAttribute(attributeName, |
|
1221 Portability.newEmptyString() |
|
1222 // [NOCPP[ |
|
1223 , xmlnsPolicy |
|
1224 // ]NOCPP] |
|
1225 ); |
|
1226 // [NOCPP[ |
|
1227 } |
|
1228 // ]NOCPP] |
|
1229 attributeName = null; // attributeName has been adopted by the |
|
1230 // |attributes| object |
|
1231 } |
|
1232 } |
|
1233 |
|
1234 private void addAttributeWithValue() throws SAXException { |
|
1235 // [NOCPP[ |
|
1236 if (metaBoundaryPassed && ElementName.META == tagName |
|
1237 && AttributeName.CHARSET == attributeName) { |
|
1238 err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes."); |
|
1239 } |
|
1240 // ]NOCPP] |
|
1241 if (attributeName != null) { |
|
1242 String val = longStrBufToString(); // Ownership transferred to |
|
1243 // HtmlAttributes |
|
1244 // CPPONLY: if (mViewSource) { |
|
1245 // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val); |
|
1246 // CPPONLY: } |
|
1247 // [NOCPP[ |
|
1248 if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata |
|
1249 && attributeName.isCaseFolded()) { |
|
1250 val = newAsciiLowerCaseStringFromString(val); |
|
1251 } |
|
1252 // ]NOCPP] |
|
1253 attributes.addAttribute(attributeName, val |
|
1254 // [NOCPP[ |
|
1255 , xmlnsPolicy |
|
1256 // ]NOCPP] |
|
1257 ); |
|
1258 attributeName = null; // attributeName has been adopted by the |
|
1259 // |attributes| object |
|
1260 } |
|
1261 } |
|
1262 |
|
1263 // [NOCPP[ |
|
1264 |
|
1265 private static String newAsciiLowerCaseStringFromString(String str) { |
|
1266 if (str == null) { |
|
1267 return null; |
|
1268 } |
|
1269 char[] buf = new char[str.length()]; |
|
1270 for (int i = 0; i < str.length(); i++) { |
|
1271 char c = str.charAt(i); |
|
1272 if (c >= 'A' && c <= 'Z') { |
|
1273 c += 0x20; |
|
1274 } |
|
1275 buf[i] = c; |
|
1276 } |
|
1277 return new String(buf); |
|
1278 } |
|
1279 |
|
1280 protected void startErrorReporting() throws SAXException { |
|
1281 |
|
1282 } |
|
1283 |
|
1284 // ]NOCPP] |
|
1285 |
|
1286 public void start() throws SAXException { |
|
1287 initializeWithoutStarting(); |
|
1288 tokenHandler.startTokenization(this); |
|
1289 // [NOCPP[ |
|
1290 startErrorReporting(); |
|
1291 // ]NOCPP] |
|
1292 } |
|
1293 |
|
1294 public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException { |
|
1295 int state = stateSave; |
|
1296 int returnState = returnStateSave; |
|
1297 char c = '\u0000'; |
|
1298 shouldSuspend = false; |
|
1299 lastCR = false; |
|
1300 |
|
1301 int start = buffer.getStart(); |
|
1302 /** |
|
1303 * The index of the last <code>char</code> read from <code>buf</code>. |
|
1304 */ |
|
1305 int pos = start - 1; |
|
1306 |
|
1307 /** |
|
1308 * The index of the first <code>char</code> in <code>buf</code> that is |
|
1309 * part of a coalesced run of character tokens or |
|
1310 * <code>Integer.MAX_VALUE</code> if there is not a current run being |
|
1311 * coalesced. |
|
1312 */ |
|
1313 switch (state) { |
|
1314 case DATA: |
|
1315 case RCDATA: |
|
1316 case SCRIPT_DATA: |
|
1317 case PLAINTEXT: |
|
1318 case RAWTEXT: |
|
1319 case CDATA_SECTION: |
|
1320 case SCRIPT_DATA_ESCAPED: |
|
1321 case SCRIPT_DATA_ESCAPE_START: |
|
1322 case SCRIPT_DATA_ESCAPE_START_DASH: |
|
1323 case SCRIPT_DATA_ESCAPED_DASH: |
|
1324 case SCRIPT_DATA_ESCAPED_DASH_DASH: |
|
1325 case SCRIPT_DATA_DOUBLE_ESCAPE_START: |
|
1326 case SCRIPT_DATA_DOUBLE_ESCAPED: |
|
1327 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: |
|
1328 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: |
|
1329 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: |
|
1330 case SCRIPT_DATA_DOUBLE_ESCAPE_END: |
|
1331 cstart = start; |
|
1332 break; |
|
1333 default: |
|
1334 cstart = Integer.MAX_VALUE; |
|
1335 break; |
|
1336 } |
|
1337 |
|
1338 /** |
|
1339 * The number of <code>char</code>s in <code>buf</code> that have |
|
1340 * meaning. (The rest of the array is garbage and should not be |
|
1341 * examined.) |
|
1342 */ |
|
1343 // CPPONLY: if (mViewSource) { |
|
1344 // CPPONLY: mViewSource.SetBuffer(buffer); |
|
1345 // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); |
|
1346 // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1); |
|
1347 // CPPONLY: } else { |
|
1348 // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); |
|
1349 // CPPONLY: } |
|
1350 // [NOCPP[ |
|
1351 pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, |
|
1352 buffer.getEnd()); |
|
1353 // ]NOCPP] |
|
1354 if (pos == buffer.getEnd()) { |
|
1355 // exiting due to end of buffer |
|
1356 buffer.setStart(pos); |
|
1357 } else { |
|
1358 buffer.setStart(pos + 1); |
|
1359 } |
|
1360 return lastCR; |
|
1361 } |
|
1362 |
|
1363 @SuppressWarnings("unused") private int stateLoop(int state, char c, |
|
1364 int pos, @NoLength char[] buf, boolean reconsume, int returnState, |
|
1365 int endPos) throws SAXException { |
|
1366 /* |
|
1367 * Idioms used in this code: |
|
1368 * |
|
1369 * |
|
1370 * Consuming the next input character |
|
1371 * |
|
1372 * To consume the next input character, the code does this: if (++pos == |
|
1373 * endPos) { break stateloop; } c = checkChar(buf, pos); |
|
1374 * |
|
1375 * |
|
1376 * Staying in a state |
|
1377 * |
|
1378 * When there's a state that the tokenizer may stay in over multiple |
|
1379 * input characters, the state has a wrapper |for(;;)| loop and staying |
|
1380 * in the state continues the loop. |
|
1381 * |
|
1382 * |
|
1383 * Switching to another state |
|
1384 * |
|
1385 * To switch to another state, the code sets the state variable to the |
|
1386 * magic number of the new state. Then it either continues stateloop or |
|
1387 * breaks out of the state's own wrapper loop if the target state is |
|
1388 * right after the current state in source order. (This is a partial |
|
1389 * workaround for Java's lack of goto.) |
|
1390 * |
|
1391 * |
|
1392 * Reconsume support |
|
1393 * |
|
1394 * The spec sometimes says that an input character is reconsumed in |
|
1395 * another state. If a state can ever be entered so that an input |
|
1396 * character can be reconsumed in it, the state's code starts with an |
|
1397 * |if (reconsume)| that sets reconsume to false and skips over the |
|
1398 * normal code for consuming a new character. |
|
1399 * |
|
1400 * To reconsume the current character in another state, the code sets |
|
1401 * |reconsume| to true and then switches to the other state. |
|
1402 * |
|
1403 * |
|
1404 * Emitting character tokens |
|
1405 * |
|
1406 * This method emits character tokens lazily. Whenever a new range of |
|
1407 * character tokens starts, the field cstart must be set to the start |
|
1408 * index of the range. The flushChars() method must be called at the end |
|
1409 * of a range to flush it. |
|
1410 * |
|
1411 * |
|
1412 * U+0000 handling |
|
1413 * |
|
1414 * The various states have to handle the replacement of U+0000 with |
|
1415 * U+FFFD. However, if U+0000 would be reconsumed in another state, the |
|
1416 * replacement doesn't need to happen, because it's handled by the |
|
1417 * reconsuming state. |
|
1418 * |
|
1419 * |
|
1420 * LF handling |
|
1421 * |
|
1422 * Every state needs to increment the line number upon LF unless the LF |
|
1423 * gets reconsumed by another state which increments the line number. |
|
1424 * |
|
1425 * |
|
1426 * CR handling |
|
1427 * |
|
1428 * Every state needs to handle CR unless the CR gets reconsumed and is |
|
1429 * handled by the reconsuming state. The CR needs to be handled as if it |
|
1430 * were and LF, the lastCR field must be set to true and then this |
|
1431 * method must return. The IO driver will then swallow the next |
|
1432 * character if it is an LF to coalesce CRLF. |
|
1433 */ |
|
1434 stateloop: for (;;) { |
|
1435 switch (state) { |
|
1436 case DATA: |
|
1437 dataloop: for (;;) { |
|
1438 if (reconsume) { |
|
1439 reconsume = false; |
|
1440 } else { |
|
1441 if (++pos == endPos) { |
|
1442 break stateloop; |
|
1443 } |
|
1444 c = checkChar(buf, pos); |
|
1445 } |
|
1446 switch (c) { |
|
1447 case '&': |
|
1448 /* |
|
1449 * U+0026 AMPERSAND (&) Switch to the character |
|
1450 * reference in data state. |
|
1451 */ |
|
1452 flushChars(buf, pos); |
|
1453 clearStrBufAndAppend(c); |
|
1454 setAdditionalAndRememberAmpersandLocation('\u0000'); |
|
1455 returnState = state; |
|
1456 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); |
|
1457 continue stateloop; |
|
1458 case '<': |
|
1459 /* |
|
1460 * U+003C LESS-THAN SIGN (<) Switch to the tag |
|
1461 * open state. |
|
1462 */ |
|
1463 flushChars(buf, pos); |
|
1464 |
|
1465 state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos); |
|
1466 break dataloop; // FALL THROUGH continue |
|
1467 // stateloop; |
|
1468 case '\u0000': |
|
1469 emitReplacementCharacter(buf, pos); |
|
1470 continue; |
|
1471 case '\r': |
|
1472 emitCarriageReturn(buf, pos); |
|
1473 break stateloop; |
|
1474 case '\n': |
|
1475 silentLineFeed(); |
|
1476 default: |
|
1477 /* |
|
1478 * Anything else Emit the input character as a |
|
1479 * character token. |
|
1480 * |
|
1481 * Stay in the data state. |
|
1482 */ |
|
1483 continue; |
|
1484 } |
|
1485 } |
|
1486 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
1487 case TAG_OPEN: |
|
1488 tagopenloop: for (;;) { |
|
1489 /* |
|
1490 * The behavior of this state depends on the content |
|
1491 * model flag. |
|
1492 */ |
|
1493 if (++pos == endPos) { |
|
1494 break stateloop; |
|
1495 } |
|
1496 c = checkChar(buf, pos); |
|
1497 /* |
|
1498 * If the content model flag is set to the PCDATA state |
|
1499 * Consume the next input character: |
|
1500 */ |
|
1501 if (c >= 'A' && c <= 'Z') { |
|
1502 /* |
|
1503 * U+0041 LATIN CAPITAL LETTER A through to U+005A |
|
1504 * LATIN CAPITAL LETTER Z Create a new start tag |
|
1505 * token, |
|
1506 */ |
|
1507 endTag = false; |
|
1508 /* |
|
1509 * set its tag name to the lowercase version of the |
|
1510 * input character (add 0x0020 to the character's |
|
1511 * code point), |
|
1512 */ |
|
1513 clearStrBufAndAppend((char) (c + 0x20)); |
|
1514 /* then switch to the tag name state. */ |
|
1515 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); |
|
1516 /* |
|
1517 * (Don't emit the token yet; further details will |
|
1518 * be filled in before it is emitted.) |
|
1519 */ |
|
1520 break tagopenloop; |
|
1521 // continue stateloop; |
|
1522 } else if (c >= 'a' && c <= 'z') { |
|
1523 /* |
|
1524 * U+0061 LATIN SMALL LETTER A through to U+007A |
|
1525 * LATIN SMALL LETTER Z Create a new start tag |
|
1526 * token, |
|
1527 */ |
|
1528 endTag = false; |
|
1529 /* |
|
1530 * set its tag name to the input character, |
|
1531 */ |
|
1532 clearStrBufAndAppend(c); |
|
1533 /* then switch to the tag name state. */ |
|
1534 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); |
|
1535 /* |
|
1536 * (Don't emit the token yet; further details will |
|
1537 * be filled in before it is emitted.) |
|
1538 */ |
|
1539 break tagopenloop; |
|
1540 // continue stateloop; |
|
1541 } |
|
1542 switch (c) { |
|
1543 case '!': |
|
1544 /* |
|
1545 * U+0021 EXCLAMATION MARK (!) Switch to the |
|
1546 * markup declaration open state. |
|
1547 */ |
|
1548 state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos); |
|
1549 continue stateloop; |
|
1550 case '/': |
|
1551 /* |
|
1552 * U+002F SOLIDUS (/) Switch to the close tag |
|
1553 * open state. |
|
1554 */ |
|
1555 state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos); |
|
1556 continue stateloop; |
|
1557 case '?': |
|
1558 // CPPONLY: if (viewingXmlSource) { |
|
1559 // CPPONLY: state = transition(state, |
|
1560 // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION, |
|
1561 // CPPONLY: reconsume, |
|
1562 // CPPONLY: pos); |
|
1563 // CPPONLY: continue stateloop; |
|
1564 // CPPONLY: } |
|
1565 /* |
|
1566 * U+003F QUESTION MARK (?) Parse error. |
|
1567 */ |
|
1568 errProcessingInstruction(); |
|
1569 /* |
|
1570 * Switch to the bogus comment state. |
|
1571 */ |
|
1572 clearLongStrBufAndAppend(c); |
|
1573 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); |
|
1574 continue stateloop; |
|
1575 case '>': |
|
1576 /* |
|
1577 * U+003E GREATER-THAN SIGN (>) Parse error. |
|
1578 */ |
|
1579 errLtGt(); |
|
1580 /* |
|
1581 * Emit a U+003C LESS-THAN SIGN character token |
|
1582 * and a U+003E GREATER-THAN SIGN character |
|
1583 * token. |
|
1584 */ |
|
1585 tokenHandler.characters(Tokenizer.LT_GT, 0, 2); |
|
1586 /* Switch to the data state. */ |
|
1587 cstart = pos + 1; |
|
1588 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
1589 continue stateloop; |
|
1590 default: |
|
1591 /* |
|
1592 * Anything else Parse error. |
|
1593 */ |
|
1594 errBadCharAfterLt(c); |
|
1595 /* |
|
1596 * Emit a U+003C LESS-THAN SIGN character token |
|
1597 */ |
|
1598 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); |
|
1599 /* |
|
1600 * and reconsume the current input character in |
|
1601 * the data state. |
|
1602 */ |
|
1603 cstart = pos; |
|
1604 reconsume = true; |
|
1605 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
1606 continue stateloop; |
|
1607 } |
|
1608 } |
|
1609 // FALL THROUGH DON'T REORDER |
|
1610 case TAG_NAME: |
|
1611 tagnameloop: for (;;) { |
|
1612 if (++pos == endPos) { |
|
1613 break stateloop; |
|
1614 } |
|
1615 c = checkChar(buf, pos); |
|
1616 /* |
|
1617 * Consume the next input character: |
|
1618 */ |
|
1619 switch (c) { |
|
1620 case '\r': |
|
1621 silentCarriageReturn(); |
|
1622 strBufToElementNameString(); |
|
1623 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); |
|
1624 break stateloop; |
|
1625 case '\n': |
|
1626 silentLineFeed(); |
|
1627 case ' ': |
|
1628 case '\t': |
|
1629 case '\u000C': |
|
1630 /* |
|
1631 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
1632 * (LF) U+000C FORM FEED (FF) U+0020 SPACE |
|
1633 * Switch to the before attribute name state. |
|
1634 */ |
|
1635 strBufToElementNameString(); |
|
1636 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); |
|
1637 break tagnameloop; |
|
1638 // continue stateloop; |
|
1639 case '/': |
|
1640 /* |
|
1641 * U+002F SOLIDUS (/) Switch to the self-closing |
|
1642 * start tag state. |
|
1643 */ |
|
1644 strBufToElementNameString(); |
|
1645 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); |
|
1646 continue stateloop; |
|
1647 case '>': |
|
1648 /* |
|
1649 * U+003E GREATER-THAN SIGN (>) Emit the current |
|
1650 * tag token. |
|
1651 */ |
|
1652 strBufToElementNameString(); |
|
1653 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); |
|
1654 if (shouldSuspend) { |
|
1655 break stateloop; |
|
1656 } |
|
1657 /* |
|
1658 * Switch to the data state. |
|
1659 */ |
|
1660 continue stateloop; |
|
1661 case '\u0000': |
|
1662 c = '\uFFFD'; |
|
1663 // fall thru |
|
1664 default: |
|
1665 if (c >= 'A' && c <= 'Z') { |
|
1666 /* |
|
1667 * U+0041 LATIN CAPITAL LETTER A through to |
|
1668 * U+005A LATIN CAPITAL LETTER Z Append the |
|
1669 * lowercase version of the current input |
|
1670 * character (add 0x0020 to the character's |
|
1671 * code point) to the current tag token's |
|
1672 * tag name. |
|
1673 */ |
|
1674 c += 0x20; |
|
1675 } |
|
1676 /* |
|
1677 * Anything else Append the current input |
|
1678 * character to the current tag token's tag |
|
1679 * name. |
|
1680 */ |
|
1681 appendStrBuf(c); |
|
1682 /* |
|
1683 * Stay in the tag name state. |
|
1684 */ |
|
1685 continue; |
|
1686 } |
|
1687 } |
|
1688 // FALLTHRU DON'T REORDER |
|
1689 case BEFORE_ATTRIBUTE_NAME: |
|
1690 beforeattributenameloop: for (;;) { |
|
1691 if (reconsume) { |
|
1692 reconsume = false; |
|
1693 } else { |
|
1694 if (++pos == endPos) { |
|
1695 break stateloop; |
|
1696 } |
|
1697 c = checkChar(buf, pos); |
|
1698 } |
|
1699 /* |
|
1700 * Consume the next input character: |
|
1701 */ |
|
1702 switch (c) { |
|
1703 case '\r': |
|
1704 silentCarriageReturn(); |
|
1705 break stateloop; |
|
1706 case '\n': |
|
1707 silentLineFeed(); |
|
1708 // fall thru |
|
1709 case ' ': |
|
1710 case '\t': |
|
1711 case '\u000C': |
|
1712 /* |
|
1713 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
1714 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay |
|
1715 * in the before attribute name state. |
|
1716 */ |
|
1717 continue; |
|
1718 case '/': |
|
1719 /* |
|
1720 * U+002F SOLIDUS (/) Switch to the self-closing |
|
1721 * start tag state. |
|
1722 */ |
|
1723 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); |
|
1724 continue stateloop; |
|
1725 case '>': |
|
1726 /* |
|
1727 * U+003E GREATER-THAN SIGN (>) Emit the current |
|
1728 * tag token. |
|
1729 */ |
|
1730 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); |
|
1731 if (shouldSuspend) { |
|
1732 break stateloop; |
|
1733 } |
|
1734 /* |
|
1735 * Switch to the data state. |
|
1736 */ |
|
1737 continue stateloop; |
|
1738 case '\u0000': |
|
1739 c = '\uFFFD'; |
|
1740 // fall thru |
|
1741 case '\"': |
|
1742 case '\'': |
|
1743 case '<': |
|
1744 case '=': |
|
1745 /* |
|
1746 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE |
|
1747 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS |
|
1748 * SIGN (=) Parse error. |
|
1749 */ |
|
1750 errBadCharBeforeAttributeNameOrNull(c); |
|
1751 /* |
|
1752 * Treat it as per the "anything else" entry |
|
1753 * below. |
|
1754 */ |
|
1755 default: |
|
1756 /* |
|
1757 * Anything else Start a new attribute in the |
|
1758 * current tag token. |
|
1759 */ |
|
1760 if (c >= 'A' && c <= 'Z') { |
|
1761 /* |
|
1762 * U+0041 LATIN CAPITAL LETTER A through to |
|
1763 * U+005A LATIN CAPITAL LETTER Z Set that |
|
1764 * attribute's name to the lowercase version |
|
1765 * of the current input character (add |
|
1766 * 0x0020 to the character's code point) |
|
1767 */ |
|
1768 c += 0x20; |
|
1769 } |
|
1770 /* |
|
1771 * Set that attribute's name to the current |
|
1772 * input character, |
|
1773 */ |
|
1774 clearStrBufAndAppend(c); |
|
1775 /* |
|
1776 * and its value to the empty string. |
|
1777 */ |
|
1778 // Will do later. |
|
1779 /* |
|
1780 * Switch to the attribute name state. |
|
1781 */ |
|
1782 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); |
|
1783 break beforeattributenameloop; |
|
1784 // continue stateloop; |
|
1785 } |
|
1786 } |
|
1787 // FALLTHRU DON'T REORDER |
|
1788 case ATTRIBUTE_NAME: |
|
1789 attributenameloop: for (;;) { |
|
1790 if (++pos == endPos) { |
|
1791 break stateloop; |
|
1792 } |
|
1793 c = checkChar(buf, pos); |
|
1794 /* |
|
1795 * Consume the next input character: |
|
1796 */ |
|
1797 switch (c) { |
|
1798 case '\r': |
|
1799 silentCarriageReturn(); |
|
1800 attributeNameComplete(); |
|
1801 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); |
|
1802 break stateloop; |
|
1803 case '\n': |
|
1804 silentLineFeed(); |
|
1805 // fall thru |
|
1806 case ' ': |
|
1807 case '\t': |
|
1808 case '\u000C': |
|
1809 /* |
|
1810 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
1811 * (LF) U+000C FORM FEED (FF) U+0020 SPACE |
|
1812 * Switch to the after attribute name state. |
|
1813 */ |
|
1814 attributeNameComplete(); |
|
1815 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); |
|
1816 continue stateloop; |
|
1817 case '/': |
|
1818 /* |
|
1819 * U+002F SOLIDUS (/) Switch to the self-closing |
|
1820 * start tag state. |
|
1821 */ |
|
1822 attributeNameComplete(); |
|
1823 addAttributeWithoutValue(); |
|
1824 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); |
|
1825 continue stateloop; |
|
1826 case '=': |
|
1827 /* |
|
1828 * U+003D EQUALS SIGN (=) Switch to the before |
|
1829 * attribute value state. |
|
1830 */ |
|
1831 attributeNameComplete(); |
|
1832 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); |
|
1833 break attributenameloop; |
|
1834 // continue stateloop; |
|
1835 case '>': |
|
1836 /* |
|
1837 * U+003E GREATER-THAN SIGN (>) Emit the current |
|
1838 * tag token. |
|
1839 */ |
|
1840 attributeNameComplete(); |
|
1841 addAttributeWithoutValue(); |
|
1842 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); |
|
1843 if (shouldSuspend) { |
|
1844 break stateloop; |
|
1845 } |
|
1846 /* |
|
1847 * Switch to the data state. |
|
1848 */ |
|
1849 continue stateloop; |
|
1850 case '\u0000': |
|
1851 c = '\uFFFD'; |
|
1852 // fall thru |
|
1853 case '\"': |
|
1854 case '\'': |
|
1855 case '<': |
|
1856 /* |
|
1857 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE |
|
1858 * (') U+003C LESS-THAN SIGN (<) Parse error. |
|
1859 */ |
|
1860 errQuoteOrLtInAttributeNameOrNull(c); |
|
1861 /* |
|
1862 * Treat it as per the "anything else" entry |
|
1863 * below. |
|
1864 */ |
|
1865 default: |
|
1866 if (c >= 'A' && c <= 'Z') { |
|
1867 /* |
|
1868 * U+0041 LATIN CAPITAL LETTER A through to |
|
1869 * U+005A LATIN CAPITAL LETTER Z Append the |
|
1870 * lowercase version of the current input |
|
1871 * character (add 0x0020 to the character's |
|
1872 * code point) to the current attribute's |
|
1873 * name. |
|
1874 */ |
|
1875 c += 0x20; |
|
1876 } |
|
1877 /* |
|
1878 * Anything else Append the current input |
|
1879 * character to the current attribute's name. |
|
1880 */ |
|
1881 appendStrBuf(c); |
|
1882 /* |
|
1883 * Stay in the attribute name state. |
|
1884 */ |
|
1885 continue; |
|
1886 } |
|
1887 } |
|
1888 // FALLTHRU DON'T REORDER |
|
1889 case BEFORE_ATTRIBUTE_VALUE: |
|
1890 beforeattributevalueloop: for (;;) { |
|
1891 if (++pos == endPos) { |
|
1892 break stateloop; |
|
1893 } |
|
1894 c = checkChar(buf, pos); |
|
1895 /* |
|
1896 * Consume the next input character: |
|
1897 */ |
|
1898 switch (c) { |
|
1899 case '\r': |
|
1900 silentCarriageReturn(); |
|
1901 break stateloop; |
|
1902 case '\n': |
|
1903 silentLineFeed(); |
|
1904 // fall thru |
|
1905 case ' ': |
|
1906 case '\t': |
|
1907 case '\u000C': |
|
1908 /* |
|
1909 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
1910 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay |
|
1911 * in the before attribute value state. |
|
1912 */ |
|
1913 continue; |
|
1914 case '"': |
|
1915 /* |
|
1916 * U+0022 QUOTATION MARK (") Switch to the |
|
1917 * attribute value (double-quoted) state. |
|
1918 */ |
|
1919 clearLongStrBuf(); |
|
1920 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos); |
|
1921 break beforeattributevalueloop; |
|
1922 // continue stateloop; |
|
1923 case '&': |
|
1924 /* |
|
1925 * U+0026 AMPERSAND (&) Switch to the attribute |
|
1926 * value (unquoted) state and reconsume this |
|
1927 * input character. |
|
1928 */ |
|
1929 clearLongStrBuf(); |
|
1930 reconsume = true; |
|
1931 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); |
|
1932 noteUnquotedAttributeValue(); |
|
1933 continue stateloop; |
|
1934 case '\'': |
|
1935 /* |
|
1936 * U+0027 APOSTROPHE (') Switch to the attribute |
|
1937 * value (single-quoted) state. |
|
1938 */ |
|
1939 clearLongStrBuf(); |
|
1940 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos); |
|
1941 continue stateloop; |
|
1942 case '>': |
|
1943 /* |
|
1944 * U+003E GREATER-THAN SIGN (>) Parse error. |
|
1945 */ |
|
1946 errAttributeValueMissing(); |
|
1947 /* |
|
1948 * Emit the current tag token. |
|
1949 */ |
|
1950 addAttributeWithoutValue(); |
|
1951 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); |
|
1952 if (shouldSuspend) { |
|
1953 break stateloop; |
|
1954 } |
|
1955 /* |
|
1956 * Switch to the data state. |
|
1957 */ |
|
1958 continue stateloop; |
|
1959 case '\u0000': |
|
1960 c = '\uFFFD'; |
|
1961 // fall thru |
|
1962 case '<': |
|
1963 case '=': |
|
1964 case '`': |
|
1965 /* |
|
1966 * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN |
|
1967 * (=) U+0060 GRAVE ACCENT (`) |
|
1968 */ |
|
1969 errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c); |
|
1970 /* |
|
1971 * Treat it as per the "anything else" entry |
|
1972 * below. |
|
1973 */ |
|
1974 default: |
|
1975 // [NOCPP[ |
|
1976 errHtml4NonNameInUnquotedAttribute(c); |
|
1977 // ]NOCPP] |
|
1978 /* |
|
1979 * Anything else Append the current input |
|
1980 * character to the current attribute's value. |
|
1981 */ |
|
1982 clearLongStrBufAndAppend(c); |
|
1983 /* |
|
1984 * Switch to the attribute value (unquoted) |
|
1985 * state. |
|
1986 */ |
|
1987 |
|
1988 state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); |
|
1989 noteUnquotedAttributeValue(); |
|
1990 continue stateloop; |
|
1991 } |
|
1992 } |
|
1993 // FALLTHRU DON'T REORDER |
|
1994 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: |
|
1995 attributevaluedoublequotedloop: for (;;) { |
|
1996 if (reconsume) { |
|
1997 reconsume = false; |
|
1998 } else { |
|
1999 if (++pos == endPos) { |
|
2000 break stateloop; |
|
2001 } |
|
2002 c = checkChar(buf, pos); |
|
2003 } |
|
2004 /* |
|
2005 * Consume the next input character: |
|
2006 */ |
|
2007 switch (c) { |
|
2008 case '"': |
|
2009 /* |
|
2010 * U+0022 QUOTATION MARK (") Switch to the after |
|
2011 * attribute value (quoted) state. |
|
2012 */ |
|
2013 addAttributeWithValue(); |
|
2014 |
|
2015 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); |
|
2016 break attributevaluedoublequotedloop; |
|
2017 // continue stateloop; |
|
2018 case '&': |
|
2019 /* |
|
2020 * U+0026 AMPERSAND (&) Switch to the character |
|
2021 * reference in attribute value state, with the |
|
2022 * additional allowed character being U+0022 |
|
2023 * QUOTATION MARK ("). |
|
2024 */ |
|
2025 clearStrBufAndAppend(c); |
|
2026 setAdditionalAndRememberAmpersandLocation('\"'); |
|
2027 returnState = state; |
|
2028 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); |
|
2029 continue stateloop; |
|
2030 case '\r': |
|
2031 appendLongStrBufCarriageReturn(); |
|
2032 break stateloop; |
|
2033 case '\n': |
|
2034 appendLongStrBufLineFeed(); |
|
2035 continue; |
|
2036 case '\u0000': |
|
2037 c = '\uFFFD'; |
|
2038 // fall thru |
|
2039 default: |
|
2040 /* |
|
2041 * Anything else Append the current input |
|
2042 * character to the current attribute's value. |
|
2043 */ |
|
2044 appendLongStrBuf(c); |
|
2045 /* |
|
2046 * Stay in the attribute value (double-quoted) |
|
2047 * state. |
|
2048 */ |
|
2049 continue; |
|
2050 } |
|
2051 } |
|
2052 // FALLTHRU DON'T REORDER |
|
2053 case AFTER_ATTRIBUTE_VALUE_QUOTED: |
|
2054 afterattributevaluequotedloop: for (;;) { |
|
2055 if (++pos == endPos) { |
|
2056 break stateloop; |
|
2057 } |
|
2058 c = checkChar(buf, pos); |
|
2059 /* |
|
2060 * Consume the next input character: |
|
2061 */ |
|
2062 switch (c) { |
|
2063 case '\r': |
|
2064 silentCarriageReturn(); |
|
2065 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); |
|
2066 break stateloop; |
|
2067 case '\n': |
|
2068 silentLineFeed(); |
|
2069 // fall thru |
|
2070 case ' ': |
|
2071 case '\t': |
|
2072 case '\u000C': |
|
2073 /* |
|
2074 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
2075 * (LF) U+000C FORM FEED (FF) U+0020 SPACE |
|
2076 * Switch to the before attribute name state. |
|
2077 */ |
|
2078 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); |
|
2079 continue stateloop; |
|
2080 case '/': |
|
2081 /* |
|
2082 * U+002F SOLIDUS (/) Switch to the self-closing |
|
2083 * start tag state. |
|
2084 */ |
|
2085 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); |
|
2086 break afterattributevaluequotedloop; |
|
2087 // continue stateloop; |
|
2088 case '>': |
|
2089 /* |
|
2090 * U+003E GREATER-THAN SIGN (>) Emit the current |
|
2091 * tag token. |
|
2092 */ |
|
2093 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); |
|
2094 if (shouldSuspend) { |
|
2095 break stateloop; |
|
2096 } |
|
2097 /* |
|
2098 * Switch to the data state. |
|
2099 */ |
|
2100 continue stateloop; |
|
2101 default: |
|
2102 /* |
|
2103 * Anything else Parse error. |
|
2104 */ |
|
2105 errNoSpaceBetweenAttributes(); |
|
2106 /* |
|
2107 * Reconsume the character in the before |
|
2108 * attribute name state. |
|
2109 */ |
|
2110 reconsume = true; |
|
2111 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); |
|
2112 continue stateloop; |
|
2113 } |
|
2114 } |
|
2115 // FALLTHRU DON'T REORDER |
|
2116 case SELF_CLOSING_START_TAG: |
|
2117 if (++pos == endPos) { |
|
2118 break stateloop; |
|
2119 } |
|
2120 c = checkChar(buf, pos); |
|
2121 /* |
|
2122 * Consume the next input character: |
|
2123 */ |
|
2124 switch (c) { |
|
2125 case '>': |
|
2126 /* |
|
2127 * U+003E GREATER-THAN SIGN (>) Set the self-closing |
|
2128 * flag of the current tag token. Emit the current |
|
2129 * tag token. |
|
2130 */ |
|
2131 // [NOCPP[ |
|
2132 errHtml4XmlVoidSyntax(); |
|
2133 // ]NOCPP] |
|
2134 state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos); |
|
2135 if (shouldSuspend) { |
|
2136 break stateloop; |
|
2137 } |
|
2138 /* |
|
2139 * Switch to the data state. |
|
2140 */ |
|
2141 continue stateloop; |
|
2142 default: |
|
2143 /* Anything else Parse error. */ |
|
2144 errSlashNotFollowedByGt(); |
|
2145 /* |
|
2146 * Reconsume the character in the before attribute |
|
2147 * name state. |
|
2148 */ |
|
2149 reconsume = true; |
|
2150 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); |
|
2151 continue stateloop; |
|
2152 } |
|
2153 // XXX reorder point |
|
2154 case ATTRIBUTE_VALUE_UNQUOTED: |
|
2155 for (;;) { |
|
2156 if (reconsume) { |
|
2157 reconsume = false; |
|
2158 } else { |
|
2159 if (++pos == endPos) { |
|
2160 break stateloop; |
|
2161 } |
|
2162 c = checkChar(buf, pos); |
|
2163 } |
|
2164 /* |
|
2165 * Consume the next input character: |
|
2166 */ |
|
2167 switch (c) { |
|
2168 case '\r': |
|
2169 silentCarriageReturn(); |
|
2170 addAttributeWithValue(); |
|
2171 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); |
|
2172 break stateloop; |
|
2173 case '\n': |
|
2174 silentLineFeed(); |
|
2175 // fall thru |
|
2176 case ' ': |
|
2177 case '\t': |
|
2178 case '\u000C': |
|
2179 /* |
|
2180 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
2181 * (LF) U+000C FORM FEED (FF) U+0020 SPACE |
|
2182 * Switch to the before attribute name state. |
|
2183 */ |
|
2184 addAttributeWithValue(); |
|
2185 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); |
|
2186 continue stateloop; |
|
2187 case '&': |
|
2188 /* |
|
2189 * U+0026 AMPERSAND (&) Switch to the character |
|
2190 * reference in attribute value state, with the |
|
2191 * additional allowed character being U+003E |
|
2192 * GREATER-THAN SIGN (>) |
|
2193 */ |
|
2194 clearStrBufAndAppend(c); |
|
2195 setAdditionalAndRememberAmpersandLocation('>'); |
|
2196 returnState = state; |
|
2197 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); |
|
2198 continue stateloop; |
|
2199 case '>': |
|
2200 /* |
|
2201 * U+003E GREATER-THAN SIGN (>) Emit the current |
|
2202 * tag token. |
|
2203 */ |
|
2204 addAttributeWithValue(); |
|
2205 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); |
|
2206 if (shouldSuspend) { |
|
2207 break stateloop; |
|
2208 } |
|
2209 /* |
|
2210 * Switch to the data state. |
|
2211 */ |
|
2212 continue stateloop; |
|
2213 case '\u0000': |
|
2214 c = '\uFFFD'; |
|
2215 // fall thru |
|
2216 case '<': |
|
2217 case '\"': |
|
2218 case '\'': |
|
2219 case '=': |
|
2220 case '`': |
|
2221 /* |
|
2222 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE |
|
2223 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS |
|
2224 * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error. |
|
2225 */ |
|
2226 errUnquotedAttributeValOrNull(c); |
|
2227 /* |
|
2228 * Treat it as per the "anything else" entry |
|
2229 * below. |
|
2230 */ |
|
2231 // fall through |
|
2232 default: |
|
2233 // [NOCPP[ |
|
2234 errHtml4NonNameInUnquotedAttribute(c); |
|
2235 // ]NOCPP] |
|
2236 /* |
|
2237 * Anything else Append the current input |
|
2238 * character to the current attribute's value. |
|
2239 */ |
|
2240 appendLongStrBuf(c); |
|
2241 /* |
|
2242 * Stay in the attribute value (unquoted) state. |
|
2243 */ |
|
2244 continue; |
|
2245 } |
|
2246 } |
|
2247 // XXX reorder point |
|
2248 case AFTER_ATTRIBUTE_NAME: |
|
2249 for (;;) { |
|
2250 if (++pos == endPos) { |
|
2251 break stateloop; |
|
2252 } |
|
2253 c = checkChar(buf, pos); |
|
2254 /* |
|
2255 * Consume the next input character: |
|
2256 */ |
|
2257 switch (c) { |
|
2258 case '\r': |
|
2259 silentCarriageReturn(); |
|
2260 break stateloop; |
|
2261 case '\n': |
|
2262 silentLineFeed(); |
|
2263 // fall thru |
|
2264 case ' ': |
|
2265 case '\t': |
|
2266 case '\u000C': |
|
2267 /* |
|
2268 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
2269 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay |
|
2270 * in the after attribute name state. |
|
2271 */ |
|
2272 continue; |
|
2273 case '/': |
|
2274 /* |
|
2275 * U+002F SOLIDUS (/) Switch to the self-closing |
|
2276 * start tag state. |
|
2277 */ |
|
2278 addAttributeWithoutValue(); |
|
2279 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); |
|
2280 continue stateloop; |
|
2281 case '=': |
|
2282 /* |
|
2283 * U+003D EQUALS SIGN (=) Switch to the before |
|
2284 * attribute value state. |
|
2285 */ |
|
2286 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); |
|
2287 continue stateloop; |
|
2288 case '>': |
|
2289 /* |
|
2290 * U+003E GREATER-THAN SIGN (>) Emit the current |
|
2291 * tag token. |
|
2292 */ |
|
2293 addAttributeWithoutValue(); |
|
2294 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); |
|
2295 if (shouldSuspend) { |
|
2296 break stateloop; |
|
2297 } |
|
2298 /* |
|
2299 * Switch to the data state. |
|
2300 */ |
|
2301 continue stateloop; |
|
2302 case '\u0000': |
|
2303 c = '\uFFFD'; |
|
2304 // fall thru |
|
2305 case '\"': |
|
2306 case '\'': |
|
2307 case '<': |
|
2308 errQuoteOrLtInAttributeNameOrNull(c); |
|
2309 /* |
|
2310 * Treat it as per the "anything else" entry |
|
2311 * below. |
|
2312 */ |
|
2313 default: |
|
2314 addAttributeWithoutValue(); |
|
2315 /* |
|
2316 * Anything else Start a new attribute in the |
|
2317 * current tag token. |
|
2318 */ |
|
2319 if (c >= 'A' && c <= 'Z') { |
|
2320 /* |
|
2321 * U+0041 LATIN CAPITAL LETTER A through to |
|
2322 * U+005A LATIN CAPITAL LETTER Z Set that |
|
2323 * attribute's name to the lowercase version |
|
2324 * of the current input character (add |
|
2325 * 0x0020 to the character's code point) |
|
2326 */ |
|
2327 c += 0x20; |
|
2328 } |
|
2329 /* |
|
2330 * Set that attribute's name to the current |
|
2331 * input character, |
|
2332 */ |
|
2333 clearStrBufAndAppend(c); |
|
2334 /* |
|
2335 * and its value to the empty string. |
|
2336 */ |
|
2337 // Will do later. |
|
2338 /* |
|
2339 * Switch to the attribute name state. |
|
2340 */ |
|
2341 state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); |
|
2342 continue stateloop; |
|
2343 } |
|
2344 } |
|
2345 // XXX reorder point |
|
2346 case MARKUP_DECLARATION_OPEN: |
|
2347 markupdeclarationopenloop: for (;;) { |
|
2348 if (++pos == endPos) { |
|
2349 break stateloop; |
|
2350 } |
|
2351 c = checkChar(buf, pos); |
|
2352 /* |
|
2353 * If the next two characters are both U+002D |
|
2354 * HYPHEN-MINUS characters (-), consume those two |
|
2355 * characters, create a comment token whose data is the |
|
2356 * empty string, and switch to the comment start state. |
|
2357 * |
|
2358 * Otherwise, if the next seven characters are an ASCII |
|
2359 * case-insensitive match for the word "DOCTYPE", then |
|
2360 * consume those characters and switch to the DOCTYPE |
|
2361 * state. |
|
2362 * |
|
2363 * Otherwise, if the insertion mode is |
|
2364 * "in foreign content" and the current node is not an |
|
2365 * element in the HTML namespace and the next seven |
|
2366 * characters are a case-sensitive match for the string |
|
2367 * "[CDATA[" (the five uppercase letters "CDATA" with a |
|
2368 * U+005B LEFT SQUARE BRACKET character before and |
|
2369 * after), then consume those characters and switch to |
|
2370 * the CDATA section state. |
|
2371 * |
|
2372 * Otherwise, it is a parse error. Switch to the bogus |
|
2373 * comment state. The next character that is consumed, |
|
2374 * if any, is the first character that will be in the |
|
2375 * comment. |
|
2376 */ |
|
2377 switch (c) { |
|
2378 case '-': |
|
2379 clearLongStrBufAndAppend(c); |
|
2380 state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos); |
|
2381 break markupdeclarationopenloop; |
|
2382 // continue stateloop; |
|
2383 case 'd': |
|
2384 case 'D': |
|
2385 clearLongStrBufAndAppend(c); |
|
2386 index = 0; |
|
2387 state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos); |
|
2388 continue stateloop; |
|
2389 case '[': |
|
2390 if (tokenHandler.cdataSectionAllowed()) { |
|
2391 clearLongStrBufAndAppend(c); |
|
2392 index = 0; |
|
2393 state = transition(state, Tokenizer.CDATA_START, reconsume, pos); |
|
2394 continue stateloop; |
|
2395 } |
|
2396 // else fall through |
|
2397 default: |
|
2398 errBogusComment(); |
|
2399 clearLongStrBuf(); |
|
2400 reconsume = true; |
|
2401 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); |
|
2402 continue stateloop; |
|
2403 } |
|
2404 } |
|
2405 // FALLTHRU DON'T REORDER |
|
2406 case MARKUP_DECLARATION_HYPHEN: |
|
2407 markupdeclarationhyphenloop: for (;;) { |
|
2408 if (++pos == endPos) { |
|
2409 break stateloop; |
|
2410 } |
|
2411 c = checkChar(buf, pos); |
|
2412 switch (c) { |
|
2413 case '\u0000': |
|
2414 break stateloop; |
|
2415 case '-': |
|
2416 clearLongStrBuf(); |
|
2417 state = transition(state, Tokenizer.COMMENT_START, reconsume, pos); |
|
2418 break markupdeclarationhyphenloop; |
|
2419 // continue stateloop; |
|
2420 default: |
|
2421 errBogusComment(); |
|
2422 reconsume = true; |
|
2423 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); |
|
2424 continue stateloop; |
|
2425 } |
|
2426 } |
|
2427 // FALLTHRU DON'T REORDER |
|
2428 case COMMENT_START: |
|
2429 commentstartloop: for (;;) { |
|
2430 if (++pos == endPos) { |
|
2431 break stateloop; |
|
2432 } |
|
2433 c = checkChar(buf, pos); |
|
2434 /* |
|
2435 * Comment start state |
|
2436 * |
|
2437 * |
|
2438 * Consume the next input character: |
|
2439 */ |
|
2440 switch (c) { |
|
2441 case '-': |
|
2442 /* |
|
2443 * U+002D HYPHEN-MINUS (-) Switch to the comment |
|
2444 * start dash state. |
|
2445 */ |
|
2446 appendLongStrBuf(c); |
|
2447 state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos); |
|
2448 continue stateloop; |
|
2449 case '>': |
|
2450 /* |
|
2451 * U+003E GREATER-THAN SIGN (>) Parse error. |
|
2452 */ |
|
2453 errPrematureEndOfComment(); |
|
2454 /* Emit the comment token. */ |
|
2455 emitComment(0, pos); |
|
2456 /* |
|
2457 * Switch to the data state. |
|
2458 */ |
|
2459 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
2460 continue stateloop; |
|
2461 case '\r': |
|
2462 appendLongStrBufCarriageReturn(); |
|
2463 state = transition(state, Tokenizer.COMMENT, reconsume, pos); |
|
2464 break stateloop; |
|
2465 case '\n': |
|
2466 appendLongStrBufLineFeed(); |
|
2467 state = transition(state, Tokenizer.COMMENT, reconsume, pos); |
|
2468 break commentstartloop; |
|
2469 case '\u0000': |
|
2470 c = '\uFFFD'; |
|
2471 // fall thru |
|
2472 default: |
|
2473 /* |
|
2474 * Anything else Append the input character to |
|
2475 * the comment token's data. |
|
2476 */ |
|
2477 appendLongStrBuf(c); |
|
2478 /* |
|
2479 * Switch to the comment state. |
|
2480 */ |
|
2481 state = transition(state, Tokenizer.COMMENT, reconsume, pos); |
|
2482 break commentstartloop; |
|
2483 // continue stateloop; |
|
2484 } |
|
2485 } |
|
2486 // FALLTHRU DON'T REORDER |
|
2487 case COMMENT: |
|
2488 commentloop: for (;;) { |
|
2489 if (++pos == endPos) { |
|
2490 break stateloop; |
|
2491 } |
|
2492 c = checkChar(buf, pos); |
|
2493 /* |
|
2494 * Comment state Consume the next input character: |
|
2495 */ |
|
2496 switch (c) { |
|
2497 case '-': |
|
2498 /* |
|
2499 * U+002D HYPHEN-MINUS (-) Switch to the comment |
|
2500 * end dash state |
|
2501 */ |
|
2502 appendLongStrBuf(c); |
|
2503 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); |
|
2504 break commentloop; |
|
2505 // continue stateloop; |
|
2506 case '\r': |
|
2507 appendLongStrBufCarriageReturn(); |
|
2508 break stateloop; |
|
2509 case '\n': |
|
2510 appendLongStrBufLineFeed(); |
|
2511 continue; |
|
2512 case '\u0000': |
|
2513 c = '\uFFFD'; |
|
2514 // fall thru |
|
2515 default: |
|
2516 /* |
|
2517 * Anything else Append the input character to |
|
2518 * the comment token's data. |
|
2519 */ |
|
2520 appendLongStrBuf(c); |
|
2521 /* |
|
2522 * Stay in the comment state. |
|
2523 */ |
|
2524 continue; |
|
2525 } |
|
2526 } |
|
2527 // FALLTHRU DON'T REORDER |
|
2528 case COMMENT_END_DASH: |
|
2529 commentenddashloop: for (;;) { |
|
2530 if (++pos == endPos) { |
|
2531 break stateloop; |
|
2532 } |
|
2533 c = checkChar(buf, pos); |
|
2534 /* |
|
2535 * Comment end dash state Consume the next input |
|
2536 * character: |
|
2537 */ |
|
2538 switch (c) { |
|
2539 case '-': |
|
2540 /* |
|
2541 * U+002D HYPHEN-MINUS (-) Switch to the comment |
|
2542 * end state |
|
2543 */ |
|
2544 appendLongStrBuf(c); |
|
2545 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); |
|
2546 break commentenddashloop; |
|
2547 // continue stateloop; |
|
2548 case '\r': |
|
2549 appendLongStrBufCarriageReturn(); |
|
2550 state = transition(state, Tokenizer.COMMENT, reconsume, pos); |
|
2551 break stateloop; |
|
2552 case '\n': |
|
2553 appendLongStrBufLineFeed(); |
|
2554 state = transition(state, Tokenizer.COMMENT, reconsume, pos); |
|
2555 continue stateloop; |
|
2556 case '\u0000': |
|
2557 c = '\uFFFD'; |
|
2558 // fall thru |
|
2559 default: |
|
2560 /* |
|
2561 * Anything else Append a U+002D HYPHEN-MINUS |
|
2562 * (-) character and the input character to the |
|
2563 * comment token's data. |
|
2564 */ |
|
2565 appendLongStrBuf(c); |
|
2566 /* |
|
2567 * Switch to the comment state. |
|
2568 */ |
|
2569 state = transition(state, Tokenizer.COMMENT, reconsume, pos); |
|
2570 continue stateloop; |
|
2571 } |
|
2572 } |
|
2573 // FALLTHRU DON'T REORDER |
|
2574 case COMMENT_END: |
|
2575 commentendloop: for (;;) { |
|
2576 if (++pos == endPos) { |
|
2577 break stateloop; |
|
2578 } |
|
2579 c = checkChar(buf, pos); |
|
2580 /* |
|
2581 * Comment end state Consume the next input |
|
2582 * character: |
|
2583 */ |
|
2584 switch (c) { |
|
2585 case '>': |
|
2586 /* |
|
2587 * U+003E GREATER-THAN SIGN (>) Emit the comment |
|
2588 * token. |
|
2589 */ |
|
2590 emitComment(2, pos); |
|
2591 /* |
|
2592 * Switch to the data state. |
|
2593 */ |
|
2594 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
2595 continue stateloop; |
|
2596 case '-': |
|
2597 /* U+002D HYPHEN-MINUS (-) Parse error. */ |
|
2598 /* |
|
2599 * Append a U+002D HYPHEN-MINUS (-) character to |
|
2600 * the comment token's data. |
|
2601 */ |
|
2602 adjustDoubleHyphenAndAppendToLongStrBufAndErr(c); |
|
2603 /* |
|
2604 * Stay in the comment end state. |
|
2605 */ |
|
2606 continue; |
|
2607 case '\r': |
|
2608 adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn(); |
|
2609 state = transition(state, Tokenizer.COMMENT, reconsume, pos); |
|
2610 break stateloop; |
|
2611 case '\n': |
|
2612 adjustDoubleHyphenAndAppendToLongStrBufLineFeed(); |
|
2613 state = transition(state, Tokenizer.COMMENT, reconsume, pos); |
|
2614 continue stateloop; |
|
2615 case '!': |
|
2616 errHyphenHyphenBang(); |
|
2617 appendLongStrBuf(c); |
|
2618 state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos); |
|
2619 continue stateloop; |
|
2620 case '\u0000': |
|
2621 c = '\uFFFD'; |
|
2622 // fall thru |
|
2623 default: |
|
2624 /* |
|
2625 * Append two U+002D HYPHEN-MINUS (-) characters |
|
2626 * and the input character to the comment |
|
2627 * token's data. |
|
2628 */ |
|
2629 adjustDoubleHyphenAndAppendToLongStrBufAndErr(c); |
|
2630 /* |
|
2631 * Switch to the comment state. |
|
2632 */ |
|
2633 state = transition(state, Tokenizer.COMMENT, reconsume, pos); |
|
2634 continue stateloop; |
|
2635 } |
|
2636 } |
|
2637 // XXX reorder point |
|
2638 case COMMENT_END_BANG: |
|
2639 for (;;) { |
|
2640 if (++pos == endPos) { |
|
2641 break stateloop; |
|
2642 } |
|
2643 c = checkChar(buf, pos); |
|
2644 /* |
|
2645 * Comment end bang state |
|
2646 * |
|
2647 * Consume the next input character: |
|
2648 */ |
|
2649 switch (c) { |
|
2650 case '>': |
|
2651 /* |
|
2652 * U+003E GREATER-THAN SIGN (>) Emit the comment |
|
2653 * token. |
|
2654 */ |
|
2655 emitComment(3, pos); |
|
2656 /* |
|
2657 * Switch to the data state. |
|
2658 */ |
|
2659 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
2660 continue stateloop; |
|
2661 case '-': |
|
2662 /* |
|
2663 * Append two U+002D HYPHEN-MINUS (-) characters |
|
2664 * and a U+0021 EXCLAMATION MARK (!) character |
|
2665 * to the comment token's data. |
|
2666 */ |
|
2667 appendLongStrBuf(c); |
|
2668 /* |
|
2669 * Switch to the comment end dash state. |
|
2670 */ |
|
2671 state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); |
|
2672 continue stateloop; |
|
2673 case '\r': |
|
2674 appendLongStrBufCarriageReturn(); |
|
2675 break stateloop; |
|
2676 case '\n': |
|
2677 appendLongStrBufLineFeed(); |
|
2678 continue; |
|
2679 case '\u0000': |
|
2680 c = '\uFFFD'; |
|
2681 // fall thru |
|
2682 default: |
|
2683 /* |
|
2684 * Anything else Append two U+002D HYPHEN-MINUS |
|
2685 * (-) characters, a U+0021 EXCLAMATION MARK (!) |
|
2686 * character, and the input character to the |
|
2687 * comment token's data. Switch to the comment |
|
2688 * state. |
|
2689 */ |
|
2690 appendLongStrBuf(c); |
|
2691 /* |
|
2692 * Switch to the comment state. |
|
2693 */ |
|
2694 state = transition(state, Tokenizer.COMMENT, reconsume, pos); |
|
2695 continue stateloop; |
|
2696 } |
|
2697 } |
|
2698 // XXX reorder point |
|
2699 case COMMENT_START_DASH: |
|
2700 if (++pos == endPos) { |
|
2701 break stateloop; |
|
2702 } |
|
2703 c = checkChar(buf, pos); |
|
2704 /* |
|
2705 * Comment start dash state |
|
2706 * |
|
2707 * Consume the next input character: |
|
2708 */ |
|
2709 switch (c) { |
|
2710 case '-': |
|
2711 /* |
|
2712 * U+002D HYPHEN-MINUS (-) Switch to the comment end |
|
2713 * state |
|
2714 */ |
|
2715 appendLongStrBuf(c); |
|
2716 state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); |
|
2717 continue stateloop; |
|
2718 case '>': |
|
2719 errPrematureEndOfComment(); |
|
2720 /* Emit the comment token. */ |
|
2721 emitComment(1, pos); |
|
2722 /* |
|
2723 * Switch to the data state. |
|
2724 */ |
|
2725 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
2726 continue stateloop; |
|
2727 case '\r': |
|
2728 appendLongStrBufCarriageReturn(); |
|
2729 state = transition(state, Tokenizer.COMMENT, reconsume, pos); |
|
2730 break stateloop; |
|
2731 case '\n': |
|
2732 appendLongStrBufLineFeed(); |
|
2733 state = transition(state, Tokenizer.COMMENT, reconsume, pos); |
|
2734 continue stateloop; |
|
2735 case '\u0000': |
|
2736 c = '\uFFFD'; |
|
2737 // fall thru |
|
2738 default: |
|
2739 /* |
|
2740 * Append a U+002D HYPHEN-MINUS character (-) and |
|
2741 * the current input character to the comment |
|
2742 * token's data. |
|
2743 */ |
|
2744 appendLongStrBuf(c); |
|
2745 /* |
|
2746 * Switch to the comment state. |
|
2747 */ |
|
2748 state = transition(state, Tokenizer.COMMENT, reconsume, pos); |
|
2749 continue stateloop; |
|
2750 } |
|
2751 // XXX reorder point |
|
2752 case CDATA_START: |
|
2753 for (;;) { |
|
2754 if (++pos == endPos) { |
|
2755 break stateloop; |
|
2756 } |
|
2757 c = checkChar(buf, pos); |
|
2758 if (index < 6) { // CDATA_LSQB.length |
|
2759 if (c == Tokenizer.CDATA_LSQB[index]) { |
|
2760 appendLongStrBuf(c); |
|
2761 } else { |
|
2762 errBogusComment(); |
|
2763 reconsume = true; |
|
2764 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); |
|
2765 continue stateloop; |
|
2766 } |
|
2767 index++; |
|
2768 continue; |
|
2769 } else { |
|
2770 cstart = pos; // start coalescing |
|
2771 reconsume = true; |
|
2772 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); |
|
2773 break; // FALL THROUGH continue stateloop; |
|
2774 } |
|
2775 } |
|
2776 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
2777 case CDATA_SECTION: |
|
2778 cdatasectionloop: for (;;) { |
|
2779 if (reconsume) { |
|
2780 reconsume = false; |
|
2781 } else { |
|
2782 if (++pos == endPos) { |
|
2783 break stateloop; |
|
2784 } |
|
2785 c = checkChar(buf, pos); |
|
2786 } |
|
2787 switch (c) { |
|
2788 case ']': |
|
2789 flushChars(buf, pos); |
|
2790 state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos); |
|
2791 break cdatasectionloop; // FALL THROUGH |
|
2792 case '\u0000': |
|
2793 emitReplacementCharacter(buf, pos); |
|
2794 continue; |
|
2795 case '\r': |
|
2796 emitCarriageReturn(buf, pos); |
|
2797 break stateloop; |
|
2798 case '\n': |
|
2799 silentLineFeed(); |
|
2800 // fall thru |
|
2801 default: |
|
2802 continue; |
|
2803 } |
|
2804 } |
|
2805 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
2806 case CDATA_RSQB: |
|
2807 cdatarsqb: for (;;) { |
|
2808 if (++pos == endPos) { |
|
2809 break stateloop; |
|
2810 } |
|
2811 c = checkChar(buf, pos); |
|
2812 switch (c) { |
|
2813 case ']': |
|
2814 state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos); |
|
2815 break cdatarsqb; |
|
2816 default: |
|
2817 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, |
|
2818 1); |
|
2819 cstart = pos; |
|
2820 reconsume = true; |
|
2821 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); |
|
2822 continue stateloop; |
|
2823 } |
|
2824 } |
|
2825 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
2826 case CDATA_RSQB_RSQB: |
|
2827 cdatarsqbrsqb: for (;;) { |
|
2828 if (++pos == endPos) { |
|
2829 break stateloop; |
|
2830 } |
|
2831 c = checkChar(buf, pos); |
|
2832 switch (c) { |
|
2833 case ']': |
|
2834 // Saw a third ]. Emit one ] (logically the |
|
2835 // first one) and stay in this state to |
|
2836 // remember that the last two characters seen |
|
2837 // have been ]]. |
|
2838 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); |
|
2839 continue; |
|
2840 case '>': |
|
2841 cstart = pos + 1; |
|
2842 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
2843 continue stateloop; |
|
2844 default: |
|
2845 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); |
|
2846 cstart = pos; |
|
2847 reconsume = true; |
|
2848 state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); |
|
2849 continue stateloop; |
|
2850 } |
|
2851 } |
|
2852 // XXX reorder point |
|
2853 case ATTRIBUTE_VALUE_SINGLE_QUOTED: |
|
2854 attributevaluesinglequotedloop: for (;;) { |
|
2855 if (reconsume) { |
|
2856 reconsume = false; |
|
2857 } else { |
|
2858 if (++pos == endPos) { |
|
2859 break stateloop; |
|
2860 } |
|
2861 c = checkChar(buf, pos); |
|
2862 } |
|
2863 /* |
|
2864 * Consume the next input character: |
|
2865 */ |
|
2866 switch (c) { |
|
2867 case '\'': |
|
2868 /* |
|
2869 * U+0027 APOSTROPHE (') Switch to the after |
|
2870 * attribute value (quoted) state. |
|
2871 */ |
|
2872 addAttributeWithValue(); |
|
2873 |
|
2874 state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); |
|
2875 continue stateloop; |
|
2876 case '&': |
|
2877 /* |
|
2878 * U+0026 AMPERSAND (&) Switch to the character |
|
2879 * reference in attribute value state, with the |
|
2880 * additional allowed character being U+0027 |
|
2881 * APOSTROPHE ('). |
|
2882 */ |
|
2883 clearStrBufAndAppend(c); |
|
2884 setAdditionalAndRememberAmpersandLocation('\''); |
|
2885 returnState = state; |
|
2886 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); |
|
2887 break attributevaluesinglequotedloop; |
|
2888 // continue stateloop; |
|
2889 case '\r': |
|
2890 appendLongStrBufCarriageReturn(); |
|
2891 break stateloop; |
|
2892 case '\n': |
|
2893 appendLongStrBufLineFeed(); |
|
2894 continue; |
|
2895 case '\u0000': |
|
2896 c = '\uFFFD'; |
|
2897 // fall thru |
|
2898 default: |
|
2899 /* |
|
2900 * Anything else Append the current input |
|
2901 * character to the current attribute's value. |
|
2902 */ |
|
2903 appendLongStrBuf(c); |
|
2904 /* |
|
2905 * Stay in the attribute value (single-quoted) |
|
2906 * state. |
|
2907 */ |
|
2908 continue; |
|
2909 } |
|
2910 } |
|
2911 // FALLTHRU DON'T REORDER |
|
2912 case CONSUME_CHARACTER_REFERENCE: |
|
2913 if (++pos == endPos) { |
|
2914 break stateloop; |
|
2915 } |
|
2916 c = checkChar(buf, pos); |
|
2917 if (c == '\u0000') { |
|
2918 break stateloop; |
|
2919 } |
|
2920 /* |
|
2921 * Unlike the definition in the spec, this state does not |
|
2922 * return a value and never requires the caller to |
|
2923 * backtrack. This state takes care of emitting characters |
|
2924 * or appending to the current attribute value. It also |
|
2925 * takes care of that in the case when consuming the |
|
2926 * character reference fails. |
|
2927 */ |
|
2928 /* |
|
2929 * This section defines how to consume a character |
|
2930 * reference. This definition is used when parsing character |
|
2931 * references in text and in attributes. |
|
2932 * |
|
2933 * The behavior depends on the identity of the next |
|
2934 * character (the one immediately after the U+0026 AMPERSAND |
|
2935 * character): |
|
2936 */ |
|
2937 switch (c) { |
|
2938 case ' ': |
|
2939 case '\t': |
|
2940 case '\n': |
|
2941 case '\r': // we'll reconsume! |
|
2942 case '\u000C': |
|
2943 case '<': |
|
2944 case '&': |
|
2945 emitOrAppendStrBuf(returnState); |
|
2946 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { |
|
2947 cstart = pos; |
|
2948 } |
|
2949 reconsume = true; |
|
2950 state = transition(state, returnState, reconsume, pos); |
|
2951 continue stateloop; |
|
2952 case '#': |
|
2953 /* |
|
2954 * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER |
|
2955 * SIGN. |
|
2956 */ |
|
2957 appendStrBuf('#'); |
|
2958 state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos); |
|
2959 continue stateloop; |
|
2960 default: |
|
2961 if (c == additional) { |
|
2962 emitOrAppendStrBuf(returnState); |
|
2963 reconsume = true; |
|
2964 state = transition(state, returnState, reconsume, pos); |
|
2965 continue stateloop; |
|
2966 } |
|
2967 if (c >= 'a' && c <= 'z') { |
|
2968 firstCharKey = c - 'a' + 26; |
|
2969 } else if (c >= 'A' && c <= 'Z') { |
|
2970 firstCharKey = c - 'A'; |
|
2971 } else { |
|
2972 // No match |
|
2973 /* |
|
2974 * If no match can be made, then this is a parse |
|
2975 * error. |
|
2976 */ |
|
2977 errNoNamedCharacterMatch(); |
|
2978 emitOrAppendStrBuf(returnState); |
|
2979 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { |
|
2980 cstart = pos; |
|
2981 } |
|
2982 reconsume = true; |
|
2983 state = transition(state, returnState, reconsume, pos); |
|
2984 continue stateloop; |
|
2985 } |
|
2986 // Didn't fail yet |
|
2987 appendStrBuf(c); |
|
2988 state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos); |
|
2989 // FALL THROUGH continue stateloop; |
|
2990 } |
|
2991 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
2992 case CHARACTER_REFERENCE_HILO_LOOKUP: |
|
2993 { |
|
2994 if (++pos == endPos) { |
|
2995 break stateloop; |
|
2996 } |
|
2997 c = checkChar(buf, pos); |
|
2998 if (c == '\u0000') { |
|
2999 break stateloop; |
|
3000 } |
|
3001 /* |
|
3002 * The data structure is as follows: |
|
3003 * |
|
3004 * HILO_ACCEL is a two-dimensional int array whose major |
|
3005 * index corresponds to the second character of the |
|
3006 * character reference (code point as index) and the |
|
3007 * minor index corresponds to the first character of the |
|
3008 * character reference (packed so that A-Z runs from 0 |
|
3009 * to 25 and a-z runs from 26 to 51). This layout makes |
|
3010 * it easier to use the sparseness of the data structure |
|
3011 * to omit parts of it: The second dimension of the |
|
3012 * table is null when no character reference starts with |
|
3013 * the character corresponding to that row. |
|
3014 * |
|
3015 * The int value HILO_ACCEL (by these indices) is zero |
|
3016 * if there exists no character reference starting with |
|
3017 * that two-letter prefix. Otherwise, the value is an |
|
3018 * int that packs two shorts so that the higher short is |
|
3019 * the index of the highest character reference name |
|
3020 * with that prefix in NAMES and the lower short |
|
3021 * corresponds to the index of the lowest character |
|
3022 * reference name with that prefix. (It happens that the |
|
3023 * first two character reference names share their |
|
3024 * prefix so the packed int cannot be 0 by packing the |
|
3025 * two shorts.) |
|
3026 * |
|
3027 * NAMES is an array of byte arrays where each byte |
|
3028 * array encodes the name of a character reference as |
|
3029 * ASCII. The names omit the first two letters of the |
|
3030 * name. (Since storing the first two letters would be |
|
3031 * redundant with the data contained in HILO_ACCEL.) The |
|
3032 * entries are lexically sorted. |
|
3033 * |
|
3034 * For a given index in NAMES, the same index in VALUES |
|
3035 * contains the corresponding expansion as an array of |
|
3036 * two UTF-16 code units (either the character and |
|
3037 * U+0000 or a surrogate pair). |
|
3038 */ |
|
3039 int hilo = 0; |
|
3040 if (c <= 'z') { |
|
3041 @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c]; |
|
3042 if (row != null) { |
|
3043 hilo = row[firstCharKey]; |
|
3044 } |
|
3045 } |
|
3046 if (hilo == 0) { |
|
3047 /* |
|
3048 * If no match can be made, then this is a parse |
|
3049 * error. |
|
3050 */ |
|
3051 errNoNamedCharacterMatch(); |
|
3052 emitOrAppendStrBuf(returnState); |
|
3053 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { |
|
3054 cstart = pos; |
|
3055 } |
|
3056 reconsume = true; |
|
3057 state = transition(state, returnState, reconsume, pos); |
|
3058 continue stateloop; |
|
3059 } |
|
3060 // Didn't fail yet |
|
3061 appendStrBuf(c); |
|
3062 lo = hilo & 0xFFFF; |
|
3063 hi = hilo >> 16; |
|
3064 entCol = -1; |
|
3065 candidate = -1; |
|
3066 strBufMark = 0; |
|
3067 state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos); |
|
3068 // FALL THROUGH continue stateloop; |
|
3069 } |
|
3070 case CHARACTER_REFERENCE_TAIL: |
|
3071 outer: for (;;) { |
|
3072 if (++pos == endPos) { |
|
3073 break stateloop; |
|
3074 } |
|
3075 c = checkChar(buf, pos); |
|
3076 if (c == '\u0000') { |
|
3077 break stateloop; |
|
3078 } |
|
3079 entCol++; |
|
3080 /* |
|
3081 * Consume the maximum number of characters possible, |
|
3082 * with the consumed characters matching one of the |
|
3083 * identifiers in the first column of the named |
|
3084 * character references table (in a case-sensitive |
|
3085 * manner). |
|
3086 */ |
|
3087 loloop: for (;;) { |
|
3088 if (hi < lo) { |
|
3089 break outer; |
|
3090 } |
|
3091 if (entCol == NamedCharacters.NAMES[lo].length()) { |
|
3092 candidate = lo; |
|
3093 strBufMark = strBufLen; |
|
3094 lo++; |
|
3095 } else if (entCol > NamedCharacters.NAMES[lo].length()) { |
|
3096 break outer; |
|
3097 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { |
|
3098 lo++; |
|
3099 } else { |
|
3100 break loloop; |
|
3101 } |
|
3102 } |
|
3103 |
|
3104 hiloop: for (;;) { |
|
3105 if (hi < lo) { |
|
3106 break outer; |
|
3107 } |
|
3108 if (entCol == NamedCharacters.NAMES[hi].length()) { |
|
3109 break hiloop; |
|
3110 } |
|
3111 if (entCol > NamedCharacters.NAMES[hi].length()) { |
|
3112 break outer; |
|
3113 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { |
|
3114 hi--; |
|
3115 } else { |
|
3116 break hiloop; |
|
3117 } |
|
3118 } |
|
3119 |
|
3120 if (c == ';') { |
|
3121 // If we see a semicolon, there cannot be a |
|
3122 // longer match. Break the loop. However, before |
|
3123 // breaking, take the longest match so far as the |
|
3124 // candidate, if we are just about to complete a |
|
3125 // match. |
|
3126 if (entCol + 1 == NamedCharacters.NAMES[lo].length()) { |
|
3127 candidate = lo; |
|
3128 strBufMark = strBufLen; |
|
3129 } |
|
3130 break outer; |
|
3131 } |
|
3132 |
|
3133 if (hi < lo) { |
|
3134 break outer; |
|
3135 } |
|
3136 appendStrBuf(c); |
|
3137 continue; |
|
3138 } |
|
3139 |
|
3140 if (candidate == -1) { |
|
3141 // reconsume deals with CR, LF or nul |
|
3142 /* |
|
3143 * If no match can be made, then this is a parse error. |
|
3144 */ |
|
3145 errNoNamedCharacterMatch(); |
|
3146 emitOrAppendStrBuf(returnState); |
|
3147 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { |
|
3148 cstart = pos; |
|
3149 } |
|
3150 reconsume = true; |
|
3151 state = transition(state, returnState, reconsume, pos); |
|
3152 continue stateloop; |
|
3153 } else { |
|
3154 // c can't be CR, LF or nul if we got here |
|
3155 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; |
|
3156 if (candidateName.length() == 0 |
|
3157 || candidateName.charAt(candidateName.length() - 1) != ';') { |
|
3158 /* |
|
3159 * If the last character matched is not a U+003B |
|
3160 * SEMICOLON (;), there is a parse error. |
|
3161 */ |
|
3162 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { |
|
3163 /* |
|
3164 * If the entity is being consumed as part of an |
|
3165 * attribute, and the last character matched is |
|
3166 * not a U+003B SEMICOLON (;), |
|
3167 */ |
|
3168 char ch; |
|
3169 if (strBufMark == strBufLen) { |
|
3170 ch = c; |
|
3171 } else { |
|
3172 // if (strBufOffset != -1) { |
|
3173 // ch = buf[strBufOffset + strBufMark]; |
|
3174 // } else { |
|
3175 ch = strBuf[strBufMark]; |
|
3176 // } |
|
3177 } |
|
3178 if (ch == '=' || (ch >= '0' && ch <= '9') |
|
3179 || (ch >= 'A' && ch <= 'Z') |
|
3180 || (ch >= 'a' && ch <= 'z')) { |
|
3181 /* |
|
3182 * and the next character is either a U+003D |
|
3183 * EQUALS SIGN character (=) or in the range |
|
3184 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, |
|
3185 * U+0041 LATIN CAPITAL LETTER A to U+005A |
|
3186 * LATIN CAPITAL LETTER Z, or U+0061 LATIN |
|
3187 * SMALL LETTER A to U+007A LATIN SMALL |
|
3188 * LETTER Z, then, for historical reasons, |
|
3189 * all the characters that were matched |
|
3190 * after the U+0026 AMPERSAND (&) must be |
|
3191 * unconsumed, and nothing is returned. |
|
3192 */ |
|
3193 errNoNamedCharacterMatch(); |
|
3194 appendStrBufToLongStrBuf(); |
|
3195 reconsume = true; |
|
3196 state = transition(state, returnState, reconsume, pos); |
|
3197 continue stateloop; |
|
3198 } |
|
3199 } |
|
3200 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { |
|
3201 errUnescapedAmpersandInterpretedAsCharacterReference(); |
|
3202 } else { |
|
3203 errNotSemicolonTerminated(); |
|
3204 } |
|
3205 } |
|
3206 |
|
3207 /* |
|
3208 * Otherwise, return a character token for the character |
|
3209 * corresponding to the entity name (as given by the |
|
3210 * second column of the named character references |
|
3211 * table). |
|
3212 */ |
|
3213 // CPPONLY: completedNamedCharacterReference(); |
|
3214 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; |
|
3215 if ( |
|
3216 // [NOCPP[ |
|
3217 val.length == 1 |
|
3218 // ]NOCPP] |
|
3219 // CPPONLY: val[1] == 0 |
|
3220 ) { |
|
3221 emitOrAppendOne(val, returnState); |
|
3222 } else { |
|
3223 emitOrAppendTwo(val, returnState); |
|
3224 } |
|
3225 // this is so complicated! |
|
3226 if (strBufMark < strBufLen) { |
|
3227 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { |
|
3228 for (int i = strBufMark; i < strBufLen; i++) { |
|
3229 appendLongStrBuf(strBuf[i]); |
|
3230 } |
|
3231 } else { |
|
3232 tokenHandler.characters(strBuf, strBufMark, |
|
3233 strBufLen - strBufMark); |
|
3234 } |
|
3235 } |
|
3236 // Check if we broke out early with c being the last |
|
3237 // character that matched as opposed to being the |
|
3238 // first one that didn't match. In the case of an |
|
3239 // early break, the next run on text should start |
|
3240 // *after* the current character and the current |
|
3241 // character shouldn't be reconsumed. |
|
3242 boolean earlyBreak = (c == ';' && strBufMark == strBufLen); |
|
3243 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { |
|
3244 cstart = earlyBreak ? pos + 1 : pos; |
|
3245 } |
|
3246 reconsume = !earlyBreak; |
|
3247 state = transition(state, returnState, reconsume, pos); |
|
3248 continue stateloop; |
|
3249 /* |
|
3250 * If the markup contains I'm ¬it; I tell you, the |
|
3251 * entity is parsed as "not", as in, I'm ¬it; I tell |
|
3252 * you. But if the markup was I'm ∉ I tell you, |
|
3253 * the entity would be parsed as "notin;", resulting in |
|
3254 * I'm ∉ I tell you. |
|
3255 */ |
|
3256 } |
|
3257 // XXX reorder point |
|
3258 case CONSUME_NCR: |
|
3259 if (++pos == endPos) { |
|
3260 break stateloop; |
|
3261 } |
|
3262 c = checkChar(buf, pos); |
|
3263 prevValue = -1; |
|
3264 value = 0; |
|
3265 seenDigits = false; |
|
3266 /* |
|
3267 * The behavior further depends on the character after the |
|
3268 * U+0023 NUMBER SIGN: |
|
3269 */ |
|
3270 switch (c) { |
|
3271 case 'x': |
|
3272 case 'X': |
|
3273 |
|
3274 /* |
|
3275 * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL |
|
3276 * LETTER X Consume the X. |
|
3277 * |
|
3278 * Follow the steps below, but using the range of |
|
3279 * characters U+0030 DIGIT ZERO through to U+0039 |
|
3280 * DIGIT NINE, U+0061 LATIN SMALL LETTER A through |
|
3281 * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN |
|
3282 * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL |
|
3283 * LETTER F (in other words, 0-9, A-F, a-f). |
|
3284 * |
|
3285 * When it comes to interpreting the number, |
|
3286 * interpret it as a hexadecimal number. |
|
3287 */ |
|
3288 appendStrBuf(c); |
|
3289 state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos); |
|
3290 continue stateloop; |
|
3291 default: |
|
3292 /* |
|
3293 * Anything else Follow the steps below, but using |
|
3294 * the range of characters U+0030 DIGIT ZERO through |
|
3295 * to U+0039 DIGIT NINE (i.e. just 0-9). |
|
3296 * |
|
3297 * When it comes to interpreting the number, |
|
3298 * interpret it as a decimal number. |
|
3299 */ |
|
3300 reconsume = true; |
|
3301 state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos); |
|
3302 // FALL THROUGH continue stateloop; |
|
3303 } |
|
3304 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
3305 case DECIMAL_NRC_LOOP: |
|
3306 decimalloop: for (;;) { |
|
3307 if (reconsume) { |
|
3308 reconsume = false; |
|
3309 } else { |
|
3310 if (++pos == endPos) { |
|
3311 break stateloop; |
|
3312 } |
|
3313 c = checkChar(buf, pos); |
|
3314 } |
|
3315 // Deal with overflow gracefully |
|
3316 if (value < prevValue) { |
|
3317 value = 0x110000; // Value above Unicode range but |
|
3318 // within int |
|
3319 // range |
|
3320 } |
|
3321 prevValue = value; |
|
3322 /* |
|
3323 * Consume as many characters as match the range of |
|
3324 * characters given above. |
|
3325 */ |
|
3326 if (c >= '0' && c <= '9') { |
|
3327 seenDigits = true; |
|
3328 value *= 10; |
|
3329 value += c - '0'; |
|
3330 continue; |
|
3331 } else if (c == ';') { |
|
3332 if (seenDigits) { |
|
3333 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { |
|
3334 cstart = pos + 1; |
|
3335 } |
|
3336 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); |
|
3337 // FALL THROUGH continue stateloop; |
|
3338 break decimalloop; |
|
3339 } else { |
|
3340 errNoDigitsInNCR(); |
|
3341 appendStrBuf(';'); |
|
3342 emitOrAppendStrBuf(returnState); |
|
3343 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { |
|
3344 cstart = pos + 1; |
|
3345 } |
|
3346 state = transition(state, returnState, reconsume, pos); |
|
3347 continue stateloop; |
|
3348 } |
|
3349 } else { |
|
3350 /* |
|
3351 * If no characters match the range, then don't |
|
3352 * consume any characters (and unconsume the U+0023 |
|
3353 * NUMBER SIGN character and, if appropriate, the X |
|
3354 * character). This is a parse error; nothing is |
|
3355 * returned. |
|
3356 * |
|
3357 * Otherwise, if the next character is a U+003B |
|
3358 * SEMICOLON, consume that too. If it isn't, there |
|
3359 * is a parse error. |
|
3360 */ |
|
3361 if (!seenDigits) { |
|
3362 errNoDigitsInNCR(); |
|
3363 emitOrAppendStrBuf(returnState); |
|
3364 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { |
|
3365 cstart = pos; |
|
3366 } |
|
3367 reconsume = true; |
|
3368 state = transition(state, returnState, reconsume, pos); |
|
3369 continue stateloop; |
|
3370 } else { |
|
3371 errCharRefLacksSemicolon(); |
|
3372 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { |
|
3373 cstart = pos; |
|
3374 } |
|
3375 reconsume = true; |
|
3376 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); |
|
3377 // FALL THROUGH continue stateloop; |
|
3378 break decimalloop; |
|
3379 } |
|
3380 } |
|
3381 } |
|
3382 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
3383 case HANDLE_NCR_VALUE: |
|
3384 // WARNING previous state sets reconsume |
|
3385 // XXX inline this case if the method size can take it |
|
3386 handleNcrValue(returnState); |
|
3387 state = transition(state, returnState, reconsume, pos); |
|
3388 continue stateloop; |
|
3389 // XXX reorder point |
|
3390 case HEX_NCR_LOOP: |
|
3391 for (;;) { |
|
3392 if (++pos == endPos) { |
|
3393 break stateloop; |
|
3394 } |
|
3395 c = checkChar(buf, pos); |
|
3396 // Deal with overflow gracefully |
|
3397 if (value < prevValue) { |
|
3398 value = 0x110000; // Value above Unicode range but |
|
3399 // within int |
|
3400 // range |
|
3401 } |
|
3402 prevValue = value; |
|
3403 /* |
|
3404 * Consume as many characters as match the range of |
|
3405 * characters given above. |
|
3406 */ |
|
3407 if (c >= '0' && c <= '9') { |
|
3408 seenDigits = true; |
|
3409 value *= 16; |
|
3410 value += c - '0'; |
|
3411 continue; |
|
3412 } else if (c >= 'A' && c <= 'F') { |
|
3413 seenDigits = true; |
|
3414 value *= 16; |
|
3415 value += c - 'A' + 10; |
|
3416 continue; |
|
3417 } else if (c >= 'a' && c <= 'f') { |
|
3418 seenDigits = true; |
|
3419 value *= 16; |
|
3420 value += c - 'a' + 10; |
|
3421 continue; |
|
3422 } else if (c == ';') { |
|
3423 if (seenDigits) { |
|
3424 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { |
|
3425 cstart = pos + 1; |
|
3426 } |
|
3427 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); |
|
3428 continue stateloop; |
|
3429 } else { |
|
3430 errNoDigitsInNCR(); |
|
3431 appendStrBuf(';'); |
|
3432 emitOrAppendStrBuf(returnState); |
|
3433 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { |
|
3434 cstart = pos + 1; |
|
3435 } |
|
3436 state = transition(state, returnState, reconsume, pos); |
|
3437 continue stateloop; |
|
3438 } |
|
3439 } else { |
|
3440 /* |
|
3441 * If no characters match the range, then don't |
|
3442 * consume any characters (and unconsume the U+0023 |
|
3443 * NUMBER SIGN character and, if appropriate, the X |
|
3444 * character). This is a parse error; nothing is |
|
3445 * returned. |
|
3446 * |
|
3447 * Otherwise, if the next character is a U+003B |
|
3448 * SEMICOLON, consume that too. If it isn't, there |
|
3449 * is a parse error. |
|
3450 */ |
|
3451 if (!seenDigits) { |
|
3452 errNoDigitsInNCR(); |
|
3453 emitOrAppendStrBuf(returnState); |
|
3454 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { |
|
3455 cstart = pos; |
|
3456 } |
|
3457 reconsume = true; |
|
3458 state = transition(state, returnState, reconsume, pos); |
|
3459 continue stateloop; |
|
3460 } else { |
|
3461 errCharRefLacksSemicolon(); |
|
3462 if ((returnState & DATA_AND_RCDATA_MASK) == 0) { |
|
3463 cstart = pos; |
|
3464 } |
|
3465 reconsume = true; |
|
3466 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); |
|
3467 continue stateloop; |
|
3468 } |
|
3469 } |
|
3470 } |
|
3471 // XXX reorder point |
|
3472 case PLAINTEXT: |
|
3473 plaintextloop: for (;;) { |
|
3474 if (reconsume) { |
|
3475 reconsume = false; |
|
3476 } else { |
|
3477 if (++pos == endPos) { |
|
3478 break stateloop; |
|
3479 } |
|
3480 c = checkChar(buf, pos); |
|
3481 } |
|
3482 switch (c) { |
|
3483 case '\u0000': |
|
3484 emitPlaintextReplacementCharacter(buf, pos); |
|
3485 continue; |
|
3486 case '\r': |
|
3487 emitCarriageReturn(buf, pos); |
|
3488 break stateloop; |
|
3489 case '\n': |
|
3490 silentLineFeed(); |
|
3491 default: |
|
3492 /* |
|
3493 * Anything else Emit the current input |
|
3494 * character as a character token. Stay in the |
|
3495 * PLAINTEXT state. |
|
3496 */ |
|
3497 continue; |
|
3498 } |
|
3499 } |
|
3500 // XXX reorder point |
|
3501 case CLOSE_TAG_OPEN: |
|
3502 if (++pos == endPos) { |
|
3503 break stateloop; |
|
3504 } |
|
3505 c = checkChar(buf, pos); |
|
3506 /* |
|
3507 * Otherwise, if the content model flag is set to the PCDATA |
|
3508 * state, or if the next few characters do match that tag |
|
3509 * name, consume the next input character: |
|
3510 */ |
|
3511 switch (c) { |
|
3512 case '>': |
|
3513 /* U+003E GREATER-THAN SIGN (>) Parse error. */ |
|
3514 errLtSlashGt(); |
|
3515 /* |
|
3516 * Switch to the data state. |
|
3517 */ |
|
3518 cstart = pos + 1; |
|
3519 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
3520 continue stateloop; |
|
3521 case '\r': |
|
3522 silentCarriageReturn(); |
|
3523 /* Anything else Parse error. */ |
|
3524 errGarbageAfterLtSlash(); |
|
3525 /* |
|
3526 * Switch to the bogus comment state. |
|
3527 */ |
|
3528 clearLongStrBufAndAppend('\n'); |
|
3529 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); |
|
3530 break stateloop; |
|
3531 case '\n': |
|
3532 silentLineFeed(); |
|
3533 /* Anything else Parse error. */ |
|
3534 errGarbageAfterLtSlash(); |
|
3535 /* |
|
3536 * Switch to the bogus comment state. |
|
3537 */ |
|
3538 clearLongStrBufAndAppend('\n'); |
|
3539 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); |
|
3540 continue stateloop; |
|
3541 case '\u0000': |
|
3542 c = '\uFFFD'; |
|
3543 // fall thru |
|
3544 default: |
|
3545 if (c >= 'A' && c <= 'Z') { |
|
3546 c += 0x20; |
|
3547 } |
|
3548 if (c >= 'a' && c <= 'z') { |
|
3549 /* |
|
3550 * U+0061 LATIN SMALL LETTER A through to U+007A |
|
3551 * LATIN SMALL LETTER Z Create a new end tag |
|
3552 * token, |
|
3553 */ |
|
3554 endTag = true; |
|
3555 /* |
|
3556 * set its tag name to the input character, |
|
3557 */ |
|
3558 clearStrBufAndAppend(c); |
|
3559 /* |
|
3560 * then switch to the tag name state. (Don't |
|
3561 * emit the token yet; further details will be |
|
3562 * filled in before it is emitted.) |
|
3563 */ |
|
3564 state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); |
|
3565 continue stateloop; |
|
3566 } else { |
|
3567 /* Anything else Parse error. */ |
|
3568 errGarbageAfterLtSlash(); |
|
3569 /* |
|
3570 * Switch to the bogus comment state. |
|
3571 */ |
|
3572 clearLongStrBufAndAppend(c); |
|
3573 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); |
|
3574 continue stateloop; |
|
3575 } |
|
3576 } |
|
3577 // XXX reorder point |
|
3578 case RCDATA: |
|
3579 rcdataloop: for (;;) { |
|
3580 if (reconsume) { |
|
3581 reconsume = false; |
|
3582 } else { |
|
3583 if (++pos == endPos) { |
|
3584 break stateloop; |
|
3585 } |
|
3586 c = checkChar(buf, pos); |
|
3587 } |
|
3588 switch (c) { |
|
3589 case '&': |
|
3590 /* |
|
3591 * U+0026 AMPERSAND (&) Switch to the character |
|
3592 * reference in RCDATA state. |
|
3593 */ |
|
3594 flushChars(buf, pos); |
|
3595 clearStrBufAndAppend(c); |
|
3596 additional = '\u0000'; |
|
3597 returnState = state; |
|
3598 state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); |
|
3599 continue stateloop; |
|
3600 case '<': |
|
3601 /* |
|
3602 * U+003C LESS-THAN SIGN (<) Switch to the |
|
3603 * RCDATA less-than sign state. |
|
3604 */ |
|
3605 flushChars(buf, pos); |
|
3606 |
|
3607 returnState = state; |
|
3608 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); |
|
3609 continue stateloop; |
|
3610 case '\u0000': |
|
3611 emitReplacementCharacter(buf, pos); |
|
3612 continue; |
|
3613 case '\r': |
|
3614 emitCarriageReturn(buf, pos); |
|
3615 break stateloop; |
|
3616 case '\n': |
|
3617 silentLineFeed(); |
|
3618 default: |
|
3619 /* |
|
3620 * Emit the current input character as a |
|
3621 * character token. Stay in the RCDATA state. |
|
3622 */ |
|
3623 continue; |
|
3624 } |
|
3625 } |
|
3626 // XXX reorder point |
|
3627 case RAWTEXT: |
|
3628 rawtextloop: for (;;) { |
|
3629 if (reconsume) { |
|
3630 reconsume = false; |
|
3631 } else { |
|
3632 if (++pos == endPos) { |
|
3633 break stateloop; |
|
3634 } |
|
3635 c = checkChar(buf, pos); |
|
3636 } |
|
3637 switch (c) { |
|
3638 case '<': |
|
3639 /* |
|
3640 * U+003C LESS-THAN SIGN (<) Switch to the |
|
3641 * RAWTEXT less-than sign state. |
|
3642 */ |
|
3643 flushChars(buf, pos); |
|
3644 |
|
3645 returnState = state; |
|
3646 state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); |
|
3647 break rawtextloop; |
|
3648 // FALL THRU continue stateloop; |
|
3649 case '\u0000': |
|
3650 emitReplacementCharacter(buf, pos); |
|
3651 continue; |
|
3652 case '\r': |
|
3653 emitCarriageReturn(buf, pos); |
|
3654 break stateloop; |
|
3655 case '\n': |
|
3656 silentLineFeed(); |
|
3657 default: |
|
3658 /* |
|
3659 * Emit the current input character as a |
|
3660 * character token. Stay in the RAWTEXT state. |
|
3661 */ |
|
3662 continue; |
|
3663 } |
|
3664 } |
|
3665 // XXX fallthru don't reorder |
|
3666 case RAWTEXT_RCDATA_LESS_THAN_SIGN: |
|
3667 rawtextrcdatalessthansignloop: for (;;) { |
|
3668 if (++pos == endPos) { |
|
3669 break stateloop; |
|
3670 } |
|
3671 c = checkChar(buf, pos); |
|
3672 switch (c) { |
|
3673 case '/': |
|
3674 /* |
|
3675 * U+002F SOLIDUS (/) Set the temporary buffer |
|
3676 * to the empty string. Switch to the script |
|
3677 * data end tag open state. |
|
3678 */ |
|
3679 index = 0; |
|
3680 clearStrBuf(); |
|
3681 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); |
|
3682 break rawtextrcdatalessthansignloop; |
|
3683 // FALL THRU continue stateloop; |
|
3684 default: |
|
3685 /* |
|
3686 * Otherwise, emit a U+003C LESS-THAN SIGN |
|
3687 * character token |
|
3688 */ |
|
3689 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); |
|
3690 /* |
|
3691 * and reconsume the current input character in |
|
3692 * the data state. |
|
3693 */ |
|
3694 cstart = pos; |
|
3695 reconsume = true; |
|
3696 state = transition(state, returnState, reconsume, pos); |
|
3697 continue stateloop; |
|
3698 } |
|
3699 } |
|
3700 // XXX fall thru. don't reorder. |
|
3701 case NON_DATA_END_TAG_NAME: |
|
3702 for (;;) { |
|
3703 if (++pos == endPos) { |
|
3704 break stateloop; |
|
3705 } |
|
3706 c = checkChar(buf, pos); |
|
3707 /* |
|
3708 * ASSERT! when entering this state, set index to 0 and |
|
3709 * call clearStrBuf() assert (contentModelElement != |
|
3710 * null); Let's implement the above without lookahead. |
|
3711 * strBuf is the 'temporary buffer'. |
|
3712 */ |
|
3713 if (index < endTagExpectationAsArray.length) { |
|
3714 char e = endTagExpectationAsArray[index]; |
|
3715 char folded = c; |
|
3716 if (c >= 'A' && c <= 'Z') { |
|
3717 folded += 0x20; |
|
3718 } |
|
3719 if (folded != e) { |
|
3720 // [NOCPP[ |
|
3721 errHtml4LtSlashInRcdata(folded); |
|
3722 // ]NOCPP] |
|
3723 tokenHandler.characters(Tokenizer.LT_SOLIDUS, |
|
3724 0, 2); |
|
3725 emitStrBuf(); |
|
3726 cstart = pos; |
|
3727 reconsume = true; |
|
3728 state = transition(state, returnState, reconsume, pos); |
|
3729 continue stateloop; |
|
3730 } |
|
3731 appendStrBuf(c); |
|
3732 index++; |
|
3733 continue; |
|
3734 } else { |
|
3735 endTag = true; |
|
3736 // XXX replace contentModelElement with different |
|
3737 // type |
|
3738 tagName = endTagExpectation; |
|
3739 switch (c) { |
|
3740 case '\r': |
|
3741 silentCarriageReturn(); |
|
3742 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); |
|
3743 break stateloop; |
|
3744 case '\n': |
|
3745 silentLineFeed(); |
|
3746 // fall thru |
|
3747 case ' ': |
|
3748 case '\t': |
|
3749 case '\u000C': |
|
3750 /* |
|
3751 * U+0009 CHARACTER TABULATION U+000A LINE |
|
3752 * FEED (LF) U+000C FORM FEED (FF) U+0020 |
|
3753 * SPACE If the current end tag token is an |
|
3754 * appropriate end tag token, then switch to |
|
3755 * the before attribute name state. |
|
3756 */ |
|
3757 state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); |
|
3758 continue stateloop; |
|
3759 case '/': |
|
3760 /* |
|
3761 * U+002F SOLIDUS (/) If the current end tag |
|
3762 * token is an appropriate end tag token, |
|
3763 * then switch to the self-closing start tag |
|
3764 * state. |
|
3765 */ |
|
3766 state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); |
|
3767 continue stateloop; |
|
3768 case '>': |
|
3769 /* |
|
3770 * U+003E GREATER-THAN SIGN (>) If the |
|
3771 * current end tag token is an appropriate |
|
3772 * end tag token, then emit the current tag |
|
3773 * token and switch to the data state. |
|
3774 */ |
|
3775 state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); |
|
3776 if (shouldSuspend) { |
|
3777 break stateloop; |
|
3778 } |
|
3779 continue stateloop; |
|
3780 default: |
|
3781 /* |
|
3782 * Emit a U+003C LESS-THAN SIGN character |
|
3783 * token, a U+002F SOLIDUS character token, |
|
3784 * a character token for each of the |
|
3785 * characters in the temporary buffer (in |
|
3786 * the order they were added to the buffer), |
|
3787 * and reconsume the current input character |
|
3788 * in the RAWTEXT state. |
|
3789 */ |
|
3790 // [NOCPP[ |
|
3791 errWarnLtSlashInRcdata(); |
|
3792 // ]NOCPP] |
|
3793 tokenHandler.characters( |
|
3794 Tokenizer.LT_SOLIDUS, 0, 2); |
|
3795 emitStrBuf(); |
|
3796 if (c == '\u0000') { |
|
3797 emitReplacementCharacter(buf, pos); |
|
3798 } else { |
|
3799 cstart = pos; // don't drop the |
|
3800 // character |
|
3801 } |
|
3802 state = transition(state, returnState, reconsume, pos); |
|
3803 continue stateloop; |
|
3804 } |
|
3805 } |
|
3806 } |
|
3807 // XXX reorder point |
|
3808 // BEGIN HOTSPOT WORKAROUND |
|
3809 case BOGUS_COMMENT: |
|
3810 boguscommentloop: for (;;) { |
|
3811 if (reconsume) { |
|
3812 reconsume = false; |
|
3813 } else { |
|
3814 if (++pos == endPos) { |
|
3815 break stateloop; |
|
3816 } |
|
3817 c = checkChar(buf, pos); |
|
3818 } |
|
3819 /* |
|
3820 * Consume every character up to and including the first |
|
3821 * U+003E GREATER-THAN SIGN character (>) or the end of |
|
3822 * the file (EOF), whichever comes first. Emit a comment |
|
3823 * token whose data is the concatenation of all the |
|
3824 * characters starting from and including the character |
|
3825 * that caused the state machine to switch into the |
|
3826 * bogus comment state, up to and including the |
|
3827 * character immediately before the last consumed |
|
3828 * character (i.e. up to the character just before the |
|
3829 * U+003E or EOF character). (If the comment was started |
|
3830 * by the end of the file (EOF), the token is empty.) |
|
3831 * |
|
3832 * Switch to the data state. |
|
3833 * |
|
3834 * If the end of the file was reached, reconsume the EOF |
|
3835 * character. |
|
3836 */ |
|
3837 switch (c) { |
|
3838 case '>': |
|
3839 emitComment(0, pos); |
|
3840 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
3841 continue stateloop; |
|
3842 case '-': |
|
3843 appendLongStrBuf(c); |
|
3844 state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos); |
|
3845 break boguscommentloop; |
|
3846 case '\r': |
|
3847 appendLongStrBufCarriageReturn(); |
|
3848 break stateloop; |
|
3849 case '\n': |
|
3850 appendLongStrBufLineFeed(); |
|
3851 continue; |
|
3852 case '\u0000': |
|
3853 c = '\uFFFD'; |
|
3854 // fall thru |
|
3855 default: |
|
3856 appendLongStrBuf(c); |
|
3857 continue; |
|
3858 } |
|
3859 } |
|
3860 // FALLTHRU DON'T REORDER |
|
3861 case BOGUS_COMMENT_HYPHEN: |
|
3862 boguscommenthyphenloop: for (;;) { |
|
3863 if (++pos == endPos) { |
|
3864 break stateloop; |
|
3865 } |
|
3866 c = checkChar(buf, pos); |
|
3867 switch (c) { |
|
3868 case '>': |
|
3869 // [NOCPP[ |
|
3870 maybeAppendSpaceToBogusComment(); |
|
3871 // ]NOCPP] |
|
3872 emitComment(0, pos); |
|
3873 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
3874 continue stateloop; |
|
3875 case '-': |
|
3876 appendSecondHyphenToBogusComment(); |
|
3877 continue boguscommenthyphenloop; |
|
3878 case '\r': |
|
3879 appendLongStrBufCarriageReturn(); |
|
3880 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); |
|
3881 break stateloop; |
|
3882 case '\n': |
|
3883 appendLongStrBufLineFeed(); |
|
3884 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); |
|
3885 continue stateloop; |
|
3886 case '\u0000': |
|
3887 c = '\uFFFD'; |
|
3888 // fall thru |
|
3889 default: |
|
3890 appendLongStrBuf(c); |
|
3891 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); |
|
3892 continue stateloop; |
|
3893 } |
|
3894 } |
|
3895 // XXX reorder point |
|
3896 case SCRIPT_DATA: |
|
3897 scriptdataloop: for (;;) { |
|
3898 if (reconsume) { |
|
3899 reconsume = false; |
|
3900 } else { |
|
3901 if (++pos == endPos) { |
|
3902 break stateloop; |
|
3903 } |
|
3904 c = checkChar(buf, pos); |
|
3905 } |
|
3906 switch (c) { |
|
3907 case '<': |
|
3908 /* |
|
3909 * U+003C LESS-THAN SIGN (<) Switch to the |
|
3910 * script data less-than sign state. |
|
3911 */ |
|
3912 flushChars(buf, pos); |
|
3913 returnState = state; |
|
3914 state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos); |
|
3915 break scriptdataloop; // FALL THRU continue |
|
3916 // stateloop; |
|
3917 case '\u0000': |
|
3918 emitReplacementCharacter(buf, pos); |
|
3919 continue; |
|
3920 case '\r': |
|
3921 emitCarriageReturn(buf, pos); |
|
3922 break stateloop; |
|
3923 case '\n': |
|
3924 silentLineFeed(); |
|
3925 default: |
|
3926 /* |
|
3927 * Anything else Emit the current input |
|
3928 * character as a character token. Stay in the |
|
3929 * script data state. |
|
3930 */ |
|
3931 continue; |
|
3932 } |
|
3933 } |
|
3934 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
3935 case SCRIPT_DATA_LESS_THAN_SIGN: |
|
3936 scriptdatalessthansignloop: for (;;) { |
|
3937 if (++pos == endPos) { |
|
3938 break stateloop; |
|
3939 } |
|
3940 c = checkChar(buf, pos); |
|
3941 switch (c) { |
|
3942 case '/': |
|
3943 /* |
|
3944 * U+002F SOLIDUS (/) Set the temporary buffer |
|
3945 * to the empty string. Switch to the script |
|
3946 * data end tag open state. |
|
3947 */ |
|
3948 index = 0; |
|
3949 clearStrBuf(); |
|
3950 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); |
|
3951 continue stateloop; |
|
3952 case '!': |
|
3953 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); |
|
3954 cstart = pos; |
|
3955 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos); |
|
3956 break scriptdatalessthansignloop; // FALL THRU |
|
3957 // continue |
|
3958 // stateloop; |
|
3959 default: |
|
3960 /* |
|
3961 * Otherwise, emit a U+003C LESS-THAN SIGN |
|
3962 * character token |
|
3963 */ |
|
3964 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); |
|
3965 /* |
|
3966 * and reconsume the current input character in |
|
3967 * the script data state. |
|
3968 */ |
|
3969 cstart = pos; |
|
3970 reconsume = true; |
|
3971 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); |
|
3972 continue stateloop; |
|
3973 } |
|
3974 } |
|
3975 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
3976 case SCRIPT_DATA_ESCAPE_START: |
|
3977 scriptdataescapestartloop: for (;;) { |
|
3978 if (++pos == endPos) { |
|
3979 break stateloop; |
|
3980 } |
|
3981 c = checkChar(buf, pos); |
|
3982 /* |
|
3983 * Consume the next input character: |
|
3984 */ |
|
3985 switch (c) { |
|
3986 case '-': |
|
3987 /* |
|
3988 * U+002D HYPHEN-MINUS (-) Emit a U+002D |
|
3989 * HYPHEN-MINUS character token. Switch to the |
|
3990 * script data escape start dash state. |
|
3991 */ |
|
3992 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos); |
|
3993 break scriptdataescapestartloop; // FALL THRU |
|
3994 // continue |
|
3995 // stateloop; |
|
3996 default: |
|
3997 /* |
|
3998 * Anything else Reconsume the current input |
|
3999 * character in the script data state. |
|
4000 */ |
|
4001 reconsume = true; |
|
4002 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); |
|
4003 continue stateloop; |
|
4004 } |
|
4005 } |
|
4006 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
4007 case SCRIPT_DATA_ESCAPE_START_DASH: |
|
4008 scriptdataescapestartdashloop: for (;;) { |
|
4009 if (++pos == endPos) { |
|
4010 break stateloop; |
|
4011 } |
|
4012 c = checkChar(buf, pos); |
|
4013 /* |
|
4014 * Consume the next input character: |
|
4015 */ |
|
4016 switch (c) { |
|
4017 case '-': |
|
4018 /* |
|
4019 * U+002D HYPHEN-MINUS (-) Emit a U+002D |
|
4020 * HYPHEN-MINUS character token. Switch to the |
|
4021 * script data escaped dash dash state. |
|
4022 */ |
|
4023 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); |
|
4024 break scriptdataescapestartdashloop; |
|
4025 // continue stateloop; |
|
4026 default: |
|
4027 /* |
|
4028 * Anything else Reconsume the current input |
|
4029 * character in the script data state. |
|
4030 */ |
|
4031 reconsume = true; |
|
4032 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); |
|
4033 continue stateloop; |
|
4034 } |
|
4035 } |
|
4036 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
4037 case SCRIPT_DATA_ESCAPED_DASH_DASH: |
|
4038 scriptdataescapeddashdashloop: for (;;) { |
|
4039 if (++pos == endPos) { |
|
4040 break stateloop; |
|
4041 } |
|
4042 c = checkChar(buf, pos); |
|
4043 /* |
|
4044 * Consume the next input character: |
|
4045 */ |
|
4046 switch (c) { |
|
4047 case '-': |
|
4048 /* |
|
4049 * U+002D HYPHEN-MINUS (-) Emit a U+002D |
|
4050 * HYPHEN-MINUS character token. Stay in the |
|
4051 * script data escaped dash dash state. |
|
4052 */ |
|
4053 continue; |
|
4054 case '<': |
|
4055 /* |
|
4056 * U+003C LESS-THAN SIGN (<) Switch to the |
|
4057 * script data escaped less-than sign state. |
|
4058 */ |
|
4059 flushChars(buf, pos); |
|
4060 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); |
|
4061 continue stateloop; |
|
4062 case '>': |
|
4063 /* |
|
4064 * U+003E GREATER-THAN SIGN (>) Emit a U+003E |
|
4065 * GREATER-THAN SIGN character token. Switch to |
|
4066 * the script data state. |
|
4067 */ |
|
4068 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); |
|
4069 continue stateloop; |
|
4070 case '\u0000': |
|
4071 emitReplacementCharacter(buf, pos); |
|
4072 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); |
|
4073 break scriptdataescapeddashdashloop; |
|
4074 case '\r': |
|
4075 emitCarriageReturn(buf, pos); |
|
4076 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); |
|
4077 break stateloop; |
|
4078 case '\n': |
|
4079 silentLineFeed(); |
|
4080 default: |
|
4081 /* |
|
4082 * Anything else Emit the current input |
|
4083 * character as a character token. Switch to the |
|
4084 * script data escaped state. |
|
4085 */ |
|
4086 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); |
|
4087 break scriptdataescapeddashdashloop; |
|
4088 // continue stateloop; |
|
4089 } |
|
4090 } |
|
4091 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
4092 case SCRIPT_DATA_ESCAPED: |
|
4093 scriptdataescapedloop: for (;;) { |
|
4094 if (reconsume) { |
|
4095 reconsume = false; |
|
4096 } else { |
|
4097 if (++pos == endPos) { |
|
4098 break stateloop; |
|
4099 } |
|
4100 c = checkChar(buf, pos); |
|
4101 } |
|
4102 /* |
|
4103 * Consume the next input character: |
|
4104 */ |
|
4105 switch (c) { |
|
4106 case '-': |
|
4107 /* |
|
4108 * U+002D HYPHEN-MINUS (-) Emit a U+002D |
|
4109 * HYPHEN-MINUS character token. Switch to the |
|
4110 * script data escaped dash state. |
|
4111 */ |
|
4112 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos); |
|
4113 break scriptdataescapedloop; // FALL THRU |
|
4114 // continue |
|
4115 // stateloop; |
|
4116 case '<': |
|
4117 /* |
|
4118 * U+003C LESS-THAN SIGN (<) Switch to the |
|
4119 * script data escaped less-than sign state. |
|
4120 */ |
|
4121 flushChars(buf, pos); |
|
4122 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); |
|
4123 continue stateloop; |
|
4124 case '\u0000': |
|
4125 emitReplacementCharacter(buf, pos); |
|
4126 continue; |
|
4127 case '\r': |
|
4128 emitCarriageReturn(buf, pos); |
|
4129 break stateloop; |
|
4130 case '\n': |
|
4131 silentLineFeed(); |
|
4132 default: |
|
4133 /* |
|
4134 * Anything else Emit the current input |
|
4135 * character as a character token. Stay in the |
|
4136 * script data escaped state. |
|
4137 */ |
|
4138 continue; |
|
4139 } |
|
4140 } |
|
4141 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
4142 case SCRIPT_DATA_ESCAPED_DASH: |
|
4143 scriptdataescapeddashloop: for (;;) { |
|
4144 if (++pos == endPos) { |
|
4145 break stateloop; |
|
4146 } |
|
4147 c = checkChar(buf, pos); |
|
4148 /* |
|
4149 * Consume the next input character: |
|
4150 */ |
|
4151 switch (c) { |
|
4152 case '-': |
|
4153 /* |
|
4154 * U+002D HYPHEN-MINUS (-) Emit a U+002D |
|
4155 * HYPHEN-MINUS character token. Switch to the |
|
4156 * script data escaped dash dash state. |
|
4157 */ |
|
4158 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); |
|
4159 continue stateloop; |
|
4160 case '<': |
|
4161 /* |
|
4162 * U+003C LESS-THAN SIGN (<) Switch to the |
|
4163 * script data escaped less-than sign state. |
|
4164 */ |
|
4165 flushChars(buf, pos); |
|
4166 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); |
|
4167 break scriptdataescapeddashloop; |
|
4168 // continue stateloop; |
|
4169 case '\u0000': |
|
4170 emitReplacementCharacter(buf, pos); |
|
4171 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); |
|
4172 continue stateloop; |
|
4173 case '\r': |
|
4174 emitCarriageReturn(buf, pos); |
|
4175 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); |
|
4176 break stateloop; |
|
4177 case '\n': |
|
4178 silentLineFeed(); |
|
4179 default: |
|
4180 /* |
|
4181 * Anything else Emit the current input |
|
4182 * character as a character token. Switch to the |
|
4183 * script data escaped state. |
|
4184 */ |
|
4185 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); |
|
4186 continue stateloop; |
|
4187 } |
|
4188 } |
|
4189 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
4190 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: |
|
4191 scriptdataescapedlessthanloop: for (;;) { |
|
4192 if (++pos == endPos) { |
|
4193 break stateloop; |
|
4194 } |
|
4195 c = checkChar(buf, pos); |
|
4196 /* |
|
4197 * Consume the next input character: |
|
4198 */ |
|
4199 switch (c) { |
|
4200 case '/': |
|
4201 /* |
|
4202 * U+002F SOLIDUS (/) Set the temporary buffer |
|
4203 * to the empty string. Switch to the script |
|
4204 * data escaped end tag open state. |
|
4205 */ |
|
4206 index = 0; |
|
4207 clearStrBuf(); |
|
4208 returnState = Tokenizer.SCRIPT_DATA_ESCAPED; |
|
4209 state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); |
|
4210 continue stateloop; |
|
4211 case 'S': |
|
4212 case 's': |
|
4213 /* |
|
4214 * U+0041 LATIN CAPITAL LETTER A through to |
|
4215 * U+005A LATIN CAPITAL LETTER Z Emit a U+003C |
|
4216 * LESS-THAN SIGN character token and the |
|
4217 * current input character as a character token. |
|
4218 */ |
|
4219 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); |
|
4220 cstart = pos; |
|
4221 index = 1; |
|
4222 /* |
|
4223 * Set the temporary buffer to the empty string. |
|
4224 * Append the lowercase version of the current |
|
4225 * input character (add 0x0020 to the |
|
4226 * character's code point) to the temporary |
|
4227 * buffer. Switch to the script data double |
|
4228 * escape start state. |
|
4229 */ |
|
4230 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos); |
|
4231 break scriptdataescapedlessthanloop; |
|
4232 // continue stateloop; |
|
4233 default: |
|
4234 /* |
|
4235 * Anything else Emit a U+003C LESS-THAN SIGN |
|
4236 * character token and reconsume the current |
|
4237 * input character in the script data escaped |
|
4238 * state. |
|
4239 */ |
|
4240 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); |
|
4241 cstart = pos; |
|
4242 reconsume = true; |
|
4243 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); |
|
4244 continue stateloop; |
|
4245 } |
|
4246 } |
|
4247 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
4248 case SCRIPT_DATA_DOUBLE_ESCAPE_START: |
|
4249 scriptdatadoubleescapestartloop: for (;;) { |
|
4250 if (++pos == endPos) { |
|
4251 break stateloop; |
|
4252 } |
|
4253 c = checkChar(buf, pos); |
|
4254 assert index > 0; |
|
4255 if (index < 6) { // SCRIPT_ARR.length |
|
4256 char folded = c; |
|
4257 if (c >= 'A' && c <= 'Z') { |
|
4258 folded += 0x20; |
|
4259 } |
|
4260 if (folded != Tokenizer.SCRIPT_ARR[index]) { |
|
4261 reconsume = true; |
|
4262 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); |
|
4263 continue stateloop; |
|
4264 } |
|
4265 index++; |
|
4266 continue; |
|
4267 } |
|
4268 switch (c) { |
|
4269 case '\r': |
|
4270 emitCarriageReturn(buf, pos); |
|
4271 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); |
|
4272 break stateloop; |
|
4273 case '\n': |
|
4274 silentLineFeed(); |
|
4275 case ' ': |
|
4276 case '\t': |
|
4277 case '\u000C': |
|
4278 case '/': |
|
4279 case '>': |
|
4280 /* |
|
4281 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
4282 * (LF) U+000C FORM FEED (FF) U+0020 SPACE |
|
4283 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN |
|
4284 * (>) Emit the current input character as a |
|
4285 * character token. If the temporary buffer is |
|
4286 * the string "script", then switch to the |
|
4287 * script data double escaped state. |
|
4288 */ |
|
4289 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); |
|
4290 break scriptdatadoubleescapestartloop; |
|
4291 // continue stateloop; |
|
4292 default: |
|
4293 /* |
|
4294 * Anything else Reconsume the current input |
|
4295 * character in the script data escaped state. |
|
4296 */ |
|
4297 reconsume = true; |
|
4298 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); |
|
4299 continue stateloop; |
|
4300 } |
|
4301 } |
|
4302 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
4303 case SCRIPT_DATA_DOUBLE_ESCAPED: |
|
4304 scriptdatadoubleescapedloop: for (;;) { |
|
4305 if (reconsume) { |
|
4306 reconsume = false; |
|
4307 } else { |
|
4308 if (++pos == endPos) { |
|
4309 break stateloop; |
|
4310 } |
|
4311 c = checkChar(buf, pos); |
|
4312 } |
|
4313 /* |
|
4314 * Consume the next input character: |
|
4315 */ |
|
4316 switch (c) { |
|
4317 case '-': |
|
4318 /* |
|
4319 * U+002D HYPHEN-MINUS (-) Emit a U+002D |
|
4320 * HYPHEN-MINUS character token. Switch to the |
|
4321 * script data double escaped dash state. |
|
4322 */ |
|
4323 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos); |
|
4324 break scriptdatadoubleescapedloop; // FALL THRU |
|
4325 // continue |
|
4326 // stateloop; |
|
4327 case '<': |
|
4328 /* |
|
4329 * U+003C LESS-THAN SIGN (<) Emit a U+003C |
|
4330 * LESS-THAN SIGN character token. Switch to the |
|
4331 * script data double escaped less-than sign |
|
4332 * state. |
|
4333 */ |
|
4334 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); |
|
4335 continue stateloop; |
|
4336 case '\u0000': |
|
4337 emitReplacementCharacter(buf, pos); |
|
4338 continue; |
|
4339 case '\r': |
|
4340 emitCarriageReturn(buf, pos); |
|
4341 break stateloop; |
|
4342 case '\n': |
|
4343 silentLineFeed(); |
|
4344 default: |
|
4345 /* |
|
4346 * Anything else Emit the current input |
|
4347 * character as a character token. Stay in the |
|
4348 * script data double escaped state. |
|
4349 */ |
|
4350 continue; |
|
4351 } |
|
4352 } |
|
4353 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
4354 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: |
|
4355 scriptdatadoubleescapeddashloop: for (;;) { |
|
4356 if (++pos == endPos) { |
|
4357 break stateloop; |
|
4358 } |
|
4359 c = checkChar(buf, pos); |
|
4360 /* |
|
4361 * Consume the next input character: |
|
4362 */ |
|
4363 switch (c) { |
|
4364 case '-': |
|
4365 /* |
|
4366 * U+002D HYPHEN-MINUS (-) Emit a U+002D |
|
4367 * HYPHEN-MINUS character token. Switch to the |
|
4368 * script data double escaped dash dash state. |
|
4369 */ |
|
4370 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos); |
|
4371 break scriptdatadoubleescapeddashloop; |
|
4372 // continue stateloop; |
|
4373 case '<': |
|
4374 /* |
|
4375 * U+003C LESS-THAN SIGN (<) Emit a U+003C |
|
4376 * LESS-THAN SIGN character token. Switch to the |
|
4377 * script data double escaped less-than sign |
|
4378 * state. |
|
4379 */ |
|
4380 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); |
|
4381 continue stateloop; |
|
4382 case '\u0000': |
|
4383 emitReplacementCharacter(buf, pos); |
|
4384 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); |
|
4385 continue stateloop; |
|
4386 case '\r': |
|
4387 emitCarriageReturn(buf, pos); |
|
4388 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); |
|
4389 break stateloop; |
|
4390 case '\n': |
|
4391 silentLineFeed(); |
|
4392 default: |
|
4393 /* |
|
4394 * Anything else Emit the current input |
|
4395 * character as a character token. Switch to the |
|
4396 * script data double escaped state. |
|
4397 */ |
|
4398 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); |
|
4399 continue stateloop; |
|
4400 } |
|
4401 } |
|
4402 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
4403 case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: |
|
4404 scriptdatadoubleescapeddashdashloop: for (;;) { |
|
4405 if (++pos == endPos) { |
|
4406 break stateloop; |
|
4407 } |
|
4408 c = checkChar(buf, pos); |
|
4409 /* |
|
4410 * Consume the next input character: |
|
4411 */ |
|
4412 switch (c) { |
|
4413 case '-': |
|
4414 /* |
|
4415 * U+002D HYPHEN-MINUS (-) Emit a U+002D |
|
4416 * HYPHEN-MINUS character token. Stay in the |
|
4417 * script data double escaped dash dash state. |
|
4418 */ |
|
4419 continue; |
|
4420 case '<': |
|
4421 /* |
|
4422 * U+003C LESS-THAN SIGN (<) Emit a U+003C |
|
4423 * LESS-THAN SIGN character token. Switch to the |
|
4424 * script data double escaped less-than sign |
|
4425 * state. |
|
4426 */ |
|
4427 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); |
|
4428 break scriptdatadoubleescapeddashdashloop; |
|
4429 case '>': |
|
4430 /* |
|
4431 * U+003E GREATER-THAN SIGN (>) Emit a U+003E |
|
4432 * GREATER-THAN SIGN character token. Switch to |
|
4433 * the script data state. |
|
4434 */ |
|
4435 state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); |
|
4436 continue stateloop; |
|
4437 case '\u0000': |
|
4438 emitReplacementCharacter(buf, pos); |
|
4439 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); |
|
4440 continue stateloop; |
|
4441 case '\r': |
|
4442 emitCarriageReturn(buf, pos); |
|
4443 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); |
|
4444 break stateloop; |
|
4445 case '\n': |
|
4446 silentLineFeed(); |
|
4447 default: |
|
4448 /* |
|
4449 * Anything else Emit the current input |
|
4450 * character as a character token. Switch to the |
|
4451 * script data double escaped state. |
|
4452 */ |
|
4453 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); |
|
4454 continue stateloop; |
|
4455 } |
|
4456 } |
|
4457 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
4458 case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: |
|
4459 scriptdatadoubleescapedlessthanloop: for (;;) { |
|
4460 if (++pos == endPos) { |
|
4461 break stateloop; |
|
4462 } |
|
4463 c = checkChar(buf, pos); |
|
4464 /* |
|
4465 * Consume the next input character: |
|
4466 */ |
|
4467 switch (c) { |
|
4468 case '/': |
|
4469 /* |
|
4470 * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS |
|
4471 * character token. Set the temporary buffer to |
|
4472 * the empty string. Switch to the script data |
|
4473 * double escape end state. |
|
4474 */ |
|
4475 index = 0; |
|
4476 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos); |
|
4477 break scriptdatadoubleescapedlessthanloop; |
|
4478 default: |
|
4479 /* |
|
4480 * Anything else Reconsume the current input |
|
4481 * character in the script data double escaped |
|
4482 * state. |
|
4483 */ |
|
4484 reconsume = true; |
|
4485 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); |
|
4486 continue stateloop; |
|
4487 } |
|
4488 } |
|
4489 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
4490 case SCRIPT_DATA_DOUBLE_ESCAPE_END: |
|
4491 scriptdatadoubleescapeendloop: for (;;) { |
|
4492 if (++pos == endPos) { |
|
4493 break stateloop; |
|
4494 } |
|
4495 c = checkChar(buf, pos); |
|
4496 if (index < 6) { // SCRIPT_ARR.length |
|
4497 char folded = c; |
|
4498 if (c >= 'A' && c <= 'Z') { |
|
4499 folded += 0x20; |
|
4500 } |
|
4501 if (folded != Tokenizer.SCRIPT_ARR[index]) { |
|
4502 reconsume = true; |
|
4503 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); |
|
4504 continue stateloop; |
|
4505 } |
|
4506 index++; |
|
4507 continue; |
|
4508 } |
|
4509 switch (c) { |
|
4510 case '\r': |
|
4511 emitCarriageReturn(buf, pos); |
|
4512 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); |
|
4513 break stateloop; |
|
4514 case '\n': |
|
4515 silentLineFeed(); |
|
4516 case ' ': |
|
4517 case '\t': |
|
4518 case '\u000C': |
|
4519 case '/': |
|
4520 case '>': |
|
4521 /* |
|
4522 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
4523 * (LF) U+000C FORM FEED (FF) U+0020 SPACE |
|
4524 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN |
|
4525 * (>) Emit the current input character as a |
|
4526 * character token. If the temporary buffer is |
|
4527 * the string "script", then switch to the |
|
4528 * script data escaped state. |
|
4529 */ |
|
4530 state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); |
|
4531 continue stateloop; |
|
4532 default: |
|
4533 /* |
|
4534 * Reconsume the current input character in the |
|
4535 * script data double escaped state. |
|
4536 */ |
|
4537 reconsume = true; |
|
4538 state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); |
|
4539 continue stateloop; |
|
4540 } |
|
4541 } |
|
4542 // XXX reorder point |
|
4543 case MARKUP_DECLARATION_OCTYPE: |
|
4544 markupdeclarationdoctypeloop: for (;;) { |
|
4545 if (++pos == endPos) { |
|
4546 break stateloop; |
|
4547 } |
|
4548 c = checkChar(buf, pos); |
|
4549 if (index < 6) { // OCTYPE.length |
|
4550 char folded = c; |
|
4551 if (c >= 'A' && c <= 'Z') { |
|
4552 folded += 0x20; |
|
4553 } |
|
4554 if (folded == Tokenizer.OCTYPE[index]) { |
|
4555 appendLongStrBuf(c); |
|
4556 } else { |
|
4557 errBogusComment(); |
|
4558 reconsume = true; |
|
4559 state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); |
|
4560 continue stateloop; |
|
4561 } |
|
4562 index++; |
|
4563 continue; |
|
4564 } else { |
|
4565 reconsume = true; |
|
4566 state = transition(state, Tokenizer.DOCTYPE, reconsume, pos); |
|
4567 break markupdeclarationdoctypeloop; |
|
4568 // continue stateloop; |
|
4569 } |
|
4570 } |
|
4571 // FALLTHRU DON'T REORDER |
|
4572 case DOCTYPE: |
|
4573 doctypeloop: for (;;) { |
|
4574 if (reconsume) { |
|
4575 reconsume = false; |
|
4576 } else { |
|
4577 if (++pos == endPos) { |
|
4578 break stateloop; |
|
4579 } |
|
4580 c = checkChar(buf, pos); |
|
4581 } |
|
4582 initDoctypeFields(); |
|
4583 /* |
|
4584 * Consume the next input character: |
|
4585 */ |
|
4586 switch (c) { |
|
4587 case '\r': |
|
4588 silentCarriageReturn(); |
|
4589 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); |
|
4590 break stateloop; |
|
4591 case '\n': |
|
4592 silentLineFeed(); |
|
4593 // fall thru |
|
4594 case ' ': |
|
4595 case '\t': |
|
4596 case '\u000C': |
|
4597 /* |
|
4598 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
4599 * (LF) U+000C FORM FEED (FF) U+0020 SPACE |
|
4600 * Switch to the before DOCTYPE name state. |
|
4601 */ |
|
4602 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); |
|
4603 break doctypeloop; |
|
4604 // continue stateloop; |
|
4605 default: |
|
4606 /* |
|
4607 * Anything else Parse error. |
|
4608 */ |
|
4609 errMissingSpaceBeforeDoctypeName(); |
|
4610 /* |
|
4611 * Reconsume the current character in the before |
|
4612 * DOCTYPE name state. |
|
4613 */ |
|
4614 reconsume = true; |
|
4615 state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); |
|
4616 break doctypeloop; |
|
4617 // continue stateloop; |
|
4618 } |
|
4619 } |
|
4620 // FALLTHRU DON'T REORDER |
|
4621 case BEFORE_DOCTYPE_NAME: |
|
4622 beforedoctypenameloop: for (;;) { |
|
4623 if (reconsume) { |
|
4624 reconsume = false; |
|
4625 } else { |
|
4626 if (++pos == endPos) { |
|
4627 break stateloop; |
|
4628 } |
|
4629 c = checkChar(buf, pos); |
|
4630 } |
|
4631 /* |
|
4632 * Consume the next input character: |
|
4633 */ |
|
4634 switch (c) { |
|
4635 case '\r': |
|
4636 silentCarriageReturn(); |
|
4637 break stateloop; |
|
4638 case '\n': |
|
4639 silentLineFeed(); |
|
4640 // fall thru |
|
4641 case ' ': |
|
4642 case '\t': |
|
4643 case '\u000C': |
|
4644 /* |
|
4645 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
4646 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay |
|
4647 * in the before DOCTYPE name state. |
|
4648 */ |
|
4649 continue; |
|
4650 case '>': |
|
4651 /* |
|
4652 * U+003E GREATER-THAN SIGN (>) Parse error. |
|
4653 */ |
|
4654 errNamelessDoctype(); |
|
4655 /* |
|
4656 * Create a new DOCTYPE token. Set its |
|
4657 * force-quirks flag to on. |
|
4658 */ |
|
4659 forceQuirks = true; |
|
4660 /* |
|
4661 * Emit the token. |
|
4662 */ |
|
4663 emitDoctypeToken(pos); |
|
4664 /* |
|
4665 * Switch to the data state. |
|
4666 */ |
|
4667 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
4668 continue stateloop; |
|
4669 case '\u0000': |
|
4670 c = '\uFFFD'; |
|
4671 // fall thru |
|
4672 default: |
|
4673 if (c >= 'A' && c <= 'Z') { |
|
4674 /* |
|
4675 * U+0041 LATIN CAPITAL LETTER A through to |
|
4676 * U+005A LATIN CAPITAL LETTER Z Create a |
|
4677 * new DOCTYPE token. Set the token's name |
|
4678 * to the lowercase version of the input |
|
4679 * character (add 0x0020 to the character's |
|
4680 * code point). |
|
4681 */ |
|
4682 c += 0x20; |
|
4683 } |
|
4684 /* Anything else Create a new DOCTYPE token. */ |
|
4685 /* |
|
4686 * Set the token's name to the current |
|
4687 * input character. |
|
4688 */ |
|
4689 clearStrBufAndAppend(c); |
|
4690 /* |
|
4691 * Switch to the DOCTYPE name state. |
|
4692 */ |
|
4693 state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos); |
|
4694 break beforedoctypenameloop; |
|
4695 // continue stateloop; |
|
4696 } |
|
4697 } |
|
4698 // FALLTHRU DON'T REORDER |
|
4699 case DOCTYPE_NAME: |
|
4700 doctypenameloop: for (;;) { |
|
4701 if (++pos == endPos) { |
|
4702 break stateloop; |
|
4703 } |
|
4704 c = checkChar(buf, pos); |
|
4705 /* |
|
4706 * Consume the next input character: |
|
4707 */ |
|
4708 switch (c) { |
|
4709 case '\r': |
|
4710 silentCarriageReturn(); |
|
4711 strBufToDoctypeName(); |
|
4712 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); |
|
4713 break stateloop; |
|
4714 case '\n': |
|
4715 silentLineFeed(); |
|
4716 // fall thru |
|
4717 case ' ': |
|
4718 case '\t': |
|
4719 case '\u000C': |
|
4720 /* |
|
4721 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
4722 * (LF) U+000C FORM FEED (FF) U+0020 SPACE |
|
4723 * Switch to the after DOCTYPE name state. |
|
4724 */ |
|
4725 strBufToDoctypeName(); |
|
4726 state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); |
|
4727 break doctypenameloop; |
|
4728 // continue stateloop; |
|
4729 case '>': |
|
4730 /* |
|
4731 * U+003E GREATER-THAN SIGN (>) Emit the current |
|
4732 * DOCTYPE token. |
|
4733 */ |
|
4734 strBufToDoctypeName(); |
|
4735 emitDoctypeToken(pos); |
|
4736 /* |
|
4737 * Switch to the data state. |
|
4738 */ |
|
4739 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
4740 continue stateloop; |
|
4741 case '\u0000': |
|
4742 c = '\uFFFD'; |
|
4743 // fall thru |
|
4744 default: |
|
4745 /* |
|
4746 * U+0041 LATIN CAPITAL LETTER A through to |
|
4747 * U+005A LATIN CAPITAL LETTER Z Append the |
|
4748 * lowercase version of the input character (add |
|
4749 * 0x0020 to the character's code point) to the |
|
4750 * current DOCTYPE token's name. |
|
4751 */ |
|
4752 if (c >= 'A' && c <= 'Z') { |
|
4753 c += 0x0020; |
|
4754 } |
|
4755 /* |
|
4756 * Anything else Append the current input |
|
4757 * character to the current DOCTYPE token's |
|
4758 * name. |
|
4759 */ |
|
4760 appendStrBuf(c); |
|
4761 /* |
|
4762 * Stay in the DOCTYPE name state. |
|
4763 */ |
|
4764 continue; |
|
4765 } |
|
4766 } |
|
4767 // FALLTHRU DON'T REORDER |
|
4768 case AFTER_DOCTYPE_NAME: |
|
4769 afterdoctypenameloop: for (;;) { |
|
4770 if (++pos == endPos) { |
|
4771 break stateloop; |
|
4772 } |
|
4773 c = checkChar(buf, pos); |
|
4774 /* |
|
4775 * Consume the next input character: |
|
4776 */ |
|
4777 switch (c) { |
|
4778 case '\r': |
|
4779 silentCarriageReturn(); |
|
4780 break stateloop; |
|
4781 case '\n': |
|
4782 silentLineFeed(); |
|
4783 // fall thru |
|
4784 case ' ': |
|
4785 case '\t': |
|
4786 case '\u000C': |
|
4787 /* |
|
4788 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
4789 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay |
|
4790 * in the after DOCTYPE name state. |
|
4791 */ |
|
4792 continue; |
|
4793 case '>': |
|
4794 /* |
|
4795 * U+003E GREATER-THAN SIGN (>) Emit the current |
|
4796 * DOCTYPE token. |
|
4797 */ |
|
4798 emitDoctypeToken(pos); |
|
4799 /* |
|
4800 * Switch to the data state. |
|
4801 */ |
|
4802 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
4803 continue stateloop; |
|
4804 case 'p': |
|
4805 case 'P': |
|
4806 index = 0; |
|
4807 state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos); |
|
4808 break afterdoctypenameloop; |
|
4809 // continue stateloop; |
|
4810 case 's': |
|
4811 case 'S': |
|
4812 index = 0; |
|
4813 state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos); |
|
4814 continue stateloop; |
|
4815 default: |
|
4816 /* |
|
4817 * Otherwise, this is the parse error. |
|
4818 */ |
|
4819 bogusDoctype(); |
|
4820 |
|
4821 /* |
|
4822 * Set the DOCTYPE token's force-quirks flag to |
|
4823 * on. |
|
4824 */ |
|
4825 // done by bogusDoctype(); |
|
4826 /* |
|
4827 * Switch to the bogus DOCTYPE state. |
|
4828 */ |
|
4829 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); |
|
4830 continue stateloop; |
|
4831 } |
|
4832 } |
|
4833 // FALLTHRU DON'T REORDER |
|
4834 case DOCTYPE_UBLIC: |
|
4835 doctypeublicloop: for (;;) { |
|
4836 if (++pos == endPos) { |
|
4837 break stateloop; |
|
4838 } |
|
4839 c = checkChar(buf, pos); |
|
4840 /* |
|
4841 * If the six characters starting from the current input |
|
4842 * character are an ASCII case-insensitive match for the |
|
4843 * word "PUBLIC", then consume those characters and |
|
4844 * switch to the before DOCTYPE public identifier state. |
|
4845 */ |
|
4846 if (index < 5) { // UBLIC.length |
|
4847 char folded = c; |
|
4848 if (c >= 'A' && c <= 'Z') { |
|
4849 folded += 0x20; |
|
4850 } |
|
4851 if (folded != Tokenizer.UBLIC[index]) { |
|
4852 bogusDoctype(); |
|
4853 // forceQuirks = true; |
|
4854 reconsume = true; |
|
4855 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); |
|
4856 continue stateloop; |
|
4857 } |
|
4858 index++; |
|
4859 continue; |
|
4860 } else { |
|
4861 reconsume = true; |
|
4862 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos); |
|
4863 break doctypeublicloop; |
|
4864 // continue stateloop; |
|
4865 } |
|
4866 } |
|
4867 // FALLTHRU DON'T REORDER |
|
4868 case AFTER_DOCTYPE_PUBLIC_KEYWORD: |
|
4869 afterdoctypepublickeywordloop: for (;;) { |
|
4870 if (reconsume) { |
|
4871 reconsume = false; |
|
4872 } else { |
|
4873 if (++pos == endPos) { |
|
4874 break stateloop; |
|
4875 } |
|
4876 c = checkChar(buf, pos); |
|
4877 } |
|
4878 /* |
|
4879 * Consume the next input character: |
|
4880 */ |
|
4881 switch (c) { |
|
4882 case '\r': |
|
4883 silentCarriageReturn(); |
|
4884 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); |
|
4885 break stateloop; |
|
4886 case '\n': |
|
4887 silentLineFeed(); |
|
4888 // fall thru |
|
4889 case ' ': |
|
4890 case '\t': |
|
4891 case '\u000C': |
|
4892 /* |
|
4893 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
4894 * (LF) U+000C FORM FEED (FF) U+0020 SPACE |
|
4895 * Switch to the before DOCTYPE public |
|
4896 * identifier state. |
|
4897 */ |
|
4898 state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); |
|
4899 break afterdoctypepublickeywordloop; |
|
4900 // FALL THROUGH continue stateloop |
|
4901 case '"': |
|
4902 /* |
|
4903 * U+0022 QUOTATION MARK (") Parse Error. |
|
4904 */ |
|
4905 errNoSpaceBetweenDoctypePublicKeywordAndQuote(); |
|
4906 /* |
|
4907 * Set the DOCTYPE token's public identifier to |
|
4908 * the empty string (not missing), |
|
4909 */ |
|
4910 clearLongStrBuf(); |
|
4911 /* |
|
4912 * then switch to the DOCTYPE public identifier |
|
4913 * (double-quoted) state. |
|
4914 */ |
|
4915 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); |
|
4916 continue stateloop; |
|
4917 case '\'': |
|
4918 /* |
|
4919 * U+0027 APOSTROPHE (') Parse Error. |
|
4920 */ |
|
4921 errNoSpaceBetweenDoctypePublicKeywordAndQuote(); |
|
4922 /* |
|
4923 * Set the DOCTYPE token's public identifier to |
|
4924 * the empty string (not missing), |
|
4925 */ |
|
4926 clearLongStrBuf(); |
|
4927 /* |
|
4928 * then switch to the DOCTYPE public identifier |
|
4929 * (single-quoted) state. |
|
4930 */ |
|
4931 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); |
|
4932 continue stateloop; |
|
4933 case '>': |
|
4934 /* U+003E GREATER-THAN SIGN (>) Parse error. */ |
|
4935 errExpectedPublicId(); |
|
4936 /* |
|
4937 * Set the DOCTYPE token's force-quirks flag to |
|
4938 * on. |
|
4939 */ |
|
4940 forceQuirks = true; |
|
4941 /* |
|
4942 * Emit that DOCTYPE token. |
|
4943 */ |
|
4944 emitDoctypeToken(pos); |
|
4945 /* |
|
4946 * Switch to the data state. |
|
4947 */ |
|
4948 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
4949 continue stateloop; |
|
4950 default: |
|
4951 bogusDoctype(); |
|
4952 /* |
|
4953 * Set the DOCTYPE token's force-quirks flag to |
|
4954 * on. |
|
4955 */ |
|
4956 // done by bogusDoctype(); |
|
4957 /* |
|
4958 * Switch to the bogus DOCTYPE state. |
|
4959 */ |
|
4960 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); |
|
4961 continue stateloop; |
|
4962 } |
|
4963 } |
|
4964 // FALLTHRU DON'T REORDER |
|
4965 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: |
|
4966 beforedoctypepublicidentifierloop: for (;;) { |
|
4967 if (++pos == endPos) { |
|
4968 break stateloop; |
|
4969 } |
|
4970 c = checkChar(buf, pos); |
|
4971 /* |
|
4972 * Consume the next input character: |
|
4973 */ |
|
4974 switch (c) { |
|
4975 case '\r': |
|
4976 silentCarriageReturn(); |
|
4977 break stateloop; |
|
4978 case '\n': |
|
4979 silentLineFeed(); |
|
4980 // fall thru |
|
4981 case ' ': |
|
4982 case '\t': |
|
4983 case '\u000C': |
|
4984 /* |
|
4985 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
4986 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay |
|
4987 * in the before DOCTYPE public identifier |
|
4988 * state. |
|
4989 */ |
|
4990 continue; |
|
4991 case '"': |
|
4992 /* |
|
4993 * U+0022 QUOTATION MARK (") Set the DOCTYPE |
|
4994 * token's public identifier to the empty string |
|
4995 * (not missing), |
|
4996 */ |
|
4997 clearLongStrBuf(); |
|
4998 /* |
|
4999 * then switch to the DOCTYPE public identifier |
|
5000 * (double-quoted) state. |
|
5001 */ |
|
5002 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); |
|
5003 break beforedoctypepublicidentifierloop; |
|
5004 // continue stateloop; |
|
5005 case '\'': |
|
5006 /* |
|
5007 * U+0027 APOSTROPHE (') Set the DOCTYPE token's |
|
5008 * public identifier to the empty string (not |
|
5009 * missing), |
|
5010 */ |
|
5011 clearLongStrBuf(); |
|
5012 /* |
|
5013 * then switch to the DOCTYPE public identifier |
|
5014 * (single-quoted) state. |
|
5015 */ |
|
5016 state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); |
|
5017 continue stateloop; |
|
5018 case '>': |
|
5019 /* U+003E GREATER-THAN SIGN (>) Parse error. */ |
|
5020 errExpectedPublicId(); |
|
5021 /* |
|
5022 * Set the DOCTYPE token's force-quirks flag to |
|
5023 * on. |
|
5024 */ |
|
5025 forceQuirks = true; |
|
5026 /* |
|
5027 * Emit that DOCTYPE token. |
|
5028 */ |
|
5029 emitDoctypeToken(pos); |
|
5030 /* |
|
5031 * Switch to the data state. |
|
5032 */ |
|
5033 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
5034 continue stateloop; |
|
5035 default: |
|
5036 bogusDoctype(); |
|
5037 /* |
|
5038 * Set the DOCTYPE token's force-quirks flag to |
|
5039 * on. |
|
5040 */ |
|
5041 // done by bogusDoctype(); |
|
5042 /* |
|
5043 * Switch to the bogus DOCTYPE state. |
|
5044 */ |
|
5045 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); |
|
5046 continue stateloop; |
|
5047 } |
|
5048 } |
|
5049 // FALLTHRU DON'T REORDER |
|
5050 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: |
|
5051 doctypepublicidentifierdoublequotedloop: for (;;) { |
|
5052 if (++pos == endPos) { |
|
5053 break stateloop; |
|
5054 } |
|
5055 c = checkChar(buf, pos); |
|
5056 /* |
|
5057 * Consume the next input character: |
|
5058 */ |
|
5059 switch (c) { |
|
5060 case '"': |
|
5061 /* |
|
5062 * U+0022 QUOTATION MARK (") Switch to the after |
|
5063 * DOCTYPE public identifier state. |
|
5064 */ |
|
5065 publicIdentifier = longStrBufToString(); |
|
5066 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); |
|
5067 break doctypepublicidentifierdoublequotedloop; |
|
5068 // continue stateloop; |
|
5069 case '>': |
|
5070 /* |
|
5071 * U+003E GREATER-THAN SIGN (>) Parse error. |
|
5072 */ |
|
5073 errGtInPublicId(); |
|
5074 /* |
|
5075 * Set the DOCTYPE token's force-quirks flag to |
|
5076 * on. |
|
5077 */ |
|
5078 forceQuirks = true; |
|
5079 /* |
|
5080 * Emit that DOCTYPE token. |
|
5081 */ |
|
5082 publicIdentifier = longStrBufToString(); |
|
5083 emitDoctypeToken(pos); |
|
5084 /* |
|
5085 * Switch to the data state. |
|
5086 */ |
|
5087 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
5088 continue stateloop; |
|
5089 case '\r': |
|
5090 appendLongStrBufCarriageReturn(); |
|
5091 break stateloop; |
|
5092 case '\n': |
|
5093 appendLongStrBufLineFeed(); |
|
5094 continue; |
|
5095 case '\u0000': |
|
5096 c = '\uFFFD'; |
|
5097 // fall thru |
|
5098 default: |
|
5099 /* |
|
5100 * Anything else Append the current input |
|
5101 * character to the current DOCTYPE token's |
|
5102 * public identifier. |
|
5103 */ |
|
5104 appendLongStrBuf(c); |
|
5105 /* |
|
5106 * Stay in the DOCTYPE public identifier |
|
5107 * (double-quoted) state. |
|
5108 */ |
|
5109 continue; |
|
5110 } |
|
5111 } |
|
5112 // FALLTHRU DON'T REORDER |
|
5113 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: |
|
5114 afterdoctypepublicidentifierloop: for (;;) { |
|
5115 if (++pos == endPos) { |
|
5116 break stateloop; |
|
5117 } |
|
5118 c = checkChar(buf, pos); |
|
5119 /* |
|
5120 * Consume the next input character: |
|
5121 */ |
|
5122 switch (c) { |
|
5123 case '\r': |
|
5124 silentCarriageReturn(); |
|
5125 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); |
|
5126 break stateloop; |
|
5127 case '\n': |
|
5128 silentLineFeed(); |
|
5129 // fall thru |
|
5130 case ' ': |
|
5131 case '\t': |
|
5132 case '\u000C': |
|
5133 /* |
|
5134 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
5135 * (LF) U+000C FORM FEED (FF) U+0020 SPACE |
|
5136 * Switch to the between DOCTYPE public and |
|
5137 * system identifiers state. |
|
5138 */ |
|
5139 state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); |
|
5140 break afterdoctypepublicidentifierloop; |
|
5141 // continue stateloop; |
|
5142 case '>': |
|
5143 /* |
|
5144 * U+003E GREATER-THAN SIGN (>) Emit the current |
|
5145 * DOCTYPE token. |
|
5146 */ |
|
5147 emitDoctypeToken(pos); |
|
5148 /* |
|
5149 * Switch to the data state. |
|
5150 */ |
|
5151 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
5152 continue stateloop; |
|
5153 case '"': |
|
5154 /* |
|
5155 * U+0022 QUOTATION MARK (") Parse error. |
|
5156 */ |
|
5157 errNoSpaceBetweenPublicAndSystemIds(); |
|
5158 /* |
|
5159 * Set the DOCTYPE token's system identifier to |
|
5160 * the empty string (not missing), |
|
5161 */ |
|
5162 clearLongStrBuf(); |
|
5163 /* |
|
5164 * then switch to the DOCTYPE system identifier |
|
5165 * (double-quoted) state. |
|
5166 */ |
|
5167 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); |
|
5168 continue stateloop; |
|
5169 case '\'': |
|
5170 /* |
|
5171 * U+0027 APOSTROPHE (') Parse error. |
|
5172 */ |
|
5173 errNoSpaceBetweenPublicAndSystemIds(); |
|
5174 /* |
|
5175 * Set the DOCTYPE token's system identifier to |
|
5176 * the empty string (not missing), |
|
5177 */ |
|
5178 clearLongStrBuf(); |
|
5179 /* |
|
5180 * then switch to the DOCTYPE system identifier |
|
5181 * (single-quoted) state. |
|
5182 */ |
|
5183 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); |
|
5184 continue stateloop; |
|
5185 default: |
|
5186 bogusDoctype(); |
|
5187 /* |
|
5188 * Set the DOCTYPE token's force-quirks flag to |
|
5189 * on. |
|
5190 */ |
|
5191 // done by bogusDoctype(); |
|
5192 /* |
|
5193 * Switch to the bogus DOCTYPE state. |
|
5194 */ |
|
5195 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); |
|
5196 continue stateloop; |
|
5197 } |
|
5198 } |
|
5199 // FALLTHRU DON'T REORDER |
|
5200 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: |
|
5201 betweendoctypepublicandsystemidentifiersloop: for (;;) { |
|
5202 if (++pos == endPos) { |
|
5203 break stateloop; |
|
5204 } |
|
5205 c = checkChar(buf, pos); |
|
5206 /* |
|
5207 * Consume the next input character: |
|
5208 */ |
|
5209 switch (c) { |
|
5210 case '\r': |
|
5211 silentCarriageReturn(); |
|
5212 break stateloop; |
|
5213 case '\n': |
|
5214 silentLineFeed(); |
|
5215 // fall thru |
|
5216 case ' ': |
|
5217 case '\t': |
|
5218 case '\u000C': |
|
5219 /* |
|
5220 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
5221 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay |
|
5222 * in the between DOCTYPE public and system |
|
5223 * identifiers state. |
|
5224 */ |
|
5225 continue; |
|
5226 case '>': |
|
5227 /* |
|
5228 * U+003E GREATER-THAN SIGN (>) Emit the current |
|
5229 * DOCTYPE token. |
|
5230 */ |
|
5231 emitDoctypeToken(pos); |
|
5232 /* |
|
5233 * Switch to the data state. |
|
5234 */ |
|
5235 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
5236 continue stateloop; |
|
5237 case '"': |
|
5238 /* |
|
5239 * U+0022 QUOTATION MARK (") Set the DOCTYPE |
|
5240 * token's system identifier to the empty string |
|
5241 * (not missing), |
|
5242 */ |
|
5243 clearLongStrBuf(); |
|
5244 /* |
|
5245 * then switch to the DOCTYPE system identifier |
|
5246 * (double-quoted) state. |
|
5247 */ |
|
5248 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); |
|
5249 break betweendoctypepublicandsystemidentifiersloop; |
|
5250 // continue stateloop; |
|
5251 case '\'': |
|
5252 /* |
|
5253 * U+0027 APOSTROPHE (') Set the DOCTYPE token's |
|
5254 * system identifier to the empty string (not |
|
5255 * missing), |
|
5256 */ |
|
5257 clearLongStrBuf(); |
|
5258 /* |
|
5259 * then switch to the DOCTYPE system identifier |
|
5260 * (single-quoted) state. |
|
5261 */ |
|
5262 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); |
|
5263 continue stateloop; |
|
5264 default: |
|
5265 bogusDoctype(); |
|
5266 /* |
|
5267 * Set the DOCTYPE token's force-quirks flag to |
|
5268 * on. |
|
5269 */ |
|
5270 // done by bogusDoctype(); |
|
5271 /* |
|
5272 * Switch to the bogus DOCTYPE state. |
|
5273 */ |
|
5274 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); |
|
5275 continue stateloop; |
|
5276 } |
|
5277 } |
|
5278 // FALLTHRU DON'T REORDER |
|
5279 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: |
|
5280 doctypesystemidentifierdoublequotedloop: for (;;) { |
|
5281 if (++pos == endPos) { |
|
5282 break stateloop; |
|
5283 } |
|
5284 c = checkChar(buf, pos); |
|
5285 /* |
|
5286 * Consume the next input character: |
|
5287 */ |
|
5288 switch (c) { |
|
5289 case '"': |
|
5290 /* |
|
5291 * U+0022 QUOTATION MARK (") Switch to the after |
|
5292 * DOCTYPE system identifier state. |
|
5293 */ |
|
5294 systemIdentifier = longStrBufToString(); |
|
5295 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); |
|
5296 continue stateloop; |
|
5297 case '>': |
|
5298 /* |
|
5299 * U+003E GREATER-THAN SIGN (>) Parse error. |
|
5300 */ |
|
5301 errGtInSystemId(); |
|
5302 /* |
|
5303 * Set the DOCTYPE token's force-quirks flag to |
|
5304 * on. |
|
5305 */ |
|
5306 forceQuirks = true; |
|
5307 /* |
|
5308 * Emit that DOCTYPE token. |
|
5309 */ |
|
5310 systemIdentifier = longStrBufToString(); |
|
5311 emitDoctypeToken(pos); |
|
5312 /* |
|
5313 * Switch to the data state. |
|
5314 */ |
|
5315 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
5316 continue stateloop; |
|
5317 case '\r': |
|
5318 appendLongStrBufCarriageReturn(); |
|
5319 break stateloop; |
|
5320 case '\n': |
|
5321 appendLongStrBufLineFeed(); |
|
5322 continue; |
|
5323 case '\u0000': |
|
5324 c = '\uFFFD'; |
|
5325 // fall thru |
|
5326 default: |
|
5327 /* |
|
5328 * Anything else Append the current input |
|
5329 * character to the current DOCTYPE token's |
|
5330 * system identifier. |
|
5331 */ |
|
5332 appendLongStrBuf(c); |
|
5333 /* |
|
5334 * Stay in the DOCTYPE system identifier |
|
5335 * (double-quoted) state. |
|
5336 */ |
|
5337 continue; |
|
5338 } |
|
5339 } |
|
5340 // FALLTHRU DON'T REORDER |
|
5341 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: |
|
5342 afterdoctypesystemidentifierloop: for (;;) { |
|
5343 if (++pos == endPos) { |
|
5344 break stateloop; |
|
5345 } |
|
5346 c = checkChar(buf, pos); |
|
5347 /* |
|
5348 * Consume the next input character: |
|
5349 */ |
|
5350 switch (c) { |
|
5351 case '\r': |
|
5352 silentCarriageReturn(); |
|
5353 break stateloop; |
|
5354 case '\n': |
|
5355 silentLineFeed(); |
|
5356 // fall thru |
|
5357 case ' ': |
|
5358 case '\t': |
|
5359 case '\u000C': |
|
5360 /* |
|
5361 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
5362 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay |
|
5363 * in the after DOCTYPE system identifier state. |
|
5364 */ |
|
5365 continue; |
|
5366 case '>': |
|
5367 /* |
|
5368 * U+003E GREATER-THAN SIGN (>) Emit the current |
|
5369 * DOCTYPE token. |
|
5370 */ |
|
5371 emitDoctypeToken(pos); |
|
5372 /* |
|
5373 * Switch to the data state. |
|
5374 */ |
|
5375 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
5376 continue stateloop; |
|
5377 default: |
|
5378 /* |
|
5379 * Switch to the bogus DOCTYPE state. (This does |
|
5380 * not set the DOCTYPE token's force-quirks flag |
|
5381 * to on.) |
|
5382 */ |
|
5383 bogusDoctypeWithoutQuirks(); |
|
5384 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); |
|
5385 break afterdoctypesystemidentifierloop; |
|
5386 // continue stateloop; |
|
5387 } |
|
5388 } |
|
5389 // FALLTHRU DON'T REORDER |
|
5390 case BOGUS_DOCTYPE: |
|
5391 for (;;) { |
|
5392 if (reconsume) { |
|
5393 reconsume = false; |
|
5394 } else { |
|
5395 if (++pos == endPos) { |
|
5396 break stateloop; |
|
5397 } |
|
5398 c = checkChar(buf, pos); |
|
5399 } |
|
5400 /* |
|
5401 * Consume the next input character: |
|
5402 */ |
|
5403 switch (c) { |
|
5404 case '>': |
|
5405 /* |
|
5406 * U+003E GREATER-THAN SIGN (>) Emit that |
|
5407 * DOCTYPE token. |
|
5408 */ |
|
5409 emitDoctypeToken(pos); |
|
5410 /* |
|
5411 * Switch to the data state. |
|
5412 */ |
|
5413 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
5414 continue stateloop; |
|
5415 case '\r': |
|
5416 silentCarriageReturn(); |
|
5417 break stateloop; |
|
5418 case '\n': |
|
5419 silentLineFeed(); |
|
5420 // fall thru |
|
5421 default: |
|
5422 /* |
|
5423 * Anything else Stay in the bogus DOCTYPE |
|
5424 * state. |
|
5425 */ |
|
5426 continue; |
|
5427 } |
|
5428 } |
|
5429 // XXX reorder point |
|
5430 case DOCTYPE_YSTEM: |
|
5431 doctypeystemloop: for (;;) { |
|
5432 if (++pos == endPos) { |
|
5433 break stateloop; |
|
5434 } |
|
5435 c = checkChar(buf, pos); |
|
5436 /* |
|
5437 * Otherwise, if the six characters starting from the |
|
5438 * current input character are an ASCII case-insensitive |
|
5439 * match for the word "SYSTEM", then consume those |
|
5440 * characters and switch to the before DOCTYPE system |
|
5441 * identifier state. |
|
5442 */ |
|
5443 if (index < 5) { // YSTEM.length |
|
5444 char folded = c; |
|
5445 if (c >= 'A' && c <= 'Z') { |
|
5446 folded += 0x20; |
|
5447 } |
|
5448 if (folded != Tokenizer.YSTEM[index]) { |
|
5449 bogusDoctype(); |
|
5450 reconsume = true; |
|
5451 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); |
|
5452 continue stateloop; |
|
5453 } |
|
5454 index++; |
|
5455 continue stateloop; |
|
5456 } else { |
|
5457 reconsume = true; |
|
5458 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos); |
|
5459 break doctypeystemloop; |
|
5460 // continue stateloop; |
|
5461 } |
|
5462 } |
|
5463 // FALLTHRU DON'T REORDER |
|
5464 case AFTER_DOCTYPE_SYSTEM_KEYWORD: |
|
5465 afterdoctypesystemkeywordloop: for (;;) { |
|
5466 if (reconsume) { |
|
5467 reconsume = false; |
|
5468 } else { |
|
5469 if (++pos == endPos) { |
|
5470 break stateloop; |
|
5471 } |
|
5472 c = checkChar(buf, pos); |
|
5473 } |
|
5474 /* |
|
5475 * Consume the next input character: |
|
5476 */ |
|
5477 switch (c) { |
|
5478 case '\r': |
|
5479 silentCarriageReturn(); |
|
5480 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); |
|
5481 break stateloop; |
|
5482 case '\n': |
|
5483 silentLineFeed(); |
|
5484 // fall thru |
|
5485 case ' ': |
|
5486 case '\t': |
|
5487 case '\u000C': |
|
5488 /* |
|
5489 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
5490 * (LF) U+000C FORM FEED (FF) U+0020 SPACE |
|
5491 * Switch to the before DOCTYPE system
|
5492 * identifier state. |
|
5493 */ |
|
5494 state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); |
|
5495 break afterdoctypesystemkeywordloop; |
|
5496 // FALL THROUGH continue stateloop |
|
5497 case '"': |
|
5498 /* |
|
5499 * U+0022 QUOTATION MARK (") Parse Error. |
|
5500 */ |
|
5501 errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); |
|
5502 /* |
|
5503 * Set the DOCTYPE token's system identifier to |
|
5504 * the empty string (not missing), |
|
5505 */ |
|
5506 clearLongStrBuf(); |
|
5507 /* |
|
5508 * then switch to the DOCTYPE system identifier
|
5509 * (double-quoted) state. |
|
5510 */ |
|
5511 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); |
|
5512 continue stateloop; |
|
5513 case '\'': |
|
5514 /* |
|
5515 * U+0027 APOSTROPHE (') Parse Error. |
|
5516 */ |
|
5517 errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); |
|
5518 /* |
|
5519 * Set the DOCTYPE token's system identifier to
|
5520 * the empty string (not missing), |
|
5521 */ |
|
5522 clearLongStrBuf(); |
|
5523 /* |
|
5524 * then switch to the DOCTYPE system identifier
|
5525 * (single-quoted) state. |
|
5526 */ |
|
5527 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); |
|
5528 continue stateloop; |
|
5529 case '>': |
|
5530 /* U+003E GREATER-THAN SIGN (>) Parse error. */ |
|
5531 errExpectedPublicId(); |
|
5532 /* |
|
5533 * Set the DOCTYPE token's force-quirks flag to |
|
5534 * on. |
|
5535 */ |
|
5536 forceQuirks = true; |
|
5537 /* |
|
5538 * Emit that DOCTYPE token. |
|
5539 */ |
|
5540 emitDoctypeToken(pos); |
|
5541 /* |
|
5542 * Switch to the data state. |
|
5543 */ |
|
5544 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
5545 continue stateloop; |
|
5546 default: |
|
5547 bogusDoctype(); |
|
5548 /* |
|
5549 * Set the DOCTYPE token's force-quirks flag to |
|
5550 * on. |
|
5551 */ |
|
5552 // done by bogusDoctype(); |
|
5553 /* |
|
5554 * Switch to the bogus DOCTYPE state. |
|
5555 */ |
|
5556 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); |
|
5557 continue stateloop; |
|
5558 } |
|
5559 } |
|
5560 // FALLTHRU DON'T REORDER |
|
5561 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: |
|
5562 beforedoctypesystemidentifierloop: for (;;) { |
|
5563 if (++pos == endPos) { |
|
5564 break stateloop; |
|
5565 } |
|
5566 c = checkChar(buf, pos); |
|
5567 /* |
|
5568 * Consume the next input character: |
|
5569 */ |
|
5570 switch (c) { |
|
5571 case '\r': |
|
5572 silentCarriageReturn(); |
|
5573 break stateloop; |
|
5574 case '\n': |
|
5575 silentLineFeed(); |
|
5576 // fall thru |
|
5577 case ' ': |
|
5578 case '\t': |
|
5579 case '\u000C': |
|
5580 /* |
|
5581 * U+0009 CHARACTER TABULATION U+000A LINE FEED |
|
5582 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay |
|
5583 * in the before DOCTYPE system identifier |
|
5584 * state. |
|
5585 */ |
|
5586 continue; |
|
5587 case '"': |
|
5588 /* |
|
5589 * U+0022 QUOTATION MARK (") Set the DOCTYPE |
|
5590 * token's system identifier to the empty string |
|
5591 * (not missing), |
|
5592 */ |
|
5593 clearLongStrBuf(); |
|
5594 /* |
|
5595 * then switch to the DOCTYPE system identifier |
|
5596 * (double-quoted) state. |
|
5597 */ |
|
5598 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); |
|
5599 continue stateloop; |
|
5600 case '\'': |
|
5601 /* |
|
5602 * U+0027 APOSTROPHE (') Set the DOCTYPE token's |
|
5603 * system identifier to the empty string (not |
|
5604 * missing), |
|
5605 */ |
|
5606 clearLongStrBuf(); |
|
5607 /* |
|
5608 * then switch to the DOCTYPE system identifier |
|
5609 * (single-quoted) state. |
|
5610 */ |
|
5611 state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); |
|
5612 break beforedoctypesystemidentifierloop; |
|
5613 // continue stateloop; |
|
5614 case '>': |
|
5615 /* U+003E GREATER-THAN SIGN (>) Parse error. */ |
|
5616 errExpectedSystemId(); |
|
5617 /* |
|
5618 * Set the DOCTYPE token's force-quirks flag to |
|
5619 * on. |
|
5620 */ |
|
5621 forceQuirks = true; |
|
5622 /* |
|
5623 * Emit that DOCTYPE token. |
|
5624 */ |
|
5625 emitDoctypeToken(pos); |
|
5626 /* |
|
5627 * Switch to the data state. |
|
5628 */ |
|
5629 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
5630 continue stateloop; |
|
5631 default: |
|
5632 bogusDoctype(); |
|
5633 /* |
|
5634 * Set the DOCTYPE token's force-quirks flag to |
|
5635 * on. |
|
5636 */ |
|
5637 // done by bogusDoctype(); |
|
5638 /* |
|
5639 * Switch to the bogus DOCTYPE state. |
|
5640 */ |
|
5641 state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); |
|
5642 continue stateloop; |
|
5643 } |
|
5644 } |
|
5645 // FALLTHRU DON'T REORDER |
|
5646 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: |
|
5647 for (;;) { |
|
5648 if (++pos == endPos) { |
|
5649 break stateloop; |
|
5650 } |
|
5651 c = checkChar(buf, pos); |
|
5652 /* |
|
5653 * Consume the next input character: |
|
5654 */ |
|
5655 switch (c) { |
|
5656 case '\'': |
|
5657 /* |
|
5658 * U+0027 APOSTROPHE (') Switch to the after |
|
5659 * DOCTYPE system identifier state. |
|
5660 */ |
|
5661 systemIdentifier = longStrBufToString(); |
|
5662 state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); |
|
5663 continue stateloop; |
|
5664 case '>': |
|
5665 errGtInSystemId(); |
|
5666 /* |
|
5667 * Set the DOCTYPE token's force-quirks flag to |
|
5668 * on. |
|
5669 */ |
|
5670 forceQuirks = true; |
|
5671 /* |
|
5672 * Emit that DOCTYPE token. |
|
5673 */ |
|
5674 systemIdentifier = longStrBufToString(); |
|
5675 emitDoctypeToken(pos); |
|
5676 /* |
|
5677 * Switch to the data state. |
|
5678 */ |
|
5679 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
5680 continue stateloop; |
|
5681 case '\r': |
|
5682 appendLongStrBufCarriageReturn(); |
|
5683 break stateloop; |
|
5684 case '\n': |
|
5685 appendLongStrBufLineFeed(); |
|
5686 continue; |
|
5687 case '\u0000': |
|
5688 c = '\uFFFD'; |
|
5689 // fall thru |
|
5690 default: |
|
5691 /* |
|
5692 * Anything else Append the current input |
|
5693 * character to the current DOCTYPE token's |
|
5694 * system identifier. |
|
5695 */ |
|
5696 appendLongStrBuf(c); |
|
5697 /* |
|
5698 * Stay in the DOCTYPE system identifier |
|
5699 * (single-quoted) state.
|
5700 */ |
|
5701 continue; |
|
5702 } |
|
5703 } |
|
5704 // XXX reorder point |
|
5705 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: |
|
5706 for (;;) { |
|
5707 if (++pos == endPos) { |
|
5708 break stateloop; |
|
5709 } |
|
5710 c = checkChar(buf, pos); |
|
5711 /* |
|
5712 * Consume the next input character: |
|
5713 */ |
|
5714 switch (c) { |
|
5715 case '\'': |
|
5716 /* |
|
5717 * U+0027 APOSTROPHE (') Switch to the after |
|
5718 * DOCTYPE public identifier state. |
|
5719 */ |
|
5720 publicIdentifier = longStrBufToString(); |
|
5721 state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); |
|
5722 continue stateloop; |
|
5723 case '>': |
|
5724 errGtInPublicId(); |
|
5725 /* |
|
5726 * Set the DOCTYPE token's force-quirks flag to |
|
5727 * on. |
|
5728 */ |
|
5729 forceQuirks = true; |
|
5730 /* |
|
5731 * Emit that DOCTYPE token. |
|
5732 */ |
|
5733 publicIdentifier = longStrBufToString(); |
|
5734 emitDoctypeToken(pos); |
|
5735 /* |
|
5736 * Switch to the data state. |
|
5737 */ |
|
5738 state = transition(state, Tokenizer.DATA, reconsume, pos); |
|
5739 continue stateloop; |
|
5740 case '\r': |
|
5741 appendLongStrBufCarriageReturn(); |
|
5742 break stateloop; |
|
5743 case '\n': |
|
5744 appendLongStrBufLineFeed(); |
|
5745 continue; |
|
5746 case '\u0000': |
|
5747 c = '\uFFFD'; |
|
5748 // fall thru |
|
5749 default: |
|
5750 /* |
|
5751 * Anything else Append the current input |
|
5752 * character to the current DOCTYPE token's |
|
5753 * public identifier. |
|
5754 */ |
|
5755 appendLongStrBuf(c); |
|
5756 /* |
|
5757 * Stay in the DOCTYPE public identifier |
|
5758 * (single-quoted) state. |
|
5759 */ |
|
5760 continue; |
|
5761 } |
|
5762 } |
|
5763 // XXX reorder point |
|
5764 case PROCESSING_INSTRUCTION: |
|
5765 processinginstructionloop: for (;;) { |
|
5766 if (++pos == endPos) { |
|
5767 break stateloop; |
|
5768 } |
|
5769 c = checkChar(buf, pos); |
|
5770 switch (c) { |
|
5771 case '?': |
|
5772 state = transition( |
|
5773 state, |
|
5774 Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK, |
|
5775 reconsume, pos); |
|
5776 break processinginstructionloop; |
|
5777 // continue stateloop; |
|
5778 default: |
|
5779 continue; |
|
5780 } |
|
5781 } |
|
5782 case PROCESSING_INSTRUCTION_QUESTION_MARK: |
|
5783 if (++pos == endPos) { |
|
5784 break stateloop; |
|
5785 } |
|
5786 c = checkChar(buf, pos); |
|
5787 switch (c) { |
|
5788 case '>': |
|
5789 state = transition(state, Tokenizer.DATA, |
|
5790 reconsume, pos); |
|
5791 continue stateloop; |
|
5792 default: |
|
5793 state = transition(state, |
|
5794 Tokenizer.PROCESSING_INSTRUCTION, |
|
5795 reconsume, pos); |
|
5796 continue stateloop; |
|
5797 } |
|
5798 // END HOTSPOT WORKAROUND |
|
5799 } |
|
5800 } |
|
5801 flushChars(buf, pos); |
|
5802 /* |
|
5803 * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; } |
|
5804 */ |
|
5805 // Save locals |
|
5806 stateSave = state; |
|
5807 returnStateSave = returnState; |
|
5808 return pos; |
|
5809 } |
|
5810 |
|
5811 // HOTSPOT WORKAROUND INSERTION POINT |
|
5812 |
|
5813 // [NOCPP[ |
|
5814 |
|
5815 protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException { |
|
5816 return to; |
|
5817 } |
|
5818 |
|
5819 // ]NOCPP] |
|
5820 |
|
5821 private void initDoctypeFields() { |
|
5822 doctypeName = ""; |
|
5823 if (systemIdentifier != null) { |
|
5824 Portability.releaseString(systemIdentifier); |
|
5825 systemIdentifier = null; |
|
5826 } |
|
5827 if (publicIdentifier != null) { |
|
5828 Portability.releaseString(publicIdentifier); |
|
5829 publicIdentifier = null; |
|
5830 } |
|
5831 forceQuirks = false; |
|
5832 } |
|
5833 |
|
5834 @Inline private void adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn() |
|
5835 throws SAXException { |
|
5836 silentCarriageReturn(); |
|
5837 adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n'); |
|
5838 } |
|
5839 |
|
5840 @Inline private void adjustDoubleHyphenAndAppendToLongStrBufLineFeed() |
|
5841 throws SAXException { |
|
5842 silentLineFeed(); |
|
5843 adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n'); |
|
5844 } |
|
5845 |
|
5846 @Inline private void appendLongStrBufLineFeed() { |
|
5847 silentLineFeed(); |
|
5848 appendLongStrBuf('\n'); |
|
5849 } |
|
5850 |
|
5851 @Inline private void appendLongStrBufCarriageReturn() { |
|
5852 silentCarriageReturn(); |
|
5853 appendLongStrBuf('\n'); |
|
5854 } |
|
5855 |
|
5856 @Inline protected void silentCarriageReturn() { |
|
5857 ++line; |
|
5858 lastCR = true; |
|
5859 } |
|
5860 |
|
5861 @Inline protected void silentLineFeed() { |
|
5862 ++line; |
|
5863 } |
|
5864 |
|
5865 private void emitCarriageReturn(@NoLength char[] buf, int pos) |
|
5866 throws SAXException { |
|
5867 silentCarriageReturn(); |
|
5868 flushChars(buf, pos); |
|
5869 tokenHandler.characters(Tokenizer.LF, 0, 1); |
|
5870 cstart = Integer.MAX_VALUE; |
|
5871 } |
|
5872 |
|
5873 private void emitReplacementCharacter(@NoLength char[] buf, int pos) |
|
5874 throws SAXException { |
|
5875 flushChars(buf, pos); |
|
5876 tokenHandler.zeroOriginatingReplacementCharacter(); |
|
5877 cstart = pos + 1; |
|
5878 } |
|
5879 |
|
5880 private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos) |
|
5881 throws SAXException { |
|
5882 flushChars(buf, pos); |
|
5883 tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1); |
|
5884 cstart = pos + 1; |
|
5885 } |
|
5886 |
|
5887 private void setAdditionalAndRememberAmpersandLocation(char add) { |
|
5888 additional = add; |
|
5889 // [NOCPP[ |
|
5890 ampersandLocation = new LocatorImpl(this); |
|
5891 // ]NOCPP] |
|
5892 } |
|
5893 |
|
5894 private void bogusDoctype() throws SAXException { |
|
5895 errBogusDoctype(); |
|
5896 forceQuirks = true; |
|
5897 } |
|
5898 |
|
5899 private void bogusDoctypeWithoutQuirks() throws SAXException { |
|
5900 errBogusDoctype(); |
|
5901 forceQuirks = false; |
|
5902 } |
|
5903 |
|
5904 private void emitOrAppendStrBuf(int returnState) throws SAXException { |
|
5905 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { |
|
5906 appendStrBufToLongStrBuf(); |
|
5907 } else { |
|
5908 emitStrBuf(); |
|
5909 } |
|
5910 } |
|
5911 |
|
5912 private void handleNcrValue(int returnState) throws SAXException { |
|
5913 /* |
|
5914 * If one or more characters match the range, then take them all and |
|
5915 * interpret the string of characters as a number (either hexadecimal or |
|
5916 * decimal as appropriate). |
|
5917 */ |
|
5918 if (value <= 0xFFFF) { |
|
5919 if (value >= 0x80 && value <= 0x9f) { |
|
5920 /* |
|
5921 * If that number is one of the numbers in the first column of |
|
5922 * the following table, then this is a parse error. |
|
5923 */ |
|
5924 errNcrInC1Range(); |
|
5925 /* |
|
5926 * Find the row with that number in the first column, and return |
|
5927 * a character token for the Unicode character given in the |
|
5928 * second column of that row. |
|
5929 */ |
|
5930 @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80]; |
|
5931 emitOrAppendOne(val, returnState); |
|
5932 // [NOCPP[ |
|
5933 } else if (value == 0xC |
|
5934 && contentSpacePolicy != XmlViolationPolicy.ALLOW) { |
|
5935 if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) { |
|
5936 emitOrAppendOne(Tokenizer.SPACE, returnState); |
|
5937 } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) { |
|
5938 fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space."); |
|
5939 } |
|
5940 // ]NOCPP] |
|
5941 } else if (value == 0x0) { |
|
5942 errNcrZero(); |
|
5943 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState); |
|
5944 } else if ((value & 0xF800) == 0xD800) { |
|
5945 errNcrSurrogate(); |
|
5946 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState); |
|
5947 } else { |
|
5948 /* |
|
5949 * Otherwise, return a character token for the Unicode character |
|
5950 * whose code point is that number. |
|
5951 */ |
|
5952 char ch = (char) value; |
|
5953 // [NOCPP[ |
|
5954 if (value == 0x0D) { |
|
5955 errNcrCr(); |
|
5956 } else if ((value <= 0x0008) || (value == 0x000B) |
|
5957 || (value >= 0x000E && value <= 0x001F)) { |
|
5958 ch = errNcrControlChar(ch); |
|
5959 } else if (value >= 0xFDD0 && value <= 0xFDEF) { |
|
5960 errNcrUnassigned(); |
|
5961 } else if ((value & 0xFFFE) == 0xFFFE) { |
|
5962 ch = errNcrNonCharacter(ch); |
|
5963 } else if (value >= 0x007F && value <= 0x009F) { |
|
5964 errNcrControlChar(); |
|
5965 } else { |
|
5966 maybeWarnPrivateUse(ch); |
|
5967 } |
|
5968 // ]NOCPP] |
|
5969 bmpChar[0] = ch; |
|
5970 emitOrAppendOne(bmpChar, returnState); |
|
5971 } |
|
5972 } else if (value <= 0x10FFFF) { |
|
5973 // [NOCPP[ |
|
5974 maybeWarnPrivateUseAstral(); |
|
5975 if ((value & 0xFFFE) == 0xFFFE) { |
|
5976 errAstralNonCharacter(value); |
|
5977 } |
|
5978 // ]NOCPP] |
|
5979 astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10)); |
|
5980 astralChar[1] = (char) (0xDC00 + (value & 0x3FF)); |
|
5981 emitOrAppendTwo(astralChar, returnState); |
|
5982 } else { |
|
5983 errNcrOutOfRange(); |
|
5984 emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState); |
|
5985 } |
|
5986 } |
|
5987 |
|
5988 public void eof() throws SAXException { |
|
5989 int state = stateSave; |
|
5990 int returnState = returnStateSave; |
|
5991 |
|
5992 eofloop: for (;;) { |
|
5993 switch (state) { |
|
5994 case SCRIPT_DATA_LESS_THAN_SIGN: |
|
5995 case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: |
|
5996 /* |
|
5997 * Otherwise, emit a U+003C LESS-THAN SIGN character token |
|
5998 */ |
|
5999 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); |
|
6000 /* |
|
6001 * and reconsume the current input character in the data |
|
6002 * state. |
|
6003 */ |
|
6004 break eofloop; |
|
6005 case TAG_OPEN: |
|
6006 /* |
|
6007 * The behavior of this state depends on the content model |
|
6008 * flag. |
|
6009 */ |
|
6010 /* |
|
6011 * Anything else Parse error. |
|
6012 */ |
|
6013 errEofAfterLt(); |
|
6014 /* |
|
6015 * Emit a U+003C LESS-THAN SIGN character token |
|
6016 */ |
|
6017 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); |
|
6018 /* |
|
6019 * and reconsume the current input character in the data |
|
6020 * state. |
|
6021 */ |
|
6022 break eofloop; |
|
6023 case RAWTEXT_RCDATA_LESS_THAN_SIGN: |
|
6024 /* |
|
6025 * Emit a U+003C LESS-THAN SIGN character token |
|
6026 */ |
|
6027 tokenHandler.characters(Tokenizer.LT_GT, 0, 1); |
|
6028 /* |
|
6029 * and reconsume the current input character in the RCDATA |
|
6030 * state. |
|
6031 */ |
|
6032 break eofloop; |
|
6033 case NON_DATA_END_TAG_NAME: |
|
6034 /* |
|
6035 * Emit a U+003C LESS-THAN SIGN character token, a U+002F |
|
6036 * SOLIDUS character token, |
|
6037 */ |
|
6038 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2); |
|
6039 /* |
|
6040 * a character token for each of the characters in the |
|
6041 * temporary buffer (in the order they were added to the |
|
6042 * buffer), |
|
6043 */ |
|
6044 emitStrBuf(); |
|
6045 /* |
|
6046 * and reconsume the current input character in the RCDATA |
|
6047 * state. |
|
6048 */ |
|
6049 break eofloop; |
|
6050 case CLOSE_TAG_OPEN: |
|
6051 /* EOF Parse error. */ |
|
6052 errEofAfterLt(); |
|
6053 /* |
|
6054 * Emit a U+003C LESS-THAN SIGN character token and a U+002F |
|
6055 * SOLIDUS character token. |
|
6056 */ |
|
6057 tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2); |
|
6058 /* |
|
6059 * Reconsume the EOF character in the data state. |
|
6060 */ |
|
6061 break eofloop; |
|
6062 case TAG_NAME: |
|
6063 /* |
|
6064 * EOF Parse error. |
|
6065 */ |
|
6066 errEofInTagName(); |
|
6067 /* |
|
6068 * Reconsume the EOF character in the data state. |
|
6069 */ |
|
6070 break eofloop; |
|
6071 case BEFORE_ATTRIBUTE_NAME: |
|
6072 case AFTER_ATTRIBUTE_VALUE_QUOTED: |
|
6073 case SELF_CLOSING_START_TAG: |
|
6074 /* EOF Parse error. */ |
|
6075 errEofWithoutGt(); |
|
6076 /* |
|
6077 * Reconsume the EOF character in the data state. |
|
6078 */ |
|
6079 break eofloop; |
|
6080 case ATTRIBUTE_NAME: |
|
6081 /* |
|
6082 * EOF Parse error. |
|
6083 */ |
|
6084 errEofInAttributeName(); |
|
6085 /* |
|
6086 * Reconsume the EOF character in the data state. |
|
6087 */ |
|
6088 break eofloop; |
|
6089 case AFTER_ATTRIBUTE_NAME: |
|
6090 case BEFORE_ATTRIBUTE_VALUE: |
|
6091 /* EOF Parse error. */ |
|
6092 errEofWithoutGt(); |
|
6093 /* |
|
6094 * Reconsume the EOF character in the data state. |
|
6095 */ |
|
6096 break eofloop; |
|
6097 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: |
|
6098 case ATTRIBUTE_VALUE_SINGLE_QUOTED: |
|
6099 case ATTRIBUTE_VALUE_UNQUOTED: |
|
6100 /* EOF Parse error. */ |
|
6101 errEofInAttributeValue(); |
|
6102 /* |
|
6103 * Reconsume the EOF character in the data state. |
|
6104 */ |
|
6105 break eofloop; |
|
6106 case BOGUS_COMMENT: |
|
6107 emitComment(0, 0); |
|
6108 break eofloop; |
|
6109 case BOGUS_COMMENT_HYPHEN: |
|
6110 // [NOCPP[ |
|
6111 maybeAppendSpaceToBogusComment(); |
|
6112 // ]NOCPP] |
|
6113 emitComment(0, 0); |
|
6114 break eofloop; |
|
6115 case MARKUP_DECLARATION_OPEN: |
|
6116 errBogusComment(); |
|
6117 clearLongStrBuf(); |
|
6118 emitComment(0, 0); |
|
6119 break eofloop; |
|
6120 case MARKUP_DECLARATION_HYPHEN: |
|
6121 errBogusComment(); |
|
6122 emitComment(0, 0); |
|
6123 break eofloop; |
|
6124 case MARKUP_DECLARATION_OCTYPE: |
|
6125 if (index < 6) { |
|
6126 errBogusComment(); |
|
6127 emitComment(0, 0); |
|
6128 } else { |
|
6129 /* EOF Parse error. */ |
|
6130 errEofInDoctype(); |
|
6131 /* |
|
6132 * Create a new DOCTYPE token. Set its force-quirks flag |
|
6133 * to on. |
|
6134 */ |
|
6135 doctypeName = ""; |
|
6136 if (systemIdentifier != null) { |
|
6137 Portability.releaseString(systemIdentifier); |
|
6138 systemIdentifier = null; |
|
6139 } |
|
6140 if (publicIdentifier != null) { |
|
6141 Portability.releaseString(publicIdentifier); |
|
6142 publicIdentifier = null; |
|
6143 } |
|
6144 forceQuirks = true; |
|
6145 /* |
|
6146 * Emit the token. |
|
6147 */ |
|
6148 emitDoctypeToken(0); |
|
6149 /* |
|
6150 * Reconsume the EOF character in the data state. |
|
6151 */ |
|
6152 break eofloop; |
|
6153 } |
|
6154 break eofloop; |
|
6155 case COMMENT_START: |
|
6156 case COMMENT: |
|
6157 /* |
|
6158 * EOF Parse error. |
|
6159 */ |
|
6160 errEofInComment(); |
|
6161 /* Emit the comment token. */ |
|
6162 emitComment(0, 0); |
|
6163 /* |
|
6164 * Reconsume the EOF character in the data state. |
|
6165 */ |
|
6166 break eofloop; |
|
6167 case COMMENT_END: |
|
6168 errEofInComment(); |
|
6169 /* Emit the comment token. */ |
|
6170 emitComment(2, 0); |
|
6171 /* |
|
6172 * Reconsume the EOF character in the data state. |
|
6173 */ |
|
6174 break eofloop; |
|
6175 case COMMENT_END_DASH: |
|
6176 case COMMENT_START_DASH: |
|
6177 errEofInComment(); |
|
6178 /* Emit the comment token. */ |
|
6179 emitComment(1, 0); |
|
6180 /* |
|
6181 * Reconsume the EOF character in the data state. |
|
6182 */ |
|
6183 break eofloop; |
|
6184 case COMMENT_END_BANG: |
|
6185 errEofInComment(); |
|
6186 /* Emit the comment token. */ |
|
6187 emitComment(3, 0); |
|
6188 /* |
|
6189 * Reconsume the EOF character in the data state. |
|
6190 */ |
|
6191 break eofloop; |
|
6192 case DOCTYPE: |
|
6193 case BEFORE_DOCTYPE_NAME: |
|
6194 errEofInDoctype(); |
|
6195 /* |
|
6196 * Create a new DOCTYPE token. Set its force-quirks flag to |
|
6197 * on. |
|
6198 */ |
|
6199 forceQuirks = true; |
|
6200 /* |
|
6201 * Emit the token. |
|
6202 */ |
|
6203 emitDoctypeToken(0); |
|
6204 /* |
|
6205 * Reconsume the EOF character in the data state. |
|
6206 */ |
|
6207 break eofloop; |
|
6208 case DOCTYPE_NAME: |
|
6209 errEofInDoctype(); |
|
6210 strBufToDoctypeName(); |
|
6211 /* |
|
6212 * Set the DOCTYPE token's force-quirks flag to on. |
|
6213 */ |
|
6214 forceQuirks = true; |
|
6215 /* |
|
6216 * Emit that DOCTYPE token. |
|
6217 */ |
|
6218 emitDoctypeToken(0); |
|
6219 /* |
|
6220 * Reconsume the EOF character in the data state. |
|
6221 */ |
|
6222 break eofloop; |
|
6223 case DOCTYPE_UBLIC: |
|
6224 case DOCTYPE_YSTEM: |
|
6225 case AFTER_DOCTYPE_NAME: |
|
6226 case AFTER_DOCTYPE_PUBLIC_KEYWORD: |
|
6227 case AFTER_DOCTYPE_SYSTEM_KEYWORD: |
|
6228 case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: |
|
6229 errEofInDoctype(); |
|
6230 /* |
|
6231 * Set the DOCTYPE token's force-quirks flag to on. |
|
6232 */ |
|
6233 forceQuirks = true; |
|
6234 /* |
|
6235 * Emit that DOCTYPE token. |
|
6236 */ |
|
6237 emitDoctypeToken(0); |
|
6238 /* |
|
6239 * Reconsume the EOF character in the data state. |
|
6240 */ |
|
6241 break eofloop; |
|
6242 case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: |
|
6243 case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: |
|
6244 /* EOF Parse error. */ |
|
6245 errEofInPublicId(); |
|
6246 /* |
|
6247 * Set the DOCTYPE token's force-quirks flag to on. |
|
6248 */ |
|
6249 forceQuirks = true; |
|
6250 /* |
|
6251 * Emit that DOCTYPE token. |
|
6252 */ |
|
6253 publicIdentifier = longStrBufToString(); |
|
6254 emitDoctypeToken(0); |
|
6255 /* |
|
6256 * Reconsume the EOF character in the data state. |
|
6257 */ |
|
6258 break eofloop; |
|
6259 case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: |
|
6260 case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: |
|
6261 case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: |
|
6262 errEofInDoctype(); |
|
6263 /* |
|
6264 * Set the DOCTYPE token's force-quirks flag to on. |
|
6265 */ |
|
6266 forceQuirks = true; |
|
6267 /* |
|
6268 * Emit that DOCTYPE token. |
|
6269 */ |
|
6270 emitDoctypeToken(0); |
|
6271 /* |
|
6272 * Reconsume the EOF character in the data state. |
|
6273 */ |
|
6274 break eofloop; |
|
6275 case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: |
|
6276 case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: |
|
6277 /* EOF Parse error. */ |
|
6278 errEofInSystemId(); |
|
6279 /* |
|
6280 * Set the DOCTYPE token's force-quirks flag to on. |
|
6281 */ |
|
6282 forceQuirks = true; |
|
6283 /* |
|
6284 * Emit that DOCTYPE token. |
|
6285 */ |
|
6286 systemIdentifier = longStrBufToString(); |
|
6287 emitDoctypeToken(0); |
|
6288 /* |
|
6289 * Reconsume the EOF character in the data state. |
|
6290 */ |
|
6291 break eofloop; |
|
6292 case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: |
|
6293 errEofInDoctype(); |
|
6294 /* |
|
6295 * Set the DOCTYPE token's force-quirks flag to on. |
|
6296 */ |
|
6297 forceQuirks = true; |
|
6298 /* |
|
6299 * Emit that DOCTYPE token. |
|
6300 */ |
|
6301 emitDoctypeToken(0); |
|
6302 /* |
|
6303 * Reconsume the EOF character in the data state. |
|
6304 */ |
|
6305 break eofloop; |
|
6306 case BOGUS_DOCTYPE: |
|
6307 /* |
|
6308 * Emit that DOCTYPE token. |
|
6309 */ |
|
6310 emitDoctypeToken(0); |
|
6311 /* |
|
6312 * Reconsume the EOF character in the data state. |
|
6313 */ |
|
6314 break eofloop; |
|
6315 case CONSUME_CHARACTER_REFERENCE: |
|
6316 /* |
|
6317 * Unlike the definition is the spec, this state does not |
|
6318 * return a value and never requires the caller to |
|
6319 * backtrack. This state takes care of emitting characters |
|
6320 * or appending to the current attribute value. It also |
|
6321 * takes care of that in the case when consuming the entity |
|
6322 * fails. |
|
6323 */ |
|
6324 /* |
|
6325 * This section defines how to consume an entity. This |
|
6326 * definition is used when parsing entities in text and in |
|
6327 * attributes. |
|
6328 * |
|
6329 * The behavior depends on the identity of the next |
|
6330 * character (the one immediately after the U+0026 AMPERSAND |
|
6331 * character): |
|
6332 */ |
|
6333 |
|
6334 emitOrAppendStrBuf(returnState); |
|
6335 state = returnState; |
|
6336 continue; |
|
6337 case CHARACTER_REFERENCE_HILO_LOOKUP: |
|
6338 errNoNamedCharacterMatch(); |
|
6339 emitOrAppendStrBuf(returnState); |
|
6340 state = returnState; |
|
6341 continue; |
|
6342 case CHARACTER_REFERENCE_TAIL: |
|
6343 outer: for (;;) { |
|
6344 char c = '\u0000'; |
|
6345 entCol++; |
|
6346 /* |
|
6347 * Consume the maximum number of characters possible, |
|
6348 * with the consumed characters matching one of the |
|
6349 * identifiers in the first column of the named |
|
6350 * character references table (in a case-sensitive |
|
6351 * manner). |
|
6352 */ |
|
6353 hiloop: for (;;) { |
|
6354 if (hi == -1) { |
|
6355 break hiloop; |
|
6356 } |
|
6357 if (entCol == NamedCharacters.NAMES[hi].length()) { |
|
6358 break hiloop; |
|
6359 } |
|
6360 if (entCol > NamedCharacters.NAMES[hi].length()) { |
|
6361 break outer; |
|
6362 } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { |
|
6363 hi--; |
|
6364 } else { |
|
6365 break hiloop; |
|
6366 } |
|
6367 } |
|
6368 |
|
6369 loloop: for (;;) { |
|
6370 if (hi < lo) { |
|
6371 break outer; |
|
6372 } |
|
6373 if (entCol == NamedCharacters.NAMES[lo].length()) { |
|
6374 candidate = lo; |
|
6375 strBufMark = strBufLen; |
|
6376 lo++; |
|
6377 } else if (entCol > NamedCharacters.NAMES[lo].length()) { |
|
6378 break outer; |
|
6379 } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { |
|
6380 lo++; |
|
6381 } else { |
|
6382 break loloop; |
|
6383 } |
|
6384 } |
|
6385 if (hi < lo) { |
|
6386 break outer; |
|
6387 } |
|
6388 continue; |
|
6389 } |
|
6390 |
|
6391 if (candidate == -1) { |
|
6392 /* |
|
6393 * If no match can be made, then this is a parse error. |
|
6394 */ |
|
6395 errNoNamedCharacterMatch(); |
|
6396 emitOrAppendStrBuf(returnState); |
|
6397 state = returnState; |
|
6398 continue eofloop; |
|
6399 } else { |
|
6400 @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; |
|
6401 if (candidateName.length() == 0 |
|
6402 || candidateName.charAt(candidateName.length() - 1) != ';') { |
|
6403 /* |
|
6404 * If the last character matched is not a U+003B |
|
6405 * SEMICOLON (;), there is a parse error. |
|
6406 */ |
|
6407 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { |
|
6408 /* |
|
6409 * If the entity is being consumed as part of an |
|
6410 * attribute, and the last character matched is |
|
6411 * not a U+003B SEMICOLON (;), |
|
6412 */ |
|
6413 char ch; |
|
6414 if (strBufMark == strBufLen) { |
|
6415 ch = '\u0000'; |
|
6416 } else { |
|
6417 ch = strBuf[strBufMark]; |
|
6418 } |
|
6419 if ((ch >= '0' && ch <= '9') |
|
6420 || (ch >= 'A' && ch <= 'Z') |
|
6421 || (ch >= 'a' && ch <= 'z')) { |
|
6422 /* |
|
6423 * and the next character is in the range |
|
6424 * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, |
|
6425 * U+0041 LATIN CAPITAL LETTER A to U+005A |
|
6426 * LATIN CAPITAL LETTER Z, or U+0061 LATIN |
|
6427 * SMALL LETTER A to U+007A LATIN SMALL |
|
6428 * LETTER Z, then, for historical reasons, |
|
6429 * all the characters that were matched |
|
6430 * after the U+0026 AMPERSAND (&) must be |
|
6431 * unconsumed, and nothing is returned. |
|
6432 */ |
|
6433 errNoNamedCharacterMatch(); |
|
6434 appendStrBufToLongStrBuf(); |
|
6435 state = returnState; |
|
6436 continue eofloop; |
|
6437 } |
|
6438 } |
|
6439 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { |
|
6440 errUnescapedAmpersandInterpretedAsCharacterReference(); |
|
6441 } else { |
|
6442 errNotSemicolonTerminated(); |
|
6443 } |
|
6444 } |
|
6445 |
|
6446 /* |
|
6447 * Otherwise, return a character token for the character |
|
6448 * corresponding to the entity name (as given by the |
|
6449 * second column of the named character references |
|
6450 * table). |
|
6451 */ |
|
6452 @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; |
|
6453 if ( |
|
6454 // [NOCPP[ |
|
6455 val.length == 1 |
|
6456 // ]NOCPP] |
|
6457 // CPPONLY: val[1] == 0 |
|
6458 ) { |
|
6459 emitOrAppendOne(val, returnState); |
|
6460 } else { |
|
6461 emitOrAppendTwo(val, returnState); |
|
6462 } |
|
6463 // this is so complicated! |
|
6464 if (strBufMark < strBufLen) { |
|
6465 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { |
|
6466 for (int i = strBufMark; i < strBufLen; i++) { |
|
6467 appendLongStrBuf(strBuf[i]); |
|
6468 } |
|
6469 } else { |
|
6470 tokenHandler.characters(strBuf, strBufMark, |
|
6471 strBufLen - strBufMark); |
|
6472 } |
|
6473 } |
|
6474 state = returnState; |
|
6475 continue eofloop; |
|
6476 /* |
|
6477 * If the markup contains I'm ¬it; I tell you, the |
|
6478 * entity is parsed as "not", as in, I'm ¬it; I tell |
|
6479 * you. But if the markup was I'm ∉ I tell you, |
|
6480 * the entity would be parsed as "notin;", resulting in |
|
6481 * I'm ∉ I tell you. |
|
6482 */ |
|
6483 } |
|
6484 case CONSUME_NCR: |
|
6485 case DECIMAL_NRC_LOOP: |
|
6486 case HEX_NCR_LOOP: |
|
6487 /* |
|
6488 * If no characters match the range, then don't consume any |
|
6489 * characters (and unconsume the U+0023 NUMBER SIGN |
|
6490 * character and, if appropriate, the X character). This is |
|
6491 * a parse error; nothing is returned. |
|
6492 * |
|
6493 * Otherwise, if the next character is a U+003B SEMICOLON, |
|
6494 * consume that too. If it isn't, there is a parse error. |
|
6495 */ |
|
6496 if (!seenDigits) { |
|
6497 errNoDigitsInNCR(); |
|
6498 emitOrAppendStrBuf(returnState); |
|
6499 state = returnState; |
|
6500 continue; |
|
6501 } else { |
|
6502 errCharRefLacksSemicolon(); |
|
6503 } |
|
6504 // WARNING previous state sets reconsume |
|
6505 handleNcrValue(returnState); |
|
6506 state = returnState; |
|
6507 continue; |
|
6508 case CDATA_RSQB: |
|
6509 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); |
|
6510 break eofloop; |
|
6511 case CDATA_RSQB_RSQB: |
|
6512 tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); |
|
6513 break eofloop; |
|
6514 case DATA: |
|
6515 default: |
|
6516 break eofloop; |
|
6517 } |
|
6518 } |
|
6519 // case DATA: |
|
6520 /* |
|
6521 * EOF Emit an end-of-file token. |
|
6522 */ |
|
6523 tokenHandler.eof(); |
|
6524 return; |
|
6525 } |
|
6526 |
|
6527 private void emitDoctypeToken(int pos) throws SAXException { |
|
6528 cstart = pos + 1; |
|
6529 tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier, |
|
6530 forceQuirks); |
|
6531 // It is OK and sufficient to release these here, since |
|
6532 // there's no way out of the doctype states than through paths |
|
6533 // that call this method. |
|
6534 doctypeName = null; |
|
6535 Portability.releaseString(publicIdentifier); |
|
6536 publicIdentifier = null; |
|
6537 Portability.releaseString(systemIdentifier); |
|
6538 systemIdentifier = null; |
|
6539 } |
|
6540 |
|
6541 @Inline protected char checkChar(@NoLength char[] buf, int pos) |
|
6542 throws SAXException { |
|
6543 return buf[pos]; |
|
6544 } |
|
6545 |
|
6546 public boolean internalEncodingDeclaration(String internalCharset) |
|
6547 throws SAXException { |
|
6548 if (encodingDeclarationHandler != null) { |
|
6549 return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset); |
|
6550 } |
|
6551 return false; |
|
6552 } |
|
6553 |
|
6554 /** |
|
6555 * @param val |
|
6556 * @throws SAXException |
|
6557 */ |
|
6558 private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState) |
|
6559 throws SAXException { |
|
6560 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { |
|
6561 appendLongStrBuf(val[0]); |
|
6562 appendLongStrBuf(val[1]); |
|
6563 } else { |
|
6564 tokenHandler.characters(val, 0, 2); |
|
6565 } |
|
6566 } |
|
6567 |
|
6568 private void emitOrAppendOne(@Const @NoLength char[] val, int returnState) |
|
6569 throws SAXException { |
|
6570 if ((returnState & DATA_AND_RCDATA_MASK) != 0) { |
|
6571 appendLongStrBuf(val[0]); |
|
6572 } else { |
|
6573 tokenHandler.characters(val, 0, 1); |
|
6574 } |
|
6575 } |
|
6576 |
|
6577 public void end() throws SAXException { |
|
6578 strBuf = null; |
|
6579 longStrBuf = null; |
|
6580 doctypeName = null; |
|
6581 if (systemIdentifier != null) { |
|
6582 Portability.releaseString(systemIdentifier); |
|
6583 systemIdentifier = null; |
|
6584 } |
|
6585 if (publicIdentifier != null) { |
|
6586 Portability.releaseString(publicIdentifier); |
|
6587 publicIdentifier = null; |
|
6588 } |
|
6589 if (tagName != null) { |
|
6590 tagName.release(); |
|
6591 tagName = null; |
|
6592 } |
|
6593 if (attributeName != null) { |
|
6594 attributeName.release(); |
|
6595 attributeName = null; |
|
6596 } |
|
6597 tokenHandler.endTokenization(); |
|
6598 if (attributes != null) { |
|
6599 // [NOCPP[ |
|
6600 attributes = null; |
|
6601 // ]NOCPP] |
|
6602 // CPPONLY: attributes.clear(mappingLangToXmlLang); |
|
6603 } |
|
6604 } |
|
6605 |
|
6606 public void requestSuspension() { |
|
6607 shouldSuspend = true; |
|
6608 } |
|
6609 |
|
6610 // [NOCPP[ |
|
6611 |
|
6612 public void becomeConfident() { |
|
6613 confident = true; |
|
6614 } |
|
6615 |
|
6616 /** |
|
6617 * Returns the nextCharOnNewLine. |
|
6618 * |
|
6619 * @return the nextCharOnNewLine |
|
6620 */ |
|
6621 public boolean isNextCharOnNewLine() { |
|
6622 return false; |
|
6623 } |
|
6624 |
|
6625 public boolean isPrevCR() { |
|
6626 return lastCR; |
|
6627 } |
|
6628 |
|
6629 /** |
|
6630 * Returns the line. |
|
6631 * |
|
6632 * @return the line |
|
6633 */ |
|
6634 public int getLine() { |
|
6635 return -1; |
|
6636 } |
|
6637 |
|
6638 /** |
|
6639 * Returns the col. |
|
6640 * |
|
6641 * @return the col |
|
6642 */ |
|
6643 public int getCol() { |
|
6644 return -1; |
|
6645 } |
|
6646 |
|
6647 // ]NOCPP] |
|
6648 |
|
6649 public boolean isInDataState() { |
|
6650 return (stateSave == DATA); |
|
6651 } |
|
6652 |
|
6653 public void resetToDataState() { |
|
6654 strBufLen = 0; |
|
6655 longStrBufLen = 0; |
|
6656 stateSave = Tokenizer.DATA; |
|
6657 // line = 1; XXX line numbers |
|
6658 lastCR = false; |
|
6659 index = 0; |
|
6660 forceQuirks = false; |
|
6661 additional = '\u0000'; |
|
6662 entCol = -1; |
|
6663 firstCharKey = -1; |
|
6664 lo = 0; |
|
6665 hi = 0; // will always be overwritten before use anyway |
|
6666 candidate = -1; |
|
6667 strBufMark = 0; |
|
6668 prevValue = -1; |
|
6669 value = 0; |
|
6670 seenDigits = false; |
|
6671 endTag = false; |
|
6672 shouldSuspend = false; |
|
6673 initDoctypeFields(); |
|
6674 if (tagName != null) { |
|
6675 tagName.release(); |
|
6676 tagName = null; |
|
6677 } |
|
6678 if (attributeName != null) { |
|
6679 attributeName.release(); |
|
6680 attributeName = null; |
|
6681 } |
|
6682 if (newAttributesEachTime) { |
|
6683 if (attributes != null) { |
|
6684 Portability.delete(attributes); |
|
6685 attributes = null; |
|
6686 } |
|
6687 } |
|
6688 } |
|
6689 |
|
6690 public void loadState(Tokenizer other) throws SAXException { |
|
6691 strBufLen = other.strBufLen; |
|
6692 if (strBufLen > strBuf.length) { |
|
6693 strBuf = new char[strBufLen]; |
|
6694 } |
|
6695 System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen); |
|
6696 |
|
6697 longStrBufLen = other.longStrBufLen; |
|
6698 if (longStrBufLen > longStrBuf.length) { |
|
6699 longStrBuf = new char[longStrBufLen]; |
|
6700 } |
|
6701 System.arraycopy(other.longStrBuf, 0, longStrBuf, 0, longStrBufLen); |
|
6702 |
|
6703 stateSave = other.stateSave; |
|
6704 returnStateSave = other.returnStateSave; |
|
6705 endTagExpectation = other.endTagExpectation; |
|
6706 endTagExpectationAsArray = other.endTagExpectationAsArray; |
|
6707 // line = 1; XXX line numbers |
|
6708 lastCR = other.lastCR; |
|
6709 index = other.index; |
|
6710 forceQuirks = other.forceQuirks; |
|
6711 additional = other.additional; |
|
6712 entCol = other.entCol; |
|
6713 firstCharKey = other.firstCharKey; |
|
6714 lo = other.lo; |
|
6715 hi = other.hi; |
|
6716 candidate = other.candidate; |
|
6717 strBufMark = other.strBufMark; |
|
6718 prevValue = other.prevValue; |
|
6719 value = other.value; |
|
6720 seenDigits = other.seenDigits; |
|
6721 endTag = other.endTag; |
|
6722 shouldSuspend = false; |
|
6723 |
|
6724 if (other.doctypeName == null) { |
|
6725 doctypeName = null; |
|
6726 } else { |
|
6727 doctypeName = Portability.newLocalFromLocal(other.doctypeName, |
|
6728 interner); |
|
6729 } |
|
6730 |
|
6731 Portability.releaseString(systemIdentifier); |
|
6732 if (other.systemIdentifier == null) { |
|
6733 systemIdentifier = null; |
|
6734 } else { |
|
6735 systemIdentifier = Portability.newStringFromString(other.systemIdentifier); |
|
6736 } |
|
6737 |
|
6738 Portability.releaseString(publicIdentifier); |
|
6739 if (other.publicIdentifier == null) { |
|
6740 publicIdentifier = null; |
|
6741 } else { |
|
6742 publicIdentifier = Portability.newStringFromString(other.publicIdentifier); |
|
6743 } |
|
6744 |
|
6745 if (tagName != null) { |
|
6746 tagName.release(); |
|
6747 } |
|
6748 if (other.tagName == null) { |
|
6749 tagName = null; |
|
6750 } else { |
|
6751 tagName = other.tagName.cloneElementName(interner); |
|
6752 } |
|
6753 |
|
6754 if (attributeName != null) { |
|
6755 attributeName.release(); |
|
6756 } |
|
6757 if (other.attributeName == null) { |
|
6758 attributeName = null; |
|
6759 } else { |
|
6760 attributeName = other.attributeName.cloneAttributeName(interner); |
|
6761 } |
|
6762 |
|
6763 Portability.delete(attributes); |
|
6764 if (other.attributes == null) { |
|
6765 attributes = null; |
|
6766 } else { |
|
6767 attributes = other.attributes.cloneAttributes(interner); |
|
6768 } |
|
6769 } |
|
6770 |
|
6771 public void initializeWithoutStarting() throws SAXException { |
|
6772 confident = false; |
|
6773 strBuf = new char[64]; |
|
6774 longStrBuf = new char[1024]; |
|
6775 line = 1; |
|
6776 // [NOCPP[ |
|
6777 html4 = false; |
|
6778 metaBoundaryPassed = false; |
|
6779 wantsComments = tokenHandler.wantsComments(); |
|
6780 if (!newAttributesEachTime) { |
|
6781 attributes = new HtmlAttributes(mappingLangToXmlLang); |
|
6782 } |
|
6783 // ]NOCPP] |
|
6784 resetToDataState(); |
|
6785 } |
|
6786 |
|
6787 protected void errGarbageAfterLtSlash() throws SAXException { |
|
6788 } |
|
6789 |
|
6790 protected void errLtSlashGt() throws SAXException { |
|
6791 } |
|
6792 |
|
6793 protected void errWarnLtSlashInRcdata() throws SAXException { |
|
6794 } |
|
6795 |
|
6796 protected void errHtml4LtSlashInRcdata(char folded) throws SAXException { |
|
6797 } |
|
6798 |
|
6799 protected void errCharRefLacksSemicolon() throws SAXException { |
|
6800 } |
|
6801 |
|
6802 protected void errNoDigitsInNCR() throws SAXException { |
|
6803 } |
|
6804 |
|
6805 protected void errGtInSystemId() throws SAXException { |
|
6806 } |
|
6807 |
|
6808 protected void errGtInPublicId() throws SAXException { |
|
6809 } |
|
6810 |
|
6811 protected void errNamelessDoctype() throws SAXException { |
|
6812 } |
|
6813 |
|
6814 protected void errConsecutiveHyphens() throws SAXException { |
|
6815 } |
|
6816 |
|
6817 protected void errPrematureEndOfComment() throws SAXException { |
|
6818 } |
|
6819 |
|
6820 protected void errBogusComment() throws SAXException { |
|
6821 } |
|
6822 |
|
6823 protected void errUnquotedAttributeValOrNull(char c) throws SAXException { |
|
6824 } |
|
6825 |
|
6826 protected void errSlashNotFollowedByGt() throws SAXException { |
|
6827 } |
|
6828 |
|
6829 protected void errHtml4XmlVoidSyntax() throws SAXException { |
|
6830 } |
|
6831 |
|
6832 protected void errNoSpaceBetweenAttributes() throws SAXException { |
|
6833 } |
|
6834 |
|
6835 protected void errHtml4NonNameInUnquotedAttribute(char c) |
|
6836 throws SAXException { |
|
6837 } |
|
6838 |
|
6839 protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c) |
|
6840 throws SAXException { |
|
6841 } |
|
6842 |
|
6843 protected void errAttributeValueMissing() throws SAXException { |
|
6844 } |
|
6845 |
|
6846 protected void errBadCharBeforeAttributeNameOrNull(char c) |
|
6847 throws SAXException { |
|
6848 } |
|
6849 |
|
6850 protected void errEqualsSignBeforeAttributeName() throws SAXException { |
|
6851 } |
|
6852 |
|
6853 protected void errBadCharAfterLt(char c) throws SAXException { |
|
6854 } |
|
6855 |
|
6856 protected void errLtGt() throws SAXException { |
|
6857 } |
|
6858 |
|
6859 protected void errProcessingInstruction() throws SAXException { |
|
6860 } |
|
6861 |
|
6862 protected void errUnescapedAmpersandInterpretedAsCharacterReference() |
|
6863 throws SAXException { |
|
6864 } |
|
6865 |
|
6866 protected void errNotSemicolonTerminated() throws SAXException { |
|
6867 } |
|
6868 |
|
6869 protected void errNoNamedCharacterMatch() throws SAXException { |
|
6870 } |
|
6871 |
|
6872 protected void errQuoteBeforeAttributeName(char c) throws SAXException { |
|
6873 } |
|
6874 |
|
6875 protected void errQuoteOrLtInAttributeNameOrNull(char c) |
|
6876 throws SAXException { |
|
6877 } |
|
6878 |
|
6879 protected void errExpectedPublicId() throws SAXException { |
|
6880 } |
|
6881 |
|
6882 protected void errBogusDoctype() throws SAXException { |
|
6883 } |
|
6884 |
|
6885 protected void maybeWarnPrivateUseAstral() throws SAXException { |
|
6886 } |
|
6887 |
|
6888 protected void maybeWarnPrivateUse(char ch) throws SAXException { |
|
6889 } |
|
6890 |
|
6891 protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs) |
|
6892 throws SAXException { |
|
6893 } |
|
6894 |
|
6895 protected void maybeErrSlashInEndTag(boolean selfClosing) |
|
6896 throws SAXException { |
|
6897 } |
|
6898 |
|
6899 protected char errNcrNonCharacter(char ch) throws SAXException { |
|
6900 return ch; |
|
6901 } |
|
6902 |
|
6903 protected void errAstralNonCharacter(int ch) throws SAXException { |
|
6904 } |
|
6905 |
|
6906 protected void errNcrSurrogate() throws SAXException { |
|
6907 } |
|
6908 |
|
6909 protected char errNcrControlChar(char ch) throws SAXException { |
|
6910 return ch; |
|
6911 } |
|
6912 |
|
6913 protected void errNcrCr() throws SAXException { |
|
6914 } |
|
6915 |
|
6916 protected void errNcrInC1Range() throws SAXException { |
|
6917 } |
|
6918 |
|
6919 protected void errEofInPublicId() throws SAXException { |
|
6920 } |
|
6921 |
|
6922 protected void errEofInComment() throws SAXException { |
|
6923 } |
|
6924 |
|
6925 protected void errEofInDoctype() throws SAXException { |
|
6926 } |
|
6927 |
|
6928 protected void errEofInAttributeValue() throws SAXException { |
|
6929 } |
|
6930 |
|
6931 protected void errEofInAttributeName() throws SAXException { |
|
6932 } |
|
6933 |
|
6934 protected void errEofWithoutGt() throws SAXException { |
|
6935 } |
|
6936 |
|
6937 protected void errEofInTagName() throws SAXException { |
|
6938 } |
|
6939 |
|
6940 protected void errEofInEndTag() throws SAXException { |
|
6941 } |
|
6942 |
|
6943 protected void errEofAfterLt() throws SAXException { |
|
6944 } |
|
6945 |
|
6946 protected void errNcrOutOfRange() throws SAXException { |
|
6947 } |
|
6948 |
|
6949 protected void errNcrUnassigned() throws SAXException { |
|
6950 } |
|
6951 |
|
6952 protected void errDuplicateAttribute() throws SAXException { |
|
6953 } |
|
6954 |
|
6955 protected void errEofInSystemId() throws SAXException { |
|
6956 } |
|
6957 |
|
6958 protected void errExpectedSystemId() throws SAXException { |
|
6959 } |
|
6960 |
|
6961 protected void errMissingSpaceBeforeDoctypeName() throws SAXException { |
|
6962 } |
|
6963 |
|
6964 protected void errHyphenHyphenBang() throws SAXException { |
|
6965 } |
|
6966 |
|
6967 protected void errNcrControlChar() throws SAXException { |
|
6968 } |
|
6969 |
|
6970 protected void errNcrZero() throws SAXException { |
|
6971 } |
|
6972 |
|
6973 protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote() |
|
6974 throws SAXException { |
|
6975 } |
|
6976 |
|
6977 protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException { |
|
6978 } |
|
6979 |
|
6980 protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote() |
|
6981 throws SAXException { |
|
6982 } |
|
6983 |
|
6984 protected void noteAttributeWithoutValue() throws SAXException { |
|
6985 } |
|
6986 |
|
6987 protected void noteUnquotedAttributeValue() throws SAXException { |
|
6988 } |
|
6989 |
|
6990 /** |
|
6991 * Sets the encodingDeclarationHandler. |
|
6992 * |
|
6993 * @param encodingDeclarationHandler |
|
6994 * the encodingDeclarationHandler to set |
|
6995 */ |
|
6996 public void setEncodingDeclarationHandler( |
|
6997 EncodingDeclarationHandler encodingDeclarationHandler) { |
|
6998 this.encodingDeclarationHandler = encodingDeclarationHandler; |
|
6999 } |
|
7000 |
|
7001 void destructor() { |
|
7002 // The translator will write refcount tracing stuff here |
|
7003 Portability.delete(attributes); |
|
7004 attributes = null; |
|
7005 } |
|
7006 |
|
7007 // [NOCPP[ |
|
7008 |
|
7009 /** |
|
7010 * Sets an offset to be added to the position reported to |
|
7011 * <code>TransitionHandler</code>. |
|
7012 * |
|
7013 * @param offset the offset |
|
7014 */ |
|
7015 public void setTransitionBaseOffset(int offset) { |
|
7016 |
|
7017 } |
|
7018 |
|
7019 // ]NOCPP] |
|
7020 |
|
7021 } |