Fri, 16 Jan 2015 18:13:44 +0100
Integrate suggestion from review to improve consistency with existing code.
1 /*
2 * Copyright (c) 2007 Henri Sivonen
3 * Copyright (c) 2008-2010 Mozilla Foundation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
24 package nu.validator.htmlparser.impl;
26 import java.io.IOException;
28 import nu.validator.htmlparser.annotation.Auto;
29 import nu.validator.htmlparser.annotation.Inline;
30 import nu.validator.htmlparser.common.ByteReadable;
32 import org.xml.sax.SAXException;
34 public abstract class MetaScanner {
36 /**
37 * The letters of the attribute name "charset" that follow its first
 * letter. The leading "c" is consumed in the before-attribute-name state
 * of the state loop, which sets charsetIndex to 0, so matching against
 * this array starts with the second letter of the name.
38 */
39 private static final char[] CHARSET = { 'h', 'a', 'r', 's', 'e', 't' };
41 /**
42 * The letters of the attribute name "content" that follow its first
 * letter. The leading "c" is consumed in the before-attribute-name state
 * of the state loop, which sets contentIndex to 0.
43 */
44 private static final char[] CONTENT = { 'o', 'n', 't', 'e', 'n', 't' };
46 /**
47 * The letters of the attribute name "http-equiv" that follow its first
 * letter. The leading "h" is consumed in the before-attribute-name state
 * of the state loop, which sets httpEquivIndex to 0.
48 */
49 private static final char[] HTTP_EQUIV = { 't', 't', 'p', '-', 'e', 'q',
50 'u', 'i', 'v' };
52 /**
53 * The complete attribute value "content-type". Unlike the attribute name
 * constants above, this array includes the first letter, because
 * contentTypeIndex is reset to 0 when the "=" sign is seen and the whole
 * value is then compared, ASCII case-insensitively.
54 */
55 private static final char[] CONTENT_TYPE = { 'c', 'o', 'n', 't', 'e', 'n',
56 't', '-', 't', 'y', 'p', 'e' };
// Values of metaState. They record how much of the tag name "meta" has been
// matched so far: NO means the name does not match, M, E, T and A mean the
// name matched up to and including that letter. Only A marks a meta tag.
58 private static final int NO = 0;
60 private static final int M = 1;
62 private static final int E = 2;
64 private static final int T = 3;
66 private static final int A = 4;
// Tokenizer states used by stateLoop(int) and stored in stateSave.
// The value 12 is not assigned to any state.
68 private static final int DATA = 0;
70 private static final int TAG_OPEN = 1;
72 private static final int SCAN_UNTIL_GT = 2;
74 private static final int TAG_NAME = 3;
76 private static final int BEFORE_ATTRIBUTE_NAME = 4;
78 private static final int ATTRIBUTE_NAME = 5;
80 private static final int AFTER_ATTRIBUTE_NAME = 6;
82 private static final int BEFORE_ATTRIBUTE_VALUE = 7;
84 private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 8;
86 private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 9;
88 private static final int ATTRIBUTE_VALUE_UNQUOTED = 10;
90 private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 11;
92 private static final int MARKUP_DECLARATION_OPEN = 13;
94 private static final int MARKUP_DECLARATION_HYPHEN = 14;
96 private static final int COMMENT_START = 15;
98 private static final int COMMENT_START_DASH = 16;
100 private static final int COMMENT = 17;
102 private static final int COMMENT_END_DASH = 18;
104 private static final int COMMENT_END = 19;
106 private static final int SELF_CLOSING_START_TAG = 20;
// Values of httpEquivState for the tag currently being scanned:
// no http-equiv value recorded yet, an http-equiv value equal to
// "content-type", or an http-equiv value that is something else.
108 private static final int HTTP_EQUIV_NOT_SEEN = 0;
110 private static final int HTTP_EQUIV_CONTENT_TYPE = 1;
112 private static final int HTTP_EQUIV_OTHER = 2;
114 /**
115 * The data source. read() fetches bytes from it. The constructor leaves
 * it null; presumably a subclass assigns it before scanning starts
 * (TODO confirm against subclasses).
116 */
117 protected ByteReadable readable;
119 /**
120 * The state of the state machine that recognizes the tag name "meta".
 * Holds one of NO, M, E, T or A.
121 */
122 private int metaState = NO;
124 /**
125 * The current position in recognizing the attribute name "content".
 * Integer.MAX_VALUE means the current attribute name does not match.
126 */
127 private int contentIndex = Integer.MAX_VALUE;
129 /**
130 * The current position in recognizing the attribute name "charset".
 * Integer.MAX_VALUE means the current attribute name does not match.
131 */
132 private int charsetIndex = Integer.MAX_VALUE;
134 /**
135 * The current position in recognizing the attribute name "http-equiv".
 * Integer.MAX_VALUE means the current attribute name does not match.
136 */
137 private int httpEquivIndex = Integer.MAX_VALUE;
139 /**
140 * The current position in recognizing the attribute value "content-type".
 * Integer.MAX_VALUE means the current attribute value does not match.
141 */
142 private int contentTypeIndex = Integer.MAX_VALUE;
144 /**
145 * The tokenizer state. stateLoop(int) stores the state it stopped in
 * here when it returns.
146 */
147 protected int stateSave = DATA;
149 /**
150 * The currently filled length of strBuf.
151 */
152 private int strBufLen;
154 /**
155 * Accumulation buffer for attribute values.
156 */
157 private @Auto char[] strBuf;
// Value of the content attribute of the tag being scanned, or null if none
// has been recorded yet. Cleared by handleTag().
159 private String content;
// Value of the charset attribute of the tag being scanned, or null if none
// has been recorded yet. Cleared by handleTag().
161 private String charset;
// One of the HTTP_EQUIV_* constants; reset to HTTP_EQUIV_NOT_SEEN by
// handleTag().
163 private int httpEquivState;
/**
 * Creates a scanner in its initial condition: no data source, the data
 * tokenizer state, no partially matched names or values, an empty
 * 36-character accumulation buffer and no recorded attribute values.
 */
public MetaScanner() {
    // input
    readable = null;

    // name and value matchers
    metaState = NO;
    contentIndex = Integer.MAX_VALUE;
    charsetIndex = Integer.MAX_VALUE;
    httpEquivIndex = Integer.MAX_VALUE;
    contentTypeIndex = Integer.MAX_VALUE;

    // tokenizer state and value accumulation buffer
    stateSave = DATA;
    strBufLen = 0;
    strBuf = new char[36];

    // results collected for the current tag
    content = null;
    charset = null;
    httpEquivState = HTTP_EQUIV_NOT_SEEN;
}
180 @SuppressWarnings("unused") private void destructor() {
181 Portability.releaseString(content);
182 Portability.releaseString(charset);
183 }
185 // [NOCPP[
187 /**
188 * Reads a byte from the data source.
189 *
190 * -1 means end.
191 * @return
192 * @throws IOException
193 */
194 protected int read() throws IOException {
195 return readable.readByte();
196 }
198 // ]NOCPP]
200 // WARNING When editing this, make sure the bytecode length shown by javap
201 // stays under 8000 bytes!
202 /**
203 * Runs the meta scanning algorithm.
 * <p>
 * Starting in the given tokenizer state, input is fetched one unit at a
 * time with read() and run through a tokenizer-like state machine that
 * tracks tags, attributes and comments. Attribute names and values are
 * only examined inside a tag whose name is "meta". The loop ends when
 * read() returns -1 or when handleTag() returns true. On return the
 * state that was current at that moment is stored in stateSave.
 * <p>
 * The order of the case labels matters: many states are left with a
 * labelled break so that control falls through into the next case
 * instead of going around the outer loop again.
 * <p>
 * NOTE(review): none of the whitespace case lists below contains a
 * carriage return, so that character is treated like any other
 * non-whitespace character - confirm that this is intended.
 *
 * @param state the tokenizer state to start in (one of the state
 *        constants of this class)
 * @throws SAXException propagated from handleTag() and
 *         handleAttributeValue()
 * @throws IOException propagated from read()
204 */
205 protected final void stateLoop(int state)
206 throws SAXException, IOException {
// The most recently read input unit; -1 signals the end of the input.
207 int c = -1;
// When true, the state that is entered next examines the current value of c
// again instead of reading a new one.
208 boolean reconsume = false;
209 stateloop: for (;;) {
210 switch (state) {
211 case DATA:
// Skip everything up to the next "<".
212 dataloop: for (;;) {
213 if (reconsume) {
214 reconsume = false;
215 } else {
216 c = read();
217 }
218 switch (c) {
219 case -1:
220 break stateloop;
221 case '<':
222 state = MetaScanner.TAG_OPEN;
223 break dataloop; // FALL THROUGH continue
224 // stateloop;
225 default:
226 continue;
227 }
228 }
229 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
230 case TAG_OPEN:
// The first character after "<" decides what kind of construct follows.
// An m or M starts the match of the tag name "meta".
231 tagopenloop: for (;;) {
232 c = read();
233 switch (c) {
234 case -1:
235 break stateloop;
236 case 'm':
237 case 'M':
238 metaState = M;
239 state = MetaScanner.TAG_NAME;
240 break tagopenloop;
241 // continue stateloop;
242 case '!':
243 state = MetaScanner.MARKUP_DECLARATION_OPEN;
244 continue stateloop;
245 case '?':
246 case '/':
247 state = MetaScanner.SCAN_UNTIL_GT;
248 continue stateloop;
249 case '>':
250 state = MetaScanner.DATA;
251 continue stateloop;
252 default:
253 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
254 metaState = NO;
255 state = MetaScanner.TAG_NAME;
256 break tagopenloop;
257 // continue stateloop;
258 }
259 state = MetaScanner.DATA;
260 reconsume = true;
261 continue stateloop;
262 }
263 }
264 // FALL THROUGH DON'T REORDER
265 case TAG_NAME:
// metaState advances M -> E -> T -> A while the name keeps spelling "meta"
// (in either case); any character that breaks the sequence resets it to NO.
266 tagnameloop: for (;;) {
267 c = read();
268 switch (c) {
269 case -1:
270 break stateloop;
271 case ' ':
272 case '\t':
273 case '\n':
274 case '\u000C':
275 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
276 break tagnameloop;
277 // continue stateloop;
278 case '/':
279 state = MetaScanner.SELF_CLOSING_START_TAG;
280 continue stateloop;
281 case '>':
282 state = MetaScanner.DATA;
283 continue stateloop;
284 case 'e':
285 case 'E':
286 if (metaState == M) {
287 metaState = E;
288 } else {
289 metaState = NO;
290 }
291 continue;
292 case 't':
293 case 'T':
294 if (metaState == E) {
295 metaState = T;
296 } else {
297 metaState = NO;
298 }
299 continue;
300 case 'a':
301 case 'A':
302 if (metaState == T) {
303 metaState = A;
304 } else {
305 metaState = NO;
306 }
307 continue;
308 default:
309 metaState = NO;
310 continue;
311 }
312 }
313 // FALLTHRU DON'T REORDER
314 case BEFORE_ATTRIBUTE_NAME:
// The first letter of an attribute name is consumed here. It selects which
// of the name matchers (content and charset, or http-equiv) are armed with
// index 0; all other matchers are disabled with Integer.MAX_VALUE.
315 beforeattributenameloop: for (;;) {
316 if (reconsume) {
317 reconsume = false;
318 } else {
319 c = read();
320 }
321 /*
322 * Consume the next input character:
323 */
324 switch (c) {
325 case -1:
326 break stateloop;
327 case ' ':
328 case '\t':
329 case '\n':
330 case '\u000C':
331 continue;
332 case '/':
333 state = MetaScanner.SELF_CLOSING_START_TAG;
334 continue stateloop;
335 case '>':
// A true result from handleTag() ends the scan.
336 if (handleTag()) {
337 break stateloop;
338 }
339 state = DATA;
340 continue stateloop;
341 case 'c':
342 case 'C':
343 contentIndex = 0;
344 charsetIndex = 0;
345 httpEquivIndex = Integer.MAX_VALUE;
346 contentTypeIndex = Integer.MAX_VALUE;
347 state = MetaScanner.ATTRIBUTE_NAME;
348 break beforeattributenameloop;
349 case 'h':
350 case 'H':
351 contentIndex = Integer.MAX_VALUE;
352 charsetIndex = Integer.MAX_VALUE;
353 httpEquivIndex = 0;
354 contentTypeIndex = Integer.MAX_VALUE;
355 state = MetaScanner.ATTRIBUTE_NAME;
356 break beforeattributenameloop;
357 default:
358 contentIndex = Integer.MAX_VALUE;
359 charsetIndex = Integer.MAX_VALUE;
360 httpEquivIndex = Integer.MAX_VALUE;
361 contentTypeIndex = Integer.MAX_VALUE;
362 state = MetaScanner.ATTRIBUTE_NAME;
363 break beforeattributenameloop;
364 // continue stateloop;
365 }
366 }
367 // FALLTHRU DON'T REORDER
368 case ATTRIBUTE_NAME:
369 attributenameloop: for (;;) {
370 c = read();
371 switch (c) {
372 case -1:
373 break stateloop;
374 case ' ':
375 case '\t':
376 case '\n':
377 case '\u000C':
378 state = MetaScanner.AFTER_ATTRIBUTE_NAME;
379 continue stateloop;
380 case '/':
381 state = MetaScanner.SELF_CLOSING_START_TAG;
382 continue stateloop;
383 case '=':
// A value follows: empty the accumulation buffer and restart the
// "content-type" value matcher.
384 strBufLen = 0;
385 contentTypeIndex = 0;
386 state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
387 break attributenameloop;
388 // continue stateloop;
389 case '>':
390 if (handleTag()) {
391 break stateloop;
392 }
393 state = MetaScanner.DATA;
394 continue stateloop;
395 default:
// Names are only matched inside a meta tag. The character is ASCII
// lower-cased first; each matcher either advances by one position or is
// disabled with Integer.MAX_VALUE.
396 if (metaState == A) {
397 if (c >= 'A' && c <= 'Z') {
398 c += 0x20;
399 }
400 if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) {
401 ++contentIndex;
402 } else {
403 contentIndex = Integer.MAX_VALUE;
404 }
405 if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) {
406 ++charsetIndex;
407 } else {
408 charsetIndex = Integer.MAX_VALUE;
409 }
410 if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) {
411 ++httpEquivIndex;
412 } else {
413 httpEquivIndex = Integer.MAX_VALUE;
414 }
415 }
416 continue;
417 }
418 }
419 // FALLTHRU DON'T REORDER
420 case BEFORE_ATTRIBUTE_VALUE:
421 beforeattributevalueloop: for (;;) {
422 c = read();
423 switch (c) {
424 case -1:
425 break stateloop;
426 case ' ':
427 case '\t':
428 case '\n':
429 case '\u000C':
430 continue;
431 case '"':
432 state = MetaScanner.ATTRIBUTE_VALUE_DOUBLE_QUOTED;
433 break beforeattributevalueloop;
434 // continue stateloop;
435 case '\'':
436 state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED;
437 continue stateloop;
438 case '>':
439 if (handleTag()) {
440 break stateloop;
441 }
442 state = MetaScanner.DATA;
443 continue stateloop;
444 default:
// This character is already the first character of an unquoted value.
445 handleCharInAttributeValue(c);
446 state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED;
447 continue stateloop;
448 }
449 }
450 // FALLTHRU DON'T REORDER
451 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
452 attributevaluedoublequotedloop: for (;;) {
453 if (reconsume) {
454 reconsume = false;
455 } else {
456 c = read();
457 }
458 switch (c) {
459 case -1:
460 break stateloop;
461 case '"':
462 handleAttributeValue();
463 state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
464 break attributevaluedoublequotedloop;
465 // continue stateloop;
466 default:
467 handleCharInAttributeValue(c);
468 continue;
469 }
470 }
471 // FALLTHRU DON'T REORDER
472 case AFTER_ATTRIBUTE_VALUE_QUOTED:
473 afterattributevaluequotedloop: for (;;) {
474 c = read();
475 switch (c) {
476 case -1:
477 break stateloop;
478 case ' ':
479 case '\t':
480 case '\n':
481 case '\u000C':
482 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
483 continue stateloop;
484 case '/':
485 state = MetaScanner.SELF_CLOSING_START_TAG;
486 break afterattributevaluequotedloop;
487 // continue stateloop;
488 case '>':
489 if (handleTag()) {
490 break stateloop;
491 }
492 state = MetaScanner.DATA;
493 continue stateloop;
494 default:
495 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
496 reconsume = true;
497 continue stateloop;
498 }
499 }
500 // FALLTHRU DON'T REORDER
501 case SELF_CLOSING_START_TAG:
502 c = read();
503 switch (c) {
504 case -1:
505 break stateloop;
506 case '>':
507 if (handleTag()) {
508 break stateloop;
509 }
510 state = MetaScanner.DATA;
511 continue stateloop;
512 default:
513 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
514 reconsume = true;
515 continue stateloop;
516 }
517 // XXX reorder point
518 case ATTRIBUTE_VALUE_UNQUOTED:
519 for (;;) {
520 if (reconsume) {
521 reconsume = false;
522 } else {
523 c = read();
524 }
525 switch (c) {
526 case -1:
527 break stateloop;
528 case ' ':
529 case '\t':
530 case '\n':
532 case '\u000C':
533 handleAttributeValue();
534 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
535 continue stateloop;
536 case '>':
537 handleAttributeValue();
538 if (handleTag()) {
539 break stateloop;
540 }
541 state = MetaScanner.DATA;
542 continue stateloop;
543 default:
544 handleCharInAttributeValue(c);
545 continue;
546 }
547 }
548 // XXX reorder point
549 case AFTER_ATTRIBUTE_NAME:
// NOTE(review): unlike BEFORE_ATTRIBUTE_NAME, this state has no case for
// h or H and leaves httpEquivIndex and contentTypeIndex untouched when a new
// attribute name starts, so an http-equiv attribute that directly follows a
// valueless attribute does not appear to be recognized - confirm whether
// this is intended.
// NOTE(review): handleAttributeValue() is called here for "/" and ">"
// although no value was read for the preceding name, so it works on
// whatever strBufLen currently holds - confirm.
550 for (;;) {
551 c = read();
552 switch (c) {
553 case -1:
554 break stateloop;
555 case ' ':
556 case '\t':
557 case '\n':
558 case '\u000C':
559 continue;
560 case '/':
561 handleAttributeValue();
562 state = MetaScanner.SELF_CLOSING_START_TAG;
563 continue stateloop;
564 case '=':
565 strBufLen = 0;
566 contentTypeIndex = 0;
567 state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
568 continue stateloop;
569 case '>':
570 handleAttributeValue();
571 if (handleTag()) {
572 break stateloop;
573 }
574 state = MetaScanner.DATA;
575 continue stateloop;
576 case 'c':
577 case 'C':
578 contentIndex = 0;
579 charsetIndex = 0;
580 state = MetaScanner.ATTRIBUTE_NAME;
581 continue stateloop;
582 default:
583 contentIndex = Integer.MAX_VALUE;
584 charsetIndex = Integer.MAX_VALUE;
585 state = MetaScanner.ATTRIBUTE_NAME;
586 continue stateloop;
587 }
588 }
589 // XXX reorder point
590 case MARKUP_DECLARATION_OPEN:
// Seen "<!". Two hyphens in a row start a comment; anything else is
// skipped up to the next ">".
591 markupdeclarationopenloop: for (;;) {
592 c = read();
593 switch (c) {
594 case -1:
595 break stateloop;
596 case '-':
597 state = MetaScanner.MARKUP_DECLARATION_HYPHEN;
598 break markupdeclarationopenloop;
599 // continue stateloop;
600 default:
601 state = MetaScanner.SCAN_UNTIL_GT;
602 reconsume = true;
603 continue stateloop;
604 }
605 }
606 // FALLTHRU DON'T REORDER
607 case MARKUP_DECLARATION_HYPHEN:
608 markupdeclarationhyphenloop: for (;;) {
609 c = read();
610 switch (c) {
611 case -1:
612 break stateloop;
613 case '-':
614 state = MetaScanner.COMMENT_START;
615 break markupdeclarationhyphenloop;
616 // continue stateloop;
617 default:
618 state = MetaScanner.SCAN_UNTIL_GT;
619 reconsume = true;
620 continue stateloop;
621 }
622 }
623 // FALLTHRU DON'T REORDER
624 case COMMENT_START:
625 commentstartloop: for (;;) {
626 c = read();
627 switch (c) {
628 case -1:
629 break stateloop;
630 case '-':
631 state = MetaScanner.COMMENT_START_DASH;
632 continue stateloop;
633 case '>':
634 state = MetaScanner.DATA;
635 continue stateloop;
636 default:
637 state = MetaScanner.COMMENT;
638 break commentstartloop;
639 // continue stateloop;
640 }
641 }
642 // FALLTHRU DON'T REORDER
643 case COMMENT:
644 commentloop: for (;;) {
645 c = read();
646 switch (c) {
647 case -1:
648 break stateloop;
649 case '-':
650 state = MetaScanner.COMMENT_END_DASH;
651 break commentloop;
652 // continue stateloop;
653 default:
654 continue;
655 }
656 }
657 // FALLTHRU DON'T REORDER
658 case COMMENT_END_DASH:
659 commentenddashloop: for (;;) {
660 c = read();
661 switch (c) {
662 case -1:
663 break stateloop;
664 case '-':
665 state = MetaScanner.COMMENT_END;
666 break commentenddashloop;
667 // continue stateloop;
668 default:
669 state = MetaScanner.COMMENT;
670 continue stateloop;
671 }
672 }
673 // FALLTHRU DON'T REORDER
674 case COMMENT_END:
675 for (;;) {
676 c = read();
677 switch (c) {
678 case -1:
679 break stateloop;
680 case '>':
681 state = MetaScanner.DATA;
682 continue stateloop;
683 case '-':
684 continue;
685 default:
686 state = MetaScanner.COMMENT;
687 continue stateloop;
688 }
689 }
690 // XXX reorder point
691 case COMMENT_START_DASH:
692 c = read();
693 switch (c) {
694 case -1:
695 break stateloop;
696 case '-':
697 state = MetaScanner.COMMENT_END;
698 continue stateloop;
699 case '>':
700 state = MetaScanner.DATA;
701 continue stateloop;
702 default:
703 state = MetaScanner.COMMENT;
704 continue stateloop;
705 }
706 // XXX reorder point
707 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
708 for (;;) {
709 if (reconsume) {
710 reconsume = false;
711 } else {
712 c = read();
713 }
714 switch (c) {
715 case -1:
716 break stateloop;
717 case '\'':
718 handleAttributeValue();
719 state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
720 continue stateloop;
721 default:
722 handleCharInAttributeValue(c);
723 continue;
724 }
725 }
726 // XXX reorder point
727 case SCAN_UNTIL_GT:
// Ignore everything up to and including the next ">".
728 for (;;) {
729 if (reconsume) {
730 reconsume = false;
731 } else {
732 c = read();
733 }
734 switch (c) {
735 case -1:
736 break stateloop;
737 case '>':
738 state = MetaScanner.DATA;
739 continue stateloop;
740 default:
741 continue;
742 }
743 }
744 }
745 }
// Remember the state in which scanning stopped.
746 stateSave = state;
747 }
749 private void handleCharInAttributeValue(int c) {
750 if (metaState == A) {
751 if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) {
752 addToBuffer(c);
753 } else if (httpEquivIndex == HTTP_EQUIV.length) {
754 if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) {
755 ++contentTypeIndex;
756 } else {
757 contentTypeIndex = Integer.MAX_VALUE;
758 }
759 }
760 }
761 }
763 @Inline private int toAsciiLowerCase(int c) {
764 if (c >= 'A' && c <= 'Z') {
765 return c + 0x20;
766 }
767 return c;
768 }
770 /**
771 * Adds a character to the accumulation buffer.
772 * @param c the character to add
773 */
774 private void addToBuffer(int c) {
775 if (strBufLen == strBuf.length) {
776 char[] newBuf = new char[strBuf.length + (strBuf.length << 1)];
777 System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
778 strBuf = newBuf;
779 }
780 strBuf[strBufLen++] = (char)c;
781 }
783 /**
784 * Attempts to extract a charset name from the accumulation buffer.
785 * @return <code>true</code> if successful
786 * @throws SAXException
787 */
788 private void handleAttributeValue() throws SAXException {
789 if (metaState != A) {
790 return;
791 }
792 if (contentIndex == CONTENT.length && content == null) {
793 content = Portability.newStringFromBuffer(strBuf, 0, strBufLen);
794 return;
795 }
796 if (charsetIndex == CHARSET.length && charset == null) {
797 charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen);
798 return;
799 }
800 if (httpEquivIndex == HTTP_EQUIV.length
801 && httpEquivState == HTTP_EQUIV_NOT_SEEN) {
802 httpEquivState = (contentTypeIndex == CONTENT_TYPE.length) ? HTTP_EQUIV_CONTENT_TYPE
803 : HTTP_EQUIV_OTHER;
804 return;
805 }
806 }
808 private boolean handleTag() throws SAXException {
809 boolean stop = handleTagInner();
810 Portability.releaseString(content);
811 content = null;
812 Portability.releaseString(charset);
813 charset = null;
814 httpEquivState = HTTP_EQUIV_NOT_SEEN;
815 return stop;
816 }
818 private boolean handleTagInner() throws SAXException {
819 if (charset != null && tryCharset(charset)) {
820 return true;
821 }
822 if (content != null && httpEquivState == HTTP_EQUIV_CONTENT_TYPE) {
823 String extract = TreeBuilder.extractCharsetFromContent(content);
824 if (extract == null) {
825 return false;
826 }
827 boolean success = tryCharset(extract);
828 Portability.releaseString(extract);
829 return success;
830 }
831 return false;
832 }
834 /**
835 * Tries to switch to an encoding.
 * <p>
 * handleTagInner() calls this with the value of a charset attribute, or
 * with the charset extracted from the content attribute when the
 * http-equiv value of the tag is "content-type". A <code>true</code>
 * result is passed up through handleTag() and makes the state loop stop
 * scanning.
836 *
837 * @param encoding the encoding name found in the meta tag
838 * @return <code>true</code> if successful
839 * @throws SAXException
840 */
841 protected abstract boolean tryCharset(String encoding) throws SAXException;
843 }