michael@0: /*
michael@0: * Copyright (c) 2007 Henri Sivonen
michael@0: * Copyright (c) 2008-2010 Mozilla Foundation
michael@0: *
michael@0: * Permission is hereby granted, free of charge, to any person obtaining a
michael@0: * copy of this software and associated documentation files (the "Software"),
michael@0: * to deal in the Software without restriction, including without limitation
michael@0: * the rights to use, copy, modify, merge, publish, distribute, sublicense,
michael@0: * and/or sell copies of the Software, and to permit persons to whom the
michael@0: * Software is furnished to do so, subject to the following conditions:
michael@0: *
michael@0: * The above copyright notice and this permission notice shall be included in
michael@0: * all copies or substantial portions of the Software.
michael@0: *
michael@0: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
michael@0: * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
michael@0: * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
michael@0: * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
michael@0: * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
michael@0: * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
michael@0: * DEALINGS IN THE SOFTWARE.
michael@0: */
michael@0:
michael@0: package nu.validator.htmlparser.impl;
michael@0:
michael@0: import java.io.IOException;
michael@0:
michael@0: import nu.validator.htmlparser.annotation.Auto;
michael@0: import nu.validator.htmlparser.annotation.Inline;
michael@0: import nu.validator.htmlparser.common.ByteReadable;
michael@0:
michael@0: import org.xml.sax.SAXException;
michael@0:
michael@0: public abstract class MetaScanner {
michael@0:
michael@0: /**
michael@0: * Constant for "charset", minus its leading 'c'. The first letter of an
michael@0: * attribute name is consumed by the state that starts the name (it sets
michael@0: * charsetIndex to 0 on 'c'/'C'), so only the remaining letters are compared
michael@0: * here while in ATTRIBUTE_NAME.
michael@0: */
michael@0: private static final char[] CHARSET = { 'h', 'a', 'r', 's', 'e', 't' };
michael@0:
michael@0: /**
michael@0: * Constant for "content", minus its leading 'c' (see CHARSET; indexed by
michael@0: * contentIndex).
michael@0: */
michael@0: private static final char[] CONTENT = { 'o', 'n', 't', 'e', 'n', 't' };
michael@0:
michael@0: /**
michael@0: * Constant for "http-equiv", minus its leading 'h' (see CHARSET; indexed by
michael@0: * httpEquivIndex).
michael@0: */
michael@0: private static final char[] HTTP_EQUIV = { 't', 't', 'p', '-', 'e', 'q',
michael@0: 'u', 'i', 'v' };
michael@0:
michael@0: /**
michael@0: * Constant for "content-type", spelled out in full: attribute values are
michael@0: * matched starting from their first character (indexed by contentTypeIndex,
michael@0: * which is set to 0 when '=' is seen).
michael@0: */
michael@0: private static final char[] CONTENT_TYPE = { 'c', 'o', 'n', 't', 'e', 'n',
michael@0: 't', '-', 't', 'y', 'p', 'e' };
michael@0:
michael@0: private static final int NO = 0; // metaState: the tag name read so far cannot be "meta"
michael@0:
michael@0: private static final int M = 1; // metaState: tag name so far is "m"
michael@0:
michael@0: private static final int E = 2; // metaState: tag name so far is "me"
michael@0:
michael@0: private static final int T = 3; // metaState: tag name so far is "met"
michael@0:
michael@0: private static final int A = 4; // metaState: tag name so far is "meta"; attributes are only examined in this state
michael@0:
michael@0: private static final int DATA = 0; // stateLoop state: outside markup, looking for '<'
michael@0:
michael@0: private static final int TAG_OPEN = 1; // stateLoop state: just after '<'
michael@0:
michael@0: private static final int SCAN_UNTIL_GT = 2; // stateLoop state: skipping everything up to the next '>'
michael@0:
michael@0: private static final int TAG_NAME = 3; // stateLoop state: inside a start tag name
michael@0:
michael@0: private static final int BEFORE_ATTRIBUTE_NAME = 4; // stateLoop state: in a tag, before an attribute name
michael@0:
michael@0: private static final int ATTRIBUTE_NAME = 5; // stateLoop state: inside an attribute name
michael@0:
michael@0: private static final int AFTER_ATTRIBUTE_NAME = 6; // stateLoop state: whitespace seen after an attribute name
michael@0:
michael@0: private static final int BEFORE_ATTRIBUTE_VALUE = 7; // stateLoop state: after '=', before the value
michael@0:
michael@0: private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 8; // stateLoop state: inside a "..." value
michael@0:
michael@0: private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 9; // stateLoop state: inside a '...' value
michael@0:
michael@0: private static final int ATTRIBUTE_VALUE_UNQUOTED = 10; // stateLoop state: inside an unquoted value
michael@0:
michael@0: private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 11; // stateLoop state: just after a closing quote
michael@0:
michael@0: private static final int MARKUP_DECLARATION_OPEN = 13; // stateLoop state: after "<!" (no state is numbered 12)
michael@0:
michael@0: private static final int MARKUP_DECLARATION_HYPHEN = 14; // stateLoop state: after "<!-"
michael@0:
michael@0: private static final int COMMENT_START = 15; // stateLoop state: after "<!--"
michael@0:
michael@0: private static final int COMMENT_START_DASH = 16; // stateLoop state: after "<!---"
michael@0:
michael@0: private static final int COMMENT = 17; // stateLoop state: inside a comment
michael@0:
michael@0: private static final int COMMENT_END_DASH = 18; // stateLoop state: after one '-' inside a comment
michael@0:
michael@0: private static final int COMMENT_END = 19; // stateLoop state: after "--" inside a comment
michael@0:
michael@0: private static final int SELF_CLOSING_START_TAG = 20; // stateLoop state: after '/' inside a tag
michael@0:
michael@0: private static final int HTTP_EQUIV_NOT_SEEN = 0; // httpEquivState: no http-equiv value recorded yet for the current tag
michael@0:
michael@0: private static final int HTTP_EQUIV_CONTENT_TYPE = 1; // httpEquivState: the http-equiv value matched "content-type" (ASCII case-insensitive)
michael@0:
michael@0: private static final int HTTP_EQUIV_OTHER = 2; // httpEquivState: the http-equiv value was something else
michael@0:
michael@0: /**
michael@0: * The data source. The constructor leaves it null and read() dereferences
michael@0: * it, so it has to be assigned before scanning starts (the assignment is
michael@0: * not in this class -- presumably done by a subclass; verify).
michael@0: */
michael@0: protected ByteReadable readable;
michael@0:
michael@0: /**
michael@0: * The state of the state machine that recognizes the tag name "meta"
michael@0: * (one of NO, M, E, T, A).
michael@0: */
michael@0: private int metaState = NO;
michael@0:
michael@0: /**
michael@0: * The current position in recognizing the attribute name "content"
michael@0: * (an index into CONTENT). Integer.MAX_VALUE means the name being read
michael@0: * cannot be "content"; CONTENT.length means it matched completely.
michael@0: */
michael@0: private int contentIndex = Integer.MAX_VALUE;
michael@0:
michael@0: /**
michael@0: * The current position in recognizing the attribute name "charset"
michael@0: * (an index into CHARSET, same conventions as contentIndex).
michael@0: */
michael@0: private int charsetIndex = Integer.MAX_VALUE;
michael@0:
michael@0: /**
michael@0: * The current position in recognizing the attribute name "http-equiv"
michael@0: * (an index into HTTP_EQUIV, same conventions as contentIndex).
michael@0: */
michael@0: private int httpEquivIndex = Integer.MAX_VALUE;
michael@0:
michael@0: /**
michael@0: * The current position in recognizing the attribute value "content-type"
michael@0: * (an index into CONTENT_TYPE, same conventions as contentIndex).
michael@0: */
michael@0: private int contentTypeIndex = Integer.MAX_VALUE;
michael@0:
michael@0: /**
michael@0: * The tokenizer state. stateLoop() stores the state it stopped in here
michael@0: * just before returning.
michael@0: */
michael@0: protected int stateSave = DATA;
michael@0:
michael@0: /**
michael@0: * The currently filled length of strBuf. Reset to 0 when '=' is seen.
michael@0: */
michael@0: private int strBufLen;
michael@0:
michael@0: /**
michael@0: * Accumulation buffer for attribute values. Only values of "content" and
michael@0: * "charset" attributes on a meta tag are accumulated.
michael@0: */
michael@0: private @Auto char[] strBuf;
michael@0:
michael@0: private String content; // value of the first "content" attribute on the current tag, or null; cleared by handleTag()
michael@0:
michael@0: private String charset; // value of the first "charset" attribute on the current tag, or null; cleared by handleTag()
michael@0:
michael@0: private int httpEquivState; // one of the HTTP_EQUIV_* constants; reset to HTTP_EQUIV_NOT_SEEN by handleTag()
michael@0:
michael@0: /**
michael@0:  * Creates a scanner in its initial state: no data source, tokenizer in
michael@0:  * DATA, nothing matched, and an empty 36-char value buffer.
michael@0:  */
michael@0: public MetaScanner() {
michael@0:     // Input and tokenizer position.
michael@0:     readable = null;
michael@0:     stateSave = DATA;
michael@0:     metaState = NO;
michael@0:
michael@0:     // Matchers: Integer.MAX_VALUE means "not currently matching".
michael@0:     contentIndex = Integer.MAX_VALUE;
michael@0:     charsetIndex = Integer.MAX_VALUE;
michael@0:     httpEquivIndex = Integer.MAX_VALUE;
michael@0:     contentTypeIndex = Integer.MAX_VALUE;
michael@0:
michael@0:     // Attribute value accumulation.
michael@0:     strBuf = new char[36];
michael@0:     strBufLen = 0;
michael@0:
michael@0:     // Per-tag results.
michael@0:     content = null;
michael@0:     charset = null;
michael@0:     httpEquivState = HTTP_EQUIV_NOT_SEEN;
michael@0: }
michael@0:
michael@0: /**
michael@0:  * Passes the retained "content" and "charset" strings to
michael@0:  * Portability.releaseString(). No code in this class calls this method,
michael@0:  * hence the "unused" suppression.
michael@0:  */
michael@0: @SuppressWarnings("unused")
michael@0: private void destructor() {
michael@0:     Portability.releaseString(content);
michael@0:     Portability.releaseString(charset);
michael@0: }
michael@0:
michael@0: // [NOCPP[
michael@0:
michael@0: /**
michael@0: * Reads a byte from the data source.
michael@0: *
michael@0: * -1 means end.
michael@0: * @return
michael@0: * @throws IOException
michael@0: */
michael@0: protected int read() throws IOException {
michael@0: return readable.readByte();
michael@0: }
michael@0:
michael@0: // ]NOCPP]
michael@0:
michael@0: // WARNING When editing this, make sure the bytecode length shown by javap
michael@0: // stays under 8000 bytes!
michael@0: /**
michael@0: * Runs the meta scanning algorithm.
michael@0: *
michael@0: * Reads the input one byte at a time through read() and drives a
michael@0: * tokenizer-like state machine, starting in the given state. End tags,
michael@0: * "<?...>", "<!...>" and comments are skipped. For start tags whose name
michael@0: * is "meta" (ASCII case-insensitive) the "charset", "content" and
michael@0: * "http-equiv" attributes are collected, and handleTag() is called when
michael@0: * the tag's closing '>' is reached.
michael@0: *
michael@0: * The method returns when read() yields -1 or when handleTag() returns
michael@0: * true. In both cases the current value of state is stored in stateSave.
michael@0: *
michael@0: * The switch relies on case fall-through: breaking out of an inner labeled
michael@0: * loop (rather than "continue stateloop") enters the case that textually
michael@0: * follows, so cases marked DON'T REORDER must keep their position.
michael@0: *
michael@0: * NOTE(review): AFTER_ATTRIBUTE_NAME, unlike BEFORE_ATTRIBUTE_NAME, has no
michael@0: * 'h'/'H' case and never resets httpEquivIndex or contentTypeIndex, so an
michael@0: * "http-equiv" name that follows a valueless attribute is not matched --
michael@0: * confirm whether that is intended.
michael@0: * NOTE(review): carriage return is not among the whitespace characters
michael@0: * handled by any state -- confirm whether that is intended.
michael@0: *
michael@0: * @param state the state to start in (one of the tokenizer state constants)
michael@0: * @throws SAXException declared by handleTag() and handleAttributeValue()
michael@0: * @throws IOException if read() throws it
michael@0: */
michael@0: protected final void stateLoop(int state)
michael@0: throws SAXException, IOException {
michael@0: int c = -1; // most recently read byte; -1 signals end of input
michael@0: boolean reconsume = false; // true: the next state must examine c again instead of reading
michael@0: stateloop: for (;;) {
michael@0: switch (state) {
michael@0: case DATA: // outside markup: skip ahead to the next '<'
michael@0: dataloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: c = read();
michael@0: }
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case '<':
michael@0: state = MetaScanner.TAG_OPEN;
michael@0: break dataloop; // FALL THROUGH continue
michael@0: // stateloop;
michael@0: default:
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0: case TAG_OPEN: // just read '<'
michael@0: tagopenloop: for (;;) {
michael@0: c = read();
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case 'm':
michael@0: case 'M':
michael@0: metaState = M; // possible start of "meta"
michael@0: state = MetaScanner.TAG_NAME;
michael@0: break tagopenloop;
michael@0: // continue stateloop;
michael@0: case '!':
michael@0: state = MetaScanner.MARKUP_DECLARATION_OPEN;
michael@0: continue stateloop;
michael@0: case '?':
michael@0: case '/':
michael@0: state = MetaScanner.SCAN_UNTIL_GT; // "<?" or "</": skip to the closing '>'
michael@0: continue stateloop;
michael@0: case '>':
michael@0: state = MetaScanner.DATA;
michael@0: continue stateloop;
michael@0: default:
michael@0: if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
michael@0: metaState = NO; // a tag, but its name does not start with 'm'
michael@0: state = MetaScanner.TAG_NAME;
michael@0: break tagopenloop;
michael@0: // continue stateloop;
michael@0: }
michael@0: state = MetaScanner.DATA;
michael@0: reconsume = true; // not a tag after all: let DATA look at c
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALL THROUGH DON'T REORDER
michael@0: case TAG_NAME: // advances metaState through m-e-t-a; anything else resets it to NO
michael@0: tagnameloop: for (;;) {
michael@0: c = read();
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\n':
michael@0: case '\u000C':
michael@0: state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
michael@0: break tagnameloop;
michael@0: // continue stateloop;
michael@0: case '/':
michael@0: state = MetaScanner.SELF_CLOSING_START_TAG;
michael@0: continue stateloop;
michael@0: case '>':
michael@0: state = MetaScanner.DATA;
michael@0: continue stateloop;
michael@0: case 'e':
michael@0: case 'E':
michael@0: if (metaState == M) {
michael@0: metaState = E;
michael@0: } else {
michael@0: metaState = NO;
michael@0: }
michael@0: continue;
michael@0: case 't':
michael@0: case 'T':
michael@0: if (metaState == E) {
michael@0: metaState = T;
michael@0: } else {
michael@0: metaState = NO;
michael@0: }
michael@0: continue;
michael@0: case 'a':
michael@0: case 'A':
michael@0: if (metaState == T) {
michael@0: metaState = A;
michael@0: } else {
michael@0: metaState = NO;
michael@0: }
michael@0: continue;
michael@0: default:
michael@0: metaState = NO;
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case BEFORE_ATTRIBUTE_NAME: // the first letter of the name decides which matchers are armed
michael@0: beforeattributenameloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: c = read();
michael@0: }
michael@0: /*
michael@0: * Consume the next input character:
michael@0: */
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\n':
michael@0: case '\u000C':
michael@0: continue;
michael@0: case '/':
michael@0: state = MetaScanner.SELF_CLOSING_START_TAG;
michael@0: continue stateloop;
michael@0: case '>':
michael@0: if (handleTag()) { // tag complete; true means stop scanning
michael@0: break stateloop;
michael@0: }
michael@0: state = DATA;
michael@0: continue stateloop;
michael@0: case 'c':
michael@0: case 'C':
michael@0: contentIndex = 0; // the name may be "content" ...
michael@0: charsetIndex = 0; // ... or "charset"
michael@0: httpEquivIndex = Integer.MAX_VALUE;
michael@0: contentTypeIndex = Integer.MAX_VALUE;
michael@0: state = MetaScanner.ATTRIBUTE_NAME;
michael@0: break beforeattributenameloop;
michael@0: case 'h':
michael@0: case 'H':
michael@0: contentIndex = Integer.MAX_VALUE;
michael@0: charsetIndex = Integer.MAX_VALUE;
michael@0: httpEquivIndex = 0; // the name may be "http-equiv"
michael@0: contentTypeIndex = Integer.MAX_VALUE;
michael@0: state = MetaScanner.ATTRIBUTE_NAME;
michael@0: break beforeattributenameloop;
michael@0: default:
michael@0: contentIndex = Integer.MAX_VALUE;
michael@0: charsetIndex = Integer.MAX_VALUE;
michael@0: httpEquivIndex = Integer.MAX_VALUE;
michael@0: contentTypeIndex = Integer.MAX_VALUE;
michael@0: state = MetaScanner.ATTRIBUTE_NAME;
michael@0: break beforeattributenameloop;
michael@0: // continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case ATTRIBUTE_NAME: // matches the rest of the name against CONTENT, CHARSET and HTTP_EQUIV
michael@0: attributenameloop: for (;;) {
michael@0: c = read();
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\n':
michael@0: case '\u000C':
michael@0: state = MetaScanner.AFTER_ATTRIBUTE_NAME;
michael@0: continue stateloop;
michael@0: case '/':
michael@0: state = MetaScanner.SELF_CLOSING_START_TAG;
michael@0: continue stateloop;
michael@0: case '=':
michael@0: strBufLen = 0; // start collecting a fresh value
michael@0: contentTypeIndex = 0; // start matching the value against "content-type"
michael@0: state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
michael@0: break attributenameloop;
michael@0: // continue stateloop;
michael@0: case '>':
michael@0: if (handleTag()) {
michael@0: break stateloop;
michael@0: }
michael@0: state = MetaScanner.DATA;
michael@0: continue stateloop;
michael@0: default:
michael@0: if (metaState == A) { // attribute names only matter inside a meta tag
michael@0: if (c >= 'A' && c <= 'Z') {
michael@0: c += 0x20; // ASCII upper case to lower case
michael@0: }
michael@0: if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) {
michael@0: ++contentIndex;
michael@0: } else {
michael@0: contentIndex = Integer.MAX_VALUE;
michael@0: }
michael@0: if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) {
michael@0: ++charsetIndex;
michael@0: } else {
michael@0: charsetIndex = Integer.MAX_VALUE;
michael@0: }
michael@0: if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) {
michael@0: ++httpEquivIndex;
michael@0: } else {
michael@0: httpEquivIndex = Integer.MAX_VALUE;
michael@0: }
michael@0: }
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case BEFORE_ATTRIBUTE_VALUE: // after '=': decide between quoted and unquoted value
michael@0: beforeattributevalueloop: for (;;) {
michael@0: c = read();
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\n':
michael@0: case '\u000C':
michael@0: continue;
michael@0: case '"':
michael@0: state = MetaScanner.ATTRIBUTE_VALUE_DOUBLE_QUOTED;
michael@0: break beforeattributevalueloop;
michael@0: // continue stateloop;
michael@0: case '\'':
michael@0: state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED;
michael@0: continue stateloop;
michael@0: case '>':
michael@0: if (handleTag()) {
michael@0: break stateloop;
michael@0: }
michael@0: state = MetaScanner.DATA;
michael@0: continue stateloop;
michael@0: default:
michael@0: handleCharInAttributeValue(c); // c is the first character of an unquoted value
michael@0: state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED;
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
michael@0: attributevaluedoublequotedloop: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: c = read();
michael@0: }
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case '"':
michael@0: handleAttributeValue(); // closing quote: the value is complete
michael@0: state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
michael@0: break attributevaluedoublequotedloop;
michael@0: // continue stateloop;
michael@0: default:
michael@0: handleCharInAttributeValue(c);
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case AFTER_ATTRIBUTE_VALUE_QUOTED:
michael@0: afterattributevaluequotedloop: for (;;) {
michael@0: c = read();
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\n':
michael@0: case '\u000C':
michael@0: state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
michael@0: continue stateloop;
michael@0: case '/':
michael@0: state = MetaScanner.SELF_CLOSING_START_TAG;
michael@0: break afterattributevaluequotedloop;
michael@0: // continue stateloop;
michael@0: case '>':
michael@0: if (handleTag()) {
michael@0: break stateloop;
michael@0: }
michael@0: state = MetaScanner.DATA;
michael@0: continue stateloop;
michael@0: default:
michael@0: state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
michael@0: reconsume = true; // c already starts the next attribute name
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case SELF_CLOSING_START_TAG: // just read '/' inside a tag
michael@0: c = read();
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case '>':
michael@0: if (handleTag()) {
michael@0: break stateloop;
michael@0: }
michael@0: state = MetaScanner.DATA;
michael@0: continue stateloop;
michael@0: default:
michael@0: state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
michael@0: reconsume = true;
michael@0: continue stateloop;
michael@0: }
michael@0: // XXX reorder point
michael@0: case ATTRIBUTE_VALUE_UNQUOTED: // whitespace or '>' ends the value
michael@0: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: c = read();
michael@0: }
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\n':
michael@0:
michael@0: case '\u000C':
michael@0: handleAttributeValue();
michael@0: state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
michael@0: continue stateloop;
michael@0: case '>':
michael@0: handleAttributeValue();
michael@0: if (handleTag()) {
michael@0: break stateloop;
michael@0: }
michael@0: state = MetaScanner.DATA;
michael@0: continue stateloop;
michael@0: default:
michael@0: handleCharInAttributeValue(c);
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case AFTER_ATTRIBUTE_NAME: // NOTE(review): no 'h'/'H' case and httpEquivIndex is left untouched here, unlike BEFORE_ATTRIBUTE_NAME -- confirm
michael@0: for (;;) {
michael@0: c = read();
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case ' ':
michael@0: case '\t':
michael@0: case '\n':
michael@0: case '\u000C':
michael@0: continue;
michael@0: case '/':
michael@0: handleAttributeValue();
michael@0: state = MetaScanner.SELF_CLOSING_START_TAG;
michael@0: continue stateloop;
michael@0: case '=':
michael@0: strBufLen = 0;
michael@0: contentTypeIndex = 0;
michael@0: state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
michael@0: continue stateloop;
michael@0: case '>':
michael@0: handleAttributeValue();
michael@0: if (handleTag()) {
michael@0: break stateloop;
michael@0: }
michael@0: state = MetaScanner.DATA;
michael@0: continue stateloop;
michael@0: case 'c':
michael@0: case 'C':
michael@0: contentIndex = 0;
michael@0: charsetIndex = 0;
michael@0: state = MetaScanner.ATTRIBUTE_NAME;
michael@0: continue stateloop;
michael@0: default:
michael@0: contentIndex = Integer.MAX_VALUE;
michael@0: charsetIndex = Integer.MAX_VALUE;
michael@0: state = MetaScanner.ATTRIBUTE_NAME;
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case MARKUP_DECLARATION_OPEN: // just read "<!"
michael@0: markupdeclarationopenloop: for (;;) {
michael@0: c = read();
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case '-':
michael@0: state = MetaScanner.MARKUP_DECLARATION_HYPHEN;
michael@0: break markupdeclarationopenloop;
michael@0: // continue stateloop;
michael@0: default:
michael@0: state = MetaScanner.SCAN_UNTIL_GT;
michael@0: reconsume = true; // c itself may be the '>'
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case MARKUP_DECLARATION_HYPHEN: // just read "<!-"
michael@0: markupdeclarationhyphenloop: for (;;) {
michael@0: c = read();
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case '-':
michael@0: state = MetaScanner.COMMENT_START;
michael@0: break markupdeclarationhyphenloop;
michael@0: // continue stateloop;
michael@0: default:
michael@0: state = MetaScanner.SCAN_UNTIL_GT;
michael@0: reconsume = true;
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case COMMENT_START: // just read "<!--"
michael@0: commentstartloop: for (;;) {
michael@0: c = read();
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case '-':
michael@0: state = MetaScanner.COMMENT_START_DASH;
michael@0: continue stateloop;
michael@0: case '>':
michael@0: state = MetaScanner.DATA;
michael@0: continue stateloop;
michael@0: default:
michael@0: state = MetaScanner.COMMENT;
michael@0: break commentstartloop;
michael@0: // continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case COMMENT: // inside a comment: wait for '-'
michael@0: commentloop: for (;;) {
michael@0: c = read();
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case '-':
michael@0: state = MetaScanner.COMMENT_END_DASH;
michael@0: break commentloop;
michael@0: // continue stateloop;
michael@0: default:
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case COMMENT_END_DASH: // one '-' seen inside a comment
michael@0: commentenddashloop: for (;;) {
michael@0: c = read();
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case '-':
michael@0: state = MetaScanner.COMMENT_END;
michael@0: break commentenddashloop;
michael@0: // continue stateloop;
michael@0: default:
michael@0: state = MetaScanner.COMMENT;
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // FALLTHRU DON'T REORDER
michael@0: case COMMENT_END: // "--" seen inside a comment: '>' closes it
michael@0: for (;;) {
michael@0: c = read();
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case '>':
michael@0: state = MetaScanner.DATA;
michael@0: continue stateloop;
michael@0: case '-':
michael@0: continue; // further '-' characters keep us in COMMENT_END
michael@0: default:
michael@0: state = MetaScanner.COMMENT;
michael@0: continue stateloop;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case COMMENT_START_DASH: // just read "<!---"
michael@0: c = read();
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case '-':
michael@0: state = MetaScanner.COMMENT_END;
michael@0: continue stateloop;
michael@0: case '>':
michael@0: state = MetaScanner.DATA;
michael@0: continue stateloop;
michael@0: default:
michael@0: state = MetaScanner.COMMENT;
michael@0: continue stateloop;
michael@0: }
michael@0: // XXX reorder point
michael@0: case ATTRIBUTE_VALUE_SINGLE_QUOTED:
michael@0: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: c = read();
michael@0: }
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case '\'':
michael@0: handleAttributeValue();
michael@0: state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
michael@0: continue stateloop;
michael@0: default:
michael@0: handleCharInAttributeValue(c);
michael@0: continue;
michael@0: }
michael@0: }
michael@0: // XXX reorder point
michael@0: case SCAN_UNTIL_GT: // ignore everything up to and including the next '>'
michael@0: for (;;) {
michael@0: if (reconsume) {
michael@0: reconsume = false;
michael@0: } else {
michael@0: c = read();
michael@0: }
michael@0: switch (c) {
michael@0: case -1:
michael@0: break stateloop;
michael@0: case '>':
michael@0: state = MetaScanner.DATA;
michael@0: continue stateloop;
michael@0: default:
michael@0: continue;
michael@0: }
michael@0: }
michael@0: }
michael@0: }
michael@0: stateSave = state; // remember the state in which scanning stopped
michael@0: }
michael@0:
michael@0: private void handleCharInAttributeValue(int c) {
michael@0: if (metaState == A) {
michael@0: if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) {
michael@0: addToBuffer(c);
michael@0: } else if (httpEquivIndex == HTTP_EQUIV.length) {
michael@0: if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) {
michael@0: ++contentTypeIndex;
michael@0: } else {
michael@0: contentTypeIndex = Integer.MAX_VALUE;
michael@0: }
michael@0: }
michael@0: }
michael@0: }
michael@0:
michael@0: /**
michael@0:  * Maps 'A'..'Z' to 'a'..'z' and returns every other value unchanged.
michael@0:  *
michael@0:  * @param c the character to fold
michael@0:  * @return the folded character
michael@0:  */
michael@0: @Inline private int toAsciiLowerCase(int c) {
michael@0:     final boolean isUpper = 'A' <= c && c <= 'Z';
michael@0:     return isUpper ? c + 0x20 : c;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Adds a character to the accumulation buffer.
michael@0: * @param c the character to add
michael@0: */
michael@0: private void addToBuffer(int c) {
michael@0: if (strBufLen == strBuf.length) {
michael@0: char[] newBuf = new char[strBuf.length + (strBuf.length << 1)];
michael@0: System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
michael@0: strBuf = newBuf;
michael@0: }
michael@0: strBuf[strBufLen++] = (char)c;
michael@0: }
michael@0:
michael@0: /**
michael@0:  * Records the attribute value that has just ended. Does nothing unless
michael@0:  * the current tag is "meta". Exactly one of the following happens, in
michael@0:  * this order of precedence:
michael@0:  * the buffer becomes the "content" value (first "content" attribute only),
michael@0:  * the buffer becomes the "charset" value (first "charset" attribute only),
michael@0:  * or the outcome of matching an "http-equiv" value against "content-type"
michael@0:  * is stored in httpEquivState (first "http-equiv" attribute only).
michael@0:  *
michael@0:  * @throws SAXException declared for callers; nothing in this body throws
michael@0:  *         it directly
michael@0:  */
michael@0: private void handleAttributeValue() throws SAXException {
michael@0:     if (metaState == A) {
michael@0:         if (contentIndex == CONTENT.length && content == null) {
michael@0:             content = Portability.newStringFromBuffer(strBuf, 0, strBufLen);
michael@0:         } else if (charsetIndex == CHARSET.length && charset == null) {
michael@0:             charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen);
michael@0:         } else if (httpEquivIndex == HTTP_EQUIV.length
michael@0:                 && httpEquivState == HTTP_EQUIV_NOT_SEEN) {
michael@0:             if (contentTypeIndex == CONTENT_TYPE.length) {
michael@0:                 httpEquivState = HTTP_EQUIV_CONTENT_TYPE;
michael@0:             } else {
michael@0:                 httpEquivState = HTTP_EQUIV_OTHER;
michael@0:             }
michael@0:         }
michael@0:     }
michael@0: }
michael@0:
michael@0: private boolean handleTag() throws SAXException {
michael@0: boolean stop = handleTagInner();
michael@0: Portability.releaseString(content);
michael@0: content = null;
michael@0: Portability.releaseString(charset);
michael@0: charset = null;
michael@0: httpEquivState = HTTP_EQUIV_NOT_SEEN;
michael@0: return stop;
michael@0: }
michael@0:
michael@0: /**
michael@0:  * Offers the encoding declared by the current tag to tryCharset(). A
michael@0:  * "charset" attribute is tried first. If it is absent or rejected, and the
michael@0:  * tag has http-equiv="content-type" plus a "content" attribute, the
michael@0:  * charset extracted from "content" is tried instead.
michael@0:  *
michael@0:  * @return true if tryCharset() accepted an encoding
michael@0:  * @throws SAXException if tryCharset() throws it
michael@0:  */
michael@0: private boolean handleTagInner() throws SAXException {
michael@0:     if (charset != null && tryCharset(charset)) {
michael@0:         return true;
michael@0:     }
michael@0:     if (content == null || httpEquivState != HTTP_EQUIV_CONTENT_TYPE) {
michael@0:         return false;
michael@0:     }
michael@0:     String candidate = TreeBuilder.extractCharsetFromContent(content);
michael@0:     if (candidate == null) {
michael@0:         return false;
michael@0:     }
michael@0:     boolean accepted = tryCharset(candidate);
michael@0:     Portability.releaseString(candidate);
michael@0:     return accepted;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Tries to switch to an encoding.
michael@0: *
michael@0: * Called by handleTagInner() with the value of a meta "charset" attribute,
michael@0: * or with the charset extracted from a meta "content" attribute.
michael@0: *
michael@0: * @param encoding the candidate encoding label; never null when called
michael@0: * from this class
michael@0: * @return true if successful; a true return makes stateLoop() stop scanning
michael@0: * @throws SAXException declared for implementations; this class only
michael@0: * propagates it
michael@0: */
michael@0: protected abstract boolean tryCharset(String encoding) throws SAXException;
michael@0:
michael@0: }