michael@0: /* michael@0: * Copyright (c) 2007 Henri Sivonen michael@0: * Copyright (c) 2008-2010 Mozilla Foundation michael@0: * michael@0: * Permission is hereby granted, free of charge, to any person obtaining a michael@0: * copy of this software and associated documentation files (the "Software"), michael@0: * to deal in the Software without restriction, including without limitation michael@0: * the rights to use, copy, modify, merge, publish, distribute, sublicense, michael@0: * and/or sell copies of the Software, and to permit persons to whom the michael@0: * Software is furnished to do so, subject to the following conditions: michael@0: * michael@0: * The above copyright notice and this permission notice shall be included in michael@0: * all copies or substantial portions of the Software. michael@0: * michael@0: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR michael@0: * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, michael@0: * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL michael@0: * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER michael@0: * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING michael@0: * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER michael@0: * DEALINGS IN THE SOFTWARE. michael@0: */ michael@0: michael@0: package nu.validator.htmlparser.impl; michael@0: michael@0: import java.io.IOException; michael@0: michael@0: import nu.validator.htmlparser.annotation.Auto; michael@0: import nu.validator.htmlparser.annotation.Inline; michael@0: import nu.validator.htmlparser.common.ByteReadable; michael@0: michael@0: import org.xml.sax.SAXException; michael@0: michael@0: public abstract class MetaScanner { michael@0: michael@0: /** michael@0: * Constant for "charset". michael@0: */ michael@0: private static final char[] CHARSET = { 'h', 'a', 'r', 's', 'e', 't' }; michael@0: michael@0: /** michael@0: * Constant for "content". michael@0: */ michael@0: private static final char[] CONTENT = { 'o', 'n', 't', 'e', 'n', 't' }; michael@0: michael@0: /** michael@0: * Constant for "http-equiv". michael@0: */ michael@0: private static final char[] HTTP_EQUIV = { 't', 't', 'p', '-', 'e', 'q', michael@0: 'u', 'i', 'v' }; michael@0: michael@0: /** michael@0: * Constant for "content-type". michael@0: */ michael@0: private static final char[] CONTENT_TYPE = { 'c', 'o', 'n', 't', 'e', 'n', michael@0: 't', '-', 't', 'y', 'p', 'e' }; michael@0: michael@0: private static final int NO = 0; michael@0: michael@0: private static final int M = 1; michael@0: michael@0: private static final int E = 2; michael@0: michael@0: private static final int T = 3; michael@0: michael@0: private static final int A = 4; michael@0: michael@0: private static final int DATA = 0; michael@0: michael@0: private static final int TAG_OPEN = 1; michael@0: michael@0: private static final int SCAN_UNTIL_GT = 2; michael@0: michael@0: private static final int TAG_NAME = 3; michael@0: michael@0: private static final int BEFORE_ATTRIBUTE_NAME = 4; michael@0: michael@0: private static final int ATTRIBUTE_NAME = 5; michael@0: michael@0: private static final int AFTER_ATTRIBUTE_NAME = 6; michael@0: michael@0: private static final int BEFORE_ATTRIBUTE_VALUE = 7; michael@0: michael@0: private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 8; michael@0: michael@0: private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 9; michael@0: michael@0: private static final int ATTRIBUTE_VALUE_UNQUOTED = 10; michael@0: michael@0: private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 11; michael@0: michael@0: private static final int MARKUP_DECLARATION_OPEN = 13; michael@0: michael@0: private static final int MARKUP_DECLARATION_HYPHEN = 14; michael@0: michael@0: private static final int COMMENT_START = 15; michael@0: michael@0: private static final int COMMENT_START_DASH = 16; michael@0: michael@0: private static final int COMMENT = 17; michael@0: michael@0: private static final int COMMENT_END_DASH = 18; michael@0: michael@0: private static final int COMMENT_END = 19; michael@0: michael@0: private static final int SELF_CLOSING_START_TAG = 20; michael@0: michael@0: private static final int HTTP_EQUIV_NOT_SEEN = 0; michael@0: michael@0: private static final int HTTP_EQUIV_CONTENT_TYPE = 1; michael@0: michael@0: private static final int HTTP_EQUIV_OTHER = 2; michael@0: michael@0: /** michael@0: * The data source. michael@0: */ michael@0: protected ByteReadable readable; michael@0: michael@0: /** michael@0: * The state of the state machine that recognizes the tag name "meta". michael@0: */ michael@0: private int metaState = NO; michael@0: michael@0: /** michael@0: * The current position in recognizing the attribute name "content". michael@0: */ michael@0: private int contentIndex = Integer.MAX_VALUE; michael@0: michael@0: /** michael@0: * The current position in recognizing the attribute name "charset". michael@0: */ michael@0: private int charsetIndex = Integer.MAX_VALUE; michael@0: michael@0: /** michael@0: * The current position in recognizing the attribute name "http-equive". michael@0: */ michael@0: private int httpEquivIndex = Integer.MAX_VALUE; michael@0: michael@0: /** michael@0: * The current position in recognizing the attribute value "content-type". michael@0: */ michael@0: private int contentTypeIndex = Integer.MAX_VALUE; michael@0: michael@0: /** michael@0: * The tokenizer state. michael@0: */ michael@0: protected int stateSave = DATA; michael@0: michael@0: /** michael@0: * The currently filled length of strBuf. michael@0: */ michael@0: private int strBufLen; michael@0: michael@0: /** michael@0: * Accumulation buffer for attribute values. michael@0: */ michael@0: private @Auto char[] strBuf; michael@0: michael@0: private String content; michael@0: michael@0: private String charset; michael@0: michael@0: private int httpEquivState; michael@0: michael@0: public MetaScanner() { michael@0: this.readable = null; michael@0: this.metaState = NO; michael@0: this.contentIndex = Integer.MAX_VALUE; michael@0: this.charsetIndex = Integer.MAX_VALUE; michael@0: this.httpEquivIndex = Integer.MAX_VALUE; michael@0: this.contentTypeIndex = Integer.MAX_VALUE; michael@0: this.stateSave = DATA; michael@0: this.strBufLen = 0; michael@0: this.strBuf = new char[36]; michael@0: this.content = null; michael@0: this.charset = null; michael@0: this.httpEquivState = HTTP_EQUIV_NOT_SEEN; michael@0: } michael@0: michael@0: @SuppressWarnings("unused") private void destructor() { michael@0: Portability.releaseString(content); michael@0: Portability.releaseString(charset); michael@0: } michael@0: michael@0: // [NOCPP[ michael@0: michael@0: /** michael@0: * Reads a byte from the data source. michael@0: * michael@0: * -1 means end. michael@0: * @return michael@0: * @throws IOException michael@0: */ michael@0: protected int read() throws IOException { michael@0: return readable.readByte(); michael@0: } michael@0: michael@0: // ]NOCPP] michael@0: michael@0: // WARNING When editing this, makes sure the bytecode length shown by javap michael@0: // stays under 8000 bytes! michael@0: /** michael@0: * The runs the meta scanning algorithm. michael@0: */ michael@0: protected final void stateLoop(int state) michael@0: throws SAXException, IOException { michael@0: int c = -1; michael@0: boolean reconsume = false; michael@0: stateloop: for (;;) { michael@0: switch (state) { michael@0: case DATA: michael@0: dataloop: for (;;) { michael@0: if (reconsume) { michael@0: reconsume = false; michael@0: } else { michael@0: c = read(); michael@0: } michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case '<': michael@0: state = MetaScanner.TAG_OPEN; michael@0: break dataloop; // FALL THROUGH continue michael@0: // stateloop; michael@0: default: michael@0: continue; michael@0: } michael@0: } michael@0: // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER michael@0: case TAG_OPEN: michael@0: tagopenloop: for (;;) { michael@0: c = read(); michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case 'm': michael@0: case 'M': michael@0: metaState = M; michael@0: state = MetaScanner.TAG_NAME; michael@0: break tagopenloop; michael@0: // continue stateloop; michael@0: case '!': michael@0: state = MetaScanner.MARKUP_DECLARATION_OPEN; michael@0: continue stateloop; michael@0: case '?': michael@0: case '/': michael@0: state = MetaScanner.SCAN_UNTIL_GT; michael@0: continue stateloop; michael@0: case '>': michael@0: state = MetaScanner.DATA; michael@0: continue stateloop; michael@0: default: michael@0: if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { michael@0: metaState = NO; michael@0: state = MetaScanner.TAG_NAME; michael@0: break tagopenloop; michael@0: // continue stateloop; michael@0: } michael@0: state = MetaScanner.DATA; michael@0: reconsume = true; michael@0: continue stateloop; michael@0: } michael@0: } michael@0: // FALL THROUGH DON'T REORDER michael@0: case TAG_NAME: michael@0: tagnameloop: for (;;) { michael@0: c = read(); michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case ' ': michael@0: case '\t': michael@0: case '\n': michael@0: case '\u000C': michael@0: state = MetaScanner.BEFORE_ATTRIBUTE_NAME; michael@0: break tagnameloop; michael@0: // continue stateloop; michael@0: case '/': michael@0: state = MetaScanner.SELF_CLOSING_START_TAG; michael@0: continue stateloop; michael@0: case '>': michael@0: state = MetaScanner.DATA; michael@0: continue stateloop; michael@0: case 'e': michael@0: case 'E': michael@0: if (metaState == M) { michael@0: metaState = E; michael@0: } else { michael@0: metaState = NO; michael@0: } michael@0: continue; michael@0: case 't': michael@0: case 'T': michael@0: if (metaState == E) { michael@0: metaState = T; michael@0: } else { michael@0: metaState = NO; michael@0: } michael@0: continue; michael@0: case 'a': michael@0: case 'A': michael@0: if (metaState == T) { michael@0: metaState = A; michael@0: } else { michael@0: metaState = NO; michael@0: } michael@0: continue; michael@0: default: michael@0: metaState = NO; michael@0: continue; michael@0: } michael@0: } michael@0: // FALLTHRU DON'T REORDER michael@0: case BEFORE_ATTRIBUTE_NAME: michael@0: beforeattributenameloop: for (;;) { michael@0: if (reconsume) { michael@0: reconsume = false; michael@0: } else { michael@0: c = read(); michael@0: } michael@0: /* michael@0: * Consume the next input character: michael@0: */ michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case ' ': michael@0: case '\t': michael@0: case '\n': michael@0: case '\u000C': michael@0: continue; michael@0: case '/': michael@0: state = MetaScanner.SELF_CLOSING_START_TAG; michael@0: continue stateloop; michael@0: case '>': michael@0: if (handleTag()) { michael@0: break stateloop; michael@0: } michael@0: state = DATA; michael@0: continue stateloop; michael@0: case 'c': michael@0: case 'C': michael@0: contentIndex = 0; michael@0: charsetIndex = 0; michael@0: httpEquivIndex = Integer.MAX_VALUE; michael@0: contentTypeIndex = Integer.MAX_VALUE; michael@0: state = MetaScanner.ATTRIBUTE_NAME; michael@0: break beforeattributenameloop; michael@0: case 'h': michael@0: case 'H': michael@0: contentIndex = Integer.MAX_VALUE; michael@0: charsetIndex = Integer.MAX_VALUE; michael@0: httpEquivIndex = 0; michael@0: contentTypeIndex = Integer.MAX_VALUE; michael@0: state = MetaScanner.ATTRIBUTE_NAME; michael@0: break beforeattributenameloop; michael@0: default: michael@0: contentIndex = Integer.MAX_VALUE; michael@0: charsetIndex = Integer.MAX_VALUE; michael@0: httpEquivIndex = Integer.MAX_VALUE; michael@0: contentTypeIndex = Integer.MAX_VALUE; michael@0: state = MetaScanner.ATTRIBUTE_NAME; michael@0: break beforeattributenameloop; michael@0: // continue stateloop; michael@0: } michael@0: } michael@0: // FALLTHRU DON'T REORDER michael@0: case ATTRIBUTE_NAME: michael@0: attributenameloop: for (;;) { michael@0: c = read(); michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case ' ': michael@0: case '\t': michael@0: case '\n': michael@0: case '\u000C': michael@0: state = MetaScanner.AFTER_ATTRIBUTE_NAME; michael@0: continue stateloop; michael@0: case '/': michael@0: state = MetaScanner.SELF_CLOSING_START_TAG; michael@0: continue stateloop; michael@0: case '=': michael@0: strBufLen = 0; michael@0: contentTypeIndex = 0; michael@0: state = MetaScanner.BEFORE_ATTRIBUTE_VALUE; michael@0: break attributenameloop; michael@0: // continue stateloop; michael@0: case '>': michael@0: if (handleTag()) { michael@0: break stateloop; michael@0: } michael@0: state = MetaScanner.DATA; michael@0: continue stateloop; michael@0: default: michael@0: if (metaState == A) { michael@0: if (c >= 'A' && c <= 'Z') { michael@0: c += 0x20; michael@0: } michael@0: if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) { michael@0: ++contentIndex; michael@0: } else { michael@0: contentIndex = Integer.MAX_VALUE; michael@0: } michael@0: if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) { michael@0: ++charsetIndex; michael@0: } else { michael@0: charsetIndex = Integer.MAX_VALUE; michael@0: } michael@0: if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) { michael@0: ++httpEquivIndex; michael@0: } else { michael@0: httpEquivIndex = Integer.MAX_VALUE; michael@0: } michael@0: } michael@0: continue; michael@0: } michael@0: } michael@0: // FALLTHRU DON'T REORDER michael@0: case BEFORE_ATTRIBUTE_VALUE: michael@0: beforeattributevalueloop: for (;;) { michael@0: c = read(); michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case ' ': michael@0: case '\t': michael@0: case '\n': michael@0: case '\u000C': michael@0: continue; michael@0: case '"': michael@0: state = MetaScanner.ATTRIBUTE_VALUE_DOUBLE_QUOTED; michael@0: break beforeattributevalueloop; michael@0: // continue stateloop; michael@0: case '\'': michael@0: state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED; michael@0: continue stateloop; michael@0: case '>': michael@0: if (handleTag()) { michael@0: break stateloop; michael@0: } michael@0: state = MetaScanner.DATA; michael@0: continue stateloop; michael@0: default: michael@0: handleCharInAttributeValue(c); michael@0: state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED; michael@0: continue stateloop; michael@0: } michael@0: } michael@0: // FALLTHRU DON'T REORDER michael@0: case ATTRIBUTE_VALUE_DOUBLE_QUOTED: michael@0: attributevaluedoublequotedloop: for (;;) { michael@0: if (reconsume) { michael@0: reconsume = false; michael@0: } else { michael@0: c = read(); michael@0: } michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case '"': michael@0: handleAttributeValue(); michael@0: state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED; michael@0: break attributevaluedoublequotedloop; michael@0: // continue stateloop; michael@0: default: michael@0: handleCharInAttributeValue(c); michael@0: continue; michael@0: } michael@0: } michael@0: // FALLTHRU DON'T REORDER michael@0: case AFTER_ATTRIBUTE_VALUE_QUOTED: michael@0: afterattributevaluequotedloop: for (;;) { michael@0: c = read(); michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case ' ': michael@0: case '\t': michael@0: case '\n': michael@0: case '\u000C': michael@0: state = MetaScanner.BEFORE_ATTRIBUTE_NAME; michael@0: continue stateloop; michael@0: case '/': michael@0: state = MetaScanner.SELF_CLOSING_START_TAG; michael@0: break afterattributevaluequotedloop; michael@0: // continue stateloop; michael@0: case '>': michael@0: if (handleTag()) { michael@0: break stateloop; michael@0: } michael@0: state = MetaScanner.DATA; michael@0: continue stateloop; michael@0: default: michael@0: state = MetaScanner.BEFORE_ATTRIBUTE_NAME; michael@0: reconsume = true; michael@0: continue stateloop; michael@0: } michael@0: } michael@0: // FALLTHRU DON'T REORDER michael@0: case SELF_CLOSING_START_TAG: michael@0: c = read(); michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case '>': michael@0: if (handleTag()) { michael@0: break stateloop; michael@0: } michael@0: state = MetaScanner.DATA; michael@0: continue stateloop; michael@0: default: michael@0: state = MetaScanner.BEFORE_ATTRIBUTE_NAME; michael@0: reconsume = true; michael@0: continue stateloop; michael@0: } michael@0: // XXX reorder point michael@0: case ATTRIBUTE_VALUE_UNQUOTED: michael@0: for (;;) { michael@0: if (reconsume) { michael@0: reconsume = false; michael@0: } else { michael@0: c = read(); michael@0: } michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case ' ': michael@0: case '\t': michael@0: case '\n': michael@0: michael@0: case '\u000C': michael@0: handleAttributeValue(); michael@0: state = MetaScanner.BEFORE_ATTRIBUTE_NAME; michael@0: continue stateloop; michael@0: case '>': michael@0: handleAttributeValue(); michael@0: if (handleTag()) { michael@0: break stateloop; michael@0: } michael@0: state = MetaScanner.DATA; michael@0: continue stateloop; michael@0: default: michael@0: handleCharInAttributeValue(c); michael@0: continue; michael@0: } michael@0: } michael@0: // XXX reorder point michael@0: case AFTER_ATTRIBUTE_NAME: michael@0: for (;;) { michael@0: c = read(); michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case ' ': michael@0: case '\t': michael@0: case '\n': michael@0: case '\u000C': michael@0: continue; michael@0: case '/': michael@0: handleAttributeValue(); michael@0: state = MetaScanner.SELF_CLOSING_START_TAG; michael@0: continue stateloop; michael@0: case '=': michael@0: strBufLen = 0; michael@0: contentTypeIndex = 0; michael@0: state = MetaScanner.BEFORE_ATTRIBUTE_VALUE; michael@0: continue stateloop; michael@0: case '>': michael@0: handleAttributeValue(); michael@0: if (handleTag()) { michael@0: break stateloop; michael@0: } michael@0: state = MetaScanner.DATA; michael@0: continue stateloop; michael@0: case 'c': michael@0: case 'C': michael@0: contentIndex = 0; michael@0: charsetIndex = 0; michael@0: state = MetaScanner.ATTRIBUTE_NAME; michael@0: continue stateloop; michael@0: default: michael@0: contentIndex = Integer.MAX_VALUE; michael@0: charsetIndex = Integer.MAX_VALUE; michael@0: state = MetaScanner.ATTRIBUTE_NAME; michael@0: continue stateloop; michael@0: } michael@0: } michael@0: // XXX reorder point michael@0: case MARKUP_DECLARATION_OPEN: michael@0: markupdeclarationopenloop: for (;;) { michael@0: c = read(); michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case '-': michael@0: state = MetaScanner.MARKUP_DECLARATION_HYPHEN; michael@0: break markupdeclarationopenloop; michael@0: // continue stateloop; michael@0: default: michael@0: state = MetaScanner.SCAN_UNTIL_GT; michael@0: reconsume = true; michael@0: continue stateloop; michael@0: } michael@0: } michael@0: // FALLTHRU DON'T REORDER michael@0: case MARKUP_DECLARATION_HYPHEN: michael@0: markupdeclarationhyphenloop: for (;;) { michael@0: c = read(); michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case '-': michael@0: state = MetaScanner.COMMENT_START; michael@0: break markupdeclarationhyphenloop; michael@0: // continue stateloop; michael@0: default: michael@0: state = MetaScanner.SCAN_UNTIL_GT; michael@0: reconsume = true; michael@0: continue stateloop; michael@0: } michael@0: } michael@0: // FALLTHRU DON'T REORDER michael@0: case COMMENT_START: michael@0: commentstartloop: for (;;) { michael@0: c = read(); michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case '-': michael@0: state = MetaScanner.COMMENT_START_DASH; michael@0: continue stateloop; michael@0: case '>': michael@0: state = MetaScanner.DATA; michael@0: continue stateloop; michael@0: default: michael@0: state = MetaScanner.COMMENT; michael@0: break commentstartloop; michael@0: // continue stateloop; michael@0: } michael@0: } michael@0: // FALLTHRU DON'T REORDER michael@0: case COMMENT: michael@0: commentloop: for (;;) { michael@0: c = read(); michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case '-': michael@0: state = MetaScanner.COMMENT_END_DASH; michael@0: break commentloop; michael@0: // continue stateloop; michael@0: default: michael@0: continue; michael@0: } michael@0: } michael@0: // FALLTHRU DON'T REORDER michael@0: case COMMENT_END_DASH: michael@0: commentenddashloop: for (;;) { michael@0: c = read(); michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case '-': michael@0: state = MetaScanner.COMMENT_END; michael@0: break commentenddashloop; michael@0: // continue stateloop; michael@0: default: michael@0: state = MetaScanner.COMMENT; michael@0: continue stateloop; michael@0: } michael@0: } michael@0: // FALLTHRU DON'T REORDER michael@0: case COMMENT_END: michael@0: for (;;) { michael@0: c = read(); michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case '>': michael@0: state = MetaScanner.DATA; michael@0: continue stateloop; michael@0: case '-': michael@0: continue; michael@0: default: michael@0: state = MetaScanner.COMMENT; michael@0: continue stateloop; michael@0: } michael@0: } michael@0: // XXX reorder point michael@0: case COMMENT_START_DASH: michael@0: c = read(); michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case '-': michael@0: state = MetaScanner.COMMENT_END; michael@0: continue stateloop; michael@0: case '>': michael@0: state = MetaScanner.DATA; michael@0: continue stateloop; michael@0: default: michael@0: state = MetaScanner.COMMENT; michael@0: continue stateloop; michael@0: } michael@0: // XXX reorder point michael@0: case ATTRIBUTE_VALUE_SINGLE_QUOTED: michael@0: for (;;) { michael@0: if (reconsume) { michael@0: reconsume = false; michael@0: } else { michael@0: c = read(); michael@0: } michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case '\'': michael@0: handleAttributeValue(); michael@0: state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED; michael@0: continue stateloop; michael@0: default: michael@0: handleCharInAttributeValue(c); michael@0: continue; michael@0: } michael@0: } michael@0: // XXX reorder point michael@0: case SCAN_UNTIL_GT: michael@0: for (;;) { michael@0: if (reconsume) { michael@0: reconsume = false; michael@0: } else { michael@0: c = read(); michael@0: } michael@0: switch (c) { michael@0: case -1: michael@0: break stateloop; michael@0: case '>': michael@0: state = MetaScanner.DATA; michael@0: continue stateloop; michael@0: default: michael@0: continue; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: stateSave = state; michael@0: } michael@0: michael@0: private void handleCharInAttributeValue(int c) { michael@0: if (metaState == A) { michael@0: if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) { michael@0: addToBuffer(c); michael@0: } else if (httpEquivIndex == HTTP_EQUIV.length) { michael@0: if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) { michael@0: ++contentTypeIndex; michael@0: } else { michael@0: contentTypeIndex = Integer.MAX_VALUE; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: @Inline private int toAsciiLowerCase(int c) { michael@0: if (c >= 'A' && c <= 'Z') { michael@0: return c + 0x20; michael@0: } michael@0: return c; michael@0: } michael@0: michael@0: /** michael@0: * Adds a character to the accumulation buffer. michael@0: * @param c the character to add michael@0: */ michael@0: private void addToBuffer(int c) { michael@0: if (strBufLen == strBuf.length) { michael@0: char[] newBuf = new char[strBuf.length + (strBuf.length << 1)]; michael@0: System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length); michael@0: strBuf = newBuf; michael@0: } michael@0: strBuf[strBufLen++] = (char)c; michael@0: } michael@0: michael@0: /** michael@0: * Attempts to extract a charset name from the accumulation buffer. michael@0: * @return true if successful michael@0: * @throws SAXException michael@0: */ michael@0: private void handleAttributeValue() throws SAXException { michael@0: if (metaState != A) { michael@0: return; michael@0: } michael@0: if (contentIndex == CONTENT.length && content == null) { michael@0: content = Portability.newStringFromBuffer(strBuf, 0, strBufLen); michael@0: return; michael@0: } michael@0: if (charsetIndex == CHARSET.length && charset == null) { michael@0: charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen); michael@0: return; michael@0: } michael@0: if (httpEquivIndex == HTTP_EQUIV.length michael@0: && httpEquivState == HTTP_EQUIV_NOT_SEEN) { michael@0: httpEquivState = (contentTypeIndex == CONTENT_TYPE.length) ? HTTP_EQUIV_CONTENT_TYPE michael@0: : HTTP_EQUIV_OTHER; michael@0: return; michael@0: } michael@0: } michael@0: michael@0: private boolean handleTag() throws SAXException { michael@0: boolean stop = handleTagInner(); michael@0: Portability.releaseString(content); michael@0: content = null; michael@0: Portability.releaseString(charset); michael@0: charset = null; michael@0: httpEquivState = HTTP_EQUIV_NOT_SEEN; michael@0: return stop; michael@0: } michael@0: michael@0: private boolean handleTagInner() throws SAXException { michael@0: if (charset != null && tryCharset(charset)) { michael@0: return true; michael@0: } michael@0: if (content != null && httpEquivState == HTTP_EQUIV_CONTENT_TYPE) { michael@0: String extract = TreeBuilder.extractCharsetFromContent(content); michael@0: if (extract == null) { michael@0: return false; michael@0: } michael@0: boolean success = tryCharset(extract); michael@0: Portability.releaseString(extract); michael@0: return success; michael@0: } michael@0: return false; michael@0: } michael@0: michael@0: /** michael@0: * Tries to switch to an encoding. michael@0: * michael@0: * @param encoding michael@0: * @return true if successful michael@0: * @throws SAXException michael@0: */ michael@0: protected abstract boolean tryCharset(String encoding) throws SAXException; michael@0: michael@0: }