parser/html/javasrc/MetaScanner.java

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/parser/html/javasrc/MetaScanner.java	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,843 @@
     1.4 +/*
     1.5 + * Copyright (c) 2007 Henri Sivonen
     1.6 + * Copyright (c) 2008-2010 Mozilla Foundation
     1.7 + *
     1.8 + * Permission is hereby granted, free of charge, to any person obtaining a 
     1.9 + * copy of this software and associated documentation files (the "Software"), 
    1.10 + * to deal in the Software without restriction, including without limitation 
    1.11 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
    1.12 + * and/or sell copies of the Software, and to permit persons to whom the 
    1.13 + * Software is furnished to do so, subject to the following conditions:
    1.14 + *
    1.15 + * The above copyright notice and this permission notice shall be included in 
    1.16 + * all copies or substantial portions of the Software.
    1.17 + *
    1.18 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
    1.19 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
    1.20 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
    1.21 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
    1.22 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
    1.23 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
    1.24 + * DEALINGS IN THE SOFTWARE.
    1.25 + */
    1.26 +
    1.27 +package nu.validator.htmlparser.impl;
    1.28 +
    1.29 +import java.io.IOException;
    1.30 +
    1.31 +import nu.validator.htmlparser.annotation.Auto;
    1.32 +import nu.validator.htmlparser.annotation.Inline;
    1.33 +import nu.validator.htmlparser.common.ByteReadable;
    1.34 +
    1.35 +import org.xml.sax.SAXException;
    1.36 +
    1.37 +public abstract class MetaScanner {
    1.38 +
    1.39 +    /**
    1.40 +     * Tail "harset" of "charset"; the leading 'c' is consumed before ATTRIBUTE_NAME compares.
    1.41 +     */
    1.42 +    private static final char[] CHARSET = { 'h', 'a', 'r', 's', 'e', 't' };
    1.43 +    
    1.44 +    /**
    1.45 +     * Tail "ontent" of "content"; the leading 'c' is consumed before ATTRIBUTE_NAME compares.
    1.46 +     */
    1.47 +    private static final char[] CONTENT = { 'o', 'n', 't', 'e', 'n', 't' };
    1.48 +
    1.49 +    /**
    1.50 +     * Tail "ttp-equiv" of "http-equiv"; the leading 'h' is consumed in BEFORE_ATTRIBUTE_NAME.
    1.51 +     */
    1.52 +    private static final char[] HTTP_EQUIV = { 't', 't', 'p', '-', 'e', 'q',
    1.53 +            'u', 'i', 'v' };
    1.54 +
    1.55 +    /**
    1.56 +     * Constant for the complete string "content-type" (no leading letter dropped).
    1.57 +     */
    1.58 +    private static final char[] CONTENT_TYPE = { 'c', 'o', 'n', 't', 'e', 'n',
    1.59 +            't', '-', 't', 'y', 'p', 'e' };
    1.60 +
    1.61 +    private static final int NO = 0; // metaState: tag name so far does not match "meta"
    1.62 +
    1.63 +    private static final int M = 1; // metaState: "m" matched
    1.64 +    
    1.65 +    private static final int E = 2; // metaState: "me" matched
    1.66 +    
    1.67 +    private static final int T = 3; // metaState: "met" matched
    1.68 +
    1.69 +    private static final int A = 4; // metaState: "meta" matched
    1.70 +    
    1.71 +    private static final int DATA = 0; // first of the values switched on by stateLoop(int state)
    1.72 +
    1.73 +    private static final int TAG_OPEN = 1;
    1.74 +
    1.75 +    private static final int SCAN_UNTIL_GT = 2;
    1.76 +
    1.77 +    private static final int TAG_NAME = 3;
    1.78 +
    1.79 +    private static final int BEFORE_ATTRIBUTE_NAME = 4;
    1.80 +
    1.81 +    private static final int ATTRIBUTE_NAME = 5;
    1.82 +
    1.83 +    private static final int AFTER_ATTRIBUTE_NAME = 6;
    1.84 +
    1.85 +    private static final int BEFORE_ATTRIBUTE_VALUE = 7;
    1.86 +
    1.87 +    private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 8;
    1.88 +
    1.89 +    private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 9;
    1.90 +
    1.91 +    private static final int ATTRIBUTE_VALUE_UNQUOTED = 10;
    1.92 +
    1.93 +    private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 11;
    1.94 +
    1.95 +    private static final int MARKUP_DECLARATION_OPEN = 13; // value 12 is not assigned in this list
    1.96 +    
    1.97 +    private static final int MARKUP_DECLARATION_HYPHEN = 14;
    1.98 +
    1.99 +    private static final int COMMENT_START = 15;
   1.100 +
   1.101 +    private static final int COMMENT_START_DASH = 16;
   1.102 +
   1.103 +    private static final int COMMENT = 17;
   1.104 +
   1.105 +    private static final int COMMENT_END_DASH = 18;
   1.106 +
   1.107 +    private static final int COMMENT_END = 19;
   1.108 +    
   1.109 +    private static final int SELF_CLOSING_START_TAG = 20;
   1.110 +    
   1.111 +    private static final int HTTP_EQUIV_NOT_SEEN = 0; // httpEquivState: value assigned by the constructor
   1.112 +    
   1.113 +    private static final int HTTP_EQUIV_CONTENT_TYPE = 1;
   1.114 +
   1.115 +    private static final int HTTP_EQUIV_OTHER = 2;
   1.116 +
   1.117 +    /**
   1.118 +     * The data source; read() pulls bytes from it via readByte().
   1.119 +     */
   1.120 +    protected ByteReadable readable;
   1.121 +    
   1.122 +    /**
   1.123 +     * The state of the state machine that recognizes the tag name "meta" (one of NO, M, E, T, A).
   1.124 +     */
   1.125 +    private int metaState = NO;
   1.126 +
   1.127 +    /**
   1.128 +     * The current index into CONTENT while recognizing the attribute name "content"; Integer.MAX_VALUE means no match in progress.
   1.129 +     */
   1.130 +    private int contentIndex = Integer.MAX_VALUE;
   1.131 +    
   1.132 +    /**
   1.133 +     * The current index into CHARSET while recognizing the attribute name "charset"; Integer.MAX_VALUE means no match in progress.
   1.134 +     */
   1.135 +    private int charsetIndex = Integer.MAX_VALUE;
   1.136 +
   1.137 +    /**
   1.138 +     * The current index into HTTP_EQUIV while recognizing the attribute name "http-equiv"; Integer.MAX_VALUE means no match in progress.
   1.139 +     */
   1.140 +    private int httpEquivIndex = Integer.MAX_VALUE;
   1.141 +
   1.142 +    /**
   1.143 +     * The current position in recognizing the attribute value "content-type"; reset to 0 when '=' follows an attribute name.
   1.144 +     */
   1.145 +    private int contentTypeIndex = Integer.MAX_VALUE;
   1.146 +
   1.147 +    /**
   1.148 +     * The tokenizer state.
   1.149 +     */
   1.150 +    protected int stateSave = DATA;
   1.151 +
   1.152 +    /**
   1.153 +     * The currently filled length of strBuf; reset to 0 when '=' follows an attribute name.
   1.154 +     */
   1.155 +    private int strBufLen;
   1.156 +
   1.157 +    /**
   1.158 +     * Accumulation buffer for attribute values.
   1.159 +     */
   1.160 +    private @Auto char[] strBuf;
   1.161 +    
   1.162 +    private String content; // released in destructor(); NOTE(review): presumably the captured "content" attribute value - confirm
   1.163 +    
   1.164 +    private String charset; // released in destructor(); NOTE(review): presumably the captured "charset" attribute value - confirm
   1.165 +    
   1.166 +    private int httpEquivState; // set to HTTP_EQUIV_NOT_SEEN by the constructor; presumably one of the HTTP_EQUIV_* constants
   1.167 +    
   1.168 +    public MetaScanner() { // Assigns every instance field explicitly, repeating the declaration initializers.
   1.169 +        this.readable = null; // no data source is attached by this constructor
   1.170 +        this.metaState = NO;
   1.171 +        this.contentIndex = Integer.MAX_VALUE; // MAX_VALUE: no attribute-name match in progress
   1.172 +        this.charsetIndex = Integer.MAX_VALUE;
   1.173 +        this.httpEquivIndex = Integer.MAX_VALUE;
   1.174 +        this.contentTypeIndex = Integer.MAX_VALUE;
   1.175 +        this.stateSave = DATA;
   1.176 +        this.strBufLen = 0;
   1.177 +        this.strBuf = new char[36]; // allocates a 36-char accumulation buffer
   1.178 +        this.content = null;
   1.179 +        this.charset = null;
   1.180 +        this.httpEquivState = HTTP_EQUIV_NOT_SEEN;
   1.181 +    }
   1.182 +    
   1.183 +    @SuppressWarnings("unused") private void destructor() { // NOTE(review): no caller in the visible Java; presumably consumed by the C++ translation - confirm
   1.184 +        Portability.releaseString(content); // hands the content string to Portability for release
   1.185 +        Portability.releaseString(charset); // hands the charset string to Portability for release
   1.186 +    }
   1.187 +
   1.188 +    // [NOCPP[
   1.189 +    
   1.190 +    /**
   1.191 +     * Reads a byte from the data source.
   1.192 +     * 
   1.193 +     * -1 means end.
   1.194 +     * @return
   1.195 +     * @throws IOException
   1.196 +     */
   1.197 +    protected int read() throws IOException {
   1.198 +        return readable.readByte();
   1.199 +    }
   1.200 +
   1.201 +    // ]NOCPP]
   1.202 +
   1.203 +    // WARNING When editing this, make sure the bytecode length shown by javap
   1.204 +    // stays under 8000 bytes!
   1.205 +    /**
   1.206 +     * Runs the meta scanning algorithm.
   1.207 +     */
   1.208 +    protected final void stateLoop(int state)
   1.209 +            throws SAXException, IOException {
   1.210 +        int c = -1;
   1.211 +        boolean reconsume = false;
   1.212 +        stateloop: for (;;) {
   1.213 +            switch (state) {
   1.214 +                case DATA:
   1.215 +                    dataloop: for (;;) {
   1.216 +                        if (reconsume) {
   1.217 +                            reconsume = false;
   1.218 +                        } else {
   1.219 +                            c = read();
   1.220 +                        }
   1.221 +                        switch (c) {
   1.222 +                            case -1:
   1.223 +                                break stateloop;
   1.224 +                            case '<':
   1.225 +                                state = MetaScanner.TAG_OPEN;
   1.226 +                                break dataloop; // FALL THROUGH continue
   1.227 +                            // stateloop;
   1.228 +                            default:
   1.229 +                                continue;
   1.230 +                        }
   1.231 +                    }
   1.232 +                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
   1.233 +                case TAG_OPEN:
   1.234 +                    tagopenloop: for (;;) {
   1.235 +                        c = read();
   1.236 +                        switch (c) {
   1.237 +                            case -1:
   1.238 +                                break stateloop;
   1.239 +                            case 'm':
   1.240 +                            case 'M':
   1.241 +                                metaState = M;
   1.242 +                                state = MetaScanner.TAG_NAME;
   1.243 +                                break tagopenloop;
   1.244 +                                // continue stateloop;                                
   1.245 +                            case '!':
   1.246 +                                state = MetaScanner.MARKUP_DECLARATION_OPEN;
   1.247 +                                continue stateloop;
   1.248 +                            case '?':
   1.249 +                            case '/':
   1.250 +                                state = MetaScanner.SCAN_UNTIL_GT;
   1.251 +                                continue stateloop;
   1.252 +                            case '>':
   1.253 +                                state = MetaScanner.DATA;
   1.254 +                                continue stateloop;
   1.255 +                            default:
   1.256 +                                if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
   1.257 +                                    metaState = NO;
   1.258 +                                    state = MetaScanner.TAG_NAME;
   1.259 +                                    break tagopenloop;
   1.260 +                                    // continue stateloop;
   1.261 +                                }
   1.262 +                                state = MetaScanner.DATA;
   1.263 +                                reconsume = true;
   1.264 +                                continue stateloop;
   1.265 +                        }
   1.266 +                    }
   1.267 +                    // FALL THROUGH DON'T REORDER
   1.268 +                case TAG_NAME:
   1.269 +                    tagnameloop: for (;;) {
   1.270 +                        c = read();
   1.271 +                        switch (c) {
   1.272 +                            case -1:
   1.273 +                                break stateloop;
   1.274 +                            case ' ':
   1.275 +                            case '\t':
   1.276 +                            case '\n':
   1.277 +                            case '\u000C':
   1.278 +                                state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
   1.279 +                                break tagnameloop;
   1.280 +                            // continue stateloop;
   1.281 +                            case '/':
   1.282 +                                state = MetaScanner.SELF_CLOSING_START_TAG;
   1.283 +                                continue stateloop;
   1.284 +                            case '>':
   1.285 +                                state = MetaScanner.DATA;
   1.286 +                                continue stateloop;
   1.287 +                            case 'e':
   1.288 +                            case 'E':
   1.289 +                                if (metaState == M) {
   1.290 +                                    metaState = E;
   1.291 +                                } else {
   1.292 +                                    metaState = NO;
   1.293 +                                }
   1.294 +                                continue;
   1.295 +                            case 't':
   1.296 +                            case 'T':
   1.297 +                                if (metaState == E) {
   1.298 +                                    metaState = T;
   1.299 +                                } else {
   1.300 +                                    metaState = NO;
   1.301 +                                }
   1.302 +                                continue;
   1.303 +                            case 'a':
   1.304 +                            case 'A':
   1.305 +                                if (metaState == T) {
   1.306 +                                    metaState = A;
   1.307 +                                } else {
   1.308 +                                    metaState = NO;
   1.309 +                                }
   1.310 +                                continue;
   1.311 +                            default:
   1.312 +                                metaState = NO;
   1.313 +                                continue;
   1.314 +                        }
   1.315 +                    }
   1.316 +                    // FALLTHRU DON'T REORDER
   1.317 +                case BEFORE_ATTRIBUTE_NAME:
   1.318 +                    beforeattributenameloop: for (;;) {
   1.319 +                        if (reconsume) {
   1.320 +                            reconsume = false;
   1.321 +                        } else {
   1.322 +                            c = read();
   1.323 +                        }
   1.324 +                        /*
   1.325 +                         * Consume the next input character:
   1.326 +                         */
   1.327 +                        switch (c) {
   1.328 +                            case -1:
   1.329 +                                break stateloop;
   1.330 +                            case ' ':
   1.331 +                            case '\t':
   1.332 +                            case '\n':
   1.333 +                            case '\u000C':
   1.334 +                                continue;
   1.335 +                            case '/':
   1.336 +                                state = MetaScanner.SELF_CLOSING_START_TAG;
   1.337 +                                continue stateloop;
   1.338 +                            case '>':
   1.339 +                                if (handleTag()) {
   1.340 +                                    break stateloop;
   1.341 +                                }
   1.342 +                                state = DATA;
   1.343 +                                continue stateloop;
   1.344 +                            case 'c':
   1.345 +                            case 'C':
   1.346 +                                contentIndex = 0;
   1.347 +                                charsetIndex = 0;
   1.348 +                                httpEquivIndex = Integer.MAX_VALUE;
   1.349 +                                contentTypeIndex = Integer.MAX_VALUE;
   1.350 +                                state = MetaScanner.ATTRIBUTE_NAME;
   1.351 +                                break beforeattributenameloop;                                
   1.352 +                            case 'h':
   1.353 +                            case 'H':
   1.354 +                                contentIndex = Integer.MAX_VALUE;
   1.355 +                                charsetIndex = Integer.MAX_VALUE;
   1.356 +                                httpEquivIndex = 0;
   1.357 +                                contentTypeIndex = Integer.MAX_VALUE;
   1.358 +                                state = MetaScanner.ATTRIBUTE_NAME;
   1.359 +                                break beforeattributenameloop;                                
   1.360 +                            default:
   1.361 +                                contentIndex = Integer.MAX_VALUE;
   1.362 +                                charsetIndex = Integer.MAX_VALUE;
   1.363 +                                httpEquivIndex = Integer.MAX_VALUE;
   1.364 +                                contentTypeIndex = Integer.MAX_VALUE;
   1.365 +                                state = MetaScanner.ATTRIBUTE_NAME;
   1.366 +                                break beforeattributenameloop;
   1.367 +                            // continue stateloop;
   1.368 +                        }
   1.369 +                    }
   1.370 +                    // FALLTHRU DON'T REORDER
   1.371 +                case ATTRIBUTE_NAME:
   1.372 +                    attributenameloop: for (;;) {
   1.373 +                        c = read();
   1.374 +                        switch (c) {
   1.375 +                            case -1:
   1.376 +                                break stateloop;
   1.377 +                            case ' ':
   1.378 +                            case '\t':
   1.379 +                            case '\n':
   1.380 +                            case '\u000C':
   1.381 +                                state = MetaScanner.AFTER_ATTRIBUTE_NAME;
   1.382 +                                continue stateloop;
   1.383 +                            case '/':
   1.384 +                                state = MetaScanner.SELF_CLOSING_START_TAG;
   1.385 +                                continue stateloop;
   1.386 +                            case '=':
   1.387 +                                strBufLen = 0;
   1.388 +                                contentTypeIndex = 0;
   1.389 +                                state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
   1.390 +                                break attributenameloop;
   1.391 +                            // continue stateloop;
   1.392 +                            case '>':
   1.393 +                                if (handleTag()) {
   1.394 +                                    break stateloop;
   1.395 +                                }
   1.396 +                                state = MetaScanner.DATA;
   1.397 +                                continue stateloop;
   1.398 +                            default:
   1.399 +                                if (metaState == A) {
   1.400 +                                    if (c >= 'A' && c <= 'Z') {
   1.401 +                                        c += 0x20;
   1.402 +                                    }
   1.403 +                                    if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) {
   1.404 +                                        ++contentIndex;
   1.405 +                                    } else {
   1.406 +                                        contentIndex = Integer.MAX_VALUE;
   1.407 +                                    }
   1.408 +                                    if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) {
   1.409 +                                        ++charsetIndex;
   1.410 +                                    } else {
   1.411 +                                        charsetIndex = Integer.MAX_VALUE;
   1.412 +                                    }
   1.413 +                                    if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) {
   1.414 +                                        ++httpEquivIndex;
   1.415 +                                    } else {
   1.416 +                                        httpEquivIndex = Integer.MAX_VALUE;
   1.417 +                                    }                                    
   1.418 +                                }
   1.419 +                                continue;
   1.420 +                        }
   1.421 +                    }
   1.422 +                    // FALLTHRU DON'T REORDER
   1.423 +                case BEFORE_ATTRIBUTE_VALUE:
   1.424 +                    beforeattributevalueloop: for (;;) {
   1.425 +                        c = read();
   1.426 +                        switch (c) {
   1.427 +                            case -1:
   1.428 +                                break stateloop;
   1.429 +                            case ' ':
   1.430 +                            case '\t':
   1.431 +                            case '\n':
   1.432 +                            case '\u000C':
   1.433 +                                continue;
   1.434 +                            case '"':
   1.435 +                                state = MetaScanner.ATTRIBUTE_VALUE_DOUBLE_QUOTED;
   1.436 +                                break beforeattributevalueloop;
   1.437 +                            // continue stateloop;
   1.438 +                            case '\'':
   1.439 +                                state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED;
   1.440 +                                continue stateloop;
   1.441 +                            case '>':
   1.442 +                                if (handleTag()) {
   1.443 +                                    break stateloop;
   1.444 +                                }
   1.445 +                                state = MetaScanner.DATA;
   1.446 +                                continue stateloop;
   1.447 +                            default:
   1.448 +                                handleCharInAttributeValue(c);
   1.449 +                                state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED;
   1.450 +                                continue stateloop;
   1.451 +                        }
   1.452 +                    }
   1.453 +                    // FALLTHRU DON'T REORDER
   1.454 +                case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
   1.455 +                    attributevaluedoublequotedloop: for (;;) {
   1.456 +                        if (reconsume) {
   1.457 +                            reconsume = false;
   1.458 +                        } else {
   1.459 +                            c = read();
   1.460 +                        }
   1.461 +                        switch (c) {
   1.462 +                            case -1:
   1.463 +                                break stateloop;
   1.464 +                            case '"':
   1.465 +                                handleAttributeValue();
   1.466 +                                state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
   1.467 +                                break attributevaluedoublequotedloop;
   1.468 +                            // continue stateloop;
   1.469 +                            default:
   1.470 +                                handleCharInAttributeValue(c);
   1.471 +                                continue;
   1.472 +                        }
   1.473 +                    }
   1.474 +                    // FALLTHRU DON'T REORDER
   1.475 +                case AFTER_ATTRIBUTE_VALUE_QUOTED:
   1.476 +                    afterattributevaluequotedloop: for (;;) {
   1.477 +                        c = read();
   1.478 +                        switch (c) {
   1.479 +                            case -1:
   1.480 +                                break stateloop;
   1.481 +                            case ' ':
   1.482 +                            case '\t':
   1.483 +                            case '\n':
   1.484 +                            case '\u000C':
   1.485 +                                state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
   1.486 +                                continue stateloop;
   1.487 +                            case '/':
   1.488 +                                state = MetaScanner.SELF_CLOSING_START_TAG;
   1.489 +                                break afterattributevaluequotedloop;
   1.490 +                            // continue stateloop;
   1.491 +                            case '>':
   1.492 +                                if (handleTag()) {
   1.493 +                                    break stateloop;
   1.494 +                                }
   1.495 +                                state = MetaScanner.DATA;
   1.496 +                                continue stateloop;
   1.497 +                            default:
   1.498 +                                state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
   1.499 +                                reconsume = true;
   1.500 +                                continue stateloop;
   1.501 +                        }
   1.502 +                    }
   1.503 +                    // FALLTHRU DON'T REORDER
   1.504 +                case SELF_CLOSING_START_TAG:
   1.505 +                    c = read();
   1.506 +                    switch (c) {
   1.507 +                        case -1:
   1.508 +                            break stateloop;
   1.509 +                        case '>':
   1.510 +                            if (handleTag()) {
   1.511 +                                break stateloop;
   1.512 +                            }
   1.513 +                            state = MetaScanner.DATA;
   1.514 +                            continue stateloop;
   1.515 +                        default:
   1.516 +                            state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
   1.517 +                            reconsume = true;
   1.518 +                            continue stateloop;
   1.519 +                    }
   1.520 +                    // XXX reorder point
   1.521 +                case ATTRIBUTE_VALUE_UNQUOTED:
   1.522 +                    for (;;) {
   1.523 +                        if (reconsume) {
   1.524 +                            reconsume = false;
   1.525 +                        } else {
   1.526 +                            c = read();
   1.527 +                        }
   1.528 +                        switch (c) {
   1.529 +                            case -1:
   1.530 +                                break stateloop;
   1.531 +                            case ' ':
   1.532 +                            case '\t':
   1.533 +                            case '\n':
   1.534 +
   1.535 +                            case '\u000C':
   1.536 +                                handleAttributeValue();
   1.537 +                                state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
   1.538 +                                continue stateloop;
   1.539 +                            case '>':
   1.540 +                                handleAttributeValue();
   1.541 +                                if (handleTag()) {
   1.542 +                                    break stateloop;
   1.543 +                                }
   1.544 +                                state = MetaScanner.DATA;
   1.545 +                                continue stateloop;
   1.546 +                            default:
   1.547 +                                handleCharInAttributeValue(c);
   1.548 +                                continue;
   1.549 +                        }
   1.550 +                    }
   1.551 +                    // XXX reorder point
   1.552 +                case AFTER_ATTRIBUTE_NAME:
   1.553 +                    for (;;) {
   1.554 +                        c = read();
   1.555 +                        switch (c) {
   1.556 +                            case -1:
   1.557 +                                break stateloop;
   1.558 +                            case ' ':
   1.559 +                            case '\t':
   1.560 +                            case '\n':
   1.561 +                            case '\u000C':
   1.562 +                                continue;
   1.563 +                            case '/':
   1.564 +                                handleAttributeValue();
   1.565 +                                state = MetaScanner.SELF_CLOSING_START_TAG;
   1.566 +                                continue stateloop;
   1.567 +                            case '=':
   1.568 +                                strBufLen = 0;
   1.569 +                                contentTypeIndex = 0;
   1.570 +                                state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
   1.571 +                                continue stateloop;
   1.572 +                            case '>':
   1.573 +                                handleAttributeValue();
   1.574 +                                if (handleTag()) {
   1.575 +                                    break stateloop;
   1.576 +                                }
   1.577 +                                state = MetaScanner.DATA;
   1.578 +                                continue stateloop;
   1.579 +                            case 'c':
   1.580 +                            case 'C':
   1.581 +                                contentIndex = 0;
   1.582 +                                charsetIndex = 0;
   1.583 +                                state = MetaScanner.ATTRIBUTE_NAME;
   1.584 +                                continue stateloop;
   1.585 +                            default:
   1.586 +                                contentIndex = Integer.MAX_VALUE;
   1.587 +                                charsetIndex = Integer.MAX_VALUE;
   1.588 +                                state = MetaScanner.ATTRIBUTE_NAME;
   1.589 +                                continue stateloop;
   1.590 +                        }
   1.591 +                    }
   1.592 +                    // XXX reorder point
   1.593 +                case MARKUP_DECLARATION_OPEN:
   1.594 +                    markupdeclarationopenloop: for (;;) {
   1.595 +                        c = read();
   1.596 +                        switch (c) {
   1.597 +                            case -1:
   1.598 +                                break stateloop;
   1.599 +                            case '-':
   1.600 +                                state = MetaScanner.MARKUP_DECLARATION_HYPHEN;
   1.601 +                                break markupdeclarationopenloop;
   1.602 +                            // continue stateloop;
   1.603 +                            default:
   1.604 +                                state = MetaScanner.SCAN_UNTIL_GT;
   1.605 +                                reconsume = true;
   1.606 +                                continue stateloop;
   1.607 +                        }
   1.608 +                    }
   1.609 +                    // FALLTHRU DON'T REORDER
   1.610 +                case MARKUP_DECLARATION_HYPHEN:
   1.611 +                    markupdeclarationhyphenloop: for (;;) {
   1.612 +                        c = read();
   1.613 +                        switch (c) {
   1.614 +                            case -1:
   1.615 +                                break stateloop;
   1.616 +                            case '-':
   1.617 +                                state = MetaScanner.COMMENT_START;
   1.618 +                                break markupdeclarationhyphenloop;
   1.619 +                            // continue stateloop;
   1.620 +                            default:
   1.621 +                                state = MetaScanner.SCAN_UNTIL_GT;
   1.622 +                                reconsume = true;
   1.623 +                                continue stateloop;
   1.624 +                        }
   1.625 +                    }
   1.626 +                    // FALLTHRU DON'T REORDER
   1.627 +                case COMMENT_START:
   1.628 +                    commentstartloop: for (;;) {
   1.629 +                        c = read();
   1.630 +                        switch (c) {
   1.631 +                            case -1:
   1.632 +                                break stateloop;
   1.633 +                            case '-':
   1.634 +                                state = MetaScanner.COMMENT_START_DASH;
   1.635 +                                continue stateloop;
   1.636 +                            case '>':
   1.637 +                                state = MetaScanner.DATA;
   1.638 +                                continue stateloop;
   1.639 +                            default:
   1.640 +                                state = MetaScanner.COMMENT;
   1.641 +                                break commentstartloop;
   1.642 +                            // continue stateloop;
   1.643 +                        }
   1.644 +                    }
   1.645 +                    // FALLTHRU DON'T REORDER
   1.646 +                case COMMENT:
   1.647 +                    commentloop: for (;;) {
   1.648 +                        c = read();
   1.649 +                        switch (c) {
   1.650 +                            case -1:
   1.651 +                                break stateloop;
   1.652 +                            case '-':
   1.653 +                                state = MetaScanner.COMMENT_END_DASH;
   1.654 +                                break commentloop;
   1.655 +                            // continue stateloop;
   1.656 +                            default:
   1.657 +                                continue;
   1.658 +                        }
   1.659 +                    }
   1.660 +                    // FALLTHRU DON'T REORDER
   1.661 +                case COMMENT_END_DASH:
   1.662 +                    commentenddashloop: for (;;) {
   1.663 +                        c = read();
   1.664 +                        switch (c) {
   1.665 +                            case -1:
   1.666 +                                break stateloop;
   1.667 +                            case '-':
   1.668 +                                state = MetaScanner.COMMENT_END;
   1.669 +                                break commentenddashloop;
   1.670 +                            // continue stateloop;
   1.671 +                            default:
   1.672 +                                state = MetaScanner.COMMENT;
   1.673 +                                continue stateloop;
   1.674 +                        }
   1.675 +                    }
   1.676 +                    // FALLTHRU DON'T REORDER
   1.677 +                case COMMENT_END:
   1.678 +                    for (;;) {
   1.679 +                        c = read();
   1.680 +                        switch (c) {
   1.681 +                            case -1:
   1.682 +                                break stateloop;
   1.683 +                            case '>':
   1.684 +                                state = MetaScanner.DATA;
   1.685 +                                continue stateloop;
   1.686 +                            case '-':
   1.687 +                                continue;
   1.688 +                            default:
   1.689 +                                state = MetaScanner.COMMENT;
   1.690 +                                continue stateloop;
   1.691 +                        }
   1.692 +                    }
   1.693 +                    // XXX reorder point
   1.694 +                case COMMENT_START_DASH:
   1.695 +                    c = read();
   1.696 +                    switch (c) {
   1.697 +                        case -1:
   1.698 +                            break stateloop;
   1.699 +                        case '-':
   1.700 +                            state = MetaScanner.COMMENT_END;
   1.701 +                            continue stateloop;
   1.702 +                        case '>':
   1.703 +                            state = MetaScanner.DATA;
   1.704 +                            continue stateloop;
   1.705 +                        default:
   1.706 +                            state = MetaScanner.COMMENT;
   1.707 +                            continue stateloop;
   1.708 +                    }
   1.709 +                    // XXX reorder point
   1.710 +                case ATTRIBUTE_VALUE_SINGLE_QUOTED:
   1.711 +                    for (;;) {
   1.712 +                        if (reconsume) {
   1.713 +                            reconsume = false;
   1.714 +                        } else {
   1.715 +                            c = read();
   1.716 +                        }
   1.717 +                        switch (c) {
   1.718 +                            case -1:
   1.719 +                                break stateloop;
   1.720 +                            case '\'':
   1.721 +                                handleAttributeValue();
   1.722 +                                state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
   1.723 +                                continue stateloop;
   1.724 +                            default:
   1.725 +                                handleCharInAttributeValue(c);
   1.726 +                                continue;
   1.727 +                        }
   1.728 +                    }
   1.729 +                    // XXX reorder point
   1.730 +                case SCAN_UNTIL_GT:
   1.731 +                    for (;;) {
   1.732 +                        if (reconsume) {
   1.733 +                            reconsume = false;
   1.734 +                        } else {
   1.735 +                            c = read();
   1.736 +                        }
   1.737 +                        switch (c) {
   1.738 +                            case -1:
   1.739 +                                break stateloop;
   1.740 +                            case '>':
   1.741 +                                state = MetaScanner.DATA;
   1.742 +                                continue stateloop;
   1.743 +                            default:
   1.744 +                                continue;
   1.745 +                        }
   1.746 +                    }
   1.747 +            }
   1.748 +        }
   1.749 +        stateSave  = state;
   1.750 +    }
   1.751 +
   1.752 +    private void handleCharInAttributeValue(int c) {
   1.753 +        if (metaState == A) {
   1.754 +            if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) {
   1.755 +                addToBuffer(c);
   1.756 +            } else if (httpEquivIndex == HTTP_EQUIV.length) {
   1.757 +                if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) {
   1.758 +                    ++contentTypeIndex;
   1.759 +                } else {
   1.760 +                    contentTypeIndex = Integer.MAX_VALUE;
   1.761 +                }
   1.762 +            }
   1.763 +        }
   1.764 +    }
   1.765 +
   1.766 +    @Inline private int toAsciiLowerCase(int c) {
   1.767 +        if (c >= 'A' && c <= 'Z') {
   1.768 +            return c + 0x20;
   1.769 +        }
   1.770 +        return c;
   1.771 +    }
   1.772 +
   1.773 +    /**
   1.774 +     * Adds a character to the accumulation buffer.
   1.775 +     * @param c the character to add
   1.776 +     */
   1.777 +    private void addToBuffer(int c) {
   1.778 +        if (strBufLen == strBuf.length) {
   1.779 +            char[] newBuf = new char[strBuf.length + (strBuf.length << 1)];
   1.780 +            System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
   1.781 +            strBuf = newBuf;
   1.782 +        }
   1.783 +        strBuf[strBufLen++] = (char)c;
   1.784 +    }
   1.785 +
   1.786 +    /**
   1.787 +     * Attempts to extract a charset name from the accumulation buffer.
   1.788 +     * @return <code>true</code> if successful
   1.789 +     * @throws SAXException
   1.790 +     */
   1.791 +    private void handleAttributeValue() throws SAXException {
   1.792 +        if (metaState != A) {
   1.793 +            return;
   1.794 +        }
   1.795 +        if (contentIndex == CONTENT.length && content == null) {
   1.796 +            content = Portability.newStringFromBuffer(strBuf, 0, strBufLen);
   1.797 +            return;
   1.798 +        }
   1.799 +        if (charsetIndex == CHARSET.length && charset == null) {
   1.800 +            charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen);            
   1.801 +            return;
   1.802 +        }
   1.803 +        if (httpEquivIndex == HTTP_EQUIV.length
   1.804 +                && httpEquivState == HTTP_EQUIV_NOT_SEEN) {
   1.805 +            httpEquivState = (contentTypeIndex == CONTENT_TYPE.length) ? HTTP_EQUIV_CONTENT_TYPE
   1.806 +                    : HTTP_EQUIV_OTHER;
   1.807 +            return;
   1.808 +        }
   1.809 +    }
   1.810 +
   1.811 +    private boolean handleTag() throws SAXException {
   1.812 +        boolean stop = handleTagInner();
   1.813 +        Portability.releaseString(content);
   1.814 +        content = null;
   1.815 +        Portability.releaseString(charset);
   1.816 +        charset = null;
   1.817 +        httpEquivState = HTTP_EQUIV_NOT_SEEN;
   1.818 +        return stop;
   1.819 +    }
   1.820 +    
   1.821 +    private boolean handleTagInner() throws SAXException {
   1.822 +        if (charset != null && tryCharset(charset)) {
   1.823 +                return true;
   1.824 +        }
   1.825 +        if (content != null && httpEquivState == HTTP_EQUIV_CONTENT_TYPE) {
   1.826 +            String extract = TreeBuilder.extractCharsetFromContent(content);
   1.827 +            if (extract == null) {
   1.828 +                return false;
   1.829 +            }
   1.830 +            boolean success = tryCharset(extract);
   1.831 +            Portability.releaseString(extract);
   1.832 +            return success;
   1.833 +        }
   1.834 +        return false;
   1.835 +    }
   1.836 +
    /**
     * Tries to switch to an encoding. Subclasses supply the implementation;
     * within this class it is called with either the stored
     * <code>charset</code> value or the charset name extracted from the
     * stored <code>content</code> value.
     * 
     * @param encoding
     *            the encoding name to try
     * @return <code>true</code> if successful
     * @throws SAXException
     *             if the implementation reports an error
     */
    protected abstract boolean tryCharset(String encoding) throws SAXException;
   1.845 +    
   1.846 +}

mercurial