1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/parser/html/javasrc/MetaScanner.java Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,843 @@ 1.4 +/* 1.5 + * Copyright (c) 2007 Henri Sivonen 1.6 + * Copyright (c) 2008-2010 Mozilla Foundation 1.7 + * 1.8 + * Permission is hereby granted, free of charge, to any person obtaining a 1.9 + * copy of this software and associated documentation files (the "Software"), 1.10 + * to deal in the Software without restriction, including without limitation 1.11 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 1.12 + * and/or sell copies of the Software, and to permit persons to whom the 1.13 + * Software is furnished to do so, subject to the following conditions: 1.14 + * 1.15 + * The above copyright notice and this permission notice shall be included in 1.16 + * all copies or substantial portions of the Software. 1.17 + * 1.18 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1.19 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1.20 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1.21 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1.22 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 1.23 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 1.24 + * DEALINGS IN THE SOFTWARE. 1.25 + */ 1.26 + 1.27 +package nu.validator.htmlparser.impl; 1.28 + 1.29 +import java.io.IOException; 1.30 + 1.31 +import nu.validator.htmlparser.annotation.Auto; 1.32 +import nu.validator.htmlparser.annotation.Inline; 1.33 +import nu.validator.htmlparser.common.ByteReadable; 1.34 + 1.35 +import org.xml.sax.SAXException; 1.36 + 1.37 +public abstract class MetaScanner { 1.38 + 1.39 + /** 1.40 + * Constant for "charset". 1.41 + */ 1.42 + private static final char[] CHARSET = { 'h', 'a', 'r', 's', 'e', 't' }; 1.43 + 1.44 + /** 1.45 + * Constant for "content". 1.46 + */ 1.47 + private static final char[] CONTENT = { 'o', 'n', 't', 'e', 'n', 't' }; 1.48 + 1.49 + /** 1.50 + * Constant for "http-equiv". 1.51 + */ 1.52 + private static final char[] HTTP_EQUIV = { 't', 't', 'p', '-', 'e', 'q', 1.53 + 'u', 'i', 'v' }; 1.54 + 1.55 + /** 1.56 + * Constant for "content-type". 1.57 + */ 1.58 + private static final char[] CONTENT_TYPE = { 'c', 'o', 'n', 't', 'e', 'n', 1.59 + 't', '-', 't', 'y', 'p', 'e' }; 1.60 + 1.61 + private static final int NO = 0; 1.62 + 1.63 + private static final int M = 1; 1.64 + 1.65 + private static final int E = 2; 1.66 + 1.67 + private static final int T = 3; 1.68 + 1.69 + private static final int A = 4; 1.70 + 1.71 + private static final int DATA = 0; 1.72 + 1.73 + private static final int TAG_OPEN = 1; 1.74 + 1.75 + private static final int SCAN_UNTIL_GT = 2; 1.76 + 1.77 + private static final int TAG_NAME = 3; 1.78 + 1.79 + private static final int BEFORE_ATTRIBUTE_NAME = 4; 1.80 + 1.81 + private static final int ATTRIBUTE_NAME = 5; 1.82 + 1.83 + private static final int AFTER_ATTRIBUTE_NAME = 6; 1.84 + 1.85 + private static final int BEFORE_ATTRIBUTE_VALUE = 7; 1.86 + 1.87 + private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 8; 1.88 + 1.89 + private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 9; 1.90 + 1.91 + private static final int ATTRIBUTE_VALUE_UNQUOTED = 10; 1.92 + 1.93 + private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 11; 1.94 + 1.95 + private static final int MARKUP_DECLARATION_OPEN = 13; 1.96 + 1.97 + private static final int MARKUP_DECLARATION_HYPHEN = 14; 1.98 + 1.99 + private static final int COMMENT_START = 15; 1.100 + 1.101 + private static final int COMMENT_START_DASH = 16; 1.102 + 1.103 + private static final int COMMENT = 17; 1.104 + 1.105 + private static final int COMMENT_END_DASH = 18; 1.106 + 1.107 + private static final int COMMENT_END = 19; 1.108 + 1.109 + private static final int SELF_CLOSING_START_TAG = 20; 1.110 + 1.111 + private static final int HTTP_EQUIV_NOT_SEEN = 0; 1.112 + 1.113 + private static final int HTTP_EQUIV_CONTENT_TYPE = 1; 1.114 + 1.115 + private static final int HTTP_EQUIV_OTHER = 2; 1.116 + 1.117 + /** 1.118 + * The data source. 1.119 + */ 1.120 + protected ByteReadable readable; 1.121 + 1.122 + /** 1.123 + * The state of the state machine that recognizes the tag name "meta". 1.124 + */ 1.125 + private int metaState = NO; 1.126 + 1.127 + /** 1.128 + * The current position in recognizing the attribute name "content". 1.129 + */ 1.130 + private int contentIndex = Integer.MAX_VALUE; 1.131 + 1.132 + /** 1.133 + * The current position in recognizing the attribute name "charset". 1.134 + */ 1.135 + private int charsetIndex = Integer.MAX_VALUE; 1.136 + 1.137 + /** 1.138 + * The current position in recognizing the attribute name "http-equive". 1.139 + */ 1.140 + private int httpEquivIndex = Integer.MAX_VALUE; 1.141 + 1.142 + /** 1.143 + * The current position in recognizing the attribute value "content-type". 1.144 + */ 1.145 + private int contentTypeIndex = Integer.MAX_VALUE; 1.146 + 1.147 + /** 1.148 + * The tokenizer state. 1.149 + */ 1.150 + protected int stateSave = DATA; 1.151 + 1.152 + /** 1.153 + * The currently filled length of strBuf. 1.154 + */ 1.155 + private int strBufLen; 1.156 + 1.157 + /** 1.158 + * Accumulation buffer for attribute values. 1.159 + */ 1.160 + private @Auto char[] strBuf; 1.161 + 1.162 + private String content; 1.163 + 1.164 + private String charset; 1.165 + 1.166 + private int httpEquivState; 1.167 + 1.168 + public MetaScanner() { 1.169 + this.readable = null; 1.170 + this.metaState = NO; 1.171 + this.contentIndex = Integer.MAX_VALUE; 1.172 + this.charsetIndex = Integer.MAX_VALUE; 1.173 + this.httpEquivIndex = Integer.MAX_VALUE; 1.174 + this.contentTypeIndex = Integer.MAX_VALUE; 1.175 + this.stateSave = DATA; 1.176 + this.strBufLen = 0; 1.177 + this.strBuf = new char[36]; 1.178 + this.content = null; 1.179 + this.charset = null; 1.180 + this.httpEquivState = HTTP_EQUIV_NOT_SEEN; 1.181 + } 1.182 + 1.183 + @SuppressWarnings("unused") private void destructor() { 1.184 + Portability.releaseString(content); 1.185 + Portability.releaseString(charset); 1.186 + } 1.187 + 1.188 + // [NOCPP[ 1.189 + 1.190 + /** 1.191 + * Reads a byte from the data source. 1.192 + * 1.193 + * -1 means end. 1.194 + * @return 1.195 + * @throws IOException 1.196 + */ 1.197 + protected int read() throws IOException { 1.198 + return readable.readByte(); 1.199 + } 1.200 + 1.201 + // ]NOCPP] 1.202 + 1.203 + // WARNING When editing this, makes sure the bytecode length shown by javap 1.204 + // stays under 8000 bytes! 1.205 + /** 1.206 + * The runs the meta scanning algorithm. 1.207 + */ 1.208 + protected final void stateLoop(int state) 1.209 + throws SAXException, IOException { 1.210 + int c = -1; 1.211 + boolean reconsume = false; 1.212 + stateloop: for (;;) { 1.213 + switch (state) { 1.214 + case DATA: 1.215 + dataloop: for (;;) { 1.216 + if (reconsume) { 1.217 + reconsume = false; 1.218 + } else { 1.219 + c = read(); 1.220 + } 1.221 + switch (c) { 1.222 + case -1: 1.223 + break stateloop; 1.224 + case '<': 1.225 + state = MetaScanner.TAG_OPEN; 1.226 + break dataloop; // FALL THROUGH continue 1.227 + // stateloop; 1.228 + default: 1.229 + continue; 1.230 + } 1.231 + } 1.232 + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER 1.233 + case TAG_OPEN: 1.234 + tagopenloop: for (;;) { 1.235 + c = read(); 1.236 + switch (c) { 1.237 + case -1: 1.238 + break stateloop; 1.239 + case 'm': 1.240 + case 'M': 1.241 + metaState = M; 1.242 + state = MetaScanner.TAG_NAME; 1.243 + break tagopenloop; 1.244 + // continue stateloop; 1.245 + case '!': 1.246 + state = MetaScanner.MARKUP_DECLARATION_OPEN; 1.247 + continue stateloop; 1.248 + case '?': 1.249 + case '/': 1.250 + state = MetaScanner.SCAN_UNTIL_GT; 1.251 + continue stateloop; 1.252 + case '>': 1.253 + state = MetaScanner.DATA; 1.254 + continue stateloop; 1.255 + default: 1.256 + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { 1.257 + metaState = NO; 1.258 + state = MetaScanner.TAG_NAME; 1.259 + break tagopenloop; 1.260 + // continue stateloop; 1.261 + } 1.262 + state = MetaScanner.DATA; 1.263 + reconsume = true; 1.264 + continue stateloop; 1.265 + } 1.266 + } 1.267 + // FALL THROUGH DON'T REORDER 1.268 + case TAG_NAME: 1.269 + tagnameloop: for (;;) { 1.270 + c = read(); 1.271 + switch (c) { 1.272 + case -1: 1.273 + break stateloop; 1.274 + case ' ': 1.275 + case '\t': 1.276 + case '\n': 1.277 + case '\u000C': 1.278 + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; 1.279 + break tagnameloop; 1.280 + // continue stateloop; 1.281 + case '/': 1.282 + state = MetaScanner.SELF_CLOSING_START_TAG; 1.283 + continue stateloop; 1.284 + case '>': 1.285 + state = MetaScanner.DATA; 1.286 + continue stateloop; 1.287 + case 'e': 1.288 + case 'E': 1.289 + if (metaState == M) { 1.290 + metaState = E; 1.291 + } else { 1.292 + metaState = NO; 1.293 + } 1.294 + continue; 1.295 + case 't': 1.296 + case 'T': 1.297 + if (metaState == E) { 1.298 + metaState = T; 1.299 + } else { 1.300 + metaState = NO; 1.301 + } 1.302 + continue; 1.303 + case 'a': 1.304 + case 'A': 1.305 + if (metaState == T) { 1.306 + metaState = A; 1.307 + } else { 1.308 + metaState = NO; 1.309 + } 1.310 + continue; 1.311 + default: 1.312 + metaState = NO; 1.313 + continue; 1.314 + } 1.315 + } 1.316 + // FALLTHRU DON'T REORDER 1.317 + case BEFORE_ATTRIBUTE_NAME: 1.318 + beforeattributenameloop: for (;;) { 1.319 + if (reconsume) { 1.320 + reconsume = false; 1.321 + } else { 1.322 + c = read(); 1.323 + } 1.324 + /* 1.325 + * Consume the next input character: 1.326 + */ 1.327 + switch (c) { 1.328 + case -1: 1.329 + break stateloop; 1.330 + case ' ': 1.331 + case '\t': 1.332 + case '\n': 1.333 + case '\u000C': 1.334 + continue; 1.335 + case '/': 1.336 + state = MetaScanner.SELF_CLOSING_START_TAG; 1.337 + continue stateloop; 1.338 + case '>': 1.339 + if (handleTag()) { 1.340 + break stateloop; 1.341 + } 1.342 + state = DATA; 1.343 + continue stateloop; 1.344 + case 'c': 1.345 + case 'C': 1.346 + contentIndex = 0; 1.347 + charsetIndex = 0; 1.348 + httpEquivIndex = Integer.MAX_VALUE; 1.349 + contentTypeIndex = Integer.MAX_VALUE; 1.350 + state = MetaScanner.ATTRIBUTE_NAME; 1.351 + break beforeattributenameloop; 1.352 + case 'h': 1.353 + case 'H': 1.354 + contentIndex = Integer.MAX_VALUE; 1.355 + charsetIndex = Integer.MAX_VALUE; 1.356 + httpEquivIndex = 0; 1.357 + contentTypeIndex = Integer.MAX_VALUE; 1.358 + state = MetaScanner.ATTRIBUTE_NAME; 1.359 + break beforeattributenameloop; 1.360 + default: 1.361 + contentIndex = Integer.MAX_VALUE; 1.362 + charsetIndex = Integer.MAX_VALUE; 1.363 + httpEquivIndex = Integer.MAX_VALUE; 1.364 + contentTypeIndex = Integer.MAX_VALUE; 1.365 + state = MetaScanner.ATTRIBUTE_NAME; 1.366 + break beforeattributenameloop; 1.367 + // continue stateloop; 1.368 + } 1.369 + } 1.370 + // FALLTHRU DON'T REORDER 1.371 + case ATTRIBUTE_NAME: 1.372 + attributenameloop: for (;;) { 1.373 + c = read(); 1.374 + switch (c) { 1.375 + case -1: 1.376 + break stateloop; 1.377 + case ' ': 1.378 + case '\t': 1.379 + case '\n': 1.380 + case '\u000C': 1.381 + state = MetaScanner.AFTER_ATTRIBUTE_NAME; 1.382 + continue stateloop; 1.383 + case '/': 1.384 + state = MetaScanner.SELF_CLOSING_START_TAG; 1.385 + continue stateloop; 1.386 + case '=': 1.387 + strBufLen = 0; 1.388 + contentTypeIndex = 0; 1.389 + state = MetaScanner.BEFORE_ATTRIBUTE_VALUE; 1.390 + break attributenameloop; 1.391 + // continue stateloop; 1.392 + case '>': 1.393 + if (handleTag()) { 1.394 + break stateloop; 1.395 + } 1.396 + state = MetaScanner.DATA; 1.397 + continue stateloop; 1.398 + default: 1.399 + if (metaState == A) { 1.400 + if (c >= 'A' && c <= 'Z') { 1.401 + c += 0x20; 1.402 + } 1.403 + if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) { 1.404 + ++contentIndex; 1.405 + } else { 1.406 + contentIndex = Integer.MAX_VALUE; 1.407 + } 1.408 + if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) { 1.409 + ++charsetIndex; 1.410 + } else { 1.411 + charsetIndex = Integer.MAX_VALUE; 1.412 + } 1.413 + if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) { 1.414 + ++httpEquivIndex; 1.415 + } else { 1.416 + httpEquivIndex = Integer.MAX_VALUE; 1.417 + } 1.418 + } 1.419 + continue; 1.420 + } 1.421 + } 1.422 + // FALLTHRU DON'T REORDER 1.423 + case BEFORE_ATTRIBUTE_VALUE: 1.424 + beforeattributevalueloop: for (;;) { 1.425 + c = read(); 1.426 + switch (c) { 1.427 + case -1: 1.428 + break stateloop; 1.429 + case ' ': 1.430 + case '\t': 1.431 + case '\n': 1.432 + case '\u000C': 1.433 + continue; 1.434 + case '"': 1.435 + state = MetaScanner.ATTRIBUTE_VALUE_DOUBLE_QUOTED; 1.436 + break beforeattributevalueloop; 1.437 + // continue stateloop; 1.438 + case '\'': 1.439 + state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED; 1.440 + continue stateloop; 1.441 + case '>': 1.442 + if (handleTag()) { 1.443 + break stateloop; 1.444 + } 1.445 + state = MetaScanner.DATA; 1.446 + continue stateloop; 1.447 + default: 1.448 + handleCharInAttributeValue(c); 1.449 + state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED; 1.450 + continue stateloop; 1.451 + } 1.452 + } 1.453 + // FALLTHRU DON'T REORDER 1.454 + case ATTRIBUTE_VALUE_DOUBLE_QUOTED: 1.455 + attributevaluedoublequotedloop: for (;;) { 1.456 + if (reconsume) { 1.457 + reconsume = false; 1.458 + } else { 1.459 + c = read(); 1.460 + } 1.461 + switch (c) { 1.462 + case -1: 1.463 + break stateloop; 1.464 + case '"': 1.465 + handleAttributeValue(); 1.466 + state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED; 1.467 + break attributevaluedoublequotedloop; 1.468 + // continue stateloop; 1.469 + default: 1.470 + handleCharInAttributeValue(c); 1.471 + continue; 1.472 + } 1.473 + } 1.474 + // FALLTHRU DON'T REORDER 1.475 + case AFTER_ATTRIBUTE_VALUE_QUOTED: 1.476 + afterattributevaluequotedloop: for (;;) { 1.477 + c = read(); 1.478 + switch (c) { 1.479 + case -1: 1.480 + break stateloop; 1.481 + case ' ': 1.482 + case '\t': 1.483 + case '\n': 1.484 + case '\u000C': 1.485 + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; 1.486 + continue stateloop; 1.487 + case '/': 1.488 + state = MetaScanner.SELF_CLOSING_START_TAG; 1.489 + break afterattributevaluequotedloop; 1.490 + // continue stateloop; 1.491 + case '>': 1.492 + if (handleTag()) { 1.493 + break stateloop; 1.494 + } 1.495 + state = MetaScanner.DATA; 1.496 + continue stateloop; 1.497 + default: 1.498 + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; 1.499 + reconsume = true; 1.500 + continue stateloop; 1.501 + } 1.502 + } 1.503 + // FALLTHRU DON'T REORDER 1.504 + case SELF_CLOSING_START_TAG: 1.505 + c = read(); 1.506 + switch (c) { 1.507 + case -1: 1.508 + break stateloop; 1.509 + case '>': 1.510 + if (handleTag()) { 1.511 + break stateloop; 1.512 + } 1.513 + state = MetaScanner.DATA; 1.514 + continue stateloop; 1.515 + default: 1.516 + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; 1.517 + reconsume = true; 1.518 + continue stateloop; 1.519 + } 1.520 + // XXX reorder point 1.521 + case ATTRIBUTE_VALUE_UNQUOTED: 1.522 + for (;;) { 1.523 + if (reconsume) { 1.524 + reconsume = false; 1.525 + } else { 1.526 + c = read(); 1.527 + } 1.528 + switch (c) { 1.529 + case -1: 1.530 + break stateloop; 1.531 + case ' ': 1.532 + case '\t': 1.533 + case '\n': 1.534 + 1.535 + case '\u000C': 1.536 + handleAttributeValue(); 1.537 + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; 1.538 + continue stateloop; 1.539 + case '>': 1.540 + handleAttributeValue(); 1.541 + if (handleTag()) { 1.542 + break stateloop; 1.543 + } 1.544 + state = MetaScanner.DATA; 1.545 + continue stateloop; 1.546 + default: 1.547 + handleCharInAttributeValue(c); 1.548 + continue; 1.549 + } 1.550 + } 1.551 + // XXX reorder point 1.552 + case AFTER_ATTRIBUTE_NAME: 1.553 + for (;;) { 1.554 + c = read(); 1.555 + switch (c) { 1.556 + case -1: 1.557 + break stateloop; 1.558 + case ' ': 1.559 + case '\t': 1.560 + case '\n': 1.561 + case '\u000C': 1.562 + continue; 1.563 + case '/': 1.564 + handleAttributeValue(); 1.565 + state = MetaScanner.SELF_CLOSING_START_TAG; 1.566 + continue stateloop; 1.567 + case '=': 1.568 + strBufLen = 0; 1.569 + contentTypeIndex = 0; 1.570 + state = MetaScanner.BEFORE_ATTRIBUTE_VALUE; 1.571 + continue stateloop; 1.572 + case '>': 1.573 + handleAttributeValue(); 1.574 + if (handleTag()) { 1.575 + break stateloop; 1.576 + } 1.577 + state = MetaScanner.DATA; 1.578 + continue stateloop; 1.579 + case 'c': 1.580 + case 'C': 1.581 + contentIndex = 0; 1.582 + charsetIndex = 0; 1.583 + state = MetaScanner.ATTRIBUTE_NAME; 1.584 + continue stateloop; 1.585 + default: 1.586 + contentIndex = Integer.MAX_VALUE; 1.587 + charsetIndex = Integer.MAX_VALUE; 1.588 + state = MetaScanner.ATTRIBUTE_NAME; 1.589 + continue stateloop; 1.590 + } 1.591 + } 1.592 + // XXX reorder point 1.593 + case MARKUP_DECLARATION_OPEN: 1.594 + markupdeclarationopenloop: for (;;) { 1.595 + c = read(); 1.596 + switch (c) { 1.597 + case -1: 1.598 + break stateloop; 1.599 + case '-': 1.600 + state = MetaScanner.MARKUP_DECLARATION_HYPHEN; 1.601 + break markupdeclarationopenloop; 1.602 + // continue stateloop; 1.603 + default: 1.604 + state = MetaScanner.SCAN_UNTIL_GT; 1.605 + reconsume = true; 1.606 + continue stateloop; 1.607 + } 1.608 + } 1.609 + // FALLTHRU DON'T REORDER 1.610 + case MARKUP_DECLARATION_HYPHEN: 1.611 + markupdeclarationhyphenloop: for (;;) { 1.612 + c = read(); 1.613 + switch (c) { 1.614 + case -1: 1.615 + break stateloop; 1.616 + case '-': 1.617 + state = MetaScanner.COMMENT_START; 1.618 + break markupdeclarationhyphenloop; 1.619 + // continue stateloop; 1.620 + default: 1.621 + state = MetaScanner.SCAN_UNTIL_GT; 1.622 + reconsume = true; 1.623 + continue stateloop; 1.624 + } 1.625 + } 1.626 + // FALLTHRU DON'T REORDER 1.627 + case COMMENT_START: 1.628 + commentstartloop: for (;;) { 1.629 + c = read(); 1.630 + switch (c) { 1.631 + case -1: 1.632 + break stateloop; 1.633 + case '-': 1.634 + state = MetaScanner.COMMENT_START_DASH; 1.635 + continue stateloop; 1.636 + case '>': 1.637 + state = MetaScanner.DATA; 1.638 + continue stateloop; 1.639 + default: 1.640 + state = MetaScanner.COMMENT; 1.641 + break commentstartloop; 1.642 + // continue stateloop; 1.643 + } 1.644 + } 1.645 + // FALLTHRU DON'T REORDER 1.646 + case COMMENT: 1.647 + commentloop: for (;;) { 1.648 + c = read(); 1.649 + switch (c) { 1.650 + case -1: 1.651 + break stateloop; 1.652 + case '-': 1.653 + state = MetaScanner.COMMENT_END_DASH; 1.654 + break commentloop; 1.655 + // continue stateloop; 1.656 + default: 1.657 + continue; 1.658 + } 1.659 + } 1.660 + // FALLTHRU DON'T REORDER 1.661 + case COMMENT_END_DASH: 1.662 + commentenddashloop: for (;;) { 1.663 + c = read(); 1.664 + switch (c) { 1.665 + case -1: 1.666 + break stateloop; 1.667 + case '-': 1.668 + state = MetaScanner.COMMENT_END; 1.669 + break commentenddashloop; 1.670 + // continue stateloop; 1.671 + default: 1.672 + state = MetaScanner.COMMENT; 1.673 + continue stateloop; 1.674 + } 1.675 + } 1.676 + // FALLTHRU DON'T REORDER 1.677 + case COMMENT_END: 1.678 + for (;;) { 1.679 + c = read(); 1.680 + switch (c) { 1.681 + case -1: 1.682 + break stateloop; 1.683 + case '>': 1.684 + state = MetaScanner.DATA; 1.685 + continue stateloop; 1.686 + case '-': 1.687 + continue; 1.688 + default: 1.689 + state = MetaScanner.COMMENT; 1.690 + continue stateloop; 1.691 + } 1.692 + } 1.693 + // XXX reorder point 1.694 + case COMMENT_START_DASH: 1.695 + c = read(); 1.696 + switch (c) { 1.697 + case -1: 1.698 + break stateloop; 1.699 + case '-': 1.700 + state = MetaScanner.COMMENT_END; 1.701 + continue stateloop; 1.702 + case '>': 1.703 + state = MetaScanner.DATA; 1.704 + continue stateloop; 1.705 + default: 1.706 + state = MetaScanner.COMMENT; 1.707 + continue stateloop; 1.708 + } 1.709 + // XXX reorder point 1.710 + case ATTRIBUTE_VALUE_SINGLE_QUOTED: 1.711 + for (;;) { 1.712 + if (reconsume) { 1.713 + reconsume = false; 1.714 + } else { 1.715 + c = read(); 1.716 + } 1.717 + switch (c) { 1.718 + case -1: 1.719 + break stateloop; 1.720 + case '\'': 1.721 + handleAttributeValue(); 1.722 + state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED; 1.723 + continue stateloop; 1.724 + default: 1.725 + handleCharInAttributeValue(c); 1.726 + continue; 1.727 + } 1.728 + } 1.729 + // XXX reorder point 1.730 + case SCAN_UNTIL_GT: 1.731 + for (;;) { 1.732 + if (reconsume) { 1.733 + reconsume = false; 1.734 + } else { 1.735 + c = read(); 1.736 + } 1.737 + switch (c) { 1.738 + case -1: 1.739 + break stateloop; 1.740 + case '>': 1.741 + state = MetaScanner.DATA; 1.742 + continue stateloop; 1.743 + default: 1.744 + continue; 1.745 + } 1.746 + } 1.747 + } 1.748 + } 1.749 + stateSave = state; 1.750 + } 1.751 + 1.752 + private void handleCharInAttributeValue(int c) { 1.753 + if (metaState == A) { 1.754 + if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) { 1.755 + addToBuffer(c); 1.756 + } else if (httpEquivIndex == HTTP_EQUIV.length) { 1.757 + if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) { 1.758 + ++contentTypeIndex; 1.759 + } else { 1.760 + contentTypeIndex = Integer.MAX_VALUE; 1.761 + } 1.762 + } 1.763 + } 1.764 + } 1.765 + 1.766 + @Inline private int toAsciiLowerCase(int c) { 1.767 + if (c >= 'A' && c <= 'Z') { 1.768 + return c + 0x20; 1.769 + } 1.770 + return c; 1.771 + } 1.772 + 1.773 + /** 1.774 + * Adds a character to the accumulation buffer. 1.775 + * @param c the character to add 1.776 + */ 1.777 + private void addToBuffer(int c) { 1.778 + if (strBufLen == strBuf.length) { 1.779 + char[] newBuf = new char[strBuf.length + (strBuf.length << 1)]; 1.780 + System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length); 1.781 + strBuf = newBuf; 1.782 + } 1.783 + strBuf[strBufLen++] = (char)c; 1.784 + } 1.785 + 1.786 + /** 1.787 + * Attempts to extract a charset name from the accumulation buffer. 1.788 + * @return <code>true</code> if successful 1.789 + * @throws SAXException 1.790 + */ 1.791 + private void handleAttributeValue() throws SAXException { 1.792 + if (metaState != A) { 1.793 + return; 1.794 + } 1.795 + if (contentIndex == CONTENT.length && content == null) { 1.796 + content = Portability.newStringFromBuffer(strBuf, 0, strBufLen); 1.797 + return; 1.798 + } 1.799 + if (charsetIndex == CHARSET.length && charset == null) { 1.800 + charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen); 1.801 + return; 1.802 + } 1.803 + if (httpEquivIndex == HTTP_EQUIV.length 1.804 + && httpEquivState == HTTP_EQUIV_NOT_SEEN) { 1.805 + httpEquivState = (contentTypeIndex == CONTENT_TYPE.length) ? HTTP_EQUIV_CONTENT_TYPE 1.806 + : HTTP_EQUIV_OTHER; 1.807 + return; 1.808 + } 1.809 + } 1.810 + 1.811 + private boolean handleTag() throws SAXException { 1.812 + boolean stop = handleTagInner(); 1.813 + Portability.releaseString(content); 1.814 + content = null; 1.815 + Portability.releaseString(charset); 1.816 + charset = null; 1.817 + httpEquivState = HTTP_EQUIV_NOT_SEEN; 1.818 + return stop; 1.819 + } 1.820 + 1.821 + private boolean handleTagInner() throws SAXException { 1.822 + if (charset != null && tryCharset(charset)) { 1.823 + return true; 1.824 + } 1.825 + if (content != null && httpEquivState == HTTP_EQUIV_CONTENT_TYPE) { 1.826 + String extract = TreeBuilder.extractCharsetFromContent(content); 1.827 + if (extract == null) { 1.828 + return false; 1.829 + } 1.830 + boolean success = tryCharset(extract); 1.831 + Portability.releaseString(extract); 1.832 + return success; 1.833 + } 1.834 + return false; 1.835 + } 1.836 + 1.837 + /** 1.838 + * Tries to switch to an encoding. 1.839 + * 1.840 + * @param encoding 1.841 + * @return <code>true</code> if successful 1.842 + * @throws SAXException 1.843 + */ 1.844 + protected abstract boolean tryCharset(String encoding) throws SAXException; 1.845 + 1.846 +}