parser/html/javasrc/MetaScanner.java

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 * Copyright (c) 2007 Henri Sivonen
michael@0 3 * Copyright (c) 2008-2010 Mozilla Foundation
michael@0 4 *
michael@0 5 * Permission is hereby granted, free of charge, to any person obtaining a
michael@0 6 * copy of this software and associated documentation files (the "Software"),
michael@0 7 * to deal in the Software without restriction, including without limitation
michael@0 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
michael@0 9 * and/or sell copies of the Software, and to permit persons to whom the
michael@0 10 * Software is furnished to do so, subject to the following conditions:
michael@0 11 *
michael@0 12 * The above copyright notice and this permission notice shall be included in
michael@0 13 * all copies or substantial portions of the Software.
michael@0 14 *
michael@0 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
michael@0 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
michael@0 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
michael@0 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
michael@0 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
michael@0 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
michael@0 21 * DEALINGS IN THE SOFTWARE.
michael@0 22 */
michael@0 23
michael@0 24 package nu.validator.htmlparser.impl;
michael@0 25
michael@0 26 import java.io.IOException;
michael@0 27
michael@0 28 import nu.validator.htmlparser.annotation.Auto;
michael@0 29 import nu.validator.htmlparser.annotation.Inline;
michael@0 30 import nu.validator.htmlparser.common.ByteReadable;
michael@0 31
michael@0 32 import org.xml.sax.SAXException;
michael@0 33
michael@0 34 public abstract class MetaScanner {
michael@0 35
michael@0 36 /**
michael@0 37 * The attribute name "charset" without its first letter.
 * The leading letter is consumed by the state that starts an attribute name,
 * which sets charsetIndex to 0, so matching continues from the second letter.
michael@0 38 */
michael@0 39 private static final char[] CHARSET = { 'h', 'a', 'r', 's', 'e', 't' };
michael@0 40
michael@0 41 /**
michael@0 42 * The attribute name "content" without its first letter (see CHARSET).
michael@0 43 */
michael@0 44 private static final char[] CONTENT = { 'o', 'n', 't', 'e', 'n', 't' };
michael@0 45
michael@0 46 /**
michael@0 47 * The attribute name "http-equiv" without its first letter (see CHARSET).
michael@0 48 */
michael@0 49 private static final char[] HTTP_EQUIV = { 't', 't', 'p', '-', 'e', 'q',
michael@0 50 'u', 'i', 'v' };
michael@0 51
michael@0 52 /**
michael@0 53 * The complete attribute value "content-type"; attribute value characters
 * are lowercased before being compared against it.
michael@0 54 */
michael@0 55 private static final char[] CONTENT_TYPE = { 'c', 'o', 'n', 't', 'e', 'n',
michael@0 56 't', '-', 't', 'y', 'p', 'e' };
michael@0 57
// Values of metaState: how far the tag name "meta" has been matched.
// NO means the current tag name is not (or no longer) a prefix of "meta";
// A means all four letters have been matched in order.
michael@0 58 private static final int NO = 0;
michael@0 59
michael@0 60 private static final int M = 1;
michael@0 61
michael@0 62 private static final int E = 2;
michael@0 63
michael@0 64 private static final int T = 3;
michael@0 65
michael@0 66 private static final int A = 4;
michael@0 67
// Tokenizer states switched on by stateLoop(). The value 12 is not assigned
// to any state in this file.
michael@0 68 private static final int DATA = 0;
michael@0 69
michael@0 70 private static final int TAG_OPEN = 1;
michael@0 71
michael@0 72 private static final int SCAN_UNTIL_GT = 2;
michael@0 73
michael@0 74 private static final int TAG_NAME = 3;
michael@0 75
michael@0 76 private static final int BEFORE_ATTRIBUTE_NAME = 4;
michael@0 77
michael@0 78 private static final int ATTRIBUTE_NAME = 5;
michael@0 79
michael@0 80 private static final int AFTER_ATTRIBUTE_NAME = 6;
michael@0 81
michael@0 82 private static final int BEFORE_ATTRIBUTE_VALUE = 7;
michael@0 83
michael@0 84 private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 8;
michael@0 85
michael@0 86 private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 9;
michael@0 87
michael@0 88 private static final int ATTRIBUTE_VALUE_UNQUOTED = 10;
michael@0 89
michael@0 90 private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 11;
michael@0 91
michael@0 92 private static final int MARKUP_DECLARATION_OPEN = 13;
michael@0 93
michael@0 94 private static final int MARKUP_DECLARATION_HYPHEN = 14;
michael@0 95
michael@0 96 private static final int COMMENT_START = 15;
michael@0 97
michael@0 98 private static final int COMMENT_START_DASH = 16;
michael@0 99
michael@0 100 private static final int COMMENT = 17;
michael@0 101
michael@0 102 private static final int COMMENT_END_DASH = 18;
michael@0 103
michael@0 104 private static final int COMMENT_END = 19;
michael@0 105
michael@0 106 private static final int SELF_CLOSING_START_TAG = 20;
michael@0 107
// Values of httpEquivState: whether an http-equiv attribute has been handled
// for the current tag and, if so, whether its value was "content-type".
michael@0 108 private static final int HTTP_EQUIV_NOT_SEEN = 0;
michael@0 109
michael@0 110 private static final int HTTP_EQUIV_CONTENT_TYPE = 1;
michael@0 111
michael@0 112 private static final int HTTP_EQUIV_OTHER = 2;
michael@0 113
michael@0 114 /**
michael@0 115 * The data source that read() pulls bytes from.
michael@0 116 */
michael@0 117 protected ByteReadable readable;
michael@0 118
michael@0 119 /**
michael@0 120 * The state of the state machine that recognizes the tag name "meta".
michael@0 121 */
michael@0 122 private int metaState = NO;
michael@0 123
michael@0 124 /**
michael@0 125 * The current position in recognizing the attribute name "content".
 * Integer.MAX_VALUE means the current attribute name is not a match; a value
 * equal to CONTENT.length means the whole name matched. The same convention
 * applies to the other index fields below.
michael@0 126 */
michael@0 127 private int contentIndex = Integer.MAX_VALUE;
michael@0 128
michael@0 129 /**
michael@0 130 * The current position in recognizing the attribute name "charset".
michael@0 131 */
michael@0 132 private int charsetIndex = Integer.MAX_VALUE;
michael@0 133
michael@0 134 /**
michael@0 135 * The current position in recognizing the attribute name "http-equiv".
michael@0 136 */
michael@0 137 private int httpEquivIndex = Integer.MAX_VALUE;
michael@0 138
michael@0 139 /**
michael@0 140 * The current position in recognizing the attribute value "content-type".
michael@0 141 */
michael@0 142 private int contentTypeIndex = Integer.MAX_VALUE;
michael@0 143
michael@0 144 /**
michael@0 145 * The tokenizer state recorded by stateLoop() when it returns.
michael@0 146 */
michael@0 147 protected int stateSave = DATA;
michael@0 148
michael@0 149 /**
michael@0 150 * The currently filled length of strBuf.
michael@0 151 */
michael@0 152 private int strBufLen;
michael@0 153
michael@0 154 /**
michael@0 155 * Accumulation buffer for attribute values.
michael@0 156 */
michael@0 157 private @Auto char[] strBuf;
michael@0 158
// Value of the first "content" attribute captured for the current tag, or
// null if none has been captured yet.
michael@0 159 private String content;
michael@0 160
// Value of the first "charset" attribute captured for the current tag, or
// null if none has been captured yet.
michael@0 161 private String charset;
michael@0 162
// One of the HTTP_EQUIV_* constants; reset to HTTP_EQUIV_NOT_SEEN by handleTag().
michael@0 163 private int httpEquivState;
michael@0 164
/**
 * Creates a scanner with no data source attached and every piece of matching
 * state reset: no tag name or attribute name in progress, nothing captured,
 * and the tokenizer in the DATA state. The accumulation buffer starts with
 * room for 36 chars and is enlarged on demand by addToBuffer().
 */
michael@0 165 public MetaScanner() {
michael@0 166 this.readable = null;
michael@0 167 this.metaState = NO;
michael@0 168 this.contentIndex = Integer.MAX_VALUE;
michael@0 169 this.charsetIndex = Integer.MAX_VALUE;
michael@0 170 this.httpEquivIndex = Integer.MAX_VALUE;
michael@0 171 this.contentTypeIndex = Integer.MAX_VALUE;
michael@0 172 this.stateSave = DATA;
michael@0 173 this.strBufLen = 0;
michael@0 174 this.strBuf = new char[36];
michael@0 175 this.content = null;
michael@0 176 this.charset = null;
michael@0 177 this.httpEquivState = HTTP_EQUIV_NOT_SEEN;
michael@0 178 }
michael@0 179
/**
 * Hands the captured content and charset strings to
 * Portability.releaseString(). Nothing in this file calls this method, hence
 * the suppressed unused warning.
 * NOTE(review): presumably this exists for a non-Java build of the parser
 * (compare the NOCPP markers below) - confirm before removing.
 */
michael@0 180 @SuppressWarnings("unused") private void destructor() {
michael@0 181 Portability.releaseString(content);
michael@0 182 Portability.releaseString(charset);
michael@0 183 }
michael@0 184
michael@0 185 // [NOCPP[
michael@0 186
michael@0 187 /**
michael@0 188 * Reads a byte from the data source by delegating to readable.readByte().
michael@0 189 *
michael@0 190 * -1 means end.
michael@0 191 * @return the value returned by the data source; stateLoop() treats -1 as end of input
michael@0 192 * @throws IOException if the data source throws it
michael@0 193 */
michael@0 194 protected int read() throws IOException {
michael@0 195 return readable.readByte();
michael@0 196 }
michael@0 197
michael@0 198 // ]NOCPP]
michael@0 199
michael@0 200 // WARNING When editing this, make sure the bytecode length shown by javap
michael@0 201 // stays under 8000 bytes!
michael@0 202 /**
michael@0 203 * Runs the meta scanning algorithm.
 *
 * Bytes are pulled from read() and fed through a tokenizer-like state machine
 * that tracks whether the current tag is named "meta" and which of its
 * attributes are "charset", "content" or "http-equiv". The loop ends when
 * read() returns -1 or when handleTag() returns true; in both cases the state
 * reached is stored in stateSave.
 *
 * Many cases rely on falling through into the case that follows them, so the
 * order of the case blocks must not be changed.
 *
 * @param state the tokenizer state to start in (one of the state constants)
 * @throws SAXException if handleAttributeValue() or handleTag() throws it
 * @throws IOException if read() throws it
michael@0 204 */
michael@0 205 protected final void stateLoop(int state)
michael@0 206 throws SAXException, IOException {
michael@0 207 int c = -1;
// When true, the state that honors this flag re-examines c instead of
// reading a new byte.
michael@0 208 boolean reconsume = false;
michael@0 209 stateloop: for (;;) {
michael@0 210 switch (state) {
michael@0 211 case DATA:
michael@0 212 dataloop: for (;;) {
michael@0 213 if (reconsume) {
michael@0 214 reconsume = false;
michael@0 215 } else {
michael@0 216 c = read();
michael@0 217 }
michael@0 218 switch (c) {
michael@0 219 case -1:
michael@0 220 break stateloop;
michael@0 221 case '<':
michael@0 222 state = MetaScanner.TAG_OPEN;
michael@0 223 break dataloop; // FALL THROUGH continue
michael@0 224 // stateloop;
michael@0 225 default:
michael@0 226 continue;
michael@0 227 }
michael@0 228 }
michael@0 229 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
michael@0 230 case TAG_OPEN:
michael@0 231 tagopenloop: for (;;) {
michael@0 232 c = read();
michael@0 233 switch (c) {
michael@0 234 case -1:
michael@0 235 break stateloop;
michael@0 236 case 'm':
michael@0 237 case 'M':
michael@0 238 metaState = M;
michael@0 239 state = MetaScanner.TAG_NAME;
michael@0 240 break tagopenloop;
michael@0 241 // continue stateloop;
michael@0 242 case '!':
michael@0 243 state = MetaScanner.MARKUP_DECLARATION_OPEN;
michael@0 244 continue stateloop;
michael@0 245 case '?':
michael@0 246 case '/':
michael@0 247 state = MetaScanner.SCAN_UNTIL_GT;
michael@0 248 continue stateloop;
michael@0 249 case '>':
michael@0 250 state = MetaScanner.DATA;
michael@0 251 continue stateloop;
michael@0 252 default:
michael@0 253 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
michael@0 254 metaState = NO;
michael@0 255 state = MetaScanner.TAG_NAME;
michael@0 256 break tagopenloop;
michael@0 257 // continue stateloop;
michael@0 258 }
michael@0 259 state = MetaScanner.DATA;
michael@0 260 reconsume = true;
michael@0 261 continue stateloop;
michael@0 262 }
michael@0 263 }
michael@0 264 // FALL THROUGH DON'T REORDER
michael@0 265 case TAG_NAME:
michael@0 266 tagnameloop: for (;;) {
michael@0 267 c = read();
michael@0 268 switch (c) {
michael@0 269 case -1:
michael@0 270 break stateloop;
michael@0 271 case ' ':
michael@0 272 case '\t':
michael@0 273 case '\n':
michael@0 274 case '\u000C':
michael@0 275 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
michael@0 276 break tagnameloop;
michael@0 277 // continue stateloop;
michael@0 278 case '/':
michael@0 279 state = MetaScanner.SELF_CLOSING_START_TAG;
michael@0 280 continue stateloop;
michael@0 281 case '>':
michael@0 282 state = MetaScanner.DATA;
michael@0 283 continue stateloop;
// The letters e, t and a advance metaState only when they arrive in that
// order after an initial m; any other character resets it to NO.
michael@0 284 case 'e':
michael@0 285 case 'E':
michael@0 286 if (metaState == M) {
michael@0 287 metaState = E;
michael@0 288 } else {
michael@0 289 metaState = NO;
michael@0 290 }
michael@0 291 continue;
michael@0 292 case 't':
michael@0 293 case 'T':
michael@0 294 if (metaState == E) {
michael@0 295 metaState = T;
michael@0 296 } else {
michael@0 297 metaState = NO;
michael@0 298 }
michael@0 299 continue;
michael@0 300 case 'a':
michael@0 301 case 'A':
michael@0 302 if (metaState == T) {
michael@0 303 metaState = A;
michael@0 304 } else {
michael@0 305 metaState = NO;
michael@0 306 }
michael@0 307 continue;
michael@0 308 default:
michael@0 309 metaState = NO;
michael@0 310 continue;
michael@0 311 }
michael@0 312 }
michael@0 313 // FALLTHRU DON'T REORDER
michael@0 314 case BEFORE_ATTRIBUTE_NAME:
michael@0 315 beforeattributenameloop: for (;;) {
michael@0 316 if (reconsume) {
michael@0 317 reconsume = false;
michael@0 318 } else {
michael@0 319 c = read();
michael@0 320 }
michael@0 321 /*
michael@0 322 * Consume the next input character:
michael@0 323 */
michael@0 324 switch (c) {
michael@0 325 case -1:
michael@0 326 break stateloop;
michael@0 327 case ' ':
michael@0 328 case '\t':
michael@0 329 case '\n':
michael@0 330 case '\u000C':
michael@0 331 continue;
michael@0 332 case '/':
michael@0 333 state = MetaScanner.SELF_CLOSING_START_TAG;
michael@0 334 continue stateloop;
michael@0 335 case '>':
michael@0 336 if (handleTag()) {
michael@0 337 break stateloop;
michael@0 338 }
michael@0 339 state = DATA;
michael@0 340 continue stateloop;
// The first letter of an attribute name decides which name matchers are
// armed (index 0) and which are disabled (Integer.MAX_VALUE).
michael@0 341 case 'c':
michael@0 342 case 'C':
michael@0 343 contentIndex = 0;
michael@0 344 charsetIndex = 0;
michael@0 345 httpEquivIndex = Integer.MAX_VALUE;
michael@0 346 contentTypeIndex = Integer.MAX_VALUE;
michael@0 347 state = MetaScanner.ATTRIBUTE_NAME;
michael@0 348 break beforeattributenameloop;
michael@0 349 case 'h':
michael@0 350 case 'H':
michael@0 351 contentIndex = Integer.MAX_VALUE;
michael@0 352 charsetIndex = Integer.MAX_VALUE;
michael@0 353 httpEquivIndex = 0;
michael@0 354 contentTypeIndex = Integer.MAX_VALUE;
michael@0 355 state = MetaScanner.ATTRIBUTE_NAME;
michael@0 356 break beforeattributenameloop;
michael@0 357 default:
michael@0 358 contentIndex = Integer.MAX_VALUE;
michael@0 359 charsetIndex = Integer.MAX_VALUE;
michael@0 360 httpEquivIndex = Integer.MAX_VALUE;
michael@0 361 contentTypeIndex = Integer.MAX_VALUE;
michael@0 362 state = MetaScanner.ATTRIBUTE_NAME;
michael@0 363 break beforeattributenameloop;
michael@0 364 // continue stateloop;
michael@0 365 }
michael@0 366 }
michael@0 367 // FALLTHRU DON'T REORDER
michael@0 368 case ATTRIBUTE_NAME:
michael@0 369 attributenameloop: for (;;) {
michael@0 370 c = read();
michael@0 371 switch (c) {
michael@0 372 case -1:
michael@0 373 break stateloop;
michael@0 374 case ' ':
michael@0 375 case '\t':
michael@0 376 case '\n':
michael@0 377 case '\u000C':
michael@0 378 state = MetaScanner.AFTER_ATTRIBUTE_NAME;
michael@0 379 continue stateloop;
michael@0 380 case '/':
michael@0 381 state = MetaScanner.SELF_CLOSING_START_TAG;
michael@0 382 continue stateloop;
michael@0 383 case '=':
// A value follows: empty the accumulation buffer and arm the
// "content-type" value matcher.
michael@0 384 strBufLen = 0;
michael@0 385 contentTypeIndex = 0;
michael@0 386 state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
michael@0 387 break attributenameloop;
michael@0 388 // continue stateloop;
michael@0 389 case '>':
michael@0 390 if (handleTag()) {
michael@0 391 break stateloop;
michael@0 392 }
michael@0 393 state = MetaScanner.DATA;
michael@0 394 continue stateloop;
michael@0 395 default:
// Attribute names are only tracked inside a tag named "meta".
// ASCII upper case is folded to lower case before comparing; each
// matcher either advances by one or is disabled for this name.
michael@0 396 if (metaState == A) {
michael@0 397 if (c >= 'A' && c <= 'Z') {
michael@0 398 c += 0x20;
michael@0 399 }
michael@0 400 if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) {
michael@0 401 ++contentIndex;
michael@0 402 } else {
michael@0 403 contentIndex = Integer.MAX_VALUE;
michael@0 404 }
michael@0 405 if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) {
michael@0 406 ++charsetIndex;
michael@0 407 } else {
michael@0 408 charsetIndex = Integer.MAX_VALUE;
michael@0 409 }
michael@0 410 if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) {
michael@0 411 ++httpEquivIndex;
michael@0 412 } else {
michael@0 413 httpEquivIndex = Integer.MAX_VALUE;
michael@0 414 }
michael@0 415 }
michael@0 416 continue;
michael@0 417 }
michael@0 418 }
michael@0 419 // FALLTHRU DON'T REORDER
michael@0 420 case BEFORE_ATTRIBUTE_VALUE:
michael@0 421 beforeattributevalueloop: for (;;) {
michael@0 422 c = read();
michael@0 423 switch (c) {
michael@0 424 case -1:
michael@0 425 break stateloop;
michael@0 426 case ' ':
michael@0 427 case '\t':
michael@0 428 case '\n':
michael@0 429 case '\u000C':
michael@0 430 continue;
michael@0 431 case '"':
michael@0 432 state = MetaScanner.ATTRIBUTE_VALUE_DOUBLE_QUOTED;
michael@0 433 break beforeattributevalueloop;
michael@0 434 // continue stateloop;
michael@0 435 case '\'':
michael@0 436 state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED;
michael@0 437 continue stateloop;
michael@0 438 case '>':
michael@0 439 if (handleTag()) {
michael@0 440 break stateloop;
michael@0 441 }
michael@0 442 state = MetaScanner.DATA;
michael@0 443 continue stateloop;
michael@0 444 default:
// First character of an unquoted value: process it here, then let the
// unquoted-value state read the rest.
michael@0 445 handleCharInAttributeValue(c);
michael@0 446 state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED;
michael@0 447 continue stateloop;
michael@0 448 }
michael@0 449 }
michael@0 450 // FALLTHRU DON'T REORDER
michael@0 451 case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
michael@0 452 attributevaluedoublequotedloop: for (;;) {
michael@0 453 if (reconsume) {
michael@0 454 reconsume = false;
michael@0 455 } else {
michael@0 456 c = read();
michael@0 457 }
michael@0 458 switch (c) {
michael@0 459 case -1:
michael@0 460 break stateloop;
michael@0 461 case '"':
michael@0 462 handleAttributeValue();
michael@0 463 state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
michael@0 464 break attributevaluedoublequotedloop;
michael@0 465 // continue stateloop;
michael@0 466 default:
michael@0 467 handleCharInAttributeValue(c);
michael@0 468 continue;
michael@0 469 }
michael@0 470 }
michael@0 471 // FALLTHRU DON'T REORDER
michael@0 472 case AFTER_ATTRIBUTE_VALUE_QUOTED:
michael@0 473 afterattributevaluequotedloop: for (;;) {
michael@0 474 c = read();
michael@0 475 switch (c) {
michael@0 476 case -1:
michael@0 477 break stateloop;
michael@0 478 case ' ':
michael@0 479 case '\t':
michael@0 480 case '\n':
michael@0 481 case '\u000C':
michael@0 482 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
michael@0 483 continue stateloop;
michael@0 484 case '/':
michael@0 485 state = MetaScanner.SELF_CLOSING_START_TAG;
michael@0 486 break afterattributevaluequotedloop;
michael@0 487 // continue stateloop;
michael@0 488 case '>':
michael@0 489 if (handleTag()) {
michael@0 490 break stateloop;
michael@0 491 }
michael@0 492 state = MetaScanner.DATA;
michael@0 493 continue stateloop;
michael@0 494 default:
michael@0 495 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
michael@0 496 reconsume = true;
michael@0 497 continue stateloop;
michael@0 498 }
michael@0 499 }
michael@0 500 // FALLTHRU DON'T REORDER
michael@0 501 case SELF_CLOSING_START_TAG:
michael@0 502 c = read();
michael@0 503 switch (c) {
michael@0 504 case -1:
michael@0 505 break stateloop;
michael@0 506 case '>':
michael@0 507 if (handleTag()) {
michael@0 508 break stateloop;
michael@0 509 }
michael@0 510 state = MetaScanner.DATA;
michael@0 511 continue stateloop;
michael@0 512 default:
michael@0 513 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
michael@0 514 reconsume = true;
michael@0 515 continue stateloop;
michael@0 516 }
michael@0 517 // XXX reorder point
michael@0 518 case ATTRIBUTE_VALUE_UNQUOTED:
michael@0 519 for (;;) {
michael@0 520 if (reconsume) {
michael@0 521 reconsume = false;
michael@0 522 } else {
michael@0 523 c = read();
michael@0 524 }
michael@0 525 switch (c) {
michael@0 526 case -1:
michael@0 527 break stateloop;
michael@0 528 case ' ':
michael@0 529 case '\t':
michael@0 530 case '\n':
michael@0 531
michael@0 532 case '\u000C':
michael@0 533 handleAttributeValue();
michael@0 534 state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
michael@0 535 continue stateloop;
michael@0 536 case '>':
michael@0 537 handleAttributeValue();
michael@0 538 if (handleTag()) {
michael@0 539 break stateloop;
michael@0 540 }
michael@0 541 state = MetaScanner.DATA;
michael@0 542 continue stateloop;
michael@0 543 default:
michael@0 544 handleCharInAttributeValue(c);
michael@0 545 continue;
michael@0 546 }
michael@0 547 }
michael@0 548 // XXX reorder point
michael@0 549 case AFTER_ATTRIBUTE_NAME:
michael@0 550 for (;;) {
michael@0 551 c = read();
michael@0 552 switch (c) {
michael@0 553 case -1:
michael@0 554 break stateloop;
michael@0 555 case ' ':
michael@0 556 case '\t':
michael@0 557 case '\n':
michael@0 558 case '\u000C':
michael@0 559 continue;
michael@0 560 case '/':
// NOTE(review): the attribute ended without a value, yet
// handleAttributeValue() is called and would build its string from
// whatever strBufLen currently holds, since strBufLen is only reset
// when "=" is seen - confirm this is intended.
michael@0 561 handleAttributeValue();
michael@0 562 state = MetaScanner.SELF_CLOSING_START_TAG;
michael@0 563 continue stateloop;
michael@0 564 case '=':
michael@0 565 strBufLen = 0;
michael@0 566 contentTypeIndex = 0;
michael@0 567 state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
michael@0 568 continue stateloop;
michael@0 569 case '>':
michael@0 570 handleAttributeValue();
michael@0 571 if (handleTag()) {
michael@0 572 break stateloop;
michael@0 573 }
michael@0 574 state = MetaScanner.DATA;
michael@0 575 continue stateloop;
// NOTE(review): unlike BEFORE_ATTRIBUTE_NAME, the two branches below have
// no case for h/H and leave httpEquivIndex and contentTypeIndex as they
// were for the previous attribute name - confirm this is intended.
michael@0 576 case 'c':
michael@0 577 case 'C':
michael@0 578 contentIndex = 0;
michael@0 579 charsetIndex = 0;
michael@0 580 state = MetaScanner.ATTRIBUTE_NAME;
michael@0 581 continue stateloop;
michael@0 582 default:
michael@0 583 contentIndex = Integer.MAX_VALUE;
michael@0 584 charsetIndex = Integer.MAX_VALUE;
michael@0 585 state = MetaScanner.ATTRIBUTE_NAME;
michael@0 586 continue stateloop;
michael@0 587 }
michael@0 588 }
michael@0 589 // XXX reorder point
michael@0 590 case MARKUP_DECLARATION_OPEN:
michael@0 591 markupdeclarationopenloop: for (;;) {
michael@0 592 c = read();
michael@0 593 switch (c) {
michael@0 594 case -1:
michael@0 595 break stateloop;
michael@0 596 case '-':
michael@0 597 state = MetaScanner.MARKUP_DECLARATION_HYPHEN;
michael@0 598 break markupdeclarationopenloop;
michael@0 599 // continue stateloop;
michael@0 600 default:
michael@0 601 state = MetaScanner.SCAN_UNTIL_GT;
michael@0 602 reconsume = true;
michael@0 603 continue stateloop;
michael@0 604 }
michael@0 605 }
michael@0 606 // FALLTHRU DON'T REORDER
michael@0 607 case MARKUP_DECLARATION_HYPHEN:
michael@0 608 markupdeclarationhyphenloop: for (;;) {
michael@0 609 c = read();
michael@0 610 switch (c) {
michael@0 611 case -1:
michael@0 612 break stateloop;
michael@0 613 case '-':
michael@0 614 state = MetaScanner.COMMENT_START;
michael@0 615 break markupdeclarationhyphenloop;
michael@0 616 // continue stateloop;
michael@0 617 default:
michael@0 618 state = MetaScanner.SCAN_UNTIL_GT;
michael@0 619 reconsume = true;
michael@0 620 continue stateloop;
michael@0 621 }
michael@0 622 }
michael@0 623 // FALLTHRU DON'T REORDER
michael@0 624 case COMMENT_START:
michael@0 625 commentstartloop: for (;;) {
michael@0 626 c = read();
michael@0 627 switch (c) {
michael@0 628 case -1:
michael@0 629 break stateloop;
michael@0 630 case '-':
michael@0 631 state = MetaScanner.COMMENT_START_DASH;
michael@0 632 continue stateloop;
michael@0 633 case '>':
michael@0 634 state = MetaScanner.DATA;
michael@0 635 continue stateloop;
michael@0 636 default:
michael@0 637 state = MetaScanner.COMMENT;
michael@0 638 break commentstartloop;
michael@0 639 // continue stateloop;
michael@0 640 }
michael@0 641 }
michael@0 642 // FALLTHRU DON'T REORDER
michael@0 643 case COMMENT:
michael@0 644 commentloop: for (;;) {
michael@0 645 c = read();
michael@0 646 switch (c) {
michael@0 647 case -1:
michael@0 648 break stateloop;
michael@0 649 case '-':
michael@0 650 state = MetaScanner.COMMENT_END_DASH;
michael@0 651 break commentloop;
michael@0 652 // continue stateloop;
michael@0 653 default:
michael@0 654 continue;
michael@0 655 }
michael@0 656 }
michael@0 657 // FALLTHRU DON'T REORDER
michael@0 658 case COMMENT_END_DASH:
michael@0 659 commentenddashloop: for (;;) {
michael@0 660 c = read();
michael@0 661 switch (c) {
michael@0 662 case -1:
michael@0 663 break stateloop;
michael@0 664 case '-':
michael@0 665 state = MetaScanner.COMMENT_END;
michael@0 666 break commentenddashloop;
michael@0 667 // continue stateloop;
michael@0 668 default:
michael@0 669 state = MetaScanner.COMMENT;
michael@0 670 continue stateloop;
michael@0 671 }
michael@0 672 }
michael@0 673 // FALLTHRU DON'T REORDER
michael@0 674 case COMMENT_END:
michael@0 675 for (;;) {
michael@0 676 c = read();
michael@0 677 switch (c) {
michael@0 678 case -1:
michael@0 679 break stateloop;
michael@0 680 case '>':
michael@0 681 state = MetaScanner.DATA;
michael@0 682 continue stateloop;
michael@0 683 case '-':
michael@0 684 continue;
michael@0 685 default:
michael@0 686 state = MetaScanner.COMMENT;
michael@0 687 continue stateloop;
michael@0 688 }
michael@0 689 }
michael@0 690 // XXX reorder point
michael@0 691 case COMMENT_START_DASH:
michael@0 692 c = read();
michael@0 693 switch (c) {
michael@0 694 case -1:
michael@0 695 break stateloop;
michael@0 696 case '-':
michael@0 697 state = MetaScanner.COMMENT_END;
michael@0 698 continue stateloop;
michael@0 699 case '>':
michael@0 700 state = MetaScanner.DATA;
michael@0 701 continue stateloop;
michael@0 702 default:
michael@0 703 state = MetaScanner.COMMENT;
michael@0 704 continue stateloop;
michael@0 705 }
michael@0 706 // XXX reorder point
michael@0 707 case ATTRIBUTE_VALUE_SINGLE_QUOTED:
michael@0 708 for (;;) {
michael@0 709 if (reconsume) {
michael@0 710 reconsume = false;
michael@0 711 } else {
michael@0 712 c = read();
michael@0 713 }
michael@0 714 switch (c) {
michael@0 715 case -1:
michael@0 716 break stateloop;
michael@0 717 case '\'':
michael@0 718 handleAttributeValue();
michael@0 719 state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
michael@0 720 continue stateloop;
michael@0 721 default:
michael@0 722 handleCharInAttributeValue(c);
michael@0 723 continue;
michael@0 724 }
michael@0 725 }
michael@0 726 // XXX reorder point
michael@0 727 case SCAN_UNTIL_GT:
michael@0 728 for (;;) {
michael@0 729 if (reconsume) {
michael@0 730 reconsume = false;
michael@0 731 } else {
michael@0 732 c = read();
michael@0 733 }
michael@0 734 switch (c) {
michael@0 735 case -1:
michael@0 736 break stateloop;
michael@0 737 case '>':
michael@0 738 state = MetaScanner.DATA;
michael@0 739 continue stateloop;
michael@0 740 default:
michael@0 741 continue;
michael@0 742 }
michael@0 743 }
michael@0 744 }
michael@0 745 }
// Reached only through "break stateloop" (end of input, or handleTag()
// returned true): record the state that was current at that point.
michael@0 746 stateSave = state;
michael@0 747 }
michael@0 748
/**
 * Processes one character of an attribute value. Does nothing unless the
 * current tag is named "meta".
 *
 * If the current attribute name fully matched "content" or "charset", the
 * character is appended to the accumulation buffer. Otherwise, if the name
 * fully matched "http-equiv", the character is compared, ignoring ASCII case,
 * with the next expected character of "content-type"; a mismatch disables
 * that matcher by setting contentTypeIndex to Integer.MAX_VALUE.
 *
 * @param c the character read from the attribute value
 */
michael@0 749 private void handleCharInAttributeValue(int c) {
michael@0 750 if (metaState == A) {
michael@0 751 if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) {
michael@0 752 addToBuffer(c);
michael@0 753 } else if (httpEquivIndex == HTTP_EQUIV.length) {
michael@0 754 if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) {
michael@0 755 ++contentTypeIndex;
michael@0 756 } else {
michael@0 757 contentTypeIndex = Integer.MAX_VALUE;
michael@0 758 }
michael@0 759 }
michael@0 760 }
michael@0 761 }
michael@0 762
/**
 * Maps the ASCII upper case letters A-Z to a-z by adding 0x20.
 *
 * @param c the character to convert
 * @return the lower case letter for A-Z; any other value unchanged
 */
michael@0 763 @Inline private int toAsciiLowerCase(int c) {
michael@0 764 if (c >= 'A' && c <= 'Z') {
michael@0 765 return c + 0x20;
michael@0 766 }
michael@0 767 return c;
michael@0 768 }
michael@0 769
michael@0 770 /**
michael@0 771 * Adds a character to the accumulation buffer, enlarging the buffer first if it is full.
michael@0 772 * @param c the character to add; it is narrowed to char when stored
michael@0 773 */
michael@0 774 private void addToBuffer(int c) {
michael@0 775 if (strBufLen == strBuf.length) {
// Full: allocate length + 2 * length, i.e. three times the old capacity,
// and copy the existing contents over.
michael@0 776 char[] newBuf = new char[strBuf.length + (strBuf.length << 1)];
michael@0 777 System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
michael@0 778 strBuf = newBuf;
michael@0 779 }
michael@0 780 strBuf[strBufLen++] = (char)c;
michael@0 781 }
michael@0 782
michael@0 783 /**
michael@0 784 * Records the attribute that just ended, if the tag is "meta" and the name is one the scanner tracks.
michael@0 785 * Returns nothing; for each of content, charset and http-equiv only the first occurrence per tag is kept.
 * For "content" and "charset" the accumulation buffer becomes the stored
 * string; for "http-equiv" only whether the value matched "content-type"
 * is remembered.
michael@0 786 * @throws SAXException declared by the signature; nothing in this body throws it directly
michael@0 787 */
michael@0 788 private void handleAttributeValue() throws SAXException {
michael@0 789 if (metaState != A) {
michael@0 790 return;
michael@0 791 }
michael@0 792 if (contentIndex == CONTENT.length && content == null) {
michael@0 793 content = Portability.newStringFromBuffer(strBuf, 0, strBufLen);
michael@0 794 return;
michael@0 795 }
michael@0 796 if (charsetIndex == CHARSET.length && charset == null) {
michael@0 797 charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen);
michael@0 798 return;
michael@0 799 }
michael@0 800 if (httpEquivIndex == HTTP_EQUIV.length
michael@0 801 && httpEquivState == HTTP_EQUIV_NOT_SEEN) {
// The value counts as "content-type" only if every one of its characters
// matched and the match reached the end of the constant.
michael@0 802 httpEquivState = (contentTypeIndex == CONTENT_TYPE.length) ? HTTP_EQUIV_CONTENT_TYPE
michael@0 803 : HTTP_EQUIV_OTHER;
michael@0 804 return;
michael@0 805 }
michael@0 806 }
michael@0 807
/**
 * Called when a tag ends. Evaluates what was captured for the tag through
 * handleTagInner(), then releases and clears the content and charset strings
 * and resets httpEquivState, so nothing captured carries over to the next tag.
 *
 * @return the result of handleTagInner(); stateLoop() stops scanning when
 *         this is true
 * @throws SAXException if handleTagInner() throws it
 */
michael@0 808 private boolean handleTag() throws SAXException {
michael@0 809 boolean stop = handleTagInner();
michael@0 810 Portability.releaseString(content);
michael@0 811 content = null;
michael@0 812 Portability.releaseString(charset);
michael@0 813 charset = null;
michael@0 814 httpEquivState = HTTP_EQUIV_NOT_SEEN;
michael@0 815 return stop;
michael@0 816 }
michael@0 817
/**
 * Decides whether the tag that just ended yields a usable encoding.
 *
 * A captured charset attribute is tried first. If there is none, or
 * tryCharset() rejects it, and the tag had http-equiv with the value
 * "content-type" plus a content attribute, the charset is extracted from the
 * content value with TreeBuilder.extractCharsetFromContent() and tried.
 *
 * @return true if tryCharset() accepted an encoding, false otherwise
 * @throws SAXException if tryCharset() throws it
 */
michael@0 818 private boolean handleTagInner() throws SAXException {
michael@0 819 if (charset != null && tryCharset(charset)) {
michael@0 820 return true;
michael@0 821 }
michael@0 822 if (content != null && httpEquivState == HTTP_EQUIV_CONTENT_TYPE) {
michael@0 823 String extract = TreeBuilder.extractCharsetFromContent(content);
michael@0 824 if (extract == null) {
michael@0 825 return false;
michael@0 826 }
michael@0 827 boolean success = tryCharset(extract);
// The extracted string is a temporary; release it whatever the outcome.
michael@0 828 Portability.releaseString(extract);
michael@0 829 return success;
michael@0 830 }
michael@0 831 return false;
michael@0 832 }
michael@0 833
michael@0 834 /**
michael@0 835 * Tries to switch to an encoding. Implemented by subclasses.
michael@0 836 *
michael@0 837 * @param encoding the encoding name taken from the charset attribute or extracted from the content attribute
michael@0 838 * @return <code>true</code> if successful; this makes stateLoop() stop scanning
michael@0 839 * @throws SAXException
michael@0 840 */
michael@0 841 protected abstract boolean tryCharset(String encoding) throws SAXException;
michael@0 842
michael@0 843 }

mercurial