|
1 /* |
|
2 * Copyright (c) 2007 Henri Sivonen |
|
3 * Copyright (c) 2008-2010 Mozilla Foundation |
|
4 * |
|
5 * Permission is hereby granted, free of charge, to any person obtaining a |
|
6 * copy of this software and associated documentation files (the "Software"), |
|
7 * to deal in the Software without restriction, including without limitation |
|
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
|
9 * and/or sell copies of the Software, and to permit persons to whom the |
|
10 * Software is furnished to do so, subject to the following conditions: |
|
11 * |
|
12 * The above copyright notice and this permission notice shall be included in |
|
13 * all copies or substantial portions of the Software. |
|
14 * |
|
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
|
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|
21 * DEALINGS IN THE SOFTWARE. |
|
22 */ |
|
23 |
|
24 package nu.validator.htmlparser.impl; |
|
25 |
|
26 import java.io.IOException; |
|
27 |
|
28 import nu.validator.htmlparser.annotation.Auto; |
|
29 import nu.validator.htmlparser.annotation.Inline; |
|
30 import nu.validator.htmlparser.common.ByteReadable; |
|
31 |
|
32 import org.xml.sax.SAXException; |
|
33 |
|
34 public abstract class MetaScanner { |
|
35 |
|
36 /** |
|
37 * Constant for "charset". |
|
38 */ |
|
39 private static final char[] CHARSET = { 'h', 'a', 'r', 's', 'e', 't' }; |
|
40 |
|
41 /** |
|
42 * Constant for "content". |
|
43 */ |
|
44 private static final char[] CONTENT = { 'o', 'n', 't', 'e', 'n', 't' }; |
|
45 |
|
46 /** |
|
47 * Constant for "http-equiv". |
|
48 */ |
|
49 private static final char[] HTTP_EQUIV = { 't', 't', 'p', '-', 'e', 'q', |
|
50 'u', 'i', 'v' }; |
|
51 |
|
52 /** |
|
53 * Constant for "content-type". |
|
54 */ |
|
55 private static final char[] CONTENT_TYPE = { 'c', 'o', 'n', 't', 'e', 'n', |
|
56 't', '-', 't', 'y', 'p', 'e' }; |
|
57 |
|
58 private static final int NO = 0; |
|
59 |
|
60 private static final int M = 1; |
|
61 |
|
62 private static final int E = 2; |
|
63 |
|
64 private static final int T = 3; |
|
65 |
|
66 private static final int A = 4; |
|
67 |
|
68 private static final int DATA = 0; |
|
69 |
|
70 private static final int TAG_OPEN = 1; |
|
71 |
|
72 private static final int SCAN_UNTIL_GT = 2; |
|
73 |
|
74 private static final int TAG_NAME = 3; |
|
75 |
|
76 private static final int BEFORE_ATTRIBUTE_NAME = 4; |
|
77 |
|
78 private static final int ATTRIBUTE_NAME = 5; |
|
79 |
|
80 private static final int AFTER_ATTRIBUTE_NAME = 6; |
|
81 |
|
82 private static final int BEFORE_ATTRIBUTE_VALUE = 7; |
|
83 |
|
84 private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 8; |
|
85 |
|
86 private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 9; |
|
87 |
|
88 private static final int ATTRIBUTE_VALUE_UNQUOTED = 10; |
|
89 |
|
90 private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 11; |
|
91 |
|
92 private static final int MARKUP_DECLARATION_OPEN = 13; |
|
93 |
|
94 private static final int MARKUP_DECLARATION_HYPHEN = 14; |
|
95 |
|
96 private static final int COMMENT_START = 15; |
|
97 |
|
98 private static final int COMMENT_START_DASH = 16; |
|
99 |
|
100 private static final int COMMENT = 17; |
|
101 |
|
102 private static final int COMMENT_END_DASH = 18; |
|
103 |
|
104 private static final int COMMENT_END = 19; |
|
105 |
|
106 private static final int SELF_CLOSING_START_TAG = 20; |
|
107 |
|
108 private static final int HTTP_EQUIV_NOT_SEEN = 0; |
|
109 |
|
110 private static final int HTTP_EQUIV_CONTENT_TYPE = 1; |
|
111 |
|
112 private static final int HTTP_EQUIV_OTHER = 2; |
|
113 |
|
114 /** |
|
115 * The data source. |
|
116 */ |
|
117 protected ByteReadable readable; |
|
118 |
|
119 /** |
|
120 * The state of the state machine that recognizes the tag name "meta". |
|
121 */ |
|
122 private int metaState = NO; |
|
123 |
|
124 /** |
|
125 * The current position in recognizing the attribute name "content". |
|
126 */ |
|
127 private int contentIndex = Integer.MAX_VALUE; |
|
128 |
|
129 /** |
|
130 * The current position in recognizing the attribute name "charset". |
|
131 */ |
|
132 private int charsetIndex = Integer.MAX_VALUE; |
|
133 |
|
134 /** |
|
135 * The current position in recognizing the attribute name "http-equive". |
|
136 */ |
|
137 private int httpEquivIndex = Integer.MAX_VALUE; |
|
138 |
|
139 /** |
|
140 * The current position in recognizing the attribute value "content-type". |
|
141 */ |
|
142 private int contentTypeIndex = Integer.MAX_VALUE; |
|
143 |
|
144 /** |
|
145 * The tokenizer state. |
|
146 */ |
|
147 protected int stateSave = DATA; |
|
148 |
|
149 /** |
|
150 * The currently filled length of strBuf. |
|
151 */ |
|
152 private int strBufLen; |
|
153 |
|
154 /** |
|
155 * Accumulation buffer for attribute values. |
|
156 */ |
|
157 private @Auto char[] strBuf; |
|
158 |
|
159 private String content; |
|
160 |
|
161 private String charset; |
|
162 |
|
163 private int httpEquivState; |
|
164 |
|
/**
 * Creates a scanner with no data source attached, the tokenizer in the
 * DATA state, every name/value recognizer disabled and an empty
 * 36-character accumulation buffer.
 */
public MetaScanner() {
    // No input yet.
    readable = null;

    // Tokenizer and "meta" tag-name recognizer start from scratch.
    stateSave = DATA;
    metaState = NO;

    // Integer.MAX_VALUE marks a recognizer as "cannot match".
    contentIndex = Integer.MAX_VALUE;
    charsetIndex = Integer.MAX_VALUE;
    httpEquivIndex = Integer.MAX_VALUE;
    contentTypeIndex = Integer.MAX_VALUE;

    // Attribute value accumulation.
    strBuf = new char[36];
    strBufLen = 0;

    // Nothing recorded for the current tag.
    content = null;
    charset = null;
    httpEquivState = HTTP_EQUIV_NOT_SEEN;
}
|
179 |
|
180 @SuppressWarnings("unused") private void destructor() { |
|
181 Portability.releaseString(content); |
|
182 Portability.releaseString(charset); |
|
183 } |
|
184 |
|
185 // [NOCPP[ |
|
186 |
|
187 /** |
|
188 * Reads a byte from the data source. |
|
189 * |
|
190 * -1 means end. |
|
191 * @return |
|
192 * @throws IOException |
|
193 */ |
|
194 protected int read() throws IOException { |
|
195 return readable.readByte(); |
|
196 } |
|
197 |
|
198 // ]NOCPP] |
|
199 |
|
200 // WARNING When editing this, makes sure the bytecode length shown by javap |
|
201 // stays under 8000 bytes! |
|
202 /** |
|
203 * The runs the meta scanning algorithm. |
|
204 */ |
|
205 protected final void stateLoop(int state) |
|
206 throws SAXException, IOException { |
|
207 int c = -1; |
|
208 boolean reconsume = false; |
|
209 stateloop: for (;;) { |
|
210 switch (state) { |
|
211 case DATA: |
|
212 dataloop: for (;;) { |
|
213 if (reconsume) { |
|
214 reconsume = false; |
|
215 } else { |
|
216 c = read(); |
|
217 } |
|
218 switch (c) { |
|
219 case -1: |
|
220 break stateloop; |
|
221 case '<': |
|
222 state = MetaScanner.TAG_OPEN; |
|
223 break dataloop; // FALL THROUGH continue |
|
224 // stateloop; |
|
225 default: |
|
226 continue; |
|
227 } |
|
228 } |
|
229 // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER |
|
230 case TAG_OPEN: |
|
231 tagopenloop: for (;;) { |
|
232 c = read(); |
|
233 switch (c) { |
|
234 case -1: |
|
235 break stateloop; |
|
236 case 'm': |
|
237 case 'M': |
|
238 metaState = M; |
|
239 state = MetaScanner.TAG_NAME; |
|
240 break tagopenloop; |
|
241 // continue stateloop; |
|
242 case '!': |
|
243 state = MetaScanner.MARKUP_DECLARATION_OPEN; |
|
244 continue stateloop; |
|
245 case '?': |
|
246 case '/': |
|
247 state = MetaScanner.SCAN_UNTIL_GT; |
|
248 continue stateloop; |
|
249 case '>': |
|
250 state = MetaScanner.DATA; |
|
251 continue stateloop; |
|
252 default: |
|
253 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { |
|
254 metaState = NO; |
|
255 state = MetaScanner.TAG_NAME; |
|
256 break tagopenloop; |
|
257 // continue stateloop; |
|
258 } |
|
259 state = MetaScanner.DATA; |
|
260 reconsume = true; |
|
261 continue stateloop; |
|
262 } |
|
263 } |
|
264 // FALL THROUGH DON'T REORDER |
|
265 case TAG_NAME: |
|
266 tagnameloop: for (;;) { |
|
267 c = read(); |
|
268 switch (c) { |
|
269 case -1: |
|
270 break stateloop; |
|
271 case ' ': |
|
272 case '\t': |
|
273 case '\n': |
|
274 case '\u000C': |
|
275 state = MetaScanner.BEFORE_ATTRIBUTE_NAME; |
|
276 break tagnameloop; |
|
277 // continue stateloop; |
|
278 case '/': |
|
279 state = MetaScanner.SELF_CLOSING_START_TAG; |
|
280 continue stateloop; |
|
281 case '>': |
|
282 state = MetaScanner.DATA; |
|
283 continue stateloop; |
|
284 case 'e': |
|
285 case 'E': |
|
286 if (metaState == M) { |
|
287 metaState = E; |
|
288 } else { |
|
289 metaState = NO; |
|
290 } |
|
291 continue; |
|
292 case 't': |
|
293 case 'T': |
|
294 if (metaState == E) { |
|
295 metaState = T; |
|
296 } else { |
|
297 metaState = NO; |
|
298 } |
|
299 continue; |
|
300 case 'a': |
|
301 case 'A': |
|
302 if (metaState == T) { |
|
303 metaState = A; |
|
304 } else { |
|
305 metaState = NO; |
|
306 } |
|
307 continue; |
|
308 default: |
|
309 metaState = NO; |
|
310 continue; |
|
311 } |
|
312 } |
|
313 // FALLTHRU DON'T REORDER |
|
314 case BEFORE_ATTRIBUTE_NAME: |
|
315 beforeattributenameloop: for (;;) { |
|
316 if (reconsume) { |
|
317 reconsume = false; |
|
318 } else { |
|
319 c = read(); |
|
320 } |
|
321 /* |
|
322 * Consume the next input character: |
|
323 */ |
|
324 switch (c) { |
|
325 case -1: |
|
326 break stateloop; |
|
327 case ' ': |
|
328 case '\t': |
|
329 case '\n': |
|
330 case '\u000C': |
|
331 continue; |
|
332 case '/': |
|
333 state = MetaScanner.SELF_CLOSING_START_TAG; |
|
334 continue stateloop; |
|
335 case '>': |
|
336 if (handleTag()) { |
|
337 break stateloop; |
|
338 } |
|
339 state = DATA; |
|
340 continue stateloop; |
|
341 case 'c': |
|
342 case 'C': |
|
343 contentIndex = 0; |
|
344 charsetIndex = 0; |
|
345 httpEquivIndex = Integer.MAX_VALUE; |
|
346 contentTypeIndex = Integer.MAX_VALUE; |
|
347 state = MetaScanner.ATTRIBUTE_NAME; |
|
348 break beforeattributenameloop; |
|
349 case 'h': |
|
350 case 'H': |
|
351 contentIndex = Integer.MAX_VALUE; |
|
352 charsetIndex = Integer.MAX_VALUE; |
|
353 httpEquivIndex = 0; |
|
354 contentTypeIndex = Integer.MAX_VALUE; |
|
355 state = MetaScanner.ATTRIBUTE_NAME; |
|
356 break beforeattributenameloop; |
|
357 default: |
|
358 contentIndex = Integer.MAX_VALUE; |
|
359 charsetIndex = Integer.MAX_VALUE; |
|
360 httpEquivIndex = Integer.MAX_VALUE; |
|
361 contentTypeIndex = Integer.MAX_VALUE; |
|
362 state = MetaScanner.ATTRIBUTE_NAME; |
|
363 break beforeattributenameloop; |
|
364 // continue stateloop; |
|
365 } |
|
366 } |
|
367 // FALLTHRU DON'T REORDER |
|
368 case ATTRIBUTE_NAME: |
|
369 attributenameloop: for (;;) { |
|
370 c = read(); |
|
371 switch (c) { |
|
372 case -1: |
|
373 break stateloop; |
|
374 case ' ': |
|
375 case '\t': |
|
376 case '\n': |
|
377 case '\u000C': |
|
378 state = MetaScanner.AFTER_ATTRIBUTE_NAME; |
|
379 continue stateloop; |
|
380 case '/': |
|
381 state = MetaScanner.SELF_CLOSING_START_TAG; |
|
382 continue stateloop; |
|
383 case '=': |
|
384 strBufLen = 0; |
|
385 contentTypeIndex = 0; |
|
386 state = MetaScanner.BEFORE_ATTRIBUTE_VALUE; |
|
387 break attributenameloop; |
|
388 // continue stateloop; |
|
389 case '>': |
|
390 if (handleTag()) { |
|
391 break stateloop; |
|
392 } |
|
393 state = MetaScanner.DATA; |
|
394 continue stateloop; |
|
395 default: |
|
396 if (metaState == A) { |
|
397 if (c >= 'A' && c <= 'Z') { |
|
398 c += 0x20; |
|
399 } |
|
400 if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) { |
|
401 ++contentIndex; |
|
402 } else { |
|
403 contentIndex = Integer.MAX_VALUE; |
|
404 } |
|
405 if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) { |
|
406 ++charsetIndex; |
|
407 } else { |
|
408 charsetIndex = Integer.MAX_VALUE; |
|
409 } |
|
410 if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) { |
|
411 ++httpEquivIndex; |
|
412 } else { |
|
413 httpEquivIndex = Integer.MAX_VALUE; |
|
414 } |
|
415 } |
|
416 continue; |
|
417 } |
|
418 } |
|
419 // FALLTHRU DON'T REORDER |
|
420 case BEFORE_ATTRIBUTE_VALUE: |
|
421 beforeattributevalueloop: for (;;) { |
|
422 c = read(); |
|
423 switch (c) { |
|
424 case -1: |
|
425 break stateloop; |
|
426 case ' ': |
|
427 case '\t': |
|
428 case '\n': |
|
429 case '\u000C': |
|
430 continue; |
|
431 case '"': |
|
432 state = MetaScanner.ATTRIBUTE_VALUE_DOUBLE_QUOTED; |
|
433 break beforeattributevalueloop; |
|
434 // continue stateloop; |
|
435 case '\'': |
|
436 state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED; |
|
437 continue stateloop; |
|
438 case '>': |
|
439 if (handleTag()) { |
|
440 break stateloop; |
|
441 } |
|
442 state = MetaScanner.DATA; |
|
443 continue stateloop; |
|
444 default: |
|
445 handleCharInAttributeValue(c); |
|
446 state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED; |
|
447 continue stateloop; |
|
448 } |
|
449 } |
|
450 // FALLTHRU DON'T REORDER |
|
451 case ATTRIBUTE_VALUE_DOUBLE_QUOTED: |
|
452 attributevaluedoublequotedloop: for (;;) { |
|
453 if (reconsume) { |
|
454 reconsume = false; |
|
455 } else { |
|
456 c = read(); |
|
457 } |
|
458 switch (c) { |
|
459 case -1: |
|
460 break stateloop; |
|
461 case '"': |
|
462 handleAttributeValue(); |
|
463 state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED; |
|
464 break attributevaluedoublequotedloop; |
|
465 // continue stateloop; |
|
466 default: |
|
467 handleCharInAttributeValue(c); |
|
468 continue; |
|
469 } |
|
470 } |
|
471 // FALLTHRU DON'T REORDER |
|
472 case AFTER_ATTRIBUTE_VALUE_QUOTED: |
|
473 afterattributevaluequotedloop: for (;;) { |
|
474 c = read(); |
|
475 switch (c) { |
|
476 case -1: |
|
477 break stateloop; |
|
478 case ' ': |
|
479 case '\t': |
|
480 case '\n': |
|
481 case '\u000C': |
|
482 state = MetaScanner.BEFORE_ATTRIBUTE_NAME; |
|
483 continue stateloop; |
|
484 case '/': |
|
485 state = MetaScanner.SELF_CLOSING_START_TAG; |
|
486 break afterattributevaluequotedloop; |
|
487 // continue stateloop; |
|
488 case '>': |
|
489 if (handleTag()) { |
|
490 break stateloop; |
|
491 } |
|
492 state = MetaScanner.DATA; |
|
493 continue stateloop; |
|
494 default: |
|
495 state = MetaScanner.BEFORE_ATTRIBUTE_NAME; |
|
496 reconsume = true; |
|
497 continue stateloop; |
|
498 } |
|
499 } |
|
500 // FALLTHRU DON'T REORDER |
|
501 case SELF_CLOSING_START_TAG: |
|
502 c = read(); |
|
503 switch (c) { |
|
504 case -1: |
|
505 break stateloop; |
|
506 case '>': |
|
507 if (handleTag()) { |
|
508 break stateloop; |
|
509 } |
|
510 state = MetaScanner.DATA; |
|
511 continue stateloop; |
|
512 default: |
|
513 state = MetaScanner.BEFORE_ATTRIBUTE_NAME; |
|
514 reconsume = true; |
|
515 continue stateloop; |
|
516 } |
|
517 // XXX reorder point |
|
518 case ATTRIBUTE_VALUE_UNQUOTED: |
|
519 for (;;) { |
|
520 if (reconsume) { |
|
521 reconsume = false; |
|
522 } else { |
|
523 c = read(); |
|
524 } |
|
525 switch (c) { |
|
526 case -1: |
|
527 break stateloop; |
|
528 case ' ': |
|
529 case '\t': |
|
530 case '\n': |
|
531 |
|
532 case '\u000C': |
|
533 handleAttributeValue(); |
|
534 state = MetaScanner.BEFORE_ATTRIBUTE_NAME; |
|
535 continue stateloop; |
|
536 case '>': |
|
537 handleAttributeValue(); |
|
538 if (handleTag()) { |
|
539 break stateloop; |
|
540 } |
|
541 state = MetaScanner.DATA; |
|
542 continue stateloop; |
|
543 default: |
|
544 handleCharInAttributeValue(c); |
|
545 continue; |
|
546 } |
|
547 } |
|
548 // XXX reorder point |
|
549 case AFTER_ATTRIBUTE_NAME: |
|
550 for (;;) { |
|
551 c = read(); |
|
552 switch (c) { |
|
553 case -1: |
|
554 break stateloop; |
|
555 case ' ': |
|
556 case '\t': |
|
557 case '\n': |
|
558 case '\u000C': |
|
559 continue; |
|
560 case '/': |
|
561 handleAttributeValue(); |
|
562 state = MetaScanner.SELF_CLOSING_START_TAG; |
|
563 continue stateloop; |
|
564 case '=': |
|
565 strBufLen = 0; |
|
566 contentTypeIndex = 0; |
|
567 state = MetaScanner.BEFORE_ATTRIBUTE_VALUE; |
|
568 continue stateloop; |
|
569 case '>': |
|
570 handleAttributeValue(); |
|
571 if (handleTag()) { |
|
572 break stateloop; |
|
573 } |
|
574 state = MetaScanner.DATA; |
|
575 continue stateloop; |
|
576 case 'c': |
|
577 case 'C': |
|
578 contentIndex = 0; |
|
579 charsetIndex = 0; |
|
580 state = MetaScanner.ATTRIBUTE_NAME; |
|
581 continue stateloop; |
|
582 default: |
|
583 contentIndex = Integer.MAX_VALUE; |
|
584 charsetIndex = Integer.MAX_VALUE; |
|
585 state = MetaScanner.ATTRIBUTE_NAME; |
|
586 continue stateloop; |
|
587 } |
|
588 } |
|
589 // XXX reorder point |
|
590 case MARKUP_DECLARATION_OPEN: |
|
591 markupdeclarationopenloop: for (;;) { |
|
592 c = read(); |
|
593 switch (c) { |
|
594 case -1: |
|
595 break stateloop; |
|
596 case '-': |
|
597 state = MetaScanner.MARKUP_DECLARATION_HYPHEN; |
|
598 break markupdeclarationopenloop; |
|
599 // continue stateloop; |
|
600 default: |
|
601 state = MetaScanner.SCAN_UNTIL_GT; |
|
602 reconsume = true; |
|
603 continue stateloop; |
|
604 } |
|
605 } |
|
606 // FALLTHRU DON'T REORDER |
|
607 case MARKUP_DECLARATION_HYPHEN: |
|
608 markupdeclarationhyphenloop: for (;;) { |
|
609 c = read(); |
|
610 switch (c) { |
|
611 case -1: |
|
612 break stateloop; |
|
613 case '-': |
|
614 state = MetaScanner.COMMENT_START; |
|
615 break markupdeclarationhyphenloop; |
|
616 // continue stateloop; |
|
617 default: |
|
618 state = MetaScanner.SCAN_UNTIL_GT; |
|
619 reconsume = true; |
|
620 continue stateloop; |
|
621 } |
|
622 } |
|
623 // FALLTHRU DON'T REORDER |
|
624 case COMMENT_START: |
|
625 commentstartloop: for (;;) { |
|
626 c = read(); |
|
627 switch (c) { |
|
628 case -1: |
|
629 break stateloop; |
|
630 case '-': |
|
631 state = MetaScanner.COMMENT_START_DASH; |
|
632 continue stateloop; |
|
633 case '>': |
|
634 state = MetaScanner.DATA; |
|
635 continue stateloop; |
|
636 default: |
|
637 state = MetaScanner.COMMENT; |
|
638 break commentstartloop; |
|
639 // continue stateloop; |
|
640 } |
|
641 } |
|
642 // FALLTHRU DON'T REORDER |
|
643 case COMMENT: |
|
644 commentloop: for (;;) { |
|
645 c = read(); |
|
646 switch (c) { |
|
647 case -1: |
|
648 break stateloop; |
|
649 case '-': |
|
650 state = MetaScanner.COMMENT_END_DASH; |
|
651 break commentloop; |
|
652 // continue stateloop; |
|
653 default: |
|
654 continue; |
|
655 } |
|
656 } |
|
657 // FALLTHRU DON'T REORDER |
|
658 case COMMENT_END_DASH: |
|
659 commentenddashloop: for (;;) { |
|
660 c = read(); |
|
661 switch (c) { |
|
662 case -1: |
|
663 break stateloop; |
|
664 case '-': |
|
665 state = MetaScanner.COMMENT_END; |
|
666 break commentenddashloop; |
|
667 // continue stateloop; |
|
668 default: |
|
669 state = MetaScanner.COMMENT; |
|
670 continue stateloop; |
|
671 } |
|
672 } |
|
673 // FALLTHRU DON'T REORDER |
|
674 case COMMENT_END: |
|
675 for (;;) { |
|
676 c = read(); |
|
677 switch (c) { |
|
678 case -1: |
|
679 break stateloop; |
|
680 case '>': |
|
681 state = MetaScanner.DATA; |
|
682 continue stateloop; |
|
683 case '-': |
|
684 continue; |
|
685 default: |
|
686 state = MetaScanner.COMMENT; |
|
687 continue stateloop; |
|
688 } |
|
689 } |
|
690 // XXX reorder point |
|
691 case COMMENT_START_DASH: |
|
692 c = read(); |
|
693 switch (c) { |
|
694 case -1: |
|
695 break stateloop; |
|
696 case '-': |
|
697 state = MetaScanner.COMMENT_END; |
|
698 continue stateloop; |
|
699 case '>': |
|
700 state = MetaScanner.DATA; |
|
701 continue stateloop; |
|
702 default: |
|
703 state = MetaScanner.COMMENT; |
|
704 continue stateloop; |
|
705 } |
|
706 // XXX reorder point |
|
707 case ATTRIBUTE_VALUE_SINGLE_QUOTED: |
|
708 for (;;) { |
|
709 if (reconsume) { |
|
710 reconsume = false; |
|
711 } else { |
|
712 c = read(); |
|
713 } |
|
714 switch (c) { |
|
715 case -1: |
|
716 break stateloop; |
|
717 case '\'': |
|
718 handleAttributeValue(); |
|
719 state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED; |
|
720 continue stateloop; |
|
721 default: |
|
722 handleCharInAttributeValue(c); |
|
723 continue; |
|
724 } |
|
725 } |
|
726 // XXX reorder point |
|
727 case SCAN_UNTIL_GT: |
|
728 for (;;) { |
|
729 if (reconsume) { |
|
730 reconsume = false; |
|
731 } else { |
|
732 c = read(); |
|
733 } |
|
734 switch (c) { |
|
735 case -1: |
|
736 break stateloop; |
|
737 case '>': |
|
738 state = MetaScanner.DATA; |
|
739 continue stateloop; |
|
740 default: |
|
741 continue; |
|
742 } |
|
743 } |
|
744 } |
|
745 } |
|
746 stateSave = state; |
|
747 } |
|
748 |
|
749 private void handleCharInAttributeValue(int c) { |
|
750 if (metaState == A) { |
|
751 if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) { |
|
752 addToBuffer(c); |
|
753 } else if (httpEquivIndex == HTTP_EQUIV.length) { |
|
754 if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) { |
|
755 ++contentTypeIndex; |
|
756 } else { |
|
757 contentTypeIndex = Integer.MAX_VALUE; |
|
758 } |
|
759 } |
|
760 } |
|
761 } |
|
762 |
|
763 @Inline private int toAsciiLowerCase(int c) { |
|
764 if (c >= 'A' && c <= 'Z') { |
|
765 return c + 0x20; |
|
766 } |
|
767 return c; |
|
768 } |
|
769 |
|
770 /** |
|
771 * Adds a character to the accumulation buffer. |
|
772 * @param c the character to add |
|
773 */ |
|
774 private void addToBuffer(int c) { |
|
775 if (strBufLen == strBuf.length) { |
|
776 char[] newBuf = new char[strBuf.length + (strBuf.length << 1)]; |
|
777 System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length); |
|
778 strBuf = newBuf; |
|
779 } |
|
780 strBuf[strBufLen++] = (char)c; |
|
781 } |
|
782 |
|
783 /** |
|
784 * Attempts to extract a charset name from the accumulation buffer. |
|
785 * @return <code>true</code> if successful |
|
786 * @throws SAXException |
|
787 */ |
|
788 private void handleAttributeValue() throws SAXException { |
|
789 if (metaState != A) { |
|
790 return; |
|
791 } |
|
792 if (contentIndex == CONTENT.length && content == null) { |
|
793 content = Portability.newStringFromBuffer(strBuf, 0, strBufLen); |
|
794 return; |
|
795 } |
|
796 if (charsetIndex == CHARSET.length && charset == null) { |
|
797 charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen); |
|
798 return; |
|
799 } |
|
800 if (httpEquivIndex == HTTP_EQUIV.length |
|
801 && httpEquivState == HTTP_EQUIV_NOT_SEEN) { |
|
802 httpEquivState = (contentTypeIndex == CONTENT_TYPE.length) ? HTTP_EQUIV_CONTENT_TYPE |
|
803 : HTTP_EQUIV_OTHER; |
|
804 return; |
|
805 } |
|
806 } |
|
807 |
|
808 private boolean handleTag() throws SAXException { |
|
809 boolean stop = handleTagInner(); |
|
810 Portability.releaseString(content); |
|
811 content = null; |
|
812 Portability.releaseString(charset); |
|
813 charset = null; |
|
814 httpEquivState = HTTP_EQUIV_NOT_SEEN; |
|
815 return stop; |
|
816 } |
|
817 |
|
818 private boolean handleTagInner() throws SAXException { |
|
819 if (charset != null && tryCharset(charset)) { |
|
820 return true; |
|
821 } |
|
822 if (content != null && httpEquivState == HTTP_EQUIV_CONTENT_TYPE) { |
|
823 String extract = TreeBuilder.extractCharsetFromContent(content); |
|
824 if (extract == null) { |
|
825 return false; |
|
826 } |
|
827 boolean success = tryCharset(extract); |
|
828 Portability.releaseString(extract); |
|
829 return success; |
|
830 } |
|
831 return false; |
|
832 } |
|
833 |
|
/**
 * Tries to switch to an encoding.
 *
 * Called with the value of a "charset" attribute of a "meta" tag, or with
 * the charset extracted from the "content" attribute of a "meta" tag
 * whose http-equiv value is "content-type".
 *
 * @param encoding the encoding name found in the markup
 * @return <code>true</code> if successful; the scanner stops scanning
 *         when this returns <code>true</code>
 * @throws SAXException NOTE(review): the conditions are defined by the
 *         implementing subclass -- confirm there
 */
protected abstract boolean tryCharset(String encoding) throws SAXException;
|
842 |
|
843 } |