|
1 /* |
|
2 * ==================================================================== |
|
3 * Licensed to the Apache Software Foundation (ASF) under one |
|
4 * or more contributor license agreements. See the NOTICE file |
|
5 * distributed with this work for additional information |
|
6 * regarding copyright ownership. The ASF licenses this file |
|
7 * to you under the Apache License, Version 2.0 (the |
|
8 * "License"); you may not use this file except in compliance |
|
9 * with the License. You may obtain a copy of the License at |
|
10 * |
|
11 * http://www.apache.org/licenses/LICENSE-2.0 |
|
12 * |
|
13 * Unless required by applicable law or agreed to in writing, |
|
14 * software distributed under the License is distributed on an |
|
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
|
16 * KIND, either express or implied. See the License for the |
|
17 * specific language governing permissions and limitations |
|
18 * under the License. |
|
19 * ==================================================================== |
|
20 * |
|
21 * This software consists of voluntary contributions made by many |
|
22 * individuals on behalf of the Apache Software Foundation. For more |
|
23 * information on the Apache Software Foundation, please see |
|
24 * <http://www.apache.org/>. |
|
25 * |
|
26 */ |
|
27 |
|
28 package ch.boye.httpclientandroidlib.message; |
|
29 |
|
30 import java.util.NoSuchElementException; |
|
31 |
|
32 import ch.boye.httpclientandroidlib.HeaderIterator; |
|
33 import ch.boye.httpclientandroidlib.ParseException; |
|
34 import ch.boye.httpclientandroidlib.TokenIterator; |
|
35 |
|
36 /** |
|
37 * Basic implementation of a {@link TokenIterator}. |
|
38 * This implementation parses <tt>#token<tt> sequences as |
|
39 * defined by RFC 2616, section 2. |
|
40 * It extends that definition somewhat beyond US-ASCII. |
|
41 * |
|
42 * @since 4.0 |
|
43 */ |
|
44 public class BasicTokenIterator implements TokenIterator { |
|
45 |
|
46 /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */ |
|
47 // the order of the characters here is adjusted to put the |
|
48 // most likely candidates at the beginning of the collection |
|
49 public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t"; |
|
50 |
|
51 |
|
52 /** The iterator from which to obtain the next header. */ |
|
53 protected final HeaderIterator headerIt; |
|
54 |
|
55 /** |
|
56 * The value of the current header. |
|
57 * This is the header value that includes {@link #currentToken}. |
|
58 * Undefined if the iteration is over. |
|
59 */ |
|
60 protected String currentHeader; |
|
61 |
|
62 /** |
|
63 * The token to be returned by the next call to {@link #currentToken}. |
|
64 * <code>null</code> if the iteration is over. |
|
65 */ |
|
66 protected String currentToken; |
|
67 |
|
68 /** |
|
69 * The position after {@link #currentToken} in {@link #currentHeader}. |
|
70 * Undefined if the iteration is over. |
|
71 */ |
|
72 protected int searchPos; |
|
73 |
|
74 |
|
75 /** |
|
76 * Creates a new instance of {@link BasicTokenIterator}. |
|
77 * |
|
78 * @param headerIterator the iterator for the headers to tokenize |
|
79 */ |
|
80 public BasicTokenIterator(final HeaderIterator headerIterator) { |
|
81 if (headerIterator == null) { |
|
82 throw new IllegalArgumentException |
|
83 ("Header iterator must not be null."); |
|
84 } |
|
85 |
|
86 this.headerIt = headerIterator; |
|
87 this.searchPos = findNext(-1); |
|
88 } |
|
89 |
|
90 |
|
91 // non-javadoc, see interface TokenIterator |
|
92 public boolean hasNext() { |
|
93 return (this.currentToken != null); |
|
94 } |
|
95 |
|
96 |
|
97 /** |
|
98 * Obtains the next token from this iteration. |
|
99 * |
|
100 * @return the next token in this iteration |
|
101 * |
|
102 * @throws NoSuchElementException if the iteration is already over |
|
103 * @throws ParseException if an invalid header value is encountered |
|
104 */ |
|
105 public String nextToken() |
|
106 throws NoSuchElementException, ParseException { |
|
107 |
|
108 if (this.currentToken == null) { |
|
109 throw new NoSuchElementException("Iteration already finished."); |
|
110 } |
|
111 |
|
112 final String result = this.currentToken; |
|
113 // updates currentToken, may trigger ParseException: |
|
114 this.searchPos = findNext(this.searchPos); |
|
115 |
|
116 return result; |
|
117 } |
|
118 |
|
119 |
|
120 /** |
|
121 * Returns the next token. |
|
122 * Same as {@link #nextToken}, but with generic return type. |
|
123 * |
|
124 * @return the next token in this iteration |
|
125 * |
|
126 * @throws NoSuchElementException if there are no more tokens |
|
127 * @throws ParseException if an invalid header value is encountered |
|
128 */ |
|
129 public final Object next() |
|
130 throws NoSuchElementException, ParseException { |
|
131 return nextToken(); |
|
132 } |
|
133 |
|
134 |
|
135 /** |
|
136 * Removing tokens is not supported. |
|
137 * |
|
138 * @throws UnsupportedOperationException always |
|
139 */ |
|
140 public final void remove() |
|
141 throws UnsupportedOperationException { |
|
142 |
|
143 throw new UnsupportedOperationException |
|
144 ("Removing tokens is not supported."); |
|
145 } |
|
146 |
|
147 |
|
148 /** |
|
149 * Determines the next token. |
|
150 * If found, the token is stored in {@link #currentToken}. |
|
151 * The return value indicates the position after the token |
|
152 * in {@link #currentHeader}. If necessary, the next header |
|
153 * will be obtained from {@link #headerIt}. |
|
154 * If not found, {@link #currentToken} is set to <code>null</code>. |
|
155 * |
|
156 * @param from the position in the current header at which to |
|
157 * start the search, -1 to search in the first header |
|
158 * |
|
159 * @return the position after the found token in the current header, or |
|
160 * negative if there was no next token |
|
161 * |
|
162 * @throws ParseException if an invalid header value is encountered |
|
163 */ |
|
164 protected int findNext(int from) |
|
165 throws ParseException { |
|
166 |
|
167 if (from < 0) { |
|
168 // called from the constructor, initialize the first header |
|
169 if (!this.headerIt.hasNext()) { |
|
170 return -1; |
|
171 } |
|
172 this.currentHeader = this.headerIt.nextHeader().getValue(); |
|
173 from = 0; |
|
174 } else { |
|
175 // called after a token, make sure there is a separator |
|
176 from = findTokenSeparator(from); |
|
177 } |
|
178 |
|
179 int start = findTokenStart(from); |
|
180 if (start < 0) { |
|
181 this.currentToken = null; |
|
182 return -1; // nothing found |
|
183 } |
|
184 |
|
185 int end = findTokenEnd(start); |
|
186 this.currentToken = createToken(this.currentHeader, start, end); |
|
187 return end; |
|
188 } |
|
189 |
|
190 |
|
191 /** |
|
192 * Creates a new token to be returned. |
|
193 * Called from {@link #findNext findNext} after the token is identified. |
|
194 * The default implementation simply calls |
|
195 * {@link java.lang.String#substring String.substring}. |
|
196 * <br/> |
|
197 * If header values are significantly longer than tokens, and some |
|
198 * tokens are permanently referenced by the application, there can |
|
199 * be problems with garbage collection. A substring will hold a |
|
200 * reference to the full characters of the original string and |
|
201 * therefore occupies more memory than might be expected. |
|
202 * To avoid this, override this method and create a new string |
|
203 * instead of a substring. |
|
204 * |
|
205 * @param value the full header value from which to create a token |
|
206 * @param start the index of the first token character |
|
207 * @param end the index after the last token character |
|
208 * |
|
209 * @return a string representing the token identified by the arguments |
|
210 */ |
|
211 protected String createToken(String value, int start, int end) { |
|
212 return value.substring(start, end); |
|
213 } |
|
214 |
|
215 |
|
216 /** |
|
217 * Determines the starting position of the next token. |
|
218 * This method will iterate over headers if necessary. |
|
219 * |
|
220 * @param from the position in the current header at which to |
|
221 * start the search |
|
222 * |
|
223 * @return the position of the token start in the current header, |
|
224 * negative if no token start could be found |
|
225 */ |
|
226 protected int findTokenStart(int from) { |
|
227 if (from < 0) { |
|
228 throw new IllegalArgumentException |
|
229 ("Search position must not be negative: " + from); |
|
230 } |
|
231 |
|
232 boolean found = false; |
|
233 while (!found && (this.currentHeader != null)) { |
|
234 |
|
235 final int to = this.currentHeader.length(); |
|
236 while (!found && (from < to)) { |
|
237 |
|
238 final char ch = this.currentHeader.charAt(from); |
|
239 if (isTokenSeparator(ch) || isWhitespace(ch)) { |
|
240 // whitspace and token separators are skipped |
|
241 from++; |
|
242 } else if (isTokenChar(this.currentHeader.charAt(from))) { |
|
243 // found the start of a token |
|
244 found = true; |
|
245 } else { |
|
246 throw new ParseException |
|
247 ("Invalid character before token (pos " + from + |
|
248 "): " + this.currentHeader); |
|
249 } |
|
250 } |
|
251 if (!found) { |
|
252 if (this.headerIt.hasNext()) { |
|
253 this.currentHeader = this.headerIt.nextHeader().getValue(); |
|
254 from = 0; |
|
255 } else { |
|
256 this.currentHeader = null; |
|
257 } |
|
258 } |
|
259 } // while headers |
|
260 |
|
261 return found ? from : -1; |
|
262 } |
|
263 |
|
264 |
|
265 /** |
|
266 * Determines the position of the next token separator. |
|
267 * Because of multi-header joining rules, the end of a |
|
268 * header value is a token separator. This method does |
|
269 * therefore not need to iterate over headers. |
|
270 * |
|
271 * @param from the position in the current header at which to |
|
272 * start the search |
|
273 * |
|
274 * @return the position of a token separator in the current header, |
|
275 * or at the end |
|
276 * |
|
277 * @throws ParseException |
|
278 * if a new token is found before a token separator. |
|
279 * RFC 2616, section 2.1 explicitly requires a comma between |
|
280 * tokens for <tt>#</tt>. |
|
281 */ |
|
282 protected int findTokenSeparator(int from) { |
|
283 if (from < 0) { |
|
284 throw new IllegalArgumentException |
|
285 ("Search position must not be negative: " + from); |
|
286 } |
|
287 |
|
288 boolean found = false; |
|
289 final int to = this.currentHeader.length(); |
|
290 while (!found && (from < to)) { |
|
291 final char ch = this.currentHeader.charAt(from); |
|
292 if (isTokenSeparator(ch)) { |
|
293 found = true; |
|
294 } else if (isWhitespace(ch)) { |
|
295 from++; |
|
296 } else if (isTokenChar(ch)) { |
|
297 throw new ParseException |
|
298 ("Tokens without separator (pos " + from + |
|
299 "): " + this.currentHeader); |
|
300 } else { |
|
301 throw new ParseException |
|
302 ("Invalid character after token (pos " + from + |
|
303 "): " + this.currentHeader); |
|
304 } |
|
305 } |
|
306 |
|
307 return from; |
|
308 } |
|
309 |
|
310 |
|
311 /** |
|
312 * Determines the ending position of the current token. |
|
313 * This method will not leave the current header value, |
|
314 * since the end of the header value is a token boundary. |
|
315 * |
|
316 * @param from the position of the first character of the token |
|
317 * |
|
318 * @return the position after the last character of the token. |
|
319 * The behavior is undefined if <code>from</code> does not |
|
320 * point to a token character in the current header value. |
|
321 */ |
|
322 protected int findTokenEnd(int from) { |
|
323 if (from < 0) { |
|
324 throw new IllegalArgumentException |
|
325 ("Token start position must not be negative: " + from); |
|
326 } |
|
327 |
|
328 final int to = this.currentHeader.length(); |
|
329 int end = from+1; |
|
330 while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) { |
|
331 end++; |
|
332 } |
|
333 |
|
334 return end; |
|
335 } |
|
336 |
|
337 |
|
338 /** |
|
339 * Checks whether a character is a token separator. |
|
340 * RFC 2616, section 2.1 defines comma as the separator for |
|
341 * <tt>#token</tt> sequences. The end of a header value will |
|
342 * also separate tokens, but that is not a character check. |
|
343 * |
|
344 * @param ch the character to check |
|
345 * |
|
346 * @return <code>true</code> if the character is a token separator, |
|
347 * <code>false</code> otherwise |
|
348 */ |
|
349 protected boolean isTokenSeparator(char ch) { |
|
350 return (ch == ','); |
|
351 } |
|
352 |
|
353 |
|
354 /** |
|
355 * Checks whether a character is a whitespace character. |
|
356 * RFC 2616, section 2.2 defines space and horizontal tab as whitespace. |
|
357 * The optional preceeding line break is irrelevant, since header |
|
358 * continuation is handled transparently when parsing messages. |
|
359 * |
|
360 * @param ch the character to check |
|
361 * |
|
362 * @return <code>true</code> if the character is whitespace, |
|
363 * <code>false</code> otherwise |
|
364 */ |
|
365 protected boolean isWhitespace(char ch) { |
|
366 |
|
367 // we do not use Character.isWhitspace(ch) here, since that allows |
|
368 // many control characters which are not whitespace as per RFC 2616 |
|
369 return ((ch == '\t') || Character.isSpaceChar(ch)); |
|
370 } |
|
371 |
|
372 |
|
373 /** |
|
374 * Checks whether a character is a valid token character. |
|
375 * Whitespace, control characters, and HTTP separators are not |
|
376 * valid token characters. The HTTP specification (RFC 2616, section 2.2) |
|
377 * defines tokens only for the US-ASCII character set, this |
|
378 * method extends the definition to other character sets. |
|
379 * |
|
380 * @param ch the character to check |
|
381 * |
|
382 * @return <code>true</code> if the character is a valid token start, |
|
383 * <code>false</code> otherwise |
|
384 */ |
|
385 protected boolean isTokenChar(char ch) { |
|
386 |
|
387 // common sense extension of ALPHA + DIGIT |
|
388 if (Character.isLetterOrDigit(ch)) |
|
389 return true; |
|
390 |
|
391 // common sense extension of CTL |
|
392 if (Character.isISOControl(ch)) |
|
393 return false; |
|
394 |
|
395 // no common sense extension for this |
|
396 if (isHttpSeparator(ch)) |
|
397 return false; |
|
398 |
|
399 // RFC 2616, section 2.2 defines a token character as |
|
400 // "any CHAR except CTLs or separators". The controls |
|
401 // and separators are included in the checks above. |
|
402 // This will yield unexpected results for Unicode format characters. |
|
403 // If that is a problem, overwrite isHttpSeparator(char) to filter |
|
404 // out the false positives. |
|
405 return true; |
|
406 } |
|
407 |
|
408 |
|
409 /** |
|
410 * Checks whether a character is an HTTP separator. |
|
411 * The implementation in this class checks only for the HTTP separators |
|
412 * defined in RFC 2616, section 2.2. If you need to detect other |
|
413 * separators beyond the US-ASCII character set, override this method. |
|
414 * |
|
415 * @param ch the character to check |
|
416 * |
|
417 * @return <code>true</code> if the character is an HTTP separator |
|
418 */ |
|
419 protected boolean isHttpSeparator(char ch) { |
|
420 return (HTTP_SEPARATORS.indexOf(ch) >= 0); |
|
421 } |
|
422 |
|
423 |
|
424 } // class BasicTokenIterator |
|
425 |