michael@0: /*
michael@0: * ====================================================================
michael@0: * Licensed to the Apache Software Foundation (ASF) under one
michael@0: * or more contributor license agreements. See the NOTICE file
michael@0: * distributed with this work for additional information
michael@0: * regarding copyright ownership. The ASF licenses this file
michael@0: * to you under the Apache License, Version 2.0 (the
michael@0: * "License"); you may not use this file except in compliance
michael@0: * with the License. You may obtain a copy of the License at
michael@0: *
michael@0: * http://www.apache.org/licenses/LICENSE-2.0
michael@0: *
michael@0: * Unless required by applicable law or agreed to in writing,
michael@0: * software distributed under the License is distributed on an
michael@0: * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
michael@0: * KIND, either express or implied. See the License for the
michael@0: * specific language governing permissions and limitations
michael@0: * under the License.
michael@0: * ====================================================================
michael@0: *
michael@0: * This software consists of voluntary contributions made by many
michael@0: * individuals on behalf of the Apache Software Foundation. For more
michael@0: * information on the Apache Software Foundation, please see
michael@0: * <http://www.apache.org/>.
michael@0: *
michael@0: */
michael@0:
michael@0: package ch.boye.httpclientandroidlib.message;
michael@0:
michael@0: import java.util.NoSuchElementException;
michael@0:
michael@0: import ch.boye.httpclientandroidlib.HeaderIterator;
michael@0: import ch.boye.httpclientandroidlib.ParseException;
michael@0: import ch.boye.httpclientandroidlib.TokenIterator;
michael@0:
michael@0: /**
michael@0: * Basic implementation of a {@link TokenIterator}.
michael@0: * This implementation parses #token sequences as
michael@0: * defined by RFC 2616, section 2.
michael@0: * It extends that definition somewhat beyond US-ASCII.
michael@0: *
michael@0: * @since 4.0
michael@0: */
michael@0: public class BasicTokenIterator implements TokenIterator {
michael@0:
michael@0: /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
michael@0: // the order of the characters here is adjusted to put the
michael@0: // most likely candidates at the beginning of the collection
michael@0: public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";
michael@0:
michael@0:
michael@0: /** The iterator from which to obtain the next header. */
michael@0: protected final HeaderIterator headerIt;
michael@0:
michael@0: /**
michael@0: * The value of the current header.
michael@0: * This is the header value that includes {@link #currentToken}.
michael@0: * Undefined if the iteration is over.
michael@0: */
michael@0: protected String currentHeader;
michael@0:
michael@0: /**
michael@0: * The token to be returned by the next call to {@link #currentToken}.
michael@0: * null
if the iteration is over.
michael@0: */
michael@0: protected String currentToken;
michael@0:
michael@0: /**
michael@0: * The position after {@link #currentToken} in {@link #currentHeader}.
michael@0: * Undefined if the iteration is over.
michael@0: */
michael@0: protected int searchPos;
michael@0:
michael@0:
michael@0: /**
michael@0: * Creates a new instance of {@link BasicTokenIterator}.
michael@0: *
michael@0: * @param headerIterator the iterator for the headers to tokenize
michael@0: */
michael@0: public BasicTokenIterator(final HeaderIterator headerIterator) {
michael@0: if (headerIterator == null) {
michael@0: throw new IllegalArgumentException
michael@0: ("Header iterator must not be null.");
michael@0: }
michael@0:
michael@0: this.headerIt = headerIterator;
michael@0: this.searchPos = findNext(-1);
michael@0: }
michael@0:
michael@0:
michael@0: // non-javadoc, see interface TokenIterator
michael@0: public boolean hasNext() {
michael@0: return (this.currentToken != null);
michael@0: }
michael@0:
michael@0:
michael@0: /**
michael@0: * Obtains the next token from this iteration.
michael@0: *
michael@0: * @return the next token in this iteration
michael@0: *
michael@0: * @throws NoSuchElementException if the iteration is already over
michael@0: * @throws ParseException if an invalid header value is encountered
michael@0: */
michael@0: public String nextToken()
michael@0: throws NoSuchElementException, ParseException {
michael@0:
michael@0: if (this.currentToken == null) {
michael@0: throw new NoSuchElementException("Iteration already finished.");
michael@0: }
michael@0:
michael@0: final String result = this.currentToken;
michael@0: // updates currentToken, may trigger ParseException:
michael@0: this.searchPos = findNext(this.searchPos);
michael@0:
michael@0: return result;
michael@0: }
michael@0:
michael@0:
michael@0: /**
michael@0: * Returns the next token.
michael@0: * Same as {@link #nextToken}, but with generic return type.
michael@0: *
michael@0: * @return the next token in this iteration
michael@0: *
michael@0: * @throws NoSuchElementException if there are no more tokens
michael@0: * @throws ParseException if an invalid header value is encountered
michael@0: */
michael@0: public final Object next()
michael@0: throws NoSuchElementException, ParseException {
michael@0: return nextToken();
michael@0: }
michael@0:
michael@0:
michael@0: /**
michael@0: * Removing tokens is not supported.
michael@0: *
michael@0: * @throws UnsupportedOperationException always
michael@0: */
michael@0: public final void remove()
michael@0: throws UnsupportedOperationException {
michael@0:
michael@0: throw new UnsupportedOperationException
michael@0: ("Removing tokens is not supported.");
michael@0: }
michael@0:
michael@0:
michael@0: /**
michael@0: * Determines the next token.
michael@0: * If found, the token is stored in {@link #currentToken}.
michael@0: * The return value indicates the position after the token
michael@0: * in {@link #currentHeader}. If necessary, the next header
michael@0: * will be obtained from {@link #headerIt}.
michael@0: * If not found, {@link #currentToken} is set to null
.
michael@0: *
michael@0: * @param from the position in the current header at which to
michael@0: * start the search, -1 to search in the first header
michael@0: *
michael@0: * @return the position after the found token in the current header, or
michael@0: * negative if there was no next token
michael@0: *
michael@0: * @throws ParseException if an invalid header value is encountered
michael@0: */
michael@0: protected int findNext(int from)
michael@0: throws ParseException {
michael@0:
michael@0: if (from < 0) {
michael@0: // called from the constructor, initialize the first header
michael@0: if (!this.headerIt.hasNext()) {
michael@0: return -1;
michael@0: }
michael@0: this.currentHeader = this.headerIt.nextHeader().getValue();
michael@0: from = 0;
michael@0: } else {
michael@0: // called after a token, make sure there is a separator
michael@0: from = findTokenSeparator(from);
michael@0: }
michael@0:
michael@0: int start = findTokenStart(from);
michael@0: if (start < 0) {
michael@0: this.currentToken = null;
michael@0: return -1; // nothing found
michael@0: }
michael@0:
michael@0: int end = findTokenEnd(start);
michael@0: this.currentToken = createToken(this.currentHeader, start, end);
michael@0: return end;
michael@0: }
michael@0:
michael@0:
michael@0: /**
michael@0: * Creates a new token to be returned.
michael@0: * Called from {@link #findNext findNext} after the token is identified.
michael@0: * The default implementation simply calls
michael@0: * {@link java.lang.String#substring String.substring}.
michael@0: *
michael@0: * If header values are significantly longer than tokens, and some
michael@0: * tokens are permanently referenced by the application, there can
michael@0: * be problems with garbage collection. A substring will hold a
michael@0: * reference to the full characters of the original string and
michael@0: * therefore occupies more memory than might be expected.
michael@0: * To avoid this, override this method and create a new string
michael@0: * instead of a substring.
michael@0: *
michael@0: * @param value the full header value from which to create a token
michael@0: * @param start the index of the first token character
michael@0: * @param end the index after the last token character
michael@0: *
michael@0: * @return a string representing the token identified by the arguments
michael@0: */
michael@0: protected String createToken(String value, int start, int end) {
michael@0: return value.substring(start, end);
michael@0: }
michael@0:
michael@0:
michael@0: /**
michael@0: * Determines the starting position of the next token.
michael@0: * This method will iterate over headers if necessary.
michael@0: *
michael@0: * @param from the position in the current header at which to
michael@0: * start the search
michael@0: *
michael@0: * @return the position of the token start in the current header,
michael@0: * negative if no token start could be found
michael@0: */
michael@0: protected int findTokenStart(int from) {
michael@0: if (from < 0) {
michael@0: throw new IllegalArgumentException
michael@0: ("Search position must not be negative: " + from);
michael@0: }
michael@0:
michael@0: boolean found = false;
michael@0: while (!found && (this.currentHeader != null)) {
michael@0:
michael@0: final int to = this.currentHeader.length();
michael@0: while (!found && (from < to)) {
michael@0:
michael@0: final char ch = this.currentHeader.charAt(from);
michael@0: if (isTokenSeparator(ch) || isWhitespace(ch)) {
michael@0: // whitspace and token separators are skipped
michael@0: from++;
michael@0: } else if (isTokenChar(this.currentHeader.charAt(from))) {
michael@0: // found the start of a token
michael@0: found = true;
michael@0: } else {
michael@0: throw new ParseException
michael@0: ("Invalid character before token (pos " + from +
michael@0: "): " + this.currentHeader);
michael@0: }
michael@0: }
michael@0: if (!found) {
michael@0: if (this.headerIt.hasNext()) {
michael@0: this.currentHeader = this.headerIt.nextHeader().getValue();
michael@0: from = 0;
michael@0: } else {
michael@0: this.currentHeader = null;
michael@0: }
michael@0: }
michael@0: } // while headers
michael@0:
michael@0: return found ? from : -1;
michael@0: }
michael@0:
michael@0:
michael@0: /**
michael@0: * Determines the position of the next token separator.
michael@0: * Because of multi-header joining rules, the end of a
michael@0: * header value is a token separator. This method does
michael@0: * therefore not need to iterate over headers.
michael@0: *
michael@0: * @param from the position in the current header at which to
michael@0: * start the search
michael@0: *
michael@0: * @return the position of a token separator in the current header,
michael@0: * or at the end
michael@0: *
michael@0: * @throws ParseException
michael@0: * if a new token is found before a token separator.
michael@0: * RFC 2616, section 2.1 explicitly requires a comma between
michael@0: * tokens for #.
michael@0: */
michael@0: protected int findTokenSeparator(int from) {
michael@0: if (from < 0) {
michael@0: throw new IllegalArgumentException
michael@0: ("Search position must not be negative: " + from);
michael@0: }
michael@0:
michael@0: boolean found = false;
michael@0: final int to = this.currentHeader.length();
michael@0: while (!found && (from < to)) {
michael@0: final char ch = this.currentHeader.charAt(from);
michael@0: if (isTokenSeparator(ch)) {
michael@0: found = true;
michael@0: } else if (isWhitespace(ch)) {
michael@0: from++;
michael@0: } else if (isTokenChar(ch)) {
michael@0: throw new ParseException
michael@0: ("Tokens without separator (pos " + from +
michael@0: "): " + this.currentHeader);
michael@0: } else {
michael@0: throw new ParseException
michael@0: ("Invalid character after token (pos " + from +
michael@0: "): " + this.currentHeader);
michael@0: }
michael@0: }
michael@0:
michael@0: return from;
michael@0: }
michael@0:
michael@0:
michael@0: /**
michael@0: * Determines the ending position of the current token.
michael@0: * This method will not leave the current header value,
michael@0: * since the end of the header value is a token boundary.
michael@0: *
michael@0: * @param from the position of the first character of the token
michael@0: *
michael@0: * @return the position after the last character of the token.
michael@0: * The behavior is undefined if from
does not
michael@0: * point to a token character in the current header value.
michael@0: */
michael@0: protected int findTokenEnd(int from) {
michael@0: if (from < 0) {
michael@0: throw new IllegalArgumentException
michael@0: ("Token start position must not be negative: " + from);
michael@0: }
michael@0:
michael@0: final int to = this.currentHeader.length();
michael@0: int end = from+1;
michael@0: while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
michael@0: end++;
michael@0: }
michael@0:
michael@0: return end;
michael@0: }
michael@0:
michael@0:
michael@0: /**
michael@0: * Checks whether a character is a token separator.
michael@0: * RFC 2616, section 2.1 defines comma as the separator for
michael@0: * #token sequences. The end of a header value will
michael@0: * also separate tokens, but that is not a character check.
michael@0: *
michael@0: * @param ch the character to check
michael@0: *
michael@0: * @return true
if the character is a token separator,
michael@0: * false
otherwise
michael@0: */
michael@0: protected boolean isTokenSeparator(char ch) {
michael@0: return (ch == ',');
michael@0: }
michael@0:
michael@0:
michael@0: /**
michael@0: * Checks whether a character is a whitespace character.
michael@0: * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
michael@0: * The optional preceeding line break is irrelevant, since header
michael@0: * continuation is handled transparently when parsing messages.
michael@0: *
michael@0: * @param ch the character to check
michael@0: *
michael@0: * @return true
if the character is whitespace,
michael@0: * false
otherwise
michael@0: */
michael@0: protected boolean isWhitespace(char ch) {
michael@0:
michael@0: // we do not use Character.isWhitspace(ch) here, since that allows
michael@0: // many control characters which are not whitespace as per RFC 2616
michael@0: return ((ch == '\t') || Character.isSpaceChar(ch));
michael@0: }
michael@0:
michael@0:
michael@0: /**
michael@0: * Checks whether a character is a valid token character.
michael@0: * Whitespace, control characters, and HTTP separators are not
michael@0: * valid token characters. The HTTP specification (RFC 2616, section 2.2)
michael@0: * defines tokens only for the US-ASCII character set, this
michael@0: * method extends the definition to other character sets.
michael@0: *
michael@0: * @param ch the character to check
michael@0: *
michael@0: * @return true
if the character is a valid token start,
michael@0: * false
otherwise
michael@0: */
michael@0: protected boolean isTokenChar(char ch) {
michael@0:
michael@0: // common sense extension of ALPHA + DIGIT
michael@0: if (Character.isLetterOrDigit(ch))
michael@0: return true;
michael@0:
michael@0: // common sense extension of CTL
michael@0: if (Character.isISOControl(ch))
michael@0: return false;
michael@0:
michael@0: // no common sense extension for this
michael@0: if (isHttpSeparator(ch))
michael@0: return false;
michael@0:
michael@0: // RFC 2616, section 2.2 defines a token character as
michael@0: // "any CHAR except CTLs or separators". The controls
michael@0: // and separators are included in the checks above.
michael@0: // This will yield unexpected results for Unicode format characters.
michael@0: // If that is a problem, overwrite isHttpSeparator(char) to filter
michael@0: // out the false positives.
michael@0: return true;
michael@0: }
michael@0:
michael@0:
michael@0: /**
michael@0: * Checks whether a character is an HTTP separator.
michael@0: * The implementation in this class checks only for the HTTP separators
michael@0: * defined in RFC 2616, section 2.2. If you need to detect other
michael@0: * separators beyond the US-ASCII character set, override this method.
michael@0: *
michael@0: * @param ch the character to check
michael@0: *
michael@0: * @return true
if the character is an HTTP separator
michael@0: */
michael@0: protected boolean isHttpSeparator(char ch) {
michael@0: return (HTTP_SEPARATORS.indexOf(ch) >= 0);
michael@0: }
michael@0:
michael@0:
michael@0: } // class BasicTokenIterator
michael@0: