michael@0: /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #ifndef __nsCharSeparatedTokenizer_h michael@0: #define __nsCharSeparatedTokenizer_h michael@0: michael@0: #include "mozilla/RangedPtr.h" michael@0: michael@0: #include "nsDependentSubstring.h" michael@0: #include "nsCRT.h" michael@0: michael@0: /** michael@0: * This parses a SeparatorChar-separated string into tokens. michael@0: * Whitespace surrounding tokens is not treated as part of tokens, however michael@0: * whitespace inside a token is. If the final token is the empty string, it is michael@0: * not returned. michael@0: * michael@0: * Some examples, with SeparatorChar = ',': michael@0: * michael@0: * "foo, bar, baz" -> "foo" "bar" "baz" michael@0: * "foo,bar,baz" -> "foo" "bar" "baz" michael@0: * "foo , bar hi , baz" -> "foo" "bar hi" "baz" michael@0: * "foo, ,bar,baz" -> "foo" "" "bar" "baz" michael@0: * "foo,,bar,baz" -> "foo" "" "bar" "baz" michael@0: * "foo,bar,baz," -> "foo" "bar" "baz" michael@0: * michael@0: * The function used for whitespace detection is a template argument. michael@0: * By default, it is NS_IsAsciiWhitespace. michael@0: */ michael@0: template michael@0: class nsCharSeparatedTokenizerTemplate michael@0: { michael@0: public: michael@0: // Flags -- only one for now. If we need more, they should be defined to michael@0: // be 1 << 1, 1 << 2, etc. (They're masks, and aFlags is a bitfield.) michael@0: enum { michael@0: SEPARATOR_OPTIONAL = 1 michael@0: }; michael@0: michael@0: nsCharSeparatedTokenizerTemplate(const nsSubstring& aSource, michael@0: char16_t aSeparatorChar, michael@0: uint32_t aFlags = 0) michael@0: : mIter(aSource.Data(), aSource.Length()), michael@0: mEnd(aSource.Data() + aSource.Length(), aSource.Data(), michael@0: aSource.Length()), michael@0: mSeparatorChar(aSeparatorChar), michael@0: mWhitespaceBeforeFirstToken(false), michael@0: mWhitespaceAfterCurrentToken(false), michael@0: mSeparatorAfterCurrentToken(false), michael@0: mSeparatorOptional(aFlags & SEPARATOR_OPTIONAL) michael@0: { michael@0: // Skip initial whitespace michael@0: while (mIter < mEnd && IsWhitespace(*mIter)) { michael@0: mWhitespaceBeforeFirstToken = true; michael@0: ++mIter; michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * Checks if any more tokens are available. michael@0: */ michael@0: bool hasMoreTokens() const michael@0: { michael@0: MOZ_ASSERT(mIter == mEnd || !IsWhitespace(*mIter), michael@0: "Should be at beginning of token if there is one"); michael@0: michael@0: return mIter < mEnd; michael@0: } michael@0: michael@0: /* michael@0: * Returns true if there is whitespace prior to the first token. michael@0: */ michael@0: bool whitespaceBeforeFirstToken() const michael@0: { michael@0: return mWhitespaceBeforeFirstToken; michael@0: } michael@0: michael@0: /* michael@0: * Returns true if there is a separator after the current token. michael@0: * Useful if you want to check whether the last token has a separator michael@0: * after it which may not be valid. michael@0: */ michael@0: bool separatorAfterCurrentToken() const michael@0: { michael@0: return mSeparatorAfterCurrentToken; michael@0: } michael@0: michael@0: /* michael@0: * Returns true if there is any whitespace after the current token. michael@0: */ michael@0: bool whitespaceAfterCurrentToken() const michael@0: { michael@0: return mWhitespaceAfterCurrentToken; michael@0: } michael@0: michael@0: /** michael@0: * Returns the next token. michael@0: */ michael@0: const nsDependentSubstring nextToken() michael@0: { michael@0: mozilla::RangedPtr tokenStart = mIter, tokenEnd = mIter; michael@0: michael@0: MOZ_ASSERT(mIter == mEnd || !IsWhitespace(*mIter), michael@0: "Should be at beginning of token if there is one"); michael@0: michael@0: // Search until we hit separator or end (or whitespace, if a separator michael@0: // isn't required -- see clause with 'break' below). michael@0: while (mIter < mEnd && *mIter != mSeparatorChar) { michael@0: // Skip to end of the current word. michael@0: while (mIter < mEnd && michael@0: !IsWhitespace(*mIter) && *mIter != mSeparatorChar) { michael@0: ++mIter; michael@0: } michael@0: tokenEnd = mIter; michael@0: michael@0: // Skip whitespace after the current word. michael@0: mWhitespaceAfterCurrentToken = false; michael@0: while (mIter < mEnd && IsWhitespace(*mIter)) { michael@0: mWhitespaceAfterCurrentToken = true; michael@0: ++mIter; michael@0: } michael@0: if (mSeparatorOptional) { michael@0: // We've hit (and skipped) whitespace, and that's sufficient to end michael@0: // our token, regardless of whether we've reached a SeparatorChar. michael@0: break; michael@0: } // (else, we'll keep looping until we hit mEnd or SeparatorChar) michael@0: } michael@0: michael@0: mSeparatorAfterCurrentToken = (mIter != mEnd && michael@0: *mIter == mSeparatorChar); michael@0: MOZ_ASSERT(mSeparatorOptional || michael@0: (mSeparatorAfterCurrentToken == (mIter < mEnd)), michael@0: "If we require a separator and haven't hit the end of " michael@0: "our string, then we shouldn't have left the loop " michael@0: "unless we hit a separator"); michael@0: michael@0: // Skip separator (and any whitespace after it), if we're at one. michael@0: if (mSeparatorAfterCurrentToken) { michael@0: ++mIter; michael@0: michael@0: while (mIter < mEnd && IsWhitespace(*mIter)) { michael@0: mWhitespaceAfterCurrentToken = true; michael@0: ++mIter; michael@0: } michael@0: } michael@0: michael@0: return Substring(tokenStart.get(), tokenEnd.get()); michael@0: } michael@0: michael@0: private: michael@0: mozilla::RangedPtr mIter; michael@0: const mozilla::RangedPtr mEnd; michael@0: char16_t mSeparatorChar; michael@0: bool mWhitespaceBeforeFirstToken; michael@0: bool mWhitespaceAfterCurrentToken; michael@0: bool mSeparatorAfterCurrentToken; michael@0: bool mSeparatorOptional; michael@0: }; michael@0: michael@0: class nsCharSeparatedTokenizer: public nsCharSeparatedTokenizerTemplate<> michael@0: { michael@0: public: michael@0: nsCharSeparatedTokenizer(const nsSubstring& aSource, michael@0: char16_t aSeparatorChar, michael@0: uint32_t aFlags = 0) michael@0: : nsCharSeparatedTokenizerTemplate<>(aSource, aSeparatorChar, aFlags) michael@0: { michael@0: } michael@0: }; michael@0: michael@0: template michael@0: class nsCCharSeparatedTokenizerTemplate michael@0: { michael@0: public: michael@0: // Flags -- only one for now. If we need more, they should be defined to michael@0: // be 1 << 1, 1 << 2, etc. (They're masks, and aFlags is a bitfield.) michael@0: enum { michael@0: SEPARATOR_OPTIONAL = 1 michael@0: }; michael@0: michael@0: nsCCharSeparatedTokenizerTemplate(const nsCSubstring& aSource, michael@0: char aSeparatorChar, michael@0: uint32_t aFlags = 0) michael@0: : mIter(aSource.Data(), aSource.Length()), michael@0: mEnd(aSource.Data() + aSource.Length(), aSource.Data(), michael@0: aSource.Length()), michael@0: mSeparatorChar(aSeparatorChar), michael@0: mWhitespaceBeforeFirstToken(false), michael@0: mWhitespaceAfterCurrentToken(false), michael@0: mSeparatorAfterCurrentToken(false), michael@0: mSeparatorOptional(aFlags & SEPARATOR_OPTIONAL) michael@0: { michael@0: // Skip initial whitespace michael@0: while (mIter < mEnd && IsWhitespace(*mIter)) { michael@0: mWhitespaceBeforeFirstToken = true; michael@0: ++mIter; michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * Checks if any more tokens are available. michael@0: */ michael@0: bool hasMoreTokens() const michael@0: { michael@0: MOZ_ASSERT(mIter == mEnd || !IsWhitespace(*mIter), michael@0: "Should be at beginning of token if there is one"); michael@0: michael@0: return mIter < mEnd; michael@0: } michael@0: michael@0: /* michael@0: * Returns true if there is whitespace prior to the first token. michael@0: */ michael@0: bool whitespaceBeforeFirstToken() const michael@0: { michael@0: return mWhitespaceBeforeFirstToken; michael@0: } michael@0: michael@0: /* michael@0: * Returns true if there is a separator after the current token. michael@0: * Useful if you want to check whether the last token has a separator michael@0: * after it which may not be valid. michael@0: */ michael@0: bool separatorAfterCurrentToken() const michael@0: { michael@0: return mSeparatorAfterCurrentToken; michael@0: } michael@0: michael@0: /* michael@0: * Returns true if there is any whitespace after the current token. michael@0: */ michael@0: bool whitespaceAfterCurrentToken() const michael@0: { michael@0: return mWhitespaceAfterCurrentToken; michael@0: } michael@0: michael@0: /** michael@0: * Returns the next token. michael@0: */ michael@0: const nsDependentCSubstring nextToken() michael@0: { michael@0: mozilla::RangedPtr tokenStart = mIter, tokenEnd = mIter; michael@0: michael@0: MOZ_ASSERT(mIter == mEnd || !IsWhitespace(*mIter), michael@0: "Should be at beginning of token if there is one"); michael@0: michael@0: // Search until we hit separator or end (or whitespace, if a separator michael@0: // isn't required -- see clause with 'break' below). michael@0: while (mIter < mEnd && *mIter != mSeparatorChar) { michael@0: // Skip to end of the current word. michael@0: while (mIter < mEnd && michael@0: !IsWhitespace(*mIter) && *mIter != mSeparatorChar) { michael@0: ++mIter; michael@0: } michael@0: tokenEnd = mIter; michael@0: michael@0: // Skip whitespace after the current word. michael@0: mWhitespaceAfterCurrentToken = false; michael@0: while (mIter < mEnd && IsWhitespace(*mIter)) { michael@0: mWhitespaceAfterCurrentToken = true; michael@0: ++mIter; michael@0: } michael@0: if (mSeparatorOptional) { michael@0: // We've hit (and skipped) whitespace, and that's sufficient to end michael@0: // our token, regardless of whether we've reached a SeparatorChar. michael@0: break; michael@0: } // (else, we'll keep looping until we hit mEnd or SeparatorChar) michael@0: } michael@0: michael@0: mSeparatorAfterCurrentToken = (mIter != mEnd && michael@0: *mIter == mSeparatorChar); michael@0: MOZ_ASSERT(mSeparatorOptional || michael@0: (mSeparatorAfterCurrentToken == (mIter < mEnd)), michael@0: "If we require a separator and haven't hit the end of " michael@0: "our string, then we shouldn't have left the loop " michael@0: "unless we hit a separator"); michael@0: michael@0: // Skip separator (and any whitespace after it), if we're at one. michael@0: if (mSeparatorAfterCurrentToken) { michael@0: ++mIter; michael@0: michael@0: while (mIter < mEnd && IsWhitespace(*mIter)) { michael@0: mWhitespaceAfterCurrentToken = true; michael@0: ++mIter; michael@0: } michael@0: } michael@0: michael@0: return Substring(tokenStart.get(), tokenEnd.get()); michael@0: } michael@0: michael@0: private: michael@0: mozilla::RangedPtr mIter; michael@0: const mozilla::RangedPtr mEnd; michael@0: char mSeparatorChar; michael@0: bool mWhitespaceBeforeFirstToken; michael@0: bool mWhitespaceAfterCurrentToken; michael@0: bool mSeparatorAfterCurrentToken; michael@0: bool mSeparatorOptional; michael@0: }; michael@0: michael@0: class nsCCharSeparatedTokenizer: public nsCCharSeparatedTokenizerTemplate<> michael@0: { michael@0: public: michael@0: nsCCharSeparatedTokenizer(const nsCSubstring& aSource, michael@0: char aSeparatorChar, michael@0: uint32_t aFlags = 0) michael@0: : nsCCharSeparatedTokenizerTemplate<>(aSource, aSeparatorChar, aFlags) michael@0: { michael@0: } michael@0: }; michael@0: michael@0: #endif /* __nsCharSeparatedTokenizer_h */