michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: michael@0: /** michael@0: * MODULE NOTES: michael@0: * @update gess 4/1/98 michael@0: * michael@0: * The scanner is a low-level service class that knows michael@0: * how to consume characters out of an (internal) stream. michael@0: * This class also offers a series of utility methods michael@0: * that most tokenizers want, such as readUntil() michael@0: * and SkipWhitespace(). michael@0: */ michael@0: michael@0: michael@0: #ifndef SCANNER michael@0: #define SCANNER michael@0: michael@0: #include "nsCOMPtr.h" michael@0: #include "nsString.h" michael@0: #include "nsIParser.h" michael@0: #include "nsIUnicodeDecoder.h" michael@0: #include "nsScannerString.h" michael@0: michael@0: class nsParser; michael@0: michael@0: class nsReadEndCondition { michael@0: public: michael@0: const char16_t *mChars; michael@0: char16_t mFilter; michael@0: explicit nsReadEndCondition(const char16_t* aTerminateChars); michael@0: private: michael@0: nsReadEndCondition(const nsReadEndCondition& aOther); // No copying michael@0: void operator=(const nsReadEndCondition& aOther); // No assigning michael@0: }; michael@0: michael@0: class nsScanner { michael@0: public: michael@0: michael@0: /** michael@0: * Use this constructor for the XML fragment parsing case michael@0: */ michael@0: nsScanner(const nsAString& anHTMLString); michael@0: michael@0: /** michael@0: * Use this constructor if you want i/o to be based on michael@0: * a file (therefore a stream) or just data you provide via Append(). michael@0: */ michael@0: nsScanner(nsString& aFilename, bool aCreateStream); michael@0: michael@0: ~nsScanner(); michael@0: michael@0: /** michael@0: * retrieve next char from internal input stream michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @param ch is the char to accept new value michael@0: * @return error code reflecting read status michael@0: */ michael@0: nsresult GetChar(char16_t& ch); michael@0: michael@0: /** michael@0: * peek ahead to consume next char from scanner's internal michael@0: * input buffer michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @param ch is the char to accept new value michael@0: * @return error code reflecting read status michael@0: */ michael@0: nsresult Peek(char16_t& ch, uint32_t aOffset=0); michael@0: michael@0: nsresult Peek(nsAString& aStr, int32_t aNumChars, int32_t aOffset = 0); michael@0: michael@0: /** michael@0: * Skip over chars as long as they equal given char michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @param char to be skipped michael@0: * @return error code michael@0: */ michael@0: nsresult SkipOver(char16_t aSkipChar); michael@0: michael@0: /** michael@0: * Skip whitespace on scanner input stream michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @return error status michael@0: */ michael@0: nsresult SkipWhitespace(int32_t& aNewlinesSkipped); michael@0: michael@0: /** michael@0: * Consume characters until you run into space, a '<', a '>', or a '/'. michael@0: * michael@0: * @param aString - receives new data from stream michael@0: * @return error code michael@0: */ michael@0: nsresult ReadTagIdentifier(nsScannerSharedSubstring& aString); michael@0: michael@0: /** michael@0: * Consume characters until you run into a char that's not valid in an michael@0: * entity name michael@0: * michael@0: * @param aString - receives new data from stream michael@0: * @return error code michael@0: */ michael@0: nsresult ReadEntityIdentifier(nsString& aString); michael@0: nsresult ReadNumber(nsString& aString,int32_t aBase); michael@0: nsresult ReadWhitespace(nsScannerSharedSubstring& aString, michael@0: int32_t& aNewlinesSkipped, michael@0: bool& aHaveCR); michael@0: nsresult ReadWhitespace(nsScannerIterator& aStart, michael@0: nsScannerIterator& aEnd, michael@0: int32_t& aNewlinesSkipped); michael@0: michael@0: /** michael@0: * Consume characters until you find the terminal char michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @param aString receives new data from stream michael@0: * @param aTerminal contains terminating char michael@0: * @param addTerminal tells us whether to append terminal to aString michael@0: * @return error code michael@0: */ michael@0: nsresult ReadUntil(nsAString& aString, michael@0: char16_t aTerminal, michael@0: bool addTerminal); michael@0: michael@0: /** michael@0: * Consume characters until you find one contained in given michael@0: * terminal set. michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @param aString receives new data from stream michael@0: * @param aTermSet contains set of terminating chars michael@0: * @param addTerminal tells us whether to append terminal to aString michael@0: * @return error code michael@0: */ michael@0: nsresult ReadUntil(nsAString& aString, michael@0: const nsReadEndCondition& aEndCondition, michael@0: bool addTerminal); michael@0: michael@0: nsresult ReadUntil(nsScannerSharedSubstring& aString, michael@0: const nsReadEndCondition& aEndCondition, michael@0: bool addTerminal); michael@0: michael@0: nsresult ReadUntil(nsScannerIterator& aStart, michael@0: nsScannerIterator& aEnd, michael@0: const nsReadEndCondition& aEndCondition, michael@0: bool addTerminal); michael@0: michael@0: /** michael@0: * Records current offset position in input stream. This allows us michael@0: * to back up to this point if the need should arise, such as when michael@0: * tokenization gets interrupted. michael@0: * michael@0: * @update gess 5/12/98 michael@0: * @param michael@0: * @return michael@0: */ michael@0: int32_t Mark(void); michael@0: michael@0: /** michael@0: * Resets current offset position of input stream to marked position. michael@0: * This allows us to back up to this point if the need should arise, michael@0: * such as when tokenization gets interrupted. michael@0: * NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST! michael@0: * michael@0: * @update gess 5/12/98 michael@0: * @param michael@0: * @return michael@0: */ michael@0: void RewindToMark(void); michael@0: michael@0: michael@0: /** michael@0: * michael@0: * michael@0: * @update harishd 01/12/99 michael@0: * @param michael@0: * @return michael@0: */ michael@0: bool UngetReadable(const nsAString& aBuffer); michael@0: michael@0: /** michael@0: * michael@0: * michael@0: * @update gess 5/13/98 michael@0: * @param michael@0: * @return michael@0: */ michael@0: nsresult Append(const nsAString& aBuffer); michael@0: michael@0: /** michael@0: * michael@0: * michael@0: * @update gess 5/21/98 michael@0: * @param michael@0: * @return michael@0: */ michael@0: nsresult Append(const char* aBuffer, uint32_t aLen, michael@0: nsIRequest *aRequest); michael@0: michael@0: /** michael@0: * Call this to copy bytes out of the scanner that have not yet been consumed michael@0: * by the tokenization process. michael@0: * michael@0: * @update gess 5/12/98 michael@0: * @param aCopyBuffer is where the scanner buffer will be copied to michael@0: * @return nada michael@0: */ michael@0: void CopyUnusedData(nsString& aCopyBuffer); michael@0: michael@0: /** michael@0: * Retrieve the name of the file that the scanner is reading from. michael@0: * In some cases, it's just a given name, because the scanner isn't michael@0: * really reading from a file. michael@0: * michael@0: * @update gess 5/12/98 michael@0: * @return michael@0: */ michael@0: nsString& GetFilename(void); michael@0: michael@0: static void SelfTest(); michael@0: michael@0: /** michael@0: * Use this setter to change the scanner's unicode decoder michael@0: * michael@0: * @update ftang 3/02/99 michael@0: * @param aCharset a normalized (alias resolved) charset name michael@0: * @param aCharsetSource- where the charset info came from michael@0: * @return michael@0: */ michael@0: nsresult SetDocumentCharset(const nsACString& aCharset, int32_t aSource); michael@0: michael@0: void BindSubstring(nsScannerSubstring& aSubstring, const nsScannerIterator& aStart, const nsScannerIterator& aEnd); michael@0: void CurrentPosition(nsScannerIterator& aPosition); michael@0: void EndReading(nsScannerIterator& aPosition); michael@0: void SetPosition(nsScannerIterator& aPosition, michael@0: bool aTruncate = false, michael@0: bool aReverse = false); michael@0: void ReplaceCharacter(nsScannerIterator& aPosition, michael@0: char16_t aChar); michael@0: michael@0: /** michael@0: * Internal method used to cause the internal buffer to michael@0: * be filled with data. michael@0: * michael@0: * @update gess4/3/98 michael@0: */ michael@0: bool IsIncremental(void) {return mIncremental;} michael@0: void SetIncremental(bool anIncrValue) {mIncremental=anIncrValue;} michael@0: michael@0: /** michael@0: * Return the position of the first non-whitespace michael@0: * character. This is only reliable before consumers start michael@0: * reading from this scanner. michael@0: */ michael@0: int32_t FirstNonWhitespacePosition() michael@0: { michael@0: return mFirstNonWhitespacePosition; michael@0: } michael@0: michael@0: /** michael@0: * Override replacement character used by nsIUnicodeDecoder. michael@0: * Default behavior is that it uses nsIUnicodeDecoder's mapping. michael@0: * michael@0: * @param aReplacementCharacter the replacement character michael@0: * XML (expat) parser uses 0xffff michael@0: */ michael@0: void OverrideReplacementCharacter(char16_t aReplacementCharacter); michael@0: michael@0: protected: michael@0: michael@0: bool AppendToBuffer(nsScannerString::Buffer *, nsIRequest *aRequest, int32_t aErrorPos = -1); michael@0: bool AppendToBuffer(const nsAString& aStr) michael@0: { michael@0: nsScannerString::Buffer* buf = nsScannerString::AllocBufferFromString(aStr); michael@0: if (!buf) michael@0: return false; michael@0: AppendToBuffer(buf, nullptr); michael@0: return true; michael@0: } michael@0: michael@0: nsScannerString* mSlidingBuffer; michael@0: nsScannerIterator mCurrentPosition; // The position we will next read from in the scanner buffer michael@0: nsScannerIterator mMarkPosition; // The position last marked (we may rewind to here) michael@0: nsScannerIterator mEndPosition; // The current end of the scanner buffer michael@0: nsScannerIterator mFirstInvalidPosition; // The position of the first invalid character that was detected michael@0: nsString mFilename; michael@0: uint32_t mCountRemaining; // The number of bytes still to be read michael@0: // from the scanner buffer michael@0: bool mIncremental; michael@0: bool mHasInvalidCharacter; michael@0: char16_t mReplacementCharacter; michael@0: int32_t mFirstNonWhitespacePosition; michael@0: int32_t mCharsetSource; michael@0: nsCString mCharset; michael@0: nsCOMPtr mUnicodeDecoder; michael@0: michael@0: private: michael@0: nsScanner &operator =(const nsScanner &); // Not implemented. michael@0: }; michael@0: michael@0: #endif michael@0: michael@0: