michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0:  * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0:  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0: 
michael@0: 
michael@0: /**
michael@0:  * MODULE NOTES:
michael@0:  * @update  gess 4/1/98
michael@0:  * 
michael@0:  * The scanner is a low-level service class that knows
michael@0:  * how to consume characters out of an (internal) stream.
michael@0:  * This class also offers a series of utility methods
michael@0:  * that most tokenizers want, such as ReadUntil()
michael@0:  * and SkipWhitespace().
michael@0:  */
michael@0: 
michael@0: 
michael@0: #ifndef SCANNER
michael@0: #define SCANNER
michael@0: 
michael@0: #include "nsCOMPtr.h"
michael@0: #include "nsString.h"
michael@0: #include "nsIParser.h"
michael@0: #include "nsIUnicodeDecoder.h"
michael@0: #include "nsScannerString.h"
michael@0: 
michael@0: class nsParser; // Forward declaration only; nsParser is not otherwise referenced in this header.
michael@0: 
michael@0: /**
michael@0:  * End condition accepted by the nsScanner::ReadUntil() overloads that take
michael@0:  * a const nsReadEndCondition&.
michael@0:  *
michael@0:  * An instance is built explicitly from a pointer to the terminating
michael@0:  * characters. Copy construction and assignment are declared private so
michael@0:  * that instances cannot be copied from outside the class.
michael@0:  */
michael@0: class nsReadEndCondition {
michael@0: public:
michael@0:   // The terminating characters.
michael@0:   // NOTE(review): presumably the pointer passed to the constructor, which
michael@0:   // would have to outlive this object -- confirm in the implementation.
michael@0:   const char16_t* mChars;
michael@0: 
michael@0:   // NOTE(review): presumably a filter value derived from mChars that lets
michael@0:   // readers reject non-terminators quickly -- confirm in the implementation.
michael@0:   char16_t mFilter;
michael@0: 
michael@0:   /**
michael@0:    * @param aTerminateChars  the characters that end a read
michael@0:    */
michael@0:   explicit nsReadEndCondition(const char16_t* aTerminateChars);
michael@0: 
michael@0: private:
michael@0:   // Copying is not allowed.
michael@0:   nsReadEndCondition(const nsReadEndCondition&);
michael@0:   // Assignment is not allowed.
michael@0:   void operator=(const nsReadEndCondition&);
michael@0: };
michael@0: 
michael@0: class nsScanner {
michael@0:   public:
michael@0: 
michael@0:       /**
michael@0:        *  Use this constructor for the XML fragment parsing case
michael@0:        */
michael@0:       nsScanner(const nsAString& anHTMLString);
michael@0: 
michael@0:       /**
michael@0:        *  Use this constructor if you want i/o to be based on 
michael@0:        *  a file (therefore a stream) or just data you provide via Append().
michael@0:        */
michael@0:       nsScanner(nsString& aFilename, bool aCreateStream);
michael@0: 
michael@0:       ~nsScanner();
michael@0: 
michael@0:       /**
michael@0:        *  retrieve next char from internal input stream
michael@0:        *  
michael@0:        *  @update  gess 3/25/98
michael@0:        *  @param   ch is the char to accept new value
michael@0:        *  @return  error code reflecting read status
michael@0:        */
michael@0:       nsresult GetChar(char16_t& ch);
michael@0: 
michael@0:       /**
michael@0:        *  peek ahead to consume next char from scanner's internal
michael@0:        *  input buffer
michael@0:        *  
michael@0:        *  @update  gess 3/25/98
michael@0:        *  @param   ch is the char to accept new value
michael@0:        *  @return  error code reflecting read status
michael@0:        */
michael@0:       nsresult Peek(char16_t& ch, uint32_t aOffset=0);
michael@0: 
michael@0:       nsresult Peek(nsAString& aStr, int32_t aNumChars, int32_t aOffset = 0);
michael@0: 
michael@0:       /**
michael@0:        *  Skip over chars as long as they equal given char
michael@0:        *  
michael@0:        *  @update  gess 3/25/98
michael@0:        *  @param   char to be skipped
michael@0:        *  @return  error code
michael@0:        */
michael@0:       nsresult SkipOver(char16_t aSkipChar);
michael@0: 
michael@0:       /**
michael@0:        *  Skip whitespace on scanner input stream
michael@0:        *  
michael@0:        *  @update  gess 3/25/98
michael@0:        *  @return  error status
michael@0:        */
michael@0:       nsresult SkipWhitespace(int32_t& aNewlinesSkipped);
michael@0: 
michael@0:       /**
michael@0:        *  Consume characters until you run into space, a '<', a '>', or a '/'.
michael@0:        *  
michael@0:        *  @param   aString - receives new data from stream
michael@0:        *  @return  error code
michael@0:        */
michael@0:       nsresult ReadTagIdentifier(nsScannerSharedSubstring& aString);
michael@0: 
michael@0:       /**
michael@0:        *  Consume characters until you run into a char that's not valid in an
michael@0:        *  entity name
michael@0:        *  
michael@0:        *  @param   aString - receives new data from stream
michael@0:        *  @return  error code
michael@0:        */
michael@0:       nsresult ReadEntityIdentifier(nsString& aString);
michael@0:       nsresult ReadNumber(nsString& aString,int32_t aBase);
michael@0:       nsresult ReadWhitespace(nsScannerSharedSubstring& aString, 
michael@0:                               int32_t& aNewlinesSkipped,
michael@0:                               bool& aHaveCR);
michael@0:       nsresult ReadWhitespace(nsScannerIterator& aStart, 
michael@0:                               nsScannerIterator& aEnd,
michael@0:                               int32_t& aNewlinesSkipped);
michael@0: 
michael@0:       /**
michael@0:        *  Consume characters until you find the terminal char
michael@0:        *  
michael@0:        *  @update  gess 3/25/98
michael@0:        *  @param   aString receives new data from stream
michael@0:        *  @param   aTerminal contains terminating char
michael@0:        *  @param   addTerminal tells us whether to append terminal to aString
michael@0:        *  @return  error code
michael@0:        */
michael@0:       nsresult ReadUntil(nsAString& aString,
michael@0:                          char16_t aTerminal,
michael@0:                          bool addTerminal);
michael@0: 
michael@0:       /**
michael@0:        *  Consume characters until you find one contained in given
michael@0:        *  terminal set.
michael@0:        *  
michael@0:        *  @update  gess 3/25/98
michael@0:        *  @param   aString receives new data from stream
michael@0:        *  @param   aTermSet contains set of terminating chars
michael@0:        *  @param   addTerminal tells us whether to append terminal to aString
michael@0:        *  @return  error code
michael@0:        */
michael@0:       nsresult ReadUntil(nsAString& aString,
michael@0:                          const nsReadEndCondition& aEndCondition, 
michael@0:                          bool addTerminal);
michael@0: 
michael@0:       nsresult ReadUntil(nsScannerSharedSubstring& aString,
michael@0:                          const nsReadEndCondition& aEndCondition,
michael@0:                          bool addTerminal);
michael@0: 
michael@0:       nsresult ReadUntil(nsScannerIterator& aStart,
michael@0:                          nsScannerIterator& aEnd,
michael@0:                          const nsReadEndCondition& aEndCondition, 
michael@0:                          bool addTerminal);
michael@0: 
michael@0:       /**
michael@0:        *  Records current offset position in input stream. This allows us
michael@0:        *  to back up to this point if the need should arise, such as when
michael@0:        *  tokenization gets interrupted.
michael@0:        *  
michael@0:        *  @update  gess 5/12/98
michael@0:        *  @param   
michael@0:        *  @return  
michael@0:        */
michael@0:       int32_t Mark(void);
michael@0: 
michael@0:       /**
michael@0:        *  Resets current offset position of input stream to marked position. 
michael@0:        *  This allows us to back up to this point if the need should arise, 
michael@0:        *  such as when tokenization gets interrupted.
michael@0:        *  NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST!
michael@0:        *  
michael@0:        *  @update  gess 5/12/98
michael@0:        *  @param   
michael@0:        *  @return  
michael@0:        */
michael@0:       void RewindToMark(void);
michael@0: 
michael@0: 
michael@0:       /**
michael@0:        *  
michael@0:        *  
michael@0:        *  @update  harishd 01/12/99
michael@0:        *  @param   
michael@0:        *  @return  
michael@0:        */
michael@0:       bool UngetReadable(const nsAString& aBuffer);
michael@0: 
michael@0:       /**
michael@0:        *  
michael@0:        *  
michael@0:        *  @update  gess 5/13/98
michael@0:        *  @param   
michael@0:        *  @return  
michael@0:        */
michael@0:       nsresult Append(const nsAString& aBuffer);
michael@0: 
michael@0:       /**
michael@0:        *  
michael@0:        *  
michael@0:        *  @update  gess 5/21/98
michael@0:        *  @param   
michael@0:        *  @return  
michael@0:        */
michael@0:       nsresult Append(const char* aBuffer, uint32_t aLen,
michael@0:                       nsIRequest *aRequest);
michael@0: 
michael@0:       /**
michael@0:        *  Call this to copy bytes out of the scanner that have not yet been consumed
michael@0:        *  by the tokenization process.
michael@0:        *  
michael@0:        *  @update  gess 5/12/98
michael@0:        *  @param   aCopyBuffer is where the scanner buffer will be copied to
michael@0:        *  @return  nada
michael@0:        */
michael@0:       void CopyUnusedData(nsString& aCopyBuffer);
michael@0: 
michael@0:       /**
michael@0:        *  Retrieve the name of the file that the scanner is reading from.
michael@0:        *  In some cases, it's just a given name, because the scanner isn't
michael@0:        *  really reading from a file.
michael@0:        *  
michael@0:        *  @update  gess 5/12/98
michael@0:        *  @return  
michael@0:        */
michael@0:       nsString& GetFilename(void);
michael@0: 
michael@0:       static void SelfTest();
michael@0: 
michael@0:       /**
michael@0:        *  Use this setter to change the scanner's unicode decoder
michael@0:        *
michael@0:        *  @update  ftang 3/02/99
michael@0:        *  @param   aCharset a normalized (alias resolved) charset name
michael@0:        *  @param   aCharsetSource- where the charset info came from
michael@0:        *  @return  
michael@0:        */
michael@0:       nsresult SetDocumentCharset(const nsACString& aCharset, int32_t aSource);
michael@0: 
michael@0:       void BindSubstring(nsScannerSubstring& aSubstring, const nsScannerIterator& aStart, const nsScannerIterator& aEnd);
michael@0:       void CurrentPosition(nsScannerIterator& aPosition);
michael@0:       void EndReading(nsScannerIterator& aPosition);
michael@0:       void SetPosition(nsScannerIterator& aPosition,
michael@0:                        bool aTruncate = false,
michael@0:                        bool aReverse = false);
michael@0:       void ReplaceCharacter(nsScannerIterator& aPosition,
michael@0:                             char16_t aChar);
michael@0: 
michael@0:       /**
michael@0:        * Internal method used to cause the internal buffer to
michael@0:        * be filled with data. 
michael@0:        *
michael@0:        * @update  gess4/3/98
michael@0:        */
michael@0:       bool      IsIncremental(void) {return mIncremental;}
michael@0:       void      SetIncremental(bool anIncrValue) {mIncremental=anIncrValue;}
michael@0: 
michael@0:       /**
michael@0:        * Return the position of the first non-whitespace
michael@0:        * character. This is only reliable before consumers start
michael@0:        * reading from this scanner.
michael@0:        */
michael@0:       int32_t FirstNonWhitespacePosition()
michael@0:       {
michael@0:         return mFirstNonWhitespacePosition;
michael@0:       }
michael@0: 
michael@0:       /**
michael@0:        * Override replacement character used by nsIUnicodeDecoder.
michael@0:        * Default behavior is that it uses nsIUnicodeDecoder's mapping.
michael@0:        *
michael@0:        * @param aReplacementCharacter the replacement character
michael@0:        *        XML (expat) parser uses 0xffff
michael@0:        */
michael@0:       void OverrideReplacementCharacter(char16_t aReplacementCharacter);
michael@0: 
michael@0:   protected:
michael@0: 
michael@0:       bool AppendToBuffer(nsScannerString::Buffer *, nsIRequest *aRequest, int32_t aErrorPos = -1);
michael@0:       bool AppendToBuffer(const nsAString& aStr)
michael@0:       {
michael@0:         nsScannerString::Buffer* buf = nsScannerString::AllocBufferFromString(aStr);
michael@0:         if (!buf)
michael@0:           return false;
michael@0:         AppendToBuffer(buf, nullptr);
michael@0:         return true;
michael@0:       }
michael@0: 
michael@0:       nsScannerString*             mSlidingBuffer;
michael@0:       nsScannerIterator            mCurrentPosition; // The position we will next read from in the scanner buffer
michael@0:       nsScannerIterator            mMarkPosition;    // The position last marked (we may rewind to here)
michael@0:       nsScannerIterator            mEndPosition;     // The current end of the scanner buffer
michael@0:       nsScannerIterator            mFirstInvalidPosition; // The position of the first invalid character that was detected
michael@0:       nsString        mFilename;
michael@0:       uint32_t        mCountRemaining; // The number of bytes still to be read
michael@0:                                        // from the scanner buffer
michael@0:       bool            mIncremental;
michael@0:       bool            mHasInvalidCharacter;
michael@0:       char16_t       mReplacementCharacter;
michael@0:       int32_t         mFirstNonWhitespacePosition;
michael@0:       int32_t         mCharsetSource;
michael@0:       nsCString       mCharset;
michael@0:       nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder;
michael@0: 
michael@0:   private:
michael@0:       nsScanner &operator =(const nsScanner &); // Not implemented.
michael@0: };
michael@0: 
michael@0: #endif // SCANNER
michael@0: 
michael@0: