michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0: /* vim: set ts=2 sw=2 et tw=78: */
michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0:  * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0:  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0: 
michael@0: //#define __INCREMENTAL 1
michael@0: 
michael@0: #include "mozilla/DebugOnly.h"
michael@0: 
michael@0: #include "nsScanner.h"
michael@0: #include "nsDebug.h"
michael@0: #include "nsReadableUtils.h"
michael@0: #include "nsIInputStream.h"
michael@0: #include "nsIFile.h"
michael@0: #include "nsNetUtil.h"
michael@0: #include "nsUTF8Utils.h" // for LossyConvertEncoding
michael@0: #include "nsCRT.h"
michael@0: #include "nsParser.h"
michael@0: #include "nsCharsetSource.h"
michael@0: 
michael@0: #include "mozilla/dom/EncodingUtils.h"
michael@0: 
michael@0: using mozilla::dom::EncodingUtils;
michael@0: 
michael@0: // Substitute written over embedded NUL characters: the Read* methods below pass it to ReplaceCharacter() whenever they meet '\0'.
michael@0: static char16_t sInvalid = UCS2_REPLACEMENT_CHAR;
michael@0: 
michael@0: /**
michael@0:  *  Builds a terminator set for nsScanner::ReadUntil.
michael@0:  *
michael@0:  *  Besides remembering the NUL-terminated list of terminator characters,
michael@0:  *  the constructor derives mFilter: a mask of the bits that are clear in
michael@0:  *  every terminator.  A scanned character with any of those bits set cannot
michael@0:  *  be a terminator, so ReadUntil can reject it with a single AND before
michael@0:  *  walking the list.
michael@0:  *
michael@0:  *  @param   aTerminateChars NUL-terminated array of terminator characters;
michael@0:  *           only the pointer is stored, the array is not copied
michael@0:  */
michael@0: nsReadEndCondition::nsReadEndCondition(const char16_t* aTerminateChars) :
michael@0:   mChars(aTerminateChars), mFilter(char16_t(~0)) // All bits set
michael@0: {
michael@0:   // Knock out of the mask every bit that shows up in some terminator.
michael@0:   for (const char16_t* term = aTerminateChars; *term; ++term) {
michael@0:     mFilter &= ~(*term);
michael@0:   }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Construct a scanner over a single, complete string.
michael@0:  *
michael@0:  *  The string is appended to the internal buffer right away and the
michael@0:  *  scanner is put in non-incremental mode.  No unicode decoder is set up
michael@0:  *  here and the charset source is left at kCharsetUninitialized.
michael@0:  *
michael@0:  *  @param   anHTMLString the text to scan
michael@0:  */
michael@0: nsScanner::nsScanner(const nsAString& anHTMLString)
michael@0: {
michael@0:   MOZ_COUNT_CTOR(nsScanner);
michael@0: 
michael@0:   mSlidingBuffer = nullptr;
michael@0:   mCountRemaining = 0;
michael@0:   mFirstNonWhitespacePosition = -1;
michael@0: 
michael@0:   if (!AppendToBuffer(anHTMLString)) {
michael@0:     // Nothing could be buffered.  Zero the iterators so that
michael@0:     // mCurrentPosition == mEndPosition and every read reports EOF
michael@0:     // (XXX hack, re: bug 182067).
michael@0:     memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
michael@0:     mEndPosition = mCurrentPosition;
michael@0:   } else {
michael@0:     mSlidingBuffer->BeginReading(mCurrentPosition);
michael@0:   }
michael@0:   mMarkPosition = mCurrentPosition;
michael@0: 
michael@0:   mIncremental = false;
michael@0:   mUnicodeDecoder = nullptr;
michael@0:   mCharsetSource = kCharsetUninitialized;
michael@0:   mHasInvalidCharacter = false;
michael@0:   mReplacementCharacter = char16_t(0x0);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Construct an incremental scanner that starts out empty.
michael@0:  *
michael@0:  *  Data is supplied afterwards through the Append() overloads.  The
michael@0:  *  document charset is preset to UTF-8 with kCharsetFromDocTypeDefault
michael@0:  *  as its source.
michael@0:  *
michael@0:  *  @param   aFilename copied into mFilename
michael@0:  *  @param   aCreateStream asserted to be false
michael@0:  */
michael@0: nsScanner::nsScanner(nsString& aFilename, bool aCreateStream)
michael@0:   : mFilename(aFilename)
michael@0: {
michael@0:   MOZ_COUNT_CTOR(nsScanner);
michael@0:   NS_ASSERTION(!aCreateStream, "This is always true.");
michael@0: 
michael@0:   mSlidingBuffer = nullptr;
michael@0:   mCountRemaining = 0;
michael@0:   mFirstNonWhitespacePosition = -1;
michael@0:   mIncremental = true;
michael@0: 
michael@0:   // XXX Big hack (see bug 182067): the iterators must hold *something*.
michael@0:   // All that matters is mCurrentPosition == mEndPosition, which makes every
michael@0:   // method believe we are at EOF.  Zeroing mCurrentPosition gives us some
michael@0:   // hope of catching null pointer dereferences caused by this hack. --darin
michael@0:   memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
michael@0:   mEndPosition = mCurrentPosition;
michael@0:   mMarkPosition = mCurrentPosition;
michael@0: 
michael@0:   mUnicodeDecoder = nullptr;
michael@0:   mCharsetSource = kCharsetUninitialized;
michael@0:   mHasInvalidCharacter = false;
michael@0:   mReplacementCharacter = char16_t(0x0);
michael@0: 
michael@0:   // XML defaults to UTF-8 and about:blank is UTF-8, too.
michael@0:   SetDocumentCharset(NS_LITERAL_CSTRING("UTF-8"), kCharsetFromDocTypeDefault);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Select the encoding used to decode raw bytes passed to
michael@0:  *  Append(const char*, uint32_t, nsIRequest*).
michael@0:  *
michael@0:  *  A request whose source ranks below the current mCharsetSource is
michael@0:  *  ignored.  Otherwise the label is resolved to an encoding name; if that
michael@0:  *  name differs from the current one, a fresh decoder is created and told
michael@0:  *  to signal input errors.
michael@0:  *
michael@0:  *  @param   aCharset encoding label to resolve
michael@0:  *  @param   aSource  priority of this request (compared to mCharsetSource)
michael@0:  *  @return  NS_OK when the request was ignored or applied;
michael@0:  *           NS_ERROR_INVALID_ARG when aCharset is not a known label, in
michael@0:  *           which case the scanner state is left untouched
michael@0:  */
michael@0: nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSource)
michael@0: {
michael@0:   if (aSource < mCharsetSource) // priority is lower than the current one
michael@0:     return NS_OK;
michael@0: 
michael@0:   // Resolve the label before touching any member.  The assertion only
michael@0:   // exists in debug builds, so release builds need a real check: without
michael@0:   // it an unknown label would leave charsetName empty, wipe mCharset and
michael@0:   // request a decoder for an empty encoding name.
michael@0:   nsCString charsetName;
michael@0:   const bool valid =
michael@0:       EncodingUtils::FindEncodingForLabel(aCharset, charsetName);
michael@0:   MOZ_ASSERT(valid, "Should never call with a bogus aCharset.");
michael@0:   if (!valid) {
michael@0:     return NS_ERROR_INVALID_ARG;
michael@0:   }
michael@0: 
michael@0:   mCharsetSource = aSource;
michael@0: 
michael@0:   if (!mCharset.IsEmpty() && charsetName.Equals(mCharset)) {
michael@0:     return NS_OK; // no difference, don't change it
michael@0:   }
michael@0: 
michael@0:   // different, need to change it
michael@0:   mCharset.Assign(charsetName);
michael@0: 
michael@0:   mUnicodeDecoder = EncodingUtils::DecoderForEncoding(mCharset);
michael@0:   mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal);
michael@0: 
michael@0:   return NS_OK;
michael@0: }
michael@0: 
michael@0: 
michael@0: /**
michael@0:  *  Destroys the scanner and deletes its sliding buffer.
michael@0:  */
michael@0: nsScanner::~nsScanner()
michael@0: {
michael@0:   // mSlidingBuffer may still be null if nothing was ever appended;
michael@0:   // deleting a null pointer is a no-op.
michael@0:   delete mSlidingBuffer;
michael@0: 
michael@0:   MOZ_COUNT_DTOR(nsScanner);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Moves the read position back to the last marked position, e.g. after
michael@0:  *  tokenization was interrupted, and gives the characters in between back
michael@0:  *  to mCountRemaining.
michael@0:  *
michael@0:  *  Does nothing when there is no buffer.  Callers are expected to have
michael@0:  *  called Mark() beforehand.
michael@0:  */
michael@0: void nsScanner::RewindToMark(void)
michael@0: {
michael@0:   if (!mSlidingBuffer) {
michael@0:     return;
michael@0:   }
michael@0: 
michael@0:   // Everything between the mark and the current position becomes unread.
michael@0:   mCountRemaining += Distance(mMarkPosition, mCurrentPosition);
michael@0:   mCurrentPosition = mMarkPosition;
michael@0: }
michael@0: 
michael@0: 
michael@0: /**
michael@0:  *  Marks the current read position so that RewindToMark() can return to
michael@0:  *  it later, and discards the buffered data in front of it.
michael@0:  *
michael@0:  *  @return  distance from the old start of the buffer to the current
michael@0:  *           position (i.e. what gets discarded); 0 when there is no buffer
michael@0:  */
michael@0: int32_t nsScanner::Mark()
michael@0: {
michael@0:   if (!mSlidingBuffer) {
michael@0:     return 0;
michael@0:   }
michael@0: 
michael@0:   nsScannerIterator bufferStart;
michael@0:   mSlidingBuffer->BeginReading(bufferStart);
michael@0:   int32_t discarded = Distance(bufferStart, mCurrentPosition);
michael@0: 
michael@0:   // Drop the consumed prefix, then re-anchor both the read position and
michael@0:   // the mark at the new start of the buffer.
michael@0:   mSlidingBuffer->DiscardPrefix(mCurrentPosition);
michael@0:   mSlidingBuffer->BeginReading(mCurrentPosition);
michael@0:   mMarkPosition = mCurrentPosition;
michael@0: 
michael@0:   return discarded;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Hands aBuffer to the sliding buffer's UngetReadable() at the current
michael@0:  * read position, as if it had been read from an input stream, then
michael@0:  * refreshes the scanner's iterators and remaining-character count.
michael@0:  *
michael@0:  * @param   aBuffer text to insert
michael@0:  * @return  false when the scanner has no buffer yet, true otherwise
michael@0:  */
michael@0: bool nsScanner::UngetReadable(const nsAString& aBuffer)
michael@0: {
michael@0:   if (mSlidingBuffer) {
michael@0:     mSlidingBuffer->UngetReadable(aBuffer, mCurrentPosition);
michael@0: 
michael@0:     // Insertion invalidated our iterators; fetch fresh ones.
michael@0:     mSlidingBuffer->BeginReading(mCurrentPosition);
michael@0:     mSlidingBuffer->EndReading(mEndPosition);
michael@0: 
michael@0:     mCountRemaining += aBuffer.Length(); // Ref. bug 117441
michael@0:     return true;
michael@0:   }
michael@0: 
michael@0:   return false;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Appends already-decoded text to the underlying input buffer, as if it
michael@0:  * had been read from an input stream.
michael@0:  *
michael@0:  * @param   aBuffer text to append
michael@0:  * @return  NS_OK, or NS_ERROR_OUT_OF_MEMORY when AppendToBuffer() fails
michael@0:  */
michael@0: nsresult nsScanner::Append(const nsAString& aBuffer)
michael@0: {
michael@0:   return AppendToBuffer(aBuffer) ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Decodes aLen raw bytes from aBuffer with mUnicodeDecoder and appends the resulting char16_t text to the scanner buffer.
michael@0:  *  When the decoder fails, a replacement character is written, one more source byte is skipped and decoding is retried.
michael@0:  *  @update  gess 5/21/98
michael@0:  *  @param   aBuffer / aLen: source bytes and their count; aRequest: passed through to AppendToBuffer()
michael@0:  *  @return  NS_OK; NS_ERROR_OUT_OF_MEMORY if allocation or AppendToBuffer() fails; NS_ERROR_FAILURE if no decoder is set
michael@0:  */
michael@0: nsresult nsScanner::Append(const char* aBuffer, uint32_t aLen,
michael@0:                            nsIRequest *aRequest)
michael@0: {
michael@0:   nsresult res = NS_OK;
michael@0:   if (mUnicodeDecoder) {
michael@0:     int32_t unicharBufLen = 0;
michael@0:     mUnicodeDecoder->GetMaxLength(aBuffer, aLen, &unicharBufLen); // result code is not checked
michael@0:     nsScannerString::Buffer* buffer = nsScannerString::AllocBuffer(unicharBufLen + 1); // one slot more than the decoder's estimate
michael@0:     NS_ENSURE_TRUE(buffer,NS_ERROR_OUT_OF_MEMORY);
michael@0:     char16_t *unichars = buffer->DataStart(); // write cursor; moved forward after each failed pass
michael@0: 
michael@0:     int32_t totalChars = 0; // characters written to the buffer so far
michael@0:     int32_t unicharLength = unicharBufLen; // passed to Convert by pointer; added to totalChars afterwards as this pass's output count
michael@0:     int32_t errorPos = -1; // index in the output of the first replaced input; -1 = none recorded
michael@0: 
michael@0:     do {
michael@0:       int32_t srcLength = aLen; // passed to Convert by pointer; presumably holds the bytes consumed on return -- confirm against nsIUnicodeDecoder
michael@0:       res = mUnicodeDecoder->Convert(aBuffer, &srcLength, unichars, &unicharLength);
michael@0: 
michael@0:       totalChars += unicharLength;
michael@0:       // Continuation of failure case
michael@0:       if(NS_FAILED(res)) {
michael@0:         // if we failed, we consume one byte, replace it with the replacement
michael@0:         // character and try the conversion again.
michael@0: 
michael@0:         // This is only needed because some decoders don't follow the
michael@0:         // nsIUnicodeDecoder contract: they return a failure when *aDestLength
michael@0:         // is 0 rather than the correct NS_OK_UDEC_MOREOUTPUT.  See bug 244177
michael@0:         if ((unichars + unicharLength) >= buffer->DataEnd()) {
michael@0:           NS_ERROR("Unexpected end of destination buffer");
michael@0:           break; // no room for a replacement character; what was decoded so far is still appended below
michael@0:         }
michael@0: 
michael@0:         if (mReplacementCharacter == 0x0 && errorPos == -1) { // only the first error is recorded, and only when no explicit replacement character is set
michael@0:           errorPos = totalChars;
michael@0:         }
michael@0:         unichars[unicharLength++] = mReplacementCharacter == 0x0 ?
michael@0:                                     mUnicodeDecoder->GetCharacterForUnMapped() :
michael@0:                                     mReplacementCharacter;
michael@0: 
michael@0:         unichars = unichars + unicharLength; // skip past this pass's output plus the replacement character
michael@0:         unicharLength = unicharBufLen - (++totalChars); // capacity left for the next pass; ++ accounts for the replacement character
michael@0: 
michael@0:         mUnicodeDecoder->Reset();
michael@0: 
michael@0:         if(((uint32_t) (srcLength + 1)) > aLen) { // never step past the end of the source bytes
michael@0:           srcLength = aLen;
michael@0:         }
michael@0:         else {
michael@0:           ++srcLength; // also skip the byte that caused the failure
michael@0:         }
michael@0: 
michael@0:         aBuffer += srcLength;
michael@0:         aLen -= srcLength;
michael@0:       }
michael@0:     } while (NS_FAILED(res) && (aLen > 0)); // retry only after a failure and while source bytes remain
michael@0: 
michael@0:     buffer->SetDataLength(totalChars);
michael@0:     // Don't propagate return code of unicode decoder
michael@0:     // since it doesn't reflect on our success or failure
michael@0:     // - Ref. bug 87110
michael@0:     res = NS_OK; 
michael@0:     if (!AppendToBuffer(buffer, aRequest, errorPos))
michael@0:       res = NS_ERROR_OUT_OF_MEMORY;
michael@0:   }
michael@0:   else {
michael@0:     NS_WARNING("No decoder found.");
michael@0:     res = NS_ERROR_FAILURE;
michael@0:   }
michael@0: 
michael@0:   return res;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Reads the character at the current position and advances past it.
michael@0:  *
michael@0:  *  @param   aChar receives the character, or 0 when nothing is left
michael@0:  *  @return  NS_OK, or kEOF when there is no buffer or the current
michael@0:  *           position is already at the end of the buffered data
michael@0:  */
michael@0: nsresult nsScanner::GetChar(char16_t& aChar)
michael@0: {
michael@0:   const bool atEnd = !mSlidingBuffer || mCurrentPosition == mEndPosition;
michael@0:   if (atEnd) {
michael@0:     aChar = 0;
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   aChar = *mCurrentPosition;
michael@0:   ++mCurrentPosition;
michael@0:   --mCountRemaining;
michael@0: 
michael@0:   return NS_OK;
michael@0: }
michael@0: 
michael@0: 
michael@0: /**
michael@0:  *  Looks at a character in the scanner's input buffer without consuming
michael@0:  *  anything.
michael@0:  *
michael@0:  *  @param   aChar   receives the character; set to 0 whenever kEOF is
michael@0:  *                   returned
michael@0:  *  @param   aOffset how many characters past the current position to look
michael@0:  *  @return  NS_OK, or kEOF when there is no buffer, the current position
michael@0:  *           is at the end, or fewer than aOffset + 1 characters remain
michael@0:  */
michael@0: nsresult nsScanner::Peek(char16_t& aChar, uint32_t aOffset)
michael@0: {
michael@0:   aChar = 0;
michael@0: 
michael@0:   if (!mSlidingBuffer || mCurrentPosition == mEndPosition) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   if (aOffset == 0) {
michael@0:     aChar = *mCurrentPosition;
michael@0:     return NS_OK;
michael@0:   }
michael@0: 
michael@0:   if (mCountRemaining <= aOffset) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   // Look ahead on a copy so that the read position stays put.
michael@0:   nsScannerIterator lookahead = mCurrentPosition;
michael@0:   lookahead.advance(aOffset);
michael@0:   aChar = *lookahead;
michael@0: 
michael@0:   return NS_OK;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Copies up to aNumChars characters, starting aOffset characters past
michael@0:  *  the current position, into aStr without consuming them.  When fewer
michael@0:  *  characters remain, the copy stops at the end of the buffered data.
michael@0:  *
michael@0:  *  @param   aStr      receives the copy (via CopyUnicodeTo)
michael@0:  *  @param   aNumChars number of characters wanted
michael@0:  *  @param   aOffset   distance of the first wanted character from the
michael@0:  *                     current position
michael@0:  *  @return  NS_OK, or kEOF when there is no buffer, the current position
michael@0:  *           is at the end, or no more than aOffset characters remain
michael@0:  */
michael@0: nsresult nsScanner::Peek(nsAString& aStr, int32_t aNumChars, int32_t aOffset)
michael@0: {
michael@0:   if (!mSlidingBuffer || mCurrentPosition == mEndPosition) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   if ((int32_t)mCountRemaining <= aOffset) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   nsScannerIterator first = mCurrentPosition;
michael@0:   if (aOffset > 0) {
michael@0:     first.advance(aOffset);
michael@0:   }
michael@0: 
michael@0:   nsScannerIterator last;
michael@0:   const bool enoughLeft = !(mCountRemaining < uint32_t(aNumChars + aOffset));
michael@0:   if (enoughLeft) {
michael@0:     last = first;
michael@0:     last.advance(aNumChars);
michael@0:   }
michael@0:   else {
michael@0:     // Not enough data for the full request: clamp to what is buffered.
michael@0:     last = mEndPosition;
michael@0:   }
michael@0: 
michael@0:   CopyUnicodeTo(first, last, aStr);
michael@0: 
michael@0:   return NS_OK;
michael@0: }
michael@0: 
michael@0: 
michael@0: /**
michael@0:  *  Advances the read position past spaces, tabs, CRs and LFs.
michael@0:  *
michael@0:  *  @param   aNewlinesSkipped incremented (never reset) once per newline
michael@0:  *           skipped; a CR LF or LF CR pair counts as a single newline
michael@0:  *  @return  kEOF when there is no buffer; the failure from Peek() when
michael@0:  *           nothing can be read; kEOF when skipping ran into the end of
michael@0:  *           the buffered data; otherwise the (successful) Peek() result
michael@0:  */
michael@0: nsresult nsScanner::SkipWhitespace(int32_t& aNewlinesSkipped)
michael@0: {
michael@0:   if (!mSlidingBuffer) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   char16_t ch = 0;
michael@0:   nsresult rv = Peek(ch);
michael@0:   if (NS_FAILED(rv)) {
michael@0:     return rv;
michael@0:   }
michael@0: 
michael@0:   nsScannerIterator cursor = mCurrentPosition;
michael@0:   bool movedAhead = false;
michael@0: 
michael@0:   while (cursor != mEndPosition) {
michael@0:     const bool isNewline = (ch == '\n' || ch == '\r');
michael@0:     if (!isNewline && ch != ' ' && ch != '\t') {
michael@0:       break; // first non-whitespace character
michael@0:     }
michael@0: 
michael@0:     if (isNewline) {
michael@0:       ++aNewlinesSkipped;
michael@0:     }
michael@0:     movedAhead = true;
michael@0: 
michael@0:     const char16_t previous = ch;
michael@0:     ++cursor;
michael@0:     ch = (cursor != mEndPosition) ? *cursor : '\0';
michael@0: 
michael@0:     const bool isPair = (previous == '\r' && ch == '\n') ||
michael@0:                         (previous == '\n' && ch == '\r');
michael@0:     if (isPair) {
michael@0:       // CRLF == LFCR => LF: swallow the second half of the pair as well.
michael@0:       ++cursor;
michael@0:       ch = (cursor != mEndPosition) ? *cursor : '\0';
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   if (movedAhead) {
michael@0:     SetPosition(cursor);
michael@0:     if (cursor == mEndPosition) {
michael@0:       rv = kEOF;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   return rv;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Consumes characters for as long as they equal aSkipChar.
michael@0:  *
michael@0:  *  @param   aSkipChar the character to skip over
michael@0:  *  @return  kEOF when there is no buffer; otherwise the result of the
michael@0:  *           last Peek(): NS_OK when a different character stopped the
michael@0:  *           run, or Peek()'s own result when it did not return NS_OK
michael@0:  */
michael@0: nsresult nsScanner::SkipOver(char16_t aSkipChar)
michael@0: {
michael@0:   if (!mSlidingBuffer) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   char16_t next = 0;
michael@0:   for (;;) {
michael@0:     nsresult rv = Peek(next);
michael@0:     if (NS_OK != rv || next != aSkipChar) {
michael@0:       return rv;
michael@0:     }
michael@0:     GetChar(next); // consume the matching character
michael@0:   }
michael@0: }
michael@0: 
michael@0: #if 0 // Disabled debugging helpers; the only references left in this file are commented-out "//DoErrTest(aString);" calls.
michael@0: void DoErrTest(nsString& aString) {
michael@0:   int32_t pos=aString.FindChar(0); // look for an embedded NUL
michael@0:   if(kNotFound<pos) {
michael@0:     if(aString.Length()-1!=pos) { // NUL found somewhere other than the last position; the body is empty
michael@0:     }
michael@0:   }
michael@0: }
michael@0: 
michael@0: void DoErrTest(nsCString& aString) {
michael@0:   int32_t pos=aString.FindChar(0); // same check for the narrow-string overload
michael@0:   if(kNotFound<pos) {
michael@0:     if(aString.Length()-1!=pos) {
michael@0:     }
michael@0:   }
michael@0: }
michael@0: #endif // 0
michael@0: 
michael@0: /**
michael@0:  *  Consumes characters up to (not including) the first whitespace
michael@0:  *  character (LF, CR, space, tab, VT, FF), '<', '>' or '/'.  NUL
michael@0:  *  characters met on the way are replaced in the buffer with sInvalid.
michael@0:  *
michael@0:  *  @param   aString receives the consumed characters (appended)
michael@0:  *  @return  kEOF when there is no buffer or the end of the buffered data
michael@0:  *           was reached before a delimiter; otherwise the result of the
michael@0:  *           initial Peek()
michael@0:  */
michael@0: nsresult nsScanner::ReadTagIdentifier(nsScannerSharedSubstring& aString)
michael@0: {
michael@0:   if (!mSlidingBuffer) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   char16_t ch = 0;
michael@0:   nsresult rv = Peek(ch);
michael@0: 
michael@0:   nsScannerIterator cursor = mCurrentPosition;
michael@0:   nsScannerIterator limit = mEndPosition;
michael@0: 
michael@0:   // Walk forward until a delimiter shows up; the text is appended in one
michael@0:   // go afterwards.
michael@0:   while (cursor != limit) {
michael@0:     ch = *cursor;
michael@0: 
michael@0:     const bool isDelimiter =
michael@0:       ch == '\n' || ch == '\r' || ch == ' ' || ch == '\t' ||
michael@0:       ch == '\v' || ch == '\f' || ch == '<' || ch == '>' || ch == '/';
michael@0:     if (isDelimiter) {
michael@0:       break;
michael@0:     }
michael@0: 
michael@0:     if (ch == '\0') {
michael@0:       ReplaceCharacter(cursor, sInvalid);
michael@0:     }
michael@0:     ++cursor;
michael@0:   }
michael@0: 
michael@0:   // Don't bother appending nothing.
michael@0:   if (cursor != mCurrentPosition) {
michael@0:     AppendUnicodeTo(mCurrentPosition, cursor, aString);
michael@0:   }
michael@0: 
michael@0:   SetPosition(cursor);
michael@0:   if (cursor == limit) {
michael@0:     rv = kEOF;
michael@0:   }
michael@0: 
michael@0:   //DoErrTest(aString);
michael@0: 
michael@0:   return rv;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Consumes characters until one turns up that is not valid in an entity
michael@0:  *  name.  Valid characters are ASCII letters, ASCII digits, '_', '-' and
michael@0:  *  '.'; NUL characters do not stop the scan either.
michael@0:  *
michael@0:  *  @param   aString receives the consumed characters (appended)
michael@0:  *  @return  kEOF when there is no buffer or the end of the buffered data
michael@0:  *           was reached; otherwise the result of the initial Peek()
michael@0:  */
michael@0: nsresult nsScanner::ReadEntityIdentifier(nsString& aString)
michael@0: {
michael@0:   if (!mSlidingBuffer) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   char16_t ch = 0;
michael@0:   nsresult rv = Peek(ch);
michael@0: 
michael@0:   nsScannerIterator origin = mCurrentPosition;
michael@0:   nsScannerIterator cursor = mCurrentPosition;
michael@0:   nsScannerIterator limit = mEndPosition;
michael@0: 
michael@0:   for (; cursor != limit; ++cursor) {
michael@0:     ch = *cursor;
michael@0:     if (!ch) {
michael@0:       continue; // NULs are passed over without ending the name
michael@0:     }
michael@0: 
michael@0:     // Don't allow ':' in entity names.  See bug 23791
michael@0:     const bool isNameChar = ch == '_' || ch == '-' || ch == '.' ||
michael@0:                             ('a' <= ch && ch <= 'z') ||
michael@0:                             ('A' <= ch && ch <= 'Z') ||
michael@0:                             ('0' <= ch && ch <= '9');
michael@0:     if (!isNameChar) {
michael@0:       AppendUnicodeTo(mCurrentPosition, cursor, aString);
michael@0:       break;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   SetPosition(cursor);
michael@0:   if (cursor == limit) {
michael@0:     // Ran out of data: hand back everything that was scanned.
michael@0:     AppendUnicodeTo(origin, cursor, aString);
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   //DoErrTest(aString);
michael@0: 
michael@0:   return rv;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Consumes a run of digits.  NUL characters do not stop the scan.
michael@0:  *
michael@0:  *  @param   aString receives the consumed characters (appended)
michael@0:  *  @param   aBase   10 or 16 (asserted); with 16 the letters A-F and a-f
michael@0:  *                   count as digits too
michael@0:  *  @return  kEOF when there is no buffer or the end of the buffered data
michael@0:  *           was reached; otherwise the result of the initial Peek()
michael@0:  */
michael@0: nsresult nsScanner::ReadNumber(nsString& aString,int32_t aBase)
michael@0: {
michael@0:   if (!mSlidingBuffer) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   NS_ASSERTION(aBase == 10 || aBase == 16,"base value not supported");
michael@0: 
michael@0:   char16_t ch = 0;
michael@0:   nsresult rv = Peek(ch);
michael@0: 
michael@0:   nsScannerIterator origin = mCurrentPosition;
michael@0:   nsScannerIterator cursor = origin;
michael@0:   nsScannerIterator limit = mEndPosition;
michael@0: 
michael@0:   for (; cursor != limit; ++cursor) {
michael@0:     ch = *cursor;
michael@0:     if (!ch) {
michael@0:       continue; // NULs are passed over without ending the number
michael@0:     }
michael@0: 
michael@0:     const bool isDecimal = ('0' <= ch && ch <= '9');
michael@0:     const bool isHexLetter = (aBase == 16) &&
michael@0:                              (('A' <= ch && ch <= 'F') ||
michael@0:                               ('a' <= ch && ch <= 'f'));
michael@0:     if (!isDecimal && !isHexLetter) {
michael@0:       AppendUnicodeTo(origin, cursor, aString);
michael@0:       break;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   SetPosition(cursor);
michael@0:   if (cursor == limit) {
michael@0:     // Ran out of data: hand back everything that was scanned.
michael@0:     AppendUnicodeTo(origin, cursor, aString);
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   //DoErrTest(aString);
michael@0: 
michael@0:   return rv;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Consumes a run of whitespace (space, tab, CR, LF) and appends it to
michael@0:  *  aString.  A CR that is not part of a CR LF / LF CR pair gets an LF
michael@0:  *  appended after it, so lone CRs come out as CRLF.
michael@0:  *
michael@0:  *  @param   aString          receives the whitespace (appended)
michael@0:  *  @param   aNewlinesSkipped incremented (never reset) once per newline;
michael@0:  *                            a CR LF or LF CR pair counts once
michael@0:  *  @param   aHaveCR          set to true when a newline pair or a lone CR
michael@0:  *                            was seen; false otherwise, including on the
michael@0:  *                            early-return paths
michael@0:  *  @return  kEOF when there is no buffer; the failure from Peek() when
michael@0:  *           nothing can be read; kEOF when the end of the buffered data
michael@0:  *           was reached; otherwise the (successful) Peek() result
michael@0:  */
michael@0: nsresult nsScanner::ReadWhitespace(nsScannerSharedSubstring& aString,
michael@0:                                    int32_t& aNewlinesSkipped,
michael@0:                                    bool& aHaveCR)
michael@0: {
michael@0:   aHaveCR = false;
michael@0: 
michael@0:   if (!mSlidingBuffer) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   char16_t ch = 0;
michael@0:   nsresult rv = Peek(ch);
michael@0:   if (NS_FAILED(rv)) {
michael@0:     return rv;
michael@0:   }
michael@0: 
michael@0:   // runStart marks the beginning of the text that still has to be appended.
michael@0:   nsScannerIterator runStart = mCurrentPosition;
michael@0:   nsScannerIterator cursor = runStart;
michael@0:   nsScannerIterator limit = mEndPosition;
michael@0:   bool sawCR = false;
michael@0: 
michael@0:   while (cursor != limit) {
michael@0:     if (ch == '\n' || ch == '\r') {
michael@0:       ++aNewlinesSkipped;
michael@0: 
michael@0:       const char16_t previous = ch;
michael@0:       ++cursor;
michael@0:       ch = (cursor != limit) ? *cursor : '\0';
michael@0: 
michael@0:       const bool isPair = (previous == '\r' && ch == '\n') ||
michael@0:                           (previous == '\n' && ch == '\r');
michael@0:       if (isPair) {
michael@0:         // CRLF == LFCR => LF
michael@0:         ++cursor;
michael@0:         ch = (cursor != limit) ? *cursor : '\0';
michael@0:         sawCR = true;
michael@0:       } else if (previous == '\r') {
michael@0:         // Lone CR becomes CRLF; callers should know to remove extra CRs
michael@0:         AppendUnicodeTo(runStart, cursor, aString);
michael@0:         aString.writable().Append(char16_t('\n'));
michael@0:         runStart = cursor;
michael@0:         sawCR = true;
michael@0:       }
michael@0:     } else if (ch == ' ' || ch == '\t') {
michael@0:       ++cursor;
michael@0:       ch = (cursor != limit) ? *cursor : '\0';
michael@0:     } else {
michael@0:       // First non-whitespace character: flush what is pending and stop.
michael@0:       AppendUnicodeTo(runStart, cursor, aString);
michael@0:       break;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   SetPosition(cursor);
michael@0:   if (cursor == limit) {
michael@0:     AppendUnicodeTo(runStart, cursor, aString);
michael@0:     rv = kEOF;
michael@0:   }
michael@0: 
michael@0:   aHaveCR = sawCR;
michael@0:   return rv;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Consumes a run of whitespace (space, tab, CR, LF) and reports it as an
michael@0:  *  iterator range instead of copying it.
michael@0:  *
michael@0:  *  XXXbz callers of this have to manage their lone '\r' themselves if they
michael@0:  *  want it to work.  Good thing they're all in view-source and it deals.
michael@0:  *
michael@0:  *  @param   aStart           set to the start of the whitespace run
michael@0:  *  @param   aEnd             set to the position just past the run
michael@0:  *  @param   aNewlinesSkipped incremented (never reset) once per newline;
michael@0:  *                            a CR LF or LF CR pair counts once
michael@0:  *  @return  kEOF when there is no buffer; the failure from Peek() when
michael@0:  *           nothing can be read (aStart/aEnd are untouched in both
michael@0:  *           cases); kEOF when the end of the buffered data was reached;
michael@0:  *           otherwise the (successful) Peek() result
michael@0:  */
michael@0: nsresult nsScanner::ReadWhitespace(nsScannerIterator& aStart, 
michael@0:                                    nsScannerIterator& aEnd,
michael@0:                                    int32_t& aNewlinesSkipped)
michael@0: {
michael@0:   if (!mSlidingBuffer) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   char16_t ch = 0;
michael@0:   nsresult rv = Peek(ch);
michael@0:   if (NS_FAILED(rv)) {
michael@0:     return rv;
michael@0:   }
michael@0: 
michael@0:   nsScannerIterator runStart = mCurrentPosition;
michael@0:   nsScannerIterator cursor = runStart;
michael@0:   nsScannerIterator limit = mEndPosition;
michael@0: 
michael@0:   while (cursor != limit) {
michael@0:     const bool isNewline = (ch == '\n' || ch == '\r');
michael@0:     if (!isNewline && ch != ' ' && ch != '\t') {
michael@0:       // First non-whitespace character: report the run and stop.
michael@0:       aStart = runStart;
michael@0:       aEnd = cursor;
michael@0:       break;
michael@0:     }
michael@0: 
michael@0:     if (isNewline) {
michael@0:       ++aNewlinesSkipped;
michael@0:     }
michael@0: 
michael@0:     const char16_t previous = ch;
michael@0:     ++cursor;
michael@0:     ch = (cursor != limit) ? *cursor : '\0';
michael@0: 
michael@0:     const bool isPair = (previous == '\r' && ch == '\n') ||
michael@0:                         (previous == '\n' && ch == '\r');
michael@0:     if (isPair) {
michael@0:       // CRLF == LFCR => LF
michael@0:       ++cursor;
michael@0:       ch = (cursor != limit) ? *cursor : '\0';
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   SetPosition(cursor);
michael@0:   if (cursor == limit) {
michael@0:     aStart = runStart;
michael@0:     aEnd = cursor;
michael@0:     rv = kEOF;
michael@0:   }
michael@0: 
michael@0:   return rv;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Consumes characters until one of the terminators in aEndCondition is
michael@0:  *  found.  NUL characters met on the way are replaced in the buffer with
michael@0:  *  sInvalid.
michael@0:  *
michael@0:  *  @param   aString       receives the consumed characters (appended)
michael@0:  *  @param   aEndCondition terminator set plus its precomputed bit filter
michael@0:  *  @param   addTerminal   true: the terminator is consumed and appended
michael@0:  *                         too; false: the scanner stops in front of it
michael@0:  *  @return  NS_OK when a terminator was found; kEOF when there is no
michael@0:  *           buffer or the buffered data ended first (everything scanned
michael@0:  *           is still appended); the failure from Peek() when nothing can
michael@0:  *           be read
michael@0:  */
michael@0: nsresult nsScanner::ReadUntil(nsAString& aString,
michael@0:                               const nsReadEndCondition& aEndCondition,
michael@0:                               bool addTerminal)
michael@0: {
michael@0:   if (!mSlidingBuffer) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   nsScannerIterator origin = mCurrentPosition;
michael@0:   nsScannerIterator cursor = origin;
michael@0: 
michael@0:   char16_t ch = 0;
michael@0:   nsresult rv = Peek(ch);
michael@0:   if (NS_FAILED(rv)) {
michael@0:     return rv;
michael@0:   }
michael@0: 
michael@0:   for (; cursor != mEndPosition; ++cursor) {
michael@0:     ch = *cursor;
michael@0:     if (ch == '\0') {
michael@0:       ReplaceCharacter(cursor, sInvalid);
michael@0:       ch = sInvalid;
michael@0:     }
michael@0: 
michael@0:     // Cheap rejection: a character with a bit set that no terminator has
michael@0:     // cannot be a terminator.
michael@0:     if (ch & aEndCondition.mFilter) {
michael@0:       continue;
michael@0:     }
michael@0: 
michael@0:     // All bits are in the required area, so do the thorough check.
michael@0:     for (const char16_t* term = aEndCondition.mChars; *term; ++term) {
michael@0:       if (*term != ch) {
michael@0:         continue;
michael@0:       }
michael@0: 
michael@0:       if (addTerminal) {
michael@0:         ++cursor;
michael@0:       }
michael@0:       AppendUnicodeTo(origin, cursor, aString);
michael@0:       SetPosition(cursor);
michael@0: 
michael@0:       //DoErrTest(aString);
michael@0: 
michael@0:       return NS_OK;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   // No terminator anywhere in the buffered data; cursor == mEndPosition.
michael@0:   SetPosition(cursor);
michael@0:   AppendUnicodeTo(origin, cursor, aString);
michael@0:   return kEOF;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Same scan as the nsAString overload of ReadUntil, but the consumed
michael@0:  *  characters are appended to an nsScannerSharedSubstring.  NUL characters
michael@0:  *  met on the way are replaced in the buffer with sInvalid.
michael@0:  *
michael@0:  *  @param   aString       receives the consumed characters (appended)
michael@0:  *  @param   aEndCondition terminator set plus its precomputed bit filter
michael@0:  *  @param   addTerminal   true: the terminator is consumed and appended
michael@0:  *                         too; false: the scanner stops in front of it
michael@0:  *  @return  NS_OK when a terminator was found; kEOF when there is no
michael@0:  *           buffer or the buffered data ended first (everything scanned
michael@0:  *           is still appended); the failure from Peek() when nothing can
michael@0:  *           be read
michael@0:  */
michael@0: nsresult nsScanner::ReadUntil(nsScannerSharedSubstring& aString,
michael@0:                               const nsReadEndCondition& aEndCondition,
michael@0:                               bool addTerminal)
michael@0: {
michael@0:   if (!mSlidingBuffer) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   nsScannerIterator origin = mCurrentPosition;
michael@0:   nsScannerIterator cursor = origin;
michael@0: 
michael@0:   char16_t ch = 0;
michael@0:   nsresult rv = Peek(ch);
michael@0:   if (NS_FAILED(rv)) {
michael@0:     return rv;
michael@0:   }
michael@0: 
michael@0:   for (; cursor != mEndPosition; ++cursor) {
michael@0:     ch = *cursor;
michael@0:     if (ch == '\0') {
michael@0:       ReplaceCharacter(cursor, sInvalid);
michael@0:       ch = sInvalid;
michael@0:     }
michael@0: 
michael@0:     // Cheap rejection: a character with a bit set that no terminator has
michael@0:     // cannot be a terminator.
michael@0:     if (ch & aEndCondition.mFilter) {
michael@0:       continue;
michael@0:     }
michael@0: 
michael@0:     // All bits are in the required area, so do the thorough check.
michael@0:     for (const char16_t* term = aEndCondition.mChars; *term; ++term) {
michael@0:       if (*term != ch) {
michael@0:         continue;
michael@0:       }
michael@0: 
michael@0:       if (addTerminal) {
michael@0:         ++cursor;
michael@0:       }
michael@0:       AppendUnicodeTo(origin, cursor, aString);
michael@0:       SetPosition(cursor);
michael@0: 
michael@0:       //DoErrTest(aString);
michael@0: 
michael@0:       return NS_OK;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   // No terminator anywhere in the buffered data; cursor == mEndPosition.
michael@0:   SetPosition(cursor);
michael@0:   AppendUnicodeTo(origin, cursor, aString);
michael@0:   return kEOF;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Consumes characters up to the first one that belongs to the terminator
michael@0:  *  set of aEndCondition and reports the consumed range through a pair of
michael@0:  *  iterators instead of copying it.
michael@0:  *
michael@0:  *  @param   aStart set to the position reading started from
michael@0:  *  @param   aEnd set to the position reading stopped at
michael@0:  *  @param   aEndCondition terminator set plus its precomputed bit filter
michael@0:  *  @param   addTerminal if true, the terminator itself is consumed and
michael@0:  *           included in the range
michael@0:  *  @return  NS_OK when a terminator was found, kEOF when there is no buffer
michael@0:  *           (aStart/aEnd untouched) or the buffered data ran out first,
michael@0:  *           otherwise the failure code reported by Peek() (aStart and
michael@0:  *           aEnd then both equal the current position)
michael@0:  */
michael@0: nsresult nsScanner::ReadUntil(nsScannerIterator& aStart, 
michael@0:                               nsScannerIterator& aEnd,
michael@0:                               const nsReadEndCondition &aEndCondition,
michael@0:                               bool addTerminal)
michael@0: {
michael@0:   if (!mSlidingBuffer) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   nsScannerIterator start(mCurrentPosition);
michael@0:   nsScannerIterator cursor(start);
michael@0: 
michael@0:   char16_t ch = 0;
michael@0:   nsresult rv = Peek(ch);
michael@0:   if (NS_FAILED(rv)) {
michael@0:     // Hand back an empty range anchored at the current position.
michael@0:     aStart = aEnd = cursor;
michael@0:     return rv;
michael@0:   }
michael@0: 
michael@0:   for (; cursor != mEndPosition; ++cursor) {
michael@0:     ch = *cursor;
michael@0:     if (ch == '\0') {
michael@0:       // NUL characters are rewritten in the buffer before being examined.
michael@0:       ReplaceCharacter(cursor, sInvalid);
michael@0:       ch = sInvalid;
michael@0:     }
michael@0: 
michael@0:     // The filter holds the bits that no terminator has set, so any overlap
michael@0:     // means this character cannot be a terminator.
michael@0:     if (ch & aEndCondition.mFilter) {
michael@0:       continue;
michael@0:     }
michael@0: 
michael@0:     // Passed the cheap filter; compare against every terminator.
michael@0:     for (const char16_t* term = aEndCondition.mChars; *term; ++term) {
michael@0:       if (*term != ch) {
michael@0:         continue;
michael@0:       }
michael@0: 
michael@0:       if (addTerminal) {
michael@0:         ++cursor;
michael@0:       }
michael@0:       aStart = start;
michael@0:       aEnd = cursor;
michael@0:       SetPosition(cursor);
michael@0: 
michael@0:       return NS_OK;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   // No terminator anywhere in the buffered data; cursor == mEndPosition.
michael@0:   SetPosition(cursor);
michael@0:   aStart = start;
michael@0:   aEnd = cursor;
michael@0:   return kEOF;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Consumes chars until the given terminal char is seen, appending what
michael@0:  *  was consumed to aString.
michael@0:  *  
michael@0:  *  @update  gess 3/25/98
michael@0:  *  @param   aString receives the characters that were read
michael@0:  *  @param   aTerminalChar the single character that stops the read
michael@0:  *  @param   addTerminal if true, the terminal char itself is consumed and
michael@0:  *           appended as well
michael@0:  *  @return  NS_OK when the terminal char was found, kEOF when there is no
michael@0:  *           buffer or the buffered data ran out first, otherwise the
michael@0:  *           failure code reported by Peek()
michael@0:  */
michael@0: nsresult nsScanner::ReadUntil(nsAString& aString,
michael@0:                               char16_t aTerminalChar,
michael@0:                               bool addTerminal)
michael@0: {
michael@0:   if (!mSlidingBuffer) {
michael@0:     return kEOF;
michael@0:   }
michael@0: 
michael@0:   nsScannerIterator start(mCurrentPosition);
michael@0:   nsScannerIterator cursor(start);
michael@0: 
michael@0:   // Bail out early if nothing can be peeked at the current position.
michael@0:   char16_t ch;
michael@0:   nsresult rv = Peek(ch);
michael@0:   if (NS_FAILED(rv)) {
michael@0:     return rv;
michael@0:   }
michael@0: 
michael@0:   for (; cursor != mEndPosition; ++cursor) {
michael@0:     ch = *cursor;
michael@0:     if (ch == '\0') {
michael@0:       // NUL characters are rewritten in the buffer before being compared.
michael@0:       ReplaceCharacter(cursor, sInvalid);
michael@0:       ch = sInvalid;
michael@0:     }
michael@0: 
michael@0:     if (ch != aTerminalChar) {
michael@0:       continue;
michael@0:     }
michael@0: 
michael@0:     if (addTerminal) {
michael@0:       ++cursor;
michael@0:     }
michael@0:     AppendUnicodeTo(start, cursor, aString);
michael@0:     SetPosition(cursor);
michael@0:     return NS_OK;
michael@0:   }
michael@0: 
michael@0:   // The terminal char never showed up; cursor == mEndPosition.
michael@0:   AppendUnicodeTo(start, cursor, aString);
michael@0:   SetPosition(cursor);
michael@0:   return kEOF;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Rebinds aSubstring to the range [aStart, aEnd) of the scanner's sliding
michael@0:  *  buffer. The buffer is dereferenced without a null check.
michael@0:  */
michael@0: void nsScanner::BindSubstring(nsScannerSubstring& aSubstring,
michael@0:                               const nsScannerIterator& aStart,
michael@0:                               const nsScannerIterator& aEnd)
michael@0: {
michael@0:   aSubstring.Rebind(*mSlidingBuffer, aStart, aEnd);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Copies the scanner's current read position into aPosition.
michael@0:  */
michael@0: void nsScanner::CurrentPosition(nsScannerIterator& aPosition)
michael@0: {
michael@0:   aPosition = mCurrentPosition;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Copies the scanner's end-of-data position into aPosition.
michael@0:  */
michael@0: void nsScanner::EndReading(nsScannerIterator& aPosition)
michael@0: {
michael@0:   aPosition = mEndPosition;
michael@0: }
michael@0:  
michael@0: /**
michael@0:  *  Moves the current read position to aPosition and keeps mCountRemaining
michael@0:  *  in step. Does nothing when there is no sliding buffer.
michael@0:  *
michael@0:  *  @param   aPosition the new current position
michael@0:  *  @param   aTerminate if true and the new position is the end of the
michael@0:  *           data, the mark is moved there and the consumed prefix of the
michael@0:  *           buffer is discarded
michael@0:  *  @param   aReverse must be true when aPosition lies before the current
michael@0:  *           position, so the remaining count grows instead of shrinking
michael@0:  */
michael@0: void nsScanner::SetPosition(nsScannerIterator& aPosition, bool aTerminate, bool aReverse)
michael@0: {
michael@0:   if (!mSlidingBuffer) {
michael@0:     return;
michael@0:   }
michael@0: 
michael@0: #ifdef DEBUG
michael@0:   // Only needed by the sanity assertion below.
michael@0:   uint32_t remainingBefore = mCountRemaining;
michael@0: #endif
michael@0: 
michael@0:   if (aReverse) {
michael@0:     mCountRemaining += Distance(aPosition, mCurrentPosition);
michael@0:   } else {
michael@0:     mCountRemaining -= Distance(mCurrentPosition, aPosition);
michael@0:   }
michael@0: 
michael@0:   NS_ASSERTION(aReverse ? (mCountRemaining >= remainingBefore)
michael@0:                         : (mCountRemaining <= remainingBefore),
michael@0:                "Improper use of nsScanner::SetPosition. Make sure to set the"
michael@0:                " aReverse parameter correctly");
michael@0: 
michael@0:   mCurrentPosition = aPosition;
michael@0: 
michael@0:   if (!aTerminate) {
michael@0:     return;
michael@0:   }
michael@0:   if (mCurrentPosition == mEndPosition) {
michael@0:     // Everything has been consumed: drop what lies before this point.
michael@0:     mMarkPosition = mCurrentPosition;
michael@0:     mSlidingBuffer->DiscardPrefix(mCurrentPosition);
michael@0:   }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Overwrites the character at aPosition in the sliding buffer with aChar.
michael@0:  *  Does nothing when there is no sliding buffer.
michael@0:  */
michael@0: void nsScanner::ReplaceCharacter(nsScannerIterator& aPosition,
michael@0:                                  char16_t aChar)
michael@0: {
michael@0:   if (!mSlidingBuffer) {
michael@0:     return;
michael@0:   }
michael@0: 
michael@0:   mSlidingBuffer->ReplaceCharacter(aPosition, aChar);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Hands a freshly filled buffer to the scanner. The first call creates the
michael@0:  *  sliding buffer around aBuf; later calls append aBuf to it. In both cases
michael@0:  *  the end position and mCountRemaining are brought up to date.
michael@0:  *
michael@0:  *  @param   aBuf the buffer to take in
michael@0:  *  @param   aRequest not used by this function
michael@0:  *  @param   aErrorPos any value other than -1 marks an invalid character;
michael@0:  *           presumably an offset into aBuf (TODO confirm with callers)
michael@0:  *  @return  always true
michael@0:  */
michael@0: bool nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf,
michael@0:                                  nsIRequest *aRequest,
michael@0:                                  int32_t aErrorPos)
michael@0: {
michael@0:   // Amount of unread data before this buffer was added; used below to
michael@0:   // locate aErrorPos relative to the current position.
michael@0:   uint32_t countRemaining = mCountRemaining;
michael@0:   if (!mSlidingBuffer) {
michael@0:     // A plain new-expression never yields null, so no null check is needed
michael@0:     // after the allocation.
michael@0:     mSlidingBuffer = new nsScannerString(aBuf);
michael@0:     mSlidingBuffer->BeginReading(mCurrentPosition);
michael@0:     mMarkPosition = mCurrentPosition;
michael@0:     mSlidingBuffer->EndReading(mEndPosition);
michael@0:     mCountRemaining = aBuf->DataLength();
michael@0:   }
michael@0:   else {
michael@0:     mSlidingBuffer->AppendBuffer(aBuf);
michael@0:     if (mCurrentPosition == mEndPosition) {
michael@0:       // Everything had been consumed; restart from the beginning of the
michael@0:       // sliding buffer.
michael@0:       mSlidingBuffer->BeginReading(mCurrentPosition);
michael@0:     }
michael@0:     mSlidingBuffer->EndReading(mEndPosition);
michael@0:     mCountRemaining += aBuf->DataLength();
michael@0:   }
michael@0: 
michael@0:   // Only the first invalid character is remembered.
michael@0:   if (aErrorPos != -1 && !mHasInvalidCharacter) {
michael@0:     mHasInvalidCharacter = true;
michael@0:     mFirstInvalidPosition = mCurrentPosition;
michael@0:     mFirstInvalidPosition.advance(countRemaining + aErrorPos);
michael@0:   }
michael@0: 
michael@0:   // While the first non-whitespace offset is still -1, look for it in the
michael@0:   // unread data and store it relative to the current position.
michael@0:   if (mFirstNonWhitespacePosition == -1) {
michael@0:     nsScannerIterator iter(mCurrentPosition);
michael@0:     nsScannerIterator end(mEndPosition);
michael@0: 
michael@0:     while (iter != end) {
michael@0:       if (!nsCRT::IsAsciiSpace(*iter)) {
michael@0:         mFirstNonWhitespacePosition = Distance(mCurrentPosition, iter);
michael@0: 
michael@0:         break;
michael@0:       }
michael@0: 
michael@0:       ++iter;
michael@0:     }
michael@0:   }
michael@0:   return true;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Copies the data the tokenizer has not consumed yet, i.e. everything
michael@0:  *  between the current position and the end of the buffered data, into
michael@0:  *  aCopyBuffer. Without a sliding buffer the result is an empty string.
michael@0:  *  
michael@0:  *  @update  gess 5/12/98
michael@0:  *  @param   aCopyBuffer is where the scanner buffer will be copied to
michael@0:  *  @return  nothing
michael@0:  */
michael@0: void nsScanner::CopyUnusedData(nsString& aCopyBuffer) {
michael@0:   if (mSlidingBuffer) {
michael@0:     nsScannerIterator unreadBegin(mCurrentPosition);
michael@0:     nsScannerIterator unreadEnd(mEndPosition);
michael@0: 
michael@0:     CopyUnicodeTo(unreadBegin, unreadEnd, aCopyBuffer);
michael@0:   } else {
michael@0:     aCopyBuffer.Truncate();
michael@0:   }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Gives access to the name of the file the scanner is reading from.
michael@0:  *  The scanner is not always backed by a real file, in which case this
michael@0:  *  is just whatever name it was given.
michael@0:  *  
michael@0:  *  @update  gess 5/12/98
michael@0:  *  @return  mutable reference to the stored filename
michael@0:  */
michael@0: nsString& nsScanner::GetFilename(void)
michael@0: {
michael@0:   return mFilename;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Self-test hook. It is deliberately a no-op: self-testing for this
michael@0:  *  class occurs in the parser selftest.
michael@0:  *  
michael@0:  *  @update  gess 3/25/98
michael@0:  */
michael@0: 
michael@0: void nsScanner::SelfTest(void)
michael@0: {
michael@0:   // Nothing to do here.
michael@0: }
michael@0: 
michael@0: /**
michael@0:  *  Stores a new replacement character and, if an invalid character has
michael@0:  *  already been recorded, rewrites the character at the first invalid
michael@0:  *  position with it.
michael@0:  */
michael@0: void nsScanner::OverrideReplacementCharacter(char16_t aReplacementCharacter)
michael@0: {
michael@0:   mReplacementCharacter = aReplacementCharacter;
michael@0: 
michael@0:   if (!mHasInvalidCharacter) {
michael@0:     return;
michael@0:   }
michael@0: 
michael@0:   ReplaceCharacter(mFirstInvalidPosition, mReplacementCharacter);
michael@0: }
michael@0: