michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* vim: set ts=2 sw=2 et tw=78: */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: //#define __INCREMENTAL 1 michael@0: michael@0: #include "mozilla/DebugOnly.h" michael@0: michael@0: #include "nsScanner.h" michael@0: #include "nsDebug.h" michael@0: #include "nsReadableUtils.h" michael@0: #include "nsIInputStream.h" michael@0: #include "nsIFile.h" michael@0: #include "nsNetUtil.h" michael@0: #include "nsUTF8Utils.h" // for LossyConvertEncoding michael@0: #include "nsCRT.h" michael@0: #include "nsParser.h" michael@0: #include "nsCharsetSource.h" michael@0: michael@0: #include "mozilla/dom/EncodingUtils.h" michael@0: michael@0: using mozilla::dom::EncodingUtils; michael@0: michael@0: // We replace NUL characters with this character. michael@0: static char16_t sInvalid = UCS2_REPLACEMENT_CHAR; michael@0: michael@0: nsReadEndCondition::nsReadEndCondition(const char16_t* aTerminateChars) : michael@0: mChars(aTerminateChars), mFilter(char16_t(~0)) // All bits set michael@0: { michael@0: // Build filter that will be used to filter out characters with michael@0: // bits that none of the terminal chars have. This works very well michael@0: // because terminal chars often have only the last 4-6 bits set and michael@0: // normal ascii letters have bit 7 set. Other letters have even higher michael@0: // bits set. michael@0: michael@0: // Calculate filter michael@0: const char16_t *current = aTerminateChars; michael@0: char16_t terminalChar = *current; michael@0: while (terminalChar) { michael@0: mFilter &= ~terminalChar; michael@0: ++current; michael@0: terminalChar = *current; michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * Use this constructor if you want i/o to be based on michael@0: * a single string you hand in during construction. michael@0: * This short cut was added for Javascript. michael@0: * michael@0: * @update gess 5/12/98 michael@0: * @param aMode represents the parser mode (nav, other) michael@0: * @return michael@0: */ michael@0: nsScanner::nsScanner(const nsAString& anHTMLString) michael@0: { michael@0: MOZ_COUNT_CTOR(nsScanner); michael@0: michael@0: mSlidingBuffer = nullptr; michael@0: mCountRemaining = 0; michael@0: mFirstNonWhitespacePosition = -1; michael@0: if (AppendToBuffer(anHTMLString)) { michael@0: mSlidingBuffer->BeginReading(mCurrentPosition); michael@0: } else { michael@0: /* XXX see hack below, re: bug 182067 */ michael@0: memset(&mCurrentPosition, 0, sizeof(mCurrentPosition)); michael@0: mEndPosition = mCurrentPosition; michael@0: } michael@0: mMarkPosition = mCurrentPosition; michael@0: mIncremental = false; michael@0: mUnicodeDecoder = 0; michael@0: mCharsetSource = kCharsetUninitialized; michael@0: mHasInvalidCharacter = false; michael@0: mReplacementCharacter = char16_t(0x0); michael@0: } michael@0: michael@0: /** michael@0: * Use this constructor if you want i/o to be based on strings michael@0: * the scanner receives. If you pass a null filename, you michael@0: * can still provide data to the scanner via append. michael@0: */ michael@0: nsScanner::nsScanner(nsString& aFilename, bool aCreateStream) michael@0: : mFilename(aFilename) michael@0: { michael@0: MOZ_COUNT_CTOR(nsScanner); michael@0: NS_ASSERTION(!aCreateStream, "This is always true."); michael@0: michael@0: mSlidingBuffer = nullptr; michael@0: michael@0: // XXX This is a big hack. We need to initialize the iterators to something. michael@0: // What matters is that mCurrentPosition == mEndPosition, so that our methods michael@0: // believe that we are at EOF (see bug 182067). We null out mCurrentPosition michael@0: // so that we have some hope of catching null pointer dereferences associated michael@0: // with this hack. --darin michael@0: memset(&mCurrentPosition, 0, sizeof(mCurrentPosition)); michael@0: mMarkPosition = mCurrentPosition; michael@0: mEndPosition = mCurrentPosition; michael@0: michael@0: mIncremental = true; michael@0: mFirstNonWhitespacePosition = -1; michael@0: mCountRemaining = 0; michael@0: michael@0: mUnicodeDecoder = 0; michael@0: mCharsetSource = kCharsetUninitialized; michael@0: mHasInvalidCharacter = false; michael@0: mReplacementCharacter = char16_t(0x0); michael@0: // XML defaults to UTF-8 and about:blank is UTF-8, too. michael@0: SetDocumentCharset(NS_LITERAL_CSTRING("UTF-8"), kCharsetFromDocTypeDefault); michael@0: } michael@0: michael@0: nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSource) michael@0: { michael@0: if (aSource < mCharsetSource) // priority is lower than the current one michael@0: return NS_OK; michael@0: michael@0: mCharsetSource = aSource; michael@0: michael@0: nsCString charsetName; michael@0: mozilla::DebugOnly valid = michael@0: EncodingUtils::FindEncodingForLabel(aCharset, charsetName); michael@0: MOZ_ASSERT(valid, "Should never call with a bogus aCharset."); michael@0: michael@0: if (!mCharset.IsEmpty() && charsetName.Equals(mCharset)) { michael@0: return NS_OK; // no difference, don't change it michael@0: } michael@0: michael@0: // different, need to change it michael@0: michael@0: mCharset.Assign(charsetName); michael@0: michael@0: mUnicodeDecoder = EncodingUtils::DecoderForEncoding(mCharset); michael@0: mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal); michael@0: michael@0: return NS_OK; michael@0: } michael@0: michael@0: michael@0: /** michael@0: * default destructor michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @param michael@0: * @return michael@0: */ michael@0: nsScanner::~nsScanner() { michael@0: michael@0: delete mSlidingBuffer; michael@0: michael@0: MOZ_COUNT_DTOR(nsScanner); michael@0: } michael@0: michael@0: /** michael@0: * Resets current offset position of input stream to marked position. michael@0: * This allows us to back up to this point if the need should arise, michael@0: * such as when tokenization gets interrupted. michael@0: * NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST! michael@0: * michael@0: * @update gess 5/12/98 michael@0: * @param michael@0: * @return michael@0: */ michael@0: void nsScanner::RewindToMark(void){ michael@0: if (mSlidingBuffer) { michael@0: mCountRemaining += (Distance(mMarkPosition, mCurrentPosition)); michael@0: mCurrentPosition = mMarkPosition; michael@0: } michael@0: } michael@0: michael@0: michael@0: /** michael@0: * Records current offset position in input stream. This allows us michael@0: * to back up to this point if the need should arise, such as when michael@0: * tokenization gets interrupted. michael@0: * michael@0: * @update gess 7/29/98 michael@0: * @param michael@0: * @return michael@0: */ michael@0: int32_t nsScanner::Mark() { michael@0: int32_t distance = 0; michael@0: if (mSlidingBuffer) { michael@0: nsScannerIterator oldStart; michael@0: mSlidingBuffer->BeginReading(oldStart); michael@0: michael@0: distance = Distance(oldStart, mCurrentPosition); michael@0: michael@0: mSlidingBuffer->DiscardPrefix(mCurrentPosition); michael@0: mSlidingBuffer->BeginReading(mCurrentPosition); michael@0: mMarkPosition = mCurrentPosition; michael@0: } michael@0: michael@0: return distance; michael@0: } michael@0: michael@0: /** michael@0: * Insert data to our underlying input buffer as michael@0: * if it were read from an input stream. michael@0: * michael@0: * @update harishd 01/12/99 michael@0: * @return error code michael@0: */ michael@0: bool nsScanner::UngetReadable(const nsAString& aBuffer) { michael@0: if (!mSlidingBuffer) { michael@0: return false; michael@0: } michael@0: michael@0: mSlidingBuffer->UngetReadable(aBuffer,mCurrentPosition); michael@0: mSlidingBuffer->BeginReading(mCurrentPosition); // Insertion invalidated our iterators michael@0: mSlidingBuffer->EndReading(mEndPosition); michael@0: michael@0: uint32_t length = aBuffer.Length(); michael@0: mCountRemaining += length; // Ref. bug 117441 michael@0: return true; michael@0: } michael@0: michael@0: /** michael@0: * Append data to our underlying input buffer as michael@0: * if it were read from an input stream. michael@0: * michael@0: * @update gess4/3/98 michael@0: * @return error code michael@0: */ michael@0: nsresult nsScanner::Append(const nsAString& aBuffer) { michael@0: if (!AppendToBuffer(aBuffer)) michael@0: return NS_ERROR_OUT_OF_MEMORY; michael@0: return NS_OK; michael@0: } michael@0: michael@0: /** michael@0: * michael@0: * michael@0: * @update gess 5/21/98 michael@0: * @param michael@0: * @return michael@0: */ michael@0: nsresult nsScanner::Append(const char* aBuffer, uint32_t aLen, michael@0: nsIRequest *aRequest) michael@0: { michael@0: nsresult res = NS_OK; michael@0: if (mUnicodeDecoder) { michael@0: int32_t unicharBufLen = 0; michael@0: mUnicodeDecoder->GetMaxLength(aBuffer, aLen, &unicharBufLen); michael@0: nsScannerString::Buffer* buffer = nsScannerString::AllocBuffer(unicharBufLen + 1); michael@0: NS_ENSURE_TRUE(buffer,NS_ERROR_OUT_OF_MEMORY); michael@0: char16_t *unichars = buffer->DataStart(); michael@0: michael@0: int32_t totalChars = 0; michael@0: int32_t unicharLength = unicharBufLen; michael@0: int32_t errorPos = -1; michael@0: michael@0: do { michael@0: int32_t srcLength = aLen; michael@0: res = mUnicodeDecoder->Convert(aBuffer, &srcLength, unichars, &unicharLength); michael@0: michael@0: totalChars += unicharLength; michael@0: // Continuation of failure case michael@0: if(NS_FAILED(res)) { michael@0: // if we failed, we consume one byte, replace it with the replacement michael@0: // character and try the conversion again. michael@0: michael@0: // This is only needed because some decoders don't follow the michael@0: // nsIUnicodeDecoder contract: they return a failure when *aDestLength michael@0: // is 0 rather than the correct NS_OK_UDEC_MOREOUTPUT. See bug 244177 michael@0: if ((unichars + unicharLength) >= buffer->DataEnd()) { michael@0: NS_ERROR("Unexpected end of destination buffer"); michael@0: break; michael@0: } michael@0: michael@0: if (mReplacementCharacter == 0x0 && errorPos == -1) { michael@0: errorPos = totalChars; michael@0: } michael@0: unichars[unicharLength++] = mReplacementCharacter == 0x0 ? michael@0: mUnicodeDecoder->GetCharacterForUnMapped() : michael@0: mReplacementCharacter; michael@0: michael@0: unichars = unichars + unicharLength; michael@0: unicharLength = unicharBufLen - (++totalChars); michael@0: michael@0: mUnicodeDecoder->Reset(); michael@0: michael@0: if(((uint32_t) (srcLength + 1)) > aLen) { michael@0: srcLength = aLen; michael@0: } michael@0: else { michael@0: ++srcLength; michael@0: } michael@0: michael@0: aBuffer += srcLength; michael@0: aLen -= srcLength; michael@0: } michael@0: } while (NS_FAILED(res) && (aLen > 0)); michael@0: michael@0: buffer->SetDataLength(totalChars); michael@0: // Don't propagate return code of unicode decoder michael@0: // since it doesn't reflect on our success or failure michael@0: // - Ref. bug 87110 michael@0: res = NS_OK; michael@0: if (!AppendToBuffer(buffer, aRequest, errorPos)) michael@0: res = NS_ERROR_OUT_OF_MEMORY; michael@0: } michael@0: else { michael@0: NS_WARNING("No decoder found."); michael@0: res = NS_ERROR_FAILURE; michael@0: } michael@0: michael@0: return res; michael@0: } michael@0: michael@0: /** michael@0: * retrieve next char from scanners internal input stream michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @param michael@0: * @return error code reflecting read status michael@0: */ michael@0: nsresult nsScanner::GetChar(char16_t& aChar) { michael@0: if (!mSlidingBuffer || mCurrentPosition == mEndPosition) { michael@0: aChar = 0; michael@0: return kEOF; michael@0: } michael@0: michael@0: aChar = *mCurrentPosition++; michael@0: --mCountRemaining; michael@0: michael@0: return NS_OK; michael@0: } michael@0: michael@0: michael@0: /** michael@0: * peek ahead to consume next char from scanner's internal michael@0: * input buffer michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @param michael@0: * @return michael@0: */ michael@0: nsresult nsScanner::Peek(char16_t& aChar, uint32_t aOffset) { michael@0: aChar = 0; michael@0: michael@0: if (!mSlidingBuffer || mCurrentPosition == mEndPosition) { michael@0: return kEOF; michael@0: } michael@0: michael@0: if (aOffset > 0) { michael@0: if (mCountRemaining <= aOffset) michael@0: return kEOF; michael@0: michael@0: nsScannerIterator pos = mCurrentPosition; michael@0: pos.advance(aOffset); michael@0: aChar=*pos; michael@0: } michael@0: else { michael@0: aChar=*mCurrentPosition; michael@0: } michael@0: michael@0: return NS_OK; michael@0: } michael@0: michael@0: nsresult nsScanner::Peek(nsAString& aStr, int32_t aNumChars, int32_t aOffset) michael@0: { michael@0: if (!mSlidingBuffer || mCurrentPosition == mEndPosition) { michael@0: return kEOF; michael@0: } michael@0: michael@0: nsScannerIterator start, end; michael@0: michael@0: start = mCurrentPosition; michael@0: michael@0: if ((int32_t)mCountRemaining <= aOffset) { michael@0: return kEOF; michael@0: } michael@0: michael@0: if (aOffset > 0) { michael@0: start.advance(aOffset); michael@0: } michael@0: michael@0: if (mCountRemaining < uint32_t(aNumChars + aOffset)) { michael@0: end = mEndPosition; michael@0: } michael@0: else { michael@0: end = start; michael@0: end.advance(aNumChars); michael@0: } michael@0: michael@0: CopyUnicodeTo(start, end, aStr); michael@0: michael@0: return NS_OK; michael@0: } michael@0: michael@0: michael@0: /** michael@0: * Skip whitespace on scanner input stream michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @param michael@0: * @return error status michael@0: */ michael@0: nsresult nsScanner::SkipWhitespace(int32_t& aNewlinesSkipped) { michael@0: michael@0: if (!mSlidingBuffer) { michael@0: return kEOF; michael@0: } michael@0: michael@0: char16_t theChar = 0; michael@0: nsresult result = Peek(theChar); michael@0: michael@0: if (NS_FAILED(result)) { michael@0: return result; michael@0: } michael@0: michael@0: nsScannerIterator current = mCurrentPosition; michael@0: bool done = false; michael@0: bool skipped = false; michael@0: michael@0: while (!done && current != mEndPosition) { michael@0: switch(theChar) { michael@0: case '\n': michael@0: case '\r': ++aNewlinesSkipped; michael@0: case ' ' : michael@0: case '\t': michael@0: { michael@0: skipped = true; michael@0: char16_t thePrevChar = theChar; michael@0: theChar = (++current != mEndPosition) ? *current : '\0'; michael@0: if ((thePrevChar == '\r' && theChar == '\n') || michael@0: (thePrevChar == '\n' && theChar == '\r')) { michael@0: theChar = (++current != mEndPosition) ? *current : '\0'; // CRLF == LFCR => LF michael@0: } michael@0: } michael@0: break; michael@0: default: michael@0: done = true; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: if (skipped) { michael@0: SetPosition(current); michael@0: if (current == mEndPosition) { michael@0: result = kEOF; michael@0: } michael@0: } michael@0: michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Skip over chars as long as they equal given char michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @param michael@0: * @return error code michael@0: */ michael@0: nsresult nsScanner::SkipOver(char16_t aSkipChar){ michael@0: michael@0: if (!mSlidingBuffer) { michael@0: return kEOF; michael@0: } michael@0: michael@0: char16_t ch=0; michael@0: nsresult result=NS_OK; michael@0: michael@0: while(NS_OK==result) { michael@0: result=Peek(ch); michael@0: if(NS_OK == result) { michael@0: if(ch!=aSkipChar) { michael@0: break; michael@0: } michael@0: GetChar(ch); michael@0: } michael@0: else break; michael@0: } //while michael@0: return result; michael@0: michael@0: } michael@0: michael@0: #if 0 michael@0: void DoErrTest(nsString& aString) { michael@0: int32_t pos=aString.FindChar(0); michael@0: if(kNotFound', or a '/'. michael@0: * michael@0: * @param aString - receives new data from stream michael@0: * @return error code michael@0: */ michael@0: nsresult nsScanner::ReadTagIdentifier(nsScannerSharedSubstring& aString) { michael@0: michael@0: if (!mSlidingBuffer) { michael@0: return kEOF; michael@0: } michael@0: michael@0: char16_t theChar=0; michael@0: nsresult result=Peek(theChar); michael@0: nsScannerIterator current, end; michael@0: bool found=false; michael@0: michael@0: current = mCurrentPosition; michael@0: end = mEndPosition; michael@0: michael@0: // Loop until we find an illegal character. Everything is then appended michael@0: // later. michael@0: while(current != end && !found) { michael@0: theChar=*current; michael@0: michael@0: switch(theChar) { michael@0: case '\n': michael@0: case '\r': michael@0: case ' ' : michael@0: case '\t': michael@0: case '\v': michael@0: case '\f': michael@0: case '<': michael@0: case '>': michael@0: case '/': michael@0: found = true; michael@0: break; michael@0: michael@0: case '\0': michael@0: ReplaceCharacter(current, sInvalid); michael@0: break; michael@0: michael@0: default: michael@0: break; michael@0: } michael@0: michael@0: if (!found) { michael@0: ++current; michael@0: } michael@0: } michael@0: michael@0: // Don't bother appending nothing. michael@0: if (current != mCurrentPosition) { michael@0: AppendUnicodeTo(mCurrentPosition, current, aString); michael@0: } michael@0: michael@0: SetPosition(current); michael@0: if (current == end) { michael@0: result = kEOF; michael@0: } michael@0: michael@0: //DoErrTest(aString); michael@0: michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Consume characters until you run into a char that's not valid in an michael@0: * entity name michael@0: * michael@0: * @param aString - receives new data from stream michael@0: * @return error code michael@0: */ michael@0: nsresult nsScanner::ReadEntityIdentifier(nsString& aString) { michael@0: michael@0: if (!mSlidingBuffer) { michael@0: return kEOF; michael@0: } michael@0: michael@0: char16_t theChar=0; michael@0: nsresult result=Peek(theChar); michael@0: nsScannerIterator origin, current, end; michael@0: bool found=false; michael@0: michael@0: origin = mCurrentPosition; michael@0: current = mCurrentPosition; michael@0: end = mEndPosition; michael@0: michael@0: while(current != end) { michael@0: michael@0: theChar=*current; michael@0: if(theChar) { michael@0: found=false; michael@0: switch(theChar) { michael@0: case '_': michael@0: case '-': michael@0: case '.': michael@0: // Don't allow ':' in entity names. See bug 23791 michael@0: found = true; michael@0: break; michael@0: default: michael@0: found = ('a'<=theChar && theChar<='z') || michael@0: ('A'<=theChar && theChar<='Z') || michael@0: ('0'<=theChar && theChar<='9'); michael@0: break; michael@0: } michael@0: michael@0: if(!found) { michael@0: AppendUnicodeTo(mCurrentPosition, current, aString); michael@0: break; michael@0: } michael@0: } michael@0: ++current; michael@0: } michael@0: michael@0: SetPosition(current); michael@0: if (current == end) { michael@0: AppendUnicodeTo(origin, current, aString); michael@0: return kEOF; michael@0: } michael@0: michael@0: //DoErrTest(aString); michael@0: michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Consume digits michael@0: * michael@0: * @param aString - should contain digits michael@0: * @return error code michael@0: */ michael@0: nsresult nsScanner::ReadNumber(nsString& aString,int32_t aBase) { michael@0: michael@0: if (!mSlidingBuffer) { michael@0: return kEOF; michael@0: } michael@0: michael@0: NS_ASSERTION(aBase == 10 || aBase == 16,"base value not supported"); michael@0: michael@0: char16_t theChar=0; michael@0: nsresult result=Peek(theChar); michael@0: nsScannerIterator origin, current, end; michael@0: michael@0: origin = mCurrentPosition; michael@0: current = origin; michael@0: end = mEndPosition; michael@0: michael@0: bool done = false; michael@0: while(current != end) { michael@0: theChar=*current; michael@0: if(theChar) { michael@0: done = (theChar < '0' || theChar > '9') && michael@0: ((aBase == 16)? (theChar < 'A' || theChar > 'F') && michael@0: (theChar < 'a' || theChar > 'f') michael@0: :true); michael@0: if(done) { michael@0: AppendUnicodeTo(origin, current, aString); michael@0: break; michael@0: } michael@0: } michael@0: ++current; michael@0: } michael@0: michael@0: SetPosition(current); michael@0: if (current == end) { michael@0: AppendUnicodeTo(origin, current, aString); michael@0: return kEOF; michael@0: } michael@0: michael@0: //DoErrTest(aString); michael@0: michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Consume characters until you find the terminal char michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @param aString receives new data from stream michael@0: * @param addTerminal tells us whether to append terminal to aString michael@0: * @return error code michael@0: */ michael@0: nsresult nsScanner::ReadWhitespace(nsScannerSharedSubstring& aString, michael@0: int32_t& aNewlinesSkipped, michael@0: bool& aHaveCR) { michael@0: michael@0: aHaveCR = false; michael@0: michael@0: if (!mSlidingBuffer) { michael@0: return kEOF; michael@0: } michael@0: michael@0: char16_t theChar = 0; michael@0: nsresult result = Peek(theChar); michael@0: michael@0: if (NS_FAILED(result)) { michael@0: return result; michael@0: } michael@0: michael@0: nsScannerIterator origin, current, end; michael@0: bool done = false; michael@0: michael@0: origin = mCurrentPosition; michael@0: current = origin; michael@0: end = mEndPosition; michael@0: michael@0: bool haveCR = false; michael@0: michael@0: while(!done && current != end) { michael@0: switch(theChar) { michael@0: case '\n': michael@0: case '\r': michael@0: { michael@0: ++aNewlinesSkipped; michael@0: char16_t thePrevChar = theChar; michael@0: theChar = (++current != end) ? *current : '\0'; michael@0: if ((thePrevChar == '\r' && theChar == '\n') || michael@0: (thePrevChar == '\n' && theChar == '\r')) { michael@0: theChar = (++current != end) ? *current : '\0'; // CRLF == LFCR => LF michael@0: haveCR = true; michael@0: } else if (thePrevChar == '\r') { michael@0: // Lone CR becomes CRLF; callers should know to remove extra CRs michael@0: AppendUnicodeTo(origin, current, aString); michael@0: aString.writable().Append(char16_t('\n')); michael@0: origin = current; michael@0: haveCR = true; michael@0: } michael@0: } michael@0: break; michael@0: case ' ' : michael@0: case '\t': michael@0: theChar = (++current != end) ? *current : '\0'; michael@0: break; michael@0: default: michael@0: done = true; michael@0: AppendUnicodeTo(origin, current, aString); michael@0: break; michael@0: } michael@0: } michael@0: michael@0: SetPosition(current); michael@0: if (current == end) { michael@0: AppendUnicodeTo(origin, current, aString); michael@0: result = kEOF; michael@0: } michael@0: michael@0: aHaveCR = haveCR; michael@0: return result; michael@0: } michael@0: michael@0: //XXXbz callers of this have to manage their lone '\r' themselves if they want michael@0: //it to work. Good thing they're all in view-source and it deals. michael@0: nsresult nsScanner::ReadWhitespace(nsScannerIterator& aStart, michael@0: nsScannerIterator& aEnd, michael@0: int32_t& aNewlinesSkipped) { michael@0: michael@0: if (!mSlidingBuffer) { michael@0: return kEOF; michael@0: } michael@0: michael@0: char16_t theChar = 0; michael@0: nsresult result = Peek(theChar); michael@0: michael@0: if (NS_FAILED(result)) { michael@0: return result; michael@0: } michael@0: michael@0: nsScannerIterator origin, current, end; michael@0: bool done = false; michael@0: michael@0: origin = mCurrentPosition; michael@0: current = origin; michael@0: end = mEndPosition; michael@0: michael@0: while(!done && current != end) { michael@0: switch(theChar) { michael@0: case '\n': michael@0: case '\r': ++aNewlinesSkipped; michael@0: case ' ' : michael@0: case '\t': michael@0: { michael@0: char16_t thePrevChar = theChar; michael@0: theChar = (++current != end) ? *current : '\0'; michael@0: if ((thePrevChar == '\r' && theChar == '\n') || michael@0: (thePrevChar == '\n' && theChar == '\r')) { michael@0: theChar = (++current != end) ? *current : '\0'; // CRLF == LFCR => LF michael@0: } michael@0: } michael@0: break; michael@0: default: michael@0: done = true; michael@0: aStart = origin; michael@0: aEnd = current; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: SetPosition(current); michael@0: if (current == end) { michael@0: aStart = origin; michael@0: aEnd = current; michael@0: result = kEOF; michael@0: } michael@0: michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Consume characters until you encounter one contained in given michael@0: * input set. michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @param aString will contain the result of this method michael@0: * @param aTerminalSet is an ordered string that contains michael@0: * the set of INVALID characters michael@0: * @return error code michael@0: */ michael@0: nsresult nsScanner::ReadUntil(nsAString& aString, michael@0: const nsReadEndCondition& aEndCondition, michael@0: bool addTerminal) michael@0: { michael@0: if (!mSlidingBuffer) { michael@0: return kEOF; michael@0: } michael@0: michael@0: nsScannerIterator origin, current; michael@0: const char16_t* setstart = aEndCondition.mChars; michael@0: const char16_t* setcurrent; michael@0: michael@0: origin = mCurrentPosition; michael@0: current = origin; michael@0: michael@0: char16_t theChar=0; michael@0: nsresult result=Peek(theChar); michael@0: michael@0: if (NS_FAILED(result)) { michael@0: return result; michael@0: } michael@0: michael@0: while (current != mEndPosition) { michael@0: theChar = *current; michael@0: if (theChar == '\0') { michael@0: ReplaceCharacter(current, sInvalid); michael@0: theChar = sInvalid; michael@0: } michael@0: michael@0: // Filter out completely wrong characters michael@0: // Check if all bits are in the required area michael@0: if(!(theChar & aEndCondition.mFilter)) { michael@0: // They were. Do a thorough check. michael@0: michael@0: setcurrent = setstart; michael@0: while (*setcurrent) { michael@0: if (*setcurrent == theChar) { michael@0: if(addTerminal) michael@0: ++current; michael@0: AppendUnicodeTo(origin, current, aString); michael@0: SetPosition(current); michael@0: michael@0: //DoErrTest(aString); michael@0: michael@0: return NS_OK; michael@0: } michael@0: ++setcurrent; michael@0: } michael@0: } michael@0: michael@0: ++current; michael@0: } michael@0: michael@0: // If we are here, we didn't find any terminator in the string and michael@0: // current = mEndPosition michael@0: SetPosition(current); michael@0: AppendUnicodeTo(origin, current, aString); michael@0: return kEOF; michael@0: } michael@0: michael@0: nsresult nsScanner::ReadUntil(nsScannerSharedSubstring& aString, michael@0: const nsReadEndCondition& aEndCondition, michael@0: bool addTerminal) michael@0: { michael@0: if (!mSlidingBuffer) { michael@0: return kEOF; michael@0: } michael@0: michael@0: nsScannerIterator origin, current; michael@0: const char16_t* setstart = aEndCondition.mChars; michael@0: const char16_t* setcurrent; michael@0: michael@0: origin = mCurrentPosition; michael@0: current = origin; michael@0: michael@0: char16_t theChar=0; michael@0: nsresult result=Peek(theChar); michael@0: michael@0: if (NS_FAILED(result)) { michael@0: return result; michael@0: } michael@0: michael@0: while (current != mEndPosition) { michael@0: theChar = *current; michael@0: if (theChar == '\0') { michael@0: ReplaceCharacter(current, sInvalid); michael@0: theChar = sInvalid; michael@0: } michael@0: michael@0: // Filter out completely wrong characters michael@0: // Check if all bits are in the required area michael@0: if(!(theChar & aEndCondition.mFilter)) { michael@0: // They were. Do a thorough check. michael@0: michael@0: setcurrent = setstart; michael@0: while (*setcurrent) { michael@0: if (*setcurrent == theChar) { michael@0: if(addTerminal) michael@0: ++current; michael@0: AppendUnicodeTo(origin, current, aString); michael@0: SetPosition(current); michael@0: michael@0: //DoErrTest(aString); michael@0: michael@0: return NS_OK; michael@0: } michael@0: ++setcurrent; michael@0: } michael@0: } michael@0: michael@0: ++current; michael@0: } michael@0: michael@0: // If we are here, we didn't find any terminator in the string and michael@0: // current = mEndPosition michael@0: SetPosition(current); michael@0: AppendUnicodeTo(origin, current, aString); michael@0: return kEOF; michael@0: } michael@0: michael@0: nsresult nsScanner::ReadUntil(nsScannerIterator& aStart, michael@0: nsScannerIterator& aEnd, michael@0: const nsReadEndCondition &aEndCondition, michael@0: bool addTerminal) michael@0: { michael@0: if (!mSlidingBuffer) { michael@0: return kEOF; michael@0: } michael@0: michael@0: nsScannerIterator origin, current; michael@0: const char16_t* setstart = aEndCondition.mChars; michael@0: const char16_t* setcurrent; michael@0: michael@0: origin = mCurrentPosition; michael@0: current = origin; michael@0: michael@0: char16_t theChar=0; michael@0: nsresult result=Peek(theChar); michael@0: michael@0: if (NS_FAILED(result)) { michael@0: aStart = aEnd = current; michael@0: return result; michael@0: } michael@0: michael@0: while (current != mEndPosition) { michael@0: theChar = *current; michael@0: if (theChar == '\0') { michael@0: ReplaceCharacter(current, sInvalid); michael@0: theChar = sInvalid; michael@0: } michael@0: michael@0: // Filter out completely wrong characters michael@0: // Check if all bits are in the required area michael@0: if(!(theChar & aEndCondition.mFilter)) { michael@0: // They were. Do a thorough check. michael@0: setcurrent = setstart; michael@0: while (*setcurrent) { michael@0: if (*setcurrent == theChar) { michael@0: if(addTerminal) michael@0: ++current; michael@0: aStart = origin; michael@0: aEnd = current; michael@0: SetPosition(current); michael@0: michael@0: return NS_OK; michael@0: } michael@0: ++setcurrent; michael@0: } michael@0: } michael@0: michael@0: ++current; michael@0: } michael@0: michael@0: // If we are here, we didn't find any terminator in the string and michael@0: // current = mEndPosition michael@0: SetPosition(current); michael@0: aStart = origin; michael@0: aEnd = current; michael@0: return kEOF; michael@0: } michael@0: michael@0: /** michael@0: * Consumes chars until you see the given terminalChar michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @param michael@0: * @return error code michael@0: */ michael@0: nsresult nsScanner::ReadUntil(nsAString& aString, michael@0: char16_t aTerminalChar, michael@0: bool addTerminal) michael@0: { michael@0: if (!mSlidingBuffer) { michael@0: return kEOF; michael@0: } michael@0: michael@0: nsScannerIterator origin, current; michael@0: michael@0: origin = mCurrentPosition; michael@0: current = origin; michael@0: michael@0: char16_t theChar; michael@0: nsresult result = Peek(theChar); michael@0: michael@0: if (NS_FAILED(result)) { michael@0: return result; michael@0: } michael@0: michael@0: while (current != mEndPosition) { michael@0: theChar = *current; michael@0: if (theChar == '\0') { michael@0: ReplaceCharacter(current, sInvalid); michael@0: theChar = sInvalid; michael@0: } michael@0: michael@0: if (aTerminalChar == theChar) { michael@0: if(addTerminal) michael@0: ++current; michael@0: AppendUnicodeTo(origin, current, aString); michael@0: SetPosition(current); michael@0: return NS_OK; michael@0: } michael@0: ++current; michael@0: } michael@0: michael@0: // If we are here, we didn't find any terminator in the string and michael@0: // current = mEndPosition michael@0: AppendUnicodeTo(origin, current, aString); michael@0: SetPosition(current); michael@0: return kEOF; michael@0: michael@0: } michael@0: michael@0: void nsScanner::BindSubstring(nsScannerSubstring& aSubstring, const nsScannerIterator& aStart, const nsScannerIterator& aEnd) michael@0: { michael@0: aSubstring.Rebind(*mSlidingBuffer, aStart, aEnd); michael@0: } michael@0: michael@0: void nsScanner::CurrentPosition(nsScannerIterator& aPosition) michael@0: { michael@0: aPosition = mCurrentPosition; michael@0: } michael@0: michael@0: void nsScanner::EndReading(nsScannerIterator& aPosition) michael@0: { michael@0: aPosition = mEndPosition; michael@0: } michael@0: michael@0: void nsScanner::SetPosition(nsScannerIterator& aPosition, bool aTerminate, bool aReverse) michael@0: { michael@0: if (mSlidingBuffer) { michael@0: #ifdef DEBUG michael@0: uint32_t origRemaining = mCountRemaining; michael@0: #endif michael@0: michael@0: if (aReverse) { michael@0: mCountRemaining += (Distance(aPosition, mCurrentPosition)); michael@0: } michael@0: else { michael@0: mCountRemaining -= (Distance(mCurrentPosition, aPosition)); michael@0: } michael@0: michael@0: NS_ASSERTION((mCountRemaining >= origRemaining && aReverse) || michael@0: (mCountRemaining <= origRemaining && !aReverse), michael@0: "Improper use of nsScanner::SetPosition. Make sure to set the" michael@0: " aReverse parameter correctly"); michael@0: michael@0: mCurrentPosition = aPosition; michael@0: if (aTerminate && (mCurrentPosition == mEndPosition)) { michael@0: mMarkPosition = mCurrentPosition; michael@0: mSlidingBuffer->DiscardPrefix(mCurrentPosition); michael@0: } michael@0: } michael@0: } michael@0: michael@0: void nsScanner::ReplaceCharacter(nsScannerIterator& aPosition, michael@0: char16_t aChar) michael@0: { michael@0: if (mSlidingBuffer) { michael@0: mSlidingBuffer->ReplaceCharacter(aPosition, aChar); michael@0: } michael@0: } michael@0: michael@0: bool nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf, michael@0: nsIRequest *aRequest, michael@0: int32_t aErrorPos) michael@0: { michael@0: uint32_t countRemaining = mCountRemaining; michael@0: if (!mSlidingBuffer) { michael@0: mSlidingBuffer = new nsScannerString(aBuf); michael@0: if (!mSlidingBuffer) michael@0: return false; michael@0: mSlidingBuffer->BeginReading(mCurrentPosition); michael@0: mMarkPosition = mCurrentPosition; michael@0: mSlidingBuffer->EndReading(mEndPosition); michael@0: mCountRemaining = aBuf->DataLength(); michael@0: } michael@0: else { michael@0: mSlidingBuffer->AppendBuffer(aBuf); michael@0: if (mCurrentPosition == mEndPosition) { michael@0: mSlidingBuffer->BeginReading(mCurrentPosition); michael@0: } michael@0: mSlidingBuffer->EndReading(mEndPosition); michael@0: mCountRemaining += aBuf->DataLength(); michael@0: } michael@0: michael@0: if (aErrorPos != -1 && !mHasInvalidCharacter) { michael@0: mHasInvalidCharacter = true; michael@0: mFirstInvalidPosition = mCurrentPosition; michael@0: mFirstInvalidPosition.advance(countRemaining + aErrorPos); michael@0: } michael@0: michael@0: if (mFirstNonWhitespacePosition == -1) { michael@0: nsScannerIterator iter(mCurrentPosition); michael@0: nsScannerIterator end(mEndPosition); michael@0: michael@0: while (iter != end) { michael@0: if (!nsCRT::IsAsciiSpace(*iter)) { michael@0: mFirstNonWhitespacePosition = Distance(mCurrentPosition, iter); michael@0: michael@0: break; michael@0: } michael@0: michael@0: ++iter; michael@0: } michael@0: } michael@0: return true; michael@0: } michael@0: michael@0: /** michael@0: * call this to copy bytes out of the scanner that have not yet been consumed michael@0: * by the tokenization process. michael@0: * michael@0: * @update gess 5/12/98 michael@0: * @param aCopyBuffer is where the scanner buffer will be copied to michael@0: * @return nada michael@0: */ michael@0: void nsScanner::CopyUnusedData(nsString& aCopyBuffer) { michael@0: if (!mSlidingBuffer) { michael@0: aCopyBuffer.Truncate(); michael@0: return; michael@0: } michael@0: michael@0: nsScannerIterator start, end; michael@0: start = mCurrentPosition; michael@0: end = mEndPosition; michael@0: michael@0: CopyUnicodeTo(start, end, aCopyBuffer); michael@0: } michael@0: michael@0: /** michael@0: * Retrieve the name of the file that the scanner is reading from. michael@0: * In some cases, it's just a given name, because the scanner isn't michael@0: * really reading from a file. michael@0: * michael@0: * @update gess 5/12/98 michael@0: * @return michael@0: */ michael@0: nsString& nsScanner::GetFilename(void) { michael@0: return mFilename; michael@0: } michael@0: michael@0: /** michael@0: * Conduct self test. Actually, selftesting for this class michael@0: * occurs in the parser selftest. michael@0: * michael@0: * @update gess 3/25/98 michael@0: * @param michael@0: * @return michael@0: */ michael@0: michael@0: void nsScanner::SelfTest(void) { michael@0: #ifdef _DEBUG michael@0: #endif michael@0: } michael@0: michael@0: void nsScanner::OverrideReplacementCharacter(char16_t aReplacementCharacter) michael@0: { michael@0: mReplacementCharacter = aReplacementCharacter; michael@0: michael@0: if (mHasInvalidCharacter) { michael@0: ReplaceCharacter(mFirstInvalidPosition, mReplacementCharacter); michael@0: } michael@0: } michael@0: