The Tor Browser: diff parser/htmlparser/src/nsScanner.cpp

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/parser/htmlparser/src/nsScanner.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1199 @@
     1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* vim: set ts=2 sw=2 et tw=78: */
     1.6 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.7 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.8 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.9 +
    1.10 +//#define __INCREMENTAL 1
    1.11 +
    1.12 +#include "mozilla/DebugOnly.h"
    1.13 +
    1.14 +#include "nsScanner.h"
    1.15 +#include "nsDebug.h"
    1.16 +#include "nsReadableUtils.h"
    1.17 +#include "nsIInputStream.h"
    1.18 +#include "nsIFile.h"
    1.19 +#include "nsNetUtil.h"
    1.20 +#include "nsUTF8Utils.h" // for LossyConvertEncoding
    1.21 +#include "nsCRT.h"
    1.22 +#include "nsParser.h"
    1.23 +#include "nsCharsetSource.h"
    1.24 +
    1.25 +#include "mozilla/dom/EncodingUtils.h"
    1.26 +
    1.27 +using mozilla::dom::EncodingUtils;
    1.28 +
    1.29 +// Replacement written over NUL (U+0000) characters met while scanning.
    1.30 +static char16_t sInvalid = UCS2_REPLACEMENT_CHAR;
    1.31 +
    1.32 +nsReadEndCondition::nsReadEndCondition(const char16_t* aTerminateChars) :
    1.33 +  mChars(aTerminateChars), mFilter(char16_t(~0)) // start with every bit set
    1.34 +{
    1.35 +  // mFilter is a cheap pre-test for the ReadUntil() family: it keeps a 1
    1.36 +  // in every bit position that none of the terminators uses. A scanned
    1.37 +  // character with any of those bits set cannot be a terminator, so the
    1.38 +  // full comparison against mChars is skipped for it. Terminators often
    1.39 +  // have only the last 4-6 bits set while letters have higher bits set,
    1.40 +  // so most ordinary text is rejected by the pre-test alone.
    1.41 +
    1.42 +  // Walk the zero-terminated list and clear, for each terminator, the
    1.43 +  // bits it uses from the filter.
    1.44 +  const char16_t* cursor = aTerminateChars;
    1.45 +  for (; *cursor; ++cursor) {
    1.46 +    const char16_t terminator = *cursor;
    1.47 +    mFilter &= ~terminator;
    1.48 +  }
    1.49 +}
    1.50 +
    1.51 +/**
    1.52 + *  Use this constructor if you want i/o to be based on
    1.53 + *  a single string you hand in during construction.
    1.54 + *  This shortcut was added for JavaScript.
    1.55 + *
    1.56 + *  @update  gess 5/12/98
    1.57 + *  @param   anHTMLString text appended to the scanner buffer up front
    1.58 + *  @return  (constructor; nothing is returned)
    1.59 + */
    1.60 +nsScanner::nsScanner(const nsAString& anHTMLString)
    1.61 +{
    1.62 +  MOZ_COUNT_CTOR(nsScanner);
    1.63 +
    1.64 +  mSlidingBuffer = nullptr;
    1.65 +  mCountRemaining = 0;
    1.66 +  mFirstNonWhitespacePosition = -1;
    1.67 +  if (AppendToBuffer(anHTMLString)) {
    1.68 +    mSlidingBuffer->BeginReading(mCurrentPosition); // read from the front of the new buffer
    1.69 +  } else {
    1.70 +    /* XXX append failed: zero the iterators (see hack in the filename constructor, re: bug 182067) */
    1.71 +    memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
    1.72 +    mEndPosition = mCurrentPosition; // current == end reads as EOF
    1.73 +  }
    1.74 +  mMarkPosition = mCurrentPosition;
    1.75 +  mIncremental = false;
    1.76 +  mUnicodeDecoder = 0; // no decoder until SetDocumentCharset() is called
    1.77 +  mCharsetSource = kCharsetUninitialized;
    1.78 +  mHasInvalidCharacter = false;
    1.79 +  mReplacementCharacter = char16_t(0x0); // 0: Append() asks the decoder for its replacement
    1.80 +}
    1.81 +
    1.82 +/**
    1.83 + *  Use this constructor if you want i/o to be based on strings the
    1.84 + *  scanner receives later via Append(). aFilename is copied into
    1.85 + *  mFilename; aCreateStream is asserted to be false. Starts as UTF-8.
    1.86 + */
    1.87 +nsScanner::nsScanner(nsString& aFilename, bool aCreateStream)
    1.88 +  : mFilename(aFilename)
    1.89 +{
    1.90 +  MOZ_COUNT_CTOR(nsScanner);
    1.91 +  NS_ASSERTION(!aCreateStream, "This is always true.");
    1.92 +
    1.93 +  mSlidingBuffer = nullptr; // no data yet; most methods return kEOF until data is appended
    1.94 +
    1.95 +  // XXX This is a big hack.  We need to initialize the iterators to something.
    1.96 +  // What matters is that mCurrentPosition == mEndPosition, so that our methods
    1.97 +  // believe that we are at EOF (see bug 182067).  We null out mCurrentPosition
    1.98 +  // so that we have some hope of catching null pointer dereferences associated
    1.99 +  // with this hack. --darin
   1.100 +  memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
   1.101 +  mMarkPosition = mCurrentPosition;
   1.102 +  mEndPosition = mCurrentPosition;
   1.103 +
   1.104 +  mIncremental = true;
   1.105 +  mFirstNonWhitespacePosition = -1;
   1.106 +  mCountRemaining = 0;
   1.107 +
   1.108 +  mUnicodeDecoder = 0; // replaced by the SetDocumentCharset() call below
   1.109 +  mCharsetSource = kCharsetUninitialized;
   1.110 +  mHasInvalidCharacter = false;
   1.111 +  mReplacementCharacter = char16_t(0x0); // 0: Append() asks the decoder for its replacement
   1.112 +  // XML defaults to UTF-8 and about:blank is UTF-8, too.
   1.113 +  SetDocumentCharset(NS_LITERAL_CSTRING("UTF-8"), kCharsetFromDocTypeDefault);
   1.114 +}
   1.115 +
   1.116 +nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSource)
   1.117 +{
   1.118 +  // A source ranked below the one already recorded never overrides it.
   1.119 +  if (aSource < mCharsetSource) {
   1.120 +    return NS_OK;
   1.121 +  }
   1.122 +  mCharsetSource = aSource;
   1.123 +
   1.124 +  // Look up the encoding that the label stands for.
   1.125 +  nsCString encodingName;
   1.126 +  mozilla::DebugOnly<bool> labelKnown =
   1.127 +      EncodingUtils::FindEncodingForLabel(aCharset, encodingName);
   1.128 +  MOZ_ASSERT(labelKnown, "Should never call with a bogus aCharset.");
   1.129 +
   1.130 +  const bool sameEncoding =
   1.131 +      !mCharset.IsEmpty() && encodingName.Equals(mCharset);
   1.132 +  if (!sameEncoding) {
   1.133 +    // Encoding changed: store it and create a decoder that signals errors.
   1.134 +    mCharset.Assign(encodingName);
   1.135 +    mUnicodeDecoder = EncodingUtils::DecoderForEncoding(mCharset);
   1.136 +    mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal);
   1.137 +  }
   1.138 +
   1.139 +  return NS_OK;
   1.140 +}
   1.141 +
   1.142 +
   1.143 +/**
   1.144 + *  Destructor: deletes the sliding buffer.
   1.145 + *
   1.146 + *  @update  gess 3/25/98
   1.147 + *  @param   none
   1.148 + *  @return  none
   1.149 + */
   1.150 +nsScanner::~nsScanner()
   1.151 +{
   1.152 +  delete mSlidingBuffer;
   1.153 +
   1.154 +  MOZ_COUNT_DTOR(nsScanner);
   1.155 +}
   1.156 +
   1.157 +/**
   1.158 + *  Moves the read position back to the position stored in mMarkPosition
   1.159 + *  and adds the characters in between back onto the remaining-character
   1.160 + *  count. Does nothing when there is no sliding buffer.
   1.161 + *  NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST!
   1.162 + *
   1.163 + *  @update  gess 5/12/98
   1.164 + *  @return  nothing
   1.165 + */
   1.166 +void nsScanner::RewindToMark(void){
   1.167 +  if (!mSlidingBuffer) {
   1.168 +    return;
   1.169 +  }
   1.170 +  mCountRemaining += (Distance(mMarkPosition, mCurrentPosition));
   1.171 +  mCurrentPosition = mMarkPosition;
   1.172 +}
   1.173 +
   1.174 +
   1.175 +/**
   1.176 + *  Records the current position as the mark and asks the buffer to
   1.177 + *  discard the prefix in front of it, so RewindToMark() returns here.
   1.178 + *
   1.179 + *  @return  distance from the old buffer start to the mark; 0 without buffer
   1.180 + */
   1.181 +int32_t nsScanner::Mark() {
   1.182 +  if (!mSlidingBuffer) {
   1.183 +    return 0;
   1.184 +  }
   1.185 +
   1.186 +  // Measure how far we have read into the buffer before trimming it.
   1.187 +  nsScannerIterator bufferStart;
   1.188 +  mSlidingBuffer->BeginReading(bufferStart);
   1.189 +  const int32_t consumed = Distance(bufferStart, mCurrentPosition);
   1.190 +
   1.191 +  // Trim the buffer, then re-seat the current position and the mark on
   1.192 +  // the new beginning of the buffer.
   1.193 +  mSlidingBuffer->DiscardPrefix(mCurrentPosition);
   1.194 +  mSlidingBuffer->BeginReading(mCurrentPosition);
   1.195 +  mMarkPosition = mCurrentPosition;
   1.196 +
   1.197 +  return consumed;
   1.198 +}
   1.199 +
   1.200 +/**
   1.201 + * Insert data into our underlying input buffer (at mCurrentPosition)
   1.202 + * as if it were read from an input stream.
   1.203 + *
   1.204 + * @update  harishd 01/12/99
   1.205 + * @param   aBuffer text to insert
   1.206 + * @return  false when there is no buffer to insert into, else true
   1.207 + */
   1.208 +bool nsScanner::UngetReadable(const nsAString& aBuffer) {
   1.209 +  if (mSlidingBuffer) {
   1.210 +    mSlidingBuffer->UngetReadable(aBuffer,mCurrentPosition);
   1.211 +    // The insertion invalidated our iterators, so fetch them again.
   1.212 +    mSlidingBuffer->BeginReading(mCurrentPosition);
   1.213 +    mSlidingBuffer->EndReading(mEndPosition);
   1.214 +    mCountRemaining += aBuffer.Length(); // Ref. bug 117441
   1.215 +    return true;
   1.216 +  }
   1.217 +
   1.218 +  return false;
   1.219 +}
   1.220 +
   1.221 +/**
   1.222 + * Append data to our underlying input buffer as
   1.223 + * if it were read from an input stream.
   1.224 + *
   1.225 + * @update  gess4/3/98
   1.226 + * @param   aBuffer text to append
   1.227 + * @return  NS_OK, or NS_ERROR_OUT_OF_MEMORY when AppendToBuffer
   1.228 + *          reports failure
   1.229 + */
   1.230 +nsresult nsScanner::Append(const nsAString& aBuffer) {
   1.231 +  return AppendToBuffer(aBuffer) ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
   1.232 +}
   1.233 +
   1.234 +/**
   1.235 + *  Decode aLen bytes from aBuffer with the current decoder and append the
   1.236 + *  result to the scanner buffer. An undecodable byte is replaced by one
   1.237 + *  replacement character and decoding resumes behind it.
   1.238 + *  @return  NS_OK; NS_ERROR_FAILURE without a decoder; NS_ERROR_OUT_OF_MEMORY;
   1.239 + *           or the failure code of the decoder's GetMaxLength() call
   1.240 + */
   1.241 +nsresult nsScanner::Append(const char* aBuffer, uint32_t aLen,
   1.242 +                           nsIRequest *aRequest)
   1.243 +{
   1.244 +  if (!mUnicodeDecoder) {
   1.245 +    NS_WARNING("No decoder found.");
   1.246 +    return NS_ERROR_FAILURE;
   1.247 +  }
   1.248 +
   1.249 +  // Ask the decoder how much output aLen input bytes can produce. If it
   1.250 +  // cannot tell us, the destination buffer cannot be sized safely, so
   1.251 +  // report the failure instead of decoding with an unset length.
   1.252 +  int32_t unicharBufLen = 0;
   1.253 +  nsresult res = mUnicodeDecoder->GetMaxLength(aBuffer, aLen, &unicharBufLen);
   1.254 +  if (NS_FAILED(res)) {
   1.255 +    return res;
   1.256 +  }
   1.257 +
   1.258 +  nsScannerString::Buffer* buffer = nsScannerString::AllocBuffer(unicharBufLen + 1);
   1.259 +  NS_ENSURE_TRUE(buffer,NS_ERROR_OUT_OF_MEMORY);
   1.260 +  char16_t *unichars = buffer->DataStart();
   1.261 +
   1.262 +  int32_t totalChars = 0;
   1.263 +  int32_t unicharLength = unicharBufLen;
   1.264 +  int32_t errorPos = -1;
   1.265 +
   1.266 +  do {
   1.267 +    int32_t srcLength = aLen;
   1.268 +    res = mUnicodeDecoder->Convert(aBuffer, &srcLength, unichars, &unicharLength);
   1.269 +    totalChars += unicharLength;
   1.270 +    // Continuation of failure case
   1.271 +    if(NS_FAILED(res)) {
   1.272 +      // if we failed, we consume one byte, replace it with the replacement
   1.273 +      // character and try the conversion again.
   1.274 +      // This is only needed because some decoders don't follow the
   1.275 +      // nsIUnicodeDecoder contract: they return a failure when *aDestLength
   1.276 +      // is 0 rather than the correct NS_OK_UDEC_MOREOUTPUT.  See bug 244177
   1.277 +      if ((unichars + unicharLength) >= buffer->DataEnd()) {
   1.278 +        NS_ERROR("Unexpected end of destination buffer");
   1.279 +        break;
   1.280 +      }
   1.281 +      // Remember the output offset of the first replacement we insert.
   1.282 +      if (mReplacementCharacter == 0x0 && errorPos == -1) {
   1.283 +        errorPos = totalChars;
   1.284 +      }
   1.285 +      unichars[unicharLength++] = mReplacementCharacter == 0x0 ?
   1.286 +                                  mUnicodeDecoder->GetCharacterForUnMapped() :
   1.287 +                                  mReplacementCharacter;
   1.288 +      // Continue writing after the replacement, with the space that is left.
   1.289 +      unichars = unichars + unicharLength;
   1.290 +      unicharLength = unicharBufLen - (++totalChars);
   1.291 +      mUnicodeDecoder->Reset();
   1.292 +
   1.293 +      // Skip the consumed input plus the offending byte, clamped to aLen.
   1.294 +      if(((uint32_t) (srcLength + 1)) > aLen) {
   1.295 +        srcLength = aLen;
   1.296 +      }
   1.297 +      else {
   1.298 +        ++srcLength;
   1.299 +      }
   1.300 +      aBuffer += srcLength;
   1.301 +      aLen -= srcLength;
   1.302 +    }
   1.303 +  } while (NS_FAILED(res) && (aLen > 0));
   1.304 +
   1.305 +  buffer->SetDataLength(totalChars);
   1.306 +  // The decoder's return code is deliberately not propagated: it doesn't
   1.307 +  // reflect on our success or failure - Ref. bug 87110
   1.308 +  if (!AppendToBuffer(buffer, aRequest, errorPos)) {
   1.309 +    return NS_ERROR_OUT_OF_MEMORY;
   1.310 +  }
   1.311 +  return NS_OK;
   1.312 +}
   1.313 +
   1.314 +/**
   1.315 + *  Reads the character at the current position and advances past it.
   1.316 + *
   1.317 + *  @param   aChar receives the character, or 0 when kEOF is returned
   1.318 + *  @return  NS_OK, or kEOF when no buffered input is left
   1.319 + */
   1.320 +nsresult nsScanner::GetChar(char16_t& aChar) {
   1.321 +  const bool atEnd = !mSlidingBuffer || mCurrentPosition == mEndPosition;
   1.322 +  if (atEnd) {
   1.323 +    aChar = 0;
   1.324 +    return kEOF;
   1.325 +  }
   1.326 +
   1.327 +  aChar = *mCurrentPosition;
   1.328 +  ++mCurrentPosition;
   1.329 +  --mCountRemaining;
   1.330 +  return NS_OK;
   1.331 +}
   1.332 +
   1.333 +
   1.334 +/**
   1.335 + *  Looks at a character in the scanner's input buffer without consuming it.
   1.336 + *
   1.337 + *  @param   aChar receives the character; set to 0 when kEOF is returned
   1.338 + *  @param   aOffset distance ahead of the current position to look
   1.339 + *  @return  NS_OK, or kEOF when that position is not in the buffer
   1.340 + */
   1.341 +nsresult nsScanner::Peek(char16_t& aChar, uint32_t aOffset) {
   1.342 +  aChar = 0;
   1.343 +
   1.344 +  if (!mSlidingBuffer || mCurrentPosition == mEndPosition) {
   1.345 +    return kEOF;
   1.346 +  }
   1.347 +
   1.348 +  if (aOffset == 0) {
   1.349 +    aChar = *mCurrentPosition;
   1.350 +    return NS_OK;
   1.351 +  }
   1.352 +
   1.353 +  // Looking further ahead: the target must lie inside the remaining input.
   1.354 +  if (mCountRemaining <= aOffset) {
   1.355 +    return kEOF;
   1.356 +  }
   1.357 +
   1.358 +  nsScannerIterator target = mCurrentPosition;
   1.359 +  target.advance(aOffset);
   1.360 +  aChar = *target;
   1.361 +  return NS_OK;
   1.362 +}
   1.363 +
   1.364 +nsresult nsScanner::Peek(nsAString& aStr, int32_t aNumChars, int32_t aOffset)
   1.365 +{
   1.366 +  // Hands up to aNumChars characters, starting aOffset characters ahead
   1.367 +  // of the current position, to CopyUnicodeTo() without consuming input.
   1.368 +  if (!mSlidingBuffer || mCurrentPosition == mEndPosition) {
   1.369 +    return kEOF;
   1.370 +  }
   1.371 +  if ((int32_t)mCountRemaining <= aOffset) {
   1.372 +    return kEOF;
   1.373 +  }
   1.374 +
   1.375 +  nsScannerIterator first = mCurrentPosition;
   1.376 +  if (aOffset > 0) {
   1.377 +    first.advance(aOffset);
   1.378 +  }
   1.379 +
   1.380 +  // Clamp the range to the end of the buffer when fewer than
   1.381 +  // aNumChars characters are available behind the offset.
   1.382 +  nsScannerIterator last;
   1.383 +  if (mCountRemaining >= uint32_t(aNumChars + aOffset)) {
   1.384 +    last = first;
   1.385 +    last.advance(aNumChars);
   1.386 +  } else {
   1.387 +    last = mEndPosition;
   1.388 +  }
   1.389 +
   1.390 +  CopyUnicodeTo(first, last, aStr);
   1.391 +
   1.392 +  return NS_OK;
   1.393 +}
   1.394 +
   1.395 +
   1.396 +/**
   1.397 + *  Skip spaces, tabs and line breaks at the current position.
   1.398 + *  A CR LF or LF CR pair is counted as a single newline.
   1.399 + *  @update  gess 3/25/98
   1.400 + *  @param   aNewlinesSkipped incremented once per line break skipped
   1.401 + *  @return  Peek()'s result, or kEOF if skipping reached the end of input
   1.402 + */
   1.403 +nsresult nsScanner::SkipWhitespace(int32_t& aNewlinesSkipped) {
   1.404 +
   1.405 +  if (!mSlidingBuffer) {
   1.406 +    return kEOF;
   1.407 +  }
   1.408 +
   1.409 +  char16_t theChar = 0;
   1.410 +  nsresult  result = Peek(theChar); // theChar starts as the char at the current position
   1.411 +  
   1.412 +  if (NS_FAILED(result)) {
   1.413 +    return result;
   1.414 +  }
   1.415 +  
   1.416 +  nsScannerIterator current = mCurrentPosition;
   1.417 +  bool      done = false;
   1.418 +  bool      skipped = false;
   1.419 +  
   1.420 +  while (!done && current != mEndPosition) {
   1.421 +    switch(theChar) {
   1.422 +      case '\n':
   1.423 +      case '\r': ++aNewlinesSkipped; // count the break, then fall through to the skip
   1.424 +      case ' ' :
   1.425 +      case '\t':
   1.426 +        {
   1.427 +          skipped = true;
   1.428 +          char16_t thePrevChar = theChar;
   1.429 +          theChar = (++current != mEndPosition) ? *current : '\0'; // '\0' stands in at end of input
   1.430 +          if ((thePrevChar == '\r' && theChar == '\n') ||
   1.431 +              (thePrevChar == '\n' && theChar == '\r')) {
   1.432 +            theChar = (++current != mEndPosition) ? *current : '\0'; // CRLF == LFCR => LF
   1.433 +          }
   1.434 +        }
   1.435 +        break;
   1.436 +      default:
   1.437 +        done = true; // first non-whitespace character; it is left unread
   1.438 +        break;
   1.439 +    }
   1.440 +  }
   1.441 +
   1.442 +  if (skipped) {
   1.443 +    SetPosition(current); // the position only moves if something was skipped
   1.444 +    if (current == mEndPosition) {
   1.445 +      result = kEOF;
   1.446 +    }
   1.447 +  }
   1.448 +
   1.449 +  return result;
   1.450 +}
   1.451 +
   1.452 +/**
   1.453 + *  Skip over chars as long as they equal given char
   1.454 + *
   1.455 + *  The first character that differs from aSkipChar is left unread.
   1.456 + *
   1.457 + *  @update  gess 3/25/98
   1.458 + *  @param   aSkipChar the character to skip
   1.459 + *  @return  NS_OK when a different character stopped the skipping,
   1.460 + *           kEOF without a buffer, otherwise the non-NS_OK result
   1.461 + *           of Peek()
   1.462 + */
   1.463 +nsresult nsScanner::SkipOver(char16_t aSkipChar){
   1.464 +
   1.465 +  if (!mSlidingBuffer) {
   1.466 +    return kEOF;
   1.467 +  }
   1.468 +
   1.469 +  char16_t next = 0;
   1.470 +  nsresult status = Peek(next);
   1.471 +
   1.472 +  // Consume characters for as long as the upcoming one matches.
   1.473 +  while (NS_OK == status && next == aSkipChar) {
   1.474 +    GetChar(next);
   1.475 +    status = Peek(next);
   1.476 +  }
   1.477 +
   1.478 +  // NS_OK: stopped at a different character; otherwise Peek()'s result.
   1.479 +  return status;
   1.480 +}
   1.481 +
   1.482 +#if 0 // compiled out; the visible calls to DoErrTest in this file are commented out
   1.483 +void DoErrTest(nsString& aString) {
   1.484 +  int32_t pos=aString.FindChar(0); // look for a NUL character
   1.485 +  if(kNotFound<pos) {
   1.486 +    if(aString.Length()-1!=pos) { // NUL that is not the last character; body left empty
   1.487 +    }
   1.488 +  }
   1.489 +}
   1.490 +
   1.491 +void DoErrTest(nsCString& aString) {
   1.492 +  int32_t pos=aString.FindChar(0); // look for a NUL character
   1.493 +  if(kNotFound<pos) {
   1.494 +    if(aString.Length()-1!=pos) { // NUL that is not the last character; body left empty
   1.495 +    }
   1.496 +  }
   1.497 +}
   1.498 +#endif
   1.499 +
   1.500 +// Tells whether aChar ends a tag identifier: whitespace, '<', '>' or '/'.
   1.501 +static bool IsTagIdentifierEnd(char16_t aChar)
   1.502 +{
   1.503 +  switch (aChar) {
   1.504 +    case '\n': case '\r': case ' ': case '\t': case '\v': case '\f':
   1.505 +    case '<': case '>': case '/':
   1.506 +      return true;
   1.507 +    default:
   1.508 +      return false;
   1.509 +  }
   1.510 +}
   1.511 +
   1.512 +/**
   1.513 + *  Consume characters until you run into space, a '<', a '>', or a '/'.
   1.514 + *  The delimiter itself is left unread; NUL characters met on the way
   1.515 + *  are replaced by sInvalid in the buffer.
   1.516 + *
   1.517 + *  @param   aString - receives new data from stream
   1.518 + *  @return  kEOF if the input ran out, otherwise the Peek() result
   1.519 + */
   1.520 +nsresult nsScanner::ReadTagIdentifier(nsScannerSharedSubstring& aString) {
   1.521 +
   1.522 +  if (!mSlidingBuffer) {
   1.523 +    return kEOF;
   1.524 +  }
   1.525 +
   1.526 +  // The status of this Peek() is what we hand back unless the scan
   1.527 +  // below runs into the end of the input.
   1.528 +  char16_t peeked = 0;
   1.529 +  nsresult status = Peek(peeked);
   1.530 +
   1.531 +  nsScannerIterator cursor = mCurrentPosition;
   1.532 +  nsScannerIterator limit = mEndPosition;
   1.533 +
   1.534 +  // Advance until a character that ends the identifier turns up. The
   1.535 +  // text is appended in one go afterwards.
   1.536 +  bool atEnd = false;
   1.537 +  while (cursor != limit && !atEnd) {
   1.538 +    const char16_t ch = *cursor;
   1.539 +
   1.540 +    if (ch == '\0') {
   1.541 +      // A NUL is overwritten in the buffer and scanning continues.
   1.542 +      ReplaceCharacter(cursor, sInvalid);
   1.543 +    } else {
   1.544 +      atEnd = IsTagIdentifierEnd(ch);
   1.545 +    }
   1.546 +
   1.547 +    if (!atEnd) {
   1.548 +      ++cursor;
   1.549 +    }
   1.550 +  }
   1.551 +
   1.552 +  // Don't bother appending nothing.
   1.553 +  if (cursor != mCurrentPosition) {
   1.554 +    AppendUnicodeTo(mCurrentPosition, cursor, aString);
   1.555 +  }
   1.556 +
   1.557 +  SetPosition(cursor);
   1.558 +  // Running out of input overrides the Peek() status.
   1.559 +  if (cursor == limit) {
   1.560 +    status = kEOF;
   1.561 +  }
   1.562 +
   1.563 +  return status;
   1.564 +}
   1.565 +
   1.566 +// Tells whether aChar may appear in an entity name: ASCII letters and
   1.567 +// digits plus '_', '-' and '.'. ':' is deliberately not allowed, see
   1.568 +// bug 23791.
   1.569 +static bool IsEntityNameChar(char16_t aChar)
   1.570 +{
   1.571 +  if (aChar == '_' || aChar == '-' || aChar == '.') {
   1.572 +    return true;
   1.573 +  }
   1.574 +  return ('a' <= aChar && aChar <= 'z') ||
   1.575 +         ('A' <= aChar && aChar <= 'Z') ||
   1.576 +         ('0' <= aChar && aChar <= '9');
   1.577 +}
   1.578 +
   1.579 +/**
   1.580 + *  Consume characters until you run into a char that's not valid in an
   1.581 + *  entity name
   1.582 + *
   1.583 + *  The terminating character is left unread.
   1.584 + *
   1.585 + *  @param   aString - receives new data from stream
   1.586 + *  @return  kEOF if the input ran out, otherwise the Peek() result
   1.587 + */
   1.588 +nsresult nsScanner::ReadEntityIdentifier(nsString& aString) {
   1.589 +
   1.590 +  if (!mSlidingBuffer) {
   1.591 +    return kEOF;
   1.592 +  }
   1.593 +
   1.594 +  char16_t peeked = 0;
   1.595 +  nsresult status = Peek(peeked);
   1.596 +
   1.597 +  nsScannerIterator origin = mCurrentPosition;
   1.598 +  nsScannerIterator cursor = mCurrentPosition;
   1.599 +  nsScannerIterator limit = mEndPosition;
   1.600 +
   1.601 +  for (; cursor != limit; ++cursor) {
   1.602 +    const char16_t ch = *cursor;
   1.603 +
   1.604 +    // NUL characters are stepped over without being checked, so they
   1.605 +    // stay part of the name.
   1.606 +    if (ch && !IsEntityNameChar(ch)) {
   1.607 +      // First character that cannot belong to the name: append what
   1.608 +      // was read so far and stop in front of it.
   1.609 +      AppendUnicodeTo(mCurrentPosition, cursor, aString);
   1.610 +      break;
   1.611 +    }
   1.612 +  }
   1.613 +
   1.614 +  // Commit the new read position before deciding on the result.
   1.615 +  SetPosition(cursor);
   1.616 +  if (cursor == limit) {
   1.617 +    // Ran out of input: everything from the origin on is the name.
   1.618 +    AppendUnicodeTo(origin, cursor, aString);
   1.619 +    return kEOF;
   1.620 +  }
   1.621 +
   1.622 +  // Stopped at a non-name character: report the Peek() status.
   1.623 +  return status;
   1.624 +}
   1.625 +
   1.626 +/**
   1.627 + *  Consume digits
   1.628 + *
   1.629 + *  Reads decimal digits, plus A-F/a-f when aBase is 16, up to the
   1.630 + *  first other character. NUL characters are stepped over.
   1.631 + *  @param   aString - receives the characters that were read
   1.632 + *  @param   aBase - 10 or 16 (asserted)
   1.633 + *  @return  kEOF if the input ran out, otherwise the Peek() result
   1.634 + */
   1.635 +nsresult nsScanner::ReadNumber(nsString& aString,int32_t aBase) {
   1.636 +
   1.637 +  if (!mSlidingBuffer) {
   1.638 +    return kEOF;
   1.639 +  }
   1.640 +
   1.641 +  NS_ASSERTION(aBase == 10 || aBase == 16,"base value not supported");
   1.642 +
   1.643 +  char16_t peeked = 0;
   1.644 +  nsresult status = Peek(peeked);
   1.645 +
   1.646 +  nsScannerIterator origin = mCurrentPosition;
   1.647 +  nsScannerIterator cursor = origin;
   1.648 +  nsScannerIterator limit = mEndPosition;
   1.649 +
   1.650 +  const bool hex = (aBase == 16);
   1.651 +  for (; cursor != limit; ++cursor) {
   1.652 +    const char16_t ch = *cursor;
   1.653 +    // A NUL is neither accepted nor rejected; it is simply passed over.
   1.654 +    if (!ch) {
   1.655 +      continue;
   1.656 +    }
   1.657 +    const bool digit = ('0' <= ch && ch <= '9') ||
   1.658 +                       (hex && (('A' <= ch && ch <= 'F') ||
   1.659 +                                ('a' <= ch && ch <= 'f')));
   1.660 +    if (!digit) {
   1.661 +      AppendUnicodeTo(origin, cursor, aString);
   1.662 +      break;
   1.663 +    }
   1.664 +  }
   1.665 +
   1.666 +  SetPosition(cursor);
   1.667 +  if (cursor == limit) {
   1.668 +    AppendUnicodeTo(origin, cursor, aString);
   1.669 +    return kEOF;
   1.670 +  }
   1.671 +
   1.672 +  return status;
   1.673 +}
   1.674 +
   1.675 +/**
   1.676 + *  Consume a run of whitespace (space, tab, CR, LF) and append it to aString
   1.677 + *  
   1.678 + *  @update  gess 3/25/98
   1.679 + *  @param   aString receives the whitespace; a lone CR is followed by an added LF
   1.680 + *  @param   aNewlinesSkipped incremented per line break; aHaveCR is set for a CR/LF pair or lone CR
   1.681 + *  @return  Peek()'s result, or kEOF if the run reaches the end of input
   1.682 + */
   1.683 +nsresult nsScanner::ReadWhitespace(nsScannerSharedSubstring& aString,
   1.684 +                                   int32_t& aNewlinesSkipped,
   1.685 +                                   bool& aHaveCR) {
   1.686 +
   1.687 +  aHaveCR = false; // defined even on the early returns below
   1.688 +
   1.689 +  if (!mSlidingBuffer) {
   1.690 +    return kEOF;
   1.691 +  }
   1.692 +
   1.693 +  char16_t theChar = 0;
   1.694 +  nsresult  result = Peek(theChar); // theChar starts as the char at the current position
   1.695 +  
   1.696 +  if (NS_FAILED(result)) {
   1.697 +    return result;
   1.698 +  }
   1.699 +  
   1.700 +  nsScannerIterator origin, current, end;
   1.701 +  bool done = false;  
   1.702 +
   1.703 +  origin = mCurrentPosition; // start of the text not yet appended to aString
   1.704 +  current = origin;
   1.705 +  end = mEndPosition;
   1.706 +
   1.707 +  bool haveCR = false;
   1.708 +
   1.709 +  while(!done && current != end) {
   1.710 +    switch(theChar) {
   1.711 +      case '\n':
   1.712 +      case '\r':
   1.713 +        {
   1.714 +          ++aNewlinesSkipped;
   1.715 +          char16_t thePrevChar = theChar;
   1.716 +          theChar = (++current != end) ? *current : '\0'; // '\0' stands in at end of input
   1.717 +          if ((thePrevChar == '\r' && theChar == '\n') ||
   1.718 +              (thePrevChar == '\n' && theChar == '\r')) {
   1.719 +            theChar = (++current != end) ? *current : '\0'; // CRLF == LFCR => LF
   1.720 +            haveCR = true;
   1.721 +          } else if (thePrevChar == '\r') {
   1.722 +            // Lone CR becomes CRLF; callers should know to remove extra CRs
   1.723 +            AppendUnicodeTo(origin, current, aString);
   1.724 +            aString.writable().Append(char16_t('\n'));
   1.725 +            origin = current; // text up to here has been appended already
   1.726 +            haveCR = true;
   1.727 +          }
   1.728 +        }
   1.729 +        break;
   1.730 +      case ' ' :
   1.731 +      case '\t':
   1.732 +        theChar = (++current != end) ? *current : '\0';
   1.733 +        break;
   1.734 +      default:
   1.735 +        done = true; // first non-whitespace character; it is left unread
   1.736 +        AppendUnicodeTo(origin, current, aString);
   1.737 +        break;
   1.738 +    }
   1.739 +  }
   1.740 +
   1.741 +  SetPosition(current);
   1.742 +  if (current == end) {
   1.743 +    AppendUnicodeTo(origin, current, aString); // the run ended with the input
   1.744 +    result = kEOF;
   1.745 +  }
   1.746 +
   1.747 +  aHaveCR = haveCR;
   1.748 +  return result;
   1.749 +}
   1.750 +
   1.751 +//XXXbz callers of this have to manage their lone '\r' themselves if they want
   1.752 +//it to work.  Good thing they're all in view-source and it deals.
   1.753 +nsresult nsScanner::ReadWhitespace(nsScannerIterator& aStart, 
   1.754 +                                   nsScannerIterator& aEnd,
   1.755 +                                   int32_t& aNewlinesSkipped) {
   1.756 +
   1.757 +  if (!mSlidingBuffer) {
   1.758 +    return kEOF;
   1.759 +  }
   1.760 +
   1.761 +  char16_t theChar = 0;
   1.762 +  nsresult  result = Peek(theChar); // theChar starts as the char at the current position
   1.763 +  
   1.764 +  if (NS_FAILED(result)) {
   1.765 +    return result; // aStart and aEnd are left untouched here
   1.766 +  }
   1.767 +  
   1.768 +  nsScannerIterator origin, current, end;
   1.769 +  bool done = false;  
   1.770 +
   1.771 +  origin = mCurrentPosition;
   1.772 +  current = origin;
   1.773 +  end = mEndPosition;
   1.774 +
   1.775 +  while(!done && current != end) {
   1.776 +    switch(theChar) {
   1.777 +      case '\n':
   1.778 +      case '\r': ++aNewlinesSkipped; // count the break, then fall through to the skip
   1.779 +      case ' ' :
   1.780 +      case '\t':
   1.781 +        {
   1.782 +          char16_t thePrevChar = theChar;
   1.783 +          theChar = (++current != end) ? *current : '\0'; // '\0' stands in at end of input
   1.784 +          if ((thePrevChar == '\r' && theChar == '\n') ||
   1.785 +              (thePrevChar == '\n' && theChar == '\r')) {
   1.786 +            theChar = (++current != end) ? *current : '\0'; // CRLF == LFCR => LF
   1.787 +          }
   1.788 +        }
   1.789 +        break;
   1.790 +      default:
   1.791 +        done = true; // first non-whitespace character; it is left unread
   1.792 +        aStart = origin;
   1.793 +        aEnd = current;
   1.794 +        break;
   1.795 +    }
   1.796 +  }
   1.797 +
   1.798 +  SetPosition(current);
   1.799 +  if (current == end) {
   1.800 +    aStart = origin; // the run ended with the input
   1.801 +    aEnd = current;
   1.802 +    result = kEOF;
   1.803 +  }
   1.804 +
   1.805 +  return result;
   1.806 +}
   1.807 +
   1.808 +/**
   1.809 + *  Consume characters until you encounter one contained in given
   1.810 + *  input set.
   1.811 + *
   1.812 + *  Characters before the terminator are appended to aString. NUL
   1.813 + *  characters met on the way are replaced by sInvalid first.
   1.814 + *
   1.815 + *  @update  gess 3/25/98
   1.816 + *  @param   aString receives the characters that were consumed
   1.817 + *  @param   aEndCondition holds the terminators and their bit filter
   1.818 + *  @param   addTerminal if true the terminator is consumed and appended too
   1.819 + *  @return  NS_OK at a terminator, kEOF if the input ends first
   1.820 + */
   1.821 +nsresult nsScanner::ReadUntil(nsAString& aString,
   1.822 +                              const nsReadEndCondition& aEndCondition,
   1.823 +                              bool addTerminal)
   1.824 +{
   1.825 +  if (!mSlidingBuffer) {
   1.826 +    return kEOF;
   1.827 +  }
   1.828 +
   1.829 +  nsScannerIterator origin = mCurrentPosition;
   1.830 +  nsScannerIterator cursor = origin;
   1.831 +
   1.832 +  // With nothing to read, Peek() fails and aString is left untouched.
   1.833 +  char16_t ch = 0;
   1.834 +  nsresult status = Peek(ch);
   1.835 +  if (NS_FAILED(status)) {
   1.836 +    return status;
   1.837 +  }
   1.838 +
   1.839 +  const char16_t* const terminators = aEndCondition.mChars;
   1.840 +
   1.841 +  for (; cursor != mEndPosition; ++cursor) {
   1.842 +    ch = *cursor;
   1.843 +    if (ch == '\0') {
   1.844 +      // A NUL is overwritten in the buffer and then treated as the
   1.845 +      // replacement character for the comparison below.
   1.846 +      ReplaceCharacter(cursor, sInvalid);
   1.847 +      ch = sInvalid;
   1.848 +    }
   1.849 +
   1.850 +    // Quick reject: a bit that no terminator uses rules the
   1.851 +    // character out without looking at the list.
   1.852 +    if (ch & aEndCondition.mFilter) {
   1.853 +      continue;
   1.854 +    }
   1.855 +
   1.856 +    // Candidate: compare against each terminator in turn.
   1.857 +    for (const char16_t* t = terminators; *t; ++t) {
   1.858 +      if (*t != ch) {
   1.859 +        continue;
   1.860 +      }
   1.861 +
   1.862 +      // Terminator found; include it only when asked to.
   1.863 +      if (addTerminal) {
   1.864 +        ++cursor;
   1.865 +      }
   1.866 +      AppendUnicodeTo(origin, cursor, aString);
   1.867 +      SetPosition(cursor);
   1.868 +      return NS_OK;
   1.869 +    }
   1.870 +  }
   1.871 +
   1.872 +  // No terminator in the buffer: cursor == mEndPosition here.
   1.873 +  SetPosition(cursor);
   1.874 +  AppendUnicodeTo(origin, cursor, aString);
   1.875 +  return kEOF;
   1.876 +}
   1.877 +
   1.878 +/**
   1.879 + *  Shared-substring flavour of ReadUntil: consumes characters up to one
   1.880 + *  of the terminators in aEndCondition and appends them to aString.
   1.881 + *
   1.882 + *  @param   aString receives the characters that were consumed
   1.883 + *  @param   addTerminal if true the terminator is consumed and appended too
   1.884 + *  @return  NS_OK at a terminator, kEOF if the input ends first
   1.885 + */
   1.886 +nsresult nsScanner::ReadUntil(nsScannerSharedSubstring& aString,
   1.887 +                              const nsReadEndCondition& aEndCondition,
   1.888 +                              bool addTerminal)
   1.889 +{
   1.890 +  if (!mSlidingBuffer) {
   1.891 +    return kEOF;
   1.892 +  }
   1.893 +
   1.894 +  nsScannerIterator origin = mCurrentPosition;
   1.895 +  nsScannerIterator cursor = origin;
   1.896 +
   1.897 +  char16_t ch = 0;
   1.898 +  nsresult status = Peek(ch);
   1.899 +  if (NS_FAILED(status)) {
   1.900 +    return status;
   1.901 +  }
   1.902 +
   1.903 +  const char16_t* const terminators = aEndCondition.mChars;
   1.904 +
   1.905 +  for (; cursor != mEndPosition; ++cursor) {
   1.906 +    ch = *cursor;
   1.907 +    if (ch == '\0') {
   1.908 +      // NULs are replaced in the buffer and matched as the replacement.
   1.909 +      ReplaceCharacter(cursor, sInvalid);
   1.910 +      ch = sInvalid;
   1.911 +    }
   1.912 +
   1.913 +    // Quick reject: a bit that no terminator uses rules the char out.
   1.914 +    if (ch & aEndCondition.mFilter) {
   1.915 +      continue;
   1.916 +    }
   1.917 +
   1.918 +    // Candidate: compare against each terminator in turn.
   1.919 +    for (const char16_t* t = terminators; *t; ++t) {
   1.920 +      if (*t != ch) {
   1.921 +        continue;
   1.922 +      }
   1.923 +      if (addTerminal) {
   1.924 +        ++cursor;
   1.925 +      }
   1.926 +      AppendUnicodeTo(origin, cursor, aString);
   1.927 +      SetPosition(cursor);
   1.928 +      return NS_OK;
   1.929 +    }
   1.930 +  }
   1.931 +
   1.932 +  // No terminator in the buffer: cursor == mEndPosition here.
   1.933 +  SetPosition(cursor);
   1.934 +  AppendUnicodeTo(origin, cursor, aString);
   1.935 +  return kEOF;
   1.936 +}
   1.937 +
   1.938 +/**
   1.939 + *  Iterator flavour of ReadUntil: instead of copying text it reports the
   1.940 + *  consumed range through aStart and aEnd.
   1.941 + *
   1.942 + *  @param   addTerminal if true the terminator is part of the range
   1.943 + *  @return  NS_OK at a terminator, kEOF if the input ends first
   1.944 + */
   1.945 +nsresult nsScanner::ReadUntil(nsScannerIterator& aStart, 
   1.946 +                              nsScannerIterator& aEnd,
   1.947 +                              const nsReadEndCondition &aEndCondition,
   1.948 +                              bool addTerminal)
   1.949 +{
   1.950 +  if (!mSlidingBuffer) {
   1.951 +    return kEOF;
   1.952 +  }
   1.953 +
   1.954 +  nsScannerIterator origin = mCurrentPosition;
   1.955 +  nsScannerIterator cursor = origin;
   1.956 +
   1.957 +  char16_t ch = 0;
   1.958 +  nsresult status = Peek(ch);
   1.959 +  if (NS_FAILED(status)) {
   1.960 +    aStart = aEnd = cursor;
   1.961 +    return status;
   1.962 +  }
   1.963 +
   1.964 +  const char16_t* const terminators = aEndCondition.mChars;
   1.965 +  for (; cursor != mEndPosition; ++cursor) {
   1.966 +    ch = *cursor;
   1.967 +    if (ch == '\0') {
   1.968 +      // NULs are replaced in the buffer and matched as the replacement.
   1.969 +      ReplaceCharacter(cursor, sInvalid);
   1.970 +      ch = sInvalid;
   1.971 +    }
   1.972 +
   1.973 +    // Quick reject: a bit that no terminator uses rules the char out.
   1.974 +    if (ch & aEndCondition.mFilter) {
   1.975 +      continue;
   1.976 +    }
   1.977 +    // Candidate: compare against each terminator in turn.
   1.978 +    for (const char16_t* t = terminators; *t; ++t) {
   1.979 +      if (*t != ch) {
   1.980 +        continue;
   1.981 +      }
   1.982 +      if (addTerminal) {
   1.983 +        ++cursor;
   1.984 +      }
   1.985 +      aStart = origin;
   1.986 +      aEnd = cursor;
   1.987 +      SetPosition(cursor);
   1.988 +      return NS_OK;
   1.989 +    }
   1.990 +  }
   1.991 +
   1.992 +  // No terminator in the buffer: cursor == mEndPosition here.
   1.993 +  SetPosition(cursor);
   1.994 +  aStart = origin;
   1.995 +  aEnd = cursor;
   1.996 +  return kEOF;
   1.997 +}
   1.998 +
   1.999 +/**
  1.1000 + *  Reads characters until aTerminalChar is encountered and passes the
  1.1001 + *  scanned range to AppendUnicodeTo(). NUL characters met on the way are
  1.1002 + *  replaced in the buffer with sInvalid before being compared or copied.
  1.1003 + *
  1.1004 + *  @param   aString destination handed to AppendUnicodeTo()
  1.1005 + *  @param   aTerminalChar the character that stops the scan
  1.1006 + *  @param   addTerminal when true, also consume and copy the terminator
  1.1007 + *  @return  NS_OK if the terminator was found; kEOF if there is no
  1.1008 + *           buffer or the data ran out first; otherwise Peek()'s error
  1.1009 + */
  1.1010 +nsresult nsScanner::ReadUntil(nsAString& aString,
  1.1011 +                              char16_t aTerminalChar,
  1.1012 +                              bool addTerminal)
  1.1013 +{
  1.1014 +  if (!mSlidingBuffer) {
  1.1015 +    return kEOF;
  1.1016 +  }
  1.1017 +
  1.1018 +  nsScannerIterator rangeStart = mCurrentPosition;
  1.1019 +  nsScannerIterator cursor = rangeStart;
  1.1020 +
  1.1021 +  char16_t ch;
  1.1022 +  nsresult peekResult = Peek(ch);
  1.1023 +  if (NS_FAILED(peekResult)) {
  1.1024 +    return peekResult;
  1.1025 +  }
  1.1026 +
  1.1027 +  nsresult outcome = kEOF;
  1.1028 +  for (; cursor != mEndPosition; ++cursor) {
  1.1029 +    ch = *cursor;
  1.1030 +    if (ch == '\0') {
  1.1031 +      ReplaceCharacter(cursor, sInvalid);
  1.1032 +      ch = sInvalid;
  1.1033 +    }
  1.1034 +
  1.1035 +    if (ch == aTerminalChar) {
  1.1036 +      if (addTerminal) {
  1.1037 +        ++cursor;
  1.1038 +      }
  1.1039 +      outcome = NS_OK;
  1.1040 +      break;
  1.1041 +    }
  1.1042 +  }
  1.1043 +
  1.1044 +  // Both outcomes copy the range rangeStart..cursor and move the scanner
  1.1045 +  // to cursor; when nothing matched, cursor == mEndPosition.
  1.1046 +  AppendUnicodeTo(rangeStart, cursor, aString);
  1.1047 +  SetPosition(cursor);
  1.1048 +  return outcome;
  1.1049 +}
  1.1050 +
  1.1051 +void nsScanner::BindSubstring(nsScannerSubstring& aSubstring, const nsScannerIterator& aStart, const nsScannerIterator& aEnd)
  1.1052 +{ // Rebinds aSubstring to the aStart..aEnd range of the sliding buffer.
  1.1053 +  aSubstring.Rebind(*mSlidingBuffer, aStart, aEnd); // NOTE(review): no null check on mSlidingBuffer here, unlike sibling methods
  1.1054 +}
  1.1055 +
  1.1056 +void nsScanner::CurrentPosition(nsScannerIterator& aPosition)
  1.1057 +{
  1.1058 +  aPosition = mCurrentPosition; // out-parameter: copy of the scanner's current position
  1.1059 +}
  1.1060 +
  1.1061 +void nsScanner::EndReading(nsScannerIterator& aPosition)
  1.1062 +{
  1.1063 +  aPosition = mEndPosition; // out-parameter: copy of the scanner's end position
  1.1064 +}
  1.1065 + 
  1.1066 +// Moves the scanner to aPosition and adjusts mCountRemaining to match;
  1.1067 +// aReverse must be true when aPosition lies before the current position.
  1.1068 +void nsScanner::SetPosition(nsScannerIterator& aPosition, bool aTerminate, bool aReverse)
  1.1069 +{
  1.1070 +  if (!mSlidingBuffer) {
  1.1071 +    return;
  1.1072 +  }
  1.1073 +#ifdef DEBUG
  1.1074 +  uint32_t origRemaining = mCountRemaining;
  1.1075 +#endif
  1.1076 +  if (!aReverse) {
  1.1077 +    mCountRemaining -= Distance(mCurrentPosition, aPosition);
  1.1078 +  } else {
  1.1079 +    mCountRemaining += Distance(aPosition, mCurrentPosition);
  1.1080 +  }
  1.1081 +  NS_ASSERTION((mCountRemaining >= origRemaining && aReverse) ||
  1.1082 +               (mCountRemaining <= origRemaining && !aReverse),
  1.1083 +               "Improper use of nsScanner::SetPosition. Make sure to set the"
  1.1084 +               " aReverse parameter correctly");
  1.1085 +  mCurrentPosition = aPosition;
  1.1086 +  // Terminating at the end of the data: move the mark and drop the prefix.
  1.1087 +  if (aTerminate && mCurrentPosition == mEndPosition) {
  1.1088 +    mMarkPosition = mCurrentPosition;
  1.1089 +    mSlidingBuffer->DiscardPrefix(mCurrentPosition);
  1.1090 +  }
  1.1091 +}
  1.1092 +
  1.1093 +void nsScanner::ReplaceCharacter(nsScannerIterator& aPosition, char16_t aChar)
  1.1094 +{
  1.1095 +  if (!mSlidingBuffer) {
  1.1096 +    return;  // no buffer, so there is nothing to overwrite
  1.1097 +  }
  1.1098 +  mSlidingBuffer->ReplaceCharacter(aPosition, aChar);  // forward to the buffer
  1.1099 +}
  1.1100 +
  1.1101 +// Appends aBuf to the data being scanned, creating the sliding buffer on
  1.1102 +// first use, and refreshes mEndPosition and mCountRemaining. aErrorPos is
  1.1103 +// -1 or (presumably) the offset of an invalid character within aBuf; only
  1.1104 +// the first such position is recorded. aRequest is not used. Always
  1.1105 +// returns true: a plain new-expression never yields a null pointer.
  1.1106 +bool nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf,
  1.1107 +                                 nsIRequest *aRequest,
  1.1108 +                                 int32_t aErrorPos)
  1.1109 +{
  1.1110 +  // Unread count before this append; used below to locate aErrorPos.
  1.1111 +  uint32_t countRemaining = mCountRemaining;
  1.1112 +  if (!mSlidingBuffer) {
  1.1113 +    mSlidingBuffer = new nsScannerString(aBuf);
  1.1114 +    mSlidingBuffer->BeginReading(mCurrentPosition);
  1.1115 +    mMarkPosition = mCurrentPosition;
  1.1116 +    mSlidingBuffer->EndReading(mEndPosition);
  1.1117 +    mCountRemaining = aBuf->DataLength();
  1.1118 +  }
  1.1119 +  else {
  1.1120 +    mSlidingBuffer->AppendBuffer(aBuf);
  1.1121 +    if (mCurrentPosition == mEndPosition) {
  1.1122 +      mSlidingBuffer->BeginReading(mCurrentPosition);
  1.1123 +    }
  1.1124 +    mSlidingBuffer->EndReading(mEndPosition);
  1.1125 +    mCountRemaining += aBuf->DataLength();
  1.1126 +  }
  1.1127 +
  1.1128 +  if (aErrorPos != -1 && !mHasInvalidCharacter) {
  1.1129 +    mHasInvalidCharacter = true;
  1.1130 +    mFirstInvalidPosition = mCurrentPosition;
  1.1131 +    mFirstInvalidPosition.advance(countRemaining + aErrorPos);
  1.1132 +  }
  1.1133 +
  1.1134 +  if (mFirstNonWhitespacePosition == -1) {
  1.1135 +    // Record how far past the current position the first non-space is.
  1.1136 +    for (nsScannerIterator iter(mCurrentPosition);
  1.1137 +         iter != mEndPosition; ++iter) {
  1.1138 +      if (!nsCRT::IsAsciiSpace(*iter)) {
  1.1139 +        mFirstNonWhitespacePosition = Distance(mCurrentPosition, iter);
  1.1140 +        break;
  1.1141 +      }
  1.1142 +    }
  1.1143 +  }
  1.1144 +  return true;
  1.1145 +}
  1.1146 +
  1.1147 +/**
  1.1148 + *  Copies out the data that the tokenization process has not consumed
  1.1149 + *  yet, i.e. the range between the scanner's current position and its
  1.1150 + *  end position.
  1.1151 + *
  1.1152 + *  @param   aCopyBuffer receives the unconsumed data via CopyUnicodeTo();
  1.1153 + *           it is truncated instead when the scanner has no buffer
  1.1154 + *  @return  nothing
  1.1155 + */
  1.1156 +void nsScanner::CopyUnusedData(nsString& aCopyBuffer) {
  1.1157 +  if (mSlidingBuffer) {
  1.1158 +    // Everything from the current position to the end is still unread.
  1.1159 +    nsScannerIterator unreadBegin(mCurrentPosition);
  1.1160 +    nsScannerIterator unreadEnd(mEndPosition);
  1.1161 +    CopyUnicodeTo(unreadBegin, unreadEnd, aCopyBuffer);
  1.1162 +  } else {
  1.1163 +    // Without a buffer there is nothing to hand back.
  1.1164 +    aCopyBuffer.Truncate();
  1.1165 +  }
  1.1166 +}
  1.1167 +
  1.1168 +/**
  1.1169 + *  Retrieve the name of the file that the scanner is reading from.
  1.1170 + *  In some cases, it's just a given name, because the scanner isn't
  1.1171 + *  really reading from a file.
  1.1172 + *
  1.1173 + *  @update  gess 5/12/98
  1.1174 + *  @return  a non-const reference to the scanner's mFilename member
  1.1175 + */
  1.1176 +nsString& nsScanner::GetFilename(void) {
  1.1177 +  return mFilename;
  1.1178 +}
  1.1179 +
  1.1180 +/**
  1.1181 + *  Self-test hook. The body is empty: self-testing for this class
  1.1182 + *  occurs in the parser selftest.
  1.1183 + *  NOTE(review): the guard below tests _DEBUG, not DEBUG as SetPosition does.
  1.1184 + *  @update  gess 3/25/98
  1.1185 + *  @param   none
  1.1186 + *  @return  nothing
  1.1187 + */
  1.1188 +
  1.1189 +void nsScanner::SelfTest(void) {
  1.1190 +#ifdef _DEBUG
  1.1191 +#endif
  1.1192 +}
  1.1193 +
  1.1194 +void nsScanner::OverrideReplacementCharacter(char16_t aReplacementCharacter)
  1.1195 +{
  1.1196 +  mReplacementCharacter = aReplacementCharacter;  // remember the override
  1.1197 +  if (!mHasInvalidCharacter) {
  1.1198 +    return;  // no invalid character recorded, nothing to rewrite
  1.1199 +  }
  1.1200 +  ReplaceCharacter(mFirstInvalidPosition, mReplacementCharacter);
  1.1201 +}
  1.1202 +
The Tor Browser / file diff

diff: parser/htmlparser/src/nsScanner.cpp

parser/htmlparser/src/nsScanner.cpp