1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/parser/htmlparser/src/nsScanner.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,309 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 + 1.10 +/** 1.11 + * MODULE NOTES: 1.12 + * @update gess 4/1/98 1.13 + * 1.14 + * The scanner is a low-level service class that knows 1.15 + * how to consume characters out of an (internal) stream. 1.16 + * This class also offers a series of utility methods 1.17 + * that most tokenizers want, such as readUntil() 1.18 + * and SkipWhitespace(). 1.19 + */ 1.20 + 1.21 + 1.22 +#ifndef SCANNER 1.23 +#define SCANNER 1.24 + 1.25 +#include "nsCOMPtr.h" 1.26 +#include "nsString.h" 1.27 +#include "nsIParser.h" 1.28 +#include "nsIUnicodeDecoder.h" 1.29 +#include "nsScannerString.h" 1.30 + 1.31 +class nsParser; 1.32 + 1.33 +class nsReadEndCondition { 1.34 +public: 1.35 + const char16_t *mChars; 1.36 + char16_t mFilter; 1.37 + explicit nsReadEndCondition(const char16_t* aTerminateChars); 1.38 +private: 1.39 + nsReadEndCondition(const nsReadEndCondition& aOther); // No copying 1.40 + void operator=(const nsReadEndCondition& aOther); // No assigning 1.41 +}; 1.42 + 1.43 +class nsScanner { 1.44 + public: 1.45 + 1.46 + /** 1.47 + * Use this constructor for the XML fragment parsing case 1.48 + */ 1.49 + nsScanner(const nsAString& anHTMLString); 1.50 + 1.51 + /** 1.52 + * Use this constructor if you want i/o to be based on 1.53 + * a file (therefore a stream) or just data you provide via Append(). 1.54 + */ 1.55 + nsScanner(nsString& aFilename, bool aCreateStream); 1.56 + 1.57 + ~nsScanner(); 1.58 + 1.59 + /** 1.60 + * retrieve next char from internal input stream 1.61 + * 1.62 + * @update gess 3/25/98 1.63 + * @param ch is the char to accept new value 1.64 + * @return error code reflecting read status 1.65 + */ 1.66 + nsresult GetChar(char16_t& ch); 1.67 + 1.68 + /** 1.69 + * peek ahead to consume next char from scanner's internal 1.70 + * input buffer 1.71 + * 1.72 + * @update gess 3/25/98 1.73 + * @param ch is the char to accept new value 1.74 + * @return error code reflecting read status 1.75 + */ 1.76 + nsresult Peek(char16_t& ch, uint32_t aOffset=0); 1.77 + 1.78 + nsresult Peek(nsAString& aStr, int32_t aNumChars, int32_t aOffset = 0); 1.79 + 1.80 + /** 1.81 + * Skip over chars as long as they equal given char 1.82 + * 1.83 + * @update gess 3/25/98 1.84 + * @param char to be skipped 1.85 + * @return error code 1.86 + */ 1.87 + nsresult SkipOver(char16_t aSkipChar); 1.88 + 1.89 + /** 1.90 + * Skip whitespace on scanner input stream 1.91 + * 1.92 + * @update gess 3/25/98 1.93 + * @return error status 1.94 + */ 1.95 + nsresult SkipWhitespace(int32_t& aNewlinesSkipped); 1.96 + 1.97 + /** 1.98 + * Consume characters until you run into space, a '<', a '>', or a '/'. 1.99 + * 1.100 + * @param aString - receives new data from stream 1.101 + * @return error code 1.102 + */ 1.103 + nsresult ReadTagIdentifier(nsScannerSharedSubstring& aString); 1.104 + 1.105 + /** 1.106 + * Consume characters until you run into a char that's not valid in an 1.107 + * entity name 1.108 + * 1.109 + * @param aString - receives new data from stream 1.110 + * @return error code 1.111 + */ 1.112 + nsresult ReadEntityIdentifier(nsString& aString); 1.113 + nsresult ReadNumber(nsString& aString,int32_t aBase); 1.114 + nsresult ReadWhitespace(nsScannerSharedSubstring& aString, 1.115 + int32_t& aNewlinesSkipped, 1.116 + bool& aHaveCR); 1.117 + nsresult ReadWhitespace(nsScannerIterator& aStart, 1.118 + nsScannerIterator& aEnd, 1.119 + int32_t& aNewlinesSkipped); 1.120 + 1.121 + /** 1.122 + * Consume characters until you find the terminal char 1.123 + * 1.124 + * @update gess 3/25/98 1.125 + * @param aString receives new data from stream 1.126 + * @param aTerminal contains terminating char 1.127 + * @param addTerminal tells us whether to append terminal to aString 1.128 + * @return error code 1.129 + */ 1.130 + nsresult ReadUntil(nsAString& aString, 1.131 + char16_t aTerminal, 1.132 + bool addTerminal); 1.133 + 1.134 + /** 1.135 + * Consume characters until you find one contained in given 1.136 + * terminal set. 1.137 + * 1.138 + * @update gess 3/25/98 1.139 + * @param aString receives new data from stream 1.140 + * @param aTermSet contains set of terminating chars 1.141 + * @param addTerminal tells us whether to append terminal to aString 1.142 + * @return error code 1.143 + */ 1.144 + nsresult ReadUntil(nsAString& aString, 1.145 + const nsReadEndCondition& aEndCondition, 1.146 + bool addTerminal); 1.147 + 1.148 + nsresult ReadUntil(nsScannerSharedSubstring& aString, 1.149 + const nsReadEndCondition& aEndCondition, 1.150 + bool addTerminal); 1.151 + 1.152 + nsresult ReadUntil(nsScannerIterator& aStart, 1.153 + nsScannerIterator& aEnd, 1.154 + const nsReadEndCondition& aEndCondition, 1.155 + bool addTerminal); 1.156 + 1.157 + /** 1.158 + * Records current offset position in input stream. This allows us 1.159 + * to back up to this point if the need should arise, such as when 1.160 + * tokenization gets interrupted. 1.161 + * 1.162 + * @update gess 5/12/98 1.163 + * @param 1.164 + * @return 1.165 + */ 1.166 + int32_t Mark(void); 1.167 + 1.168 + /** 1.169 + * Resets current offset position of input stream to marked position. 1.170 + * This allows us to back up to this point if the need should arise, 1.171 + * such as when tokenization gets interrupted. 1.172 + * NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST! 1.173 + * 1.174 + * @update gess 5/12/98 1.175 + * @param 1.176 + * @return 1.177 + */ 1.178 + void RewindToMark(void); 1.179 + 1.180 + 1.181 + /** 1.182 + * 1.183 + * 1.184 + * @update harishd 01/12/99 1.185 + * @param 1.186 + * @return 1.187 + */ 1.188 + bool UngetReadable(const nsAString& aBuffer); 1.189 + 1.190 + /** 1.191 + * 1.192 + * 1.193 + * @update gess 5/13/98 1.194 + * @param 1.195 + * @return 1.196 + */ 1.197 + nsresult Append(const nsAString& aBuffer); 1.198 + 1.199 + /** 1.200 + * 1.201 + * 1.202 + * @update gess 5/21/98 1.203 + * @param 1.204 + * @return 1.205 + */ 1.206 + nsresult Append(const char* aBuffer, uint32_t aLen, 1.207 + nsIRequest *aRequest); 1.208 + 1.209 + /** 1.210 + * Call this to copy bytes out of the scanner that have not yet been consumed 1.211 + * by the tokenization process. 1.212 + * 1.213 + * @update gess 5/12/98 1.214 + * @param aCopyBuffer is where the scanner buffer will be copied to 1.215 + * @return nada 1.216 + */ 1.217 + void CopyUnusedData(nsString& aCopyBuffer); 1.218 + 1.219 + /** 1.220 + * Retrieve the name of the file that the scanner is reading from. 1.221 + * In some cases, it's just a given name, because the scanner isn't 1.222 + * really reading from a file. 1.223 + * 1.224 + * @update gess 5/12/98 1.225 + * @return 1.226 + */ 1.227 + nsString& GetFilename(void); 1.228 + 1.229 + static void SelfTest(); 1.230 + 1.231 + /** 1.232 + * Use this setter to change the scanner's unicode decoder 1.233 + * 1.234 + * @update ftang 3/02/99 1.235 + * @param aCharset a normalized (alias resolved) charset name 1.236 + * @param aCharsetSource- where the charset info came from 1.237 + * @return 1.238 + */ 1.239 + nsresult SetDocumentCharset(const nsACString& aCharset, int32_t aSource); 1.240 + 1.241 + void BindSubstring(nsScannerSubstring& aSubstring, const nsScannerIterator& aStart, const nsScannerIterator& aEnd); 1.242 + void CurrentPosition(nsScannerIterator& aPosition); 1.243 + void EndReading(nsScannerIterator& aPosition); 1.244 + void SetPosition(nsScannerIterator& aPosition, 1.245 + bool aTruncate = false, 1.246 + bool aReverse = false); 1.247 + void ReplaceCharacter(nsScannerIterator& aPosition, 1.248 + char16_t aChar); 1.249 + 1.250 + /** 1.251 + * Internal method used to cause the internal buffer to 1.252 + * be filled with data. 1.253 + * 1.254 + * @update gess4/3/98 1.255 + */ 1.256 + bool IsIncremental(void) {return mIncremental;} 1.257 + void SetIncremental(bool anIncrValue) {mIncremental=anIncrValue;} 1.258 + 1.259 + /** 1.260 + * Return the position of the first non-whitespace 1.261 + * character. This is only reliable before consumers start 1.262 + * reading from this scanner. 1.263 + */ 1.264 + int32_t FirstNonWhitespacePosition() 1.265 + { 1.266 + return mFirstNonWhitespacePosition; 1.267 + } 1.268 + 1.269 + /** 1.270 + * Override replacement character used by nsIUnicodeDecoder. 1.271 + * Default behavior is that it uses nsIUnicodeDecoder's mapping. 1.272 + * 1.273 + * @param aReplacementCharacter the replacement character 1.274 + * XML (expat) parser uses 0xffff 1.275 + */ 1.276 + void OverrideReplacementCharacter(char16_t aReplacementCharacter); 1.277 + 1.278 + protected: 1.279 + 1.280 + bool AppendToBuffer(nsScannerString::Buffer *, nsIRequest *aRequest, int32_t aErrorPos = -1); 1.281 + bool AppendToBuffer(const nsAString& aStr) 1.282 + { 1.283 + nsScannerString::Buffer* buf = nsScannerString::AllocBufferFromString(aStr); 1.284 + if (!buf) 1.285 + return false; 1.286 + AppendToBuffer(buf, nullptr); 1.287 + return true; 1.288 + } 1.289 + 1.290 + nsScannerString* mSlidingBuffer; 1.291 + nsScannerIterator mCurrentPosition; // The position we will next read from in the scanner buffer 1.292 + nsScannerIterator mMarkPosition; // The position last marked (we may rewind to here) 1.293 + nsScannerIterator mEndPosition; // The current end of the scanner buffer 1.294 + nsScannerIterator mFirstInvalidPosition; // The position of the first invalid character that was detected 1.295 + nsString mFilename; 1.296 + uint32_t mCountRemaining; // The number of bytes still to be read 1.297 + // from the scanner buffer 1.298 + bool mIncremental; 1.299 + bool mHasInvalidCharacter; 1.300 + char16_t mReplacementCharacter; 1.301 + int32_t mFirstNonWhitespacePosition; 1.302 + int32_t mCharsetSource; 1.303 + nsCString mCharset; 1.304 + nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder; 1.305 + 1.306 + private: 1.307 + nsScanner &operator =(const nsScanner &); // Not implemented. 1.308 +}; 1.309 + 1.310 +#endif 1.311 + 1.312 +