Fri, 16 Jan 2015 18:13:44 +0100
Integrate suggestion from review to improve consistency with existing code.
michael@0 | 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | |
michael@0 | 6 | /* tokenization of CSS style sheets */ |
michael@0 | 7 | |
michael@0 | 8 | #ifndef nsCSSScanner_h___ |
michael@0 | 9 | #define nsCSSScanner_h___ |
michael@0 | 10 | |
michael@0 | 11 | #include "nsString.h" |
michael@0 | 12 | |
michael@0 | 13 | namespace mozilla { |
michael@0 | 14 | namespace css { |
michael@0 | 15 | class ErrorReporter; |
michael@0 | 16 | } |
michael@0 | 17 | } |
michael@0 | 18 | |
michael@0 | 19 | // Token types; in close but not perfect correspondence to the token |
michael@0 | 20 | // categorization in section 4.1.1 of CSS2.1. (The deviations are all |
michael@0 | 21 | // the fault of css3-selectors, which has requirements that can only be |
michael@0 | 22 | // met by changing the generic tokenization.) The comment on each line |
michael@0 | 23 | // illustrates the form of each identifier. |
michael@0 | 24 | |
michael@0 | 25 | enum nsCSSTokenType { |
michael@0 | 26 | // White space of any kind. No value fields are used. Note that |
michael@0 | 27 | // comments do *not* count as white space; comments separate tokens |
michael@0 | 28 | // but are not themselves tokens. |
michael@0 | 29 | eCSSToken_Whitespace, // |
michael@0 | 30 | |
michael@0 | 31 | // Identifier-like tokens. mIdent is the text of the identifier. |
michael@0 | 32 | // ID and Hash differ as follows: if the text after the # would |
michael@0 | 33 | // have been a valid Ident on its own, the scanner produces an ID |
michael@0 | 34 | // token; otherwise it produces a Hash token. |
michael@0 | 35 | // (This distinction is required by css3-selectors.) |
michael@0 | 36 | eCSSToken_Ident, // word |
michael@0 | 37 | eCSSToken_Function, // word( |
michael@0 | 38 | eCSSToken_AtKeyword, // @word |
michael@0 | 39 | eCSSToken_ID, // #word |
michael@0 | 40 | eCSSToken_Hash, // #0word |
michael@0 | 41 | |
michael@0 | 42 | // Numeric tokens. mNumber is the floating-point value of the |
michael@0 | 43 | // number, and mHasSign indicates whether there was an explicit sign |
michael@0 | 44 | // (+ or -) in front of the number. If mIntegerValid is true, the |
michael@0 | 45 | // number had the lexical form of an integer, and mInteger is its |
michael@0 | 46 | // integer value. Lexically integer values outside the range of a |
michael@0 | 47 | // 32-bit signed number are clamped to the maximum values; mNumber |
michael@0 | 48 | // will indicate a 'truer' value in that case. Percentage tokens |
michael@0 | 49 | // are always considered not to be integers, even if their numeric |
michael@0 | 50 | // value is integral (100% => mNumber = 1.0). For Dimension |
michael@0 | 51 | // tokens, mIdent holds the text of the unit. |
michael@0 | 52 | eCSSToken_Number, // 1 -5 +2e3 3.14159 7.297352e-3 |
michael@0 | 53 | eCSSToken_Dimension, // 24px 8.5in |
michael@0 | 54 | eCSSToken_Percentage, // 85% 1280.4% |
michael@0 | 55 | |
michael@0 | 56 | // String-like tokens. In all cases, mIdent holds the text |
michael@0 | 57 | // belonging to the string, and mSymbol holds the delimiter |
michael@0 | 58 | // character, which may be ', ", or zero (only for unquoted URLs). |
michael@0 | 59 | // Bad_String and Bad_URL tokens are emitted when the closing |
michael@0 | 60 | // delimiter or parenthesis was missing. |
michael@0 | 61 | eCSSToken_String, // 'foo bar' "foo bar" |
michael@0 | 62 | eCSSToken_Bad_String, // 'foo bar |
michael@0 | 63 | eCSSToken_URL, // url(foobar) url("foo bar") |
michael@0 | 64 | eCSSToken_Bad_URL, // url(foo |
michael@0 | 65 | |
michael@0 | 66 | // Any one-character symbol. mSymbol holds the character. |
michael@0 | 67 | eCSSToken_Symbol, // . ; { } ! * |
michael@0 | 68 | |
michael@0 | 69 | // Match operators. These are single tokens rather than pairs of |
michael@0 | 70 | // Symbol tokens because css3-selectors forbids the presence of |
michael@0 | 71 | // comments between the two characters. No value fields are used; |
michael@0 | 72 | // the token type indicates which operator. |
michael@0 | 73 | eCSSToken_Includes, // ~= |
michael@0 | 74 | eCSSToken_Dashmatch, // |= |
michael@0 | 75 | eCSSToken_Beginsmatch, // ^= |
michael@0 | 76 | eCSSToken_Endsmatch, // $= |
michael@0 | 77 | eCSSToken_Containsmatch, // *= |
michael@0 | 78 | |
michael@0 | 79 | // Unicode-range token: currently used only in @font-face. |
michael@0 | 80 | // The lexical rule for this token includes several forms that are |
michael@0 | 81 | // semantically invalid. Therefore, mIdent always holds the |
michael@0 | 82 | // complete original text of the token (so we can print it |
michael@0 | 83 | // accurately in diagnostics), and mIntegerValid is true iff the |
michael@0 | 84 | // token is semantically valid. In that case, mInteger holds the |
michael@0 | 85 | // lowest value included in the range, and mInteger2 holds the |
michael@0 | 86 | // highest value included in the range. |
michael@0 | 87 | eCSSToken_URange, // U+007e U+01?? U+2000-206F |
michael@0 | 88 | |
michael@0 | 89 | // HTML comment delimiters. They are ignored as a unit when they |
michael@0 | 90 | // appear at the top level of a style sheet, for the benefit of |
michael@0 | 91 | // websites written to also work in pre-CSS browsers. This token type |
michael@0 | 92 | // subsumes the css2.1 CDO and CDC tokens, which are always treated |
michael@0 | 93 | // the same by the parser. mIdent holds the text of the token, for |
michael@0 | 94 | // diagnostics. |
michael@0 | 95 | eCSSToken_HTMLComment, // <!-- --> |
michael@0 | 96 | }; |
michael@0 | 97 | |
michael@0 | 98 | // Classification of tokens, used when serializing to decide whether a |
michael@0 | 99 | // "/**/" string must be inserted where token streams are pasted together. |
michael@0 | 100 | // We include values corresponding to eCSSToken_Dashmatch and |
michael@0 | 101 | // eCSSToken_Containsmatch: css-syntax does not treat these as whole tokens, |
michael@0 | 102 | // but we still need to insert a "/**/" string between a '|' delim and a |
michael@0 | 103 | // '|=' dashmatch, and between a '/' delim and a '*=' containsmatch. |
michael@0 | 104 | // |
michael@0 | 105 | // https://dvcs.w3.org/hg/csswg/raw-file/372e659027a0/css-syntax/Overview.html#serialization |
michael@0 | 106 | enum nsCSSTokenSerializationType { |
michael@0 | 107 | eCSSTokenSerialization_Nothing, |
michael@0 | 108 | eCSSTokenSerialization_Whitespace, |
michael@0 | 109 | eCSSTokenSerialization_AtKeyword_or_Hash, |
michael@0 | 110 | eCSSTokenSerialization_Number, |
michael@0 | 111 | eCSSTokenSerialization_Dimension, |
michael@0 | 112 | eCSSTokenSerialization_Percentage, |
michael@0 | 113 | eCSSTokenSerialization_URange, |
michael@0 | 114 | eCSSTokenSerialization_URL_or_BadURL, |
michael@0 | 115 | eCSSTokenSerialization_Function, |
michael@0 | 116 | eCSSTokenSerialization_Ident, |
michael@0 | 117 | eCSSTokenSerialization_CDC, |
michael@0 | 118 | eCSSTokenSerialization_DashMatch, |
michael@0 | 119 | eCSSTokenSerialization_ContainsMatch, |
michael@0 | 120 | eCSSTokenSerialization_Symbol_Hash, // '#' |
michael@0 | 121 | eCSSTokenSerialization_Symbol_At, // '@' |
michael@0 | 122 | eCSSTokenSerialization_Symbol_Dot_or_Plus, // '.', '+' |
michael@0 | 123 | eCSSTokenSerialization_Symbol_Minus, // '-' |
michael@0 | 124 | eCSSTokenSerialization_Symbol_OpenParen, // '(' |
michael@0 | 125 | eCSSTokenSerialization_Symbol_Question, // '?' |
michael@0 | 126 | eCSSTokenSerialization_Symbol_Assorted, // '$', '^', '~' |
michael@0 | 127 | eCSSTokenSerialization_Symbol_Equals, // '=' |
michael@0 | 128 | eCSSTokenSerialization_Symbol_Bar, // '|' |
michael@0 | 129 | eCSSTokenSerialization_Symbol_Slash, // '/' |
michael@0 | 130 | eCSSTokenSerialization_Symbol_Asterisk, // '*' |
michael@0 | 131 | eCSSTokenSerialization_Other // anything else |
michael@0 | 132 | }; |
michael@0 | 133 | |
michael@0 | 134 | // One token produced by the scanner. mType is always meaningful; |
michael@0 | 135 | // the comments on nsCSSTokenType describe which of the other fields |
michael@0 | 136 | // are meaningful for each token type. |
michael@0 | 137 | struct nsCSSToken { |
michael@0 | 138 | nsAutoString mIdent; |
michael@0 | 139 | float mNumber; |
michael@0 | 140 | int32_t mInteger; |
michael@0 | 141 | int32_t mInteger2; |
michael@0 | 142 | nsCSSTokenType mType; |
michael@0 | 143 | char16_t mSymbol; |
michael@0 | 144 | bool mIntegerValid; |
michael@0 | 145 | bool mHasSign; |
michael@0 | 146 | |
michael@0 | 147 | nsCSSToken() |
michael@0 | 148 | : mNumber(0), mInteger(0), mInteger2(0), mType(eCSSToken_Whitespace), |
michael@0 | 149 | mSymbol('\0'), mIntegerValid(false), mHasSign(false) |
michael@0 | 150 | {} |
michael@0 | 151 | |
michael@0 | 152 | bool IsSymbol(char16_t aSymbol) const { |
michael@0 | 153 | return mType == eCSSToken_Symbol && mSymbol == aSymbol; |
michael@0 | 154 | } |
michael@0 | 155 | |
michael@0 | 156 | void AppendToString(nsString& aBuffer) const; |
michael@0 | 157 | }; |
michael@0 | 158 | |
michael@0 | 159 | // Represents an nsCSSScanner's saved position in the input buffer. The accessors are const (they only read) and assert mInitialized, which is false after construction. |
michael@0 | 160 | class nsCSSScannerPosition { |
michael@0 | 161 | friend class nsCSSScanner; |
michael@0 | 162 | public: |
michael@0 | 163 | nsCSSScannerPosition() : mInitialized(false) { } |
michael@0 | 164 | |
michael@0 | 165 | uint32_t LineNumber() const { |
michael@0 | 166 | MOZ_ASSERT(mInitialized); |
michael@0 | 167 | return mLineNumber; |
michael@0 | 168 | } |
michael@0 | 169 | |
michael@0 | 170 | uint32_t LineOffset() const { |
michael@0 | 171 | MOZ_ASSERT(mInitialized); |
michael@0 | 172 | return mLineOffset; |
michael@0 | 173 | } |
michael@0 | 174 | |
michael@0 | 175 | private: |
michael@0 | 176 | uint32_t mOffset; |
michael@0 | 177 | uint32_t mLineNumber; |
michael@0 | 178 | uint32_t mLineOffset; |
michael@0 | 179 | uint32_t mTokenLineNumber; |
michael@0 | 180 | uint32_t mTokenLineOffset; |
michael@0 | 181 | uint32_t mTokenOffset; |
michael@0 | 182 | bool mInitialized; |
michael@0 | 183 | }; |
michael@0 | 184 | |
michael@0 | 185 | // nsCSSScanner tokenizes an input stream using the CSS2.1 forward |
michael@0 | 186 | // compatible tokenization rules. Used internally by nsCSSParser; |
michael@0 | 187 | // not available for use by other code. |
michael@0 | 188 | class nsCSSScanner { |
michael@0 | 189 | public: |
michael@0 | 190 | // |aLineNumber == 1| is the beginning of a file; use |aLineNumber == 0| |
michael@0 | 191 | // when the line number is unknown. |
michael@0 | 192 | nsCSSScanner(const nsAString& aBuffer, uint32_t aLineNumber); |
michael@0 | 193 | ~nsCSSScanner(); |
michael@0 | 194 | |
michael@0 | 195 | void SetErrorReporter(mozilla::css::ErrorReporter* aReporter) { |
michael@0 | 196 | mReporter = aReporter; |
michael@0 | 197 | } |
michael@0 | 198 | // Set whether or not we are processing SVG |
michael@0 | 199 | void SetSVGMode(bool aSVGMode) { |
michael@0 | 200 | mSVGMode = aSVGMode; |
michael@0 | 201 | } |
michael@0 | 202 | bool IsSVGMode() const { |
michael@0 | 203 | return mSVGMode; |
michael@0 | 204 | } |
michael@0 | 205 | |
michael@0 | 206 | // Reset or check whether a BAD_URL or BAD_STRING token has been seen. |
michael@0 | 207 | void ClearSeenBadToken() { mSeenBadToken = false; } |
michael@0 | 208 | bool SeenBadToken() const { return mSeenBadToken; } |
michael@0 | 209 | |
michael@0 | 210 | // Reset or check whether a "var(" FUNCTION token has been seen. |
michael@0 | 211 | void ClearSeenVariableReference() { mSeenVariableReference = false; } |
michael@0 | 212 | bool SeenVariableReference() const { return mSeenVariableReference; } |
michael@0 | 213 | |
michael@0 | 214 | // Get the 1-based line number of the last character of |
michael@0 | 215 | // the most recently processed token. |
michael@0 | 216 | uint32_t GetLineNumber() const { return mTokenLineNumber; } |
michael@0 | 217 | |
michael@0 | 218 | // Get the 0-based column number of the first character of |
michael@0 | 219 | // the most recently processed token. |
michael@0 | 220 | uint32_t GetColumnNumber() const |
michael@0 | 221 | { return mTokenOffset - mTokenLineOffset; } |
michael@0 | 222 | |
michael@0 | 223 | // Get the text of the line containing the first character of |
michael@0 | 224 | // the most recently processed token. |
michael@0 | 225 | nsDependentSubstring GetCurrentLine() const; |
michael@0 | 226 | |
michael@0 | 227 | // Get the next token. Return false on EOF. aTokenResult is filled |
michael@0 | 228 | // in with the data for the token. If aSkipWS is true, skip over |
michael@0 | 229 | // eCSSToken_Whitespace tokens rather than returning them. |
michael@0 | 230 | bool Next(nsCSSToken& aTokenResult, bool aSkipWS); |
michael@0 | 231 | |
michael@0 | 232 | // Get the body of a URL token (everything after the 'url('). |
michael@0 | 233 | // This is exposed for use by nsCSSParser::ParseMozDocumentRule, |
michael@0 | 234 | // which, for historical reasons, must make additional function |
michael@0 | 235 | // tokens behave like url(). Please do not add new uses to the |
michael@0 | 236 | // parser. |
michael@0 | 237 | bool NextURL(nsCSSToken& aTokenResult); |
michael@0 | 238 | |
michael@0 | 239 | // This is exposed for use by nsCSSParser::ParsePseudoClassWithNthPairArg, |
michael@0 | 240 | // because "2n-1" is a single DIMENSION token, and "n-1" is a single |
michael@0 | 241 | // IDENT token, but the :nth() selector syntax wants to interpret |
michael@0 | 242 | // them the same as "2n -1" and "n -1" respectively. Please do not |
michael@0 | 243 | // add new uses to the parser. |
michael@0 | 244 | // |
michael@0 | 245 | // Note: this function may not be used to back up over a line boundary. |
michael@0 | 246 | void Backup(uint32_t n); |
michael@0 | 247 | |
michael@0 | 248 | // Starts recording the input stream from the current position. |
michael@0 | 249 | void StartRecording(); |
michael@0 | 250 | |
michael@0 | 251 | // Abandons recording of the input stream. |
michael@0 | 252 | void StopRecording(); |
michael@0 | 253 | |
michael@0 | 254 | // Stops recording of the input stream and appends the recorded |
michael@0 | 255 | // input to aBuffer. |
michael@0 | 256 | void StopRecording(nsString& aBuffer); |
michael@0 | 257 | |
michael@0 | 258 | // Returns the length of the current recording. |
michael@0 | 259 | uint32_t RecordingLength() const; |
michael@0 | 260 | |
michael@0 | 261 | #ifdef DEBUG |
michael@0 | 262 | bool IsRecording() const; |
michael@0 | 263 | #endif |
michael@0 | 264 | |
michael@0 | 265 | // Stores the current scanner offset into the specified object. |
michael@0 | 266 | void SavePosition(nsCSSScannerPosition& aState); |
michael@0 | 267 | |
michael@0 | 268 | // Resets the scanner offset to a position saved by SavePosition. |
michael@0 | 269 | void RestoreSavedPosition(const nsCSSScannerPosition& aState); |
michael@0 | 270 | |
michael@0 | 271 | enum EOFCharacters { |
michael@0 | 272 | eEOFCharacters_None = 0x0000, |
michael@0 | 273 | |
michael@0 | 274 | // to handle \<EOF> inside strings |
michael@0 | 275 | eEOFCharacters_DropBackslash = 0x0001, |
michael@0 | 276 | |
michael@0 | 277 | // to handle \<EOF> outside strings |
michael@0 | 278 | eEOFCharacters_ReplacementChar = 0x0002, |
michael@0 | 279 | |
michael@0 | 280 | // to close comments |
michael@0 | 281 | eEOFCharacters_Asterisk = 0x0004, |
michael@0 | 282 | eEOFCharacters_Slash = 0x0008, |
michael@0 | 283 | |
michael@0 | 284 | // to close double-quoted strings |
michael@0 | 285 | eEOFCharacters_DoubleQuote = 0x0010, |
michael@0 | 286 | |
michael@0 | 287 | // to close single-quoted strings |
michael@0 | 288 | eEOFCharacters_SingleQuote = 0x0020, |
michael@0 | 289 | |
michael@0 | 290 | // to close URLs |
michael@0 | 291 | eEOFCharacters_CloseParen = 0x0040, |
michael@0 | 292 | }; |
michael@0 | 293 | |
michael@0 | 294 | // Appends to aString any characters that would need to follow the input |
michael@0 | 295 | // stream so that its last token does not rely on special EOF handling. |
michael@0 | 296 | // |
michael@0 | 297 | // If eEOFCharacters_DropBackslash is in aEOFCharacters, it is ignored. |
michael@0 | 298 | static void AppendImpliedEOFCharacters(EOFCharacters aEOFCharacters, |
michael@0 | 299 | nsAString& aString); |
michael@0 | 300 | |
michael@0 | 301 | EOFCharacters GetEOFCharacters() const { |
michael@0 | 302 | #ifdef DEBUG |
michael@0 | 303 | AssertEOFCharactersValid(mEOFCharacters); |
michael@0 | 304 | #endif |
michael@0 | 305 | return mEOFCharacters; |
michael@0 | 306 | } |
michael@0 | 307 | |
michael@0 | 308 | #ifdef DEBUG |
michael@0 | 309 | static void AssertEOFCharactersValid(uint32_t c); |
michael@0 | 310 | #endif |
michael@0 | 311 | |
michael@0 | 312 | protected: |
michael@0 | 313 | int32_t Peek(uint32_t n = 0); |
michael@0 | 314 | void Advance(uint32_t n = 1); |
michael@0 | 315 | void AdvanceLine(); |
michael@0 | 316 | |
michael@0 | 317 | void SkipWhitespace(); |
michael@0 | 318 | void SkipComment(); |
michael@0 | 319 | |
michael@0 | 320 | bool GatherEscape(nsString& aOutput, bool aInString); |
michael@0 | 321 | bool GatherText(uint8_t aClass, nsString& aIdent); |
michael@0 | 322 | |
michael@0 | 323 | bool ScanIdent(nsCSSToken& aResult); |
michael@0 | 324 | bool ScanAtKeyword(nsCSSToken& aResult); |
michael@0 | 325 | bool ScanHash(nsCSSToken& aResult); |
michael@0 | 326 | bool ScanNumber(nsCSSToken& aResult); |
michael@0 | 327 | bool ScanString(nsCSSToken& aResult); |
michael@0 | 328 | bool ScanURange(nsCSSToken& aResult); |
michael@0 | 329 | |
michael@0 | 330 | void SetEOFCharacters(uint32_t aEOFCharacters); |
michael@0 | 331 | void AddEOFCharacters(uint32_t aEOFCharacters); |
michael@0 | 332 | |
michael@0 | 333 | const char16_t *mBuffer; |
michael@0 | 334 | uint32_t mOffset; |
michael@0 | 335 | uint32_t mCount; |
michael@0 | 336 | |
michael@0 | 337 | uint32_t mLineNumber; |
michael@0 | 338 | uint32_t mLineOffset; |
michael@0 | 339 | |
michael@0 | 340 | uint32_t mTokenLineNumber; |
michael@0 | 341 | uint32_t mTokenLineOffset; |
michael@0 | 342 | uint32_t mTokenOffset; |
michael@0 | 343 | |
michael@0 | 344 | uint32_t mRecordStartOffset; |
michael@0 | 345 | EOFCharacters mEOFCharacters; |
michael@0 | 346 | |
michael@0 | 347 | mozilla::css::ErrorReporter *mReporter; |
michael@0 | 348 | |
michael@0 | 349 | // True if we are in SVG mode; false in "normal" CSS |
michael@0 | 350 | bool mSVGMode; |
michael@0 | 351 | bool mRecording; |
michael@0 | 352 | bool mSeenBadToken; |
michael@0 | 353 | bool mSeenVariableReference; |
michael@0 | 354 | }; |
michael@0 | 355 | |
michael@0 | 356 | // Token for the grid-template-areas micro-syntax. |
michael@0 | 357 | // http://dev.w3.org/csswg/css-grid/#propdef-grid-template-areas |
michael@0 | 358 | struct MOZ_STACK_CLASS nsCSSGridTemplateAreaToken { |
michael@0 | 359 | nsAutoString mName; // Empty for a null cell, non-empty for a named cell. |
michael@0 | 360 | bool isTrash; // True for a trash token; mName is ignored in that case. |
michael@0 | 361 | }; |
michael@0 | 362 | |
michael@0 | 363 | // Scanner for the grid-template-areas micro-syntax. The constructor is explicit so that a string never converts silently into a scanner. |
michael@0 | 364 | class nsCSSGridTemplateAreaScanner { |
michael@0 | 365 | public: |
michael@0 | 366 | explicit nsCSSGridTemplateAreaScanner(const nsAString& aBuffer); |
michael@0 | 367 | |
michael@0 | 368 | // Get the next token. Return false on EOF. |
michael@0 | 369 | // aTokenResult is filled in with the data for the token. |
michael@0 | 370 | bool Next(nsCSSGridTemplateAreaToken& aTokenResult); |
michael@0 | 371 | |
michael@0 | 372 | private: |
michael@0 | 373 | const char16_t *mBuffer; |
michael@0 | 374 | uint32_t mOffset; |
michael@0 | 375 | uint32_t mCount; |
michael@0 | 376 | }; |
michael@0 | 377 | |
michael@0 | 378 | #endif /* nsCSSScanner_h___ */ |