|
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 |
|
6 #ifndef __nsCharSeparatedTokenizer_h |
|
7 #define __nsCharSeparatedTokenizer_h |
|
8 |
|
9 #include "mozilla/RangedPtr.h" |
|
10 |
|
11 #include "nsDependentSubstring.h" |
|
12 #include "nsCRT.h" |
|
13 |
|
14 /** |
|
15 * This parses a SeparatorChar-separated string into tokens. |
|
16 * Whitespace surrounding tokens is not treated as part of tokens, however |
|
17 * whitespace inside a token is. If the final token is the empty string, it is |
|
18 * not returned. |
|
19 * |
|
20 * Some examples, with SeparatorChar = ',': |
|
21 * |
|
22 * "foo, bar, baz" -> "foo" "bar" "baz" |
|
23 * "foo,bar,baz" -> "foo" "bar" "baz" |
|
24 * "foo , bar hi , baz" -> "foo" "bar hi" "baz" |
|
25 * "foo, ,bar,baz" -> "foo" "" "bar" "baz" |
|
26 * "foo,,bar,baz" -> "foo" "" "bar" "baz" |
|
27 * "foo,bar,baz," -> "foo" "bar" "baz" |
|
28 * |
|
29 * The function used for whitespace detection is a template argument. |
|
30 * By default, it is NS_IsAsciiWhitespace. |
|
31 */ |
|
32 template<bool IsWhitespace(char16_t) = NS_IsAsciiWhitespace> |
|
33 class nsCharSeparatedTokenizerTemplate |
|
34 { |
|
35 public: |
|
36 // Flags -- only one for now. If we need more, they should be defined to |
|
37 // be 1 << 1, 1 << 2, etc. (They're masks, and aFlags is a bitfield.) |
|
38 enum { |
|
39 SEPARATOR_OPTIONAL = 1 |
|
40 }; |
|
41 |
|
42 nsCharSeparatedTokenizerTemplate(const nsSubstring& aSource, |
|
43 char16_t aSeparatorChar, |
|
44 uint32_t aFlags = 0) |
|
45 : mIter(aSource.Data(), aSource.Length()), |
|
46 mEnd(aSource.Data() + aSource.Length(), aSource.Data(), |
|
47 aSource.Length()), |
|
48 mSeparatorChar(aSeparatorChar), |
|
49 mWhitespaceBeforeFirstToken(false), |
|
50 mWhitespaceAfterCurrentToken(false), |
|
51 mSeparatorAfterCurrentToken(false), |
|
52 mSeparatorOptional(aFlags & SEPARATOR_OPTIONAL) |
|
53 { |
|
54 // Skip initial whitespace |
|
55 while (mIter < mEnd && IsWhitespace(*mIter)) { |
|
56 mWhitespaceBeforeFirstToken = true; |
|
57 ++mIter; |
|
58 } |
|
59 } |
|
60 |
|
61 /** |
|
62 * Checks if any more tokens are available. |
|
63 */ |
|
64 bool hasMoreTokens() const |
|
65 { |
|
66 MOZ_ASSERT(mIter == mEnd || !IsWhitespace(*mIter), |
|
67 "Should be at beginning of token if there is one"); |
|
68 |
|
69 return mIter < mEnd; |
|
70 } |
|
71 |
|
72 /* |
|
73 * Returns true if there is whitespace prior to the first token. |
|
74 */ |
|
75 bool whitespaceBeforeFirstToken() const |
|
76 { |
|
77 return mWhitespaceBeforeFirstToken; |
|
78 } |
|
79 |
|
80 /* |
|
81 * Returns true if there is a separator after the current token. |
|
82 * Useful if you want to check whether the last token has a separator |
|
83 * after it which may not be valid. |
|
84 */ |
|
85 bool separatorAfterCurrentToken() const |
|
86 { |
|
87 return mSeparatorAfterCurrentToken; |
|
88 } |
|
89 |
|
90 /* |
|
91 * Returns true if there is any whitespace after the current token. |
|
92 */ |
|
93 bool whitespaceAfterCurrentToken() const |
|
94 { |
|
95 return mWhitespaceAfterCurrentToken; |
|
96 } |
|
97 |
|
98 /** |
|
99 * Returns the next token. |
|
100 */ |
|
101 const nsDependentSubstring nextToken() |
|
102 { |
|
103 mozilla::RangedPtr<const char16_t> tokenStart = mIter, tokenEnd = mIter; |
|
104 |
|
105 MOZ_ASSERT(mIter == mEnd || !IsWhitespace(*mIter), |
|
106 "Should be at beginning of token if there is one"); |
|
107 |
|
108 // Search until we hit separator or end (or whitespace, if a separator |
|
109 // isn't required -- see clause with 'break' below). |
|
110 while (mIter < mEnd && *mIter != mSeparatorChar) { |
|
111 // Skip to end of the current word. |
|
112 while (mIter < mEnd && |
|
113 !IsWhitespace(*mIter) && *mIter != mSeparatorChar) { |
|
114 ++mIter; |
|
115 } |
|
116 tokenEnd = mIter; |
|
117 |
|
118 // Skip whitespace after the current word. |
|
119 mWhitespaceAfterCurrentToken = false; |
|
120 while (mIter < mEnd && IsWhitespace(*mIter)) { |
|
121 mWhitespaceAfterCurrentToken = true; |
|
122 ++mIter; |
|
123 } |
|
124 if (mSeparatorOptional) { |
|
125 // We've hit (and skipped) whitespace, and that's sufficient to end |
|
126 // our token, regardless of whether we've reached a SeparatorChar. |
|
127 break; |
|
128 } // (else, we'll keep looping until we hit mEnd or SeparatorChar) |
|
129 } |
|
130 |
|
131 mSeparatorAfterCurrentToken = (mIter != mEnd && |
|
132 *mIter == mSeparatorChar); |
|
133 MOZ_ASSERT(mSeparatorOptional || |
|
134 (mSeparatorAfterCurrentToken == (mIter < mEnd)), |
|
135 "If we require a separator and haven't hit the end of " |
|
136 "our string, then we shouldn't have left the loop " |
|
137 "unless we hit a separator"); |
|
138 |
|
139 // Skip separator (and any whitespace after it), if we're at one. |
|
140 if (mSeparatorAfterCurrentToken) { |
|
141 ++mIter; |
|
142 |
|
143 while (mIter < mEnd && IsWhitespace(*mIter)) { |
|
144 mWhitespaceAfterCurrentToken = true; |
|
145 ++mIter; |
|
146 } |
|
147 } |
|
148 |
|
149 return Substring(tokenStart.get(), tokenEnd.get()); |
|
150 } |
|
151 |
|
152 private: |
|
153 mozilla::RangedPtr<const char16_t> mIter; |
|
154 const mozilla::RangedPtr<const char16_t> mEnd; |
|
155 char16_t mSeparatorChar; |
|
156 bool mWhitespaceBeforeFirstToken; |
|
157 bool mWhitespaceAfterCurrentToken; |
|
158 bool mSeparatorAfterCurrentToken; |
|
159 bool mSeparatorOptional; |
|
160 }; |
|
161 |
|
162 class nsCharSeparatedTokenizer: public nsCharSeparatedTokenizerTemplate<> |
|
163 { |
|
164 public: |
|
165 nsCharSeparatedTokenizer(const nsSubstring& aSource, |
|
166 char16_t aSeparatorChar, |
|
167 uint32_t aFlags = 0) |
|
168 : nsCharSeparatedTokenizerTemplate<>(aSource, aSeparatorChar, aFlags) |
|
169 { |
|
170 } |
|
171 }; |
|
172 |
|
173 template<bool IsWhitespace(char16_t) = NS_IsAsciiWhitespace> |
|
174 class nsCCharSeparatedTokenizerTemplate |
|
175 { |
|
176 public: |
|
177 // Flags -- only one for now. If we need more, they should be defined to |
|
178 // be 1 << 1, 1 << 2, etc. (They're masks, and aFlags is a bitfield.) |
|
179 enum { |
|
180 SEPARATOR_OPTIONAL = 1 |
|
181 }; |
|
182 |
|
183 nsCCharSeparatedTokenizerTemplate(const nsCSubstring& aSource, |
|
184 char aSeparatorChar, |
|
185 uint32_t aFlags = 0) |
|
186 : mIter(aSource.Data(), aSource.Length()), |
|
187 mEnd(aSource.Data() + aSource.Length(), aSource.Data(), |
|
188 aSource.Length()), |
|
189 mSeparatorChar(aSeparatorChar), |
|
190 mWhitespaceBeforeFirstToken(false), |
|
191 mWhitespaceAfterCurrentToken(false), |
|
192 mSeparatorAfterCurrentToken(false), |
|
193 mSeparatorOptional(aFlags & SEPARATOR_OPTIONAL) |
|
194 { |
|
195 // Skip initial whitespace |
|
196 while (mIter < mEnd && IsWhitespace(*mIter)) { |
|
197 mWhitespaceBeforeFirstToken = true; |
|
198 ++mIter; |
|
199 } |
|
200 } |
|
201 |
|
202 /** |
|
203 * Checks if any more tokens are available. |
|
204 */ |
|
205 bool hasMoreTokens() const |
|
206 { |
|
207 MOZ_ASSERT(mIter == mEnd || !IsWhitespace(*mIter), |
|
208 "Should be at beginning of token if there is one"); |
|
209 |
|
210 return mIter < mEnd; |
|
211 } |
|
212 |
|
213 /* |
|
214 * Returns true if there is whitespace prior to the first token. |
|
215 */ |
|
216 bool whitespaceBeforeFirstToken() const |
|
217 { |
|
218 return mWhitespaceBeforeFirstToken; |
|
219 } |
|
220 |
|
221 /* |
|
222 * Returns true if there is a separator after the current token. |
|
223 * Useful if you want to check whether the last token has a separator |
|
224 * after it which may not be valid. |
|
225 */ |
|
226 bool separatorAfterCurrentToken() const |
|
227 { |
|
228 return mSeparatorAfterCurrentToken; |
|
229 } |
|
230 |
|
231 /* |
|
232 * Returns true if there is any whitespace after the current token. |
|
233 */ |
|
234 bool whitespaceAfterCurrentToken() const |
|
235 { |
|
236 return mWhitespaceAfterCurrentToken; |
|
237 } |
|
238 |
|
239 /** |
|
240 * Returns the next token. |
|
241 */ |
|
242 const nsDependentCSubstring nextToken() |
|
243 { |
|
244 mozilla::RangedPtr<const char> tokenStart = mIter, tokenEnd = mIter; |
|
245 |
|
246 MOZ_ASSERT(mIter == mEnd || !IsWhitespace(*mIter), |
|
247 "Should be at beginning of token if there is one"); |
|
248 |
|
249 // Search until we hit separator or end (or whitespace, if a separator |
|
250 // isn't required -- see clause with 'break' below). |
|
251 while (mIter < mEnd && *mIter != mSeparatorChar) { |
|
252 // Skip to end of the current word. |
|
253 while (mIter < mEnd && |
|
254 !IsWhitespace(*mIter) && *mIter != mSeparatorChar) { |
|
255 ++mIter; |
|
256 } |
|
257 tokenEnd = mIter; |
|
258 |
|
259 // Skip whitespace after the current word. |
|
260 mWhitespaceAfterCurrentToken = false; |
|
261 while (mIter < mEnd && IsWhitespace(*mIter)) { |
|
262 mWhitespaceAfterCurrentToken = true; |
|
263 ++mIter; |
|
264 } |
|
265 if (mSeparatorOptional) { |
|
266 // We've hit (and skipped) whitespace, and that's sufficient to end |
|
267 // our token, regardless of whether we've reached a SeparatorChar. |
|
268 break; |
|
269 } // (else, we'll keep looping until we hit mEnd or SeparatorChar) |
|
270 } |
|
271 |
|
272 mSeparatorAfterCurrentToken = (mIter != mEnd && |
|
273 *mIter == mSeparatorChar); |
|
274 MOZ_ASSERT(mSeparatorOptional || |
|
275 (mSeparatorAfterCurrentToken == (mIter < mEnd)), |
|
276 "If we require a separator and haven't hit the end of " |
|
277 "our string, then we shouldn't have left the loop " |
|
278 "unless we hit a separator"); |
|
279 |
|
280 // Skip separator (and any whitespace after it), if we're at one. |
|
281 if (mSeparatorAfterCurrentToken) { |
|
282 ++mIter; |
|
283 |
|
284 while (mIter < mEnd && IsWhitespace(*mIter)) { |
|
285 mWhitespaceAfterCurrentToken = true; |
|
286 ++mIter; |
|
287 } |
|
288 } |
|
289 |
|
290 return Substring(tokenStart.get(), tokenEnd.get()); |
|
291 } |
|
292 |
|
293 private: |
|
294 mozilla::RangedPtr<const char> mIter; |
|
295 const mozilla::RangedPtr<const char> mEnd; |
|
296 char mSeparatorChar; |
|
297 bool mWhitespaceBeforeFirstToken; |
|
298 bool mWhitespaceAfterCurrentToken; |
|
299 bool mSeparatorAfterCurrentToken; |
|
300 bool mSeparatorOptional; |
|
301 }; |
|
302 |
|
303 class nsCCharSeparatedTokenizer: public nsCCharSeparatedTokenizerTemplate<> |
|
304 { |
|
305 public: |
|
306 nsCCharSeparatedTokenizer(const nsCSubstring& aSource, |
|
307 char aSeparatorChar, |
|
308 uint32_t aFlags = 0) |
|
309 : nsCCharSeparatedTokenizerTemplate<>(aSource, aSeparatorChar, aFlags) |
|
310 { |
|
311 } |
|
312 }; |
|
313 |
|
314 #endif /* __nsCharSeparatedTokenizer_h */ |