|
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
|
2 * vim: set ts=8 sts=4 et sw=4 tw=99: |
|
3 * |
|
4 * Copyright (C) 2009, 2010 Apple Inc. All rights reserved. |
|
5 * |
|
6 * Redistribution and use in source and binary forms, with or without |
|
7 * modification, are permitted provided that the following conditions |
|
8 * are met: |
|
9 * 1. Redistributions of source code must retain the above copyright |
|
10 * notice, this list of conditions and the following disclaimer. |
|
11 * 2. Redistributions in binary form must reproduce the above copyright |
|
12 * notice, this list of conditions and the following disclaimer in the |
|
13 * documentation and/or other materials provided with the distribution. |
|
14 * |
|
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
|
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
|
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
|
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
|
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
|
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
|
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
|
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
26 */ |
|
27 |
|
28 #ifndef yarr_YarrInterpreter_h |
|
29 #define yarr_YarrInterpreter_h |
|
30 |
|
31 #include "yarr/YarrPattern.h" |
|
32 |
|
33 namespace WTF { |
|
34 class BumpPointerAllocator; |
|
35 } |
|
36 using WTF::BumpPointerAllocator; |
|
37 |
|
38 namespace JSC { namespace Yarr { |
|
39 |
|
40 class ByteDisjunction; |
|
41 |
|
42 struct ByteTerm { |
|
43 enum Type { |
|
44 TypeBodyAlternativeBegin, |
|
45 TypeBodyAlternativeDisjunction, |
|
46 TypeBodyAlternativeEnd, |
|
47 TypeAlternativeBegin, |
|
48 TypeAlternativeDisjunction, |
|
49 TypeAlternativeEnd, |
|
50 TypeSubpatternBegin, |
|
51 TypeSubpatternEnd, |
|
52 TypeAssertionBOL, |
|
53 TypeAssertionEOL, |
|
54 TypeAssertionWordBoundary, |
|
55 TypePatternCharacterOnce, |
|
56 TypePatternCharacterFixed, |
|
57 TypePatternCharacterGreedy, |
|
58 TypePatternCharacterNonGreedy, |
|
59 TypePatternCasedCharacterOnce, |
|
60 TypePatternCasedCharacterFixed, |
|
61 TypePatternCasedCharacterGreedy, |
|
62 TypePatternCasedCharacterNonGreedy, |
|
63 TypeCharacterClass, |
|
64 TypeBackReference, |
|
65 TypeParenthesesSubpattern, |
|
66 TypeParenthesesSubpatternOnceBegin, |
|
67 TypeParenthesesSubpatternOnceEnd, |
|
68 TypeParenthesesSubpatternTerminalBegin, |
|
69 TypeParenthesesSubpatternTerminalEnd, |
|
70 TypeParentheticalAssertionBegin, |
|
71 TypeParentheticalAssertionEnd, |
|
72 TypeCheckInput, |
|
73 TypeUncheckInput, |
|
74 TypeDotStarEnclosure |
|
75 } type; |
|
76 union { |
|
77 struct { |
|
78 union { |
|
79 UChar patternCharacter; |
|
80 struct { |
|
81 UChar lo; |
|
82 UChar hi; |
|
83 } casedCharacter; |
|
84 CharacterClass* characterClass; |
|
85 unsigned subpatternId; |
|
86 }; |
|
87 union { |
|
88 ByteDisjunction* parenthesesDisjunction; |
|
89 unsigned parenthesesWidth; |
|
90 }; |
|
91 QuantifierType quantityType; |
|
92 unsigned quantityCount; |
|
93 } atom; |
|
94 struct { |
|
95 int next; |
|
96 int end; |
|
97 bool onceThrough; |
|
98 } alternative; |
|
99 struct { |
|
100 bool m_bol : 1; |
|
101 bool m_eol : 1; |
|
102 } anchors; |
|
103 unsigned checkInputCount; |
|
104 }; |
|
105 unsigned frameLocation; |
|
106 bool m_capture : 1; |
|
107 bool m_invert : 1; |
|
108 unsigned inputPosition; |
|
109 |
|
110 ByteTerm(UChar ch, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) |
|
111 : frameLocation(frameLocation) |
|
112 , m_capture(false) |
|
113 , m_invert(false) |
|
114 { |
|
115 switch (quantityType) { |
|
116 case QuantifierFixedCount: |
|
117 type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; |
|
118 break; |
|
119 case QuantifierGreedy: |
|
120 type = ByteTerm::TypePatternCharacterGreedy; |
|
121 break; |
|
122 case QuantifierNonGreedy: |
|
123 type = ByteTerm::TypePatternCharacterNonGreedy; |
|
124 break; |
|
125 } |
|
126 |
|
127 atom.patternCharacter = ch; |
|
128 atom.quantityType = quantityType; |
|
129 atom.quantityCount = quantityCount.unsafeGet(); |
|
130 inputPosition = inputPos; |
|
131 } |
|
132 |
|
133 ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) |
|
134 : frameLocation(frameLocation) |
|
135 , m_capture(false) |
|
136 , m_invert(false) |
|
137 { |
|
138 switch (quantityType) { |
|
139 case QuantifierFixedCount: |
|
140 type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; |
|
141 break; |
|
142 case QuantifierGreedy: |
|
143 type = ByteTerm::TypePatternCasedCharacterGreedy; |
|
144 break; |
|
145 case QuantifierNonGreedy: |
|
146 type = ByteTerm::TypePatternCasedCharacterNonGreedy; |
|
147 break; |
|
148 } |
|
149 |
|
150 atom.casedCharacter.lo = lo; |
|
151 atom.casedCharacter.hi = hi; |
|
152 atom.quantityType = quantityType; |
|
153 atom.quantityCount = quantityCount.unsafeGet(); |
|
154 inputPosition = inputPos; |
|
155 } |
|
156 |
|
157 ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) |
|
158 : type(ByteTerm::TypeCharacterClass) |
|
159 , m_capture(false) |
|
160 , m_invert(invert) |
|
161 { |
|
162 atom.characterClass = characterClass; |
|
163 atom.quantityType = QuantifierFixedCount; |
|
164 atom.quantityCount = 1; |
|
165 inputPosition = inputPos; |
|
166 } |
|
167 |
|
168 ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos) |
|
169 : type(type) |
|
170 , m_capture(capture) |
|
171 , m_invert(false) |
|
172 { |
|
173 atom.subpatternId = subpatternId; |
|
174 atom.parenthesesDisjunction = parenthesesInfo; |
|
175 atom.quantityType = QuantifierFixedCount; |
|
176 atom.quantityCount = 1; |
|
177 inputPosition = inputPos; |
|
178 } |
|
179 |
|
180 ByteTerm(Type type, bool invert = false) |
|
181 : type(type) |
|
182 , m_capture(false) |
|
183 , m_invert(invert) |
|
184 { |
|
185 atom.quantityType = QuantifierFixedCount; |
|
186 atom.quantityCount = 1; |
|
187 } |
|
188 |
|
189 ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos) |
|
190 : type(type) |
|
191 , m_capture(capture) |
|
192 , m_invert(invert) |
|
193 { |
|
194 atom.subpatternId = subpatternId; |
|
195 atom.quantityType = QuantifierFixedCount; |
|
196 atom.quantityCount = 1; |
|
197 inputPosition = inputPos; |
|
198 } |
|
199 |
|
200 // For js::Vector. Does not create a valid object. |
|
201 ByteTerm() |
|
202 { |
|
203 } |
|
204 |
|
205 static ByteTerm BOL(int inputPos) |
|
206 { |
|
207 ByteTerm term(TypeAssertionBOL); |
|
208 term.inputPosition = inputPos; |
|
209 return term; |
|
210 } |
|
211 |
|
212 static ByteTerm CheckInput(Checked<unsigned> count) |
|
213 { |
|
214 ByteTerm term(TypeCheckInput); |
|
215 term.checkInputCount = count.unsafeGet(); |
|
216 return term; |
|
217 } |
|
218 |
|
219 static ByteTerm UncheckInput(Checked<unsigned> count) |
|
220 { |
|
221 ByteTerm term(TypeUncheckInput); |
|
222 term.checkInputCount = count.unsafeGet(); |
|
223 return term; |
|
224 } |
|
225 |
|
226 static ByteTerm EOL(int inputPos) |
|
227 { |
|
228 ByteTerm term(TypeAssertionEOL); |
|
229 term.inputPosition = inputPos; |
|
230 return term; |
|
231 } |
|
232 |
|
233 static ByteTerm WordBoundary(bool invert, int inputPos) |
|
234 { |
|
235 ByteTerm term(TypeAssertionWordBoundary, invert); |
|
236 term.inputPosition = inputPos; |
|
237 return term; |
|
238 } |
|
239 |
|
240 static ByteTerm BackReference(unsigned subpatternId, int inputPos) |
|
241 { |
|
242 return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); |
|
243 } |
|
244 |
|
245 static ByteTerm BodyAlternativeBegin(bool onceThrough) |
|
246 { |
|
247 ByteTerm term(TypeBodyAlternativeBegin); |
|
248 term.alternative.next = 0; |
|
249 term.alternative.end = 0; |
|
250 term.alternative.onceThrough = onceThrough; |
|
251 return term; |
|
252 } |
|
253 |
|
254 static ByteTerm BodyAlternativeDisjunction(bool onceThrough) |
|
255 { |
|
256 ByteTerm term(TypeBodyAlternativeDisjunction); |
|
257 term.alternative.next = 0; |
|
258 term.alternative.end = 0; |
|
259 term.alternative.onceThrough = onceThrough; |
|
260 return term; |
|
261 } |
|
262 |
|
263 static ByteTerm BodyAlternativeEnd() |
|
264 { |
|
265 ByteTerm term(TypeBodyAlternativeEnd); |
|
266 term.alternative.next = 0; |
|
267 term.alternative.end = 0; |
|
268 term.alternative.onceThrough = false; |
|
269 return term; |
|
270 } |
|
271 |
|
272 static ByteTerm AlternativeBegin() |
|
273 { |
|
274 ByteTerm term(TypeAlternativeBegin); |
|
275 term.alternative.next = 0; |
|
276 term.alternative.end = 0; |
|
277 term.alternative.onceThrough = false; |
|
278 return term; |
|
279 } |
|
280 |
|
281 static ByteTerm AlternativeDisjunction() |
|
282 { |
|
283 ByteTerm term(TypeAlternativeDisjunction); |
|
284 term.alternative.next = 0; |
|
285 term.alternative.end = 0; |
|
286 term.alternative.onceThrough = false; |
|
287 return term; |
|
288 } |
|
289 |
|
290 static ByteTerm AlternativeEnd() |
|
291 { |
|
292 ByteTerm term(TypeAlternativeEnd); |
|
293 term.alternative.next = 0; |
|
294 term.alternative.end = 0; |
|
295 term.alternative.onceThrough = false; |
|
296 return term; |
|
297 } |
|
298 |
|
299 static ByteTerm SubpatternBegin() |
|
300 { |
|
301 return ByteTerm(TypeSubpatternBegin); |
|
302 } |
|
303 |
|
304 static ByteTerm SubpatternEnd() |
|
305 { |
|
306 return ByteTerm(TypeSubpatternEnd); |
|
307 } |
|
308 |
|
309 static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor) |
|
310 { |
|
311 ByteTerm term(TypeDotStarEnclosure); |
|
312 term.anchors.m_bol = bolAnchor; |
|
313 term.anchors.m_eol = eolAnchor; |
|
314 return term; |
|
315 } |
|
316 |
|
317 bool invert() |
|
318 { |
|
319 return m_invert; |
|
320 } |
|
321 |
|
322 bool capture() |
|
323 { |
|
324 return m_capture; |
|
325 } |
|
326 }; |
|
327 |
|
328 class ByteDisjunction { |
|
329 WTF_MAKE_FAST_ALLOCATED; |
|
330 public: |
|
331 ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) |
|
332 : m_numSubpatterns(numSubpatterns) |
|
333 , m_frameSize(frameSize) |
|
334 { |
|
335 } |
|
336 |
|
337 Vector<ByteTerm> terms; |
|
338 unsigned m_numSubpatterns; |
|
339 unsigned m_frameSize; |
|
340 }; |
|
341 |
|
342 struct BytecodePattern { |
|
343 WTF_MAKE_FAST_ALLOCATED; |
|
344 public: |
|
345 BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<ByteDisjunction*> &allParenthesesInfo, YarrPattern& pattern, BumpPointerAllocator* allocator) |
|
346 : m_body(body) |
|
347 , m_ignoreCase(pattern.m_ignoreCase) |
|
348 , m_multiline(pattern.m_multiline) |
|
349 , m_allocator(allocator) |
|
350 { |
|
351 newlineCharacterClass = pattern.newlineCharacterClass(); |
|
352 wordcharCharacterClass = pattern.wordcharCharacterClass(); |
|
353 |
|
354 // Trick: 'Steal' the YarrPattern's ParenthesesInfo! |
|
355 // The input vector isn't used afterwards anymore, |
|
356 // that way we don't have to copy the input. |
|
357 JS_ASSERT(m_allParenthesesInfo.size() == 0); |
|
358 m_allParenthesesInfo.swap(allParenthesesInfo); |
|
359 |
|
360 // Trick: 'Steal' the YarrPattern's CharacterClasses! |
|
361 // The input vector isn't used afterwards anymore, |
|
362 // that way we don't have to copy the input. |
|
363 JS_ASSERT(m_userCharacterClasses.size() == 0); |
|
364 m_userCharacterClasses.swap(pattern.m_userCharacterClasses); |
|
365 } |
|
366 |
|
367 ~BytecodePattern() |
|
368 { |
|
369 deleteAllValues(m_allParenthesesInfo); |
|
370 deleteAllValues(m_userCharacterClasses); |
|
371 } |
|
372 |
|
373 OwnPtr<ByteDisjunction> m_body; |
|
374 bool m_ignoreCase; |
|
375 bool m_multiline; |
|
376 // Each BytecodePattern is associated with a RegExp, each RegExp is associated |
|
377 // with a JSGlobalData. Cache a pointer to out JSGlobalData's m_regExpAllocator. |
|
378 BumpPointerAllocator* m_allocator; |
|
379 |
|
380 CharacterClass* newlineCharacterClass; |
|
381 CharacterClass* wordcharCharacterClass; |
|
382 |
|
383 private: |
|
384 Vector<ByteDisjunction*> m_allParenthesesInfo; |
|
385 Vector<CharacterClass*> m_userCharacterClasses; |
|
386 }; |
|
387 |
|
388 JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*); |
|
389 JS_EXPORT_PRIVATE unsigned interpret(JSContext *cx, BytecodePattern*, const String& input, unsigned start, unsigned* output); |
|
390 unsigned interpret(JSContext *cx, BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output); |
|
391 unsigned interpret(JSContext *cx, BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output); |
|
392 |
|
393 } } // namespace JSC::Yarr |
|
394 |
|
395 #endif /* yarr_YarrInterpreter_h */ |