1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/js/src/yarr/YarrInterpreter.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,395 @@ 1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- 1.5 + * vim: set ts=8 sts=4 et sw=4 tw=99: 1.6 + * 1.7 + * Copyright (C) 2009, 2010 Apple Inc. All rights reserved. 1.8 + * 1.9 + * Redistribution and use in source and binary forms, with or without 1.10 + * modification, are permitted provided that the following conditions 1.11 + * are met: 1.12 + * 1. Redistributions of source code must retain the above copyright 1.13 + * notice, this list of conditions and the following disclaimer. 1.14 + * 2. Redistributions in binary form must reproduce the above copyright 1.15 + * notice, this list of conditions and the following disclaimer in the 1.16 + * documentation and/or other materials provided with the distribution. 1.17 + * 1.18 + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 1.19 + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1.20 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 1.21 + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 1.22 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 1.23 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 1.24 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 1.25 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 1.26 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 1.27 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 1.28 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 1.29 + */ 1.30 + 1.31 +#ifndef yarr_YarrInterpreter_h 1.32 +#define yarr_YarrInterpreter_h 1.33 + 1.34 +#include "yarr/YarrPattern.h" 1.35 + 1.36 +namespace WTF { 1.37 +class BumpPointerAllocator; 1.38 +} 1.39 +using WTF::BumpPointerAllocator; 1.40 + 1.41 +namespace JSC { namespace Yarr { 1.42 + 1.43 +class ByteDisjunction; 1.44 + 1.45 +struct ByteTerm { 1.46 + enum Type { 1.47 + TypeBodyAlternativeBegin, 1.48 + TypeBodyAlternativeDisjunction, 1.49 + TypeBodyAlternativeEnd, 1.50 + TypeAlternativeBegin, 1.51 + TypeAlternativeDisjunction, 1.52 + TypeAlternativeEnd, 1.53 + TypeSubpatternBegin, 1.54 + TypeSubpatternEnd, 1.55 + TypeAssertionBOL, 1.56 + TypeAssertionEOL, 1.57 + TypeAssertionWordBoundary, 1.58 + TypePatternCharacterOnce, 1.59 + TypePatternCharacterFixed, 1.60 + TypePatternCharacterGreedy, 1.61 + TypePatternCharacterNonGreedy, 1.62 + TypePatternCasedCharacterOnce, 1.63 + TypePatternCasedCharacterFixed, 1.64 + TypePatternCasedCharacterGreedy, 1.65 + TypePatternCasedCharacterNonGreedy, 1.66 + TypeCharacterClass, 1.67 + TypeBackReference, 1.68 + TypeParenthesesSubpattern, 1.69 + TypeParenthesesSubpatternOnceBegin, 1.70 + TypeParenthesesSubpatternOnceEnd, 1.71 + TypeParenthesesSubpatternTerminalBegin, 1.72 + TypeParenthesesSubpatternTerminalEnd, 1.73 + TypeParentheticalAssertionBegin, 1.74 + TypeParentheticalAssertionEnd, 1.75 + TypeCheckInput, 1.76 + TypeUncheckInput, 1.77 + TypeDotStarEnclosure 1.78 + } type; 1.79 + union { 1.80 + struct { 1.81 + union { 1.82 + UChar patternCharacter; 1.83 + struct { 1.84 + UChar lo; 1.85 + UChar hi; 1.86 + } casedCharacter; 1.87 + CharacterClass* characterClass; 1.88 + unsigned subpatternId; 1.89 + }; 1.90 + union { 1.91 + ByteDisjunction* parenthesesDisjunction; 1.92 + unsigned parenthesesWidth; 1.93 + }; 1.94 + QuantifierType quantityType; 1.95 + unsigned quantityCount; 1.96 + } atom; 1.97 + struct { 1.98 + int next; 1.99 + int end; 1.100 + bool onceThrough; 1.101 + } alternative; 1.102 + struct { 1.103 + bool m_bol : 1; 1.104 + bool m_eol : 1; 1.105 + } anchors; 1.106 + unsigned checkInputCount; 1.107 + }; 1.108 + unsigned frameLocation; 1.109 + bool m_capture : 1; 1.110 + bool m_invert : 1; 1.111 + unsigned inputPosition; 1.112 + 1.113 + ByteTerm(UChar ch, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) 1.114 + : frameLocation(frameLocation) 1.115 + , m_capture(false) 1.116 + , m_invert(false) 1.117 + { 1.118 + switch (quantityType) { 1.119 + case QuantifierFixedCount: 1.120 + type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; 1.121 + break; 1.122 + case QuantifierGreedy: 1.123 + type = ByteTerm::TypePatternCharacterGreedy; 1.124 + break; 1.125 + case QuantifierNonGreedy: 1.126 + type = ByteTerm::TypePatternCharacterNonGreedy; 1.127 + break; 1.128 + } 1.129 + 1.130 + atom.patternCharacter = ch; 1.131 + atom.quantityType = quantityType; 1.132 + atom.quantityCount = quantityCount.unsafeGet(); 1.133 + inputPosition = inputPos; 1.134 + } 1.135 + 1.136 + ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) 1.137 + : frameLocation(frameLocation) 1.138 + , m_capture(false) 1.139 + , m_invert(false) 1.140 + { 1.141 + switch (quantityType) { 1.142 + case QuantifierFixedCount: 1.143 + type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; 1.144 + break; 1.145 + case QuantifierGreedy: 1.146 + type = ByteTerm::TypePatternCasedCharacterGreedy; 1.147 + break; 1.148 + case QuantifierNonGreedy: 1.149 + type = ByteTerm::TypePatternCasedCharacterNonGreedy; 1.150 + break; 1.151 + } 1.152 + 1.153 + atom.casedCharacter.lo = lo; 1.154 + atom.casedCharacter.hi = hi; 1.155 + atom.quantityType = quantityType; 1.156 + atom.quantityCount = quantityCount.unsafeGet(); 1.157 + inputPosition = inputPos; 1.158 + } 1.159 + 1.160 + ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) 1.161 + : type(ByteTerm::TypeCharacterClass) 1.162 + , m_capture(false) 1.163 + , m_invert(invert) 1.164 + { 1.165 + atom.characterClass = characterClass; 1.166 + atom.quantityType = QuantifierFixedCount; 1.167 + atom.quantityCount = 1; 1.168 + inputPosition = inputPos; 1.169 + } 1.170 + 1.171 + ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos) 1.172 + : type(type) 1.173 + , m_capture(capture) 1.174 + , m_invert(false) 1.175 + { 1.176 + atom.subpatternId = subpatternId; 1.177 + atom.parenthesesDisjunction = parenthesesInfo; 1.178 + atom.quantityType = QuantifierFixedCount; 1.179 + atom.quantityCount = 1; 1.180 + inputPosition = inputPos; 1.181 + } 1.182 + 1.183 + ByteTerm(Type type, bool invert = false) 1.184 + : type(type) 1.185 + , m_capture(false) 1.186 + , m_invert(invert) 1.187 + { 1.188 + atom.quantityType = QuantifierFixedCount; 1.189 + atom.quantityCount = 1; 1.190 + } 1.191 + 1.192 + ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos) 1.193 + : type(type) 1.194 + , m_capture(capture) 1.195 + , m_invert(invert) 1.196 + { 1.197 + atom.subpatternId = subpatternId; 1.198 + atom.quantityType = QuantifierFixedCount; 1.199 + atom.quantityCount = 1; 1.200 + inputPosition = inputPos; 1.201 + } 1.202 + 1.203 + // For js::Vector. Does not create a valid object. 1.204 + ByteTerm() 1.205 + { 1.206 + } 1.207 + 1.208 + static ByteTerm BOL(int inputPos) 1.209 + { 1.210 + ByteTerm term(TypeAssertionBOL); 1.211 + term.inputPosition = inputPos; 1.212 + return term; 1.213 + } 1.214 + 1.215 + static ByteTerm CheckInput(Checked<unsigned> count) 1.216 + { 1.217 + ByteTerm term(TypeCheckInput); 1.218 + term.checkInputCount = count.unsafeGet(); 1.219 + return term; 1.220 + } 1.221 + 1.222 + static ByteTerm UncheckInput(Checked<unsigned> count) 1.223 + { 1.224 + ByteTerm term(TypeUncheckInput); 1.225 + term.checkInputCount = count.unsafeGet(); 1.226 + return term; 1.227 + } 1.228 + 1.229 + static ByteTerm EOL(int inputPos) 1.230 + { 1.231 + ByteTerm term(TypeAssertionEOL); 1.232 + term.inputPosition = inputPos; 1.233 + return term; 1.234 + } 1.235 + 1.236 + static ByteTerm WordBoundary(bool invert, int inputPos) 1.237 + { 1.238 + ByteTerm term(TypeAssertionWordBoundary, invert); 1.239 + term.inputPosition = inputPos; 1.240 + return term; 1.241 + } 1.242 + 1.243 + static ByteTerm BackReference(unsigned subpatternId, int inputPos) 1.244 + { 1.245 + return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); 1.246 + } 1.247 + 1.248 + static ByteTerm BodyAlternativeBegin(bool onceThrough) 1.249 + { 1.250 + ByteTerm term(TypeBodyAlternativeBegin); 1.251 + term.alternative.next = 0; 1.252 + term.alternative.end = 0; 1.253 + term.alternative.onceThrough = onceThrough; 1.254 + return term; 1.255 + } 1.256 + 1.257 + static ByteTerm BodyAlternativeDisjunction(bool onceThrough) 1.258 + { 1.259 + ByteTerm term(TypeBodyAlternativeDisjunction); 1.260 + term.alternative.next = 0; 1.261 + term.alternative.end = 0; 1.262 + term.alternative.onceThrough = onceThrough; 1.263 + return term; 1.264 + } 1.265 + 1.266 + static ByteTerm BodyAlternativeEnd() 1.267 + { 1.268 + ByteTerm term(TypeBodyAlternativeEnd); 1.269 + term.alternative.next = 0; 1.270 + term.alternative.end = 0; 1.271 + term.alternative.onceThrough = false; 1.272 + return term; 1.273 + } 1.274 + 1.275 + static ByteTerm AlternativeBegin() 1.276 + { 1.277 + ByteTerm term(TypeAlternativeBegin); 1.278 + term.alternative.next = 0; 1.279 + term.alternative.end = 0; 1.280 + term.alternative.onceThrough = false; 1.281 + return term; 1.282 + } 1.283 + 1.284 + static ByteTerm AlternativeDisjunction() 1.285 + { 1.286 + ByteTerm term(TypeAlternativeDisjunction); 1.287 + term.alternative.next = 0; 1.288 + term.alternative.end = 0; 1.289 + term.alternative.onceThrough = false; 1.290 + return term; 1.291 + } 1.292 + 1.293 + static ByteTerm AlternativeEnd() 1.294 + { 1.295 + ByteTerm term(TypeAlternativeEnd); 1.296 + term.alternative.next = 0; 1.297 + term.alternative.end = 0; 1.298 + term.alternative.onceThrough = false; 1.299 + return term; 1.300 + } 1.301 + 1.302 + static ByteTerm SubpatternBegin() 1.303 + { 1.304 + return ByteTerm(TypeSubpatternBegin); 1.305 + } 1.306 + 1.307 + static ByteTerm SubpatternEnd() 1.308 + { 1.309 + return ByteTerm(TypeSubpatternEnd); 1.310 + } 1.311 + 1.312 + static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor) 1.313 + { 1.314 + ByteTerm term(TypeDotStarEnclosure); 1.315 + term.anchors.m_bol = bolAnchor; 1.316 + term.anchors.m_eol = eolAnchor; 1.317 + return term; 1.318 + } 1.319 + 1.320 + bool invert() 1.321 + { 1.322 + return m_invert; 1.323 + } 1.324 + 1.325 + bool capture() 1.326 + { 1.327 + return m_capture; 1.328 + } 1.329 +}; 1.330 + 1.331 +class ByteDisjunction { 1.332 + WTF_MAKE_FAST_ALLOCATED; 1.333 +public: 1.334 + ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) 1.335 + : m_numSubpatterns(numSubpatterns) 1.336 + , m_frameSize(frameSize) 1.337 + { 1.338 + } 1.339 + 1.340 + Vector<ByteTerm> terms; 1.341 + unsigned m_numSubpatterns; 1.342 + unsigned m_frameSize; 1.343 +}; 1.344 + 1.345 +struct BytecodePattern { 1.346 + WTF_MAKE_FAST_ALLOCATED; 1.347 +public: 1.348 + BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<ByteDisjunction*> &allParenthesesInfo, YarrPattern& pattern, BumpPointerAllocator* allocator) 1.349 + : m_body(body) 1.350 + , m_ignoreCase(pattern.m_ignoreCase) 1.351 + , m_multiline(pattern.m_multiline) 1.352 + , m_allocator(allocator) 1.353 + { 1.354 + newlineCharacterClass = pattern.newlineCharacterClass(); 1.355 + wordcharCharacterClass = pattern.wordcharCharacterClass(); 1.356 + 1.357 + // Trick: 'Steal' the YarrPattern's ParenthesesInfo! 1.358 + // The input vector isn't used afterwards anymore, 1.359 + // that way we don't have to copy the input. 1.360 + JS_ASSERT(m_allParenthesesInfo.size() == 0); 1.361 + m_allParenthesesInfo.swap(allParenthesesInfo); 1.362 + 1.363 + // Trick: 'Steal' the YarrPattern's CharacterClasses! 1.364 + // The input vector isn't used afterwards anymore, 1.365 + // that way we don't have to copy the input. 1.366 + JS_ASSERT(m_userCharacterClasses.size() == 0); 1.367 + m_userCharacterClasses.swap(pattern.m_userCharacterClasses); 1.368 + } 1.369 + 1.370 + ~BytecodePattern() 1.371 + { 1.372 + deleteAllValues(m_allParenthesesInfo); 1.373 + deleteAllValues(m_userCharacterClasses); 1.374 + } 1.375 + 1.376 + OwnPtr<ByteDisjunction> m_body; 1.377 + bool m_ignoreCase; 1.378 + bool m_multiline; 1.379 + // Each BytecodePattern is associated with a RegExp, each RegExp is associated 1.380 + // with a JSGlobalData. Cache a pointer to out JSGlobalData's m_regExpAllocator. 1.381 + BumpPointerAllocator* m_allocator; 1.382 + 1.383 + CharacterClass* newlineCharacterClass; 1.384 + CharacterClass* wordcharCharacterClass; 1.385 + 1.386 +private: 1.387 + Vector<ByteDisjunction*> m_allParenthesesInfo; 1.388 + Vector<CharacterClass*> m_userCharacterClasses; 1.389 +}; 1.390 + 1.391 +JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*); 1.392 +JS_EXPORT_PRIVATE unsigned interpret(JSContext *cx, BytecodePattern*, const String& input, unsigned start, unsigned* output); 1.393 +unsigned interpret(JSContext *cx, BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output); 1.394 +unsigned interpret(JSContext *cx, BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output); 1.395 + 1.396 +} } // namespace JSC::Yarr 1.397 + 1.398 +#endif /* yarr_YarrInterpreter_h */