Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | // |
michael@0 | 2 | // file: repattrn.cpp |
michael@0 | 3 | // |
michael@0 | 4 | /* |
michael@0 | 5 | *************************************************************************** |
michael@0 | 6 | * Copyright (C) 2002-2012 International Business Machines Corporation * |
michael@0 | 7 | * and others. All rights reserved. * |
michael@0 | 8 | *************************************************************************** |
michael@0 | 9 | */ |
michael@0 | 10 | |
michael@0 | 11 | #include "unicode/utypes.h" |
michael@0 | 12 | |
michael@0 | 13 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
michael@0 | 14 | |
michael@0 | 15 | #include "unicode/regex.h" |
michael@0 | 16 | #include "unicode/uclean.h" |
michael@0 | 17 | #include "uassert.h" |
michael@0 | 18 | #include "uvector.h" |
michael@0 | 19 | #include "uvectr32.h" |
michael@0 | 20 | #include "uvectr64.h" |
michael@0 | 21 | #include "regexcmp.h" |
michael@0 | 22 | #include "regeximp.h" |
michael@0 | 23 | #include "regexst.h" |
michael@0 | 24 | |
michael@0 | 25 | U_NAMESPACE_BEGIN |
michael@0 | 26 | |
michael@0 | 27 | //-------------------------------------------------------------------------- |
michael@0 | 28 | // |
michael@0 | 29 | // RegexPattern Default Constructor |
michael@0 | 30 | // |
michael@0 | 31 | //-------------------------------------------------------------------------- |
michael@0 | 32 | RegexPattern::RegexPattern() { |
michael@0 | 33 | // Init all of this instances data. |
michael@0 | 34 | init(); |
michael@0 | 35 | } |
michael@0 | 36 | |
michael@0 | 37 | |
michael@0 | 38 | //-------------------------------------------------------------------------- |
michael@0 | 39 | // |
michael@0 | 40 | // Copy Constructor Note: This is a rather inefficient implementation, |
michael@0 | 41 | // but it probably doesn't matter. |
michael@0 | 42 | // |
michael@0 | 43 | //-------------------------------------------------------------------------- |
michael@0 | 44 | RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) { |
michael@0 | 45 | init(); |
michael@0 | 46 | *this = other; |
michael@0 | 47 | } |
michael@0 | 48 | |
michael@0 | 49 | |
michael@0 | 50 | |
michael@0 | 51 | //-------------------------------------------------------------------------- |
michael@0 | 52 | // |
michael@0 | 53 | // Assignment Operator |
michael@0 | 54 | // |
michael@0 | 55 | //-------------------------------------------------------------------------- |
michael@0 | 56 | RegexPattern &RegexPattern::operator = (const RegexPattern &other) { |
michael@0 | 57 | if (this == &other) { |
michael@0 | 58 | // Source and destination are the same. Don't do anything. |
michael@0 | 59 | return *this; |
michael@0 | 60 | } |
michael@0 | 61 | |
michael@0 | 62 | // Clean out any previous contents of object being assigned to. |
michael@0 | 63 | zap(); |
michael@0 | 64 | |
michael@0 | 65 | // Give target object a default initialization |
michael@0 | 66 | init(); |
michael@0 | 67 | |
michael@0 | 68 | // Copy simple fields |
michael@0 | 69 | if ( other.fPatternString == NULL ) { |
michael@0 | 70 | fPatternString = NULL; |
michael@0 | 71 | fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus); |
michael@0 | 72 | } else { |
michael@0 | 73 | fPatternString = new UnicodeString(*(other.fPatternString)); |
michael@0 | 74 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 75 | fPattern = utext_openConstUnicodeString(NULL, fPatternString, &status); |
michael@0 | 76 | if (U_FAILURE(status)) { |
michael@0 | 77 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 78 | return *this; |
michael@0 | 79 | } |
michael@0 | 80 | } |
michael@0 | 81 | fFlags = other.fFlags; |
michael@0 | 82 | fLiteralText = other.fLiteralText; |
michael@0 | 83 | fDeferredStatus = other.fDeferredStatus; |
michael@0 | 84 | fMinMatchLen = other.fMinMatchLen; |
michael@0 | 85 | fFrameSize = other.fFrameSize; |
michael@0 | 86 | fDataSize = other.fDataSize; |
michael@0 | 87 | fMaxCaptureDigits = other.fMaxCaptureDigits; |
michael@0 | 88 | fStaticSets = other.fStaticSets; |
michael@0 | 89 | fStaticSets8 = other.fStaticSets8; |
michael@0 | 90 | |
michael@0 | 91 | fStartType = other.fStartType; |
michael@0 | 92 | fInitialStringIdx = other.fInitialStringIdx; |
michael@0 | 93 | fInitialStringLen = other.fInitialStringLen; |
michael@0 | 94 | *fInitialChars = *other.fInitialChars; |
michael@0 | 95 | fInitialChar = other.fInitialChar; |
michael@0 | 96 | *fInitialChars8 = *other.fInitialChars8; |
michael@0 | 97 | fNeedsAltInput = other.fNeedsAltInput; |
michael@0 | 98 | |
michael@0 | 99 | // Copy the pattern. It's just values, nothing deep to copy. |
michael@0 | 100 | fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus); |
michael@0 | 101 | fGroupMap->assign(*other.fGroupMap, fDeferredStatus); |
michael@0 | 102 | |
michael@0 | 103 | // Copy the Unicode Sets. |
michael@0 | 104 | // Could be made more efficient if the sets were reference counted and shared, |
michael@0 | 105 | // but I doubt that pattern copying will be particularly common. |
michael@0 | 106 | // Note: init() already added an empty element zero to fSets |
michael@0 | 107 | int32_t i; |
michael@0 | 108 | int32_t numSets = other.fSets->size(); |
michael@0 | 109 | fSets8 = new Regex8BitSet[numSets]; |
michael@0 | 110 | if (fSets8 == NULL) { |
michael@0 | 111 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 112 | return *this; |
michael@0 | 113 | } |
michael@0 | 114 | for (i=1; i<numSets; i++) { |
michael@0 | 115 | if (U_FAILURE(fDeferredStatus)) { |
michael@0 | 116 | return *this; |
michael@0 | 117 | } |
michael@0 | 118 | UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i); |
michael@0 | 119 | UnicodeSet *newSet = new UnicodeSet(*sourceSet); |
michael@0 | 120 | if (newSet == NULL) { |
michael@0 | 121 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 122 | break; |
michael@0 | 123 | } |
michael@0 | 124 | fSets->addElement(newSet, fDeferredStatus); |
michael@0 | 125 | fSets8[i] = other.fSets8[i]; |
michael@0 | 126 | } |
michael@0 | 127 | |
michael@0 | 128 | return *this; |
michael@0 | 129 | } |
michael@0 | 130 | |
michael@0 | 131 | |
michael@0 | 132 | //-------------------------------------------------------------------------- |
michael@0 | 133 | // |
michael@0 | 134 | // init Shared initialization for use by constructors. |
michael@0 | 135 | // Bring an uninitialized RegexPattern up to a default state. |
michael@0 | 136 | // |
michael@0 | 137 | //-------------------------------------------------------------------------- |
michael@0 | 138 | void RegexPattern::init() { |
michael@0 | 139 | fFlags = 0; |
michael@0 | 140 | fCompiledPat = 0; |
michael@0 | 141 | fLiteralText.remove(); |
michael@0 | 142 | fSets = NULL; |
michael@0 | 143 | fSets8 = NULL; |
michael@0 | 144 | fDeferredStatus = U_ZERO_ERROR; |
michael@0 | 145 | fMinMatchLen = 0; |
michael@0 | 146 | fFrameSize = 0; |
michael@0 | 147 | fDataSize = 0; |
michael@0 | 148 | fGroupMap = NULL; |
michael@0 | 149 | fMaxCaptureDigits = 1; |
michael@0 | 150 | fStaticSets = NULL; |
michael@0 | 151 | fStaticSets8 = NULL; |
michael@0 | 152 | fStartType = START_NO_INFO; |
michael@0 | 153 | fInitialStringIdx = 0; |
michael@0 | 154 | fInitialStringLen = 0; |
michael@0 | 155 | fInitialChars = NULL; |
michael@0 | 156 | fInitialChar = 0; |
michael@0 | 157 | fInitialChars8 = NULL; |
michael@0 | 158 | fNeedsAltInput = FALSE; |
michael@0 | 159 | |
michael@0 | 160 | fPattern = NULL; // will be set later |
michael@0 | 161 | fPatternString = NULL; // may be set later |
michael@0 | 162 | fCompiledPat = new UVector64(fDeferredStatus); |
michael@0 | 163 | fGroupMap = new UVector32(fDeferredStatus); |
michael@0 | 164 | fSets = new UVector(fDeferredStatus); |
michael@0 | 165 | fInitialChars = new UnicodeSet; |
michael@0 | 166 | fInitialChars8 = new Regex8BitSet; |
michael@0 | 167 | if (U_FAILURE(fDeferredStatus)) { |
michael@0 | 168 | return; |
michael@0 | 169 | } |
michael@0 | 170 | if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL || |
michael@0 | 171 | fInitialChars == NULL || fInitialChars8 == NULL) { |
michael@0 | 172 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 173 | return; |
michael@0 | 174 | } |
michael@0 | 175 | |
michael@0 | 176 | // Slot zero of the vector of sets is reserved. Fill it here. |
michael@0 | 177 | fSets->addElement((int32_t)0, fDeferredStatus); |
michael@0 | 178 | } |
michael@0 | 179 | |
michael@0 | 180 | |
michael@0 | 181 | //-------------------------------------------------------------------------- |
michael@0 | 182 | // |
michael@0 | 183 | // zap Delete everything owned by this RegexPattern. |
michael@0 | 184 | // |
michael@0 | 185 | //-------------------------------------------------------------------------- |
michael@0 | 186 | void RegexPattern::zap() { |
michael@0 | 187 | delete fCompiledPat; |
michael@0 | 188 | fCompiledPat = NULL; |
michael@0 | 189 | int i; |
michael@0 | 190 | for (i=1; i<fSets->size(); i++) { |
michael@0 | 191 | UnicodeSet *s; |
michael@0 | 192 | s = (UnicodeSet *)fSets->elementAt(i); |
michael@0 | 193 | if (s != NULL) { |
michael@0 | 194 | delete s; |
michael@0 | 195 | } |
michael@0 | 196 | } |
michael@0 | 197 | delete fSets; |
michael@0 | 198 | fSets = NULL; |
michael@0 | 199 | delete[] fSets8; |
michael@0 | 200 | fSets8 = NULL; |
michael@0 | 201 | delete fGroupMap; |
michael@0 | 202 | fGroupMap = NULL; |
michael@0 | 203 | delete fInitialChars; |
michael@0 | 204 | fInitialChars = NULL; |
michael@0 | 205 | delete fInitialChars8; |
michael@0 | 206 | fInitialChars8 = NULL; |
michael@0 | 207 | if (fPattern != NULL) { |
michael@0 | 208 | utext_close(fPattern); |
michael@0 | 209 | fPattern = NULL; |
michael@0 | 210 | } |
michael@0 | 211 | if (fPatternString != NULL) { |
michael@0 | 212 | delete fPatternString; |
michael@0 | 213 | fPatternString = NULL; |
michael@0 | 214 | } |
michael@0 | 215 | } |
michael@0 | 216 | |
michael@0 | 217 | |
michael@0 | 218 | //-------------------------------------------------------------------------- |
michael@0 | 219 | // |
michael@0 | 220 | // Destructor |
michael@0 | 221 | // |
michael@0 | 222 | //-------------------------------------------------------------------------- |
michael@0 | 223 | RegexPattern::~RegexPattern() { |
michael@0 | 224 | zap(); |
michael@0 | 225 | } |
michael@0 | 226 | |
michael@0 | 227 | |
michael@0 | 228 | //-------------------------------------------------------------------------- |
michael@0 | 229 | // |
michael@0 | 230 | // Clone |
michael@0 | 231 | // |
michael@0 | 232 | //-------------------------------------------------------------------------- |
michael@0 | 233 | RegexPattern *RegexPattern::clone() const { |
michael@0 | 234 | RegexPattern *copy = new RegexPattern(*this); |
michael@0 | 235 | return copy; |
michael@0 | 236 | } |
michael@0 | 237 | |
michael@0 | 238 | |
michael@0 | 239 | //-------------------------------------------------------------------------- |
michael@0 | 240 | // |
michael@0 | 241 | // operator == (comparison) Consider two patterns to be == if the |
michael@0 | 242 | // pattern strings and the flags are the same. |
michael@0 | 243 | // Note that pattern strings with the same |
michael@0 | 244 | // characters can still be considered different. |
michael@0 | 245 | // |
michael@0 | 246 | //-------------------------------------------------------------------------- |
michael@0 | 247 | UBool RegexPattern::operator ==(const RegexPattern &other) const { |
michael@0 | 248 | if (this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus) { |
michael@0 | 249 | if (this->fPatternString != NULL && other.fPatternString != NULL) { |
michael@0 | 250 | return *(this->fPatternString) == *(other.fPatternString); |
michael@0 | 251 | } else if (this->fPattern == NULL) { |
michael@0 | 252 | if (other.fPattern == NULL) { |
michael@0 | 253 | return TRUE; |
michael@0 | 254 | } |
michael@0 | 255 | } else if (other.fPattern != NULL) { |
michael@0 | 256 | UTEXT_SETNATIVEINDEX(this->fPattern, 0); |
michael@0 | 257 | UTEXT_SETNATIVEINDEX(other.fPattern, 0); |
michael@0 | 258 | return utext_equals(this->fPattern, other.fPattern); |
michael@0 | 259 | } |
michael@0 | 260 | } |
michael@0 | 261 | return FALSE; |
michael@0 | 262 | } |
michael@0 | 263 | |
michael@0 | 264 | //--------------------------------------------------------------------- |
michael@0 | 265 | // |
michael@0 | 266 | // compile |
michael@0 | 267 | // |
michael@0 | 268 | //--------------------------------------------------------------------- |
michael@0 | 269 | RegexPattern * U_EXPORT2 |
michael@0 | 270 | RegexPattern::compile(const UnicodeString ®ex, |
michael@0 | 271 | uint32_t flags, |
michael@0 | 272 | UParseError &pe, |
michael@0 | 273 | UErrorCode &status) |
michael@0 | 274 | { |
michael@0 | 275 | if (U_FAILURE(status)) { |
michael@0 | 276 | return NULL; |
michael@0 | 277 | } |
michael@0 | 278 | |
michael@0 | 279 | const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | |
michael@0 | 280 | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | |
michael@0 | 281 | UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; |
michael@0 | 282 | |
michael@0 | 283 | if ((flags & ~allFlags) != 0) { |
michael@0 | 284 | status = U_REGEX_INVALID_FLAG; |
michael@0 | 285 | return NULL; |
michael@0 | 286 | } |
michael@0 | 287 | |
michael@0 | 288 | if ((flags & UREGEX_CANON_EQ) != 0) { |
michael@0 | 289 | status = U_REGEX_UNIMPLEMENTED; |
michael@0 | 290 | return NULL; |
michael@0 | 291 | } |
michael@0 | 292 | |
michael@0 | 293 | RegexPattern *This = new RegexPattern; |
michael@0 | 294 | if (This == NULL) { |
michael@0 | 295 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 296 | return NULL; |
michael@0 | 297 | } |
michael@0 | 298 | if (U_FAILURE(This->fDeferredStatus)) { |
michael@0 | 299 | status = This->fDeferredStatus; |
michael@0 | 300 | delete This; |
michael@0 | 301 | return NULL; |
michael@0 | 302 | } |
michael@0 | 303 | This->fFlags = flags; |
michael@0 | 304 | |
michael@0 | 305 | RegexCompile compiler(This, status); |
michael@0 | 306 | compiler.compile(regex, pe, status); |
michael@0 | 307 | |
michael@0 | 308 | if (U_FAILURE(status)) { |
michael@0 | 309 | delete This; |
michael@0 | 310 | This = NULL; |
michael@0 | 311 | } |
michael@0 | 312 | |
michael@0 | 313 | return This; |
michael@0 | 314 | } |
michael@0 | 315 | |
michael@0 | 316 | |
michael@0 | 317 | // |
michael@0 | 318 | // compile, UText mode |
michael@0 | 319 | // |
michael@0 | 320 | RegexPattern * U_EXPORT2 |
michael@0 | 321 | RegexPattern::compile(UText *regex, |
michael@0 | 322 | uint32_t flags, |
michael@0 | 323 | UParseError &pe, |
michael@0 | 324 | UErrorCode &status) |
michael@0 | 325 | { |
michael@0 | 326 | if (U_FAILURE(status)) { |
michael@0 | 327 | return NULL; |
michael@0 | 328 | } |
michael@0 | 329 | |
michael@0 | 330 | const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | |
michael@0 | 331 | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | |
michael@0 | 332 | UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; |
michael@0 | 333 | |
michael@0 | 334 | if ((flags & ~allFlags) != 0) { |
michael@0 | 335 | status = U_REGEX_INVALID_FLAG; |
michael@0 | 336 | return NULL; |
michael@0 | 337 | } |
michael@0 | 338 | |
michael@0 | 339 | if ((flags & UREGEX_CANON_EQ) != 0) { |
michael@0 | 340 | status = U_REGEX_UNIMPLEMENTED; |
michael@0 | 341 | return NULL; |
michael@0 | 342 | } |
michael@0 | 343 | |
michael@0 | 344 | RegexPattern *This = new RegexPattern; |
michael@0 | 345 | if (This == NULL) { |
michael@0 | 346 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 347 | return NULL; |
michael@0 | 348 | } |
michael@0 | 349 | if (U_FAILURE(This->fDeferredStatus)) { |
michael@0 | 350 | status = This->fDeferredStatus; |
michael@0 | 351 | delete This; |
michael@0 | 352 | return NULL; |
michael@0 | 353 | } |
michael@0 | 354 | This->fFlags = flags; |
michael@0 | 355 | |
michael@0 | 356 | RegexCompile compiler(This, status); |
michael@0 | 357 | compiler.compile(regex, pe, status); |
michael@0 | 358 | |
michael@0 | 359 | if (U_FAILURE(status)) { |
michael@0 | 360 | delete This; |
michael@0 | 361 | This = NULL; |
michael@0 | 362 | } |
michael@0 | 363 | |
michael@0 | 364 | return This; |
michael@0 | 365 | } |
michael@0 | 366 | |
michael@0 | 367 | // |
michael@0 | 368 | // compile with default flags. |
michael@0 | 369 | // |
michael@0 | 370 | RegexPattern * U_EXPORT2 |
michael@0 | 371 | RegexPattern::compile(const UnicodeString ®ex, |
michael@0 | 372 | UParseError &pe, |
michael@0 | 373 | UErrorCode &err) |
michael@0 | 374 | { |
michael@0 | 375 | return compile(regex, 0, pe, err); |
michael@0 | 376 | } |
michael@0 | 377 | |
michael@0 | 378 | |
michael@0 | 379 | // |
michael@0 | 380 | // compile with default flags, UText mode |
michael@0 | 381 | // |
michael@0 | 382 | RegexPattern * U_EXPORT2 |
michael@0 | 383 | RegexPattern::compile(UText *regex, |
michael@0 | 384 | UParseError &pe, |
michael@0 | 385 | UErrorCode &err) |
michael@0 | 386 | { |
michael@0 | 387 | return compile(regex, 0, pe, err); |
michael@0 | 388 | } |
michael@0 | 389 | |
michael@0 | 390 | |
michael@0 | 391 | // |
michael@0 | 392 | // compile with no UParseError parameter. |
michael@0 | 393 | // |
michael@0 | 394 | RegexPattern * U_EXPORT2 |
michael@0 | 395 | RegexPattern::compile(const UnicodeString ®ex, |
michael@0 | 396 | uint32_t flags, |
michael@0 | 397 | UErrorCode &err) |
michael@0 | 398 | { |
michael@0 | 399 | UParseError pe; |
michael@0 | 400 | return compile(regex, flags, pe, err); |
michael@0 | 401 | } |
michael@0 | 402 | |
michael@0 | 403 | |
michael@0 | 404 | // |
michael@0 | 405 | // compile with no UParseError parameter, UText mode |
michael@0 | 406 | // |
michael@0 | 407 | RegexPattern * U_EXPORT2 |
michael@0 | 408 | RegexPattern::compile(UText *regex, |
michael@0 | 409 | uint32_t flags, |
michael@0 | 410 | UErrorCode &err) |
michael@0 | 411 | { |
michael@0 | 412 | UParseError pe; |
michael@0 | 413 | return compile(regex, flags, pe, err); |
michael@0 | 414 | } |
michael@0 | 415 | |
michael@0 | 416 | |
michael@0 | 417 | //--------------------------------------------------------------------- |
michael@0 | 418 | // |
michael@0 | 419 | // flags |
michael@0 | 420 | // |
michael@0 | 421 | //--------------------------------------------------------------------- |
michael@0 | 422 | uint32_t RegexPattern::flags() const { |
michael@0 | 423 | return fFlags; |
michael@0 | 424 | } |
michael@0 | 425 | |
michael@0 | 426 | |
michael@0 | 427 | //--------------------------------------------------------------------- |
michael@0 | 428 | // |
michael@0 | 429 | // matcher(UnicodeString, err) |
michael@0 | 430 | // |
michael@0 | 431 | //--------------------------------------------------------------------- |
michael@0 | 432 | RegexMatcher *RegexPattern::matcher(const UnicodeString &input, |
michael@0 | 433 | UErrorCode &status) const { |
michael@0 | 434 | RegexMatcher *retMatcher = matcher(status); |
michael@0 | 435 | if (retMatcher != NULL) { |
michael@0 | 436 | retMatcher->fDeferredStatus = status; |
michael@0 | 437 | retMatcher->reset(input); |
michael@0 | 438 | } |
michael@0 | 439 | return retMatcher; |
michael@0 | 440 | } |
michael@0 | 441 | |
michael@0 | 442 | |
michael@0 | 443 | //--------------------------------------------------------------------- |
michael@0 | 444 | // |
michael@0 | 445 | // matcher(status) |
michael@0 | 446 | // |
michael@0 | 447 | //--------------------------------------------------------------------- |
michael@0 | 448 | RegexMatcher *RegexPattern::matcher(UErrorCode &status) const { |
michael@0 | 449 | RegexMatcher *retMatcher = NULL; |
michael@0 | 450 | |
michael@0 | 451 | if (U_FAILURE(status)) { |
michael@0 | 452 | return NULL; |
michael@0 | 453 | } |
michael@0 | 454 | if (U_FAILURE(fDeferredStatus)) { |
michael@0 | 455 | status = fDeferredStatus; |
michael@0 | 456 | return NULL; |
michael@0 | 457 | } |
michael@0 | 458 | |
michael@0 | 459 | retMatcher = new RegexMatcher(this); |
michael@0 | 460 | if (retMatcher == NULL) { |
michael@0 | 461 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 462 | return NULL; |
michael@0 | 463 | } |
michael@0 | 464 | return retMatcher; |
michael@0 | 465 | } |
michael@0 | 466 | |
michael@0 | 467 | |
michael@0 | 468 | |
michael@0 | 469 | //--------------------------------------------------------------------- |
michael@0 | 470 | // |
michael@0 | 471 | // matches Convenience function to test for a match, starting |
michael@0 | 472 | // with a pattern string and a data string. |
michael@0 | 473 | // |
michael@0 | 474 | //--------------------------------------------------------------------- |
michael@0 | 475 | UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex, |
michael@0 | 476 | const UnicodeString &input, |
michael@0 | 477 | UParseError &pe, |
michael@0 | 478 | UErrorCode &status) { |
michael@0 | 479 | |
michael@0 | 480 | if (U_FAILURE(status)) {return FALSE;} |
michael@0 | 481 | |
michael@0 | 482 | UBool retVal; |
michael@0 | 483 | RegexPattern *pat = NULL; |
michael@0 | 484 | RegexMatcher *matcher = NULL; |
michael@0 | 485 | |
michael@0 | 486 | pat = RegexPattern::compile(regex, 0, pe, status); |
michael@0 | 487 | matcher = pat->matcher(input, status); |
michael@0 | 488 | retVal = matcher->matches(status); |
michael@0 | 489 | |
michael@0 | 490 | delete matcher; |
michael@0 | 491 | delete pat; |
michael@0 | 492 | return retVal; |
michael@0 | 493 | } |
michael@0 | 494 | |
michael@0 | 495 | |
michael@0 | 496 | // |
michael@0 | 497 | // matches, UText mode |
michael@0 | 498 | // |
michael@0 | 499 | UBool U_EXPORT2 RegexPattern::matches(UText *regex, |
michael@0 | 500 | UText *input, |
michael@0 | 501 | UParseError &pe, |
michael@0 | 502 | UErrorCode &status) { |
michael@0 | 503 | |
michael@0 | 504 | if (U_FAILURE(status)) {return FALSE;} |
michael@0 | 505 | |
michael@0 | 506 | UBool retVal = FALSE; |
michael@0 | 507 | RegexPattern *pat = NULL; |
michael@0 | 508 | RegexMatcher *matcher = NULL; |
michael@0 | 509 | |
michael@0 | 510 | pat = RegexPattern::compile(regex, 0, pe, status); |
michael@0 | 511 | matcher = pat->matcher(status); |
michael@0 | 512 | if (U_SUCCESS(status)) { |
michael@0 | 513 | matcher->reset(input); |
michael@0 | 514 | retVal = matcher->matches(status); |
michael@0 | 515 | } |
michael@0 | 516 | |
michael@0 | 517 | delete matcher; |
michael@0 | 518 | delete pat; |
michael@0 | 519 | return retVal; |
michael@0 | 520 | } |
michael@0 | 521 | |
michael@0 | 522 | |
michael@0 | 523 | |
michael@0 | 524 | |
michael@0 | 525 | |
michael@0 | 526 | //--------------------------------------------------------------------- |
michael@0 | 527 | // |
michael@0 | 528 | // pattern |
michael@0 | 529 | // |
michael@0 | 530 | //--------------------------------------------------------------------- |
michael@0 | 531 | UnicodeString RegexPattern::pattern() const { |
michael@0 | 532 | if (fPatternString != NULL) { |
michael@0 | 533 | return *fPatternString; |
michael@0 | 534 | } else if (fPattern == NULL) { |
michael@0 | 535 | return UnicodeString(); |
michael@0 | 536 | } else { |
michael@0 | 537 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 538 | int64_t nativeLen = utext_nativeLength(fPattern); |
michael@0 | 539 | int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error |
michael@0 | 540 | UnicodeString result; |
michael@0 | 541 | |
michael@0 | 542 | status = U_ZERO_ERROR; |
michael@0 | 543 | UChar *resultChars = result.getBuffer(len16); |
michael@0 | 544 | utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning |
michael@0 | 545 | result.releaseBuffer(len16); |
michael@0 | 546 | |
michael@0 | 547 | return result; |
michael@0 | 548 | } |
michael@0 | 549 | } |
michael@0 | 550 | |
michael@0 | 551 | |
michael@0 | 552 | |
michael@0 | 553 | |
michael@0 | 554 | //--------------------------------------------------------------------- |
michael@0 | 555 | // |
michael@0 | 556 | // patternText |
michael@0 | 557 | // |
michael@0 | 558 | //--------------------------------------------------------------------- |
michael@0 | 559 | UText *RegexPattern::patternText(UErrorCode &status) const { |
michael@0 | 560 | if (U_FAILURE(status)) {return NULL;} |
michael@0 | 561 | status = U_ZERO_ERROR; |
michael@0 | 562 | |
michael@0 | 563 | if (fPattern != NULL) { |
michael@0 | 564 | return fPattern; |
michael@0 | 565 | } else { |
michael@0 | 566 | RegexStaticSets::initGlobals(&status); |
michael@0 | 567 | return RegexStaticSets::gStaticSets->fEmptyText; |
michael@0 | 568 | } |
michael@0 | 569 | } |
michael@0 | 570 | |
michael@0 | 571 | |
michael@0 | 572 | |
michael@0 | 573 | //--------------------------------------------------------------------- |
michael@0 | 574 | // |
michael@0 | 575 | // split |
michael@0 | 576 | // |
michael@0 | 577 | //--------------------------------------------------------------------- |
michael@0 | 578 | int32_t RegexPattern::split(const UnicodeString &input, |
michael@0 | 579 | UnicodeString dest[], |
michael@0 | 580 | int32_t destCapacity, |
michael@0 | 581 | UErrorCode &status) const |
michael@0 | 582 | { |
michael@0 | 583 | if (U_FAILURE(status)) { |
michael@0 | 584 | return 0; |
michael@0 | 585 | }; |
michael@0 | 586 | |
michael@0 | 587 | RegexMatcher m(this); |
michael@0 | 588 | int32_t r = 0; |
michael@0 | 589 | // Check m's status to make sure all is ok. |
michael@0 | 590 | if (U_SUCCESS(m.fDeferredStatus)) { |
michael@0 | 591 | r = m.split(input, dest, destCapacity, status); |
michael@0 | 592 | } |
michael@0 | 593 | return r; |
michael@0 | 594 | } |
michael@0 | 595 | |
michael@0 | 596 | // |
michael@0 | 597 | // split, UText mode |
michael@0 | 598 | // |
michael@0 | 599 | int32_t RegexPattern::split(UText *input, |
michael@0 | 600 | UText *dest[], |
michael@0 | 601 | int32_t destCapacity, |
michael@0 | 602 | UErrorCode &status) const |
michael@0 | 603 | { |
michael@0 | 604 | if (U_FAILURE(status)) { |
michael@0 | 605 | return 0; |
michael@0 | 606 | }; |
michael@0 | 607 | |
michael@0 | 608 | RegexMatcher m(this); |
michael@0 | 609 | int32_t r = 0; |
michael@0 | 610 | // Check m's status to make sure all is ok. |
michael@0 | 611 | if (U_SUCCESS(m.fDeferredStatus)) { |
michael@0 | 612 | r = m.split(input, dest, destCapacity, status); |
michael@0 | 613 | } |
michael@0 | 614 | return r; |
michael@0 | 615 | } |
michael@0 | 616 | |
michael@0 | 617 | |
michael@0 | 618 | |
michael@0 | 619 | //--------------------------------------------------------------------- |
michael@0 | 620 | // |
michael@0 | 621 | // dump Output the compiled form of the pattern. |
michael@0 | 622 | // Debugging function only. |
michael@0 | 623 | // |
michael@0 | 624 | //--------------------------------------------------------------------- |
michael@0 | 625 | #if defined(REGEX_DEBUG) |
michael@0 | 626 | void RegexPattern::dumpOp(int32_t index) const { |
michael@0 | 627 | static const char * const opNames[] = {URX_OPCODE_NAMES}; |
michael@0 | 628 | int32_t op = fCompiledPat->elementAti(index); |
michael@0 | 629 | int32_t val = URX_VAL(op); |
michael@0 | 630 | int32_t type = URX_TYPE(op); |
michael@0 | 631 | int32_t pinnedType = type; |
michael@0 | 632 | if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) { |
michael@0 | 633 | pinnedType = 0; |
michael@0 | 634 | } |
michael@0 | 635 | |
michael@0 | 636 | REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[pinnedType])); |
michael@0 | 637 | switch (type) { |
michael@0 | 638 | case URX_NOP: |
michael@0 | 639 | case URX_DOTANY: |
michael@0 | 640 | case URX_DOTANY_ALL: |
michael@0 | 641 | case URX_FAIL: |
michael@0 | 642 | case URX_CARET: |
michael@0 | 643 | case URX_DOLLAR: |
michael@0 | 644 | case URX_BACKSLASH_G: |
michael@0 | 645 | case URX_BACKSLASH_X: |
michael@0 | 646 | case URX_END: |
michael@0 | 647 | case URX_DOLLAR_M: |
michael@0 | 648 | case URX_CARET_M: |
michael@0 | 649 | // Types with no operand field of interest. |
michael@0 | 650 | break; |
michael@0 | 651 | |
michael@0 | 652 | case URX_RESERVED_OP: |
michael@0 | 653 | case URX_START_CAPTURE: |
michael@0 | 654 | case URX_END_CAPTURE: |
michael@0 | 655 | case URX_STATE_SAVE: |
michael@0 | 656 | case URX_JMP: |
michael@0 | 657 | case URX_JMP_SAV: |
michael@0 | 658 | case URX_JMP_SAV_X: |
michael@0 | 659 | case URX_BACKSLASH_B: |
michael@0 | 660 | case URX_BACKSLASH_BU: |
michael@0 | 661 | case URX_BACKSLASH_D: |
michael@0 | 662 | case URX_BACKSLASH_Z: |
michael@0 | 663 | case URX_STRING_LEN: |
michael@0 | 664 | case URX_CTR_INIT: |
michael@0 | 665 | case URX_CTR_INIT_NG: |
michael@0 | 666 | case URX_CTR_LOOP: |
michael@0 | 667 | case URX_CTR_LOOP_NG: |
michael@0 | 668 | case URX_RELOC_OPRND: |
michael@0 | 669 | case URX_STO_SP: |
michael@0 | 670 | case URX_LD_SP: |
michael@0 | 671 | case URX_BACKREF: |
michael@0 | 672 | case URX_STO_INP_LOC: |
michael@0 | 673 | case URX_JMPX: |
michael@0 | 674 | case URX_LA_START: |
michael@0 | 675 | case URX_LA_END: |
michael@0 | 676 | case URX_BACKREF_I: |
michael@0 | 677 | case URX_LB_START: |
michael@0 | 678 | case URX_LB_CONT: |
michael@0 | 679 | case URX_LB_END: |
michael@0 | 680 | case URX_LBN_CONT: |
michael@0 | 681 | case URX_LBN_END: |
michael@0 | 682 | case URX_LOOP_C: |
michael@0 | 683 | case URX_LOOP_DOT_I: |
michael@0 | 684 | // types with an integer operand field. |
michael@0 | 685 | REGEX_DUMP_DEBUG_PRINTF(("%d", val)); |
michael@0 | 686 | break; |
michael@0 | 687 | |
michael@0 | 688 | case URX_ONECHAR: |
michael@0 | 689 | case URX_ONECHAR_I: |
michael@0 | 690 | REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?')); |
michael@0 | 691 | break; |
michael@0 | 692 | |
michael@0 | 693 | case URX_STRING: |
michael@0 | 694 | case URX_STRING_I: |
michael@0 | 695 | { |
michael@0 | 696 | int32_t lengthOp = fCompiledPat->elementAti(index+1); |
michael@0 | 697 | U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN); |
michael@0 | 698 | int32_t length = URX_VAL(lengthOp); |
michael@0 | 699 | int32_t i; |
michael@0 | 700 | for (i=val; i<val+length; i++) { |
michael@0 | 701 | UChar c = fLiteralText[i]; |
michael@0 | 702 | if (c < 32 || c >= 256) {c = '.';} |
michael@0 | 703 | REGEX_DUMP_DEBUG_PRINTF(("%c", c)); |
michael@0 | 704 | } |
michael@0 | 705 | } |
michael@0 | 706 | break; |
michael@0 | 707 | |
michael@0 | 708 | case URX_SETREF: |
michael@0 | 709 | case URX_LOOP_SR_I: |
michael@0 | 710 | { |
michael@0 | 711 | UnicodeString s; |
michael@0 | 712 | UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); |
michael@0 | 713 | set->toPattern(s, TRUE); |
michael@0 | 714 | for (int32_t i=0; i<s.length(); i++) { |
michael@0 | 715 | REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i))); |
michael@0 | 716 | } |
michael@0 | 717 | } |
michael@0 | 718 | break; |
michael@0 | 719 | |
michael@0 | 720 | case URX_STATIC_SETREF: |
michael@0 | 721 | case URX_STAT_SETREF_N: |
michael@0 | 722 | { |
michael@0 | 723 | UnicodeString s; |
michael@0 | 724 | if (val & URX_NEG_SET) { |
michael@0 | 725 | REGEX_DUMP_DEBUG_PRINTF(("NOT ")); |
michael@0 | 726 | val &= ~URX_NEG_SET; |
michael@0 | 727 | } |
michael@0 | 728 | UnicodeSet *set = fStaticSets[val]; |
michael@0 | 729 | set->toPattern(s, TRUE); |
michael@0 | 730 | for (int32_t i=0; i<s.length(); i++) { |
michael@0 | 731 | REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i))); |
michael@0 | 732 | } |
michael@0 | 733 | } |
michael@0 | 734 | break; |
michael@0 | 735 | |
michael@0 | 736 | |
michael@0 | 737 | default: |
michael@0 | 738 | REGEX_DUMP_DEBUG_PRINTF(("??????")); |
michael@0 | 739 | break; |
michael@0 | 740 | } |
michael@0 | 741 | REGEX_DUMP_DEBUG_PRINTF(("\n")); |
michael@0 | 742 | } |
michael@0 | 743 | #endif |
michael@0 | 744 | |
michael@0 | 745 | |
michael@0 | 746 | #if defined(REGEX_DEBUG) |
michael@0 | 747 | U_CAPI void U_EXPORT2 |
michael@0 | 748 | RegexPatternDump(const RegexPattern *This) { |
michael@0 | 749 | int index; |
michael@0 | 750 | int i; |
michael@0 | 751 | |
michael@0 | 752 | REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: ")); |
michael@0 | 753 | UChar32 c = utext_next32From(This->fPattern, 0); |
michael@0 | 754 | while (c != U_SENTINEL) { |
michael@0 | 755 | if (c<32 || c>256) { |
michael@0 | 756 | c = '.'; |
michael@0 | 757 | } |
michael@0 | 758 | REGEX_DUMP_DEBUG_PRINTF(("%c", c)); |
michael@0 | 759 | |
michael@0 | 760 | c = UTEXT_NEXT32(This->fPattern); |
michael@0 | 761 | } |
michael@0 | 762 | REGEX_DUMP_DEBUG_PRINTF(("\n")); |
michael@0 | 763 | REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen)); |
michael@0 | 764 | REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType))); |
michael@0 | 765 | if (This->fStartType == START_STRING) { |
michael@0 | 766 | REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \"")); |
michael@0 | 767 | for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) { |
michael@0 | 768 | REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i])); // TODO: non-printables, surrogates. |
michael@0 | 769 | } |
michael@0 | 770 | REGEX_DUMP_DEBUG_PRINTF(("\"\n")); |
michael@0 | 771 | |
michael@0 | 772 | } else if (This->fStartType == START_SET) { |
michael@0 | 773 | int32_t numSetChars = This->fInitialChars->size(); |
michael@0 | 774 | if (numSetChars > 20) { |
michael@0 | 775 | numSetChars = 20; |
michael@0 | 776 | } |
michael@0 | 777 | REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : ")); |
michael@0 | 778 | for (i=0; i<numSetChars; i++) { |
michael@0 | 779 | UChar32 c = This->fInitialChars->charAt(i); |
michael@0 | 780 | if (0x20<c && c <0x7e) { |
michael@0 | 781 | REGEX_DUMP_DEBUG_PRINTF(("%c ", c)); |
michael@0 | 782 | } else { |
michael@0 | 783 | REGEX_DUMP_DEBUG_PRINTF(("%#x ", c)); |
michael@0 | 784 | } |
michael@0 | 785 | } |
michael@0 | 786 | if (numSetChars < This->fInitialChars->size()) { |
michael@0 | 787 | REGEX_DUMP_DEBUG_PRINTF((" ...")); |
michael@0 | 788 | } |
michael@0 | 789 | REGEX_DUMP_DEBUG_PRINTF(("\n")); |
michael@0 | 790 | |
michael@0 | 791 | } else if (This->fStartType == START_CHAR) { |
michael@0 | 792 | REGEX_DUMP_DEBUG_PRINTF((" First char of Match : ")); |
michael@0 | 793 | if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) { |
michael@0 | 794 | REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar)); |
michael@0 | 795 | } else { |
michael@0 | 796 | REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar)); |
michael@0 | 797 | } |
michael@0 | 798 | } |
michael@0 | 799 | |
michael@0 | 800 | REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \ |
michael@0 | 801 | "-------------------------------------------\n")); |
michael@0 | 802 | for (index = 0; index<This->fCompiledPat->size(); index++) { |
michael@0 | 803 | This->dumpOp(index); |
michael@0 | 804 | } |
michael@0 | 805 | REGEX_DUMP_DEBUG_PRINTF(("\n\n")); |
michael@0 | 806 | } |
michael@0 | 807 | #endif |
michael@0 | 808 | |
michael@0 | 809 | |
michael@0 | 810 | |
michael@0 | 811 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern) |
michael@0 | 812 | |
michael@0 | 813 | U_NAMESPACE_END |
michael@0 | 814 | #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |