intl/icu/source/i18n/repattrn.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 //
michael@0 2 // file: repattrn.cpp
michael@0 3 //
michael@0 4 /*
michael@0 5 ***************************************************************************
michael@0 6 * Copyright (C) 2002-2012 International Business Machines Corporation *
michael@0 7 * and others. All rights reserved. *
michael@0 8 ***************************************************************************
michael@0 9 */
michael@0 10
michael@0 11 #include "unicode/utypes.h"
michael@0 12
michael@0 13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
michael@0 14
michael@0 15 #include "unicode/regex.h"
michael@0 16 #include "unicode/uclean.h"
michael@0 17 #include "uassert.h"
michael@0 18 #include "uvector.h"
michael@0 19 #include "uvectr32.h"
michael@0 20 #include "uvectr64.h"
michael@0 21 #include "regexcmp.h"
michael@0 22 #include "regeximp.h"
michael@0 23 #include "regexst.h"
michael@0 24
michael@0 25 U_NAMESPACE_BEGIN
michael@0 26
michael@0 27 //--------------------------------------------------------------------------
michael@0 28 //
michael@0 29 // RegexPattern Default Constructor
michael@0 30 //
michael@0 31 //--------------------------------------------------------------------------
michael@0 32 RegexPattern::RegexPattern() {
michael@0 33 // Init all of this instances data.
michael@0 34 init();
michael@0 35 }
michael@0 36
michael@0 37
michael@0 38 //--------------------------------------------------------------------------
michael@0 39 //
michael@0 40 // Copy Constructor Note: This is a rather inefficient implementation,
michael@0 41 // but it probably doesn't matter.
michael@0 42 //
michael@0 43 //--------------------------------------------------------------------------
michael@0 44 RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) {
michael@0 45 init();
michael@0 46 *this = other;
michael@0 47 }
michael@0 48
michael@0 49
michael@0 50
michael@0 51 //--------------------------------------------------------------------------
michael@0 52 //
michael@0 53 // Assignment Operator
michael@0 54 //
michael@0 55 //--------------------------------------------------------------------------
michael@0 56 RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
michael@0 57 if (this == &other) {
michael@0 58 // Source and destination are the same. Don't do anything.
michael@0 59 return *this;
michael@0 60 }
michael@0 61
michael@0 62 // Clean out any previous contents of object being assigned to.
michael@0 63 zap();
michael@0 64
michael@0 65 // Give target object a default initialization
michael@0 66 init();
michael@0 67
michael@0 68 // Copy simple fields
michael@0 69 if ( other.fPatternString == NULL ) {
michael@0 70 fPatternString = NULL;
michael@0 71 fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus);
michael@0 72 } else {
michael@0 73 fPatternString = new UnicodeString(*(other.fPatternString));
michael@0 74 UErrorCode status = U_ZERO_ERROR;
michael@0 75 fPattern = utext_openConstUnicodeString(NULL, fPatternString, &status);
michael@0 76 if (U_FAILURE(status)) {
michael@0 77 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 78 return *this;
michael@0 79 }
michael@0 80 }
michael@0 81 fFlags = other.fFlags;
michael@0 82 fLiteralText = other.fLiteralText;
michael@0 83 fDeferredStatus = other.fDeferredStatus;
michael@0 84 fMinMatchLen = other.fMinMatchLen;
michael@0 85 fFrameSize = other.fFrameSize;
michael@0 86 fDataSize = other.fDataSize;
michael@0 87 fMaxCaptureDigits = other.fMaxCaptureDigits;
michael@0 88 fStaticSets = other.fStaticSets;
michael@0 89 fStaticSets8 = other.fStaticSets8;
michael@0 90
michael@0 91 fStartType = other.fStartType;
michael@0 92 fInitialStringIdx = other.fInitialStringIdx;
michael@0 93 fInitialStringLen = other.fInitialStringLen;
michael@0 94 *fInitialChars = *other.fInitialChars;
michael@0 95 fInitialChar = other.fInitialChar;
michael@0 96 *fInitialChars8 = *other.fInitialChars8;
michael@0 97 fNeedsAltInput = other.fNeedsAltInput;
michael@0 98
michael@0 99 // Copy the pattern. It's just values, nothing deep to copy.
michael@0 100 fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
michael@0 101 fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
michael@0 102
michael@0 103 // Copy the Unicode Sets.
michael@0 104 // Could be made more efficient if the sets were reference counted and shared,
michael@0 105 // but I doubt that pattern copying will be particularly common.
michael@0 106 // Note: init() already added an empty element zero to fSets
michael@0 107 int32_t i;
michael@0 108 int32_t numSets = other.fSets->size();
michael@0 109 fSets8 = new Regex8BitSet[numSets];
michael@0 110 if (fSets8 == NULL) {
michael@0 111 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 112 return *this;
michael@0 113 }
michael@0 114 for (i=1; i<numSets; i++) {
michael@0 115 if (U_FAILURE(fDeferredStatus)) {
michael@0 116 return *this;
michael@0 117 }
michael@0 118 UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
michael@0 119 UnicodeSet *newSet = new UnicodeSet(*sourceSet);
michael@0 120 if (newSet == NULL) {
michael@0 121 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 122 break;
michael@0 123 }
michael@0 124 fSets->addElement(newSet, fDeferredStatus);
michael@0 125 fSets8[i] = other.fSets8[i];
michael@0 126 }
michael@0 127
michael@0 128 return *this;
michael@0 129 }
michael@0 130
michael@0 131
michael@0 132 //--------------------------------------------------------------------------
michael@0 133 //
michael@0 134 // init Shared initialization for use by constructors.
michael@0 135 // Bring an uninitialized RegexPattern up to a default state.
michael@0 136 //
michael@0 137 //--------------------------------------------------------------------------
michael@0 138 void RegexPattern::init() {
michael@0 139 fFlags = 0;
michael@0 140 fCompiledPat = 0;
michael@0 141 fLiteralText.remove();
michael@0 142 fSets = NULL;
michael@0 143 fSets8 = NULL;
michael@0 144 fDeferredStatus = U_ZERO_ERROR;
michael@0 145 fMinMatchLen = 0;
michael@0 146 fFrameSize = 0;
michael@0 147 fDataSize = 0;
michael@0 148 fGroupMap = NULL;
michael@0 149 fMaxCaptureDigits = 1;
michael@0 150 fStaticSets = NULL;
michael@0 151 fStaticSets8 = NULL;
michael@0 152 fStartType = START_NO_INFO;
michael@0 153 fInitialStringIdx = 0;
michael@0 154 fInitialStringLen = 0;
michael@0 155 fInitialChars = NULL;
michael@0 156 fInitialChar = 0;
michael@0 157 fInitialChars8 = NULL;
michael@0 158 fNeedsAltInput = FALSE;
michael@0 159
michael@0 160 fPattern = NULL; // will be set later
michael@0 161 fPatternString = NULL; // may be set later
michael@0 162 fCompiledPat = new UVector64(fDeferredStatus);
michael@0 163 fGroupMap = new UVector32(fDeferredStatus);
michael@0 164 fSets = new UVector(fDeferredStatus);
michael@0 165 fInitialChars = new UnicodeSet;
michael@0 166 fInitialChars8 = new Regex8BitSet;
michael@0 167 if (U_FAILURE(fDeferredStatus)) {
michael@0 168 return;
michael@0 169 }
michael@0 170 if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
michael@0 171 fInitialChars == NULL || fInitialChars8 == NULL) {
michael@0 172 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
michael@0 173 return;
michael@0 174 }
michael@0 175
michael@0 176 // Slot zero of the vector of sets is reserved. Fill it here.
michael@0 177 fSets->addElement((int32_t)0, fDeferredStatus);
michael@0 178 }
michael@0 179
michael@0 180
michael@0 181 //--------------------------------------------------------------------------
michael@0 182 //
michael@0 183 // zap Delete everything owned by this RegexPattern.
michael@0 184 //
michael@0 185 //--------------------------------------------------------------------------
michael@0 186 void RegexPattern::zap() {
michael@0 187 delete fCompiledPat;
michael@0 188 fCompiledPat = NULL;
michael@0 189 int i;
michael@0 190 for (i=1; i<fSets->size(); i++) {
michael@0 191 UnicodeSet *s;
michael@0 192 s = (UnicodeSet *)fSets->elementAt(i);
michael@0 193 if (s != NULL) {
michael@0 194 delete s;
michael@0 195 }
michael@0 196 }
michael@0 197 delete fSets;
michael@0 198 fSets = NULL;
michael@0 199 delete[] fSets8;
michael@0 200 fSets8 = NULL;
michael@0 201 delete fGroupMap;
michael@0 202 fGroupMap = NULL;
michael@0 203 delete fInitialChars;
michael@0 204 fInitialChars = NULL;
michael@0 205 delete fInitialChars8;
michael@0 206 fInitialChars8 = NULL;
michael@0 207 if (fPattern != NULL) {
michael@0 208 utext_close(fPattern);
michael@0 209 fPattern = NULL;
michael@0 210 }
michael@0 211 if (fPatternString != NULL) {
michael@0 212 delete fPatternString;
michael@0 213 fPatternString = NULL;
michael@0 214 }
michael@0 215 }
michael@0 216
michael@0 217
michael@0 218 //--------------------------------------------------------------------------
michael@0 219 //
michael@0 220 // Destructor
michael@0 221 //
michael@0 222 //--------------------------------------------------------------------------
michael@0 223 RegexPattern::~RegexPattern() {
michael@0 224 zap();
michael@0 225 }
michael@0 226
michael@0 227
michael@0 228 //--------------------------------------------------------------------------
michael@0 229 //
michael@0 230 // Clone
michael@0 231 //
michael@0 232 //--------------------------------------------------------------------------
michael@0 233 RegexPattern *RegexPattern::clone() const {
michael@0 234 RegexPattern *copy = new RegexPattern(*this);
michael@0 235 return copy;
michael@0 236 }
michael@0 237
michael@0 238
michael@0 239 //--------------------------------------------------------------------------
michael@0 240 //
michael@0 241 // operator == (comparison) Consider to patterns to be == if the
michael@0 242 // pattern strings and the flags are the same.
michael@0 243 // Note that pattern strings with the same
michael@0 244 // characters can still be considered different.
michael@0 245 //
michael@0 246 //--------------------------------------------------------------------------
michael@0 247 UBool RegexPattern::operator ==(const RegexPattern &other) const {
michael@0 248 if (this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus) {
michael@0 249 if (this->fPatternString != NULL && other.fPatternString != NULL) {
michael@0 250 return *(this->fPatternString) == *(other.fPatternString);
michael@0 251 } else if (this->fPattern == NULL) {
michael@0 252 if (other.fPattern == NULL) {
michael@0 253 return TRUE;
michael@0 254 }
michael@0 255 } else if (other.fPattern != NULL) {
michael@0 256 UTEXT_SETNATIVEINDEX(this->fPattern, 0);
michael@0 257 UTEXT_SETNATIVEINDEX(other.fPattern, 0);
michael@0 258 return utext_equals(this->fPattern, other.fPattern);
michael@0 259 }
michael@0 260 }
michael@0 261 return FALSE;
michael@0 262 }
michael@0 263
michael@0 264 //---------------------------------------------------------------------
michael@0 265 //
michael@0 266 // compile
michael@0 267 //
michael@0 268 //---------------------------------------------------------------------
michael@0 269 RegexPattern * U_EXPORT2
michael@0 270 RegexPattern::compile(const UnicodeString &regex,
michael@0 271 uint32_t flags,
michael@0 272 UParseError &pe,
michael@0 273 UErrorCode &status)
michael@0 274 {
michael@0 275 if (U_FAILURE(status)) {
michael@0 276 return NULL;
michael@0 277 }
michael@0 278
michael@0 279 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
michael@0 280 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
michael@0 281 UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL;
michael@0 282
michael@0 283 if ((flags & ~allFlags) != 0) {
michael@0 284 status = U_REGEX_INVALID_FLAG;
michael@0 285 return NULL;
michael@0 286 }
michael@0 287
michael@0 288 if ((flags & UREGEX_CANON_EQ) != 0) {
michael@0 289 status = U_REGEX_UNIMPLEMENTED;
michael@0 290 return NULL;
michael@0 291 }
michael@0 292
michael@0 293 RegexPattern *This = new RegexPattern;
michael@0 294 if (This == NULL) {
michael@0 295 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 296 return NULL;
michael@0 297 }
michael@0 298 if (U_FAILURE(This->fDeferredStatus)) {
michael@0 299 status = This->fDeferredStatus;
michael@0 300 delete This;
michael@0 301 return NULL;
michael@0 302 }
michael@0 303 This->fFlags = flags;
michael@0 304
michael@0 305 RegexCompile compiler(This, status);
michael@0 306 compiler.compile(regex, pe, status);
michael@0 307
michael@0 308 if (U_FAILURE(status)) {
michael@0 309 delete This;
michael@0 310 This = NULL;
michael@0 311 }
michael@0 312
michael@0 313 return This;
michael@0 314 }
michael@0 315
michael@0 316
michael@0 317 //
michael@0 318 // compile, UText mode
michael@0 319 //
michael@0 320 RegexPattern * U_EXPORT2
michael@0 321 RegexPattern::compile(UText *regex,
michael@0 322 uint32_t flags,
michael@0 323 UParseError &pe,
michael@0 324 UErrorCode &status)
michael@0 325 {
michael@0 326 if (U_FAILURE(status)) {
michael@0 327 return NULL;
michael@0 328 }
michael@0 329
michael@0 330 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
michael@0 331 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
michael@0 332 UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL;
michael@0 333
michael@0 334 if ((flags & ~allFlags) != 0) {
michael@0 335 status = U_REGEX_INVALID_FLAG;
michael@0 336 return NULL;
michael@0 337 }
michael@0 338
michael@0 339 if ((flags & UREGEX_CANON_EQ) != 0) {
michael@0 340 status = U_REGEX_UNIMPLEMENTED;
michael@0 341 return NULL;
michael@0 342 }
michael@0 343
michael@0 344 RegexPattern *This = new RegexPattern;
michael@0 345 if (This == NULL) {
michael@0 346 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 347 return NULL;
michael@0 348 }
michael@0 349 if (U_FAILURE(This->fDeferredStatus)) {
michael@0 350 status = This->fDeferredStatus;
michael@0 351 delete This;
michael@0 352 return NULL;
michael@0 353 }
michael@0 354 This->fFlags = flags;
michael@0 355
michael@0 356 RegexCompile compiler(This, status);
michael@0 357 compiler.compile(regex, pe, status);
michael@0 358
michael@0 359 if (U_FAILURE(status)) {
michael@0 360 delete This;
michael@0 361 This = NULL;
michael@0 362 }
michael@0 363
michael@0 364 return This;
michael@0 365 }
michael@0 366
michael@0 367 //
michael@0 368 // compile with default flags.
michael@0 369 //
michael@0 370 RegexPattern * U_EXPORT2
michael@0 371 RegexPattern::compile(const UnicodeString &regex,
michael@0 372 UParseError &pe,
michael@0 373 UErrorCode &err)
michael@0 374 {
michael@0 375 return compile(regex, 0, pe, err);
michael@0 376 }
michael@0 377
michael@0 378
michael@0 379 //
michael@0 380 // compile with default flags, UText mode
michael@0 381 //
michael@0 382 RegexPattern * U_EXPORT2
michael@0 383 RegexPattern::compile(UText *regex,
michael@0 384 UParseError &pe,
michael@0 385 UErrorCode &err)
michael@0 386 {
michael@0 387 return compile(regex, 0, pe, err);
michael@0 388 }
michael@0 389
michael@0 390
michael@0 391 //
michael@0 392 // compile with no UParseErr parameter.
michael@0 393 //
michael@0 394 RegexPattern * U_EXPORT2
michael@0 395 RegexPattern::compile(const UnicodeString &regex,
michael@0 396 uint32_t flags,
michael@0 397 UErrorCode &err)
michael@0 398 {
michael@0 399 UParseError pe;
michael@0 400 return compile(regex, flags, pe, err);
michael@0 401 }
michael@0 402
michael@0 403
michael@0 404 //
michael@0 405 // compile with no UParseErr parameter, UText mode
michael@0 406 //
michael@0 407 RegexPattern * U_EXPORT2
michael@0 408 RegexPattern::compile(UText *regex,
michael@0 409 uint32_t flags,
michael@0 410 UErrorCode &err)
michael@0 411 {
michael@0 412 UParseError pe;
michael@0 413 return compile(regex, flags, pe, err);
michael@0 414 }
michael@0 415
michael@0 416
michael@0 417 //---------------------------------------------------------------------
michael@0 418 //
michael@0 419 // flags
michael@0 420 //
michael@0 421 //---------------------------------------------------------------------
michael@0 422 uint32_t RegexPattern::flags() const {
michael@0 423 return fFlags;
michael@0 424 }
michael@0 425
michael@0 426
michael@0 427 //---------------------------------------------------------------------
michael@0 428 //
michael@0 429 // matcher(UnicodeString, err)
michael@0 430 //
michael@0 431 //---------------------------------------------------------------------
michael@0 432 RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
michael@0 433 UErrorCode &status) const {
michael@0 434 RegexMatcher *retMatcher = matcher(status);
michael@0 435 if (retMatcher != NULL) {
michael@0 436 retMatcher->fDeferredStatus = status;
michael@0 437 retMatcher->reset(input);
michael@0 438 }
michael@0 439 return retMatcher;
michael@0 440 }
michael@0 441
michael@0 442
michael@0 443 //---------------------------------------------------------------------
michael@0 444 //
michael@0 445 // matcher(status)
michael@0 446 //
michael@0 447 //---------------------------------------------------------------------
michael@0 448 RegexMatcher *RegexPattern::matcher(UErrorCode &status) const {
michael@0 449 RegexMatcher *retMatcher = NULL;
michael@0 450
michael@0 451 if (U_FAILURE(status)) {
michael@0 452 return NULL;
michael@0 453 }
michael@0 454 if (U_FAILURE(fDeferredStatus)) {
michael@0 455 status = fDeferredStatus;
michael@0 456 return NULL;
michael@0 457 }
michael@0 458
michael@0 459 retMatcher = new RegexMatcher(this);
michael@0 460 if (retMatcher == NULL) {
michael@0 461 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 462 return NULL;
michael@0 463 }
michael@0 464 return retMatcher;
michael@0 465 }
michael@0 466
michael@0 467
michael@0 468
michael@0 469 //---------------------------------------------------------------------
michael@0 470 //
michael@0 471 // matches Convenience function to test for a match, starting
michael@0 472 // with a pattern string and a data string.
michael@0 473 //
michael@0 474 //---------------------------------------------------------------------
michael@0 475 UBool U_EXPORT2 RegexPattern::matches(const UnicodeString &regex,
michael@0 476 const UnicodeString &input,
michael@0 477 UParseError &pe,
michael@0 478 UErrorCode &status) {
michael@0 479
michael@0 480 if (U_FAILURE(status)) {return FALSE;}
michael@0 481
michael@0 482 UBool retVal;
michael@0 483 RegexPattern *pat = NULL;
michael@0 484 RegexMatcher *matcher = NULL;
michael@0 485
michael@0 486 pat = RegexPattern::compile(regex, 0, pe, status);
michael@0 487 matcher = pat->matcher(input, status);
michael@0 488 retVal = matcher->matches(status);
michael@0 489
michael@0 490 delete matcher;
michael@0 491 delete pat;
michael@0 492 return retVal;
michael@0 493 }
michael@0 494
michael@0 495
michael@0 496 //
michael@0 497 // matches, UText mode
michael@0 498 //
michael@0 499 UBool U_EXPORT2 RegexPattern::matches(UText *regex,
michael@0 500 UText *input,
michael@0 501 UParseError &pe,
michael@0 502 UErrorCode &status) {
michael@0 503
michael@0 504 if (U_FAILURE(status)) {return FALSE;}
michael@0 505
michael@0 506 UBool retVal = FALSE;
michael@0 507 RegexPattern *pat = NULL;
michael@0 508 RegexMatcher *matcher = NULL;
michael@0 509
michael@0 510 pat = RegexPattern::compile(regex, 0, pe, status);
michael@0 511 matcher = pat->matcher(status);
michael@0 512 if (U_SUCCESS(status)) {
michael@0 513 matcher->reset(input);
michael@0 514 retVal = matcher->matches(status);
michael@0 515 }
michael@0 516
michael@0 517 delete matcher;
michael@0 518 delete pat;
michael@0 519 return retVal;
michael@0 520 }
michael@0 521
michael@0 522
michael@0 523
michael@0 524
michael@0 525
michael@0 526 //---------------------------------------------------------------------
michael@0 527 //
michael@0 528 // pattern
michael@0 529 //
michael@0 530 //---------------------------------------------------------------------
michael@0 531 UnicodeString RegexPattern::pattern() const {
michael@0 532 if (fPatternString != NULL) {
michael@0 533 return *fPatternString;
michael@0 534 } else if (fPattern == NULL) {
michael@0 535 return UnicodeString();
michael@0 536 } else {
michael@0 537 UErrorCode status = U_ZERO_ERROR;
michael@0 538 int64_t nativeLen = utext_nativeLength(fPattern);
michael@0 539 int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error
michael@0 540 UnicodeString result;
michael@0 541
michael@0 542 status = U_ZERO_ERROR;
michael@0 543 UChar *resultChars = result.getBuffer(len16);
michael@0 544 utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning
michael@0 545 result.releaseBuffer(len16);
michael@0 546
michael@0 547 return result;
michael@0 548 }
michael@0 549 }
michael@0 550
michael@0 551
michael@0 552
michael@0 553
michael@0 554 //---------------------------------------------------------------------
michael@0 555 //
michael@0 556 // patternText
michael@0 557 //
michael@0 558 //---------------------------------------------------------------------
michael@0 559 UText *RegexPattern::patternText(UErrorCode &status) const {
michael@0 560 if (U_FAILURE(status)) {return NULL;}
michael@0 561 status = U_ZERO_ERROR;
michael@0 562
michael@0 563 if (fPattern != NULL) {
michael@0 564 return fPattern;
michael@0 565 } else {
michael@0 566 RegexStaticSets::initGlobals(&status);
michael@0 567 return RegexStaticSets::gStaticSets->fEmptyText;
michael@0 568 }
michael@0 569 }
michael@0 570
michael@0 571
michael@0 572
michael@0 573 //---------------------------------------------------------------------
michael@0 574 //
michael@0 575 // split
michael@0 576 //
michael@0 577 //---------------------------------------------------------------------
michael@0 578 int32_t RegexPattern::split(const UnicodeString &input,
michael@0 579 UnicodeString dest[],
michael@0 580 int32_t destCapacity,
michael@0 581 UErrorCode &status) const
michael@0 582 {
michael@0 583 if (U_FAILURE(status)) {
michael@0 584 return 0;
michael@0 585 };
michael@0 586
michael@0 587 RegexMatcher m(this);
michael@0 588 int32_t r = 0;
michael@0 589 // Check m's status to make sure all is ok.
michael@0 590 if (U_SUCCESS(m.fDeferredStatus)) {
michael@0 591 r = m.split(input, dest, destCapacity, status);
michael@0 592 }
michael@0 593 return r;
michael@0 594 }
michael@0 595
michael@0 596 //
michael@0 597 // split, UText mode
michael@0 598 //
michael@0 599 int32_t RegexPattern::split(UText *input,
michael@0 600 UText *dest[],
michael@0 601 int32_t destCapacity,
michael@0 602 UErrorCode &status) const
michael@0 603 {
michael@0 604 if (U_FAILURE(status)) {
michael@0 605 return 0;
michael@0 606 };
michael@0 607
michael@0 608 RegexMatcher m(this);
michael@0 609 int32_t r = 0;
michael@0 610 // Check m's status to make sure all is ok.
michael@0 611 if (U_SUCCESS(m.fDeferredStatus)) {
michael@0 612 r = m.split(input, dest, destCapacity, status);
michael@0 613 }
michael@0 614 return r;
michael@0 615 }
michael@0 616
michael@0 617
michael@0 618
//---------------------------------------------------------------------
//
//   dumpOp    Print one instruction of the compiled pattern: its index,
//             raw value, opcode name and, where meaningful, its operand.
//             Debugging function only.
//
//---------------------------------------------------------------------
#if defined(REGEX_DEBUG)
void RegexPattern::dumpOp(int32_t index) const {
    static const char * const opNames[] = {URX_OPCODE_NAMES};

    int32_t instruction = fCompiledPat->elementAti(index);
    int32_t val         = URX_VAL(instruction);
    int32_t type        = URX_TYPE(instruction);

    // An opcode type outside the name table is displayed with name slot zero.
    int32_t nameIndex = ((uint32_t)type < sizeof(opNames)/sizeof(char *)) ? type : 0;

    REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, instruction, opNames[nameIndex]));
    switch (type) {
    // Types with no operand field of interest.
    case URX_NOP:
    case URX_DOTANY:
    case URX_DOTANY_ALL:
    case URX_FAIL:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
    case URX_DOLLAR_M:
    case URX_CARET_M:
        break;

    // Types with an integer operand field.
    case URX_RESERVED_OP:
    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_JMP_SAV:
    case URX_JMP_SAV_X:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_BU:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_Z:
    case URX_STRING_LEN:
    case URX_CTR_INIT:
    case URX_CTR_INIT_NG:
    case URX_CTR_LOOP:
    case URX_CTR_LOOP_NG:
    case URX_RELOC_OPRND:
    case URX_STO_SP:
    case URX_LD_SP:
    case URX_BACKREF:
    case URX_STO_INP_LOC:
    case URX_JMPX:
    case URX_LA_START:
    case URX_LA_END:
    case URX_BACKREF_I:
    case URX_LB_START:
    case URX_LB_CONT:
    case URX_LB_END:
    case URX_LBN_CONT:
    case URX_LBN_END:
    case URX_LOOP_C:
    case URX_LOOP_DOT_I:
        REGEX_DUMP_DEBUG_PRINTF(("%d", val));
        break;

    // Single character operand; values of 256 and above are shown as '?'.
    case URX_ONECHAR:
    case URX_ONECHAR_I:
        REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?'));
        break;

    // Literal string: the operand is the start index into fLiteralText and
    // the following instruction carries the length.
    case URX_STRING:
    case URX_STRING_I:
        {
            int32_t lengthOp = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
            int32_t length = URX_VAL(lengthOp);
            for (int32_t pos = val; pos < val+length; pos++) {
                UChar ch = fLiteralText[pos];
                if (!(ch >= 32 && ch < 256)) {
                    ch = '.';
                }
                REGEX_DUMP_DEBUG_PRINTF(("%c", ch));
            }
        }
        break;

    // Reference to one of this pattern's own sets, printed as a set pattern.
    case URX_SETREF:
    case URX_LOOP_SR_I:
        {
            UnicodeString setText;
            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
            set->toPattern(setText, TRUE);
            const int32_t textLen = setText.length();
            for (int32_t k = 0; k < textLen; k++) {
                REGEX_DUMP_DEBUG_PRINTF(("%c", setText.charAt(k)));
            }
        }
        break;

    // Reference to a static set; the URX_NEG_SET bit marks a negated set.
    case URX_STATIC_SETREF:
    case URX_STAT_SETREF_N:
        {
            UnicodeString setText;
            if (val & URX_NEG_SET) {
                REGEX_DUMP_DEBUG_PRINTF(("NOT "));
                val &= ~URX_NEG_SET;
            }
            UnicodeSet *set = fStaticSets[val];
            set->toPattern(setText, TRUE);
            const int32_t textLen = setText.length();
            for (int32_t k = 0; k < textLen; k++) {
                REGEX_DUMP_DEBUG_PRINTF(("%c", setText.charAt(k)));
            }
        }
        break;

    default:
        REGEX_DUMP_DEBUG_PRINTF(("??????"));
        break;
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
}
#endif
michael@0 744
michael@0 745
#if defined(REGEX_DEBUG)
//
//   RegexPatternDump    Print the original pattern text, the match-start
//                       information recorded for it, and every instruction
//                       of the compiled pattern.  Debugging function only.
//
U_CAPI void U_EXPORT2
RegexPatternDump(const RegexPattern *This) {
    int index;
    int i;

    REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: "));
    UChar32 c = utext_next32From(This->fPattern, 0);
    while (c != U_SENTINEL) {
        // Anything that is not a printable single-byte value is shown as '.'.
        // The upper bound is exclusive, matching the check used in dumpOp().
        if (c<32 || c>=256) {
            c = '.';
        }
        REGEX_DUMP_DEBUG_PRINTF(("%c", c));

        c = UTEXT_NEXT32(This->fPattern);
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
    REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen));
    REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));
    if (This->fStartType == START_STRING) {
        REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \""));
        for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
            REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i]));   // TODO: non-printables, surrogates.
        }
        REGEX_DUMP_DEBUG_PRINTF(("\"\n"));

    } else if (This->fStartType == START_SET) {
        // Show at most the first 20 members of the initial-character set.
        int32_t numSetChars = This->fInitialChars->size();
        if (numSetChars > 20) {
            numSetChars = 20;
        }
        REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : "));
        for (i=0; i<numSetChars; i++) {
            // Distinct name so the pattern-text variable `c` is not shadowed.
            UChar32 setChar = This->fInitialChars->charAt(i);
            if (0x20<setChar && setChar <0x7e) {
                REGEX_DUMP_DEBUG_PRINTF(("%c ", setChar));
            } else {
                REGEX_DUMP_DEBUG_PRINTF(("%#x ", setChar));
            }
        }
        if (numSetChars < This->fInitialChars->size()) {
            REGEX_DUMP_DEBUG_PRINTF((" ..."));
        }
        REGEX_DUMP_DEBUG_PRINTF(("\n"));

    } else if (This->fStartType == START_CHAR) {
        REGEX_DUMP_DEBUG_PRINTF((" First char of Match : "));
        if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) {
            REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar));
        } else {
            REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar));
        }
    }

    REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \
        "-------------------------------------------\n"));
    for (index = 0; index<This->fCompiledPat->size(); index++) {
        This->dumpOp(index);
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
}
#endif
michael@0 808
michael@0 809
michael@0 810
michael@0 811 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
michael@0 812
michael@0 813 U_NAMESPACE_END
michael@0 814 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS

mercurial