intl/icu/source/i18n/repattrn.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/repattrn.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,814 @@
     1.4 +//
     1.5 +//  file:  repattrn.cpp
     1.6 +//
     1.7 +/*
     1.8 +***************************************************************************
     1.9 +*   Copyright (C) 2002-2012 International Business Machines Corporation   *
    1.10 +*   and others. All rights reserved.                                      *
    1.11 +***************************************************************************
    1.12 +*/
    1.13 +
    1.14 +#include "unicode/utypes.h"
    1.15 +
    1.16 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS
    1.17 +
    1.18 +#include "unicode/regex.h"
    1.19 +#include "unicode/uclean.h"
    1.20 +#include "uassert.h"
    1.21 +#include "uvector.h"
    1.22 +#include "uvectr32.h"
    1.23 +#include "uvectr64.h"
    1.24 +#include "regexcmp.h"
    1.25 +#include "regeximp.h"
    1.26 +#include "regexst.h"
    1.27 +
    1.28 +U_NAMESPACE_BEGIN
    1.29 +
    1.30 +//--------------------------------------------------------------------------
    1.31 +//
    1.32 +//    RegexPattern    Default Constructor
    1.33 +//
    1.34 +//--------------------------------------------------------------------------
    1.35 +RegexPattern::RegexPattern() {
    1.36 +    // Init all of this instances data.
    1.37 +    init();
    1.38 +}
    1.39 +
    1.40 +
    1.41 +//--------------------------------------------------------------------------
    1.42 +//
    1.43 +//   Copy Constructor        Note:  This is a rather inefficient implementation,
    1.44 +//                                  but it probably doesn't matter.
    1.45 +//
    1.46 +//--------------------------------------------------------------------------
    1.47 +RegexPattern::RegexPattern(const RegexPattern &other) :  UObject(other) {
    1.48 +    init();
    1.49 +    *this = other;
    1.50 +}
    1.51 +
    1.52 +
    1.53 +
    1.54 +//--------------------------------------------------------------------------
    1.55 +//
    1.56 +//    Assignment Operator
    1.57 +//
    1.58 +//--------------------------------------------------------------------------
    1.59 +RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    1.60 +    if (this == &other) {
    1.61 +        // Source and destination are the same.  Don't do anything.
    1.62 +        return *this;
    1.63 +    }
    1.64 +
    1.65 +    // Clean out any previous contents of object being assigned to.
    1.66 +    zap();
    1.67 +
    1.68 +    // Give target object a default initialization
    1.69 +    init();
    1.70 +
    1.71 +    // Copy simple fields
    1.72 +    if ( other.fPatternString == NULL ) {
    1.73 +        fPatternString = NULL;
    1.74 +        fPattern      = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus);
    1.75 +    } else {
    1.76 +        fPatternString = new UnicodeString(*(other.fPatternString));
    1.77 +        UErrorCode status = U_ZERO_ERROR;
    1.78 +        fPattern      = utext_openConstUnicodeString(NULL, fPatternString, &status);
    1.79 +        if (U_FAILURE(status)) {
    1.80 +            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    1.81 +            return *this;
    1.82 +        }
    1.83 +    }
    1.84 +    fFlags            = other.fFlags;
    1.85 +    fLiteralText      = other.fLiteralText;
    1.86 +    fDeferredStatus   = other.fDeferredStatus;
    1.87 +    fMinMatchLen      = other.fMinMatchLen;
    1.88 +    fFrameSize        = other.fFrameSize;
    1.89 +    fDataSize         = other.fDataSize;
    1.90 +    fMaxCaptureDigits = other.fMaxCaptureDigits;
    1.91 +    fStaticSets       = other.fStaticSets;
    1.92 +    fStaticSets8      = other.fStaticSets8;
    1.93 +
    1.94 +    fStartType        = other.fStartType;
    1.95 +    fInitialStringIdx = other.fInitialStringIdx;
    1.96 +    fInitialStringLen = other.fInitialStringLen;
    1.97 +    *fInitialChars    = *other.fInitialChars;
    1.98 +    fInitialChar      = other.fInitialChar;
    1.99 +    *fInitialChars8   = *other.fInitialChars8;
   1.100 +    fNeedsAltInput    = other.fNeedsAltInput;
   1.101 +
   1.102 +    //  Copy the pattern.  It's just values, nothing deep to copy.
   1.103 +    fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
   1.104 +    fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
   1.105 +
   1.106 +    //  Copy the Unicode Sets.
   1.107 +    //    Could be made more efficient if the sets were reference counted and shared,
   1.108 +    //    but I doubt that pattern copying will be particularly common.
   1.109 +    //    Note:  init() already added an empty element zero to fSets
   1.110 +    int32_t i;
   1.111 +    int32_t  numSets = other.fSets->size();
   1.112 +    fSets8 = new Regex8BitSet[numSets];
   1.113 +    if (fSets8 == NULL) {
   1.114 +    	fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
   1.115 +    	return *this;
   1.116 +    }
   1.117 +    for (i=1; i<numSets; i++) {
   1.118 +        if (U_FAILURE(fDeferredStatus)) {
   1.119 +            return *this;
   1.120 +        }
   1.121 +        UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
   1.122 +        UnicodeSet *newSet    = new UnicodeSet(*sourceSet);
   1.123 +        if (newSet == NULL) {
   1.124 +            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
   1.125 +            break;
   1.126 +        }
   1.127 +        fSets->addElement(newSet, fDeferredStatus);
   1.128 +        fSets8[i] = other.fSets8[i];
   1.129 +    }
   1.130 +
   1.131 +    return *this;
   1.132 +}
   1.133 +
   1.134 +
   1.135 +//--------------------------------------------------------------------------
   1.136 +//
   1.137 +//    init        Shared initialization for use by constructors.
   1.138 +//                Bring an uninitialized RegexPattern up to a default state.
   1.139 +//
   1.140 +//--------------------------------------------------------------------------
   1.141 +void RegexPattern::init() {
   1.142 +    fFlags            = 0;
   1.143 +    fCompiledPat      = 0;
   1.144 +    fLiteralText.remove();
   1.145 +    fSets             = NULL;
   1.146 +    fSets8            = NULL;
   1.147 +    fDeferredStatus   = U_ZERO_ERROR;
   1.148 +    fMinMatchLen      = 0;
   1.149 +    fFrameSize        = 0;
   1.150 +    fDataSize         = 0;
   1.151 +    fGroupMap         = NULL;
   1.152 +    fMaxCaptureDigits = 1;
   1.153 +    fStaticSets       = NULL;
   1.154 +    fStaticSets8      = NULL;
   1.155 +    fStartType        = START_NO_INFO;
   1.156 +    fInitialStringIdx = 0;
   1.157 +    fInitialStringLen = 0;
   1.158 +    fInitialChars     = NULL;
   1.159 +    fInitialChar      = 0;
   1.160 +    fInitialChars8    = NULL;
   1.161 +    fNeedsAltInput    = FALSE;
   1.162 +
   1.163 +    fPattern          = NULL; // will be set later
   1.164 +    fPatternString    = NULL; // may be set later
   1.165 +    fCompiledPat      = new UVector64(fDeferredStatus);
   1.166 +    fGroupMap         = new UVector32(fDeferredStatus);
   1.167 +    fSets             = new UVector(fDeferredStatus);
   1.168 +    fInitialChars     = new UnicodeSet;
   1.169 +    fInitialChars8    = new Regex8BitSet;
   1.170 +    if (U_FAILURE(fDeferredStatus)) {
   1.171 +        return;
   1.172 +    }
   1.173 +    if (fCompiledPat == NULL  || fGroupMap == NULL || fSets == NULL ||
   1.174 +        fInitialChars == NULL || fInitialChars8 == NULL) {
   1.175 +        fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
   1.176 +        return;
   1.177 +    }
   1.178 +
   1.179 +    // Slot zero of the vector of sets is reserved.  Fill it here.
   1.180 +    fSets->addElement((int32_t)0, fDeferredStatus);
   1.181 +}
   1.182 +
   1.183 +
   1.184 +//--------------------------------------------------------------------------
   1.185 +//
   1.186 +//   zap            Delete everything owned by this RegexPattern.
   1.187 +//
   1.188 +//--------------------------------------------------------------------------
   1.189 +void RegexPattern::zap() {
   1.190 +    delete fCompiledPat;
   1.191 +    fCompiledPat = NULL;
   1.192 +    int i;
   1.193 +    for (i=1; i<fSets->size(); i++) {
   1.194 +        UnicodeSet *s;
   1.195 +        s = (UnicodeSet *)fSets->elementAt(i);
   1.196 +        if (s != NULL) {
   1.197 +            delete s;
   1.198 +        }
   1.199 +    }
   1.200 +    delete fSets;
   1.201 +    fSets = NULL;
   1.202 +    delete[] fSets8;
   1.203 +    fSets8 = NULL;
   1.204 +    delete fGroupMap;
   1.205 +    fGroupMap = NULL;
   1.206 +    delete fInitialChars;
   1.207 +    fInitialChars = NULL;
   1.208 +    delete fInitialChars8;
   1.209 +    fInitialChars8 = NULL;
   1.210 +    if (fPattern != NULL) {
   1.211 +        utext_close(fPattern);
   1.212 +        fPattern = NULL;
   1.213 +    }
   1.214 +    if (fPatternString != NULL) {
   1.215 +        delete fPatternString;
   1.216 +        fPatternString = NULL;
   1.217 +    }
   1.218 +}
   1.219 +
   1.220 +
   1.221 +//--------------------------------------------------------------------------
   1.222 +//
   1.223 +//   Destructor
   1.224 +//
   1.225 +//--------------------------------------------------------------------------
   1.226 +RegexPattern::~RegexPattern() {
   1.227 +    zap();
   1.228 +}
   1.229 +
   1.230 +
   1.231 +//--------------------------------------------------------------------------
   1.232 +//
   1.233 +//   Clone
   1.234 +//
   1.235 +//--------------------------------------------------------------------------
   1.236 +RegexPattern  *RegexPattern::clone() const {
   1.237 +    RegexPattern  *copy = new RegexPattern(*this);
   1.238 +    return copy;
   1.239 +}
   1.240 +
   1.241 +
   1.242 +//--------------------------------------------------------------------------
   1.243 +//
    1.244 +//   operator ==   (comparison)    Consider two patterns to be == if the
   1.245 +//                                 pattern strings and the flags are the same.
   1.246 +//                                 Note that pattern strings with the same
   1.247 +//                                 characters can still be considered different.
   1.248 +//
   1.249 +//--------------------------------------------------------------------------
   1.250 +UBool   RegexPattern::operator ==(const RegexPattern &other) const {
   1.251 +    if (this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus) {
   1.252 +        if (this->fPatternString != NULL && other.fPatternString != NULL) {
   1.253 +            return *(this->fPatternString) == *(other.fPatternString);
   1.254 +        } else if (this->fPattern == NULL) {
   1.255 +            if (other.fPattern == NULL) {
   1.256 +                return TRUE;
   1.257 +            }
   1.258 +        } else if (other.fPattern != NULL) {
   1.259 +            UTEXT_SETNATIVEINDEX(this->fPattern, 0);
   1.260 +            UTEXT_SETNATIVEINDEX(other.fPattern, 0);
   1.261 +            return utext_equals(this->fPattern, other.fPattern);
   1.262 +        }
   1.263 +    }
   1.264 +    return FALSE;
   1.265 +}
   1.266 +
   1.267 +//---------------------------------------------------------------------
   1.268 +//
   1.269 +//   compile
   1.270 +//
   1.271 +//---------------------------------------------------------------------
   1.272 +RegexPattern * U_EXPORT2
   1.273 +RegexPattern::compile(const UnicodeString &regex,
   1.274 +                      uint32_t             flags,
   1.275 +                      UParseError          &pe,
   1.276 +                      UErrorCode           &status)
   1.277 +{
   1.278 +    if (U_FAILURE(status)) {
   1.279 +        return NULL;
   1.280 +    }
   1.281 +    
   1.282 +    const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
   1.283 +    UREGEX_DOTALL   | UREGEX_MULTILINE        | UREGEX_UWORD |
   1.284 +    UREGEX_ERROR_ON_UNKNOWN_ESCAPES           | UREGEX_UNIX_LINES | UREGEX_LITERAL;
   1.285 +    
   1.286 +    if ((flags & ~allFlags) != 0) {
   1.287 +        status = U_REGEX_INVALID_FLAG;
   1.288 +        return NULL;
   1.289 +    }
   1.290 +    
   1.291 +    if ((flags & UREGEX_CANON_EQ) != 0) {
   1.292 +        status = U_REGEX_UNIMPLEMENTED;
   1.293 +        return NULL;
   1.294 +    }
   1.295 +    
   1.296 +    RegexPattern *This = new RegexPattern;
   1.297 +    if (This == NULL) {
   1.298 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.299 +        return NULL;
   1.300 +    }
   1.301 +    if (U_FAILURE(This->fDeferredStatus)) {
   1.302 +        status = This->fDeferredStatus;
   1.303 +        delete This;
   1.304 +        return NULL;
   1.305 +    }
   1.306 +    This->fFlags = flags;
   1.307 +    
   1.308 +    RegexCompile     compiler(This, status);
   1.309 +    compiler.compile(regex, pe, status);
   1.310 +    
   1.311 +    if (U_FAILURE(status)) {
   1.312 +        delete This;
   1.313 +        This = NULL;
   1.314 +    }
   1.315 +    
   1.316 +    return This;
   1.317 +}
   1.318 +
   1.319 +
   1.320 +//
   1.321 +//   compile, UText mode
   1.322 +//
   1.323 +RegexPattern * U_EXPORT2
   1.324 +RegexPattern::compile(UText                *regex,
   1.325 +                      uint32_t             flags,
   1.326 +                      UParseError          &pe,
   1.327 +                      UErrorCode           &status)
   1.328 +{
   1.329 +    if (U_FAILURE(status)) {
   1.330 +        return NULL;
   1.331 +    }
   1.332 +
   1.333 +    const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
   1.334 +                              UREGEX_DOTALL   | UREGEX_MULTILINE        | UREGEX_UWORD |
   1.335 +                              UREGEX_ERROR_ON_UNKNOWN_ESCAPES           | UREGEX_UNIX_LINES | UREGEX_LITERAL;
   1.336 +
   1.337 +    if ((flags & ~allFlags) != 0) {
   1.338 +        status = U_REGEX_INVALID_FLAG;
   1.339 +        return NULL;
   1.340 +    }
   1.341 +
   1.342 +    if ((flags & UREGEX_CANON_EQ) != 0) {
   1.343 +        status = U_REGEX_UNIMPLEMENTED;
   1.344 +        return NULL;
   1.345 +    }
   1.346 +
   1.347 +    RegexPattern *This = new RegexPattern;
   1.348 +    if (This == NULL) {
   1.349 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.350 +        return NULL;
   1.351 +    }
   1.352 +    if (U_FAILURE(This->fDeferredStatus)) {
   1.353 +        status = This->fDeferredStatus;
   1.354 +        delete This;
   1.355 +        return NULL;
   1.356 +    }
   1.357 +    This->fFlags = flags;
   1.358 +
   1.359 +    RegexCompile     compiler(This, status);
   1.360 +    compiler.compile(regex, pe, status);
   1.361 +    
   1.362 +    if (U_FAILURE(status)) {
   1.363 +        delete This;
   1.364 +        This = NULL;
   1.365 +    }
   1.366 +
   1.367 +    return This;
   1.368 +}
   1.369 +
   1.370 +//
   1.371 +//   compile with default flags.
   1.372 +//
   1.373 +RegexPattern * U_EXPORT2
   1.374 +RegexPattern::compile(const UnicodeString &regex,
   1.375 +                      UParseError         &pe,
   1.376 +                      UErrorCode          &err)
   1.377 +{
   1.378 +    return compile(regex, 0, pe, err);
   1.379 +}
   1.380 +
   1.381 +
   1.382 +//
   1.383 +//   compile with default flags, UText mode
   1.384 +//
   1.385 +RegexPattern * U_EXPORT2
   1.386 +RegexPattern::compile(UText               *regex,
   1.387 +                      UParseError         &pe,
   1.388 +                      UErrorCode          &err)
   1.389 +{
   1.390 +    return compile(regex, 0, pe, err);
   1.391 +}
   1.392 +
   1.393 +
   1.394 +//
    1.395 +//   compile with no UParseError parameter.
   1.396 +//
   1.397 +RegexPattern * U_EXPORT2
   1.398 +RegexPattern::compile(const UnicodeString &regex,
   1.399 +                      uint32_t             flags,
   1.400 +                      UErrorCode          &err)
   1.401 +{
   1.402 +    UParseError pe;
   1.403 +    return compile(regex, flags, pe, err);
   1.404 +}
   1.405 +
   1.406 +
   1.407 +//
    1.408 +//   compile with no UParseError parameter, UText mode
   1.409 +//
   1.410 +RegexPattern * U_EXPORT2
   1.411 +RegexPattern::compile(UText                *regex,
   1.412 +                      uint32_t             flags,
   1.413 +                      UErrorCode           &err)
   1.414 +{
   1.415 +    UParseError pe;
   1.416 +    return compile(regex, flags, pe, err);
   1.417 +}
   1.418 +
   1.419 +
   1.420 +//---------------------------------------------------------------------
   1.421 +//
   1.422 +//   flags
   1.423 +//
   1.424 +//---------------------------------------------------------------------
   1.425 +uint32_t RegexPattern::flags() const {
   1.426 +    return fFlags;
   1.427 +}
   1.428 +
   1.429 +
   1.430 +//---------------------------------------------------------------------
   1.431 +//
   1.432 +//   matcher(UnicodeString, err)
   1.433 +//
   1.434 +//---------------------------------------------------------------------
   1.435 +RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
   1.436 +                                    UErrorCode          &status)  const {
   1.437 +    RegexMatcher    *retMatcher = matcher(status);
   1.438 +    if (retMatcher != NULL) {
   1.439 +        retMatcher->fDeferredStatus = status;
   1.440 +        retMatcher->reset(input);
   1.441 +    }
   1.442 +    return retMatcher;
   1.443 +}
   1.444 +
   1.445 +
   1.446 +//---------------------------------------------------------------------
   1.447 +//
   1.448 +//   matcher(status)
   1.449 +//
   1.450 +//---------------------------------------------------------------------
   1.451 +RegexMatcher *RegexPattern::matcher(UErrorCode &status)  const {
   1.452 +    RegexMatcher    *retMatcher = NULL;
   1.453 +
   1.454 +    if (U_FAILURE(status)) {
   1.455 +        return NULL;
   1.456 +    }
   1.457 +    if (U_FAILURE(fDeferredStatus)) {
   1.458 +        status = fDeferredStatus;
   1.459 +        return NULL;
   1.460 +    }
   1.461 +
   1.462 +    retMatcher = new RegexMatcher(this);
   1.463 +    if (retMatcher == NULL) {
   1.464 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.465 +        return NULL;
   1.466 +    }
   1.467 +    return retMatcher;
   1.468 +}
   1.469 +
   1.470 +
   1.471 +
   1.472 +//---------------------------------------------------------------------
   1.473 +//
   1.474 +//   matches        Convenience function to test for a match, starting
   1.475 +//                  with a pattern string and a data string.
   1.476 +//
   1.477 +//---------------------------------------------------------------------
   1.478 +UBool U_EXPORT2 RegexPattern::matches(const UnicodeString   &regex,
   1.479 +              const UnicodeString   &input,
   1.480 +                    UParseError     &pe,
   1.481 +                    UErrorCode      &status) {
   1.482 +
   1.483 +    if (U_FAILURE(status)) {return FALSE;}
   1.484 +
   1.485 +    UBool         retVal;
   1.486 +    RegexPattern *pat     = NULL;
   1.487 +    RegexMatcher *matcher = NULL;
   1.488 +
   1.489 +    pat     = RegexPattern::compile(regex, 0, pe, status);
   1.490 +    matcher = pat->matcher(input, status);
   1.491 +    retVal  = matcher->matches(status);
   1.492 +
   1.493 +    delete matcher;
   1.494 +    delete pat;
   1.495 +    return retVal;
   1.496 +}
   1.497 +
   1.498 +
   1.499 +//
   1.500 +//   matches, UText mode
   1.501 +//
   1.502 +UBool U_EXPORT2 RegexPattern::matches(UText                *regex,
   1.503 +                    UText           *input,
   1.504 +                    UParseError     &pe,
   1.505 +                    UErrorCode      &status) {
   1.506 +
   1.507 +    if (U_FAILURE(status)) {return FALSE;}
   1.508 +
   1.509 +    UBool         retVal  = FALSE;
   1.510 +    RegexPattern *pat     = NULL;
   1.511 +    RegexMatcher *matcher = NULL;
   1.512 +
   1.513 +    pat     = RegexPattern::compile(regex, 0, pe, status);
   1.514 +    matcher = pat->matcher(status);
   1.515 +    if (U_SUCCESS(status)) {
   1.516 +        matcher->reset(input);
   1.517 +        retVal  = matcher->matches(status);
   1.518 +    }
   1.519 +
   1.520 +    delete matcher;
   1.521 +    delete pat;
   1.522 +    return retVal;
   1.523 +}
   1.524 +
   1.525 +
   1.526 +
   1.527 +
   1.528 +
   1.529 +//---------------------------------------------------------------------
   1.530 +//
   1.531 +//   pattern
   1.532 +//
   1.533 +//---------------------------------------------------------------------
   1.534 +UnicodeString RegexPattern::pattern() const {
   1.535 +    if (fPatternString != NULL) {
   1.536 +        return *fPatternString;
   1.537 +    } else if (fPattern == NULL) {
   1.538 +        return UnicodeString();
   1.539 +    } else {
   1.540 +        UErrorCode status = U_ZERO_ERROR;
   1.541 +        int64_t nativeLen = utext_nativeLength(fPattern);
   1.542 +        int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error
   1.543 +        UnicodeString result;
   1.544 +        
   1.545 +        status = U_ZERO_ERROR;
   1.546 +        UChar *resultChars = result.getBuffer(len16);
   1.547 +        utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning
   1.548 +        result.releaseBuffer(len16);
   1.549 +        
   1.550 +        return result;
   1.551 +    }
   1.552 +}
   1.553 +
   1.554 +
   1.555 +
   1.556 +
   1.557 +//---------------------------------------------------------------------
   1.558 +//
   1.559 +//   patternText
   1.560 +//
   1.561 +//---------------------------------------------------------------------
   1.562 +UText *RegexPattern::patternText(UErrorCode      &status) const {
   1.563 +    if (U_FAILURE(status)) {return NULL;}
   1.564 +    status = U_ZERO_ERROR;
   1.565 +
   1.566 +    if (fPattern != NULL) {
   1.567 +        return fPattern;
   1.568 +    } else {
   1.569 +        RegexStaticSets::initGlobals(&status);
   1.570 +        return RegexStaticSets::gStaticSets->fEmptyText;
   1.571 +    }
   1.572 +}
   1.573 +
   1.574 +
   1.575 +
   1.576 +//---------------------------------------------------------------------
   1.577 +//
   1.578 +//   split
   1.579 +//
   1.580 +//---------------------------------------------------------------------
   1.581 +int32_t  RegexPattern::split(const UnicodeString &input,
   1.582 +        UnicodeString    dest[],
   1.583 +        int32_t          destCapacity,
   1.584 +        UErrorCode      &status) const
   1.585 +{
   1.586 +    if (U_FAILURE(status)) {
   1.587 +        return 0;
   1.588 +    };
   1.589 +
   1.590 +    RegexMatcher  m(this);
   1.591 +    int32_t r = 0;
   1.592 +    // Check m's status to make sure all is ok.
   1.593 +    if (U_SUCCESS(m.fDeferredStatus)) {
   1.594 +    	r = m.split(input, dest, destCapacity, status);
   1.595 +    }
   1.596 +    return r;
   1.597 +}
   1.598 +
   1.599 +//
   1.600 +//   split, UText mode
   1.601 +//
   1.602 +int32_t  RegexPattern::split(UText *input,
   1.603 +        UText           *dest[],
   1.604 +        int32_t          destCapacity,
   1.605 +        UErrorCode      &status) const
   1.606 +{
   1.607 +    if (U_FAILURE(status)) {
   1.608 +        return 0;
   1.609 +    };
   1.610 +
   1.611 +    RegexMatcher  m(this);
   1.612 +    int32_t r = 0;
   1.613 +    // Check m's status to make sure all is ok.
   1.614 +    if (U_SUCCESS(m.fDeferredStatus)) {
   1.615 +    	r = m.split(input, dest, destCapacity, status);
   1.616 +    }
   1.617 +    return r;
   1.618 +}
   1.619 +
   1.620 +
   1.621 +
   1.622 +//---------------------------------------------------------------------
   1.623 +//
   1.624 +//   dump    Output the compiled form of the pattern.
   1.625 +//           Debugging function only.
   1.626 +//
   1.627 +//---------------------------------------------------------------------
   1.628 +#if defined(REGEX_DEBUG)
   1.629 +void   RegexPattern::dumpOp(int32_t index) const {
   1.630 +    static const char * const opNames[] = {URX_OPCODE_NAMES};
   1.631 +    int32_t op          = fCompiledPat->elementAti(index);
   1.632 +    int32_t val         = URX_VAL(op);
   1.633 +    int32_t type        = URX_TYPE(op);
   1.634 +    int32_t pinnedType  = type;
   1.635 +    if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) {
   1.636 +        pinnedType = 0;
   1.637 +    }
   1.638 +
   1.639 +    REGEX_DUMP_DEBUG_PRINTF(("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]));
   1.640 +    switch (type) {
   1.641 +    case URX_NOP:
   1.642 +    case URX_DOTANY:
   1.643 +    case URX_DOTANY_ALL:
   1.644 +    case URX_FAIL:
   1.645 +    case URX_CARET:
   1.646 +    case URX_DOLLAR:
   1.647 +    case URX_BACKSLASH_G:
   1.648 +    case URX_BACKSLASH_X:
   1.649 +    case URX_END:
   1.650 +    case URX_DOLLAR_M:
   1.651 +    case URX_CARET_M:
   1.652 +        // Types with no operand field of interest.
   1.653 +        break;
   1.654 +
   1.655 +    case URX_RESERVED_OP:
   1.656 +    case URX_START_CAPTURE:
   1.657 +    case URX_END_CAPTURE:
   1.658 +    case URX_STATE_SAVE:
   1.659 +    case URX_JMP:
   1.660 +    case URX_JMP_SAV:
   1.661 +    case URX_JMP_SAV_X:
   1.662 +    case URX_BACKSLASH_B:
   1.663 +    case URX_BACKSLASH_BU:
   1.664 +    case URX_BACKSLASH_D:
   1.665 +    case URX_BACKSLASH_Z:
   1.666 +    case URX_STRING_LEN:
   1.667 +    case URX_CTR_INIT:
   1.668 +    case URX_CTR_INIT_NG:
   1.669 +    case URX_CTR_LOOP:
   1.670 +    case URX_CTR_LOOP_NG:
   1.671 +    case URX_RELOC_OPRND:
   1.672 +    case URX_STO_SP:
   1.673 +    case URX_LD_SP:
   1.674 +    case URX_BACKREF:
   1.675 +    case URX_STO_INP_LOC:
   1.676 +    case URX_JMPX:
   1.677 +    case URX_LA_START:
   1.678 +    case URX_LA_END:
   1.679 +    case URX_BACKREF_I:
   1.680 +    case URX_LB_START:
   1.681 +    case URX_LB_CONT:
   1.682 +    case URX_LB_END:
   1.683 +    case URX_LBN_CONT:
   1.684 +    case URX_LBN_END:
   1.685 +    case URX_LOOP_C:
   1.686 +    case URX_LOOP_DOT_I:
   1.687 +        // types with an integer operand field.
   1.688 +        REGEX_DUMP_DEBUG_PRINTF(("%d", val));
   1.689 +        break;
   1.690 +
   1.691 +    case URX_ONECHAR:
   1.692 +    case URX_ONECHAR_I:
   1.693 +        REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?'));
   1.694 +        break;
   1.695 +
   1.696 +    case URX_STRING:
   1.697 +    case URX_STRING_I:
   1.698 +        {
   1.699 +            int32_t lengthOp       = fCompiledPat->elementAti(index+1);
   1.700 +            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
   1.701 +            int32_t length = URX_VAL(lengthOp);
   1.702 +            int32_t i;
   1.703 +            for (i=val; i<val+length; i++) {
   1.704 +                UChar c = fLiteralText[i];
   1.705 +                if (c < 32 || c >= 256) {c = '.';}
   1.706 +                REGEX_DUMP_DEBUG_PRINTF(("%c", c));
   1.707 +            }
   1.708 +        }
   1.709 +        break;
   1.710 +
   1.711 +    case URX_SETREF:
   1.712 +    case URX_LOOP_SR_I:
   1.713 +        {
   1.714 +            UnicodeString s;
   1.715 +            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
   1.716 +            set->toPattern(s, TRUE);
   1.717 +            for (int32_t i=0; i<s.length(); i++) {
   1.718 +                REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i)));
   1.719 +            }
   1.720 +        }
   1.721 +        break;
   1.722 +
   1.723 +    case URX_STATIC_SETREF:
   1.724 +    case URX_STAT_SETREF_N:
   1.725 +        {
   1.726 +            UnicodeString s;
   1.727 +            if (val & URX_NEG_SET) {
   1.728 +                REGEX_DUMP_DEBUG_PRINTF(("NOT "));
   1.729 +                val &= ~URX_NEG_SET;
   1.730 +            }
   1.731 +            UnicodeSet *set = fStaticSets[val];
   1.732 +            set->toPattern(s, TRUE);
   1.733 +            for (int32_t i=0; i<s.length(); i++) {
   1.734 +                REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i)));
   1.735 +            }
   1.736 +        }
   1.737 +        break;
   1.738 +
   1.739 +
   1.740 +    default:
   1.741 +        REGEX_DUMP_DEBUG_PRINTF(("??????"));
   1.742 +        break;
   1.743 +    }
   1.744 +    REGEX_DUMP_DEBUG_PRINTF(("\n"));
   1.745 +}
   1.746 +#endif
   1.747 +
   1.748 +
   1.749 +#if defined(REGEX_DEBUG)
   1.750 +U_CAPI void  U_EXPORT2
   1.751 +RegexPatternDump(const RegexPattern *This) {
   1.752 +    int      index;
   1.753 +    int      i;
   1.754 +
   1.755 +    REGEX_DUMP_DEBUG_PRINTF(("Original Pattern:  "));
   1.756 +    UChar32 c = utext_next32From(This->fPattern, 0);
   1.757 +    while (c != U_SENTINEL) {
   1.758 +        if (c<32 || c>256) {
   1.759 +            c = '.';
   1.760 +        }
   1.761 +        REGEX_DUMP_DEBUG_PRINTF(("%c", c));
   1.762 +        
   1.763 +        c = UTEXT_NEXT32(This->fPattern);
   1.764 +    }
   1.765 +    REGEX_DUMP_DEBUG_PRINTF(("\n"));
   1.766 +    REGEX_DUMP_DEBUG_PRINTF(("   Min Match Length:  %d\n", This->fMinMatchLen));
   1.767 +    REGEX_DUMP_DEBUG_PRINTF(("   Match Start Type:  %s\n", START_OF_MATCH_STR(This->fStartType)));
   1.768 +    if (This->fStartType == START_STRING) {
   1.769 +        REGEX_DUMP_DEBUG_PRINTF(("    Initial match string: \""));
   1.770 +        for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
   1.771 +            REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i]));   // TODO:  non-printables, surrogates.
   1.772 +        }
   1.773 +        REGEX_DUMP_DEBUG_PRINTF(("\"\n"));
   1.774 +
   1.775 +    } else if (This->fStartType == START_SET) {
   1.776 +        int32_t numSetChars = This->fInitialChars->size();
   1.777 +        if (numSetChars > 20) {
   1.778 +            numSetChars = 20;
   1.779 +        }
   1.780 +        REGEX_DUMP_DEBUG_PRINTF(("     Match First Chars : "));
   1.781 +        for (i=0; i<numSetChars; i++) {
   1.782 +            UChar32 c = This->fInitialChars->charAt(i);
   1.783 +            if (0x20<c && c <0x7e) {
   1.784 +                REGEX_DUMP_DEBUG_PRINTF(("%c ", c));
   1.785 +            } else {
   1.786 +                REGEX_DUMP_DEBUG_PRINTF(("%#x ", c));
   1.787 +            }
   1.788 +        }
   1.789 +        if (numSetChars < This->fInitialChars->size()) {
   1.790 +            REGEX_DUMP_DEBUG_PRINTF((" ..."));
   1.791 +        }
   1.792 +        REGEX_DUMP_DEBUG_PRINTF(("\n"));
   1.793 +
   1.794 +    } else if (This->fStartType == START_CHAR) {
   1.795 +        REGEX_DUMP_DEBUG_PRINTF(("    First char of Match : "));
   1.796 +        if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) {
   1.797 +                REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar));
   1.798 +            } else {
   1.799 +                REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar));
   1.800 +            }
   1.801 +    }
   1.802 +
   1.803 +    REGEX_DUMP_DEBUG_PRINTF(("\nIndex   Binary     Type             Operand\n" \
   1.804 +           "-------------------------------------------\n"));
   1.805 +    for (index = 0; index<This->fCompiledPat->size(); index++) {
   1.806 +        This->dumpOp(index);
   1.807 +    }
   1.808 +    REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
   1.809 +}
   1.810 +#endif
   1.811 +
   1.812 +
   1.813 +
   1.814 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
   1.815 +
   1.816 +U_NAMESPACE_END
   1.817 +#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS

mercurial