intl/icu/source/i18n/uregex.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2 *******************************************************************************
     3 *   Copyright (C) 2004-2013, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 *******************************************************************************
     6 *   file name:  uregex.cpp
     7 */
     9 #include "unicode/utypes.h"
    11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
    13 #include "unicode/regex.h"
    14 #include "unicode/uregex.h"
    15 #include "unicode/unistr.h"
    16 #include "unicode/ustring.h"
    17 #include "unicode/uchar.h"
    18 #include "unicode/uobject.h"
    19 #include "unicode/utf16.h"
    20 #include "umutex.h"
    21 #include "uassert.h"
    22 #include "cmemory.h"
    24 #include "regextxt.h"
    26 #include <stdio.h>
    28 U_NAMESPACE_BEGIN
    30 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
    32 struct RegularExpression: public UMemory {
    33 public:
    34     RegularExpression();
    35     ~RegularExpression();
    36     int32_t           fMagic;
    37     RegexPattern     *fPat;
    38     u_atomic_int32_t *fPatRefCount;
    39     UChar            *fPatString;
    40     int32_t           fPatStringLen;
    41     RegexMatcher     *fMatcher;
    42     const UChar      *fText;         // Text from setText()
    43     int32_t           fTextLength;   // Length provided by user with setText(), which
    44                                      //  may be -1.
    45     UBool             fOwnsText;
    46 };
    48 static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
    50 RegularExpression::RegularExpression() {
    51     fMagic        = REXP_MAGIC;
    52     fPat          = NULL;
    53     fPatRefCount  = NULL;
    54     fPatString    = NULL;
    55     fPatStringLen = 0;
    56     fMatcher      = NULL;
    57     fText         = NULL;
    58     fTextLength   = 0;
    59     fOwnsText     = FALSE;
    60 }
    62 RegularExpression::~RegularExpression() {
    63     delete fMatcher;
    64     fMatcher = NULL;
    65     if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
    66         delete fPat;
    67         uprv_free(fPatString);
    68         uprv_free((void *)fPatRefCount);
    69     }
    70     if (fOwnsText && fText!=NULL) {
    71         uprv_free((void *)fText);
    72     }
    73     fMagic = 0;
    74 }
    76 U_NAMESPACE_END
    78 U_NAMESPACE_USE
    80 //----------------------------------------------------------------------------------------
    81 //
    82 //   validateRE    Do boilerplate style checks on API function parameters.
    83 //                 Return TRUE if they look OK.
    84 //----------------------------------------------------------------------------------------
    85 static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) {
    86     if (U_FAILURE(*status)) {
    87         return FALSE;
    88     }
    89     if (re == NULL || re->fMagic != REXP_MAGIC) {
    90         *status = U_ILLEGAL_ARGUMENT_ERROR;
    91         return FALSE;
    92     }
    93     // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
    94     if (requiresText && re->fText == NULL && !re->fOwnsText) {
    95         *status = U_REGEX_INVALID_STATE;
    96         return FALSE;
    97     }
    98     return TRUE;
    99 }
   101 //----------------------------------------------------------------------------------------
   102 //
   103 //    uregex_open
   104 //
   105 //----------------------------------------------------------------------------------------
   106 U_CAPI URegularExpression *  U_EXPORT2
   107 uregex_open( const  UChar          *pattern,
   108                     int32_t         patternLength,
   109                     uint32_t        flags,
   110                     UParseError    *pe,
   111                     UErrorCode     *status) {
   113     if (U_FAILURE(*status)) {
   114         return NULL;
   115     }
   116     if (pattern == NULL || patternLength < -1 || patternLength == 0) {
   117         *status = U_ILLEGAL_ARGUMENT_ERROR;
   118         return NULL;
   119     }
   120     int32_t actualPatLen = patternLength;
   121     if (actualPatLen == -1) {
   122         actualPatLen = u_strlen(pattern);
   123     }
   125     RegularExpression  *re     = new RegularExpression;
   126     u_atomic_int32_t   *refC   = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
   127     UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
   128     if (re == NULL || refC == NULL || patBuf == NULL) {
   129         *status = U_MEMORY_ALLOCATION_ERROR;
   130         delete re;
   131         uprv_free((void *)refC);
   132         uprv_free(patBuf);
   133         return NULL;
   134     }
   135     re->fPatRefCount = refC;
   136     *re->fPatRefCount = 1;
   138     //
   139     // Make a copy of the pattern string, so we can return it later if asked.
   140     //    For compiling the pattern, we will use a UText wrapper around
   141     //    this local copy, to avoid making even more copies.
   142     //
   143     re->fPatString    = patBuf;
   144     re->fPatStringLen = patternLength;
   145     u_memcpy(patBuf, pattern, actualPatLen);
   146     patBuf[actualPatLen] = 0;
   148     UText patText = UTEXT_INITIALIZER;
   149     utext_openUChars(&patText, patBuf, patternLength, status);
   151     //
   152     // Compile the pattern
   153     //
   154     if (pe != NULL) {
   155         re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
   156     } else {
   157         re->fPat = RegexPattern::compile(&patText, flags, *status);
   158     }
   159     utext_close(&patText);
   161     if (U_FAILURE(*status)) {
   162         goto ErrorExit;
   163     }
   165     //
   166     // Create the matcher object
   167     //
   168     re->fMatcher = re->fPat->matcher(*status);
   169     if (U_SUCCESS(*status)) {
   170         return (URegularExpression*)re;
   171     }
   173 ErrorExit:
   174     delete re;
   175     return NULL;
   177 }
   179 //----------------------------------------------------------------------------------------
   180 //
   181 //    uregex_openUText
   182 //
   183 //----------------------------------------------------------------------------------------
   184 U_CAPI URegularExpression *  U_EXPORT2
   185 uregex_openUText(UText          *pattern,
   186                  uint32_t        flags,
   187                  UParseError    *pe,
   188                  UErrorCode     *status) {
   190     if (U_FAILURE(*status)) {
   191         return NULL;
   192     }
   193     if (pattern == NULL) {
   194         *status = U_ILLEGAL_ARGUMENT_ERROR;
   195         return NULL;
   196     }
   198     int64_t patternNativeLength = utext_nativeLength(pattern);
   200     if (patternNativeLength == 0) {
   201         *status = U_ILLEGAL_ARGUMENT_ERROR;
   202         return NULL;
   203     }
   205     RegularExpression *re     = new RegularExpression;
   207     UErrorCode lengthStatus = U_ZERO_ERROR;
   208     int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
   210     u_atomic_int32_t   *refC   = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
   211     UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
   212     if (re == NULL || refC == NULL || patBuf == NULL) {
   213         *status = U_MEMORY_ALLOCATION_ERROR;
   214         delete re;
   215         uprv_free((void *)refC);
   216         uprv_free(patBuf);
   217         return NULL;
   218     }
   219     re->fPatRefCount = refC;
   220     *re->fPatRefCount = 1;
   222     //
   223     // Make a copy of the pattern string, so we can return it later if asked.
   224     //    For compiling the pattern, we will use a read-only UText wrapper
   225     //    around this local copy, to avoid making even more copies.
   226     //
   227     re->fPatString    = patBuf;
   228     re->fPatStringLen = pattern16Length;
   229     utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
   231     UText patText = UTEXT_INITIALIZER;
   232     utext_openUChars(&patText, patBuf, pattern16Length, status);
   234     //
   235     // Compile the pattern
   236     //
   237     if (pe != NULL) {
   238         re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
   239     } else {
   240         re->fPat = RegexPattern::compile(&patText, flags, *status);
   241     }
   242     utext_close(&patText);
   244     if (U_FAILURE(*status)) {
   245         goto ErrorExit;
   246     }
   248     //
   249     // Create the matcher object
   250     //
   251     re->fMatcher = re->fPat->matcher(*status);
   252     if (U_SUCCESS(*status)) {
   253         return (URegularExpression*)re;
   254     }
   256 ErrorExit:
   257     delete re;
   258     return NULL;
   260 }
   262 //----------------------------------------------------------------------------------------
   263 //
   264 //    uregex_close
   265 //
   266 //----------------------------------------------------------------------------------------
   267 U_CAPI void  U_EXPORT2
   268 uregex_close(URegularExpression  *re2) {
   269     RegularExpression *re = (RegularExpression*)re2;
   270     UErrorCode  status = U_ZERO_ERROR;
   271     if (validateRE(re, FALSE, &status) == FALSE) {
   272         return;
   273     }
   274     delete re;
   275 }
   278 //----------------------------------------------------------------------------------------
   279 //
   280 //    uregex_clone
   281 //
   282 //----------------------------------------------------------------------------------------
   283 U_CAPI URegularExpression * U_EXPORT2 
   284 uregex_clone(const URegularExpression *source2, UErrorCode *status)  {
   285     RegularExpression *source = (RegularExpression*)source2;
   286     if (validateRE(source, FALSE, status) == FALSE) {
   287         return NULL;
   288     }
   290     RegularExpression *clone = new RegularExpression;
   291     if (clone == NULL) {
   292         *status = U_MEMORY_ALLOCATION_ERROR;
   293         return NULL;
   294     }
   296     clone->fMatcher = source->fPat->matcher(*status);
   297     if (U_FAILURE(*status)) {
   298         delete clone;
   299         return NULL;
   300     }
   302     clone->fPat          = source->fPat;
   303     clone->fPatRefCount  = source->fPatRefCount; 
   304     clone->fPatString    = source->fPatString;
   305     clone->fPatStringLen = source->fPatStringLen;
   306     umtx_atomic_inc(source->fPatRefCount);
   307     // Note:  fText is not cloned.
   309     return (URegularExpression*)clone;
   310 }
   315 //------------------------------------------------------------------------------
   316 //
   317 //    uregex_pattern
   318 //
   319 //------------------------------------------------------------------------------
   320 U_CAPI const UChar * U_EXPORT2 
   321 uregex_pattern(const  URegularExpression *regexp2,
   322                       int32_t            *patLength,
   323                       UErrorCode         *status)  {
   324     RegularExpression *regexp = (RegularExpression*)regexp2;
   326     if (validateRE(regexp, FALSE, status) == FALSE) {
   327         return NULL;
   328     }
   329     if (patLength != NULL) {
   330         *patLength = regexp->fPatStringLen;
   331     }
   332     return regexp->fPatString;
   333 }
   336 //------------------------------------------------------------------------------
   337 //
   338 //    uregex_patternUText
   339 //
   340 //------------------------------------------------------------------------------
   341 U_CAPI UText * U_EXPORT2
   342 uregex_patternUText(const URegularExpression *regexp2,
   343                           UErrorCode         *status)  {
   344     RegularExpression *regexp = (RegularExpression*)regexp2;
   345     return regexp->fPat->patternText(*status);
   346 }
   349 //------------------------------------------------------------------------------
   350 //
   351 //    uregex_flags
   352 //
   353 //------------------------------------------------------------------------------
   354 U_CAPI int32_t U_EXPORT2 
   355 uregex_flags(const URegularExpression *regexp2, UErrorCode *status)  {
   356     RegularExpression *regexp = (RegularExpression*)regexp2;
   357     if (validateRE(regexp, FALSE, status) == FALSE) {
   358         return 0;
   359     }
   360     int32_t flags = regexp->fPat->flags();
   361     return flags;
   362 }
   365 //------------------------------------------------------------------------------
   366 //
   367 //    uregex_setText
   368 //
   369 //------------------------------------------------------------------------------
   370 U_CAPI void U_EXPORT2 
   371 uregex_setText(URegularExpression *regexp2,
   372                const UChar        *text,
   373                int32_t             textLength,
   374                UErrorCode         *status)  {
   375     RegularExpression *regexp = (RegularExpression*)regexp2;
   376     if (validateRE(regexp, FALSE, status) == FALSE) {
   377         return;
   378     }
   379     if (text == NULL || textLength < -1) {
   380         *status = U_ILLEGAL_ARGUMENT_ERROR;
   381         return;
   382     }
   384     if (regexp->fOwnsText && regexp->fText != NULL) {
   385         uprv_free((void *)regexp->fText);
   386     }
   388     regexp->fText       = text;
   389     regexp->fTextLength = textLength;
   390     regexp->fOwnsText   = FALSE;
   392     UText input = UTEXT_INITIALIZER;
   393     utext_openUChars(&input, text, textLength, status);
   394     regexp->fMatcher->reset(&input);
   395     utext_close(&input); // reset() made a shallow clone, so we don't need this copy
   396 }
   399 //------------------------------------------------------------------------------
   400 //
   401 //    uregex_setUText
   402 //
   403 //------------------------------------------------------------------------------
   404 U_CAPI void U_EXPORT2 
   405 uregex_setUText(URegularExpression *regexp2,
   406                 UText              *text,
   407                 UErrorCode         *status) {
   408     RegularExpression *regexp = (RegularExpression*)regexp2;
   409     if (validateRE(regexp, FALSE, status) == FALSE) {
   410         return;
   411     }
   412     if (text == NULL) {
   413         *status = U_ILLEGAL_ARGUMENT_ERROR;
   414         return;
   415     }
   417     if (regexp->fOwnsText && regexp->fText != NULL) {
   418         uprv_free((void *)regexp->fText);
   419     }
   421     regexp->fText       = NULL; // only fill it in on request
   422     regexp->fTextLength = -1;
   423     regexp->fOwnsText   = TRUE;
   424     regexp->fMatcher->reset(text);
   425 }
   429 //------------------------------------------------------------------------------
   430 //
   431 //    uregex_getText
   432 //
   433 //------------------------------------------------------------------------------
   434 U_CAPI const UChar * U_EXPORT2 
   435 uregex_getText(URegularExpression *regexp2,
   436                int32_t            *textLength,
   437                UErrorCode         *status)  {
   438     RegularExpression *regexp = (RegularExpression*)regexp2;
   439     if (validateRE(regexp, FALSE, status) == FALSE) {
   440         return NULL;
   441     }
   443     if (regexp->fText == NULL) {
   444         // need to fill in the text
   445         UText *inputText = regexp->fMatcher->inputText();
   446         int64_t inputNativeLength = utext_nativeLength(inputText);
   447         if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
   448             regexp->fText = inputText->chunkContents;
   449             regexp->fTextLength = (int32_t)inputNativeLength;
   450             regexp->fOwnsText = FALSE; // because the UText owns it
   451         } else {
   452             UErrorCode lengthStatus = U_ZERO_ERROR;
   453             regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
   454             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
   456             utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
   457             regexp->fText = inputChars;
   458             regexp->fOwnsText = TRUE; // should already be set but just in case
   459         }
   460     }
   462     if (textLength != NULL) {
   463         *textLength = regexp->fTextLength;
   464     }
   465     return regexp->fText;
   466 }
   469 //------------------------------------------------------------------------------
   470 //
   471 //    uregex_getUText
   472 //
   473 //------------------------------------------------------------------------------
   474 U_CAPI UText * U_EXPORT2 
   475 uregex_getUText(URegularExpression *regexp2,
   476                 UText              *dest,
   477                 UErrorCode         *status)  {
   478     RegularExpression *regexp = (RegularExpression*)regexp2;
   479     if (validateRE(regexp, FALSE, status) == FALSE) {
   480         return dest;
   481     }
   482     return regexp->fMatcher->getInput(dest, *status);
   483 }
   486 //------------------------------------------------------------------------------
   487 //
   488 //    uregex_refreshUText
   489 //
   490 //------------------------------------------------------------------------------
   491 U_CAPI void U_EXPORT2 
   492 uregex_refreshUText(URegularExpression *regexp2,
   493                     UText              *text,
   494                     UErrorCode         *status) {
   495     RegularExpression *regexp = (RegularExpression*)regexp2;
   496     if (validateRE(regexp, FALSE, status) == FALSE) {
   497         return;
   498     }
   499     regexp->fMatcher->refreshInputText(text, *status);
   500 }
   503 //------------------------------------------------------------------------------
   504 //
   505 //    uregex_matches
   506 //
   507 //------------------------------------------------------------------------------
   508 U_CAPI UBool U_EXPORT2 
   509 uregex_matches(URegularExpression *regexp2,
   510                int32_t            startIndex,
   511                UErrorCode        *status)  {
   512     return uregex_matches64( regexp2, (int64_t)startIndex, status);
   513 }
   515 U_CAPI UBool U_EXPORT2 
   516 uregex_matches64(URegularExpression *regexp2,
   517                  int64_t            startIndex,
   518                  UErrorCode        *status)  {
   519     RegularExpression *regexp = (RegularExpression*)regexp2;
   520     UBool result = FALSE;
   521     if (validateRE(regexp, TRUE, status) == FALSE) {
   522         return result;
   523     }
   524     if (startIndex == -1) {
   525         result = regexp->fMatcher->matches(*status);
   526     } else {
   527         result = regexp->fMatcher->matches(startIndex, *status);
   528     }
   529     return result;
   530 }
   533 //------------------------------------------------------------------------------
   534 //
   535 //    uregex_lookingAt
   536 //
   537 //------------------------------------------------------------------------------
   538 U_CAPI UBool U_EXPORT2 
   539 uregex_lookingAt(URegularExpression *regexp2,
   540                  int32_t             startIndex,
   541                  UErrorCode         *status)  {
   542     return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
   543 }
   545 U_CAPI UBool U_EXPORT2 
   546 uregex_lookingAt64(URegularExpression *regexp2,
   547                    int64_t             startIndex,
   548                    UErrorCode         *status)  {
   549     RegularExpression *regexp = (RegularExpression*)regexp2;
   550     UBool result = FALSE;
   551     if (validateRE(regexp, TRUE, status) == FALSE) {
   552         return result;
   553     }
   554     if (startIndex == -1) {
   555         result = regexp->fMatcher->lookingAt(*status);
   556     } else {
   557         result = regexp->fMatcher->lookingAt(startIndex, *status);
   558     }
   559     return result;
   560 }
   564 //------------------------------------------------------------------------------
   565 //
   566 //    uregex_find
   567 //
   568 //------------------------------------------------------------------------------
   569 U_CAPI UBool U_EXPORT2 
   570 uregex_find(URegularExpression *regexp2,
   571             int32_t             startIndex, 
   572             UErrorCode         *status)  {
   573     return uregex_find64( regexp2, (int64_t)startIndex, status);
   574 }
   576 U_CAPI UBool U_EXPORT2 
   577 uregex_find64(URegularExpression *regexp2,
   578               int64_t             startIndex, 
   579               UErrorCode         *status)  {
   580     RegularExpression *regexp = (RegularExpression*)regexp2;
   581     UBool result = FALSE;
   582     if (validateRE(regexp, TRUE, status) == FALSE) {
   583         return result;
   584     }
   585     if (startIndex == -1) {
   586         regexp->fMatcher->resetPreserveRegion();
   587         result = regexp->fMatcher->find();
   588     } else {
   589         result = regexp->fMatcher->find(startIndex, *status);
   590     }
   591     return result;
   592 }
   595 //------------------------------------------------------------------------------
   596 //
   597 //    uregex_findNext
   598 //
   599 //------------------------------------------------------------------------------
   600 U_CAPI UBool U_EXPORT2 
   601 uregex_findNext(URegularExpression *regexp2,
   602                 UErrorCode         *status)  {
   603     RegularExpression *regexp = (RegularExpression*)regexp2;
   604     if (validateRE(regexp, TRUE, status) == FALSE) {
   605         return FALSE;
   606     }
   607     UBool result = regexp->fMatcher->find();
   608     return result;
   609 }
   611 //------------------------------------------------------------------------------
   612 //
   613 //    uregex_groupCount
   614 //
   615 //------------------------------------------------------------------------------
   616 U_CAPI int32_t U_EXPORT2 
   617 uregex_groupCount(URegularExpression *regexp2,
   618                   UErrorCode         *status)  {
   619     RegularExpression *regexp = (RegularExpression*)regexp2;
   620     if (validateRE(regexp, FALSE, status) == FALSE) {
   621         return 0;
   622     }
   623     int32_t  result = regexp->fMatcher->groupCount();
   624     return result;
   625 }
   628 //------------------------------------------------------------------------------
   629 //
   630 //    uregex_group
   631 //
   632 //------------------------------------------------------------------------------
   633 U_CAPI int32_t U_EXPORT2 
   634 uregex_group(URegularExpression *regexp2,
   635              int32_t             groupNum,
   636              UChar              *dest,
   637              int32_t             destCapacity,
   638              UErrorCode          *status)  {
   639     RegularExpression *regexp = (RegularExpression*)regexp2;
   640     if (validateRE(regexp, TRUE, status) == FALSE) {
   641         return 0;
   642     }
   643     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
   644         *status = U_ILLEGAL_ARGUMENT_ERROR;
   645         return 0;
   646     }
   648     if (destCapacity == 0 || regexp->fText != NULL) {
   649         // If preflighting or if we already have the text as UChars,
   650         // this is a little cheaper than going through uregex_groupUTextDeep()
   652         //
   653         // Pick up the range of characters from the matcher
   654         //
   655         int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
   656         int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
   657         if (U_FAILURE(*status)) {
   658             return 0;
   659         }
   661         //
   662         // Trim length based on buffer capacity
   663         // 
   664         int32_t fullLength = endIx - startIx;
   665         int32_t copyLength = fullLength;
   666         if (copyLength < destCapacity) {
   667             dest[copyLength] = 0;
   668         } else if (copyLength == destCapacity) {
   669             *status = U_STRING_NOT_TERMINATED_WARNING;
   670         } else {
   671             copyLength = destCapacity;
   672             *status = U_BUFFER_OVERFLOW_ERROR;
   673         }
   675         //
   676         // Copy capture group to user's buffer
   677         //
   678         if (copyLength > 0) {
   679             u_memcpy(dest, &regexp->fText[startIx], copyLength);
   680         }
   681         return fullLength;
   682     } else {
   683         UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
   684         int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
   685         utext_close(groupText);
   686         return result;
   687     }
   688 }
   691 //------------------------------------------------------------------------------
   692 //
   693 //    uregex_groupUText
   694 //
   695 //------------------------------------------------------------------------------
   696 U_CAPI UText * U_EXPORT2 
   697 uregex_groupUText(URegularExpression *regexp2,
   698                   int32_t             groupNum,
   699                   UText              *dest,
   700                   int64_t            *groupLength,
   701                   UErrorCode         *status)  {
   702     RegularExpression *regexp = (RegularExpression*)regexp2;
   703     if (validateRE(regexp, TRUE, status) == FALSE) {
   704         UErrorCode emptyTextStatus = U_ZERO_ERROR;
   705         return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
   706     }
   708     return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
   709 }
   711 //------------------------------------------------------------------------------
   712 //
   713 //    uregex_groupUTextDeep
   714 //
   715 //------------------------------------------------------------------------------
   716 U_CAPI UText * U_EXPORT2 
   717 uregex_groupUTextDeep(URegularExpression *regexp2,
   718                   int32_t             groupNum,
   719                   UText              *dest,
   720                   UErrorCode         *status)  {
   721     RegularExpression *regexp = (RegularExpression*)regexp2;
   722     if (validateRE(regexp, TRUE, status) == FALSE) {
   723         UErrorCode emptyTextStatus = U_ZERO_ERROR;
   724         return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
   725     }
   727     if (regexp->fText != NULL) {
   728         //
   729         // Pick up the range of characters from the matcher
   730         // and use our already-extracted characters
   731         //
   732         int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
   733         int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
   734         if (U_FAILURE(*status)) {
   735             UErrorCode emptyTextStatus = U_ZERO_ERROR;
   736             return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
   737         }
   739         if (dest) {
   740             utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status);
   741         } else {
   742             UText groupText = UTEXT_INITIALIZER;
   743             utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status);
   744             dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
   745             utext_close(&groupText);
   746         }
   748         return dest;
   749     } else {
   750         return regexp->fMatcher->group(groupNum, dest, *status);
   751     }
   752 }
   754 //------------------------------------------------------------------------------
   755 //
   756 //    uregex_start
   757 //
   758 //------------------------------------------------------------------------------
   759 U_CAPI int32_t U_EXPORT2 
   760 uregex_start(URegularExpression *regexp2,
   761              int32_t             groupNum,
   762              UErrorCode          *status)  {
   763     return (int32_t)uregex_start64( regexp2, groupNum, status);
   764 }
   766 U_CAPI int64_t U_EXPORT2 
   767 uregex_start64(URegularExpression *regexp2,
   768                int32_t             groupNum,
   769                UErrorCode          *status)  {
   770     RegularExpression *regexp = (RegularExpression*)regexp2;
   771     if (validateRE(regexp, TRUE, status) == FALSE) {
   772         return 0;
   773     }
   774     int32_t result = regexp->fMatcher->start(groupNum, *status);
   775     return result;
   776 }
   778 //------------------------------------------------------------------------------
   779 //
   780 //    uregex_end
   781 //
   782 //------------------------------------------------------------------------------
   783 U_CAPI int32_t U_EXPORT2 
   784 uregex_end(URegularExpression   *regexp2,
   785            int32_t               groupNum,
   786            UErrorCode           *status)  {
   787     return (int32_t)uregex_end64( regexp2, groupNum, status);
   788 }
   790 U_CAPI int64_t U_EXPORT2 
   791 uregex_end64(URegularExpression   *regexp2,
   792              int32_t               groupNum,
   793              UErrorCode           *status)  {
   794     RegularExpression *regexp = (RegularExpression*)regexp2;
   795     if (validateRE(regexp, TRUE, status) == FALSE) {
   796         return 0;
   797     }
   798     int32_t result = regexp->fMatcher->end(groupNum, *status);
   799     return result;
   800 }
   802 //------------------------------------------------------------------------------
   803 //
   804 //    uregex_reset
   805 //
   806 //------------------------------------------------------------------------------
   807 U_CAPI void U_EXPORT2 
   808 uregex_reset(URegularExpression    *regexp2,
   809              int32_t               index,
   810              UErrorCode            *status)  {
   811     uregex_reset64( regexp2, (int64_t)index, status);
   812 }
   814 U_CAPI void U_EXPORT2 
   815 uregex_reset64(URegularExpression    *regexp2,
   816                int64_t               index,
   817                UErrorCode            *status)  {
   818     RegularExpression *regexp = (RegularExpression*)regexp2;
   819     if (validateRE(regexp, TRUE, status) == FALSE) {
   820         return;
   821     }
   822     regexp->fMatcher->reset(index, *status);
   823 }
   826 //------------------------------------------------------------------------------
   827 //
   828 //    uregex_setRegion
   829 //
   830 //------------------------------------------------------------------------------
   831 U_CAPI void U_EXPORT2 
   832 uregex_setRegion(URegularExpression   *regexp2,
   833                  int32_t               regionStart,
   834                  int32_t               regionLimit,
   835                  UErrorCode           *status)  {
   836     uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
   837 }
   839 U_CAPI void U_EXPORT2 
   840 uregex_setRegion64(URegularExpression   *regexp2,
   841                    int64_t               regionStart,
   842                    int64_t               regionLimit,
   843                    UErrorCode           *status)  {
   844     RegularExpression *regexp = (RegularExpression*)regexp2;
   845     if (validateRE(regexp, TRUE, status) == FALSE) {
   846         return;
   847     }
   848     regexp->fMatcher->region(regionStart, regionLimit, *status);
   849 }
   852 //------------------------------------------------------------------------------
   853 //
   854 //    uregex_setRegionAndStart
   855 //
   856 //------------------------------------------------------------------------------
   857 U_CAPI void U_EXPORT2 
   858 uregex_setRegionAndStart(URegularExpression   *regexp2,
   859                  int64_t               regionStart,
   860                  int64_t               regionLimit,
   861                  int64_t               startIndex,
   862                  UErrorCode           *status)  {
   863     RegularExpression *regexp = (RegularExpression*)regexp2;
   864     if (validateRE(regexp, TRUE, status) == FALSE) {
   865         return;
   866     }
   867     regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status);
   868 }
   870 //------------------------------------------------------------------------------
   871 //
   872 //    uregex_regionStart
   873 //
   874 //------------------------------------------------------------------------------
   875 U_CAPI int32_t U_EXPORT2 
   876 uregex_regionStart(const  URegularExpression   *regexp2,
   877                           UErrorCode           *status)  {
   878     return (int32_t)uregex_regionStart64(regexp2, status);
   879 }
   881 U_CAPI int64_t U_EXPORT2 
   882 uregex_regionStart64(const  URegularExpression   *regexp2,
   883                             UErrorCode           *status)  {
   884     RegularExpression *regexp = (RegularExpression*)regexp2;
   885     if (validateRE(regexp, TRUE, status) == FALSE) {
   886         return 0;
   887     }
   888     return regexp->fMatcher->regionStart();
   889 }
   892 //------------------------------------------------------------------------------
   893 //
   894 //    uregex_regionEnd
   895 //
   896 //------------------------------------------------------------------------------
   897 U_CAPI int32_t U_EXPORT2 
   898 uregex_regionEnd(const  URegularExpression   *regexp2,
   899                         UErrorCode           *status)  {
   900     return (int32_t)uregex_regionEnd64(regexp2, status);
   901 }
   903 U_CAPI int64_t U_EXPORT2 
   904 uregex_regionEnd64(const  URegularExpression   *regexp2,
   905                           UErrorCode           *status)  {
   906     RegularExpression *regexp = (RegularExpression*)regexp2;
   907     if (validateRE(regexp, TRUE, status) == FALSE) {
   908         return 0;
   909     }
   910     return regexp->fMatcher->regionEnd();
   911 }
   914 //------------------------------------------------------------------------------
   915 //
   916 //    uregex_hasTransparentBounds
   917 //
   918 //------------------------------------------------------------------------------
   919 U_CAPI UBool U_EXPORT2 
   920 uregex_hasTransparentBounds(const  URegularExpression   *regexp2,
   921                                    UErrorCode           *status)  {
   922     RegularExpression *regexp = (RegularExpression*)regexp2;
   923     if (validateRE(regexp, FALSE, status) == FALSE) {
   924         return FALSE;
   925     }
   926     return regexp->fMatcher->hasTransparentBounds();
   927 }
   930 //------------------------------------------------------------------------------
   931 //
   932 //    uregex_useTransparentBounds
   933 //
   934 //------------------------------------------------------------------------------
   935 U_CAPI void U_EXPORT2 
   936 uregex_useTransparentBounds(URegularExpression    *regexp2,
   937                             UBool                  b,
   938                             UErrorCode            *status)  {
   939     RegularExpression *regexp = (RegularExpression*)regexp2;
   940     if (validateRE(regexp, FALSE, status) == FALSE) {
   941         return;
   942     }
   943     regexp->fMatcher->useTransparentBounds(b);
   944 }
   947 //------------------------------------------------------------------------------
   948 //
   949 //    uregex_hasAnchoringBounds
   950 //
   951 //------------------------------------------------------------------------------
   952 U_CAPI UBool U_EXPORT2 
   953 uregex_hasAnchoringBounds(const  URegularExpression   *regexp2,
   954                                  UErrorCode           *status)  {
   955     RegularExpression *regexp = (RegularExpression*)regexp2;
   956     if (validateRE(regexp, FALSE, status) == FALSE) {
   957         return FALSE;
   958     }
   959     return regexp->fMatcher->hasAnchoringBounds();
   960 }
   963 //------------------------------------------------------------------------------
   964 //
   965 //    uregex_useAnchoringBounds
   966 //
   967 //------------------------------------------------------------------------------
   968 U_CAPI void U_EXPORT2 
   969 uregex_useAnchoringBounds(URegularExpression    *regexp2,
   970                           UBool                  b,
   971                           UErrorCode            *status)  {
   972     RegularExpression *regexp = (RegularExpression*)regexp2;
   973     if (validateRE(regexp, FALSE, status) == FALSE) {
   974         return;
   975     }
   976     regexp->fMatcher->useAnchoringBounds(b);
   977 }
   980 //------------------------------------------------------------------------------
   981 //
   982 //    uregex_hitEnd
   983 //
   984 //------------------------------------------------------------------------------
   985 U_CAPI UBool U_EXPORT2 
   986 uregex_hitEnd(const  URegularExpression   *regexp2,
   987                      UErrorCode           *status)  {
   988     RegularExpression *regexp = (RegularExpression*)regexp2;
   989     if (validateRE(regexp, TRUE, status) == FALSE) {
   990         return FALSE;
   991     }
   992     return regexp->fMatcher->hitEnd();
   993 }
   996 //------------------------------------------------------------------------------
   997 //
   998 //    uregex_requireEnd
   999 //
  1000 //------------------------------------------------------------------------------
  1001 U_CAPI UBool U_EXPORT2 
  1002 uregex_requireEnd(const  URegularExpression   *regexp2,
  1003                          UErrorCode           *status)  {
  1004     RegularExpression *regexp = (RegularExpression*)regexp2;
  1005     if (validateRE(regexp, TRUE, status) == FALSE) {
  1006         return FALSE;
  1008     return regexp->fMatcher->requireEnd();
  1012 //------------------------------------------------------------------------------
  1013 //
  1014 //    uregex_setTimeLimit
  1015 //
  1016 //------------------------------------------------------------------------------
  1017 U_CAPI void U_EXPORT2 
  1018 uregex_setTimeLimit(URegularExpression   *regexp2,
  1019                     int32_t               limit,
  1020                     UErrorCode           *status) {
  1021     RegularExpression *regexp = (RegularExpression*)regexp2;
  1022     if (validateRE(regexp, FALSE, status)) {
  1023         regexp->fMatcher->setTimeLimit(limit, *status);
  1029 //------------------------------------------------------------------------------
  1030 //
  1031 //    uregex_getTimeLimit
  1032 //
  1033 //------------------------------------------------------------------------------
  1034 U_CAPI int32_t U_EXPORT2 
  1035 uregex_getTimeLimit(const  URegularExpression   *regexp2,
  1036                            UErrorCode           *status) {
  1037     int32_t retVal = 0;
  1038     RegularExpression *regexp = (RegularExpression*)regexp2;
  1039     if (validateRE(regexp, FALSE, status)) {
  1040         retVal = regexp->fMatcher->getTimeLimit();
  1042     return retVal;
  1047 //------------------------------------------------------------------------------
  1048 //
  1049 //    uregex_setStackLimit
  1050 //
  1051 //------------------------------------------------------------------------------
  1052 U_CAPI void U_EXPORT2 
  1053 uregex_setStackLimit(URegularExpression   *regexp2,
  1054                      int32_t               limit,
  1055                      UErrorCode           *status) {
  1056     RegularExpression *regexp = (RegularExpression*)regexp2;
  1057     if (validateRE(regexp, FALSE, status)) {
  1058         regexp->fMatcher->setStackLimit(limit, *status);
  1064 //------------------------------------------------------------------------------
  1065 //
  1066 //    uregex_getStackLimit
  1067 //
  1068 //------------------------------------------------------------------------------
  1069 U_CAPI int32_t U_EXPORT2 
  1070 uregex_getStackLimit(const  URegularExpression   *regexp2,
  1071                             UErrorCode           *status) {
  1072     int32_t retVal = 0;
  1073     RegularExpression *regexp = (RegularExpression*)regexp2;
  1074     if (validateRE(regexp, FALSE, status)) {
  1075         retVal = regexp->fMatcher->getStackLimit();
  1077     return retVal;
  1081 //------------------------------------------------------------------------------
  1082 //
  1083 //    uregex_setMatchCallback
  1084 //
  1085 //------------------------------------------------------------------------------
  1086 U_CAPI void U_EXPORT2
  1087 uregex_setMatchCallback(URegularExpression      *regexp2,
  1088                         URegexMatchCallback     *callback,
  1089                         const void              *context,
  1090                         UErrorCode              *status) {
  1091     RegularExpression *regexp = (RegularExpression*)regexp2;
  1092     if (validateRE(regexp, FALSE, status)) {
  1093         regexp->fMatcher->setMatchCallback(callback, context, *status);
  1098 //------------------------------------------------------------------------------
  1099 //
  1100 //    uregex_getMatchCallback
  1101 //
  1102 //------------------------------------------------------------------------------
  1103 U_CAPI void U_EXPORT2 
  1104 uregex_getMatchCallback(const URegularExpression    *regexp2,
  1105                         URegexMatchCallback        **callback,
  1106                         const void                 **context,
  1107                         UErrorCode                  *status) {
  1108     RegularExpression *regexp = (RegularExpression*)regexp2;
  1109      if (validateRE(regexp, FALSE, status)) {
  1110          regexp->fMatcher->getMatchCallback(*callback, *context, *status);
  1115 //------------------------------------------------------------------------------
  1116 //
  1117 //    uregex_setMatchProgressCallback
  1118 //
  1119 //------------------------------------------------------------------------------
  1120 U_CAPI void U_EXPORT2
  1121 uregex_setFindProgressCallback(URegularExpression              *regexp2,
  1122                                 URegexFindProgressCallback      *callback,
  1123                                 const void                      *context,
  1124                                 UErrorCode                      *status) {
  1125     RegularExpression *regexp = (RegularExpression*)regexp2;
  1126     if (validateRE(regexp, FALSE, status)) {
  1127         regexp->fMatcher->setFindProgressCallback(callback, context, *status);
  1132 //------------------------------------------------------------------------------
  1133 //
  1134 //    uregex_getMatchCallback
  1135 //
  1136 //------------------------------------------------------------------------------
  1137 U_CAPI void U_EXPORT2 
  1138 uregex_getFindProgressCallback(const URegularExpression          *regexp2,
  1139                                 URegexFindProgressCallback        **callback,
  1140                                 const void                        **context,
  1141                                 UErrorCode                        *status) {
  1142     RegularExpression *regexp = (RegularExpression*)regexp2;
  1143      if (validateRE(regexp, FALSE, status)) {
  1144          regexp->fMatcher->getFindProgressCallback(*callback, *context, *status);
  1149 //------------------------------------------------------------------------------
  1150 //
  1151 //    uregex_replaceAll
  1152 //
  1153 //------------------------------------------------------------------------------
  1154 U_CAPI int32_t U_EXPORT2 
  1155 uregex_replaceAll(URegularExpression    *regexp2,
  1156                   const UChar           *replacementText,
  1157                   int32_t                replacementLength,
  1158                   UChar                 *destBuf,
  1159                   int32_t                destCapacity,
  1160                   UErrorCode            *status)  {
  1161     RegularExpression *regexp = (RegularExpression*)regexp2;
  1162     if (validateRE(regexp, TRUE, status) == FALSE) {
  1163         return 0;
  1165     if (replacementText == NULL || replacementLength < -1 ||
  1166         (destBuf == NULL && destCapacity > 0) ||
  1167         destCapacity < 0) {
  1168         *status = U_ILLEGAL_ARGUMENT_ERROR;
  1169         return 0;
  1172     int32_t   len = 0;
  1174     uregex_reset(regexp2, 0, status);
  1176     // Note: Seperate error code variables for findNext() and appendReplacement()
  1177     //       are used so that destination buffer overflow errors
  1178     //       in appendReplacement won't stop findNext() from working.
  1179     //       appendReplacement() and appendTail() special case incoming buffer
  1180     //       overflow errors, continuing to return the correct length.
  1181     UErrorCode  findStatus = *status;
  1182     while (uregex_findNext(regexp2, &findStatus)) {
  1183         len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
  1184                                         &destBuf, &destCapacity, status);
  1186     len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
  1188     if (U_FAILURE(findStatus)) {
  1189         // If anything went wrong with the findNext(), make that error trump
  1190         //   whatever may have happened with the append() operations.
  1191         //   Errors in findNext() are not expected.
  1192         *status = findStatus;
  1195     return len;
  1199 //------------------------------------------------------------------------------
  1200 //
  1201 //    uregex_replaceAllUText
  1202 //
  1203 //------------------------------------------------------------------------------
  1204 U_CAPI UText * U_EXPORT2 
  1205 uregex_replaceAllUText(URegularExpression    *regexp2,
  1206                        UText                 *replacementText,
  1207                        UText                 *dest,
  1208                        UErrorCode            *status)  {
  1209     RegularExpression *regexp = (RegularExpression*)regexp2;
  1210     if (validateRE(regexp, TRUE, status) == FALSE) {
  1211         return 0;
  1213     if (replacementText == NULL) {
  1214         *status = U_ILLEGAL_ARGUMENT_ERROR;
  1215         return 0;
  1218     dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
  1219     return dest;
  1223 //------------------------------------------------------------------------------
  1224 //
  1225 //    uregex_replaceFirst
  1226 //
  1227 //------------------------------------------------------------------------------
  1228 U_CAPI int32_t U_EXPORT2 
  1229 uregex_replaceFirst(URegularExpression  *regexp2,
  1230                     const UChar         *replacementText,
  1231                     int32_t              replacementLength,
  1232                     UChar               *destBuf,
  1233                     int32_t              destCapacity,
  1234                     UErrorCode          *status)  {
  1235     RegularExpression *regexp = (RegularExpression*)regexp2;
  1236     if (validateRE(regexp, TRUE, status) == FALSE) {
  1237         return 0;
  1239     if (replacementText == NULL || replacementLength < -1 ||
  1240         (destBuf == NULL && destCapacity > 0) ||
  1241         destCapacity < 0) {
  1242         *status = U_ILLEGAL_ARGUMENT_ERROR;
  1243         return 0;
  1246     int32_t   len = 0;
  1247     UBool     findSucceeded;
  1248     uregex_reset(regexp2, 0, status);
  1249     findSucceeded = uregex_find(regexp2, 0, status);
  1250     if (findSucceeded) {
  1251         len = uregex_appendReplacement(regexp2, replacementText, replacementLength, 
  1252                                        &destBuf, &destCapacity, status);
  1254     len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
  1256     return len;
  1260 //------------------------------------------------------------------------------
  1261 //
  1262 //    uregex_replaceFirstUText
  1263 //
  1264 //------------------------------------------------------------------------------
  1265 U_CAPI UText * U_EXPORT2 
  1266 uregex_replaceFirstUText(URegularExpression  *regexp2,
  1267                          UText                 *replacementText,
  1268                          UText                 *dest,
  1269                          UErrorCode            *status)  {
  1270     RegularExpression *regexp = (RegularExpression*)regexp2;
  1271     if (validateRE(regexp, TRUE, status) == FALSE) {
  1272         return 0;
  1274     if (replacementText == NULL) {
  1275         *status = U_ILLEGAL_ARGUMENT_ERROR;
  1276         return 0;
  1279     dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
  1280     return dest;
  1284 //------------------------------------------------------------------------------
  1285 //
  1286 //    uregex_appendReplacement
  1287 //
  1288 //------------------------------------------------------------------------------
  1290 U_NAMESPACE_BEGIN
  1291 //
  1292 //  Dummy class, because these functions need to be friends of class RegexMatcher,
  1293 //               and stand-alone C functions don't work as friends
  1294 //
  1295 class RegexCImpl {
  1296  public:
  1297    inline static  int32_t appendReplacement(RegularExpression    *regexp,
  1298                       const UChar           *replacementText,
  1299                       int32_t                replacementLength,
  1300                       UChar                **destBuf,
  1301                       int32_t               *destCapacity,
  1302                       UErrorCode            *status);
  1304    inline static int32_t appendTail(RegularExpression    *regexp,
  1305         UChar                **destBuf,
  1306         int32_t               *destCapacity,
  1307         UErrorCode            *status);
  1309     inline static int32_t split(RegularExpression    *regexp,
  1310         UChar                 *destBuf,
  1311         int32_t                destCapacity,
  1312         int32_t               *requiredCapacity,
  1313         UChar                 *destFields[],
  1314         int32_t                destFieldsCapacity,
  1315         UErrorCode            *status);
  1316 };
  1318 U_NAMESPACE_END
  1322 static const UChar BACKSLASH  = 0x5c;
  1323 static const UChar DOLLARSIGN = 0x24;
  1325 //
  1326 //  Move a character to an output buffer, with bounds checking on the index.
  1327 //      Index advances even if capacity is exceeded, for preflight size computations.
  1328 //      This little sequence is used a LOT.
  1329 //
  1330 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
  1331     if (*idx < bufCapacity) {
  1332         buf[*idx] = c;
  1334     (*idx)++;
  1338 //
  1339 //  appendReplacement, the actual implementation.
  1340 //
  1341 int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
  1342                                       const UChar           *replacementText,
  1343                                       int32_t                replacementLength,
  1344                                       UChar                **destBuf,
  1345                                       int32_t               *destCapacity,
  1346                                       UErrorCode            *status)  {
  1348     // If we come in with a buffer overflow error, don't suppress the operation.
  1349     //  A series of appendReplacements, appendTail need to correctly preflight
  1350     //  the buffer size when an overflow happens somewhere in the middle.
  1351     UBool pendingBufferOverflow = FALSE;
  1352     if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
  1353         pendingBufferOverflow = TRUE;
  1354         *status = U_ZERO_ERROR;
  1357     //
  1358     // Validate all paramters
  1359     //
  1360     if (validateRE(regexp, TRUE, status) == FALSE) {
  1361         return 0;
  1363     if (replacementText == NULL || replacementLength < -1 ||
  1364         destCapacity == NULL || destBuf == NULL || 
  1365         (*destBuf == NULL && *destCapacity > 0) ||
  1366         *destCapacity < 0) {
  1367         *status = U_ILLEGAL_ARGUMENT_ERROR;
  1368         return 0;
  1371     RegexMatcher *m = regexp->fMatcher;
  1372     if (m->fMatch == FALSE) {
  1373         *status = U_REGEX_INVALID_STATE;
  1374         return 0;
  1377     UChar    *dest             = *destBuf;
  1378     int32_t   capacity         = *destCapacity;
  1379     int32_t   destIdx          =  0;
  1380     int32_t   i;
  1382     // If it wasn't supplied by the caller,  get the length of the replacement text.
  1383     //   TODO:  slightly smarter logic in the copy loop could watch for the NUL on
  1384     //          the fly and avoid this step.
  1385     if (replacementLength == -1) {
  1386         replacementLength = u_strlen(replacementText);
  1389     // Copy input string from the end of previous match to start of current match
  1390     if (regexp->fText != NULL) {
  1391         int32_t matchStart;
  1392         int32_t lastMatchEnd;
  1393         if (UTEXT_USES_U16(m->fInputText)) {
  1394             lastMatchEnd = (int32_t)m->fLastMatchEnd;
  1395             matchStart = (int32_t)m->fMatchStart;
  1396         } else {
  1397             // !!!: Would like a better way to do this!
  1398             UErrorCode status = U_ZERO_ERROR;
  1399             lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
  1400             status = U_ZERO_ERROR;
  1401             matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
  1403         for (i=lastMatchEnd; i<matchStart; i++) {
  1404             appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
  1406     } else {
  1407         UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
  1408         destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
  1409                                  dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity),
  1410                                  &possibleOverflowError);
  1412     U_ASSERT(destIdx >= 0);
  1414     // scan the replacement text, looking for substitutions ($n) and \escapes.
  1415     int32_t  replIdx = 0;
  1416     while (replIdx < replacementLength) {
  1417         UChar  c = replacementText[replIdx];
  1418         replIdx++;
  1419         if (c != DOLLARSIGN && c != BACKSLASH) {
  1420             // Common case, no substitution, no escaping, 
  1421             //  just copy the char to the dest buf.
  1422             appendToBuf(c, &destIdx, dest, capacity);
  1423             continue;
  1426         if (c == BACKSLASH) {
  1427             // Backslash Escape.  Copy the following char out without further checks.
  1428             //                    Note:  Surrogate pairs don't need any special handling
  1429             //                           The second half wont be a '$' or a '\', and
  1430             //                           will move to the dest normally on the next
  1431             //                           loop iteration.
  1432             if (replIdx >= replacementLength) {
  1433                 break;
  1435             c = replacementText[replIdx];
  1437             if (c==0x55/*U*/ || c==0x75/*u*/) {
  1438                 // We have a \udddd or \Udddddddd escape sequence.
  1439                 UChar32 escapedChar = 
  1440                     u_unescapeAt(uregex_ucstr_unescape_charAt,
  1441                        &replIdx,                   // Index is updated by unescapeAt 
  1442                        replacementLength,          // Length of replacement text
  1443                        (void *)replacementText);
  1445                 if (escapedChar != (UChar32)0xFFFFFFFF) {
  1446                     if (escapedChar <= 0xffff) {
  1447                         appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
  1448                     } else {
  1449                         appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
  1450                         appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
  1452                     continue;
  1454                 // Note:  if the \u escape was invalid, just fall through and
  1455                 //        treat it as a plain \<anything> escape.
  1458             // Plain backslash escape.  Just put out the escaped character.
  1459             appendToBuf(c, &destIdx, dest, capacity);
  1461             replIdx++;
  1462             continue;
  1467         // We've got a $.  Pick up a capture group number if one follows.
  1468         // Consume at most the number of digits necessary for the largest capture
  1469         // number that is valid for this pattern.
  1471         int32_t numDigits = 0;
  1472         int32_t groupNum  = 0;
  1473         UChar32 digitC;
  1474         for (;;) {
  1475             if (replIdx >= replacementLength) {
  1476                 break;
  1478             U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
  1479             if (u_isdigit(digitC) == FALSE) {
  1480                 break;
  1483             U16_FWD_1(replacementText, replIdx, replacementLength);
  1484             groupNum=groupNum*10 + u_charDigitValue(digitC);
  1485             numDigits++;
  1486             if (numDigits >= m->fPattern->fMaxCaptureDigits) {
  1487                 break;
  1492         if (numDigits == 0) {
  1493             // The $ didn't introduce a group number at all.
  1494             // Treat it as just part of the substitution text.
  1495             appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
  1496             continue;
  1499         // Finally, append the capture group data to the destination.
  1500         destIdx += uregex_group((URegularExpression*)regexp, groupNum,
  1501                                 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
  1502         if (*status == U_BUFFER_OVERFLOW_ERROR) {
  1503             // Ignore buffer overflow when extracting the group.  We need to
  1504             //   continue on to get full size of the untruncated result.  We will
  1505             //   raise our own buffer overflow error at the end.
  1506             *status = U_ZERO_ERROR;
  1509         if (U_FAILURE(*status)) {
  1510             // Can fail if group number is out of range.
  1511             break;
  1516     //
  1517     //  Nul Terminate the dest buffer if possible.
  1518     //  Set the appropriate buffer overflow or not terminated error, if needed.
  1519     //
  1520     if (destIdx < capacity) {
  1521         dest[destIdx] = 0;
  1522     } else if (destIdx == *destCapacity) {
  1523         *status = U_STRING_NOT_TERMINATED_WARNING;
  1524     } else {
  1525         *status = U_BUFFER_OVERFLOW_ERROR;
  1528     //
  1529     // Return an updated dest buffer and capacity to the caller.
  1530     //
  1531     if (destIdx > 0 &&  *destCapacity > 0) {
  1532         if (destIdx < capacity) {
  1533             *destBuf      += destIdx;
  1534             *destCapacity -= destIdx;
  1535         } else {
  1536             *destBuf      += capacity;
  1537             *destCapacity =  0;
  1541     // If we came in with a buffer overflow, make sure we go out with one also.
  1542     //   (A zero length match right at the end of the previous match could
  1543     //    make this function succeed even though a previous call had overflowed the buf)
  1544     if (pendingBufferOverflow && U_SUCCESS(*status)) {
  1545         *status = U_BUFFER_OVERFLOW_ERROR;
  1548     return destIdx;
  1551 //
  1552 //   appendReplacement   the actual API function,
  1553 //
  1554 U_CAPI int32_t U_EXPORT2 
  1555 uregex_appendReplacement(URegularExpression    *regexp2,
  1556                          const UChar           *replacementText,
  1557                          int32_t                replacementLength,
  1558                          UChar                **destBuf,
  1559                          int32_t               *destCapacity,
  1560                          UErrorCode            *status) {
  1562     RegularExpression *regexp = (RegularExpression*)regexp2;
  1563     return RegexCImpl::appendReplacement(
  1564         regexp, replacementText, replacementLength,destBuf, destCapacity, status);
  1567 //
  1568 //   uregex_appendReplacementUText...can just use the normal C++ method
  1569 //
  1570 U_CAPI void U_EXPORT2 
  1571 uregex_appendReplacementUText(URegularExpression    *regexp2,
  1572                               UText                 *replText,
  1573                               UText                 *dest,
  1574                               UErrorCode            *status)  {
  1575     RegularExpression *regexp = (RegularExpression*)regexp2;
  1576     regexp->fMatcher->appendReplacement(dest, replText, *status);
  1580 //------------------------------------------------------------------------------
  1581 //
  1582 //    uregex_appendTail
  1583 //
  1584 //------------------------------------------------------------------------------
  1585 int32_t RegexCImpl::appendTail(RegularExpression    *regexp,
  1586                                UChar                **destBuf,
  1587                                int32_t               *destCapacity,
  1588                                UErrorCode            *status)
  1591     // If we come in with a buffer overflow error, don't suppress the operation.
  1592     //  A series of appendReplacements, appendTail need to correctly preflight
  1593     //  the buffer size when an overflow happens somewhere in the middle.
  1594     UBool pendingBufferOverflow = FALSE;
  1595     if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
  1596         pendingBufferOverflow = TRUE;
  1597         *status = U_ZERO_ERROR;
  1600     if (validateRE(regexp, TRUE, status) == FALSE) {
  1601         return 0;
  1604     if (destCapacity == NULL || destBuf == NULL || 
  1605         (*destBuf == NULL && *destCapacity > 0) ||
  1606         *destCapacity < 0)
  1608         *status = U_ILLEGAL_ARGUMENT_ERROR;
  1609         return 0;
  1612     RegexMatcher *m = regexp->fMatcher;
  1614     int32_t  destIdx     = 0;
  1615     int32_t  destCap     = *destCapacity;
  1616     UChar    *dest       = *destBuf;
  1618     if (regexp->fText != NULL) {
  1619         int32_t srcIdx;
  1620         int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
  1621         if (nativeIdx == -1) {
  1622             srcIdx = 0;
  1623         } else if (UTEXT_USES_U16(m->fInputText)) {
  1624             srcIdx = (int32_t)nativeIdx;
  1625         } else {
  1626             UErrorCode status = U_ZERO_ERROR;
  1627             srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
  1630         for (;;) {
  1631             U_ASSERT(destIdx >= 0);
  1633             if (srcIdx == regexp->fTextLength) {
  1634                 break;
  1636             UChar c = regexp->fText[srcIdx];
  1637             if (c == 0 && regexp->fTextLength == -1) {
  1638                 regexp->fTextLength = srcIdx;
  1639                 break;
  1642             if (destIdx < destCap) {
  1643                 dest[destIdx] = c;
  1644             } else {
  1645                 // We've overflowed the dest buffer.
  1646                 //  If the total input string length is known, we can
  1647                 //    compute the total buffer size needed without scanning through the string.
  1648                 if (regexp->fTextLength > 0) {
  1649                     destIdx += (regexp->fTextLength - srcIdx);
  1650                     break;
  1653             srcIdx++;
  1654             destIdx++;
  1656     } else {
  1657         int64_t  srcIdx;
  1658         if (m->fMatch) {
  1659             // The most recent call to find() succeeded.  
  1660             srcIdx = m->fMatchEnd;
  1661         } else {
  1662             // The last call to find() on this matcher failed().
  1663             //   Look back to the end of the last find() that succeeded for src index.
  1664             srcIdx = m->fLastMatchEnd;
  1665             if (srcIdx == -1)  {
  1666                 // There has been no successful match with this matcher.
  1667                 //   We want to copy the whole string.
  1668                 srcIdx = 0;
  1672         destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
  1675     //
  1676     //  NUL terminate the output string, if possible, otherwise issue the
  1677     //   appropriate error or warning.
  1678     //
  1679     if (destIdx < destCap) {
  1680         dest[destIdx] = 0;
  1681     } else  if (destIdx == destCap) {
  1682         *status = U_STRING_NOT_TERMINATED_WARNING;
  1683     } else {
  1684         *status = U_BUFFER_OVERFLOW_ERROR;
  1687     //
  1688     // Update the user's buffer ptr and capacity vars to reflect the
  1689     //   amount used.
  1690     //
  1691     if (destIdx < destCap) {
  1692         *destBuf      += destIdx;
  1693         *destCapacity -= destIdx;
  1694     } else if (*destBuf != NULL) {
  1695         *destBuf      += destCap;
  1696         *destCapacity  = 0;
  1699     if (pendingBufferOverflow && U_SUCCESS(*status)) {
  1700         *status = U_BUFFER_OVERFLOW_ERROR;
  1703     return destIdx;
  1707 //
  1708 //   appendTail   the actual API function
  1709 //
  1710 U_CAPI int32_t U_EXPORT2 
  1711 uregex_appendTail(URegularExpression    *regexp2,
  1712                   UChar                **destBuf,
  1713                   int32_t               *destCapacity,
  1714                   UErrorCode            *status)  {
  1715     RegularExpression *regexp = (RegularExpression*)regexp2;
  1716     return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
  1720 //
  1721 //   uregex_appendTailUText...can just use the normal C++ method
  1722 //
  1723 U_CAPI UText * U_EXPORT2 
  1724 uregex_appendTailUText(URegularExpression    *regexp2,
  1725                        UText                 *dest,
  1726                        UErrorCode            *status)  {
  1727     RegularExpression *regexp = (RegularExpression*)regexp2;
  1728     return regexp->fMatcher->appendTail(dest, *status);
  1732 //------------------------------------------------------------------------------
  1733 //
  1734 //    copyString     Internal utility to copy a string to an output buffer,
  1735 //                   while managing buffer overflow and preflight size
  1736 //                   computation.  NUL termination is added to destination,
  1737 //                   and the NUL is counted in the output size.
  1738 //
  1739 //------------------------------------------------------------------------------
  1740 #if 0
  1741 static void copyString(UChar        *destBuffer,    //  Destination buffer.
  1742                        int32_t       destCapacity,  //  Total capacity of dest buffer
  1743                        int32_t      *destIndex,     //  Index into dest buffer.  Updated on return.
  1744                                                     //    Update not clipped to destCapacity.
  1745                        const UChar  *srcPtr,        //  Pointer to source string
  1746                        int32_t       srcLen)        //  Source string len.
  1748     int32_t  si;
  1749     int32_t  di = *destIndex;
  1750     UChar    c;
  1752     for (si=0; si<srcLen;  si++) {
  1753         c = srcPtr[si];
  1754         if (di < destCapacity) {
  1755             destBuffer[di] = c;
  1756             di++;
  1757         } else {
  1758             di += srcLen - si;
  1759             break;
  1762     if (di<destCapacity) {
  1763         destBuffer[di] = 0;
  1765     di++;
  1766     *destIndex = di;
  1768 #endif
  1770 //------------------------------------------------------------------------------
  1771 //
  1772 //    uregex_split
  1773 //
  1774 //------------------------------------------------------------------------------
  1775 int32_t RegexCImpl::split(RegularExpression     *regexp,
  1776                           UChar                 *destBuf,
  1777                           int32_t                destCapacity,
  1778                           int32_t               *requiredCapacity,
  1779                           UChar                 *destFields[],
  1780                           int32_t                destFieldsCapacity,
  1781                           UErrorCode            *status) {
  1782     //
  1783     // Reset for the input text
  1784     //
  1785     regexp->fMatcher->reset();
  1786     UText *inputText = regexp->fMatcher->fInputText;
  1787     int64_t   nextOutputStringStart = 0;
  1788     int64_t   inputLen = regexp->fMatcher->fInputLength;
  1789     if (inputLen == 0) {
  1790         return 0;
  1793     //
  1794     // Loop through the input text, searching for the delimiter pattern
  1795     //
  1796     int32_t   i;             // Index of the field being processed.
  1797     int32_t   destIdx = 0;   // Next available position in destBuf;
  1798     int32_t   numCaptureGroups = regexp->fMatcher->groupCount();
  1799     UErrorCode  tStatus = U_ZERO_ERROR;   // Want to ignore any buffer overflow errors so that the strings are still counted
  1800     for (i=0; ; i++) {
  1801         if (i>=destFieldsCapacity-1) {
  1802             // There are one or zero output strings left.
  1803             // Fill the last output string with whatever is left from the input, then exit the loop.
  1804             //  ( i will be == destFieldsCapacity if we filled the output array while processing
  1805             //    capture groups of the delimiter expression, in which case we will discard the
  1806             //    last capture group saved in favor of the unprocessed remainder of the
  1807             //    input string.)
  1808             if (inputLen > nextOutputStringStart) {
  1809                 if (i != destFieldsCapacity-1) {
  1810                     // No fields are left.  Recycle the last one for holding the trailing part of
  1811                     //   the input string.
  1812                     i = destFieldsCapacity-1;
  1813                     destIdx = (int32_t)(destFields[i] - destFields[0]);
  1816                 destFields[i] = &destBuf[destIdx];
  1817                 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
  1818                                              &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
  1820             break;
  1823         if (regexp->fMatcher->find()) {
  1824             // We found another delimiter.  Move everything from where we started looking
  1825             //  up until the start of the delimiter into the next output string.
  1826             destFields[i] = &destBuf[destIdx];
  1828             destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
  1829                                          &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
  1830             if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
  1831                 tStatus = U_ZERO_ERROR;
  1832             } else {
  1833                 *status = tStatus;
  1835             nextOutputStringStart = regexp->fMatcher->fMatchEnd;
  1837             // If the delimiter pattern has capturing parentheses, the captured
  1838             //  text goes out into the next n destination strings.
  1839             int32_t groupNum;
  1840             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
  1841                 // If we've run out of output string slots, bail out.
  1842                 if (i==destFieldsCapacity-1) {
  1843                     break;
  1845                 i++;
  1847                 // Set up to extract the capture group contents into the dest buffer.
  1848                 destFields[i] = &destBuf[destIdx];
  1849                 tStatus = U_ZERO_ERROR;
  1850                 int32_t t = uregex_group((URegularExpression*)regexp, 
  1851                                          groupNum, 
  1852                                          destFields[i], 
  1853                                          REMAINING_CAPACITY(destIdx, destCapacity), 
  1854                                          &tStatus);
  1855                 destIdx += t + 1;    // Record the space used in the output string buffer.
  1856                                      //  +1 for the NUL that terminates the string.
  1857                 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
  1858                     tStatus = U_ZERO_ERROR;
  1859                 } else {
  1860                     *status = tStatus;
  1864             if (nextOutputStringStart == inputLen) {
  1865                 // The delimiter was at the end of the string. 
  1866                 // Output an empty string, and then we are done.
  1867                 if (destIdx < destCapacity) {
  1868                     destBuf[destIdx] = 0;
  1870                 if (i < destFieldsCapacity-1) {
  1871                    ++i;
  1873                 if (destIdx < destCapacity) {
  1874                     destFields[i] = destBuf + destIdx;
  1876                 ++destIdx;
  1877                 break;
  1881         else
  1883             // We ran off the end of the input while looking for the next delimiter.
  1884             // All the remaining text goes into the current output string.
  1885             destFields[i] = &destBuf[destIdx];
  1886             destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
  1887                                          &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
  1888             break;
  1892     // Zero out any unused portion of the destFields array
  1893     int j;
  1894     for (j=i+1; j<destFieldsCapacity; j++) {
  1895         destFields[j] = NULL;
  1898     if (requiredCapacity != NULL) {
  1899         *requiredCapacity = destIdx;
  1901     if (destIdx > destCapacity) {
  1902         *status = U_BUFFER_OVERFLOW_ERROR;
  1904     return i+1;
  1907 //
  1908 //   uregex_split   The actual API function
  1909 //
  1910 U_CAPI int32_t U_EXPORT2 
  1911 uregex_split(URegularExpression      *regexp2,
  1912              UChar                   *destBuf,
  1913              int32_t                  destCapacity,
  1914              int32_t                 *requiredCapacity,
  1915              UChar                   *destFields[],
  1916              int32_t                  destFieldsCapacity,
  1917              UErrorCode              *status) {
  1918     RegularExpression *regexp = (RegularExpression*)regexp2;
  1919     if (validateRE(regexp, TRUE, status) == FALSE) {
  1920         return 0;
  1922     if ((destBuf == NULL && destCapacity > 0) ||
  1923         destCapacity < 0 ||
  1924         destFields == NULL ||
  1925         destFieldsCapacity < 1 ) {
  1926         *status = U_ILLEGAL_ARGUMENT_ERROR;
  1927         return 0;
  1930     return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
  1934 //
  1935 //   uregex_splitUText...can just use the normal C++ method
  1936 //
  1937 U_CAPI int32_t U_EXPORT2 
  1938 uregex_splitUText(URegularExpression    *regexp2,
  1939                   UText                 *destFields[],
  1940                   int32_t                destFieldsCapacity,
  1941                   UErrorCode            *status) {
  1942     RegularExpression *regexp = (RegularExpression*)regexp2;
  1943     return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
  1947 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS

mercurial