intl/icu/source/i18n/regexst.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/regexst.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,288 @@
     1.4 +//
     1.5 +//  regexst.cpp
     1.6 +//
     1.7 +//  Copyright (C) 2004-2013, International Business Machines Corporation and others.
     1.8 +//  All Rights Reserved.
     1.9 +//
    1.10 +//  This file contains class RegexStaticSets
    1.11 +//
    1.12 +//  This class is internal to the regular expression implementation.
    1.13 +//  For the public Regular Expression API, see the file "unicode/regex.h"
    1.14 +//
    1.15 +//  RegexStaticSets groups together the common UnicodeSets that are needed
    1.16 +//   for compiling or executing RegularExpressions.  This grouping simplifies
    1.17 +//   the thread safe lazy creation and sharing of these sets across
    1.18 +//   all instances of regular expressions.
    1.19 +//
    1.20 +#include "unicode/utypes.h"
    1.21 +
    1.22 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS
    1.23 +
    1.24 +#include "unicode/unistr.h"
    1.25 +#include "unicode/uniset.h"
    1.26 +#include "unicode/uchar.h"
    1.27 +#include "unicode/regex.h"
    1.28 +#include "uprops.h"
    1.29 +#include "cmemory.h"
    1.30 +#include "cstring.h"
    1.31 +#include "uassert.h"
    1.32 +#include "ucln_in.h"
    1.33 +#include "umutex.h"
    1.34 +
    1.35 +#include "regexcst.h"   // Contains state table for the regex pattern parser.
    1.36 +                        //   generated by a Perl script.
    1.37 +#include "regexst.h"
    1.38 +
    1.39 +
    1.40 +
    1.41 +U_NAMESPACE_BEGIN
    1.42 +
    1.43 +
    1.44 +//------------------------------------------------------------------------------
    1.45 +//
    1.46 +// Unicode Set pattern strings for all of the required constant sets.
    1.47 +//               Initialized with hex values for portability to EBCDIC based machines.
    1.48 +//                Really ugly, but there's no good way to avoid it.
    1.49 +//
    1.50 +//------------------------------------------------------------------------------
    1.51 +
    1.52 +// "Rule Char" Characters are those with no special meaning, and therefore do not
    1.53 +//    need to be escaped to appear as literals in a regexp.  Expressed
    1.54 +//    as the inverse of those needing escaping --  [^\*\?\+\[\(\)\{\}\^\$\|\\\.]
    1.55 +static const UChar gRuleSet_rule_char_pattern[]       = {
    1.56 + //   [    ^      \     *     \     ?     \     +     \     [     \     (     /     )
    1.57 +    0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29,
    1.58 + //   \     {    \     }     \     ^     \     $     \     |     \     \     \     .     ]
    1.59 +    0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};
    1.60 +
    1.61 +
    1.62 +static const UChar gRuleSet_digit_char_pattern[] = {
    1.63 +//    [    0      -    9     ]
    1.64 +    0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
    1.65 +
    1.66 +//
    1.67 +//   Here are the backslash escape characters that ICU's unescape() function
    1.68 +//    will handle.
    1.69 +//
    1.70 +static const UChar gUnescapeCharPattern[] = {
    1.71 +//    [     a     c     e     f     n     r     t     u     U     x    ]
    1.72 +    0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x78, 0x5d, 0};
    1.73 +
    1.74 +
    1.75 +//
    1.76 +//  Unicode Set Definitions for Regular Expression  \w
    1.77 +//
    1.78 +static const UChar gIsWordPattern[] = {
    1.79 +//    [     \     p     {    A     l     p     h     a     b     e     t     i      c    }
    1.80 +    0x5b, 0x5c, 0x70, 0x7b, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x62, 0x65, 0x74, 0x69, 0x63, 0x7d,
    1.81 +//          \     p     {    M     }                               Mark
    1.82 +          0x5c, 0x70, 0x7b, 0x4d, 0x7d,
    1.83 +//          \     p     {    N     d     }                         Digit_Numeric
    1.84 +          0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d,
    1.85 +//          \     p     {    P     c     }                         Connector_Punctuation
    1.86 +          0x5c, 0x70, 0x7b, 0x50, 0x63, 0x7d,
    1.87 +//          \     u     2    0     0     c      \     u     2    0     0     d     ]
    1.88 +          0x5c, 0x75, 0x32, 0x30, 0x30, 0x63, 0x5c, 0x75, 0x32, 0x30, 0x30, 0x64, 0x5d, 0};
    1.89 +
    1.90 +
    1.91 +//
    1.92 +//  Unicode Set Definitions for Regular Expression  \s
    1.93 +//
    1.94 +static const UChar gIsSpacePattern[] = {
    1.95 +//        [     \     p     {     W     h     i     t     e     S     p     a     c     e     }     ]
    1.96 +        0x5b, 0x5c, 0x70, 0x7b, 0x57, 0x68, 0x69, 0x74, 0x65, 0x53, 0x70, 0x61, 0x63, 0x65, 0x7d, 0x5d, 0};
    1.97 +
    1.98 +
    1.99 +//
   1.100 +//  UnicodeSets used in implementation of Grapheme Cluster detection, \X
   1.101 +//
   1.102 +static const UChar gGC_ControlPattern[] = {
   1.103 +//    [     [     :     Z     l     :     ]     [     :     Z     p     :     ]
   1.104 +    0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d,
   1.105 +//    [     :     C     c     :     ]     [     :     C     f     :     ]     -
   1.106 +    0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x2d,
   1.107 +//    [     :     G     r     a     p     h     e     m     e     _
   1.108 +    0x5b, 0x3a, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f,
   1.109 +//    E     x     t     e     n     d     :     ]     ]
   1.110 +    0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x3a, 0x5d, 0x5d, 0};
   1.111 +
   1.112 +static const UChar gGC_ExtendPattern[] = {
   1.113 +//    [     \     p     {     G     r     a     p     h     e     m     e     _
   1.114 +    0x5b, 0x5c, 0x70, 0x7b, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f,
   1.115 +//    E     x     t     e     n     d     }     ]
   1.116 +    0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0};
   1.117 +
   1.118 +static const UChar gGC_LPattern[] = {
   1.119 +//    [     \     p     {     H     a     n     g     u     l     _     S     y     l
   1.120 +    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
   1.121 +//    l     a     b     l     e     _     T     y     p     e     =     L     }     ]
   1.122 +    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d,  0x5d, 0};
   1.123 +
   1.124 +static const UChar gGC_VPattern[] = {
   1.125 +//    [     \     p     {     H     a     n     g     u     l     _     S     y     l
   1.126 +    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
   1.127 +//    l     a     b     l     e     _     T     y     p     e     =     V     }     ]
   1.128 +    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d,  0x5d, 0};
   1.129 +
   1.130 +static const UChar gGC_TPattern[] = {
   1.131 +//    [     \     p     {     H     a     n     g     u     l     _     S     y     l
   1.132 +    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
   1.133 +//    l     a     b     l     e     _     T     y     p     e     =     T     }    ]
   1.134 +    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0};
   1.135 +
   1.136 +static const UChar gGC_LVPattern[] = {
   1.137 +//    [     \     p     {     H     a     n     g     u     l     _     S     y     l
   1.138 +    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
   1.139 +//    l     a     b     l     e     _     T     y     p     e     =     L     V     }     ]
   1.140 +    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0};
   1.141 +
   1.142 +static const UChar gGC_LVTPattern[] = {
   1.143 +//    [     \     p     {     H     a     n     g     u     l     _     S     y     l
   1.144 +    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
   1.145 +//    l     a     b     l     e     _     T     y     p     e     =     L     V     T     }     ]
   1.146 +    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0};
   1.147 +
   1.148 +
   1.149 +RegexStaticSets *RegexStaticSets::gStaticSets = NULL;
   1.150 +UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER;
   1.151 +
   1.152 +RegexStaticSets::RegexStaticSets(UErrorCode *status)
   1.153 +:
   1.154 +fUnescapeCharSet(UnicodeString(TRUE, gUnescapeCharPattern, -1), *status),
   1.155 +fRuleDigitsAlias(NULL),
   1.156 +fEmptyText(NULL)
   1.157 +{
   1.158 +    // First zero out everything
   1.159 +    int i;
   1.160 +    for (i=0; i<URX_LAST_SET; i++) {
   1.161 +        fPropSets[i] = NULL;
   1.162 +    }
   1.163 +    // Then init the sets to their correct values.
   1.164 +    fPropSets[URX_ISWORD_SET]  = new UnicodeSet(UnicodeString(TRUE, gIsWordPattern, -1),     *status);
   1.165 +    fPropSets[URX_ISSPACE_SET] = new UnicodeSet(UnicodeString(TRUE, gIsSpacePattern, -1),    *status);
   1.166 +    fPropSets[URX_GC_EXTEND]   = new UnicodeSet(UnicodeString(TRUE, gGC_ExtendPattern, -1),  *status);
   1.167 +    fPropSets[URX_GC_CONTROL]  = new UnicodeSet(UnicodeString(TRUE, gGC_ControlPattern, -1), *status);
   1.168 +    fPropSets[URX_GC_L]        = new UnicodeSet(UnicodeString(TRUE, gGC_LPattern, -1),       *status);
   1.169 +    fPropSets[URX_GC_V]        = new UnicodeSet(UnicodeString(TRUE, gGC_VPattern, -1),       *status);
   1.170 +    fPropSets[URX_GC_T]        = new UnicodeSet(UnicodeString(TRUE, gGC_TPattern, -1),       *status);
   1.171 +    fPropSets[URX_GC_LV]       = new UnicodeSet(UnicodeString(TRUE, gGC_LVPattern, -1),      *status);
   1.172 +    fPropSets[URX_GC_LVT]      = new UnicodeSet(UnicodeString(TRUE, gGC_LVTPattern, -1),     *status);
   1.173 +    
   1.174 +    // Check for null pointers
   1.175 +    if (fPropSets[URX_ISWORD_SET] == NULL || fPropSets[URX_ISSPACE_SET] == NULL || fPropSets[URX_GC_EXTEND] == NULL || 
   1.176 +        fPropSets[URX_GC_CONTROL] == NULL || fPropSets[URX_GC_L] == NULL || fPropSets[URX_GC_V] == NULL || 
   1.177 +        fPropSets[URX_GC_T] == NULL || fPropSets[URX_GC_LV] == NULL || fPropSets[URX_GC_LVT] == NULL) {
   1.178 +        goto ExitConstrDeleteAll;
   1.179 +    }
   1.180 +    if (U_FAILURE(*status)) {
   1.181 +        // Bail out if we were unable to create the above sets.
   1.182 +        // The rest of the initialization needs them, so we cannot proceed.
   1.183 +        return;
   1.184 +    }
   1.185 +
   1.186 +
   1.187 +    //
   1.188 +    // The following sets  are dynamically constructed, because their
   1.189 +    //   initialization strings would be unreasonable.
   1.190 +    //
   1.191 +
   1.192 +
   1.193 +    //
   1.194 +    //  "Normal" is the set of characters that don't need special handling
   1.195 +    //            when finding grapheme cluster boundaries.
   1.196 +    //
   1.197 +    fPropSets[URX_GC_NORMAL] = new UnicodeSet(0, UnicodeSet::MAX_VALUE);
   1.198 +    // Null pointer check
   1.199 +    if (fPropSets[URX_GC_NORMAL] == NULL) {
   1.200 +    	goto ExitConstrDeleteAll;
   1.201 +    }
   1.202 +    fPropSets[URX_GC_NORMAL]->remove(0xac00, 0xd7a4);
   1.203 +    fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_CONTROL]);
   1.204 +    fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_L]);
   1.205 +    fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_V]);
   1.206 +    fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_T]);
   1.207 +
   1.208 +    // Initialize the 8-bit fast bit sets from the parallel full
   1.209 +    //   UnicodeSets.
   1.210 +    for (i=0; i<URX_LAST_SET; i++) {
   1.211 +        if (fPropSets[i]) {
   1.212 +            fPropSets[i]->compact();
   1.213 +            fPropSets8[i].init(fPropSets[i]);
   1.214 +        }
   1.215 +    }
   1.216 +
   1.217 +    // Sets used while parsing rules, but not referenced from the parse state table
   1.218 +    fRuleSets[kRuleSet_rule_char-128]   = UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1),   *status);
   1.219 +    fRuleSets[kRuleSet_digit_char-128]  = UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1),  *status);
   1.220 +    fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];
   1.221 +    for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
   1.222 +        fRuleSets[i].compact();
   1.223 +    }
   1.224 +    
   1.225 +    // Finally, initialize an empty string for utility purposes
   1.226 +    fEmptyText = utext_openUChars(NULL, NULL, 0, status);
   1.227 +    
   1.228 +    return; // If we reached this point, everything is fine so just exit
   1.229 +
   1.230 +ExitConstrDeleteAll: // Remove fPropSets and fRuleSets and return error
   1.231 +    for (i=0; i<URX_LAST_SET; i++) {
   1.232 +        delete fPropSets[i];
   1.233 +        fPropSets[i] = NULL;
   1.234 +    }
   1.235 +    *status = U_MEMORY_ALLOCATION_ERROR;
   1.236 +}
   1.237 +
   1.238 +
   1.239 +RegexStaticSets::~RegexStaticSets() {
   1.240 +    int32_t i;
   1.241 +
   1.242 +    for (i=0; i<URX_LAST_SET; i++) {
   1.243 +        delete fPropSets[i];
   1.244 +        fPropSets[i] = NULL;
   1.245 +    }
   1.246 +    fRuleDigitsAlias = NULL;
   1.247 +    
   1.248 +    utext_close(fEmptyText);
   1.249 +}
   1.250 +
   1.251 +
   1.252 +//------------------------------------------------------------------------------
   1.253 +//
   1.254 +//   regex_cleanup      Memory cleanup function, free/delete all
   1.255 +//                      cached memory.  Called by ICU's u_cleanup() function.
   1.256 +//
   1.257 +//------------------------------------------------------------------------------
   1.258 +UBool
   1.259 +RegexStaticSets::cleanup(void) {
   1.260 +    delete RegexStaticSets::gStaticSets;
   1.261 +    RegexStaticSets::gStaticSets = NULL;
   1.262 +    gStaticSetsInitOnce.reset();
   1.263 +    return TRUE;
   1.264 +}
   1.265 +
   1.266 +U_CDECL_BEGIN
   1.267 +static UBool U_CALLCONV
   1.268 +regex_cleanup(void) {
   1.269 +    return RegexStaticSets::cleanup();
   1.270 +}
   1.271 +
   1.272 +static void U_CALLCONV initStaticSets(UErrorCode &status) {
   1.273 +    U_ASSERT(RegexStaticSets::gStaticSets == NULL);
   1.274 +    ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);
   1.275 +    RegexStaticSets::gStaticSets = new RegexStaticSets(&status);
   1.276 +    if (U_FAILURE(status)) {
   1.277 +        delete RegexStaticSets::gStaticSets;
   1.278 +        RegexStaticSets::gStaticSets = NULL;
   1.279 +    }
   1.280 +    if (RegexStaticSets::gStaticSets == NULL && U_SUCCESS(status)) {
   1.281 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.282 +    }
   1.283 +}
   1.284 +U_CDECL_END
   1.285 +
   1.286 +void RegexStaticSets::initGlobals(UErrorCode *status) {
   1.287 +    umtx_initOnce(gStaticSetsInitOnce, &initStaticSets, *status);
   1.288 +}
   1.289 +
   1.290 +U_NAMESPACE_END
   1.291 +#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS

mercurial