intl/icu/source/common/patternprops.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/patternprops.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,218 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*   Copyright (C) 2011, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +*******************************************************************************
     1.9 +*   file name:  patternprops.cpp
    1.10 +*   encoding:   US-ASCII
    1.11 +*   tab size:   8 (not used)
    1.12 +*   indentation:4
    1.13 +*
    1.14 +*   created on: 2011mar13
    1.15 +*   created by: Markus W. Scherer
    1.16 +*/
    1.17 +
    1.18 +#include "unicode/utypes.h"
    1.19 +#include "patternprops.h"
    1.20 +
    1.21 +U_NAMESPACE_BEGIN
    1.22 +
    1.23 +/*
    1.24 + * One byte per Latin-1 character.
    1.25 + * Bit 0 is set if either Pattern property is true,
    1.26 + * bit 1 if Pattern_Syntax is true,
    1.27 + * bit 2 if Pattern_White_Space is true.
    1.28 + * That is, Pattern_Syntax is encoded as 3 and Pattern_White_Space as 5.
    1.29 + */
    1.30 +static const uint8_t latin1[256]={
    1.31 +    // WS: 9..D
    1.32 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,
    1.33 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.34 +    // WS: 20  Syntax: 21..2F
    1.35 +    5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    1.36 +    // Syntax: 3A..40
    1.37 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,
    1.38 +    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.39 +    // Syntax: 5B..5E
    1.40 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    1.41 +    // Syntax: 60
    1.42 +    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.43 +    // Syntax: 7B..7E
    1.44 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    1.45 +    // WS: 85
    1.46 +    0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.47 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.48 +    // Syntax: A1..A7, A9, AB, AC, AE
    1.49 +    0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,
    1.50 +    // Syntax: B0, B1, B6, BB, BF
    1.51 +    3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,
    1.52 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.53 +    // Syntax: D7
    1.54 +    0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
    1.55 +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1.56 +    // Syntax: F7
    1.57 +    0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0
    1.58 +};
    1.59 +
    1.60 +/*
    1.61 + * One byte per 32 characters from U+2000..U+303F indexing into
    1.62 + * a small table of 32-bit data words.
    1.63 + * The first two data words are all-zeros and all-ones.
    1.64 + */
    1.65 +static const uint8_t index2000[130]={
    1.66 +    2, 3, 4, 0, 0, 0, 0, 0,  // 20xx
    1.67 +    0, 0, 0, 0, 5, 1, 1, 1,  // 21xx
    1.68 +    1, 1, 1, 1, 1, 1, 1, 1,  // 22xx
    1.69 +    1, 1, 1, 1, 1, 1, 1, 1,  // 23xx
    1.70 +    1, 1, 1, 0, 0, 0, 0, 0,  // 24xx
    1.71 +    1, 1, 1, 1, 1, 1, 1, 1,  // 25xx
    1.72 +    1, 1, 1, 1, 1, 1, 1, 1,  // 26xx
    1.73 +    1, 1, 1, 6, 7, 1, 1, 1,  // 27xx
    1.74 +    1, 1, 1, 1, 1, 1, 1, 1,  // 28xx
    1.75 +    1, 1, 1, 1, 1, 1, 1, 1,  // 29xx
    1.76 +    1, 1, 1, 1, 1, 1, 1, 1,  // 2Axx
    1.77 +    1, 1, 1, 1, 1, 1, 1, 1,  // 2Bxx
    1.78 +    0, 0, 0, 0, 0, 0, 0, 0,  // 2Cxx
    1.79 +    0, 0, 0, 0, 0, 0, 0, 0,  // 2Dxx
    1.80 +    1, 1, 1, 1, 0, 0, 0, 0,  // 2Exx
    1.81 +    0, 0, 0, 0, 0, 0, 0, 0,  // 2Fxx
    1.82 +    8, 9  // 3000..303F
    1.83 +};
    1.84 +
    1.85 +/*
    1.86 + * One 32-bit integer per 32 characters. Ranges of all-false and all-true
    1.87 + * are mapped to the first two values, other ranges map to appropriate bit patterns.
    1.88 + */
    1.89 +static const uint32_t syntax2000[]={
    1.90 +    0,
    1.91 +    0xffffffff,
    1.92 +    0xffff0000,  // 2: 2010..201F
    1.93 +    0x7fff00ff,  // 3: 2020..2027, 2030..203E
    1.94 +    0x7feffffe,  // 4: 2041..2053, 2055..205E
    1.95 +    0xffff0000,  // 5: 2190..219F
    1.96 +    0x003fffff,  // 6: 2760..2775
    1.97 +    0xfff00000,  // 7: 2794..279F
    1.98 +    0xffffff0e,  // 8: 3001..3003, 3008..301F
    1.99 +    0x00010001   // 9: 3020, 3030
   1.100 +};
   1.101 +
   1.102 +/*
   1.103 + * Same as syntax2000, but with additional bits set for the
   1.104 + * Pattern_White_Space characters 200E 200F 2028 2029.
   1.105 + */
   1.106 +static const uint32_t syntaxOrWhiteSpace2000[]={
   1.107 +    0,
   1.108 +    0xffffffff,
   1.109 +    0xffffc000,  // 2: 200E..201F
   1.110 +    0x7fff03ff,  // 3: 2020..2029, 2030..203E
   1.111 +    0x7feffffe,  // 4: 2041..2053, 2055..205E
   1.112 +    0xffff0000,  // 5: 2190..219F
   1.113 +    0x003fffff,  // 6: 2760..2775
   1.114 +    0xfff00000,  // 7: 2794..279F
   1.115 +    0xffffff0e,  // 8: 3001..3003, 3008..301F
   1.116 +    0x00010001   // 9: 3020, 3030
   1.117 +};
   1.118 +
   1.119 +UBool
   1.120 +PatternProps::isSyntax(UChar32 c) {
   1.121 +    if(c<0) {
   1.122 +        return FALSE;
   1.123 +    } else if(c<=0xff) {
   1.124 +        return (UBool)(latin1[c]>>1)&1;
   1.125 +    } else if(c<0x2010) {
   1.126 +        return FALSE;
   1.127 +    } else if(c<=0x3030) {
   1.128 +        uint32_t bits=syntax2000[index2000[(c-0x2000)>>5]];
   1.129 +        return (UBool)((bits>>(c&0x1f))&1);
   1.130 +    } else if(0xfd3e<=c && c<=0xfe46) {
   1.131 +        return c<=0xfd3f || 0xfe45<=c;
   1.132 +    } else {
   1.133 +        return FALSE;
   1.134 +    }
   1.135 +}
   1.136 +
   1.137 +UBool
   1.138 +PatternProps::isSyntaxOrWhiteSpace(UChar32 c) {
   1.139 +    if(c<0) {
   1.140 +        return FALSE;
   1.141 +    } else if(c<=0xff) {
   1.142 +        return (UBool)(latin1[c]&1);
   1.143 +    } else if(c<0x200e) {
   1.144 +        return FALSE;
   1.145 +    } else if(c<=0x3030) {
   1.146 +        uint32_t bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]];
   1.147 +        return (UBool)((bits>>(c&0x1f))&1);
   1.148 +    } else if(0xfd3e<=c && c<=0xfe46) {
   1.149 +        return c<=0xfd3f || 0xfe45<=c;
   1.150 +    } else {
   1.151 +        return FALSE;
   1.152 +    }
   1.153 +}
   1.154 +
   1.155 +UBool
   1.156 +PatternProps::isWhiteSpace(UChar32 c) {
   1.157 +    if(c<0) {
   1.158 +        return FALSE;
   1.159 +    } else if(c<=0xff) {
   1.160 +        return (UBool)(latin1[c]>>2)&1;
   1.161 +    } else if(0x200e<=c && c<=0x2029) {
   1.162 +        return c<=0x200f || 0x2028<=c;
   1.163 +    } else {
   1.164 +        return FALSE;
   1.165 +    }
   1.166 +}
   1.167 +
   1.168 +const UChar *
   1.169 +PatternProps::skipWhiteSpace(const UChar *s, int32_t length) {
   1.170 +    while(length>0 && isWhiteSpace(*s)) {
   1.171 +        ++s;
   1.172 +        --length;
   1.173 +    }
   1.174 +    return s;
   1.175 +}
   1.176 +
   1.177 +const UChar *
   1.178 +PatternProps::trimWhiteSpace(const UChar *s, int32_t &length) {
   1.179 +    if(length<=0 || (!isWhiteSpace(s[0]) && !isWhiteSpace(s[length-1]))) {
   1.180 +        return s;
   1.181 +    }
   1.182 +    int32_t start=0;
   1.183 +    int32_t limit=length;
   1.184 +    while(start<limit && isWhiteSpace(s[start])) {
   1.185 +        ++start;
   1.186 +    }
   1.187 +    if(start<limit) {
   1.188 +        // There is non-white space at start; we will not move limit below that,
   1.189 +        // so we need not test start<limit in the loop.
   1.190 +        while(isWhiteSpace(s[limit-1])) {
   1.191 +            --limit;
   1.192 +        }
   1.193 +    }
   1.194 +    length=limit-start;
   1.195 +    return s+start;
   1.196 +}
   1.197 +
   1.198 +UBool
   1.199 +PatternProps::isIdentifier(const UChar *s, int32_t length) {
   1.200 +    if(length<=0) {
   1.201 +        return FALSE;
   1.202 +    }
   1.203 +    const UChar *limit=s+length;
   1.204 +    do {
   1.205 +        if(isSyntaxOrWhiteSpace(*s++)) {
   1.206 +            return FALSE;
   1.207 +        }
   1.208 +    } while(s<limit);
   1.209 +    return TRUE;
   1.210 +}
   1.211 +
   1.212 +const UChar *
   1.213 +PatternProps::skipIdentifier(const UChar *s, int32_t length) {
   1.214 +    while(length>0 && !isSyntaxOrWhiteSpace(*s)) {
   1.215 +        ++s;
   1.216 +        --length;
   1.217 +    }
   1.218 +    return s;
   1.219 +}
   1.220 +
   1.221 +U_NAMESPACE_END

mercurial