Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * Copyright (C) 2011, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ******************************************************************************* |
michael@0 | 6 | * file name: patternprops.cpp |
michael@0 | 7 | * encoding: US-ASCII |
michael@0 | 8 | * tab size: 8 (not used) |
michael@0 | 9 | * indentation:4 |
michael@0 | 10 | * |
michael@0 | 11 | * created on: 2011mar13 |
michael@0 | 12 | * created by: Markus W. Scherer |
michael@0 | 13 | */ |
michael@0 | 14 | |
michael@0 | 15 | #include "unicode/utypes.h" |
michael@0 | 16 | #include "patternprops.h" |
michael@0 | 17 | |
michael@0 | 18 | U_NAMESPACE_BEGIN |
michael@0 | 19 | |
/*
 * Pattern property flags for U+0000..U+00FF, one byte per code point.
 *   bit 0 (value 1): set when either Pattern property is true
 *   bit 1 (value 2): set when Pattern_Syntax is true
 *   bit 2 (value 4): set when Pattern_White_Space is true
 * Consequently a Pattern_Syntax character is stored as 3 (bits 0 and 1),
 * a Pattern_White_Space character as 5 (bits 0 and 2), anything else as 0.
 */
static const uint8_t latin1[256]={
    // 00..0F  (white space: 09..0D)
    0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,
    // 10..1F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 20..2F  (white space: 20; syntax: 21..2F)
    5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // 30..3F  (syntax: 3A..3F)
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,
    // 40..4F  (syntax: 40)
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 50..5F  (syntax: 5B..5E)
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    // 60..6F  (syntax: 60)
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 70..7F  (syntax: 7B..7E)
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    // 80..8F  (white space: 85)
    0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // A0..AF  (syntax: A1..A7, A9, AB, AC, AE)
    0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,
    // B0..BF  (syntax: B0, B1, B6, BB, BF)
    3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,
    // C0..CF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // D0..DF  (syntax: D7)
    0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
    // E0..EF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // F0..FF  (syntax: F7)
    0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0
};
michael@0 | 56 | |
/*
 * Block index for U+2000..U+303F: one byte for every run of 32 code points
 * (130 runs in total). Each byte selects one 32-bit word in a small table
 * of bit sets. Index 0 selects the all-zeros word and index 1 the
 * all-ones word; larger indexes select words with mixed bits.
 */
static const uint8_t index2000[130]={
    // U+2000..U+20FF
    2, 3, 4, 0, 0, 0, 0, 0,
    // U+2100..U+21FF
    0, 0, 0, 0, 5, 1, 1, 1,
    // U+2200..U+22FF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2300..U+23FF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2400..U+24FF
    1, 1, 1, 0, 0, 0, 0, 0,
    // U+2500..U+25FF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2600..U+26FF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2700..U+27FF
    1, 1, 1, 6, 7, 1, 1, 1,
    // U+2800..U+28FF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2900..U+29FF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2A00..U+2AFF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2B00..U+2BFF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2C00..U+2CFF
    0, 0, 0, 0, 0, 0, 0, 0,
    // U+2D00..U+2DFF
    0, 0, 0, 0, 0, 0, 0, 0,
    // U+2E00..U+2EFF
    1, 1, 1, 1, 0, 0, 0, 0,
    // U+2F00..U+2FFF
    0, 0, 0, 0, 0, 0, 0, 0,
    // U+3000..U+303F
    8, 9
};
michael@0 | 81 | |
/*
 * Bit sets for Pattern_Syntax, one 32-bit word per run of 32 code points;
 * bit n of a word corresponds to the n-th code point of the run.
 * Word 0 is shared by every all-false run and word 1 by every all-true run;
 * the remaining words carry the mixed bit patterns listed below.
 */
static const uint32_t syntax2000[]={
    0x00000000,  // 0: no code point in the run
    0xFFFFFFFF,  // 1: every code point in the run
    0xFFFF0000,  // 2: 2010..201F
    0x7FFF00FF,  // 3: 2020..2027, 2030..203E
    0x7FEFFFFE,  // 4: 2041..2053, 2055..205E
    0xFFFF0000,  // 5: 2190..219F
    0x003FFFFF,  // 6: 2760..2775
    0xFFF00000,  // 7: 2794..279F
    0xFFFFFF0E,  // 8: 3001..3003, 3008..301F
    0x00010001   // 9: 3020, 3030
};
michael@0 | 98 | |
/*
 * Bit sets for "Pattern_Syntax or Pattern_White_Space", laid out like the
 * Pattern_Syntax words (one 32-bit word per run of 32 code points, word 0
 * all-false, word 1 all-true). Words 2 and 3 additionally have the bits for
 * the Pattern_White_Space characters 200E, 200F, 2028 and 2029 set.
 */
static const uint32_t syntaxOrWhiteSpace2000[]={
    0x00000000,  // 0: no code point in the run
    0xFFFFFFFF,  // 1: every code point in the run
    0xFFFFC000,  // 2: 200E..201F
    0x7FFF03FF,  // 3: 2020..2029, 2030..203E
    0x7FEFFFFE,  // 4: 2041..2053, 2055..205E
    0xFFFF0000,  // 5: 2190..219F
    0x003FFFFF,  // 6: 2760..2775
    0xFFF00000,  // 7: 2794..279F
    0xFFFFFF0E,  // 8: 3001..3003, 3008..301F
    0x00010001   // 9: 3020, 3030
};
michael@0 | 115 | |
michael@0 | 116 | UBool |
michael@0 | 117 | PatternProps::isSyntax(UChar32 c) { |
michael@0 | 118 | if(c<0) { |
michael@0 | 119 | return FALSE; |
michael@0 | 120 | } else if(c<=0xff) { |
michael@0 | 121 | return (UBool)(latin1[c]>>1)&1; |
michael@0 | 122 | } else if(c<0x2010) { |
michael@0 | 123 | return FALSE; |
michael@0 | 124 | } else if(c<=0x3030) { |
michael@0 | 125 | uint32_t bits=syntax2000[index2000[(c-0x2000)>>5]]; |
michael@0 | 126 | return (UBool)((bits>>(c&0x1f))&1); |
michael@0 | 127 | } else if(0xfd3e<=c && c<=0xfe46) { |
michael@0 | 128 | return c<=0xfd3f || 0xfe45<=c; |
michael@0 | 129 | } else { |
michael@0 | 130 | return FALSE; |
michael@0 | 131 | } |
michael@0 | 132 | } |
michael@0 | 133 | |
michael@0 | 134 | UBool |
michael@0 | 135 | PatternProps::isSyntaxOrWhiteSpace(UChar32 c) { |
michael@0 | 136 | if(c<0) { |
michael@0 | 137 | return FALSE; |
michael@0 | 138 | } else if(c<=0xff) { |
michael@0 | 139 | return (UBool)(latin1[c]&1); |
michael@0 | 140 | } else if(c<0x200e) { |
michael@0 | 141 | return FALSE; |
michael@0 | 142 | } else if(c<=0x3030) { |
michael@0 | 143 | uint32_t bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]]; |
michael@0 | 144 | return (UBool)((bits>>(c&0x1f))&1); |
michael@0 | 145 | } else if(0xfd3e<=c && c<=0xfe46) { |
michael@0 | 146 | return c<=0xfd3f || 0xfe45<=c; |
michael@0 | 147 | } else { |
michael@0 | 148 | return FALSE; |
michael@0 | 149 | } |
michael@0 | 150 | } |
michael@0 | 151 | |
michael@0 | 152 | UBool |
michael@0 | 153 | PatternProps::isWhiteSpace(UChar32 c) { |
michael@0 | 154 | if(c<0) { |
michael@0 | 155 | return FALSE; |
michael@0 | 156 | } else if(c<=0xff) { |
michael@0 | 157 | return (UBool)(latin1[c]>>2)&1; |
michael@0 | 158 | } else if(0x200e<=c && c<=0x2029) { |
michael@0 | 159 | return c<=0x200f || 0x2028<=c; |
michael@0 | 160 | } else { |
michael@0 | 161 | return FALSE; |
michael@0 | 162 | } |
michael@0 | 163 | } |
michael@0 | 164 | |
michael@0 | 165 | const UChar * |
michael@0 | 166 | PatternProps::skipWhiteSpace(const UChar *s, int32_t length) { |
michael@0 | 167 | while(length>0 && isWhiteSpace(*s)) { |
michael@0 | 168 | ++s; |
michael@0 | 169 | --length; |
michael@0 | 170 | } |
michael@0 | 171 | return s; |
michael@0 | 172 | } |
michael@0 | 173 | |
michael@0 | 174 | const UChar * |
michael@0 | 175 | PatternProps::trimWhiteSpace(const UChar *s, int32_t &length) { |
michael@0 | 176 | if(length<=0 || (!isWhiteSpace(s[0]) && !isWhiteSpace(s[length-1]))) { |
michael@0 | 177 | return s; |
michael@0 | 178 | } |
michael@0 | 179 | int32_t start=0; |
michael@0 | 180 | int32_t limit=length; |
michael@0 | 181 | while(start<limit && isWhiteSpace(s[start])) { |
michael@0 | 182 | ++start; |
michael@0 | 183 | } |
michael@0 | 184 | if(start<limit) { |
michael@0 | 185 | // There is non-white space at start; we will not move limit below that, |
michael@0 | 186 | // so we need not test start<limit in the loop. |
michael@0 | 187 | while(isWhiteSpace(s[limit-1])) { |
michael@0 | 188 | --limit; |
michael@0 | 189 | } |
michael@0 | 190 | } |
michael@0 | 191 | length=limit-start; |
michael@0 | 192 | return s+start; |
michael@0 | 193 | } |
michael@0 | 194 | |
michael@0 | 195 | UBool |
michael@0 | 196 | PatternProps::isIdentifier(const UChar *s, int32_t length) { |
michael@0 | 197 | if(length<=0) { |
michael@0 | 198 | return FALSE; |
michael@0 | 199 | } |
michael@0 | 200 | const UChar *limit=s+length; |
michael@0 | 201 | do { |
michael@0 | 202 | if(isSyntaxOrWhiteSpace(*s++)) { |
michael@0 | 203 | return FALSE; |
michael@0 | 204 | } |
michael@0 | 205 | } while(s<limit); |
michael@0 | 206 | return TRUE; |
michael@0 | 207 | } |
michael@0 | 208 | |
michael@0 | 209 | const UChar * |
michael@0 | 210 | PatternProps::skipIdentifier(const UChar *s, int32_t length) { |
michael@0 | 211 | while(length>0 && !isSyntaxOrWhiteSpace(*s)) { |
michael@0 | 212 | ++s; |
michael@0 | 213 | --length; |
michael@0 | 214 | } |
michael@0 | 215 | return s; |
michael@0 | 216 | } |
michael@0 | 217 | |
michael@0 | 218 | U_NAMESPACE_END |