Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | // |
michael@0 | 2 | // Copyright (C) 2002-2012 International Business Machines Corporation |
michael@0 | 3 | // and others. All rights reserved. |
michael@0 | 4 | // |
michael@0 | 5 | // file: regeximp.h |
michael@0 | 6 | // |
michael@0 | 7 | // ICU Regular Expressions, |
michael@0 | 8 | // Definitions of constant values used in the compiled form of |
michael@0 | 9 | // a regular expression pattern. |
michael@0 | 10 | // |
michael@0 | 11 | |
michael@0 | 12 | #ifndef _REGEXIMP_H |
michael@0 | 13 | #define _REGEXIMP_H |
michael@0 | 14 | |
michael@0 | 15 | #include "unicode/utypes.h" |
michael@0 | 16 | #include "unicode/uobject.h" |
michael@0 | 17 | #include "unicode/uniset.h" |
michael@0 | 18 | #include "unicode/utext.h" |
michael@0 | 19 | |
michael@0 | 20 | #include "cmemory.h" |
michael@0 | 21 | #include "ucase.h" |
michael@0 | 22 | |
michael@0 | 23 | U_NAMESPACE_BEGIN |
michael@0 | 24 | |
// Debug tracing support.
//
// For debugging, define REGEX_DEBUG.
// To define with configure,
//    ./runConfigureICU --enable-debug --disable-release Linux CPPFLAGS="-DREGEX_DEBUG"

#ifdef REGEX_DEBUG
//
//  Debugging options.  Enable one or more of the three #defines immediately following.
//

//#define REGEX_SCAN_DEBUG
#define REGEX_DUMP_DEBUG
#define REGEX_RUN_DEBUG

//  End of #defines intended to be directly set.

// NOTE(review): this #include is reached after U_NAMESPACE_BEGIN, so unless
// <stdio.h> was already included earlier in the translation unit its
// declarations are pulled into the ICU namespace.  Left in place here;
// consider hoisting it above U_NAMESPACE_BEGIN.
#include <stdio.h>
#endif

//
//  Trace macros.  Each takes ONE argument: a complete, parenthesized printf
//  argument list, e.g.
//        REGEX_RUN_DEBUG_PRINTF(("pos=%d\n", pos));
//  When the matching option is off the macro expands to nothing, so the
//  arguments are not evaluated.
//

#ifdef REGEX_SCAN_DEBUG
#define REGEX_SCAN_DEBUG_PRINTF(a) printf a
#else
#define REGEX_SCAN_DEBUG_PRINTF(a)
#endif

// Dump output is wanted when either pattern dumping or run tracing is on.
// Deciding that in a single place gives the macro exactly one definition;
// defining it once per option redefines it incompatibly when
// REGEX_RUN_DEBUG is on but REGEX_DUMP_DEBUG is off.
#if defined(REGEX_DUMP_DEBUG) || defined(REGEX_RUN_DEBUG)
#define REGEX_DUMP_DEBUG_PRINTF(a) printf a
#else
#define REGEX_DUMP_DEBUG_PRINTF(a)
#endif

#ifdef REGEX_RUN_DEBUG
#define REGEX_RUN_DEBUG_PRINTF(a) printf a
#else
#define REGEX_RUN_DEBUG_PRINTF(a)
#endif
michael@0 | 61 | |
michael@0 | 62 | |
//
//  Opcode types.  Every entry in the compiled form of a regexp carries one of
//                 these values as its type, or opcode.
//
//  The enumerators are listed in numeric order; URX_OPCODE_NAMES has to be
//  kept in step with this list.
//
enum {
    URX_RESERVED_OP   = 0,    // For multi-operand ops, most non-first words.
    URX_BACKTRACK     = 1,    // Force a backtrack, as if a match test had failed.
    URX_END           = 2,
    URX_ONECHAR       = 3,    // Value field: the 21 bit unicode char to match.
    URX_STRING        = 4,    // Value field: index of string start.
    URX_STRING_LEN    = 5,    // Value field: string length (code units).
    URX_STATE_SAVE    = 6,    // Value field: pattern position to push.
    URX_NOP           = 7,
    URX_START_CAPTURE = 8,    // Value field: capture group number.
    URX_END_CAPTURE   = 9,    // Value field: capture group number.
    URX_STATIC_SETREF = 10,   // Value field: index of set in array of sets.
    URX_SETREF        = 11,   // Value field: index of set in array of sets.
    URX_DOTANY        = 12,
    URX_JMP           = 13,   // Value field: destination position in the pattern.
    URX_FAIL          = 14,   // Stop match operation,  No match.

    URX_JMP_SAV       = 15,   // Operand: JMP destination location.
    URX_BACKSLASH_B   = 16,   // Value field:  0: \b    1: \B
    URX_BACKSLASH_G   = 17,
    URX_JMP_SAV_X     = 18,   // Conditional JMP_SAV.
                              //   Used in (x)+, breaks loop on zero length match.
                              //   Operand: JMP destination.
    URX_BACKSLASH_X   = 19,
    URX_BACKSLASH_Z   = 20,   // \z   Unconditional end of line.

    URX_DOTANY_ALL    = 21,   // '.' in the ". matches any" mode.
    URX_BACKSLASH_D   = 22,   // Value field:  0: \d    1: \D
    URX_CARET         = 23,   // Value field:  1: multi-line mode.
    URX_DOLLAR        = 24,   // Also for \Z

    URX_CTR_INIT      = 25,   // Counter inits for {interval} loops.
    URX_CTR_INIT_NG   = 26,   //   Two kinds, normal and non-greedy.
                              //   These are 4 word opcodes.  See description.
                              //     First Operand:  Data loc of counter variable.
                              //     2nd   Operand:  Pat loc of the URX_CTR_LOOPx
                              //                     at the end of the loop.
                              //     3rd   Operand:  Minimum count.
                              //     4th   Operand:  Max count, -1 for unbounded.

    URX_DOTANY_UNIX   = 27,   // '.' operator in UNIX_LINES mode, only \n marks end of line.

    URX_CTR_LOOP      = 28,   // Loop ops for {interval} loops,
    URX_CTR_LOOP_NG   = 29,   //   normal and non-greedy (_NG), paired with the CTR_INIT ops.
                              //   Operand is loc of corresponding CTR_INIT.

    URX_CARET_M_UNIX  = 30,   // '^' operator, test for start of line in multi-line
                              //   plus UNIX_LINES mode.

    URX_RELOC_OPRND   = 31,   // Operand value in multi-operand ops that refers
                              //   back into compiled pattern code, and thus must
                              //   be relocated when inserting/deleting ops in code.

    URX_STO_SP        = 32,   // Store the stack ptr.  Operand is location within
                              //   matcher data (not stack data) to store it.
    URX_LD_SP         = 33,   // Load the stack pointer.  Operand is location
                              //   to load from.
    URX_BACKREF       = 34,   // Back Reference.  Parameter is the index of the
                              //   capture group variables in the state stack frame.
    URX_STO_INP_LOC   = 35,   // Store the input location.  Operand is location
                              //   within the matcher stack frame.
    URX_JMPX          = 36,   // Conditional JMP.
                              //   First Operand:   JMP target location.
                              //   Second Operand:  Data location containing an
                              //     input position.  If current input position ==
                              //     saved input position, FAIL rather than taking
                              //     the JMP.
    URX_LA_START      = 37,   // Starting a LookAround expression.
                              //   Save InputPos and SP in static data.
                              //   Operand:  Static data offset for the save.
    URX_LA_END        = 38,   // Ending a LookAround expression.
                              //   Restore InputPos and Stack to saved values.
                              //   Operand:  Static data offset for saved data.
    URX_ONECHAR_I     = 39,   // Test for case-insensitive match of a literal character.
                              //   Operand:  the literal char.
    URX_STRING_I      = 40,   // Case insensitive string compare.
                              //   First Operand:  Index of start of string in string literals.
                              //   Second Operand (next word in compiled code):
                              //     the length of the string.
    URX_BACKREF_I     = 41,   // Case insensitive back reference.
                              //   Parameter is the index of the
                              //   capture group variables in the state stack frame.
    URX_DOLLAR_M      = 42,   // $ in multi-line mode.
    URX_CARET_M       = 43,   // ^ in multi-line mode.
    URX_LB_START      = 44,   // LookBehind Start.
                              //   Parameter is data location.
    URX_LB_CONT       = 45,   // LookBehind Continue.
                              //   Param 0:  the data location
                              //   Param 1:  The minimum length of the look-behind match
                              //   Param 2:  The max length of the look-behind match
    URX_LB_END        = 46,   // LookBehind End.
                              //   Parameter is the data location.
                              //   Check that match ended at the right spot,
                              //   Restore original input string len.
    URX_LBN_CONT      = 47,   // Negative LookBehind Continue.
                              //   Param 0:  the data location
                              //   Param 1:  The minimum length of the look-behind match
                              //   Param 2:  The max length of the look-behind match
                              //   Param 3:  The pattern loc following the look-behind block.
    URX_LBN_END       = 48,   // Negative LookBehind end.
                              //   Parameter is the data location.
                              //   Check that the match ended at the right spot.
    URX_STAT_SETREF_N = 49,   // Reference to a prebuilt set (e.g. \w), negated.
                              //   Operand is index of set in array of sets.
    URX_LOOP_SR_I     = 50,   // Init a [set]* loop.
                              //   Operand is the sets index in array of user sets.
    URX_LOOP_C        = 51,   // Continue a [set]* or OneChar* loop.
                              //   Operand is a matcher static data location.
                              //   Must always immediately follow LOOP_x_I instruction.
    URX_LOOP_DOT_I    = 52,   // .*, initialization of the optimized loop.
                              //   Operand value:
                              //     bit 0:
                              //        0:  Normal (. doesn't match new-line) mode.
                              //        1:  . matches new-line mode.
                              //     bit 1:  controls what new-lines are recognized by this operation.
                              //        0:  All Unicode New-lines
                              //        1:  UNIX_LINES, \u000a only.
    URX_BACKSLASH_BU  = 53,   // \b or \B in UREGEX_UWORD mode, using Unicode style
                              //   word boundaries.
    URX_DOLLAR_D      = 54,   // $ end of input test, in UNIX_LINES mode.
    URX_DOLLAR_MD     = 55,   // $ end of input test, in MULTI_LINE and UNIX_LINES mode.

    URX_RESERVED_OP_N = 255   // For multi-operand ops, negative operand values.
};
michael@0 | 192 | |
//  Printable opcode names, for debug printing only.
//     Expands to a comma separated list of string literals, suitable for
//     initializing an array.  The entry at index i names the opcode whose
//     value is i, so this list must be kept in sync with the opcode enum.
//     There is no entry for URX_RESERVED_OP_N (255).
//     The /* n */ markers give the opcode value of the first name on each row.
#define URX_OPCODE_NAMES                                                              \
    /*  0 */ " ",             "BACKTRACK",      "END",                "ONECHAR",      \
    /*  4 */ "STRING",        "STRING_LEN",     "STATE_SAVE",         "NOP",          \
    /*  8 */ "START_CAPTURE", "END_CAPTURE",    "URX_STATIC_SETREF",  "SETREF",       \
    /* 12 */ "DOTANY",        "JMP",            "FAIL",               "JMP_SAV",      \
    /* 16 */ "BACKSLASH_B",   "BACKSLASH_G",    "JMP_SAV_X",          "BACKSLASH_X",  \
    /* 20 */ "BACKSLASH_Z",   "DOTANY_ALL",     "BACKSLASH_D",        "CARET",        \
    /* 24 */ "DOLLAR",        "CTR_INIT",       "CTR_INIT_NG",        "DOTANY_UNIX",  \
    /* 28 */ "CTR_LOOP",      "CTR_LOOP_NG",    "URX_CARET_M_UNIX",   "RELOC_OPRND",  \
    /* 32 */ "STO_SP",        "LD_SP",          "BACKREF",            "STO_INP_LOC",  \
    /* 36 */ "JMPX",          "LA_START",       "LA_END",             "ONECHAR_I",    \
    /* 40 */ "STRING_I",      "BACKREF_I",      "DOLLAR_M",           "CARET_M",      \
    /* 44 */ "LB_START",      "LB_CONT",        "LB_END",             "LBN_CONT",     \
    /* 48 */ "LBN_END",       "STAT_SETREF_N",  "LOOP_SR_I",          "LOOP_C",       \
    /* 52 */ "LOOP_DOT_I",    "BACKSLASH_BU",   "DOLLAR_D",           "DOLLAR_MD"
michael@0 | 252 | |
michael@0 | 253 | |
//
//  Convenience macros for assembling and disassembling a compiled operation.
//
//     A compiled op is a 32 bit word: the opcode type occupies the top 8 bits,
//     the value the low 24 bits.
//
//     URX_BUILD(type, val)  Combine an opcode type and a value into one op word.
//                           The shift is done on an unsigned value so that types
//                           with the top bit set (URX_RESERVED_OP_N == 255) do not
//                           shift into the sign bit of a signed int, and both
//                           arguments are parenthesized so that expressions such
//                           as (a | b) can be passed.  val is not masked; callers
//                           must supply a value that fits in 24 bits.
//     URX_TYPE(x)           Extract the opcode type (0..255) from an op word.
//     URX_VAL(x)            Extract the low 24 bit value from an op word.
//
#define URX_BUILD(type, val) ((int32_t)(((uint32_t)(type) << 24) | (uint32_t)(val)))
#define URX_TYPE(x)          ((uint32_t)(x) >> 24)
#define URX_VAL(x)           ((x) & 0xffffff)
michael@0 | 260 | |
michael@0 | 261 | |
//
//  Access to Unicode Sets composite character properties.
//     The match engine refers to these sets for things like \w.
//     Only the first four ids and the flag bit have explicit values; the
//     grapheme cluster ids and URX_LAST_SET are numbered consecutively
//     after URX_ISSPACE_SET.
//
enum {
    URX_ISWORD_SET  = 1,
    URX_ISALNUM_SET = 2,
    URX_ISALPHA_SET = 3,
    URX_ISSPACE_SET = 4,

    // Sets for finding grapheme cluster boundaries.
    URX_GC_NORMAL,
    URX_GC_EXTEND,
    URX_GC_CONTROL,
    URX_GC_L,
    URX_GC_LV,
    URX_GC_LVT,
    URX_GC_V,
    URX_GC_T,

    // One more than the last set id above.
    URX_LAST_SET,

    // Flag bit to reverse sense of set membership test.
    URX_NEG_SET     = 0x800000
};
michael@0 | 286 | |
michael@0 | 287 | |
//
//  Match Engine State Stack Frame Layout.
//     Every field is an int64_t: two header fields, then the extra state.
//
struct REStackFrame {
    // ---- Header ----
    int64_t  fInputIdx;    // Position of next character in the input string.
    int64_t  fPatIdx;      // Position of next Op in the compiled pattern
                           //   (int64_t for UVector64, values fit in an int32_t).

    // ---- Remainder ----
    int64_t  fExtra[1];    // Variable-length array of extra state, for
                           //   capture group start/ends, atomic parentheses,
                           //   repeat counts, etc.
                           //   Locations assigned at pattern compile time.
};

//  Number of UVector elements in the header (fInputIdx and fPatIdx).
#define RESTACKFRAME_HDRCOUNT 2
michael@0 | 304 | |
//
//  Start-Of-Match type.  find() uses it to scan quickly to positions where a
//                        match might start, before firing up the full match engine.
//
enum StartOfMatch {
    START_NO_INFO,      // No hint available.
    START_CHAR,         // Match starts with a literal code point.
    START_SET,          // Match starts with something matching a set.
    START_START,        // Match starts at start of buffer only (^ or \A)
    START_LINE,         // Match starts with ^ in multi-line mode.
    START_STRING        // Match starts with a literal string.
};

//  Name of a StartOfMatch value as a string literal, "ILLEGAL" for any other value.
//     The argument is evaluated once per comparison, so it should have no side effects.
#define START_OF_MATCH_STR(v)                          \
    ( (v) == START_NO_INFO ? "START_NO_INFO"           \
    : (v) == START_CHAR    ? "START_CHAR"              \
    : (v) == START_SET     ? "START_SET"               \
    : (v) == START_START   ? "START_START"             \
    : (v) == START_LINE    ? "START_LINE"              \
    : (v) == START_STRING  ? "START_STRING"            \
    :                        "ILLEGAL" )
michael@0 | 325 | |
michael@0 | 326 | // |
michael@0 | 327 | // 8 bit set, to fast-path latin-1 set membership tests. |
michael@0 | 328 | // |
michael@0 | 329 | struct Regex8BitSet : public UMemory { |
michael@0 | 330 | inline Regex8BitSet(); |
michael@0 | 331 | inline void operator = (const Regex8BitSet &s); |
michael@0 | 332 | inline void init(const UnicodeSet *src); |
michael@0 | 333 | inline UBool contains(UChar32 c); |
michael@0 | 334 | inline void add(UChar32 c); |
michael@0 | 335 | int8_t d[32]; |
michael@0 | 336 | }; |
michael@0 | 337 | |
michael@0 | 338 | inline Regex8BitSet::Regex8BitSet() { |
michael@0 | 339 | uprv_memset(d, 0, sizeof(d)); |
michael@0 | 340 | } |
michael@0 | 341 | |
michael@0 | 342 | inline UBool Regex8BitSet::contains(UChar32 c) { |
michael@0 | 343 | // No bounds checking! This is deliberate. |
michael@0 | 344 | return ((d[c>>3] & 1 <<(c&7)) != 0); |
michael@0 | 345 | } |
michael@0 | 346 | |
michael@0 | 347 | inline void Regex8BitSet::add(UChar32 c) { |
michael@0 | 348 | d[c>>3] |= 1 << (c&7); |
michael@0 | 349 | } |
michael@0 | 350 | |
michael@0 | 351 | inline void Regex8BitSet::init(const UnicodeSet *s) { |
michael@0 | 352 | if (s != NULL) { |
michael@0 | 353 | for (int32_t i=0; i<=255; i++) { |
michael@0 | 354 | if (s->contains(i)) { |
michael@0 | 355 | this->add(i); |
michael@0 | 356 | } |
michael@0 | 357 | } |
michael@0 | 358 | } |
michael@0 | 359 | } |
michael@0 | 360 | |
michael@0 | 361 | inline void Regex8BitSet::operator = (const Regex8BitSet &s) { |
michael@0 | 362 | uprv_memcpy(d, s.d, sizeof(d)); |
michael@0 | 363 | } |
michael@0 | 364 | |
michael@0 | 365 | |
michael@0 | 366 | // Case folded UText Iterator helper class. |
michael@0 | 367 | // Wraps a UText, provides a case-folded enumeration over its contents. |
michael@0 | 368 | // Used in implementing case insensitive matching constructs. |
michael@0 | 369 | // Implementation in rematch.cpp |
michael@0 | 370 | |
michael@0 | 371 | class CaseFoldingUTextIterator: public UMemory { |
michael@0 | 372 | public: |
michael@0 | 373 | CaseFoldingUTextIterator(UText &text); |
michael@0 | 374 | ~CaseFoldingUTextIterator(); |
michael@0 | 375 | |
michael@0 | 376 | UChar32 next(); // Next case folded character |
michael@0 | 377 | |
michael@0 | 378 | UBool inExpansion(); // True if last char returned from next() and the |
michael@0 | 379 | // next to be returned both originated from a string |
michael@0 | 380 | // folding of the same code point from the orignal UText. |
michael@0 | 381 | private: |
michael@0 | 382 | UText &fUText; |
michael@0 | 383 | const UCaseProps *fcsp; |
michael@0 | 384 | const UChar *fFoldChars; |
michael@0 | 385 | int32_t fFoldLength; |
michael@0 | 386 | int32_t fFoldIndex; |
michael@0 | 387 | |
michael@0 | 388 | }; |
michael@0 | 389 | |
michael@0 | 390 | |
michael@0 | 391 | // Case folded UChar * string iterator. |
michael@0 | 392 | // Wraps a UChar *, provides a case-folded enumeration over its contents. |
michael@0 | 393 | // Used in implementing case insensitive matching constructs. |
michael@0 | 394 | // Implementation in rematch.cpp |
michael@0 | 395 | |
michael@0 | 396 | class CaseFoldingUCharIterator: public UMemory { |
michael@0 | 397 | public: |
michael@0 | 398 | CaseFoldingUCharIterator(const UChar *chars, int64_t start, int64_t limit); |
michael@0 | 399 | ~CaseFoldingUCharIterator(); |
michael@0 | 400 | |
michael@0 | 401 | UChar32 next(); // Next case folded character |
michael@0 | 402 | |
michael@0 | 403 | UBool inExpansion(); // True if last char returned from next() and the |
michael@0 | 404 | // next to be returned both originated from a string |
michael@0 | 405 | // folding of the same code point from the orignal UText. |
michael@0 | 406 | |
michael@0 | 407 | int64_t getIndex(); // Return the current input buffer index. |
michael@0 | 408 | |
michael@0 | 409 | private: |
michael@0 | 410 | const UChar *fChars; |
michael@0 | 411 | int64_t fIndex; |
michael@0 | 412 | int64_t fLimit; |
michael@0 | 413 | const UCaseProps *fcsp; |
michael@0 | 414 | const UChar *fFoldChars; |
michael@0 | 415 | int32_t fFoldLength; |
michael@0 | 416 | int32_t fFoldIndex; |
michael@0 | 417 | |
michael@0 | 418 | }; |
michael@0 | 419 | |
michael@0 | 420 | U_NAMESPACE_END |
michael@0 | 421 | #endif |
michael@0 | 422 |