|
1 /* |
|
2 ******************************************************************************* |
|
3 * Copyright (C) 2004-2013, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ******************************************************************************* |
|
6 * file name: uregex.cpp |
|
7 */ |
|
8 |
|
9 #include "unicode/utypes.h" |
|
10 |
|
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
|
12 |
|
13 #include "unicode/regex.h" |
|
14 #include "unicode/uregex.h" |
|
15 #include "unicode/unistr.h" |
|
16 #include "unicode/ustring.h" |
|
17 #include "unicode/uchar.h" |
|
18 #include "unicode/uobject.h" |
|
19 #include "unicode/utf16.h" |
|
20 #include "umutex.h" |
|
21 #include "uassert.h" |
|
22 #include "cmemory.h" |
|
23 |
|
24 #include "regextxt.h" |
|
25 |
|
26 #include <stdio.h> |
|
27 |
|
28 U_NAMESPACE_BEGIN |
|
29 |
|
// Space left in a buffer of capacity `len` once `idx` units are in use.
// Evaluates to 0, never a negative number, when idx is at or past len.
// Note that both arguments are evaluated more than once.
#define REMAINING_CAPACITY(idx,len) (((len)-(idx)) > 0 ? ((len)-(idx)) : 0)
|
31 |
|
32 struct RegularExpression: public UMemory { |
|
33 public: |
|
34 RegularExpression(); |
|
35 ~RegularExpression(); |
|
36 int32_t fMagic; |
|
37 RegexPattern *fPat; |
|
38 u_atomic_int32_t *fPatRefCount; |
|
39 UChar *fPatString; |
|
40 int32_t fPatStringLen; |
|
41 RegexMatcher *fMatcher; |
|
42 const UChar *fText; // Text from setText() |
|
43 int32_t fTextLength; // Length provided by user with setText(), which |
|
44 // may be -1. |
|
45 UBool fOwnsText; |
|
46 }; |
|
47 |
|
// Tag stored in RegularExpression::fMagic. The bytes spell "rexp" in ASCII.
static const int32_t REXP_MAGIC = 0x72657870;
|
49 |
|
50 RegularExpression::RegularExpression() { |
|
51 fMagic = REXP_MAGIC; |
|
52 fPat = NULL; |
|
53 fPatRefCount = NULL; |
|
54 fPatString = NULL; |
|
55 fPatStringLen = 0; |
|
56 fMatcher = NULL; |
|
57 fText = NULL; |
|
58 fTextLength = 0; |
|
59 fOwnsText = FALSE; |
|
60 } |
|
61 |
|
62 RegularExpression::~RegularExpression() { |
|
63 delete fMatcher; |
|
64 fMatcher = NULL; |
|
65 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) { |
|
66 delete fPat; |
|
67 uprv_free(fPatString); |
|
68 uprv_free((void *)fPatRefCount); |
|
69 } |
|
70 if (fOwnsText && fText!=NULL) { |
|
71 uprv_free((void *)fText); |
|
72 } |
|
73 fMagic = 0; |
|
74 } |
|
75 |
|
76 U_NAMESPACE_END |
|
77 |
|
78 U_NAMESPACE_USE |
|
79 |
|
80 //---------------------------------------------------------------------------------------- |
|
81 // |
|
82 // validateRE Do boilerplate style checks on API function parameters. |
|
83 // Return TRUE if they look OK. |
|
84 //---------------------------------------------------------------------------------------- |
|
85 static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) { |
|
86 if (U_FAILURE(*status)) { |
|
87 return FALSE; |
|
88 } |
|
89 if (re == NULL || re->fMagic != REXP_MAGIC) { |
|
90 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
91 return FALSE; |
|
92 } |
|
93 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway |
|
94 if (requiresText && re->fText == NULL && !re->fOwnsText) { |
|
95 *status = U_REGEX_INVALID_STATE; |
|
96 return FALSE; |
|
97 } |
|
98 return TRUE; |
|
99 } |
|
100 |
|
101 //---------------------------------------------------------------------------------------- |
|
102 // |
|
103 // uregex_open |
|
104 // |
|
105 //---------------------------------------------------------------------------------------- |
|
106 U_CAPI URegularExpression * U_EXPORT2 |
|
107 uregex_open( const UChar *pattern, |
|
108 int32_t patternLength, |
|
109 uint32_t flags, |
|
110 UParseError *pe, |
|
111 UErrorCode *status) { |
|
112 |
|
113 if (U_FAILURE(*status)) { |
|
114 return NULL; |
|
115 } |
|
116 if (pattern == NULL || patternLength < -1 || patternLength == 0) { |
|
117 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
118 return NULL; |
|
119 } |
|
120 int32_t actualPatLen = patternLength; |
|
121 if (actualPatLen == -1) { |
|
122 actualPatLen = u_strlen(pattern); |
|
123 } |
|
124 |
|
125 RegularExpression *re = new RegularExpression; |
|
126 u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t)); |
|
127 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1)); |
|
128 if (re == NULL || refC == NULL || patBuf == NULL) { |
|
129 *status = U_MEMORY_ALLOCATION_ERROR; |
|
130 delete re; |
|
131 uprv_free((void *)refC); |
|
132 uprv_free(patBuf); |
|
133 return NULL; |
|
134 } |
|
135 re->fPatRefCount = refC; |
|
136 *re->fPatRefCount = 1; |
|
137 |
|
138 // |
|
139 // Make a copy of the pattern string, so we can return it later if asked. |
|
140 // For compiling the pattern, we will use a UText wrapper around |
|
141 // this local copy, to avoid making even more copies. |
|
142 // |
|
143 re->fPatString = patBuf; |
|
144 re->fPatStringLen = patternLength; |
|
145 u_memcpy(patBuf, pattern, actualPatLen); |
|
146 patBuf[actualPatLen] = 0; |
|
147 |
|
148 UText patText = UTEXT_INITIALIZER; |
|
149 utext_openUChars(&patText, patBuf, patternLength, status); |
|
150 |
|
151 // |
|
152 // Compile the pattern |
|
153 // |
|
154 if (pe != NULL) { |
|
155 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); |
|
156 } else { |
|
157 re->fPat = RegexPattern::compile(&patText, flags, *status); |
|
158 } |
|
159 utext_close(&patText); |
|
160 |
|
161 if (U_FAILURE(*status)) { |
|
162 goto ErrorExit; |
|
163 } |
|
164 |
|
165 // |
|
166 // Create the matcher object |
|
167 // |
|
168 re->fMatcher = re->fPat->matcher(*status); |
|
169 if (U_SUCCESS(*status)) { |
|
170 return (URegularExpression*)re; |
|
171 } |
|
172 |
|
173 ErrorExit: |
|
174 delete re; |
|
175 return NULL; |
|
176 |
|
177 } |
|
178 |
|
179 //---------------------------------------------------------------------------------------- |
|
180 // |
|
181 // uregex_openUText |
|
182 // |
|
183 //---------------------------------------------------------------------------------------- |
|
184 U_CAPI URegularExpression * U_EXPORT2 |
|
185 uregex_openUText(UText *pattern, |
|
186 uint32_t flags, |
|
187 UParseError *pe, |
|
188 UErrorCode *status) { |
|
189 |
|
190 if (U_FAILURE(*status)) { |
|
191 return NULL; |
|
192 } |
|
193 if (pattern == NULL) { |
|
194 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
195 return NULL; |
|
196 } |
|
197 |
|
198 int64_t patternNativeLength = utext_nativeLength(pattern); |
|
199 |
|
200 if (patternNativeLength == 0) { |
|
201 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
202 return NULL; |
|
203 } |
|
204 |
|
205 RegularExpression *re = new RegularExpression; |
|
206 |
|
207 UErrorCode lengthStatus = U_ZERO_ERROR; |
|
208 int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus); |
|
209 |
|
210 u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t)); |
|
211 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1)); |
|
212 if (re == NULL || refC == NULL || patBuf == NULL) { |
|
213 *status = U_MEMORY_ALLOCATION_ERROR; |
|
214 delete re; |
|
215 uprv_free((void *)refC); |
|
216 uprv_free(patBuf); |
|
217 return NULL; |
|
218 } |
|
219 re->fPatRefCount = refC; |
|
220 *re->fPatRefCount = 1; |
|
221 |
|
222 // |
|
223 // Make a copy of the pattern string, so we can return it later if asked. |
|
224 // For compiling the pattern, we will use a read-only UText wrapper |
|
225 // around this local copy, to avoid making even more copies. |
|
226 // |
|
227 re->fPatString = patBuf; |
|
228 re->fPatStringLen = pattern16Length; |
|
229 utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status); |
|
230 |
|
231 UText patText = UTEXT_INITIALIZER; |
|
232 utext_openUChars(&patText, patBuf, pattern16Length, status); |
|
233 |
|
234 // |
|
235 // Compile the pattern |
|
236 // |
|
237 if (pe != NULL) { |
|
238 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); |
|
239 } else { |
|
240 re->fPat = RegexPattern::compile(&patText, flags, *status); |
|
241 } |
|
242 utext_close(&patText); |
|
243 |
|
244 if (U_FAILURE(*status)) { |
|
245 goto ErrorExit; |
|
246 } |
|
247 |
|
248 // |
|
249 // Create the matcher object |
|
250 // |
|
251 re->fMatcher = re->fPat->matcher(*status); |
|
252 if (U_SUCCESS(*status)) { |
|
253 return (URegularExpression*)re; |
|
254 } |
|
255 |
|
256 ErrorExit: |
|
257 delete re; |
|
258 return NULL; |
|
259 |
|
260 } |
|
261 |
|
262 //---------------------------------------------------------------------------------------- |
|
263 // |
|
264 // uregex_close |
|
265 // |
|
266 //---------------------------------------------------------------------------------------- |
|
267 U_CAPI void U_EXPORT2 |
|
268 uregex_close(URegularExpression *re2) { |
|
269 RegularExpression *re = (RegularExpression*)re2; |
|
270 UErrorCode status = U_ZERO_ERROR; |
|
271 if (validateRE(re, FALSE, &status) == FALSE) { |
|
272 return; |
|
273 } |
|
274 delete re; |
|
275 } |
|
276 |
|
277 |
|
278 //---------------------------------------------------------------------------------------- |
|
279 // |
|
280 // uregex_clone |
|
281 // |
|
282 //---------------------------------------------------------------------------------------- |
|
283 U_CAPI URegularExpression * U_EXPORT2 |
|
284 uregex_clone(const URegularExpression *source2, UErrorCode *status) { |
|
285 RegularExpression *source = (RegularExpression*)source2; |
|
286 if (validateRE(source, FALSE, status) == FALSE) { |
|
287 return NULL; |
|
288 } |
|
289 |
|
290 RegularExpression *clone = new RegularExpression; |
|
291 if (clone == NULL) { |
|
292 *status = U_MEMORY_ALLOCATION_ERROR; |
|
293 return NULL; |
|
294 } |
|
295 |
|
296 clone->fMatcher = source->fPat->matcher(*status); |
|
297 if (U_FAILURE(*status)) { |
|
298 delete clone; |
|
299 return NULL; |
|
300 } |
|
301 |
|
302 clone->fPat = source->fPat; |
|
303 clone->fPatRefCount = source->fPatRefCount; |
|
304 clone->fPatString = source->fPatString; |
|
305 clone->fPatStringLen = source->fPatStringLen; |
|
306 umtx_atomic_inc(source->fPatRefCount); |
|
307 // Note: fText is not cloned. |
|
308 |
|
309 return (URegularExpression*)clone; |
|
310 } |
|
311 |
|
312 |
|
313 |
|
314 |
|
315 //------------------------------------------------------------------------------ |
|
316 // |
|
317 // uregex_pattern |
|
318 // |
|
319 //------------------------------------------------------------------------------ |
|
320 U_CAPI const UChar * U_EXPORT2 |
|
321 uregex_pattern(const URegularExpression *regexp2, |
|
322 int32_t *patLength, |
|
323 UErrorCode *status) { |
|
324 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
325 |
|
326 if (validateRE(regexp, FALSE, status) == FALSE) { |
|
327 return NULL; |
|
328 } |
|
329 if (patLength != NULL) { |
|
330 *patLength = regexp->fPatStringLen; |
|
331 } |
|
332 return regexp->fPatString; |
|
333 } |
|
334 |
|
335 |
|
336 //------------------------------------------------------------------------------ |
|
337 // |
|
338 // uregex_patternUText |
|
339 // |
|
340 //------------------------------------------------------------------------------ |
|
341 U_CAPI UText * U_EXPORT2 |
|
342 uregex_patternUText(const URegularExpression *regexp2, |
|
343 UErrorCode *status) { |
|
344 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
345 return regexp->fPat->patternText(*status); |
|
346 } |
|
347 |
|
348 |
|
349 //------------------------------------------------------------------------------ |
|
350 // |
|
351 // uregex_flags |
|
352 // |
|
353 //------------------------------------------------------------------------------ |
|
354 U_CAPI int32_t U_EXPORT2 |
|
355 uregex_flags(const URegularExpression *regexp2, UErrorCode *status) { |
|
356 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
357 if (validateRE(regexp, FALSE, status) == FALSE) { |
|
358 return 0; |
|
359 } |
|
360 int32_t flags = regexp->fPat->flags(); |
|
361 return flags; |
|
362 } |
|
363 |
|
364 |
|
365 //------------------------------------------------------------------------------ |
|
366 // |
|
367 // uregex_setText |
|
368 // |
|
369 //------------------------------------------------------------------------------ |
|
370 U_CAPI void U_EXPORT2 |
|
371 uregex_setText(URegularExpression *regexp2, |
|
372 const UChar *text, |
|
373 int32_t textLength, |
|
374 UErrorCode *status) { |
|
375 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
376 if (validateRE(regexp, FALSE, status) == FALSE) { |
|
377 return; |
|
378 } |
|
379 if (text == NULL || textLength < -1) { |
|
380 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
381 return; |
|
382 } |
|
383 |
|
384 if (regexp->fOwnsText && regexp->fText != NULL) { |
|
385 uprv_free((void *)regexp->fText); |
|
386 } |
|
387 |
|
388 regexp->fText = text; |
|
389 regexp->fTextLength = textLength; |
|
390 regexp->fOwnsText = FALSE; |
|
391 |
|
392 UText input = UTEXT_INITIALIZER; |
|
393 utext_openUChars(&input, text, textLength, status); |
|
394 regexp->fMatcher->reset(&input); |
|
395 utext_close(&input); // reset() made a shallow clone, so we don't need this copy |
|
396 } |
|
397 |
|
398 |
|
399 //------------------------------------------------------------------------------ |
|
400 // |
|
401 // uregex_setUText |
|
402 // |
|
403 //------------------------------------------------------------------------------ |
|
404 U_CAPI void U_EXPORT2 |
|
405 uregex_setUText(URegularExpression *regexp2, |
|
406 UText *text, |
|
407 UErrorCode *status) { |
|
408 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
409 if (validateRE(regexp, FALSE, status) == FALSE) { |
|
410 return; |
|
411 } |
|
412 if (text == NULL) { |
|
413 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
414 return; |
|
415 } |
|
416 |
|
417 if (regexp->fOwnsText && regexp->fText != NULL) { |
|
418 uprv_free((void *)regexp->fText); |
|
419 } |
|
420 |
|
421 regexp->fText = NULL; // only fill it in on request |
|
422 regexp->fTextLength = -1; |
|
423 regexp->fOwnsText = TRUE; |
|
424 regexp->fMatcher->reset(text); |
|
425 } |
|
426 |
|
427 |
|
428 |
|
429 //------------------------------------------------------------------------------ |
|
430 // |
|
431 // uregex_getText |
|
432 // |
|
433 //------------------------------------------------------------------------------ |
|
434 U_CAPI const UChar * U_EXPORT2 |
|
435 uregex_getText(URegularExpression *regexp2, |
|
436 int32_t *textLength, |
|
437 UErrorCode *status) { |
|
438 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
439 if (validateRE(regexp, FALSE, status) == FALSE) { |
|
440 return NULL; |
|
441 } |
|
442 |
|
443 if (regexp->fText == NULL) { |
|
444 // need to fill in the text |
|
445 UText *inputText = regexp->fMatcher->inputText(); |
|
446 int64_t inputNativeLength = utext_nativeLength(inputText); |
|
447 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) { |
|
448 regexp->fText = inputText->chunkContents; |
|
449 regexp->fTextLength = (int32_t)inputNativeLength; |
|
450 regexp->fOwnsText = FALSE; // because the UText owns it |
|
451 } else { |
|
452 UErrorCode lengthStatus = U_ZERO_ERROR; |
|
453 regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error |
|
454 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1)); |
|
455 |
|
456 utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status); |
|
457 regexp->fText = inputChars; |
|
458 regexp->fOwnsText = TRUE; // should already be set but just in case |
|
459 } |
|
460 } |
|
461 |
|
462 if (textLength != NULL) { |
|
463 *textLength = regexp->fTextLength; |
|
464 } |
|
465 return regexp->fText; |
|
466 } |
|
467 |
|
468 |
|
469 //------------------------------------------------------------------------------ |
|
470 // |
|
471 // uregex_getUText |
|
472 // |
|
473 //------------------------------------------------------------------------------ |
|
474 U_CAPI UText * U_EXPORT2 |
|
475 uregex_getUText(URegularExpression *regexp2, |
|
476 UText *dest, |
|
477 UErrorCode *status) { |
|
478 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
479 if (validateRE(regexp, FALSE, status) == FALSE) { |
|
480 return dest; |
|
481 } |
|
482 return regexp->fMatcher->getInput(dest, *status); |
|
483 } |
|
484 |
|
485 |
|
486 //------------------------------------------------------------------------------ |
|
487 // |
|
488 // uregex_refreshUText |
|
489 // |
|
490 //------------------------------------------------------------------------------ |
|
491 U_CAPI void U_EXPORT2 |
|
492 uregex_refreshUText(URegularExpression *regexp2, |
|
493 UText *text, |
|
494 UErrorCode *status) { |
|
495 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
496 if (validateRE(regexp, FALSE, status) == FALSE) { |
|
497 return; |
|
498 } |
|
499 regexp->fMatcher->refreshInputText(text, *status); |
|
500 } |
|
501 |
|
502 |
|
503 //------------------------------------------------------------------------------ |
|
504 // |
|
505 // uregex_matches |
|
506 // |
|
507 //------------------------------------------------------------------------------ |
|
508 U_CAPI UBool U_EXPORT2 |
|
509 uregex_matches(URegularExpression *regexp2, |
|
510 int32_t startIndex, |
|
511 UErrorCode *status) { |
|
512 return uregex_matches64( regexp2, (int64_t)startIndex, status); |
|
513 } |
|
514 |
|
515 U_CAPI UBool U_EXPORT2 |
|
516 uregex_matches64(URegularExpression *regexp2, |
|
517 int64_t startIndex, |
|
518 UErrorCode *status) { |
|
519 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
520 UBool result = FALSE; |
|
521 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
522 return result; |
|
523 } |
|
524 if (startIndex == -1) { |
|
525 result = regexp->fMatcher->matches(*status); |
|
526 } else { |
|
527 result = regexp->fMatcher->matches(startIndex, *status); |
|
528 } |
|
529 return result; |
|
530 } |
|
531 |
|
532 |
|
533 //------------------------------------------------------------------------------ |
|
534 // |
|
535 // uregex_lookingAt |
|
536 // |
|
537 //------------------------------------------------------------------------------ |
|
538 U_CAPI UBool U_EXPORT2 |
|
539 uregex_lookingAt(URegularExpression *regexp2, |
|
540 int32_t startIndex, |
|
541 UErrorCode *status) { |
|
542 return uregex_lookingAt64( regexp2, (int64_t)startIndex, status); |
|
543 } |
|
544 |
|
545 U_CAPI UBool U_EXPORT2 |
|
546 uregex_lookingAt64(URegularExpression *regexp2, |
|
547 int64_t startIndex, |
|
548 UErrorCode *status) { |
|
549 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
550 UBool result = FALSE; |
|
551 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
552 return result; |
|
553 } |
|
554 if (startIndex == -1) { |
|
555 result = regexp->fMatcher->lookingAt(*status); |
|
556 } else { |
|
557 result = regexp->fMatcher->lookingAt(startIndex, *status); |
|
558 } |
|
559 return result; |
|
560 } |
|
561 |
|
562 |
|
563 |
|
564 //------------------------------------------------------------------------------ |
|
565 // |
|
566 // uregex_find |
|
567 // |
|
568 //------------------------------------------------------------------------------ |
|
569 U_CAPI UBool U_EXPORT2 |
|
570 uregex_find(URegularExpression *regexp2, |
|
571 int32_t startIndex, |
|
572 UErrorCode *status) { |
|
573 return uregex_find64( regexp2, (int64_t)startIndex, status); |
|
574 } |
|
575 |
|
576 U_CAPI UBool U_EXPORT2 |
|
577 uregex_find64(URegularExpression *regexp2, |
|
578 int64_t startIndex, |
|
579 UErrorCode *status) { |
|
580 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
581 UBool result = FALSE; |
|
582 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
583 return result; |
|
584 } |
|
585 if (startIndex == -1) { |
|
586 regexp->fMatcher->resetPreserveRegion(); |
|
587 result = regexp->fMatcher->find(); |
|
588 } else { |
|
589 result = regexp->fMatcher->find(startIndex, *status); |
|
590 } |
|
591 return result; |
|
592 } |
|
593 |
|
594 |
|
595 //------------------------------------------------------------------------------ |
|
596 // |
|
597 // uregex_findNext |
|
598 // |
|
599 //------------------------------------------------------------------------------ |
|
600 U_CAPI UBool U_EXPORT2 |
|
601 uregex_findNext(URegularExpression *regexp2, |
|
602 UErrorCode *status) { |
|
603 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
604 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
605 return FALSE; |
|
606 } |
|
607 UBool result = regexp->fMatcher->find(); |
|
608 return result; |
|
609 } |
|
610 |
|
611 //------------------------------------------------------------------------------ |
|
612 // |
|
613 // uregex_groupCount |
|
614 // |
|
615 //------------------------------------------------------------------------------ |
|
616 U_CAPI int32_t U_EXPORT2 |
|
617 uregex_groupCount(URegularExpression *regexp2, |
|
618 UErrorCode *status) { |
|
619 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
620 if (validateRE(regexp, FALSE, status) == FALSE) { |
|
621 return 0; |
|
622 } |
|
623 int32_t result = regexp->fMatcher->groupCount(); |
|
624 return result; |
|
625 } |
|
626 |
|
627 |
|
628 //------------------------------------------------------------------------------ |
|
629 // |
|
630 // uregex_group |
|
631 // |
|
632 //------------------------------------------------------------------------------ |
|
633 U_CAPI int32_t U_EXPORT2 |
|
634 uregex_group(URegularExpression *regexp2, |
|
635 int32_t groupNum, |
|
636 UChar *dest, |
|
637 int32_t destCapacity, |
|
638 UErrorCode *status) { |
|
639 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
640 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
641 return 0; |
|
642 } |
|
643 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { |
|
644 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
645 return 0; |
|
646 } |
|
647 |
|
648 if (destCapacity == 0 || regexp->fText != NULL) { |
|
649 // If preflighting or if we already have the text as UChars, |
|
650 // this is a little cheaper than going through uregex_groupUTextDeep() |
|
651 |
|
652 // |
|
653 // Pick up the range of characters from the matcher |
|
654 // |
|
655 int32_t startIx = regexp->fMatcher->start(groupNum, *status); |
|
656 int32_t endIx = regexp->fMatcher->end (groupNum, *status); |
|
657 if (U_FAILURE(*status)) { |
|
658 return 0; |
|
659 } |
|
660 |
|
661 // |
|
662 // Trim length based on buffer capacity |
|
663 // |
|
664 int32_t fullLength = endIx - startIx; |
|
665 int32_t copyLength = fullLength; |
|
666 if (copyLength < destCapacity) { |
|
667 dest[copyLength] = 0; |
|
668 } else if (copyLength == destCapacity) { |
|
669 *status = U_STRING_NOT_TERMINATED_WARNING; |
|
670 } else { |
|
671 copyLength = destCapacity; |
|
672 *status = U_BUFFER_OVERFLOW_ERROR; |
|
673 } |
|
674 |
|
675 // |
|
676 // Copy capture group to user's buffer |
|
677 // |
|
678 if (copyLength > 0) { |
|
679 u_memcpy(dest, ®exp->fText[startIx], copyLength); |
|
680 } |
|
681 return fullLength; |
|
682 } else { |
|
683 UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status); |
|
684 int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status); |
|
685 utext_close(groupText); |
|
686 return result; |
|
687 } |
|
688 } |
|
689 |
|
690 |
|
691 //------------------------------------------------------------------------------ |
|
692 // |
|
693 // uregex_groupUText |
|
694 // |
|
695 //------------------------------------------------------------------------------ |
|
696 U_CAPI UText * U_EXPORT2 |
|
697 uregex_groupUText(URegularExpression *regexp2, |
|
698 int32_t groupNum, |
|
699 UText *dest, |
|
700 int64_t *groupLength, |
|
701 UErrorCode *status) { |
|
702 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
703 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
704 UErrorCode emptyTextStatus = U_ZERO_ERROR; |
|
705 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); |
|
706 } |
|
707 |
|
708 return regexp->fMatcher->group(groupNum, dest, *groupLength, *status); |
|
709 } |
|
710 |
|
711 //------------------------------------------------------------------------------ |
|
712 // |
|
713 // uregex_groupUTextDeep |
|
714 // |
|
715 //------------------------------------------------------------------------------ |
|
716 U_CAPI UText * U_EXPORT2 |
|
717 uregex_groupUTextDeep(URegularExpression *regexp2, |
|
718 int32_t groupNum, |
|
719 UText *dest, |
|
720 UErrorCode *status) { |
|
721 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
722 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
723 UErrorCode emptyTextStatus = U_ZERO_ERROR; |
|
724 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); |
|
725 } |
|
726 |
|
727 if (regexp->fText != NULL) { |
|
728 // |
|
729 // Pick up the range of characters from the matcher |
|
730 // and use our already-extracted characters |
|
731 // |
|
732 int32_t startIx = regexp->fMatcher->start(groupNum, *status); |
|
733 int32_t endIx = regexp->fMatcher->end (groupNum, *status); |
|
734 if (U_FAILURE(*status)) { |
|
735 UErrorCode emptyTextStatus = U_ZERO_ERROR; |
|
736 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); |
|
737 } |
|
738 |
|
739 if (dest) { |
|
740 utext_replace(dest, 0, utext_nativeLength(dest), ®exp->fText[startIx], endIx - startIx, status); |
|
741 } else { |
|
742 UText groupText = UTEXT_INITIALIZER; |
|
743 utext_openUChars(&groupText, ®exp->fText[startIx], endIx - startIx, status); |
|
744 dest = utext_clone(NULL, &groupText, TRUE, FALSE, status); |
|
745 utext_close(&groupText); |
|
746 } |
|
747 |
|
748 return dest; |
|
749 } else { |
|
750 return regexp->fMatcher->group(groupNum, dest, *status); |
|
751 } |
|
752 } |
|
753 |
|
754 //------------------------------------------------------------------------------ |
|
755 // |
|
756 // uregex_start |
|
757 // |
|
758 //------------------------------------------------------------------------------ |
|
759 U_CAPI int32_t U_EXPORT2 |
|
760 uregex_start(URegularExpression *regexp2, |
|
761 int32_t groupNum, |
|
762 UErrorCode *status) { |
|
763 return (int32_t)uregex_start64( regexp2, groupNum, status); |
|
764 } |
|
765 |
|
766 U_CAPI int64_t U_EXPORT2 |
|
767 uregex_start64(URegularExpression *regexp2, |
|
768 int32_t groupNum, |
|
769 UErrorCode *status) { |
|
770 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
771 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
772 return 0; |
|
773 } |
|
774 int32_t result = regexp->fMatcher->start(groupNum, *status); |
|
775 return result; |
|
776 } |
|
777 |
|
778 //------------------------------------------------------------------------------ |
|
779 // |
|
780 // uregex_end |
|
781 // |
|
782 //------------------------------------------------------------------------------ |
|
783 U_CAPI int32_t U_EXPORT2 |
|
784 uregex_end(URegularExpression *regexp2, |
|
785 int32_t groupNum, |
|
786 UErrorCode *status) { |
|
787 return (int32_t)uregex_end64( regexp2, groupNum, status); |
|
788 } |
|
789 |
|
790 U_CAPI int64_t U_EXPORT2 |
|
791 uregex_end64(URegularExpression *regexp2, |
|
792 int32_t groupNum, |
|
793 UErrorCode *status) { |
|
794 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
795 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
796 return 0; |
|
797 } |
|
798 int32_t result = regexp->fMatcher->end(groupNum, *status); |
|
799 return result; |
|
800 } |
|
801 |
|
802 //------------------------------------------------------------------------------ |
|
803 // |
|
804 // uregex_reset |
|
805 // |
|
806 //------------------------------------------------------------------------------ |
|
807 U_CAPI void U_EXPORT2 |
|
808 uregex_reset(URegularExpression *regexp2, |
|
809 int32_t index, |
|
810 UErrorCode *status) { |
|
811 uregex_reset64( regexp2, (int64_t)index, status); |
|
812 } |
|
813 |
|
814 U_CAPI void U_EXPORT2 |
|
815 uregex_reset64(URegularExpression *regexp2, |
|
816 int64_t index, |
|
817 UErrorCode *status) { |
|
818 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
819 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
820 return; |
|
821 } |
|
822 regexp->fMatcher->reset(index, *status); |
|
823 } |
|
824 |
|
825 |
|
826 //------------------------------------------------------------------------------ |
|
827 // |
|
828 // uregex_setRegion |
|
829 // |
|
830 //------------------------------------------------------------------------------ |
|
831 U_CAPI void U_EXPORT2 |
|
832 uregex_setRegion(URegularExpression *regexp2, |
|
833 int32_t regionStart, |
|
834 int32_t regionLimit, |
|
835 UErrorCode *status) { |
|
836 uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status); |
|
837 } |
|
838 |
|
839 U_CAPI void U_EXPORT2 |
|
840 uregex_setRegion64(URegularExpression *regexp2, |
|
841 int64_t regionStart, |
|
842 int64_t regionLimit, |
|
843 UErrorCode *status) { |
|
844 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
845 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
846 return; |
|
847 } |
|
848 regexp->fMatcher->region(regionStart, regionLimit, *status); |
|
849 } |
|
850 |
|
851 |
|
852 //------------------------------------------------------------------------------ |
|
853 // |
|
854 // uregex_setRegionAndStart |
|
855 // |
|
856 //------------------------------------------------------------------------------ |
|
857 U_CAPI void U_EXPORT2 |
|
858 uregex_setRegionAndStart(URegularExpression *regexp2, |
|
859 int64_t regionStart, |
|
860 int64_t regionLimit, |
|
861 int64_t startIndex, |
|
862 UErrorCode *status) { |
|
863 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
864 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
865 return; |
|
866 } |
|
867 regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status); |
|
868 } |
|
869 |
|
870 //------------------------------------------------------------------------------ |
|
871 // |
|
872 // uregex_regionStart |
|
873 // |
|
874 //------------------------------------------------------------------------------ |
|
875 U_CAPI int32_t U_EXPORT2 |
|
876 uregex_regionStart(const URegularExpression *regexp2, |
|
877 UErrorCode *status) { |
|
878 return (int32_t)uregex_regionStart64(regexp2, status); |
|
879 } |
|
880 |
|
881 U_CAPI int64_t U_EXPORT2 |
|
882 uregex_regionStart64(const URegularExpression *regexp2, |
|
883 UErrorCode *status) { |
|
884 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
885 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
886 return 0; |
|
887 } |
|
888 return regexp->fMatcher->regionStart(); |
|
889 } |
|
890 |
|
891 |
|
892 //------------------------------------------------------------------------------ |
|
893 // |
|
894 // uregex_regionEnd |
|
895 // |
|
896 //------------------------------------------------------------------------------ |
|
897 U_CAPI int32_t U_EXPORT2 |
|
898 uregex_regionEnd(const URegularExpression *regexp2, |
|
899 UErrorCode *status) { |
|
900 return (int32_t)uregex_regionEnd64(regexp2, status); |
|
901 } |
|
902 |
|
903 U_CAPI int64_t U_EXPORT2 |
|
904 uregex_regionEnd64(const URegularExpression *regexp2, |
|
905 UErrorCode *status) { |
|
906 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
907 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
908 return 0; |
|
909 } |
|
910 return regexp->fMatcher->regionEnd(); |
|
911 } |
|
912 |
|
913 |
|
914 //------------------------------------------------------------------------------ |
|
915 // |
|
916 // uregex_hasTransparentBounds |
|
917 // |
|
918 //------------------------------------------------------------------------------ |
|
919 U_CAPI UBool U_EXPORT2 |
|
920 uregex_hasTransparentBounds(const URegularExpression *regexp2, |
|
921 UErrorCode *status) { |
|
922 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
923 if (validateRE(regexp, FALSE, status) == FALSE) { |
|
924 return FALSE; |
|
925 } |
|
926 return regexp->fMatcher->hasTransparentBounds(); |
|
927 } |
|
928 |
|
929 |
|
930 //------------------------------------------------------------------------------ |
|
931 // |
|
932 // uregex_useTransparentBounds |
|
933 // |
|
934 //------------------------------------------------------------------------------ |
|
935 U_CAPI void U_EXPORT2 |
|
936 uregex_useTransparentBounds(URegularExpression *regexp2, |
|
937 UBool b, |
|
938 UErrorCode *status) { |
|
939 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
940 if (validateRE(regexp, FALSE, status) == FALSE) { |
|
941 return; |
|
942 } |
|
943 regexp->fMatcher->useTransparentBounds(b); |
|
944 } |
|
945 |
|
946 |
|
947 //------------------------------------------------------------------------------ |
|
948 // |
|
949 // uregex_hasAnchoringBounds |
|
950 // |
|
951 //------------------------------------------------------------------------------ |
|
952 U_CAPI UBool U_EXPORT2 |
|
953 uregex_hasAnchoringBounds(const URegularExpression *regexp2, |
|
954 UErrorCode *status) { |
|
955 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
956 if (validateRE(regexp, FALSE, status) == FALSE) { |
|
957 return FALSE; |
|
958 } |
|
959 return regexp->fMatcher->hasAnchoringBounds(); |
|
960 } |
|
961 |
|
962 |
|
963 //------------------------------------------------------------------------------ |
|
964 // |
|
965 // uregex_useAnchoringBounds |
|
966 // |
|
967 //------------------------------------------------------------------------------ |
|
968 U_CAPI void U_EXPORT2 |
|
969 uregex_useAnchoringBounds(URegularExpression *regexp2, |
|
970 UBool b, |
|
971 UErrorCode *status) { |
|
972 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
973 if (validateRE(regexp, FALSE, status) == FALSE) { |
|
974 return; |
|
975 } |
|
976 regexp->fMatcher->useAnchoringBounds(b); |
|
977 } |
|
978 |
|
979 |
|
980 //------------------------------------------------------------------------------ |
|
981 // |
|
982 // uregex_hitEnd |
|
983 // |
|
984 //------------------------------------------------------------------------------ |
|
985 U_CAPI UBool U_EXPORT2 |
|
986 uregex_hitEnd(const URegularExpression *regexp2, |
|
987 UErrorCode *status) { |
|
988 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
989 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
990 return FALSE; |
|
991 } |
|
992 return regexp->fMatcher->hitEnd(); |
|
993 } |
|
994 |
|
995 |
|
996 //------------------------------------------------------------------------------ |
|
997 // |
|
998 // uregex_requireEnd |
|
999 // |
|
1000 //------------------------------------------------------------------------------ |
|
1001 U_CAPI UBool U_EXPORT2 |
|
1002 uregex_requireEnd(const URegularExpression *regexp2, |
|
1003 UErrorCode *status) { |
|
1004 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1005 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
1006 return FALSE; |
|
1007 } |
|
1008 return regexp->fMatcher->requireEnd(); |
|
1009 } |
|
1010 |
|
1011 |
|
1012 //------------------------------------------------------------------------------ |
|
1013 // |
|
1014 // uregex_setTimeLimit |
|
1015 // |
|
1016 //------------------------------------------------------------------------------ |
|
1017 U_CAPI void U_EXPORT2 |
|
1018 uregex_setTimeLimit(URegularExpression *regexp2, |
|
1019 int32_t limit, |
|
1020 UErrorCode *status) { |
|
1021 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1022 if (validateRE(regexp, FALSE, status)) { |
|
1023 regexp->fMatcher->setTimeLimit(limit, *status); |
|
1024 } |
|
1025 } |
|
1026 |
|
1027 |
|
1028 |
|
1029 //------------------------------------------------------------------------------ |
|
1030 // |
|
1031 // uregex_getTimeLimit |
|
1032 // |
|
1033 //------------------------------------------------------------------------------ |
|
1034 U_CAPI int32_t U_EXPORT2 |
|
1035 uregex_getTimeLimit(const URegularExpression *regexp2, |
|
1036 UErrorCode *status) { |
|
1037 int32_t retVal = 0; |
|
1038 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1039 if (validateRE(regexp, FALSE, status)) { |
|
1040 retVal = regexp->fMatcher->getTimeLimit(); |
|
1041 } |
|
1042 return retVal; |
|
1043 } |
|
1044 |
|
1045 |
|
1046 |
|
1047 //------------------------------------------------------------------------------ |
|
1048 // |
|
1049 // uregex_setStackLimit |
|
1050 // |
|
1051 //------------------------------------------------------------------------------ |
|
1052 U_CAPI void U_EXPORT2 |
|
1053 uregex_setStackLimit(URegularExpression *regexp2, |
|
1054 int32_t limit, |
|
1055 UErrorCode *status) { |
|
1056 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1057 if (validateRE(regexp, FALSE, status)) { |
|
1058 regexp->fMatcher->setStackLimit(limit, *status); |
|
1059 } |
|
1060 } |
|
1061 |
|
1062 |
|
1063 |
|
1064 //------------------------------------------------------------------------------ |
|
1065 // |
|
1066 // uregex_getStackLimit |
|
1067 // |
|
1068 //------------------------------------------------------------------------------ |
|
1069 U_CAPI int32_t U_EXPORT2 |
|
1070 uregex_getStackLimit(const URegularExpression *regexp2, |
|
1071 UErrorCode *status) { |
|
1072 int32_t retVal = 0; |
|
1073 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1074 if (validateRE(regexp, FALSE, status)) { |
|
1075 retVal = regexp->fMatcher->getStackLimit(); |
|
1076 } |
|
1077 return retVal; |
|
1078 } |
|
1079 |
|
1080 |
|
1081 //------------------------------------------------------------------------------ |
|
1082 // |
|
1083 // uregex_setMatchCallback |
|
1084 // |
|
1085 //------------------------------------------------------------------------------ |
|
1086 U_CAPI void U_EXPORT2 |
|
1087 uregex_setMatchCallback(URegularExpression *regexp2, |
|
1088 URegexMatchCallback *callback, |
|
1089 const void *context, |
|
1090 UErrorCode *status) { |
|
1091 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1092 if (validateRE(regexp, FALSE, status)) { |
|
1093 regexp->fMatcher->setMatchCallback(callback, context, *status); |
|
1094 } |
|
1095 } |
|
1096 |
|
1097 |
|
1098 //------------------------------------------------------------------------------ |
|
1099 // |
|
1100 // uregex_getMatchCallback |
|
1101 // |
|
1102 //------------------------------------------------------------------------------ |
|
1103 U_CAPI void U_EXPORT2 |
|
1104 uregex_getMatchCallback(const URegularExpression *regexp2, |
|
1105 URegexMatchCallback **callback, |
|
1106 const void **context, |
|
1107 UErrorCode *status) { |
|
1108 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1109 if (validateRE(regexp, FALSE, status)) { |
|
1110 regexp->fMatcher->getMatchCallback(*callback, *context, *status); |
|
1111 } |
|
1112 } |
|
1113 |
|
1114 |
|
//------------------------------------------------------------------------------
//
//    uregex_setFindProgressCallback
//
//------------------------------------------------------------------------------
|
1120 U_CAPI void U_EXPORT2 |
|
1121 uregex_setFindProgressCallback(URegularExpression *regexp2, |
|
1122 URegexFindProgressCallback *callback, |
|
1123 const void *context, |
|
1124 UErrorCode *status) { |
|
1125 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1126 if (validateRE(regexp, FALSE, status)) { |
|
1127 regexp->fMatcher->setFindProgressCallback(callback, context, *status); |
|
1128 } |
|
1129 } |
|
1130 |
|
1131 |
|
//------------------------------------------------------------------------------
//
//    uregex_getFindProgressCallback
//
//------------------------------------------------------------------------------
|
1137 U_CAPI void U_EXPORT2 |
|
1138 uregex_getFindProgressCallback(const URegularExpression *regexp2, |
|
1139 URegexFindProgressCallback **callback, |
|
1140 const void **context, |
|
1141 UErrorCode *status) { |
|
1142 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1143 if (validateRE(regexp, FALSE, status)) { |
|
1144 regexp->fMatcher->getFindProgressCallback(*callback, *context, *status); |
|
1145 } |
|
1146 } |
|
1147 |
|
1148 |
|
1149 //------------------------------------------------------------------------------ |
|
1150 // |
|
1151 // uregex_replaceAll |
|
1152 // |
|
1153 //------------------------------------------------------------------------------ |
|
1154 U_CAPI int32_t U_EXPORT2 |
|
1155 uregex_replaceAll(URegularExpression *regexp2, |
|
1156 const UChar *replacementText, |
|
1157 int32_t replacementLength, |
|
1158 UChar *destBuf, |
|
1159 int32_t destCapacity, |
|
1160 UErrorCode *status) { |
|
1161 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1162 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
1163 return 0; |
|
1164 } |
|
1165 if (replacementText == NULL || replacementLength < -1 || |
|
1166 (destBuf == NULL && destCapacity > 0) || |
|
1167 destCapacity < 0) { |
|
1168 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
1169 return 0; |
|
1170 } |
|
1171 |
|
1172 int32_t len = 0; |
|
1173 |
|
1174 uregex_reset(regexp2, 0, status); |
|
1175 |
|
1176 // Note: Seperate error code variables for findNext() and appendReplacement() |
|
1177 // are used so that destination buffer overflow errors |
|
1178 // in appendReplacement won't stop findNext() from working. |
|
1179 // appendReplacement() and appendTail() special case incoming buffer |
|
1180 // overflow errors, continuing to return the correct length. |
|
1181 UErrorCode findStatus = *status; |
|
1182 while (uregex_findNext(regexp2, &findStatus)) { |
|
1183 len += uregex_appendReplacement(regexp2, replacementText, replacementLength, |
|
1184 &destBuf, &destCapacity, status); |
|
1185 } |
|
1186 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); |
|
1187 |
|
1188 if (U_FAILURE(findStatus)) { |
|
1189 // If anything went wrong with the findNext(), make that error trump |
|
1190 // whatever may have happened with the append() operations. |
|
1191 // Errors in findNext() are not expected. |
|
1192 *status = findStatus; |
|
1193 } |
|
1194 |
|
1195 return len; |
|
1196 } |
|
1197 |
|
1198 |
|
1199 //------------------------------------------------------------------------------ |
|
1200 // |
|
1201 // uregex_replaceAllUText |
|
1202 // |
|
1203 //------------------------------------------------------------------------------ |
|
1204 U_CAPI UText * U_EXPORT2 |
|
1205 uregex_replaceAllUText(URegularExpression *regexp2, |
|
1206 UText *replacementText, |
|
1207 UText *dest, |
|
1208 UErrorCode *status) { |
|
1209 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1210 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
1211 return 0; |
|
1212 } |
|
1213 if (replacementText == NULL) { |
|
1214 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
1215 return 0; |
|
1216 } |
|
1217 |
|
1218 dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); |
|
1219 return dest; |
|
1220 } |
|
1221 |
|
1222 |
|
1223 //------------------------------------------------------------------------------ |
|
1224 // |
|
1225 // uregex_replaceFirst |
|
1226 // |
|
1227 //------------------------------------------------------------------------------ |
|
1228 U_CAPI int32_t U_EXPORT2 |
|
1229 uregex_replaceFirst(URegularExpression *regexp2, |
|
1230 const UChar *replacementText, |
|
1231 int32_t replacementLength, |
|
1232 UChar *destBuf, |
|
1233 int32_t destCapacity, |
|
1234 UErrorCode *status) { |
|
1235 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1236 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
1237 return 0; |
|
1238 } |
|
1239 if (replacementText == NULL || replacementLength < -1 || |
|
1240 (destBuf == NULL && destCapacity > 0) || |
|
1241 destCapacity < 0) { |
|
1242 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
1243 return 0; |
|
1244 } |
|
1245 |
|
1246 int32_t len = 0; |
|
1247 UBool findSucceeded; |
|
1248 uregex_reset(regexp2, 0, status); |
|
1249 findSucceeded = uregex_find(regexp2, 0, status); |
|
1250 if (findSucceeded) { |
|
1251 len = uregex_appendReplacement(regexp2, replacementText, replacementLength, |
|
1252 &destBuf, &destCapacity, status); |
|
1253 } |
|
1254 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); |
|
1255 |
|
1256 return len; |
|
1257 } |
|
1258 |
|
1259 |
|
1260 //------------------------------------------------------------------------------ |
|
1261 // |
|
1262 // uregex_replaceFirstUText |
|
1263 // |
|
1264 //------------------------------------------------------------------------------ |
|
1265 U_CAPI UText * U_EXPORT2 |
|
1266 uregex_replaceFirstUText(URegularExpression *regexp2, |
|
1267 UText *replacementText, |
|
1268 UText *dest, |
|
1269 UErrorCode *status) { |
|
1270 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1271 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
1272 return 0; |
|
1273 } |
|
1274 if (replacementText == NULL) { |
|
1275 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
1276 return 0; |
|
1277 } |
|
1278 |
|
1279 dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status); |
|
1280 return dest; |
|
1281 } |
|
1282 |
|
1283 |
|
1284 //------------------------------------------------------------------------------ |
|
1285 // |
|
1286 // uregex_appendReplacement |
|
1287 // |
|
1288 //------------------------------------------------------------------------------ |
|
1289 |
|
1290 U_NAMESPACE_BEGIN |
|
1291 // |
|
1292 // Dummy class, because these functions need to be friends of class RegexMatcher, |
|
1293 // and stand-alone C functions don't work as friends |
|
1294 // |
|
1295 class RegexCImpl { |
|
1296 public: |
|
1297 inline static int32_t appendReplacement(RegularExpression *regexp, |
|
1298 const UChar *replacementText, |
|
1299 int32_t replacementLength, |
|
1300 UChar **destBuf, |
|
1301 int32_t *destCapacity, |
|
1302 UErrorCode *status); |
|
1303 |
|
1304 inline static int32_t appendTail(RegularExpression *regexp, |
|
1305 UChar **destBuf, |
|
1306 int32_t *destCapacity, |
|
1307 UErrorCode *status); |
|
1308 |
|
1309 inline static int32_t split(RegularExpression *regexp, |
|
1310 UChar *destBuf, |
|
1311 int32_t destCapacity, |
|
1312 int32_t *requiredCapacity, |
|
1313 UChar *destFields[], |
|
1314 int32_t destFieldsCapacity, |
|
1315 UErrorCode *status); |
|
1316 }; |
|
1317 |
|
1318 U_NAMESPACE_END |
|
1319 |
|
1320 |
|
1321 |
|
1322 static const UChar BACKSLASH = 0x5c; |
|
1323 static const UChar DOLLARSIGN = 0x24; |
|
1324 |
|
1325 // |
|
1326 // Move a character to an output buffer, with bounds checking on the index. |
|
1327 // Index advances even if capacity is exceeded, for preflight size computations. |
|
1328 // This little sequence is used a LOT. |
|
1329 // |
|
1330 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) { |
|
1331 if (*idx < bufCapacity) { |
|
1332 buf[*idx] = c; |
|
1333 } |
|
1334 (*idx)++; |
|
1335 } |
|
1336 |
|
1337 |
|
1338 // |
|
1339 // appendReplacement, the actual implementation. |
|
1340 // |
|
1341 int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, |
|
1342 const UChar *replacementText, |
|
1343 int32_t replacementLength, |
|
1344 UChar **destBuf, |
|
1345 int32_t *destCapacity, |
|
1346 UErrorCode *status) { |
|
1347 |
|
1348 // If we come in with a buffer overflow error, don't suppress the operation. |
|
1349 // A series of appendReplacements, appendTail need to correctly preflight |
|
1350 // the buffer size when an overflow happens somewhere in the middle. |
|
1351 UBool pendingBufferOverflow = FALSE; |
|
1352 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { |
|
1353 pendingBufferOverflow = TRUE; |
|
1354 *status = U_ZERO_ERROR; |
|
1355 } |
|
1356 |
|
1357 // |
|
1358 // Validate all paramters |
|
1359 // |
|
1360 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
1361 return 0; |
|
1362 } |
|
1363 if (replacementText == NULL || replacementLength < -1 || |
|
1364 destCapacity == NULL || destBuf == NULL || |
|
1365 (*destBuf == NULL && *destCapacity > 0) || |
|
1366 *destCapacity < 0) { |
|
1367 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
1368 return 0; |
|
1369 } |
|
1370 |
|
1371 RegexMatcher *m = regexp->fMatcher; |
|
1372 if (m->fMatch == FALSE) { |
|
1373 *status = U_REGEX_INVALID_STATE; |
|
1374 return 0; |
|
1375 } |
|
1376 |
|
1377 UChar *dest = *destBuf; |
|
1378 int32_t capacity = *destCapacity; |
|
1379 int32_t destIdx = 0; |
|
1380 int32_t i; |
|
1381 |
|
1382 // If it wasn't supplied by the caller, get the length of the replacement text. |
|
1383 // TODO: slightly smarter logic in the copy loop could watch for the NUL on |
|
1384 // the fly and avoid this step. |
|
1385 if (replacementLength == -1) { |
|
1386 replacementLength = u_strlen(replacementText); |
|
1387 } |
|
1388 |
|
1389 // Copy input string from the end of previous match to start of current match |
|
1390 if (regexp->fText != NULL) { |
|
1391 int32_t matchStart; |
|
1392 int32_t lastMatchEnd; |
|
1393 if (UTEXT_USES_U16(m->fInputText)) { |
|
1394 lastMatchEnd = (int32_t)m->fLastMatchEnd; |
|
1395 matchStart = (int32_t)m->fMatchStart; |
|
1396 } else { |
|
1397 // !!!: Would like a better way to do this! |
|
1398 UErrorCode status = U_ZERO_ERROR; |
|
1399 lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status); |
|
1400 status = U_ZERO_ERROR; |
|
1401 matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status); |
|
1402 } |
|
1403 for (i=lastMatchEnd; i<matchStart; i++) { |
|
1404 appendToBuf(regexp->fText[i], &destIdx, dest, capacity); |
|
1405 } |
|
1406 } else { |
|
1407 UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore |
|
1408 destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, |
|
1409 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), |
|
1410 &possibleOverflowError); |
|
1411 } |
|
1412 U_ASSERT(destIdx >= 0); |
|
1413 |
|
1414 // scan the replacement text, looking for substitutions ($n) and \escapes. |
|
1415 int32_t replIdx = 0; |
|
1416 while (replIdx < replacementLength) { |
|
1417 UChar c = replacementText[replIdx]; |
|
1418 replIdx++; |
|
1419 if (c != DOLLARSIGN && c != BACKSLASH) { |
|
1420 // Common case, no substitution, no escaping, |
|
1421 // just copy the char to the dest buf. |
|
1422 appendToBuf(c, &destIdx, dest, capacity); |
|
1423 continue; |
|
1424 } |
|
1425 |
|
1426 if (c == BACKSLASH) { |
|
1427 // Backslash Escape. Copy the following char out without further checks. |
|
1428 // Note: Surrogate pairs don't need any special handling |
|
1429 // The second half wont be a '$' or a '\', and |
|
1430 // will move to the dest normally on the next |
|
1431 // loop iteration. |
|
1432 if (replIdx >= replacementLength) { |
|
1433 break; |
|
1434 } |
|
1435 c = replacementText[replIdx]; |
|
1436 |
|
1437 if (c==0x55/*U*/ || c==0x75/*u*/) { |
|
1438 // We have a \udddd or \Udddddddd escape sequence. |
|
1439 UChar32 escapedChar = |
|
1440 u_unescapeAt(uregex_ucstr_unescape_charAt, |
|
1441 &replIdx, // Index is updated by unescapeAt |
|
1442 replacementLength, // Length of replacement text |
|
1443 (void *)replacementText); |
|
1444 |
|
1445 if (escapedChar != (UChar32)0xFFFFFFFF) { |
|
1446 if (escapedChar <= 0xffff) { |
|
1447 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity); |
|
1448 } else { |
|
1449 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity); |
|
1450 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity); |
|
1451 } |
|
1452 continue; |
|
1453 } |
|
1454 // Note: if the \u escape was invalid, just fall through and |
|
1455 // treat it as a plain \<anything> escape. |
|
1456 } |
|
1457 |
|
1458 // Plain backslash escape. Just put out the escaped character. |
|
1459 appendToBuf(c, &destIdx, dest, capacity); |
|
1460 |
|
1461 replIdx++; |
|
1462 continue; |
|
1463 } |
|
1464 |
|
1465 |
|
1466 |
|
1467 // We've got a $. Pick up a capture group number if one follows. |
|
1468 // Consume at most the number of digits necessary for the largest capture |
|
1469 // number that is valid for this pattern. |
|
1470 |
|
1471 int32_t numDigits = 0; |
|
1472 int32_t groupNum = 0; |
|
1473 UChar32 digitC; |
|
1474 for (;;) { |
|
1475 if (replIdx >= replacementLength) { |
|
1476 break; |
|
1477 } |
|
1478 U16_GET(replacementText, 0, replIdx, replacementLength, digitC); |
|
1479 if (u_isdigit(digitC) == FALSE) { |
|
1480 break; |
|
1481 } |
|
1482 |
|
1483 U16_FWD_1(replacementText, replIdx, replacementLength); |
|
1484 groupNum=groupNum*10 + u_charDigitValue(digitC); |
|
1485 numDigits++; |
|
1486 if (numDigits >= m->fPattern->fMaxCaptureDigits) { |
|
1487 break; |
|
1488 } |
|
1489 } |
|
1490 |
|
1491 |
|
1492 if (numDigits == 0) { |
|
1493 // The $ didn't introduce a group number at all. |
|
1494 // Treat it as just part of the substitution text. |
|
1495 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); |
|
1496 continue; |
|
1497 } |
|
1498 |
|
1499 // Finally, append the capture group data to the destination. |
|
1500 destIdx += uregex_group((URegularExpression*)regexp, groupNum, |
|
1501 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); |
|
1502 if (*status == U_BUFFER_OVERFLOW_ERROR) { |
|
1503 // Ignore buffer overflow when extracting the group. We need to |
|
1504 // continue on to get full size of the untruncated result. We will |
|
1505 // raise our own buffer overflow error at the end. |
|
1506 *status = U_ZERO_ERROR; |
|
1507 } |
|
1508 |
|
1509 if (U_FAILURE(*status)) { |
|
1510 // Can fail if group number is out of range. |
|
1511 break; |
|
1512 } |
|
1513 |
|
1514 } |
|
1515 |
|
1516 // |
|
1517 // Nul Terminate the dest buffer if possible. |
|
1518 // Set the appropriate buffer overflow or not terminated error, if needed. |
|
1519 // |
|
1520 if (destIdx < capacity) { |
|
1521 dest[destIdx] = 0; |
|
1522 } else if (destIdx == *destCapacity) { |
|
1523 *status = U_STRING_NOT_TERMINATED_WARNING; |
|
1524 } else { |
|
1525 *status = U_BUFFER_OVERFLOW_ERROR; |
|
1526 } |
|
1527 |
|
1528 // |
|
1529 // Return an updated dest buffer and capacity to the caller. |
|
1530 // |
|
1531 if (destIdx > 0 && *destCapacity > 0) { |
|
1532 if (destIdx < capacity) { |
|
1533 *destBuf += destIdx; |
|
1534 *destCapacity -= destIdx; |
|
1535 } else { |
|
1536 *destBuf += capacity; |
|
1537 *destCapacity = 0; |
|
1538 } |
|
1539 } |
|
1540 |
|
1541 // If we came in with a buffer overflow, make sure we go out with one also. |
|
1542 // (A zero length match right at the end of the previous match could |
|
1543 // make this function succeed even though a previous call had overflowed the buf) |
|
1544 if (pendingBufferOverflow && U_SUCCESS(*status)) { |
|
1545 *status = U_BUFFER_OVERFLOW_ERROR; |
|
1546 } |
|
1547 |
|
1548 return destIdx; |
|
1549 } |
|
1550 |
|
1551 // |
|
1552 // appendReplacement the actual API function, |
|
1553 // |
|
1554 U_CAPI int32_t U_EXPORT2 |
|
1555 uregex_appendReplacement(URegularExpression *regexp2, |
|
1556 const UChar *replacementText, |
|
1557 int32_t replacementLength, |
|
1558 UChar **destBuf, |
|
1559 int32_t *destCapacity, |
|
1560 UErrorCode *status) { |
|
1561 |
|
1562 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1563 return RegexCImpl::appendReplacement( |
|
1564 regexp, replacementText, replacementLength,destBuf, destCapacity, status); |
|
1565 } |
|
1566 |
|
1567 // |
|
1568 // uregex_appendReplacementUText...can just use the normal C++ method |
|
1569 // |
|
1570 U_CAPI void U_EXPORT2 |
|
1571 uregex_appendReplacementUText(URegularExpression *regexp2, |
|
1572 UText *replText, |
|
1573 UText *dest, |
|
1574 UErrorCode *status) { |
|
1575 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1576 regexp->fMatcher->appendReplacement(dest, replText, *status); |
|
1577 } |
|
1578 |
|
1579 |
|
1580 //------------------------------------------------------------------------------ |
|
1581 // |
|
1582 // uregex_appendTail |
|
1583 // |
|
1584 //------------------------------------------------------------------------------ |
|
1585 int32_t RegexCImpl::appendTail(RegularExpression *regexp, |
|
1586 UChar **destBuf, |
|
1587 int32_t *destCapacity, |
|
1588 UErrorCode *status) |
|
1589 { |
|
1590 |
|
1591 // If we come in with a buffer overflow error, don't suppress the operation. |
|
1592 // A series of appendReplacements, appendTail need to correctly preflight |
|
1593 // the buffer size when an overflow happens somewhere in the middle. |
|
1594 UBool pendingBufferOverflow = FALSE; |
|
1595 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { |
|
1596 pendingBufferOverflow = TRUE; |
|
1597 *status = U_ZERO_ERROR; |
|
1598 } |
|
1599 |
|
1600 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
1601 return 0; |
|
1602 } |
|
1603 |
|
1604 if (destCapacity == NULL || destBuf == NULL || |
|
1605 (*destBuf == NULL && *destCapacity > 0) || |
|
1606 *destCapacity < 0) |
|
1607 { |
|
1608 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
1609 return 0; |
|
1610 } |
|
1611 |
|
1612 RegexMatcher *m = regexp->fMatcher; |
|
1613 |
|
1614 int32_t destIdx = 0; |
|
1615 int32_t destCap = *destCapacity; |
|
1616 UChar *dest = *destBuf; |
|
1617 |
|
1618 if (regexp->fText != NULL) { |
|
1619 int32_t srcIdx; |
|
1620 int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd); |
|
1621 if (nativeIdx == -1) { |
|
1622 srcIdx = 0; |
|
1623 } else if (UTEXT_USES_U16(m->fInputText)) { |
|
1624 srcIdx = (int32_t)nativeIdx; |
|
1625 } else { |
|
1626 UErrorCode status = U_ZERO_ERROR; |
|
1627 srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status); |
|
1628 } |
|
1629 |
|
1630 for (;;) { |
|
1631 U_ASSERT(destIdx >= 0); |
|
1632 |
|
1633 if (srcIdx == regexp->fTextLength) { |
|
1634 break; |
|
1635 } |
|
1636 UChar c = regexp->fText[srcIdx]; |
|
1637 if (c == 0 && regexp->fTextLength == -1) { |
|
1638 regexp->fTextLength = srcIdx; |
|
1639 break; |
|
1640 } |
|
1641 |
|
1642 if (destIdx < destCap) { |
|
1643 dest[destIdx] = c; |
|
1644 } else { |
|
1645 // We've overflowed the dest buffer. |
|
1646 // If the total input string length is known, we can |
|
1647 // compute the total buffer size needed without scanning through the string. |
|
1648 if (regexp->fTextLength > 0) { |
|
1649 destIdx += (regexp->fTextLength - srcIdx); |
|
1650 break; |
|
1651 } |
|
1652 } |
|
1653 srcIdx++; |
|
1654 destIdx++; |
|
1655 } |
|
1656 } else { |
|
1657 int64_t srcIdx; |
|
1658 if (m->fMatch) { |
|
1659 // The most recent call to find() succeeded. |
|
1660 srcIdx = m->fMatchEnd; |
|
1661 } else { |
|
1662 // The last call to find() on this matcher failed(). |
|
1663 // Look back to the end of the last find() that succeeded for src index. |
|
1664 srcIdx = m->fLastMatchEnd; |
|
1665 if (srcIdx == -1) { |
|
1666 // There has been no successful match with this matcher. |
|
1667 // We want to copy the whole string. |
|
1668 srcIdx = 0; |
|
1669 } |
|
1670 } |
|
1671 |
|
1672 destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status); |
|
1673 } |
|
1674 |
|
1675 // |
|
1676 // NUL terminate the output string, if possible, otherwise issue the |
|
1677 // appropriate error or warning. |
|
1678 // |
|
1679 if (destIdx < destCap) { |
|
1680 dest[destIdx] = 0; |
|
1681 } else if (destIdx == destCap) { |
|
1682 *status = U_STRING_NOT_TERMINATED_WARNING; |
|
1683 } else { |
|
1684 *status = U_BUFFER_OVERFLOW_ERROR; |
|
1685 } |
|
1686 |
|
1687 // |
|
1688 // Update the user's buffer ptr and capacity vars to reflect the |
|
1689 // amount used. |
|
1690 // |
|
1691 if (destIdx < destCap) { |
|
1692 *destBuf += destIdx; |
|
1693 *destCapacity -= destIdx; |
|
1694 } else if (*destBuf != NULL) { |
|
1695 *destBuf += destCap; |
|
1696 *destCapacity = 0; |
|
1697 } |
|
1698 |
|
1699 if (pendingBufferOverflow && U_SUCCESS(*status)) { |
|
1700 *status = U_BUFFER_OVERFLOW_ERROR; |
|
1701 } |
|
1702 |
|
1703 return destIdx; |
|
1704 } |
|
1705 |
|
1706 |
|
1707 // |
|
1708 // appendTail the actual API function |
|
1709 // |
|
1710 U_CAPI int32_t U_EXPORT2 |
|
1711 uregex_appendTail(URegularExpression *regexp2, |
|
1712 UChar **destBuf, |
|
1713 int32_t *destCapacity, |
|
1714 UErrorCode *status) { |
|
1715 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1716 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); |
|
1717 } |
|
1718 |
|
1719 |
|
1720 // |
|
1721 // uregex_appendTailUText...can just use the normal C++ method |
|
1722 // |
|
1723 U_CAPI UText * U_EXPORT2 |
|
1724 uregex_appendTailUText(URegularExpression *regexp2, |
|
1725 UText *dest, |
|
1726 UErrorCode *status) { |
|
1727 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1728 return regexp->fMatcher->appendTail(dest, *status); |
|
1729 } |
|
1730 |
|
1731 |
|
1732 //------------------------------------------------------------------------------ |
|
1733 // |
|
1734 // copyString Internal utility to copy a string to an output buffer, |
|
1735 // while managing buffer overflow and preflight size |
|
1736 // computation. NUL termination is added to destination, |
|
1737 // and the NUL is counted in the output size. |
|
1738 // |
|
1739 //------------------------------------------------------------------------------ |
|
#if 0
// Currently compiled out.
// Copies srcLen UChars from srcPtr into destBuffer starting at *destIndex,
// stopping the copy once destCapacity is reached. *destIndex is advanced by
// the full source length plus one for the terminating NUL, whether or not
// everything fit; the NUL itself is written only if there is room for it.
static void copyString(UChar        *destBuffer,    //  Destination buffer.
                       int32_t       destCapacity,  //  Total capacity of dest buffer
                       int32_t      *destIndex,     //  Index into dest buffer.  Updated on return.
                                                    //    Update not clipped to destCapacity.
                       const UChar  *srcPtr,        //  Pointer to source string
                       int32_t       srcLen)        //  Source string len.
{
    int32_t outPos = *destIndex;
    int32_t srcPos = 0;

    // Copy for as long as both source characters and destination room remain.
    while (srcPos < srcLen && outPos < destCapacity) {
        destBuffer[outPos++] = srcPtr[srcPos++];
    }
    // Account for whatever did not fit, so the caller learns the needed size.
    if (srcPos < srcLen) {
        outPos += srcLen - srcPos;
    }

    if (outPos < destCapacity) {
        destBuffer[outPos] = 0;
    }
    *destIndex = outPos + 1;
}
#endif
|
1769 |
|
1770 //------------------------------------------------------------------------------ |
|
1771 // |
|
1772 // uregex_split |
|
1773 // |
|
1774 //------------------------------------------------------------------------------ |
|
1775 int32_t RegexCImpl::split(RegularExpression *regexp, |
|
1776 UChar *destBuf, |
|
1777 int32_t destCapacity, |
|
1778 int32_t *requiredCapacity, |
|
1779 UChar *destFields[], |
|
1780 int32_t destFieldsCapacity, |
|
1781 UErrorCode *status) { |
|
1782 // |
|
1783 // Reset for the input text |
|
1784 // |
|
1785 regexp->fMatcher->reset(); |
|
1786 UText *inputText = regexp->fMatcher->fInputText; |
|
1787 int64_t nextOutputStringStart = 0; |
|
1788 int64_t inputLen = regexp->fMatcher->fInputLength; |
|
1789 if (inputLen == 0) { |
|
1790 return 0; |
|
1791 } |
|
1792 |
|
1793 // |
|
1794 // Loop through the input text, searching for the delimiter pattern |
|
1795 // |
|
1796 int32_t i; // Index of the field being processed. |
|
1797 int32_t destIdx = 0; // Next available position in destBuf; |
|
1798 int32_t numCaptureGroups = regexp->fMatcher->groupCount(); |
|
1799 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted |
|
1800 for (i=0; ; i++) { |
|
1801 if (i>=destFieldsCapacity-1) { |
|
1802 // There are one or zero output strings left. |
|
1803 // Fill the last output string with whatever is left from the input, then exit the loop. |
|
1804 // ( i will be == destFieldsCapacity if we filled the output array while processing |
|
1805 // capture groups of the delimiter expression, in which case we will discard the |
|
1806 // last capture group saved in favor of the unprocessed remainder of the |
|
1807 // input string.) |
|
1808 if (inputLen > nextOutputStringStart) { |
|
1809 if (i != destFieldsCapacity-1) { |
|
1810 // No fields are left. Recycle the last one for holding the trailing part of |
|
1811 // the input string. |
|
1812 i = destFieldsCapacity-1; |
|
1813 destIdx = (int32_t)(destFields[i] - destFields[0]); |
|
1814 } |
|
1815 |
|
1816 destFields[i] = &destBuf[destIdx]; |
|
1817 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, |
|
1818 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); |
|
1819 } |
|
1820 break; |
|
1821 } |
|
1822 |
|
1823 if (regexp->fMatcher->find()) { |
|
1824 // We found another delimiter. Move everything from where we started looking |
|
1825 // up until the start of the delimiter into the next output string. |
|
1826 destFields[i] = &destBuf[destIdx]; |
|
1827 |
|
1828 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart, |
|
1829 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); |
|
1830 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { |
|
1831 tStatus = U_ZERO_ERROR; |
|
1832 } else { |
|
1833 *status = tStatus; |
|
1834 } |
|
1835 nextOutputStringStart = regexp->fMatcher->fMatchEnd; |
|
1836 |
|
1837 // If the delimiter pattern has capturing parentheses, the captured |
|
1838 // text goes out into the next n destination strings. |
|
1839 int32_t groupNum; |
|
1840 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { |
|
1841 // If we've run out of output string slots, bail out. |
|
1842 if (i==destFieldsCapacity-1) { |
|
1843 break; |
|
1844 } |
|
1845 i++; |
|
1846 |
|
1847 // Set up to extract the capture group contents into the dest buffer. |
|
1848 destFields[i] = &destBuf[destIdx]; |
|
1849 tStatus = U_ZERO_ERROR; |
|
1850 int32_t t = uregex_group((URegularExpression*)regexp, |
|
1851 groupNum, |
|
1852 destFields[i], |
|
1853 REMAINING_CAPACITY(destIdx, destCapacity), |
|
1854 &tStatus); |
|
1855 destIdx += t + 1; // Record the space used in the output string buffer. |
|
1856 // +1 for the NUL that terminates the string. |
|
1857 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { |
|
1858 tStatus = U_ZERO_ERROR; |
|
1859 } else { |
|
1860 *status = tStatus; |
|
1861 } |
|
1862 } |
|
1863 |
|
1864 if (nextOutputStringStart == inputLen) { |
|
1865 // The delimiter was at the end of the string. |
|
1866 // Output an empty string, and then we are done. |
|
1867 if (destIdx < destCapacity) { |
|
1868 destBuf[destIdx] = 0; |
|
1869 } |
|
1870 if (i < destFieldsCapacity-1) { |
|
1871 ++i; |
|
1872 } |
|
1873 if (destIdx < destCapacity) { |
|
1874 destFields[i] = destBuf + destIdx; |
|
1875 } |
|
1876 ++destIdx; |
|
1877 break; |
|
1878 } |
|
1879 |
|
1880 } |
|
1881 else |
|
1882 { |
|
1883 // We ran off the end of the input while looking for the next delimiter. |
|
1884 // All the remaining text goes into the current output string. |
|
1885 destFields[i] = &destBuf[destIdx]; |
|
1886 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, |
|
1887 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); |
|
1888 break; |
|
1889 } |
|
1890 } |
|
1891 |
|
1892 // Zero out any unused portion of the destFields array |
|
1893 int j; |
|
1894 for (j=i+1; j<destFieldsCapacity; j++) { |
|
1895 destFields[j] = NULL; |
|
1896 } |
|
1897 |
|
1898 if (requiredCapacity != NULL) { |
|
1899 *requiredCapacity = destIdx; |
|
1900 } |
|
1901 if (destIdx > destCapacity) { |
|
1902 *status = U_BUFFER_OVERFLOW_ERROR; |
|
1903 } |
|
1904 return i+1; |
|
1905 } |
|
1906 |
|
1907 // |
|
1908 // uregex_split The actual API function |
|
1909 // |
|
1910 U_CAPI int32_t U_EXPORT2 |
|
1911 uregex_split(URegularExpression *regexp2, |
|
1912 UChar *destBuf, |
|
1913 int32_t destCapacity, |
|
1914 int32_t *requiredCapacity, |
|
1915 UChar *destFields[], |
|
1916 int32_t destFieldsCapacity, |
|
1917 UErrorCode *status) { |
|
1918 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1919 if (validateRE(regexp, TRUE, status) == FALSE) { |
|
1920 return 0; |
|
1921 } |
|
1922 if ((destBuf == NULL && destCapacity > 0) || |
|
1923 destCapacity < 0 || |
|
1924 destFields == NULL || |
|
1925 destFieldsCapacity < 1 ) { |
|
1926 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
1927 return 0; |
|
1928 } |
|
1929 |
|
1930 return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status); |
|
1931 } |
|
1932 |
|
1933 |
|
1934 // |
|
1935 // uregex_splitUText...can just use the normal C++ method |
|
1936 // |
|
1937 U_CAPI int32_t U_EXPORT2 |
|
1938 uregex_splitUText(URegularExpression *regexp2, |
|
1939 UText *destFields[], |
|
1940 int32_t destFieldsCapacity, |
|
1941 UErrorCode *status) { |
|
1942 RegularExpression *regexp = (RegularExpression*)regexp2; |
|
1943 return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status); |
|
1944 } |
|
1945 |
|
1946 |
|
1947 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
|
1948 |