|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 1999-2012, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 * Date Name Description |
|
7 * 11/17/99 aliu Creation. |
|
8 ********************************************************************** |
|
9 */ |
|
10 |
|
11 #include "utypeinfo.h" // for 'typeid' to work |
|
12 |
|
13 #include "unicode/utypes.h" |
|
14 |
|
15 #if !UCONFIG_NO_TRANSLITERATION |
|
16 |
|
17 #include "unicode/putil.h" |
|
18 #include "unicode/translit.h" |
|
19 #include "unicode/locid.h" |
|
20 #include "unicode/msgfmt.h" |
|
21 #include "unicode/rep.h" |
|
22 #include "unicode/resbund.h" |
|
23 #include "unicode/unifilt.h" |
|
24 #include "unicode/uniset.h" |
|
25 #include "unicode/uscript.h" |
|
26 #include "unicode/strenum.h" |
|
27 #include "unicode/utf16.h" |
|
28 #include "cpdtrans.h" |
|
29 #include "nultrans.h" |
|
30 #include "rbt_data.h" |
|
31 #include "rbt_pars.h" |
|
32 #include "rbt.h" |
|
33 #include "transreg.h" |
|
34 #include "name2uni.h" |
|
35 #include "nortrans.h" |
|
36 #include "remtrans.h" |
|
37 #include "titletrn.h" |
|
38 #include "tolowtrn.h" |
|
39 #include "toupptrn.h" |
|
40 #include "uni2name.h" |
|
41 #include "brktrans.h" |
|
42 #include "esctrn.h" |
|
43 #include "unesctrn.h" |
|
44 #include "tridpars.h" |
|
45 #include "anytrans.h" |
|
46 #include "util.h" |
|
47 #include "hash.h" |
|
48 #include "mutex.h" |
|
49 #include "ucln_in.h" |
|
50 #include "uassert.h" |
|
51 #include "cmemory.h" |
|
52 #include "cstring.h" |
|
53 #include "uinvchar.h" |
|
54 |
|
55 static const UChar TARGET_SEP = 0x002D; /*-*/ |
|
56 static const UChar ID_DELIM = 0x003B; /*;*/ |
|
57 static const UChar VARIANT_SEP = 0x002F; // '/' |
|
58 |
|
59 /** |
|
60 * Prefix for resource bundle key for the display name for a |
|
61 * transliterator. The ID is appended to this to form the key. |
|
62 * The resource bundle value should be a String. |
|
63 */ |
|
64 static const char RB_DISPLAY_NAME_PREFIX[] = "%Translit%%"; |
|
65 |
|
66 /** |
|
67 * Prefix for resource bundle key for the display name for a |
|
68 * transliterator SCRIPT. The ID is appended to this to form the key. |
|
69 * The resource bundle value should be a String. |
|
70 */ |
|
71 static const char RB_SCRIPT_DISPLAY_NAME_PREFIX[] = "%Translit%"; |
|
72 |
|
73 /** |
|
74 * Resource bundle key for display name pattern. |
|
75 * The resource bundle value should be a String forming a |
|
76 * MessageFormat pattern, e.g.: |
|
77 * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}". |
|
78 */ |
|
79 static const char RB_DISPLAY_NAME_PATTERN[] = "TransliteratorNamePattern"; |
|
80 |
|
81 /** |
|
82 * Resource bundle key for the list of RuleBasedTransliterator IDs. |
|
83 * The resource bundle value should be a String[] with each element |
|
84 * being a valid ID. The ID will be appended to RB_RULE_BASED_PREFIX |
|
85 * to obtain the class name in which the RB_RULE key will be sought. |
|
86 */ |
|
87 static const char RB_RULE_BASED_IDS[] = "RuleBasedTransliteratorIDs"; |
|
88 |
|
89 /** |
|
90 * The mutex controlling access to registry object. |
|
91 */ |
|
92 static UMutex registryMutex = U_MUTEX_INITIALIZER; |
|
93 |
|
94 /** |
|
95 * System transliterator registry; non-null when initialized. |
|
96 */ |
|
97 static icu::TransliteratorRegistry* registry = 0; |
|
98 |
|
99 // Macro to check/initialize the registry. ONLY USE WITHIN |
|
100 // MUTEX. Avoids function call when registry is initialized. |
|
101 #define HAVE_REGISTRY(status) (registry!=0 || initializeRegistry(status)) |
|
102 |
|
103 U_NAMESPACE_BEGIN |
|
104 |
|
105 UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(Transliterator) |
|
106 |
|
107 /** |
|
108 * Return TRUE if the given UTransPosition is valid for text of |
|
109 * the given length. |
|
110 */ |
|
111 static inline UBool positionIsValid(UTransPosition& index, int32_t len) { |
|
112 return !(index.contextStart < 0 || |
|
113 index.start < index.contextStart || |
|
114 index.limit < index.start || |
|
115 index.contextLimit < index.limit || |
|
116 len < index.contextLimit); |
|
117 } |
|
118 |
|
119 /** |
|
120 * Default constructor. |
|
121 * @param theID the string identifier for this transliterator |
|
122 * @param theFilter the filter. Any character for which |
|
123 * <tt>filter.contains()</tt> returns <tt>FALSE</tt> will not be |
|
124 * altered by this transliterator. If <tt>filter</tt> is |
|
125 * <tt>null</tt> then no filtering is applied. |
|
126 */ |
|
127 Transliterator::Transliterator(const UnicodeString& theID, |
|
128 UnicodeFilter* adoptedFilter) : |
|
129 UObject(), ID(theID), filter(adoptedFilter), |
|
130 maximumContextLength(0) |
|
131 { |
|
132 // NUL-terminate the ID string, which is a non-aliased copy. |
|
133 ID.append((UChar)0); |
|
134 ID.truncate(ID.length()-1); |
|
135 } |
|
136 |
|
137 /** |
|
138 * Destructor. |
|
139 */ |
|
140 Transliterator::~Transliterator() { |
|
141 if (filter) { |
|
142 delete filter; |
|
143 } |
|
144 } |
|
145 |
|
146 /** |
|
147 * Copy constructor. |
|
148 */ |
|
149 Transliterator::Transliterator(const Transliterator& other) : |
|
150 UObject(other), ID(other.ID), filter(0), |
|
151 maximumContextLength(other.maximumContextLength) |
|
152 { |
|
153 // NUL-terminate the ID string, which is a non-aliased copy. |
|
154 ID.append((UChar)0); |
|
155 ID.truncate(ID.length()-1); |
|
156 |
|
157 if (other.filter != 0) { |
|
158 // We own the filter, so we must have our own copy |
|
159 filter = (UnicodeFilter*) other.filter->clone(); |
|
160 } |
|
161 } |
|
162 |
|
163 Transliterator* Transliterator::clone() const { |
|
164 return NULL; |
|
165 } |
|
166 |
|
167 /** |
|
168 * Assignment operator. |
|
169 */ |
|
170 Transliterator& Transliterator::operator=(const Transliterator& other) { |
|
171 ID = other.ID; |
|
172 // NUL-terminate the ID string |
|
173 ID.getTerminatedBuffer(); |
|
174 |
|
175 maximumContextLength = other.maximumContextLength; |
|
176 adoptFilter((other.filter == 0) ? 0 : (UnicodeFilter*) other.filter->clone()); |
|
177 return *this; |
|
178 } |
|
179 |
|
180 /** |
|
181 * Transliterates a segment of a string. <code>Transliterator</code> API. |
|
182 * @param text the string to be transliterated |
|
183 * @param start the beginning index, inclusive; <code>0 <= start |
|
184 * <= limit</code>. |
|
185 * @param limit the ending index, exclusive; <code>start <= limit |
|
186 * <= text.length()</code>. |
|
187 * @return the new limit index, or -1 |
|
188 */ |
|
189 int32_t Transliterator::transliterate(Replaceable& text, |
|
190 int32_t start, int32_t limit) const { |
|
191 if (start < 0 || |
|
192 limit < start || |
|
193 text.length() < limit) { |
|
194 return -1; |
|
195 } |
|
196 |
|
197 UTransPosition offsets; |
|
198 offsets.contextStart= start; |
|
199 offsets.contextLimit = limit; |
|
200 offsets.start = start; |
|
201 offsets.limit = limit; |
|
202 filteredTransliterate(text, offsets, FALSE, TRUE); |
|
203 return offsets.limit; |
|
204 } |
|
205 |
|
206 /** |
|
207 * Transliterates an entire string in place. Convenience method. |
|
208 * @param text the string to be transliterated |
|
209 */ |
|
210 void Transliterator::transliterate(Replaceable& text) const { |
|
211 transliterate(text, 0, text.length()); |
|
212 } |
|
213 |
|
214 /** |
|
215 * Transliterates the portion of the text buffer that can be |
|
216 * transliterated unambiguosly after new text has been inserted, |
|
217 * typically as a result of a keyboard event. The new text in |
|
218 * <code>insertion</code> will be inserted into <code>text</code> |
|
219 * at <code>index.contextLimit</code>, advancing |
|
220 * <code>index.contextLimit</code> by <code>insertion.length()</code>. |
|
221 * Then the transliterator will try to transliterate characters of |
|
222 * <code>text</code> between <code>index.start</code> and |
|
223 * <code>index.contextLimit</code>. Characters before |
|
224 * <code>index.start</code> will not be changed. |
|
225 * |
|
226 * <p>Upon return, values in <code>index</code> will be updated. |
|
227 * <code>index.contextStart</code> will be advanced to the first |
|
228 * character that future calls to this method will read. |
|
229 * <code>index.start</code> and <code>index.contextLimit</code> will |
|
230 * be adjusted to delimit the range of text that future calls to |
|
231 * this method may change. |
|
232 * |
|
233 * <p>Typical usage of this method begins with an initial call |
|
234 * with <code>index.contextStart</code> and <code>index.contextLimit</code> |
|
235 * set to indicate the portion of <code>text</code> to be |
|
236 * transliterated, and <code>index.start == index.contextStart</code>. |
|
237 * Thereafter, <code>index</code> can be used without |
|
238 * modification in future calls, provided that all changes to |
|
239 * <code>text</code> are made via this method. |
|
240 * |
|
241 * <p>This method assumes that future calls may be made that will |
|
242 * insert new text into the buffer. As a result, it only performs |
|
243 * unambiguous transliterations. After the last call to this |
|
244 * method, there may be untransliterated text that is waiting for |
|
245 * more input to resolve an ambiguity. In order to perform these |
|
246 * pending transliterations, clients should call {@link |
|
247 * #finishKeyboardTransliteration} after the last call to this |
|
248 * method has been made. |
|
249 * |
|
250 * @param text the buffer holding transliterated and untransliterated text |
|
251 * @param index an array of three integers. |
|
252 * |
|
253 * <ul><li><code>index.contextStart</code>: the beginning index, |
|
254 * inclusive; <code>0 <= index.contextStart <= index.contextLimit</code>. |
|
255 * |
|
256 * <li><code>index.contextLimit</code>: the ending index, exclusive; |
|
257 * <code>index.contextStart <= index.contextLimit <= text.length()</code>. |
|
258 * <code>insertion</code> is inserted at |
|
259 * <code>index.contextLimit</code>. |
|
260 * |
|
261 * <li><code>index.start</code>: the next character to be |
|
262 * considered for transliteration; <code>index.contextStart <= |
|
263 * index.start <= index.contextLimit</code>. Characters before |
|
264 * <code>index.start</code> will not be changed by future calls |
|
265 * to this method.</ul> |
|
266 * |
|
267 * @param insertion text to be inserted and possibly |
|
268 * transliterated into the translation buffer at |
|
269 * <code>index.contextLimit</code>. If <code>null</code> then no text |
|
270 * is inserted. |
|
271 * @see #START |
|
272 * @see #LIMIT |
|
273 * @see #CURSOR |
|
274 * @see #handleTransliterate |
|
275 * @exception IllegalArgumentException if <code>index</code> |
|
276 * is invalid |
|
277 */ |
|
278 void Transliterator::transliterate(Replaceable& text, |
|
279 UTransPosition& index, |
|
280 const UnicodeString& insertion, |
|
281 UErrorCode &status) const { |
|
282 _transliterate(text, index, &insertion, status); |
|
283 } |
|
284 |
|
285 /** |
|
286 * Transliterates the portion of the text buffer that can be |
|
287 * transliterated unambiguosly after a new character has been |
|
288 * inserted, typically as a result of a keyboard event. This is a |
|
289 * convenience method; see {@link |
|
290 * #transliterate(Replaceable, int[], String)} for details. |
|
291 * @param text the buffer holding transliterated and |
|
292 * untransliterated text |
|
293 * @param index an array of three integers. See {@link |
|
294 * #transliterate(Replaceable, int[], String)}. |
|
295 * @param insertion text to be inserted and possibly |
|
296 * transliterated into the translation buffer at |
|
297 * <code>index.contextLimit</code>. |
|
298 * @see #transliterate(Replaceable, int[], String) |
|
299 */ |
|
300 void Transliterator::transliterate(Replaceable& text, |
|
301 UTransPosition& index, |
|
302 UChar32 insertion, |
|
303 UErrorCode& status) const { |
|
304 UnicodeString str(insertion); |
|
305 _transliterate(text, index, &str, status); |
|
306 } |
|
307 |
|
308 /** |
|
309 * Transliterates the portion of the text buffer that can be |
|
310 * transliterated unambiguosly. This is a convenience method; see |
|
311 * {@link #transliterate(Replaceable, int[], String)} for |
|
312 * details. |
|
313 * @param text the buffer holding transliterated and |
|
314 * untransliterated text |
|
315 * @param index an array of three integers. See {@link |
|
316 * #transliterate(Replaceable, int[], String)}. |
|
317 * @see #transliterate(Replaceable, int[], String) |
|
318 */ |
|
319 void Transliterator::transliterate(Replaceable& text, |
|
320 UTransPosition& index, |
|
321 UErrorCode& status) const { |
|
322 _transliterate(text, index, 0, status); |
|
323 } |
|
324 |
|
325 /** |
|
326 * Finishes any pending transliterations that were waiting for |
|
327 * more characters. Clients should call this method as the last |
|
328 * call after a sequence of one or more calls to |
|
329 * <code>transliterate()</code>. |
|
330 * @param text the buffer holding transliterated and |
|
331 * untransliterated text. |
|
332 * @param index the array of indices previously passed to {@link |
|
333 * #transliterate} |
|
334 */ |
|
335 void Transliterator::finishTransliteration(Replaceable& text, |
|
336 UTransPosition& index) const { |
|
337 if (!positionIsValid(index, text.length())) { |
|
338 return; |
|
339 } |
|
340 |
|
341 filteredTransliterate(text, index, FALSE, TRUE); |
|
342 } |
|
343 |
|
344 /** |
|
345 * This internal method does keyboard transliteration. If the |
|
346 * 'insertion' is non-null then we append it to 'text' before |
|
347 * proceeding. This method calls through to the pure virtual |
|
348 * framework method handleTransliterate() to do the actual |
|
349 * work. |
|
350 */ |
|
351 void Transliterator::_transliterate(Replaceable& text, |
|
352 UTransPosition& index, |
|
353 const UnicodeString* insertion, |
|
354 UErrorCode &status) const { |
|
355 if (U_FAILURE(status)) { |
|
356 return; |
|
357 } |
|
358 |
|
359 if (!positionIsValid(index, text.length())) { |
|
360 status = U_ILLEGAL_ARGUMENT_ERROR; |
|
361 return; |
|
362 } |
|
363 |
|
364 // int32_t originalStart = index.contextStart; |
|
365 if (insertion != 0) { |
|
366 text.handleReplaceBetween(index.limit, index.limit, *insertion); |
|
367 index.limit += insertion->length(); |
|
368 index.contextLimit += insertion->length(); |
|
369 } |
|
370 |
|
371 if (index.limit > 0 && |
|
372 U16_IS_LEAD(text.charAt(index.limit - 1))) { |
|
373 // Oops, there is a dangling lead surrogate in the buffer. |
|
374 // This will break most transliterators, since they will |
|
375 // assume it is part of a pair. Don't transliterate until |
|
376 // more text comes in. |
|
377 return; |
|
378 } |
|
379 |
|
380 filteredTransliterate(text, index, TRUE, TRUE); |
|
381 |
|
382 #if 0 |
|
383 // TODO |
|
384 // I CAN'T DO what I'm attempting below now that the Kleene star |
|
385 // operator is supported. For example, in the rule |
|
386 |
|
387 // ([:Lu:]+) { x } > $1; |
|
388 |
|
389 // what is the maximum context length? getMaximumContextLength() |
|
390 // will return 1, but this is just the length of the ante context |
|
391 // part of the pattern string -- 1 character, which is a standin |
|
392 // for a Quantifier, which contains a StringMatcher, which |
|
393 // contains a UnicodeSet. |
|
394 |
|
395 // There is a complicated way to make this work again, and that's |
|
396 // to add a "maximum left context" protocol into the |
|
397 // UnicodeMatcher hierarchy. At present I'm not convinced this is |
|
398 // worth it. |
|
399 |
|
400 // --- |
|
401 |
|
402 // The purpose of the code below is to keep the context small |
|
403 // while doing incremental transliteration. When part of the left |
|
404 // context (between contextStart and start) is no longer needed, |
|
405 // we try to advance contextStart past that portion. We use the |
|
406 // maximum context length to do so. |
|
407 int32_t newCS = index.start; |
|
408 int32_t n = getMaximumContextLength(); |
|
409 while (newCS > originalStart && n-- > 0) { |
|
410 --newCS; |
|
411 newCS -= U16_LENGTH(text.char32At(newCS)) - 1; |
|
412 } |
|
413 index.contextStart = uprv_max(newCS, originalStart); |
|
414 #endif |
|
415 } |
|
416 |
|
417 /** |
|
418 * This method breaks up the input text into runs of unfiltered |
|
419 * characters. It passes each such run to |
|
420 * <subclass>.handleTransliterate(). Subclasses that can handle the |
|
421 * filter logic more efficiently themselves may override this method. |
|
422 * |
|
423 * All transliteration calls in this class go through this method. |
|
424 */ |
|
425 void Transliterator::filteredTransliterate(Replaceable& text, |
|
426 UTransPosition& index, |
|
427 UBool incremental, |
|
428 UBool rollback) const { |
|
429 // Short circuit path for transliterators with no filter in |
|
430 // non-incremental mode. |
|
431 if (filter == 0 && !rollback) { |
|
432 handleTransliterate(text, index, incremental); |
|
433 return; |
|
434 } |
|
435 |
|
436 //---------------------------------------------------------------------- |
|
437 // This method processes text in two groupings: |
|
438 // |
|
439 // RUNS -- A run is a contiguous group of characters which are contained |
|
440 // in the filter for this transliterator (filter.contains(ch) == TRUE). |
|
441 // Text outside of runs may appear as context but it is not modified. |
|
442 // The start and limit Position values are narrowed to each run. |
|
443 // |
|
444 // PASSES (incremental only) -- To make incremental mode work correctly, |
|
445 // each run is broken up into n passes, where n is the length (in code |
|
446 // points) of the run. Each pass contains the first n characters. If a |
|
447 // pass is completely transliterated, it is committed, and further passes |
|
448 // include characters after the committed text. If a pass is blocked, |
|
449 // and does not transliterate completely, then this method rolls back |
|
450 // the changes made during the pass, extends the pass by one code point, |
|
451 // and tries again. |
|
452 //---------------------------------------------------------------------- |
|
453 |
|
454 // globalLimit is the limit value for the entire operation. We |
|
455 // set index.limit to the end of each unfiltered run before |
|
456 // calling handleTransliterate(), so we need to maintain the real |
|
457 // value of index.limit here. After each transliteration, we |
|
458 // update globalLimit for insertions or deletions that have |
|
459 // happened. |
|
460 int32_t globalLimit = index.limit; |
|
461 |
|
462 // If there is a non-null filter, then break the input text up. Say the |
|
463 // input text has the form: |
|
464 // xxxabcxxdefxx |
|
465 // where 'x' represents a filtered character (filter.contains('x') == |
|
466 // false). Then we break this up into: |
|
467 // xxxabc xxdef xx |
|
468 // Each pass through the loop consumes a run of filtered |
|
469 // characters (which are ignored) and a subsequent run of |
|
470 // unfiltered characters (which are transliterated). |
|
471 |
|
472 for (;;) { |
|
473 |
|
474 if (filter != NULL) { |
|
475 // Narrow the range to be transliterated to the first segment |
|
476 // of unfiltered characters at or after index.start. |
|
477 |
|
478 // Advance past filtered chars |
|
479 UChar32 c; |
|
480 while (index.start < globalLimit && |
|
481 !filter->contains(c=text.char32At(index.start))) { |
|
482 index.start += U16_LENGTH(c); |
|
483 } |
|
484 |
|
485 // Find the end of this run of unfiltered chars |
|
486 index.limit = index.start; |
|
487 while (index.limit < globalLimit && |
|
488 filter->contains(c=text.char32At(index.limit))) { |
|
489 index.limit += U16_LENGTH(c); |
|
490 } |
|
491 } |
|
492 |
|
493 // Check to see if the unfiltered run is empty. This only |
|
494 // happens at the end of the string when all the remaining |
|
495 // characters are filtered. |
|
496 if (index.limit == index.start) { |
|
497 // assert(index.start == globalLimit); |
|
498 break; |
|
499 } |
|
500 |
|
501 // Is this run incremental? If there is additional |
|
502 // filtered text (if limit < globalLimit) then we pass in |
|
503 // an incremental value of FALSE to force the subclass to |
|
504 // complete the transliteration for this run. |
|
505 UBool isIncrementalRun = |
|
506 (index.limit < globalLimit ? FALSE : incremental); |
|
507 |
|
508 int32_t delta; |
|
509 |
|
510 // Implement rollback. To understand the need for rollback, |
|
511 // consider the following transliterator: |
|
512 // |
|
513 // "t" is "a > A;" |
|
514 // "u" is "A > b;" |
|
515 // "v" is a compound of "t; NFD; u" with a filter [:Ll:] |
|
516 // |
|
517 // Now apply "c" to the input text "a". The result is "b". But if |
|
518 // the transliteration is done incrementally, then the NFD holds |
|
519 // things up after "t" has already transformed "a" to "A". When |
|
520 // finishTransliterate() is called, "A" is _not_ processed because |
|
521 // it gets excluded by the [:Ll:] filter, and the end result is "A" |
|
522 // -- incorrect. The problem is that the filter is applied to a |
|
523 // partially-transliterated result, when we only want it to apply to |
|
524 // input text. Although this example hinges on a compound |
|
525 // transliterator containing NFD and a specific filter, it can |
|
526 // actually happen with any transliterator which may do a partial |
|
527 // transformation in incremental mode into characters outside its |
|
528 // filter. |
|
529 // |
|
530 // To handle this, when in incremental mode we supply characters to |
|
531 // handleTransliterate() in several passes. Each pass adds one more |
|
532 // input character to the input text. That is, for input "ABCD", we |
|
533 // first try "A", then "AB", then "ABC", and finally "ABCD". If at |
|
534 // any point we block (upon return, start < limit) then we roll |
|
535 // back. If at any point we complete the run (upon return start == |
|
536 // limit) then we commit that run. |
|
537 |
|
538 if (rollback && isIncrementalRun) { |
|
539 |
|
540 int32_t runStart = index.start; |
|
541 int32_t runLimit = index.limit; |
|
542 int32_t runLength = runLimit - runStart; |
|
543 |
|
544 // Make a rollback copy at the end of the string |
|
545 int32_t rollbackOrigin = text.length(); |
|
546 text.copy(runStart, runLimit, rollbackOrigin); |
|
547 |
|
548 // Variables reflecting the commitment of completely |
|
549 // transliterated text. passStart is the runStart, advanced |
|
550 // past committed text. rollbackStart is the rollbackOrigin, |
|
551 // advanced past rollback text that corresponds to committed |
|
552 // text. |
|
553 int32_t passStart = runStart; |
|
554 int32_t rollbackStart = rollbackOrigin; |
|
555 |
|
556 // The limit for each pass; we advance by one code point with |
|
557 // each iteration. |
|
558 int32_t passLimit = index.start; |
|
559 |
|
560 // Total length, in 16-bit code units, of uncommitted text. |
|
561 // This is the length to be rolled back. |
|
562 int32_t uncommittedLength = 0; |
|
563 |
|
564 // Total delta (change in length) for all passes |
|
565 int32_t totalDelta = 0; |
|
566 |
|
567 // PASS MAIN LOOP -- Start with a single character, and extend |
|
568 // the text by one character at a time. Roll back partial |
|
569 // transliterations and commit complete transliterations. |
|
570 for (;;) { |
|
571 // Length of additional code point, either one or two |
|
572 int32_t charLength = U16_LENGTH(text.char32At(passLimit)); |
|
573 passLimit += charLength; |
|
574 if (passLimit > runLimit) { |
|
575 break; |
|
576 } |
|
577 uncommittedLength += charLength; |
|
578 |
|
579 index.limit = passLimit; |
|
580 |
|
581 // Delegate to subclass for actual transliteration. Upon |
|
582 // return, start will be updated to point after the |
|
583 // transliterated text, and limit and contextLimit will be |
|
584 // adjusted for length changes. |
|
585 handleTransliterate(text, index, TRUE); |
|
586 |
|
587 delta = index.limit - passLimit; // change in length |
|
588 |
|
589 // We failed to completely transliterate this pass. |
|
590 // Roll back the text. Indices remain unchanged; reset |
|
591 // them where necessary. |
|
592 if (index.start != index.limit) { |
|
593 // Find the rollbackStart, adjusted for length changes |
|
594 // and the deletion of partially transliterated text. |
|
595 int32_t rs = rollbackStart + delta - (index.limit - passStart); |
|
596 |
|
597 // Delete the partially transliterated text |
|
598 text.handleReplaceBetween(passStart, index.limit, UnicodeString()); |
|
599 |
|
600 // Copy the rollback text back |
|
601 text.copy(rs, rs + uncommittedLength, passStart); |
|
602 |
|
603 // Restore indices to their original values |
|
604 index.start = passStart; |
|
605 index.limit = passLimit; |
|
606 index.contextLimit -= delta; |
|
607 } |
|
608 |
|
609 // We did completely transliterate this pass. Update the |
|
610 // commit indices to record how far we got. Adjust indices |
|
611 // for length change. |
|
612 else { |
|
613 // Move the pass indices past the committed text. |
|
614 passStart = passLimit = index.start; |
|
615 |
|
616 // Adjust the rollbackStart for length changes and move |
|
617 // it past the committed text. All characters we've |
|
618 // processed to this point are committed now, so zero |
|
619 // out the uncommittedLength. |
|
620 rollbackStart += delta + uncommittedLength; |
|
621 uncommittedLength = 0; |
|
622 |
|
623 // Adjust indices for length changes. |
|
624 runLimit += delta; |
|
625 totalDelta += delta; |
|
626 } |
|
627 } |
|
628 |
|
629 // Adjust overall limit and rollbackOrigin for insertions and |
|
630 // deletions. Don't need to worry about contextLimit because |
|
631 // handleTransliterate() maintains that. |
|
632 rollbackOrigin += totalDelta; |
|
633 globalLimit += totalDelta; |
|
634 |
|
635 // Delete the rollback copy |
|
636 text.handleReplaceBetween(rollbackOrigin, rollbackOrigin + runLength, UnicodeString()); |
|
637 |
|
638 // Move start past committed text |
|
639 index.start = passStart; |
|
640 } |
|
641 |
|
642 else { |
|
643 // Delegate to subclass for actual transliteration. |
|
644 int32_t limit = index.limit; |
|
645 handleTransliterate(text, index, isIncrementalRun); |
|
646 delta = index.limit - limit; // change in length |
|
647 |
|
648 // In a properly written transliterator, start == limit after |
|
649 // handleTransliterate() returns when incremental is false. |
|
650 // Catch cases where the subclass doesn't do this, and throw |
|
651 // an exception. (Just pinning start to limit is a bad idea, |
|
652 // because what's probably happening is that the subclass |
|
653 // isn't transliterating all the way to the end, and it should |
|
654 // in non-incremental mode.) |
|
655 if (!incremental && index.start != index.limit) { |
|
656 // We can't throw an exception, so just fudge things |
|
657 index.start = index.limit; |
|
658 } |
|
659 |
|
660 // Adjust overall limit for insertions/deletions. Don't need |
|
661 // to worry about contextLimit because handleTransliterate() |
|
662 // maintains that. |
|
663 globalLimit += delta; |
|
664 } |
|
665 |
|
666 if (filter == NULL || isIncrementalRun) { |
|
667 break; |
|
668 } |
|
669 |
|
670 // If we did completely transliterate this |
|
671 // run, then repeat with the next unfiltered run. |
|
672 } |
|
673 |
|
674 // Start is valid where it is. Limit needs to be put back where |
|
675 // it was, modulo adjustments for deletions/insertions. |
|
676 index.limit = globalLimit; |
|
677 } |
|
678 |
|
679 void Transliterator::filteredTransliterate(Replaceable& text, |
|
680 UTransPosition& index, |
|
681 UBool incremental) const { |
|
682 filteredTransliterate(text, index, incremental, FALSE); |
|
683 } |
|
684 |
|
685 /** |
|
686 * Method for subclasses to use to set the maximum context length. |
|
687 * @see #getMaximumContextLength |
|
688 */ |
|
689 void Transliterator::setMaximumContextLength(int32_t maxContextLength) { |
|
690 maximumContextLength = maxContextLength; |
|
691 } |
|
692 |
|
693 /** |
|
694 * Returns a programmatic identifier for this transliterator. |
|
695 * If this identifier is passed to <code>getInstance()</code>, it |
|
696 * will return this object, if it has been registered. |
|
697 * @see #registerInstance |
|
698 * @see #getAvailableIDs |
|
699 */ |
|
700 const UnicodeString& Transliterator::getID(void) const { |
|
701 return ID; |
|
702 } |
|
703 |
|
704 /** |
|
705 * Returns a name for this transliterator that is appropriate for |
|
706 * display to the user in the default locale. See {@link |
|
707 * #getDisplayName(Locale)} for details. |
|
708 */ |
|
709 UnicodeString& U_EXPORT2 Transliterator::getDisplayName(const UnicodeString& ID, |
|
710 UnicodeString& result) { |
|
711 return getDisplayName(ID, Locale::getDefault(), result); |
|
712 } |
|
713 |
|
714 /** |
|
715 * Returns a name for this transliterator that is appropriate for |
|
716 * display to the user in the given locale. This name is taken |
|
717 * from the locale resource data in the standard manner of the |
|
718 * <code>java.text</code> package. |
|
719 * |
|
720 * <p>If no localized names exist in the system resource bundles, |
|
721 * a name is synthesized using a localized |
|
722 * <code>MessageFormat</code> pattern from the resource data. The |
|
723 * arguments to this pattern are an integer followed by one or two |
|
724 * strings. The integer is the number of strings, either 1 or 2. |
|
725 * The strings are formed by splitting the ID for this |
|
726 * transliterator at the first TARGET_SEP. If there is no TARGET_SEP, then the |
|
727 * entire ID forms the only string. |
|
728 * @param inLocale the Locale in which the display name should be |
|
729 * localized. |
|
730 * @see java.text.MessageFormat |
|
731 */ |
|
732 UnicodeString& U_EXPORT2 Transliterator::getDisplayName(const UnicodeString& id, |
|
733 const Locale& inLocale, |
|
734 UnicodeString& result) { |
|
735 UErrorCode status = U_ZERO_ERROR; |
|
736 |
|
737 ResourceBundle bundle(U_ICUDATA_TRANSLIT, inLocale, status); |
|
738 |
|
739 // Suspend checking status until later... |
|
740 |
|
741 result.truncate(0); |
|
742 |
|
743 // Normalize the ID |
|
744 UnicodeString source, target, variant; |
|
745 UBool sawSource; |
|
746 TransliteratorIDParser::IDtoSTV(id, source, target, variant, sawSource); |
|
747 if (target.length() < 1) { |
|
748 // No target; malformed id |
|
749 return result; |
|
750 } |
|
751 if (variant.length() > 0) { // Change "Foo" to "/Foo" |
|
752 variant.insert(0, VARIANT_SEP); |
|
753 } |
|
754 UnicodeString ID(source); |
|
755 ID.append(TARGET_SEP).append(target).append(variant); |
|
756 |
|
757 // build the char* key |
|
758 if (uprv_isInvariantUString(ID.getBuffer(), ID.length())) { |
|
759 char key[200]; |
|
760 uprv_strcpy(key, RB_DISPLAY_NAME_PREFIX); |
|
761 int32_t length=(int32_t)uprv_strlen(RB_DISPLAY_NAME_PREFIX); |
|
762 ID.extract(0, (int32_t)(sizeof(key)-length), key+length, (int32_t)(sizeof(key)-length), US_INV); |
|
763 |
|
764 // Try to retrieve a UnicodeString from the bundle. |
|
765 UnicodeString resString = bundle.getStringEx(key, status); |
|
766 |
|
767 if (U_SUCCESS(status) && resString.length() != 0) { |
|
768 return result = resString; // [sic] assign & return |
|
769 } |
|
770 |
|
771 #if !UCONFIG_NO_FORMATTING |
|
772 // We have failed to get a name from the locale data. This is |
|
773 // typical, since most transliterators will not have localized |
|
774 // name data. The next step is to retrieve the MessageFormat |
|
775 // pattern from the locale data and to use it to synthesize the |
|
776 // name from the ID. |
|
777 |
|
778 status = U_ZERO_ERROR; |
|
779 resString = bundle.getStringEx(RB_DISPLAY_NAME_PATTERN, status); |
|
780 |
|
781 if (U_SUCCESS(status) && resString.length() != 0) { |
|
782 MessageFormat msg(resString, inLocale, status); |
|
783 // Suspend checking status until later... |
|
784 |
|
785 // We pass either 2 or 3 Formattable objects to msg. |
|
786 Formattable args[3]; |
|
787 int32_t nargs; |
|
788 args[0].setLong(2); // # of args to follow |
|
789 args[1].setString(source); |
|
790 args[2].setString(target); |
|
791 nargs = 3; |
|
792 |
|
793 // Use display names for the scripts, if they exist |
|
794 UnicodeString s; |
|
795 length=(int32_t)uprv_strlen(RB_SCRIPT_DISPLAY_NAME_PREFIX); |
|
796 for (int j=1; j<=2; ++j) { |
|
797 status = U_ZERO_ERROR; |
|
798 uprv_strcpy(key, RB_SCRIPT_DISPLAY_NAME_PREFIX); |
|
799 args[j].getString(s); |
|
800 if (uprv_isInvariantUString(s.getBuffer(), s.length())) { |
|
801 s.extract(0, sizeof(key)-length-1, key+length, (int32_t)sizeof(key)-length-1, US_INV); |
|
802 |
|
803 resString = bundle.getStringEx(key, status); |
|
804 |
|
805 if (U_SUCCESS(status)) { |
|
806 args[j] = resString; |
|
807 } |
|
808 } |
|
809 } |
|
810 |
|
811 status = U_ZERO_ERROR; |
|
812 FieldPosition pos; // ignored by msg |
|
813 msg.format(args, nargs, result, pos, status); |
|
814 if (U_SUCCESS(status)) { |
|
815 result.append(variant); |
|
816 return result; |
|
817 } |
|
818 } |
|
819 #endif |
|
820 } |
|
821 |
|
822 // We should not reach this point unless there is something |
|
823 // wrong with the build or the RB_DISPLAY_NAME_PATTERN has |
|
824 // been deleted from the root RB_LOCALE_ELEMENTS resource. |
|
825 result = ID; |
|
826 return result; |
|
827 } |
|
828 |
|
829 /** |
|
830 * Returns the filter used by this transliterator, or <tt>null</tt> |
|
831 * if this transliterator uses no filter. Caller musn't delete |
|
832 * the result! |
|
833 */ |
|
834 const UnicodeFilter* Transliterator::getFilter(void) const { |
|
835 return filter; |
|
836 } |
|
837 |
|
838 /** |
|
839 * Returns the filter used by this transliterator, or |
|
840 * <tt>NULL</tt> if this transliterator uses no filter. The |
|
841 * caller must eventually delete the result. After this call, |
|
842 * this transliterator's filter is set to <tt>NULL</tt>. |
|
843 */ |
|
844 UnicodeFilter* Transliterator::orphanFilter(void) { |
|
845 UnicodeFilter *result = filter; |
|
846 filter = NULL; |
|
847 return result; |
|
848 } |
|
849 |
|
850 /** |
|
851 * Changes the filter used by this transliterator. If the filter |
|
852 * is set to <tt>null</tt> then no filtering will occur. |
|
853 * |
|
854 * <p>Callers must take care if a transliterator is in use by |
|
855 * multiple threads. The filter should not be changed by one |
|
856 * thread while another thread may be transliterating. |
|
857 */ |
|
858 void Transliterator::adoptFilter(UnicodeFilter* filterToAdopt) { |
|
859 delete filter; |
|
860 filter = filterToAdopt; |
|
861 } |
|
862 |
|
863 /** |
|
864 * Returns this transliterator's inverse. See the class |
|
865 * documentation for details. This implementation simply inverts |
|
866 * the two entities in the ID and attempts to retrieve the |
|
867 * resulting transliterator. That is, if <code>getID()</code> |
|
868 * returns "A-B", then this method will return the result of |
|
869 * <code>getInstance("B-A")</code>, or <code>null</code> if that |
|
870 * call fails. |
|
871 * |
|
872 * <p>This method does not take filtering into account. The |
|
873 * returned transliterator will have no filter. |
|
874 * |
|
875 * <p>Subclasses with knowledge of their inverse may wish to |
|
876 * override this method. |
|
877 * |
|
878 * @return a transliterator that is an inverse, not necessarily |
|
879 * exact, of this transliterator, or <code>null</code> if no such |
|
880 * transliterator is registered. |
|
881 * @see #registerInstance |
|
882 */ |
|
883 Transliterator* Transliterator::createInverse(UErrorCode& status) const { |
|
884 UParseError parseError; |
|
885 return Transliterator::createInstance(ID, UTRANS_REVERSE,parseError,status); |
|
886 } |
|
887 |
|
888 Transliterator* U_EXPORT2 |
|
889 Transliterator::createInstance(const UnicodeString& ID, |
|
890 UTransDirection dir, |
|
891 UErrorCode& status) |
|
892 { |
|
893 UParseError parseError; |
|
894 return createInstance(ID, dir, parseError, status); |
|
895 } |
|
896 |
|
897 /** |
|
898 * Returns a <code>Transliterator</code> object given its ID. |
|
899 * The ID must be either a system transliterator ID or a ID registered |
|
900 * using <code>registerInstance()</code>. |
|
901 * |
|
902 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code> |
|
903 * @return A <code>Transliterator</code> object with the given ID |
|
904 * @see #registerInstance |
|
905 * @see #getAvailableIDs |
|
906 * @see #getID |
|
907 */ |
|
908 Transliterator* U_EXPORT2 |
|
909 Transliterator::createInstance(const UnicodeString& ID, |
|
910 UTransDirection dir, |
|
911 UParseError& parseError, |
|
912 UErrorCode& status) |
|
913 { |
|
914 if (U_FAILURE(status)) { |
|
915 return 0; |
|
916 } |
|
917 |
|
918 UnicodeString canonID; |
|
919 UVector list(status); |
|
920 if (U_FAILURE(status)) { |
|
921 return NULL; |
|
922 } |
|
923 |
|
924 UnicodeSet* globalFilter; |
|
925 // TODO add code for parseError...currently unused, but |
|
926 // later may be used by parsing code... |
|
927 if (!TransliteratorIDParser::parseCompoundID(ID, dir, canonID, list, globalFilter)) { |
|
928 status = U_INVALID_ID; |
|
929 return NULL; |
|
930 } |
|
931 |
|
932 TransliteratorIDParser::instantiateList(list, status); |
|
933 if (U_FAILURE(status)) { |
|
934 return NULL; |
|
935 } |
|
936 |
|
937 U_ASSERT(list.size() > 0); |
|
938 Transliterator* t = NULL; |
|
939 |
|
940 if (list.size() > 1 || canonID.indexOf(ID_DELIM) >= 0) { |
|
941 // [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only |
|
942 // has one child transliterator. This is so that toRules() will return the right thing |
|
943 // (without any inactive ID), but our main ID still comes out correct. That is, if we |
|
944 // instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;" |
|
945 // even though the ID is "(Lower);Latin-Greek;". |
|
946 t = new CompoundTransliterator(list, parseError, status); |
|
947 } |
|
948 else { |
|
949 t = (Transliterator*)list.elementAt(0); |
|
950 } |
|
951 // Check null pointer |
|
952 if (t != NULL) { |
|
953 t->setID(canonID); |
|
954 if (globalFilter != NULL) { |
|
955 t->adoptFilter(globalFilter); |
|
956 } |
|
957 } |
|
958 else if (U_SUCCESS(status)) { |
|
959 status = U_MEMORY_ALLOCATION_ERROR; |
|
960 } |
|
961 return t; |
|
962 } |
|
963 |
|
964 /** |
|
965 * Create a transliterator from a basic ID. This is an ID |
|
966 * containing only the forward direction source, target, and |
|
967 * variant. |
|
968 * @param id a basic ID of the form S-T or S-T/V. |
|
969 * @return a newly created Transliterator or null if the ID is |
|
970 * invalid. |
|
971 */ |
|
972 Transliterator* Transliterator::createBasicInstance(const UnicodeString& id, |
|
973 const UnicodeString* canon) { |
|
974 UParseError pe; |
|
975 UErrorCode ec = U_ZERO_ERROR; |
|
976 TransliteratorAlias* alias = 0; |
|
977 Transliterator* t = 0; |
|
978 |
|
979 umtx_lock(®istryMutex); |
|
980 if (HAVE_REGISTRY(ec)) { |
|
981 t = registry->get(id, alias, ec); |
|
982 } |
|
983 umtx_unlock(®istryMutex); |
|
984 |
|
985 if (U_FAILURE(ec)) { |
|
986 delete t; |
|
987 delete alias; |
|
988 return 0; |
|
989 } |
|
990 |
|
991 // We may have not gotten a transliterator: Because we can't |
|
992 // instantiate a transliterator from inside TransliteratorRegistry:: |
|
993 // get() (that would deadlock), we sometimes pass back an alias. This |
|
994 // contains the data we need to finish the instantiation outside the |
|
995 // registry mutex. The alias may, in turn, generate another alias, so |
|
996 // we handle aliases in a loop. The max times through the loop is two. |
|
997 // [alan] |
|
998 while (alias != 0) { |
|
999 U_ASSERT(t==0); |
|
1000 // Rule-based aliases are handled with TransliteratorAlias:: |
|
1001 // parse(), followed by TransliteratorRegistry::reget(). |
|
1002 // Other aliases are handled with TransliteratorAlias::create(). |
|
1003 if (alias->isRuleBased()) { |
|
1004 // Step 1. parse |
|
1005 TransliteratorParser parser(ec); |
|
1006 alias->parse(parser, pe, ec); |
|
1007 delete alias; |
|
1008 alias = 0; |
|
1009 |
|
1010 // Step 2. reget |
|
1011 umtx_lock(®istryMutex); |
|
1012 if (HAVE_REGISTRY(ec)) { |
|
1013 t = registry->reget(id, parser, alias, ec); |
|
1014 } |
|
1015 umtx_unlock(®istryMutex); |
|
1016 |
|
1017 // Step 3. Loop back around! |
|
1018 } else { |
|
1019 t = alias->create(pe, ec); |
|
1020 delete alias; |
|
1021 alias = 0; |
|
1022 break; |
|
1023 } |
|
1024 if (U_FAILURE(ec)) { |
|
1025 delete t; |
|
1026 delete alias; |
|
1027 t = NULL; |
|
1028 break; |
|
1029 } |
|
1030 } |
|
1031 |
|
1032 if (t != NULL && canon != NULL) { |
|
1033 t->setID(*canon); |
|
1034 } |
|
1035 |
|
1036 return t; |
|
1037 } |
|
1038 |
|
1039 /** |
|
1040 * Returns a <code>Transliterator</code> object constructed from |
|
1041 * the given rule string. This will be a RuleBasedTransliterator, |
|
1042 * if the rule string contains only rules, or a |
|
1043 * CompoundTransliterator, if it contains ID blocks, or a |
|
1044 * NullTransliterator, if it contains ID blocks which parse as |
|
1045 * empty for the given direction. |
|
1046 */ |
|
1047 Transliterator* U_EXPORT2 |
|
1048 Transliterator::createFromRules(const UnicodeString& ID, |
|
1049 const UnicodeString& rules, |
|
1050 UTransDirection dir, |
|
1051 UParseError& parseError, |
|
1052 UErrorCode& status) |
|
1053 { |
|
1054 Transliterator* t = NULL; |
|
1055 |
|
1056 TransliteratorParser parser(status); |
|
1057 parser.parse(rules, dir, parseError, status); |
|
1058 |
|
1059 if (U_FAILURE(status)) { |
|
1060 return 0; |
|
1061 } |
|
1062 |
|
1063 // NOTE: The logic here matches that in TransliteratorRegistry. |
|
1064 if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 0) { |
|
1065 t = new NullTransliterator(); |
|
1066 } |
|
1067 else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) { |
|
1068 t = new RuleBasedTransliterator(ID, (TransliterationRuleData*)parser.dataVector.orphanElementAt(0), TRUE); |
|
1069 } |
|
1070 else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) { |
|
1071 // idBlock, no data -- this is an alias. The ID has |
|
1072 // been munged from reverse into forward mode, if |
|
1073 // necessary, so instantiate the ID in the forward |
|
1074 // direction. |
|
1075 if (parser.compoundFilter != NULL) { |
|
1076 UnicodeString filterPattern; |
|
1077 parser.compoundFilter->toPattern(filterPattern, FALSE); |
|
1078 t = createInstance(filterPattern + UnicodeString(ID_DELIM) |
|
1079 + *((UnicodeString*)parser.idBlockVector.elementAt(0)), UTRANS_FORWARD, parseError, status); |
|
1080 } |
|
1081 else |
|
1082 t = createInstance(*((UnicodeString*)parser.idBlockVector.elementAt(0)), UTRANS_FORWARD, parseError, status); |
|
1083 |
|
1084 |
|
1085 if (t != NULL) { |
|
1086 t->setID(ID); |
|
1087 } |
|
1088 } |
|
1089 else { |
|
1090 UVector transliterators(status); |
|
1091 int32_t passNumber = 1; |
|
1092 |
|
1093 int32_t limit = parser.idBlockVector.size(); |
|
1094 if (parser.dataVector.size() > limit) |
|
1095 limit = parser.dataVector.size(); |
|
1096 |
|
1097 for (int32_t i = 0; i < limit; i++) { |
|
1098 if (i < parser.idBlockVector.size()) { |
|
1099 UnicodeString* idBlock = (UnicodeString*)parser.idBlockVector.elementAt(i); |
|
1100 if (!idBlock->isEmpty()) { |
|
1101 Transliterator* temp = createInstance(*idBlock, UTRANS_FORWARD, parseError, status); |
|
1102 if (temp != NULL && typeid(*temp) != typeid(NullTransliterator)) |
|
1103 transliterators.addElement(temp, status); |
|
1104 else |
|
1105 delete temp; |
|
1106 } |
|
1107 } |
|
1108 if (!parser.dataVector.isEmpty()) { |
|
1109 TransliterationRuleData* data = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0); |
|
1110 // TODO: Should passNumber be turned into a decimal-string representation (1 -> "1")? |
|
1111 RuleBasedTransliterator* temprbt = new RuleBasedTransliterator(UnicodeString(CompoundTransliterator::PASS_STRING) + UnicodeString(passNumber++), |
|
1112 data, TRUE); |
|
1113 // Check if NULL before adding it to transliterators to avoid future usage of NULL pointer. |
|
1114 if (temprbt == NULL) { |
|
1115 status = U_MEMORY_ALLOCATION_ERROR; |
|
1116 return t; |
|
1117 } |
|
1118 transliterators.addElement(temprbt, status); |
|
1119 } |
|
1120 } |
|
1121 |
|
1122 t = new CompoundTransliterator(transliterators, passNumber - 1, parseError, status); |
|
1123 // Null pointer check |
|
1124 if (t != NULL) { |
|
1125 t->setID(ID); |
|
1126 t->adoptFilter(parser.orphanCompoundFilter()); |
|
1127 } |
|
1128 } |
|
1129 if (U_SUCCESS(status) && t == NULL) { |
|
1130 status = U_MEMORY_ALLOCATION_ERROR; |
|
1131 } |
|
1132 return t; |
|
1133 } |
|
1134 |
|
1135 UnicodeString& Transliterator::toRules(UnicodeString& rulesSource, |
|
1136 UBool escapeUnprintable) const { |
|
1137 // The base class implementation of toRules munges the ID into |
|
1138 // the correct format. That is: foo => ::foo |
|
1139 if (escapeUnprintable) { |
|
1140 rulesSource.truncate(0); |
|
1141 UnicodeString id = getID(); |
|
1142 for (int32_t i=0; i<id.length();) { |
|
1143 UChar32 c = id.char32At(i); |
|
1144 if (!ICU_Utility::escapeUnprintable(rulesSource, c)) { |
|
1145 rulesSource.append(c); |
|
1146 } |
|
1147 i += U16_LENGTH(c); |
|
1148 } |
|
1149 } else { |
|
1150 rulesSource = getID(); |
|
1151 } |
|
1152 // KEEP in sync with rbt_pars |
|
1153 rulesSource.insert(0, UNICODE_STRING_SIMPLE("::")); |
|
1154 rulesSource.append(ID_DELIM); |
|
1155 return rulesSource; |
|
1156 } |
|
1157 |
|
1158 int32_t Transliterator::countElements() const { |
|
1159 const CompoundTransliterator* ct = dynamic_cast<const CompoundTransliterator*>(this); |
|
1160 return ct != NULL ? ct->getCount() : 0; |
|
1161 } |
|
1162 |
|
1163 const Transliterator& Transliterator::getElement(int32_t index, UErrorCode& ec) const { |
|
1164 if (U_FAILURE(ec)) { |
|
1165 return *this; |
|
1166 } |
|
1167 const CompoundTransliterator* cpd = dynamic_cast<const CompoundTransliterator*>(this); |
|
1168 int32_t n = (cpd == NULL) ? 1 : cpd->getCount(); |
|
1169 if (index < 0 || index >= n) { |
|
1170 ec = U_INDEX_OUTOFBOUNDS_ERROR; |
|
1171 return *this; |
|
1172 } else { |
|
1173 return (n == 1) ? *this : cpd->getTransliterator(index); |
|
1174 } |
|
1175 } |
|
1176 |
|
1177 UnicodeSet& Transliterator::getSourceSet(UnicodeSet& result) const { |
|
1178 handleGetSourceSet(result); |
|
1179 if (filter != NULL) { |
|
1180 UnicodeSet* filterSet = dynamic_cast<UnicodeSet*>(filter); |
|
1181 UBool deleteFilterSet = FALSE; |
|
1182 // Most, but not all filters will be UnicodeSets. Optimize for |
|
1183 // the high-runner case. |
|
1184 if (filterSet == NULL) { |
|
1185 filterSet = new UnicodeSet(); |
|
1186 // Check null pointer |
|
1187 if (filterSet == NULL) { |
|
1188 return result; |
|
1189 } |
|
1190 deleteFilterSet = TRUE; |
|
1191 filter->addMatchSetTo(*filterSet); |
|
1192 } |
|
1193 result.retainAll(*filterSet); |
|
1194 if (deleteFilterSet) { |
|
1195 delete filterSet; |
|
1196 } |
|
1197 } |
|
1198 return result; |
|
1199 } |
|
1200 |
|
1201 void Transliterator::handleGetSourceSet(UnicodeSet& result) const { |
|
1202 result.clear(); |
|
1203 } |
|
1204 |
|
1205 UnicodeSet& Transliterator::getTargetSet(UnicodeSet& result) const { |
|
1206 return result.clear(); |
|
1207 } |
|
1208 |
|
1209 // For public consumption |
|
1210 void U_EXPORT2 Transliterator::registerFactory(const UnicodeString& id, |
|
1211 Transliterator::Factory factory, |
|
1212 Transliterator::Token context) { |
|
1213 Mutex lock(®istryMutex); |
|
1214 UErrorCode ec = U_ZERO_ERROR; |
|
1215 if (HAVE_REGISTRY(ec)) { |
|
1216 _registerFactory(id, factory, context); |
|
1217 } |
|
1218 } |
|
1219 |
|
1220 // To be called only by Transliterator subclasses that are called |
|
1221 // to register themselves by initializeRegistry(). |
|
1222 void Transliterator::_registerFactory(const UnicodeString& id, |
|
1223 Transliterator::Factory factory, |
|
1224 Transliterator::Token context) { |
|
1225 UErrorCode ec = U_ZERO_ERROR; |
|
1226 registry->put(id, factory, context, TRUE, ec); |
|
1227 } |
|
1228 |
|
1229 // To be called only by Transliterator subclasses that are called |
|
1230 // to register themselves by initializeRegistry(). |
|
1231 void Transliterator::_registerSpecialInverse(const UnicodeString& target, |
|
1232 const UnicodeString& inverseTarget, |
|
1233 UBool bidirectional) { |
|
1234 UErrorCode status = U_ZERO_ERROR; |
|
1235 TransliteratorIDParser::registerSpecialInverse(target, inverseTarget, bidirectional, status); |
|
1236 } |
|
1237 |
|
1238 /** |
|
1239 * Registers a instance <tt>obj</tt> of a subclass of |
|
1240 * <code>Transliterator</code> with the system. This object must |
|
1241 * implement the <tt>clone()</tt> method. When |
|
1242 * <tt>getInstance()</tt> is called with an ID string that is |
|
1243 * equal to <tt>obj.getID()</tt>, then <tt>obj.clone()</tt> is |
|
1244 * returned. |
|
1245 * |
|
1246 * @param obj an instance of subclass of |
|
1247 * <code>Transliterator</code> that defines <tt>clone()</tt> |
|
1248 * @see #getInstance |
|
1249 * @see #unregister |
|
1250 */ |
|
1251 void U_EXPORT2 Transliterator::registerInstance(Transliterator* adoptedPrototype) { |
|
1252 Mutex lock(®istryMutex); |
|
1253 UErrorCode ec = U_ZERO_ERROR; |
|
1254 if (HAVE_REGISTRY(ec)) { |
|
1255 _registerInstance(adoptedPrototype); |
|
1256 } |
|
1257 } |
|
1258 |
|
1259 void Transliterator::_registerInstance(Transliterator* adoptedPrototype) { |
|
1260 UErrorCode ec = U_ZERO_ERROR; |
|
1261 registry->put(adoptedPrototype, TRUE, ec); |
|
1262 } |
|
1263 |
|
1264 void U_EXPORT2 Transliterator::registerAlias(const UnicodeString& aliasID, |
|
1265 const UnicodeString& realID) { |
|
1266 Mutex lock(®istryMutex); |
|
1267 UErrorCode ec = U_ZERO_ERROR; |
|
1268 if (HAVE_REGISTRY(ec)) { |
|
1269 _registerAlias(aliasID, realID); |
|
1270 } |
|
1271 } |
|
1272 |
|
1273 void Transliterator::_registerAlias(const UnicodeString& aliasID, |
|
1274 const UnicodeString& realID) { |
|
1275 UErrorCode ec = U_ZERO_ERROR; |
|
1276 registry->put(aliasID, realID, FALSE, TRUE, ec); |
|
1277 } |
|
1278 |
|
1279 /** |
|
1280 * Unregisters a transliterator or class. This may be either |
|
1281 * a system transliterator or a user transliterator or class. |
|
1282 * |
|
1283 * @param ID the ID of the transliterator or class |
|
1284 * @see #registerInstance |
|
1285 |
|
1286 */ |
|
1287 void U_EXPORT2 Transliterator::unregister(const UnicodeString& ID) { |
|
1288 Mutex lock(®istryMutex); |
|
1289 UErrorCode ec = U_ZERO_ERROR; |
|
1290 if (HAVE_REGISTRY(ec)) { |
|
1291 registry->remove(ID); |
|
1292 } |
|
1293 } |
|
1294 |
|
1295 /** |
|
1296 * == OBSOLETE - remove in ICU 3.4 == |
|
1297 * Return the number of IDs currently registered with the system. |
|
1298 * To retrieve the actual IDs, call getAvailableID(i) with |
|
1299 * i from 0 to countAvailableIDs() - 1. |
|
1300 */ |
|
1301 int32_t U_EXPORT2 Transliterator::countAvailableIDs(void) { |
|
1302 int32_t retVal = 0; |
|
1303 Mutex lock(®istryMutex); |
|
1304 UErrorCode ec = U_ZERO_ERROR; |
|
1305 if (HAVE_REGISTRY(ec)) { |
|
1306 retVal = registry->countAvailableIDs(); |
|
1307 } |
|
1308 return retVal; |
|
1309 } |
|
1310 |
|
1311 /** |
|
1312 * == OBSOLETE - remove in ICU 3.4 == |
|
1313 * Return the index-th available ID. index must be between 0 |
|
1314 * and countAvailableIDs() - 1, inclusive. If index is out of |
|
1315 * range, the result of getAvailableID(0) is returned. |
|
1316 */ |
|
1317 const UnicodeString& U_EXPORT2 Transliterator::getAvailableID(int32_t index) { |
|
1318 const UnicodeString* result = NULL; |
|
1319 umtx_lock(®istryMutex); |
|
1320 UErrorCode ec = U_ZERO_ERROR; |
|
1321 if (HAVE_REGISTRY(ec)) { |
|
1322 result = ®istry->getAvailableID(index); |
|
1323 } |
|
1324 umtx_unlock(®istryMutex); |
|
1325 U_ASSERT(result != NULL); // fail if no registry |
|
1326 return *result; |
|
1327 } |
|
1328 |
|
1329 StringEnumeration* U_EXPORT2 Transliterator::getAvailableIDs(UErrorCode& ec) { |
|
1330 if (U_FAILURE(ec)) return NULL; |
|
1331 StringEnumeration* result = NULL; |
|
1332 umtx_lock(®istryMutex); |
|
1333 if (HAVE_REGISTRY(ec)) { |
|
1334 result = registry->getAvailableIDs(); |
|
1335 } |
|
1336 umtx_unlock(®istryMutex); |
|
1337 if (result == NULL) { |
|
1338 ec = U_INTERNAL_TRANSLITERATOR_ERROR; |
|
1339 } |
|
1340 return result; |
|
1341 } |
|
1342 |
|
1343 int32_t U_EXPORT2 Transliterator::countAvailableSources(void) { |
|
1344 Mutex lock(®istryMutex); |
|
1345 UErrorCode ec = U_ZERO_ERROR; |
|
1346 return HAVE_REGISTRY(ec) ? _countAvailableSources() : 0; |
|
1347 } |
|
1348 |
|
1349 UnicodeString& U_EXPORT2 Transliterator::getAvailableSource(int32_t index, |
|
1350 UnicodeString& result) { |
|
1351 Mutex lock(®istryMutex); |
|
1352 UErrorCode ec = U_ZERO_ERROR; |
|
1353 if (HAVE_REGISTRY(ec)) { |
|
1354 _getAvailableSource(index, result); |
|
1355 } |
|
1356 return result; |
|
1357 } |
|
1358 |
|
1359 int32_t U_EXPORT2 Transliterator::countAvailableTargets(const UnicodeString& source) { |
|
1360 Mutex lock(®istryMutex); |
|
1361 UErrorCode ec = U_ZERO_ERROR; |
|
1362 return HAVE_REGISTRY(ec) ? _countAvailableTargets(source) : 0; |
|
1363 } |
|
1364 |
|
1365 UnicodeString& U_EXPORT2 Transliterator::getAvailableTarget(int32_t index, |
|
1366 const UnicodeString& source, |
|
1367 UnicodeString& result) { |
|
1368 Mutex lock(®istryMutex); |
|
1369 UErrorCode ec = U_ZERO_ERROR; |
|
1370 if (HAVE_REGISTRY(ec)) { |
|
1371 _getAvailableTarget(index, source, result); |
|
1372 } |
|
1373 return result; |
|
1374 } |
|
1375 |
|
1376 int32_t U_EXPORT2 Transliterator::countAvailableVariants(const UnicodeString& source, |
|
1377 const UnicodeString& target) { |
|
1378 Mutex lock(®istryMutex); |
|
1379 UErrorCode ec = U_ZERO_ERROR; |
|
1380 return HAVE_REGISTRY(ec) ? _countAvailableVariants(source, target) : 0; |
|
1381 } |
|
1382 |
|
1383 UnicodeString& U_EXPORT2 Transliterator::getAvailableVariant(int32_t index, |
|
1384 const UnicodeString& source, |
|
1385 const UnicodeString& target, |
|
1386 UnicodeString& result) { |
|
1387 Mutex lock(®istryMutex); |
|
1388 UErrorCode ec = U_ZERO_ERROR; |
|
1389 if (HAVE_REGISTRY(ec)) { |
|
1390 _getAvailableVariant(index, source, target, result); |
|
1391 } |
|
1392 return result; |
|
1393 } |
|
1394 |
|
1395 int32_t Transliterator::_countAvailableSources(void) { |
|
1396 return registry->countAvailableSources(); |
|
1397 } |
|
1398 |
|
1399 UnicodeString& Transliterator::_getAvailableSource(int32_t index, |
|
1400 UnicodeString& result) { |
|
1401 return registry->getAvailableSource(index, result); |
|
1402 } |
|
1403 |
|
1404 int32_t Transliterator::_countAvailableTargets(const UnicodeString& source) { |
|
1405 return registry->countAvailableTargets(source); |
|
1406 } |
|
1407 |
|
1408 UnicodeString& Transliterator::_getAvailableTarget(int32_t index, |
|
1409 const UnicodeString& source, |
|
1410 UnicodeString& result) { |
|
1411 return registry->getAvailableTarget(index, source, result); |
|
1412 } |
|
1413 |
|
1414 int32_t Transliterator::_countAvailableVariants(const UnicodeString& source, |
|
1415 const UnicodeString& target) { |
|
1416 return registry->countAvailableVariants(source, target); |
|
1417 } |
|
1418 |
|
1419 UnicodeString& Transliterator::_getAvailableVariant(int32_t index, |
|
1420 const UnicodeString& source, |
|
1421 const UnicodeString& target, |
|
1422 UnicodeString& result) { |
|
1423 return registry->getAvailableVariant(index, source, target, result); |
|
1424 } |
|
1425 |
|
#ifdef U_USE_DEPRECATED_TRANSLITERATOR_API

/**
 * Method for subclasses to use to obtain a character in the given
 * string, with filtering.
 *
 * @param text the text to read from
 * @param i    the offset of the character to read
 * @return text.charAt(i) when there is no filter or the filter contains
 *         that character; otherwise 0xFFFE
 * @deprecated the new architecture provides filtering at the top
 * level.  This method will be removed Dec 31 2001.
 */
UChar Transliterator::filteredCharAt(const Replaceable& text, int32_t i) const {
    const UnicodeFilter* const activeFilter = getFilter();
    const UChar ch = text.charAt(i);
    if (activeFilter == 0) {
        return ch;
    }
    if (activeFilter->contains(ch)) {
        return ch;
    }
    return (UChar)0xFFFE;
}

#endif
|
1442 |
|
1443 /** |
|
1444 * If the registry is initialized, return TRUE. If not, initialize it |
|
1445 * and return TRUE. If the registry cannot be initialized, return |
|
1446 * FALSE (rare). |
|
1447 * |
|
1448 * IMPORTANT: Upon entry, registryMutex must be LOCKED. The entire |
|
1449 * initialization is done with the lock held. There is NO REASON to |
|
1450 * unlock, since no other thread that is waiting on the registryMutex |
|
1451 * cannot itself proceed until the registry is initialized. |
|
1452 */ |
|
1453 UBool Transliterator::initializeRegistry(UErrorCode &status) { |
|
1454 if (registry != 0) { |
|
1455 return TRUE; |
|
1456 } |
|
1457 |
|
1458 registry = new TransliteratorRegistry(status); |
|
1459 if (registry == 0 || U_FAILURE(status)) { |
|
1460 delete registry; |
|
1461 registry = 0; |
|
1462 return FALSE; // can't create registry, no recovery |
|
1463 } |
|
1464 |
|
1465 /* The following code parses the index table located in |
|
1466 * icu/data/translit/root.txt. The index is an n x 4 table |
|
1467 * that follows this format: |
|
1468 * <id>{ |
|
1469 * file{ |
|
1470 * resource{"<resource>"} |
|
1471 * direction{"<direction>"} |
|
1472 * } |
|
1473 * } |
|
1474 * <id>{ |
|
1475 * internal{ |
|
1476 * resource{"<resource>"} |
|
1477 * direction{"<direction"} |
|
1478 * } |
|
1479 * } |
|
1480 * <id>{ |
|
1481 * alias{"<getInstanceArg"} |
|
1482 * } |
|
1483 * <id> is the ID of the system transliterator being defined. These |
|
1484 * are public IDs enumerated by Transliterator.getAvailableIDs(), |
|
1485 * unless the second field is "internal". |
|
1486 * |
|
1487 * <resource> is a ResourceReader resource name. Currently these refer |
|
1488 * to file names under com/ibm/text/resources. This string is passed |
|
1489 * directly to ResourceReader, together with <encoding>. |
|
1490 * |
|
1491 * <direction> is either "FORWARD" or "REVERSE". |
|
1492 * |
|
1493 * <getInstanceArg> is a string to be passed directly to |
|
1494 * Transliterator.getInstance(). The returned Transliterator object |
|
1495 * then has its ID changed to <id> and is returned. |
|
1496 * |
|
1497 * The extra blank field on "alias" lines is to make the array square. |
|
1498 */ |
|
1499 //static const char translit_index[] = "translit_index"; |
|
1500 |
|
1501 UResourceBundle *bundle, *transIDs, *colBund; |
|
1502 bundle = ures_open(U_ICUDATA_TRANSLIT, NULL/*open default locale*/, &status); |
|
1503 transIDs = ures_getByKey(bundle, RB_RULE_BASED_IDS, 0, &status); |
|
1504 |
|
1505 int32_t row, maxRows; |
|
1506 if (U_SUCCESS(status)) { |
|
1507 maxRows = ures_getSize(transIDs); |
|
1508 for (row = 0; row < maxRows; row++) { |
|
1509 colBund = ures_getByIndex(transIDs, row, 0, &status); |
|
1510 if (U_SUCCESS(status)) { |
|
1511 UnicodeString id(ures_getKey(colBund), -1, US_INV); |
|
1512 UResourceBundle* res = ures_getNextResource(colBund, NULL, &status); |
|
1513 const char* typeStr = ures_getKey(res); |
|
1514 UChar type; |
|
1515 u_charsToUChars(typeStr, &type, 1); |
|
1516 |
|
1517 if (U_SUCCESS(status)) { |
|
1518 int32_t len = 0; |
|
1519 const UChar *resString; |
|
1520 switch (type) { |
|
1521 case 0x66: // 'f' |
|
1522 case 0x69: // 'i' |
|
1523 // 'file' or 'internal'; |
|
1524 // row[2]=resource, row[3]=direction |
|
1525 { |
|
1526 |
|
1527 resString = ures_getStringByKey(res, "resource", &len, &status); |
|
1528 UBool visible = (type == 0x0066 /*f*/); |
|
1529 UTransDirection dir = |
|
1530 (ures_getUnicodeStringByKey(res, "direction", &status).charAt(0) == |
|
1531 0x0046 /*F*/) ? |
|
1532 UTRANS_FORWARD : UTRANS_REVERSE; |
|
1533 registry->put(id, UnicodeString(TRUE, resString, len), dir, TRUE, visible, status); |
|
1534 } |
|
1535 break; |
|
1536 case 0x61: // 'a' |
|
1537 // 'alias'; row[2]=createInstance argument |
|
1538 resString = ures_getString(res, &len, &status); |
|
1539 registry->put(id, UnicodeString(TRUE, resString, len), TRUE, TRUE, status); |
|
1540 break; |
|
1541 } |
|
1542 } |
|
1543 ures_close(res); |
|
1544 } |
|
1545 ures_close(colBund); |
|
1546 } |
|
1547 } |
|
1548 |
|
1549 ures_close(transIDs); |
|
1550 ures_close(bundle); |
|
1551 |
|
1552 // Manually add prototypes that the system knows about to the |
|
1553 // cache. This is how new non-rule-based transliterators are |
|
1554 // added to the system. |
|
1555 |
|
1556 // This is to allow for null pointer check |
|
1557 NullTransliterator* tempNullTranslit = new NullTransliterator(); |
|
1558 LowercaseTransliterator* tempLowercaseTranslit = new LowercaseTransliterator(); |
|
1559 UppercaseTransliterator* tempUppercaseTranslit = new UppercaseTransliterator(); |
|
1560 TitlecaseTransliterator* tempTitlecaseTranslit = new TitlecaseTransliterator(); |
|
1561 UnicodeNameTransliterator* tempUnicodeTranslit = new UnicodeNameTransliterator(); |
|
1562 NameUnicodeTransliterator* tempNameUnicodeTranslit = new NameUnicodeTransliterator(); |
|
1563 #if !UCONFIG_NO_BREAK_ITERATION |
|
1564 // TODO: could or should these transliterators be referenced polymorphically once constructed? |
|
1565 BreakTransliterator* tempBreakTranslit = new BreakTransliterator(); |
|
1566 #endif |
|
1567 // Check for null pointers |
|
1568 if (tempNullTranslit == NULL || tempLowercaseTranslit == NULL || tempUppercaseTranslit == NULL || |
|
1569 tempTitlecaseTranslit == NULL || tempUnicodeTranslit == NULL || |
|
1570 #if !UCONFIG_NO_BREAK_ITERATION |
|
1571 tempBreakTranslit == NULL || |
|
1572 #endif |
|
1573 tempNameUnicodeTranslit == NULL ) |
|
1574 { |
|
1575 delete tempNullTranslit; |
|
1576 delete tempLowercaseTranslit; |
|
1577 delete tempUppercaseTranslit; |
|
1578 delete tempTitlecaseTranslit; |
|
1579 delete tempUnicodeTranslit; |
|
1580 delete tempNameUnicodeTranslit; |
|
1581 #if !UCONFIG_NO_BREAK_ITERATION |
|
1582 delete tempBreakTranslit; |
|
1583 #endif |
|
1584 // Since there was an error, remove registry |
|
1585 delete registry; |
|
1586 registry = NULL; |
|
1587 |
|
1588 status = U_MEMORY_ALLOCATION_ERROR; |
|
1589 return 0; |
|
1590 } |
|
1591 |
|
1592 registry->put(tempNullTranslit, TRUE, status); |
|
1593 registry->put(tempLowercaseTranslit, TRUE, status); |
|
1594 registry->put(tempUppercaseTranslit, TRUE, status); |
|
1595 registry->put(tempTitlecaseTranslit, TRUE, status); |
|
1596 registry->put(tempUnicodeTranslit, TRUE, status); |
|
1597 registry->put(tempNameUnicodeTranslit, TRUE, status); |
|
1598 #if !UCONFIG_NO_BREAK_ITERATION |
|
1599 registry->put(tempBreakTranslit, FALSE, status); // FALSE means invisible. |
|
1600 #endif |
|
1601 |
|
1602 RemoveTransliterator::registerIDs(); // Must be within mutex |
|
1603 EscapeTransliterator::registerIDs(); |
|
1604 UnescapeTransliterator::registerIDs(); |
|
1605 NormalizationTransliterator::registerIDs(); |
|
1606 AnyTransliterator::registerIDs(); |
|
1607 |
|
1608 _registerSpecialInverse(UNICODE_STRING_SIMPLE("Null"), |
|
1609 UNICODE_STRING_SIMPLE("Null"), FALSE); |
|
1610 _registerSpecialInverse(UNICODE_STRING_SIMPLE("Upper"), |
|
1611 UNICODE_STRING_SIMPLE("Lower"), TRUE); |
|
1612 _registerSpecialInverse(UNICODE_STRING_SIMPLE("Title"), |
|
1613 UNICODE_STRING_SIMPLE("Lower"), FALSE); |
|
1614 |
|
1615 ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup); |
|
1616 |
|
1617 return TRUE; |
|
1618 } |
|
1619 |
|
1620 U_NAMESPACE_END |
|
1621 |
|
1622 // Defined in ucln_in.h: |
|
1623 |
|
1624 /** |
|
1625 * Release all static memory held by transliterator. This will |
|
1626 * necessarily invalidate any rule-based transliterators held by the |
|
1627 * user, because RBTs hold pointers to common data objects. |
|
1628 */ |
|
1629 U_CFUNC UBool utrans_transliterator_cleanup(void) { |
|
1630 U_NAMESPACE_USE |
|
1631 TransliteratorIDParser::cleanup(); |
|
1632 if (registry) { |
|
1633 delete registry; |
|
1634 registry = NULL; |
|
1635 } |
|
1636 return TRUE; |
|
1637 } |
|
1638 |
|
1639 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
|
1640 |
|
1641 //eof |