|
1 /* |
|
2 ***************************************************************** |
|
3 * Copyright (c) 2002-2011, International Business Machines Corporation |
|
4 * and others. All Rights Reserved. |
|
5 ***************************************************************** |
|
6 * Date Name Description |
|
7 * 06/06/2002 aliu Creation. |
|
8 ***************************************************************** |
|
9 */ |
|
10 |
|
11 #include "unicode/utypes.h" |
|
12 |
|
13 #if !UCONFIG_NO_TRANSLITERATION |
|
14 |
|
15 #include "unicode/uobject.h" |
|
16 #include "unicode/uscript.h" |
|
17 #include "nultrans.h" |
|
18 #include "anytrans.h" |
|
19 #include "uvector.h" |
|
20 #include "tridpars.h" |
|
21 #include "hash.h" |
|
22 #include "putilimp.h" |
|
23 #include "uinvchar.h" |
|
24 |
|
25 //------------------------------------------------------------ |
|
26 // Constants |
|
27 |
|
28 static const UChar TARGET_SEP = 45; // '-' |
|
29 static const UChar VARIANT_SEP = 47; // '/' |
|
30 static const UChar ANY[] = {65,110,121,0}; // "Any" |
|
31 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null" |
|
32 static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-" |
|
33 |
|
34 //------------------------------------------------------------ |
|
35 |
|
36 U_CDECL_BEGIN |
|
37 /** |
|
38 * Deleter function for Transliterator*. |
|
39 */ |
|
40 static void U_CALLCONV |
|
41 _deleteTransliterator(void *obj) { |
|
42 delete (icu::Transliterator*) obj; |
|
43 } |
|
44 U_CDECL_END |
|
45 |
|
46 //------------------------------------------------------------ |
|
47 |
|
48 U_NAMESPACE_BEGIN |
|
49 |
|
50 //------------------------------------------------------------ |
|
51 // ScriptRunIterator |
|
52 |
|
53 /** |
|
54 * Returns a series of ranges corresponding to scripts. They will be |
|
55 * of the form: |
|
56 * |
|
57 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second |
|
58 * | | - first run (start, limit) |
|
59 * | | - second run (start, limit) |
|
60 * |
|
61 * That is, the runs will overlap. The reason for this is so that a |
|
62 * transliterator can consider common characters both before and after |
|
63 * the scripts. |
|
64 */ |
|
65 class ScriptRunIterator : public UMemory { |
|
66 private: |
|
67 const Replaceable& text; |
|
68 int32_t textStart; |
|
69 int32_t textLimit; |
|
70 |
|
71 public: |
|
72 /** |
|
73 * The code of the current run, valid after next() returns. May |
|
74 * be USCRIPT_INVALID_CODE if and only if the entire text is |
|
75 * COMMON/INHERITED. |
|
76 */ |
|
77 UScriptCode scriptCode; |
|
78 |
|
79 /** |
|
80 * The start of the run, inclusive, valid after next() returns. |
|
81 */ |
|
82 int32_t start; |
|
83 |
|
84 /** |
|
85 * The end of the run, exclusive, valid after next() returns. |
|
86 */ |
|
87 int32_t limit; |
|
88 |
|
89 /** |
|
90 * Constructs a run iterator over the given text from start |
|
91 * (inclusive) to limit (exclusive). |
|
92 */ |
|
93 ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit); |
|
94 |
|
95 /** |
|
96 * Returns TRUE if there are any more runs. TRUE is always |
|
97 * returned at least once. Upon return, the caller should |
|
98 * examine scriptCode, start, and limit. |
|
99 */ |
|
100 UBool next(); |
|
101 |
|
102 /** |
|
103 * Adjusts internal indices for a change in the limit index of the |
|
104 * given delta. A positive delta means the limit has increased. |
|
105 */ |
|
106 void adjustLimit(int32_t delta); |
|
107 |
|
108 private: |
|
109 ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class |
|
110 ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class |
|
111 }; |
|
112 |
|
113 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText, |
|
114 int32_t myStart, int32_t myLimit) : |
|
115 text(theText) |
|
116 { |
|
117 textStart = myStart; |
|
118 textLimit = myLimit; |
|
119 limit = myStart; |
|
120 } |
|
121 |
|
122 UBool ScriptRunIterator::next() { |
|
123 UChar32 ch; |
|
124 UScriptCode s; |
|
125 UErrorCode ec = U_ZERO_ERROR; |
|
126 |
|
127 scriptCode = USCRIPT_INVALID_CODE; // don't know script yet |
|
128 start = limit; |
|
129 |
|
130 // Are we done? |
|
131 if (start == textLimit) { |
|
132 return FALSE; |
|
133 } |
|
134 |
|
135 // Move start back to include adjacent COMMON or INHERITED |
|
136 // characters |
|
137 while (start > textStart) { |
|
138 ch = text.char32At(start - 1); // look back |
|
139 s = uscript_getScript(ch, &ec); |
|
140 if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) { |
|
141 --start; |
|
142 } else { |
|
143 break; |
|
144 } |
|
145 } |
|
146 |
|
147 // Move limit ahead to include COMMON, INHERITED, and characters |
|
148 // of the current script. |
|
149 while (limit < textLimit) { |
|
150 ch = text.char32At(limit); // look ahead |
|
151 s = uscript_getScript(ch, &ec); |
|
152 if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) { |
|
153 if (scriptCode == USCRIPT_INVALID_CODE) { |
|
154 scriptCode = s; |
|
155 } else if (s != scriptCode) { |
|
156 break; |
|
157 } |
|
158 } |
|
159 ++limit; |
|
160 } |
|
161 |
|
162 // Return TRUE even if the entire text is COMMON / INHERITED, in |
|
163 // which case scriptCode will be USCRIPT_INVALID_CODE. |
|
164 return TRUE; |
|
165 } |
|
166 |
|
167 void ScriptRunIterator::adjustLimit(int32_t delta) { |
|
168 limit += delta; |
|
169 textLimit += delta; |
|
170 } |
|
171 |
|
172 //------------------------------------------------------------ |
|
173 // AnyTransliterator |
|
174 |
|
175 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator) |
|
176 |
|
177 AnyTransliterator::AnyTransliterator(const UnicodeString& id, |
|
178 const UnicodeString& theTarget, |
|
179 const UnicodeString& theVariant, |
|
180 UScriptCode theTargetScript, |
|
181 UErrorCode& ec) : |
|
182 Transliterator(id, NULL), |
|
183 targetScript(theTargetScript) |
|
184 { |
|
185 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec); |
|
186 if (U_FAILURE(ec)) { |
|
187 return; |
|
188 } |
|
189 uhash_setValueDeleter(cache, _deleteTransliterator); |
|
190 |
|
191 target = theTarget; |
|
192 if (theVariant.length() > 0) { |
|
193 target.append(VARIANT_SEP).append(theVariant); |
|
194 } |
|
195 } |
|
196 |
|
197 AnyTransliterator::~AnyTransliterator() { |
|
198 uhash_close(cache); |
|
199 } |
|
200 |
|
201 /** |
|
202 * Copy constructor. |
|
203 */ |
|
204 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) : |
|
205 Transliterator(o), |
|
206 target(o.target), |
|
207 targetScript(o.targetScript) |
|
208 { |
|
209 // Don't copy the cache contents |
|
210 UErrorCode ec = U_ZERO_ERROR; |
|
211 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec); |
|
212 if (U_FAILURE(ec)) { |
|
213 return; |
|
214 } |
|
215 uhash_setValueDeleter(cache, _deleteTransliterator); |
|
216 } |
|
217 |
|
218 /** |
|
219 * Transliterator API. |
|
220 */ |
|
221 Transliterator* AnyTransliterator::clone() const { |
|
222 return new AnyTransliterator(*this); |
|
223 } |
|
224 |
|
225 /** |
|
226 * Implements {@link Transliterator#handleTransliterate}. |
|
227 */ |
|
228 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, |
|
229 UBool isIncremental) const { |
|
230 int32_t allStart = pos.start; |
|
231 int32_t allLimit = pos.limit; |
|
232 |
|
233 ScriptRunIterator it(text, pos.contextStart, pos.contextLimit); |
|
234 |
|
235 while (it.next()) { |
|
236 // Ignore runs in the ante context |
|
237 if (it.limit <= allStart) continue; |
|
238 |
|
239 // Try to instantiate transliterator from it.scriptCode to |
|
240 // our target or target/variant |
|
241 Transliterator* t = getTransliterator(it.scriptCode); |
|
242 |
|
243 if (t == NULL) { |
|
244 // We have no transliterator. Do nothing, but keep |
|
245 // pos.start up to date. |
|
246 pos.start = it.limit; |
|
247 continue; |
|
248 } |
|
249 |
|
250 // If the run end is before the transliteration limit, do |
|
251 // a non-incremental transliteration. Otherwise do an |
|
252 // incremental one. |
|
253 UBool incremental = isIncremental && (it.limit >= allLimit); |
|
254 |
|
255 pos.start = uprv_max(allStart, it.start); |
|
256 pos.limit = uprv_min(allLimit, it.limit); |
|
257 int32_t limit = pos.limit; |
|
258 t->filteredTransliterate(text, pos, incremental); |
|
259 int32_t delta = pos.limit - limit; |
|
260 allLimit += delta; |
|
261 it.adjustLimit(delta); |
|
262 |
|
263 // We're done if we enter the post context |
|
264 if (it.limit >= allLimit) break; |
|
265 } |
|
266 |
|
267 // Restore limit. pos.start is fine where the last transliterator |
|
268 // left it, or at the end of the last run. |
|
269 pos.limit = allLimit; |
|
270 } |
|
271 |
|
272 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const { |
|
273 |
|
274 if (source == targetScript || source == USCRIPT_INVALID_CODE) { |
|
275 return NULL; |
|
276 } |
|
277 |
|
278 Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source); |
|
279 if (t == NULL) { |
|
280 UErrorCode ec = U_ZERO_ERROR; |
|
281 UnicodeString sourceName(uscript_getName(source), -1, US_INV); |
|
282 UnicodeString id(sourceName); |
|
283 id.append(TARGET_SEP).append(target); |
|
284 |
|
285 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); |
|
286 if (U_FAILURE(ec) || t == NULL) { |
|
287 delete t; |
|
288 |
|
289 // Try to pivot around Latin, our most common script |
|
290 id = sourceName; |
|
291 id.append(LATIN_PIVOT, -1).append(target); |
|
292 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); |
|
293 if (U_FAILURE(ec) || t == NULL) { |
|
294 delete t; |
|
295 t = NULL; |
|
296 } |
|
297 } |
|
298 |
|
299 if (t != NULL) { |
|
300 uhash_iput(cache, (int32_t) source, t, &ec); |
|
301 } |
|
302 } |
|
303 |
|
304 return t; |
|
305 } |
|
306 |
|
307 /** |
|
308 * Return the script code for a given name, or -1 if not found. |
|
309 */ |
|
310 static UScriptCode scriptNameToCode(const UnicodeString& name) { |
|
311 char buf[128]; |
|
312 UScriptCode code; |
|
313 UErrorCode ec = U_ZERO_ERROR; |
|
314 int32_t nameLen = name.length(); |
|
315 UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen); |
|
316 |
|
317 if (isInvariant) { |
|
318 name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV); |
|
319 buf[127] = 0; // Make sure that we NULL terminate the string. |
|
320 } |
|
321 if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec)) |
|
322 { |
|
323 code = USCRIPT_INVALID_CODE; |
|
324 } |
|
325 return code; |
|
326 } |
|
327 |
|
328 /** |
|
329 * Registers standard transliterators with the system. Called by |
|
330 * Transliterator during initialization. Scan all current targets and |
|
331 * register those that are scripts T as Any-T/V. |
|
332 */ |
|
333 void AnyTransliterator::registerIDs() { |
|
334 |
|
335 UErrorCode ec = U_ZERO_ERROR; |
|
336 Hashtable seen(TRUE, ec); |
|
337 |
|
338 int32_t sourceCount = Transliterator::_countAvailableSources(); |
|
339 for (int32_t s=0; s<sourceCount; ++s) { |
|
340 UnicodeString source; |
|
341 Transliterator::_getAvailableSource(s, source); |
|
342 |
|
343 // Ignore the "Any" source |
|
344 if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue; |
|
345 |
|
346 int32_t targetCount = Transliterator::_countAvailableTargets(source); |
|
347 for (int32_t t=0; t<targetCount; ++t) { |
|
348 UnicodeString target; |
|
349 Transliterator::_getAvailableTarget(t, source, target); |
|
350 |
|
351 // Only process each target once |
|
352 if (seen.geti(target) != 0) continue; |
|
353 ec = U_ZERO_ERROR; |
|
354 seen.puti(target, 1, ec); |
|
355 |
|
356 // Get the script code for the target. If not a script, ignore. |
|
357 UScriptCode targetScript = scriptNameToCode(target); |
|
358 if (targetScript == USCRIPT_INVALID_CODE) continue; |
|
359 |
|
360 int32_t variantCount = Transliterator::_countAvailableVariants(source, target); |
|
361 // assert(variantCount >= 1); |
|
362 for (int32_t v=0; v<variantCount; ++v) { |
|
363 UnicodeString variant; |
|
364 Transliterator::_getAvailableVariant(v, source, target, variant); |
|
365 |
|
366 UnicodeString id; |
|
367 TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id); |
|
368 ec = U_ZERO_ERROR; |
|
369 AnyTransliterator* t = new AnyTransliterator(id, target, variant, |
|
370 targetScript, ec); |
|
371 if (U_FAILURE(ec)) { |
|
372 delete t; |
|
373 } else { |
|
374 Transliterator::_registerInstance(t); |
|
375 Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE); |
|
376 } |
|
377 } |
|
378 } |
|
379 } |
|
380 } |
|
381 |
|
382 U_NAMESPACE_END |
|
383 |
|
384 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
|
385 |
|
386 //eof |