|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (c) 2002-2012, International Business Machines Corporation |
|
4 * and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 * Date Name Description |
|
7 * 01/21/2002 aliu Creation. |
|
8 ********************************************************************** |
|
9 */ |
|
10 |
|
11 #include "unicode/utypes.h" |
|
12 |
|
13 #if !UCONFIG_NO_TRANSLITERATION |
|
14 |
|
15 #include "unicode/uniset.h" |
|
16 #include "unicode/utf16.h" |
|
17 #include "strrepl.h" |
|
18 #include "rbt_data.h" |
|
19 #include "util.h" |
|
20 |
|
21 U_NAMESPACE_BEGIN |
|
22 |
|
// Out-of-line definition of the UnicodeReplacer destructor; it has an
// empty body.
UnicodeReplacer::~UnicodeReplacer() {}

// Invokes UOBJECT_DEFINE_RTTI_IMPLEMENTATION (a macro defined outside this
// file) for StringReplacer.  NOTE(review): presumably this emits the class's
// RTTI member definitions -- confirm against the macro's definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
|
25 |
|
26 /** |
|
27 * Construct a StringReplacer that sets the emits the given output |
|
28 * text and sets the cursor to the given position. |
|
29 * @param theOutput text that will replace input text when the |
|
30 * replace() method is called. May contain stand-in characters |
|
31 * that represent nested replacers. |
|
32 * @param theCursorPos cursor position that will be returned by |
|
33 * the replace() method |
|
34 * @param theData transliterator context object that translates |
|
35 * stand-in characters to UnicodeReplacer objects |
|
36 */ |
|
37 StringReplacer::StringReplacer(const UnicodeString& theOutput, |
|
38 int32_t theCursorPos, |
|
39 const TransliterationRuleData* theData) { |
|
40 output = theOutput; |
|
41 cursorPos = theCursorPos; |
|
42 hasCursor = TRUE; |
|
43 data = theData; |
|
44 isComplex = TRUE; |
|
45 } |
|
46 |
|
47 /** |
|
48 * Construct a StringReplacer that sets the emits the given output |
|
49 * text and does not modify the cursor. |
|
50 * @param theOutput text that will replace input text when the |
|
51 * replace() method is called. May contain stand-in characters |
|
52 * that represent nested replacers. |
|
53 * @param theData transliterator context object that translates |
|
54 * stand-in characters to UnicodeReplacer objects |
|
55 */ |
|
56 StringReplacer::StringReplacer(const UnicodeString& theOutput, |
|
57 const TransliterationRuleData* theData) { |
|
58 output = theOutput; |
|
59 cursorPos = 0; |
|
60 hasCursor = FALSE; |
|
61 data = theData; |
|
62 isComplex = TRUE; |
|
63 } |
|
64 |
|
65 /** |
|
66 * Copy constructor. |
|
67 */ |
|
68 StringReplacer::StringReplacer(const StringReplacer& other) : |
|
69 UnicodeFunctor(other), |
|
70 UnicodeReplacer(other) |
|
71 { |
|
72 output = other.output; |
|
73 cursorPos = other.cursorPos; |
|
74 hasCursor = other.hasCursor; |
|
75 data = other.data; |
|
76 isComplex = other.isComplex; |
|
77 } |
|
78 |
|
79 /** |
|
80 * Destructor |
|
81 */ |
|
82 StringReplacer::~StringReplacer() { |
|
83 } |
|
84 |
|
85 /** |
|
86 * Implement UnicodeFunctor |
|
87 */ |
|
88 UnicodeFunctor* StringReplacer::clone() const { |
|
89 return new StringReplacer(*this); |
|
90 } |
|
91 |
|
92 /** |
|
93 * Implement UnicodeFunctor |
|
94 */ |
|
95 UnicodeReplacer* StringReplacer::toReplacer() const { |
|
96 return const_cast<StringReplacer *>(this); |
|
97 } |
|
98 |
|
99 /** |
|
100 * UnicodeReplacer API |
|
101 */ |
|
102 int32_t StringReplacer::replace(Replaceable& text, |
|
103 int32_t start, |
|
104 int32_t limit, |
|
105 int32_t& cursor) { |
|
106 int32_t outLen; |
|
107 int32_t newStart = 0; |
|
108 |
|
109 // NOTE: It should be possible to _always_ run the complex |
|
110 // processing code; just slower. If not, then there is a bug |
|
111 // in the complex processing code. |
|
112 |
|
113 // Simple (no nested replacers) Processing Code : |
|
114 if (!isComplex) { |
|
115 text.handleReplaceBetween(start, limit, output); |
|
116 outLen = output.length(); |
|
117 |
|
118 // Setup default cursor position (for cursorPos within output) |
|
119 newStart = cursorPos; |
|
120 } |
|
121 |
|
122 // Complex (nested replacers) Processing Code : |
|
123 else { |
|
124 /* When there are segments to be copied, use the Replaceable.copy() |
|
125 * API in order to retain out-of-band data. Copy everything to the |
|
126 * end of the string, then copy them back over the key. This preserves |
|
127 * the integrity of indices into the key and surrounding context while |
|
128 * generating the output text. |
|
129 */ |
|
130 UnicodeString buf; |
|
131 int32_t oOutput; // offset into 'output' |
|
132 isComplex = FALSE; |
|
133 |
|
134 // The temporary buffer starts at tempStart, and extends |
|
135 // to destLimit. The start of the buffer has a single |
|
136 // character from before the key. This provides style |
|
137 // data when addition characters are filled into the |
|
138 // temporary buffer. If there is nothing to the left, use |
|
139 // the non-character U+FFFF, which Replaceable subclasses |
|
140 // should treat specially as a "no-style character." |
|
141 // destStart points to the point after the style context |
|
142 // character, so it is tempStart+1 or tempStart+2. |
|
143 int32_t tempStart = text.length(); // start of temp buffer |
|
144 int32_t destStart = tempStart; // copy new text to here |
|
145 if (start > 0) { |
|
146 int32_t len = U16_LENGTH(text.char32At(start-1)); |
|
147 text.copy(start-len, start, tempStart); |
|
148 destStart += len; |
|
149 } else { |
|
150 UnicodeString str((UChar) 0xFFFF); |
|
151 text.handleReplaceBetween(tempStart, tempStart, str); |
|
152 destStart++; |
|
153 } |
|
154 int32_t destLimit = destStart; |
|
155 |
|
156 for (oOutput=0; oOutput<output.length(); ) { |
|
157 if (oOutput == cursorPos) { |
|
158 // Record the position of the cursor |
|
159 newStart = destLimit - destStart; // relative to start |
|
160 } |
|
161 UChar32 c = output.char32At(oOutput); |
|
162 UnicodeReplacer* r = data->lookupReplacer(c); |
|
163 if (r == NULL) { |
|
164 // Accumulate straight (non-segment) text. |
|
165 buf.append(c); |
|
166 } else { |
|
167 isComplex = TRUE; |
|
168 |
|
169 // Insert any accumulated straight text. |
|
170 if (buf.length() > 0) { |
|
171 text.handleReplaceBetween(destLimit, destLimit, buf); |
|
172 destLimit += buf.length(); |
|
173 buf.truncate(0); |
|
174 } |
|
175 |
|
176 // Delegate output generation to replacer object |
|
177 int32_t len = r->replace(text, destLimit, destLimit, cursor); |
|
178 destLimit += len; |
|
179 } |
|
180 oOutput += U16_LENGTH(c); |
|
181 } |
|
182 // Insert any accumulated straight text. |
|
183 if (buf.length() > 0) { |
|
184 text.handleReplaceBetween(destLimit, destLimit, buf); |
|
185 destLimit += buf.length(); |
|
186 } |
|
187 if (oOutput == cursorPos) { |
|
188 // Record the position of the cursor |
|
189 newStart = destLimit - destStart; // relative to start |
|
190 } |
|
191 |
|
192 outLen = destLimit - destStart; |
|
193 |
|
194 // Copy new text to start, and delete it |
|
195 text.copy(destStart, destLimit, start); |
|
196 text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString()); |
|
197 |
|
198 // Delete the old text (the key) |
|
199 text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString()); |
|
200 } |
|
201 |
|
202 if (hasCursor) { |
|
203 // Adjust the cursor for positions outside the key. These |
|
204 // refer to code points rather than code units. If cursorPos |
|
205 // is within the output string, then use newStart, which has |
|
206 // already been set above. |
|
207 if (cursorPos < 0) { |
|
208 newStart = start; |
|
209 int32_t n = cursorPos; |
|
210 // Outside the output string, cursorPos counts code points |
|
211 while (n < 0 && newStart > 0) { |
|
212 newStart -= U16_LENGTH(text.char32At(newStart-1)); |
|
213 ++n; |
|
214 } |
|
215 newStart += n; |
|
216 } else if (cursorPos > output.length()) { |
|
217 newStart = start + outLen; |
|
218 int32_t n = cursorPos - output.length(); |
|
219 // Outside the output string, cursorPos counts code points |
|
220 while (n > 0 && newStart < text.length()) { |
|
221 newStart += U16_LENGTH(text.char32At(newStart)); |
|
222 --n; |
|
223 } |
|
224 newStart += n; |
|
225 } else { |
|
226 // Cursor is within output string. It has been set up above |
|
227 // to be relative to start. |
|
228 newStart += start; |
|
229 } |
|
230 |
|
231 cursor = newStart; |
|
232 } |
|
233 |
|
234 return outLen; |
|
235 } |
|
236 |
|
237 /** |
|
238 * UnicodeReplacer API |
|
239 */ |
|
240 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule, |
|
241 UBool escapeUnprintable) const { |
|
242 rule.truncate(0); |
|
243 UnicodeString quoteBuf; |
|
244 |
|
245 int32_t cursor = cursorPos; |
|
246 |
|
247 // Handle a cursor preceding the output |
|
248 if (hasCursor && cursor < 0) { |
|
249 while (cursor++ < 0) { |
|
250 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); |
|
251 } |
|
252 // Fall through and append '|' below |
|
253 } |
|
254 |
|
255 for (int32_t i=0; i<output.length(); ++i) { |
|
256 if (hasCursor && i == cursor) { |
|
257 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); |
|
258 } |
|
259 UChar c = output.charAt(i); // Ok to use 16-bits here |
|
260 |
|
261 UnicodeReplacer* r = data->lookupReplacer(c); |
|
262 if (r == NULL) { |
|
263 ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf); |
|
264 } else { |
|
265 UnicodeString buf; |
|
266 r->toReplacerPattern(buf, escapeUnprintable); |
|
267 buf.insert(0, (UChar)0x20); |
|
268 buf.append((UChar)0x20); |
|
269 ICU_Utility::appendToRule(rule, buf, |
|
270 TRUE, escapeUnprintable, quoteBuf); |
|
271 } |
|
272 } |
|
273 |
|
274 // Handle a cursor after the output. Use > rather than >= because |
|
275 // if cursor == output.length() it is at the end of the output, |
|
276 // which is the default position, so we need not emit it. |
|
277 if (hasCursor && cursor > output.length()) { |
|
278 cursor -= output.length(); |
|
279 while (cursor-- > 0) { |
|
280 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); |
|
281 } |
|
282 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); |
|
283 } |
|
284 // Flush quoteBuf out to result |
|
285 ICU_Utility::appendToRule(rule, -1, |
|
286 TRUE, escapeUnprintable, quoteBuf); |
|
287 |
|
288 return rule; |
|
289 } |
|
290 |
|
291 /** |
|
292 * Implement UnicodeReplacer |
|
293 */ |
|
294 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { |
|
295 UChar32 ch; |
|
296 for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) { |
|
297 ch = output.char32At(i); |
|
298 UnicodeReplacer* r = data->lookupReplacer(ch); |
|
299 if (r == NULL) { |
|
300 toUnionTo.add(ch); |
|
301 } else { |
|
302 r->addReplacementSetTo(toUnionTo); |
|
303 } |
|
304 } |
|
305 } |
|
306 |
|
307 /** |
|
308 * UnicodeFunctor API |
|
309 */ |
|
310 void StringReplacer::setData(const TransliterationRuleData* d) { |
|
311 data = d; |
|
312 int32_t i = 0; |
|
313 while (i<output.length()) { |
|
314 UChar32 c = output.char32At(i); |
|
315 UnicodeFunctor* f = data->lookup(c); |
|
316 if (f != NULL) { |
|
317 f->setData(data); |
|
318 } |
|
319 i += U16_LENGTH(c); |
|
320 } |
|
321 } |
|
322 |
|
323 U_NAMESPACE_END |
|
324 |
|
325 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
|
326 |
|
327 //eof |