|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (c) 2001-2011, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 * Date Name Description |
|
7 * 11/19/2001 aliu Creation. |
|
8 ********************************************************************** |
|
9 */ |
|
10 |
|
11 #include "unicode/utypes.h" |
|
12 |
|
13 #if !UCONFIG_NO_TRANSLITERATION |
|
14 |
|
15 #include "unicode/uchar.h" |
|
16 #include "unicode/utf16.h" |
|
17 #include "unesctrn.h" |
|
18 #include "util.h" |
|
19 |
|
20 #include "cmemory.h" |
|
21 |
|
22 U_NAMESPACE_BEGIN |
|
23 |
|
24 /** |
|
25 * Special character marking the end of the spec[] array. |
|
26 */ |
|
27 static const UChar END = 0xFFFF; |
|
28 |
|
29 // Unicode: "U+10FFFF" hex, min=4, max=6 |
|
30 static const UChar SPEC_Unicode[] = { |
|
31 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, |
|
32 END |
|
33 }; |
|
34 |
|
35 // Java: "\\uFFFF" hex, min=4, max=4 |
|
36 static const UChar SPEC_Java[] = { |
|
37 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, |
|
38 END |
|
39 }; |
|
40 |
|
41 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8 |
|
42 static const UChar SPEC_C[] = { |
|
43 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, |
|
44 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, |
|
45 END |
|
46 }; |
|
47 |
|
48 // XML: "" hex, min=1, max=6 |
|
49 static const UChar SPEC_XML[] = { |
|
50 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, |
|
51 END |
|
52 }; |
|
53 |
|
54 // XML10: "" dec, min=1, max=7 (not really "Hex-Any") |
|
55 static const UChar SPEC_XML10[] = { |
|
56 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, |
|
57 END |
|
58 }; |
|
59 |
|
60 // Perl: "\\x{263A}" hex, min=1, max=6 |
|
61 static const UChar SPEC_Perl[] = { |
|
62 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, |
|
63 END |
|
64 }; |
|
65 |
|
66 // All: Java, C, Perl, XML, XML10, Unicode |
|
67 static const UChar SPEC_Any[] = { |
|
68 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode |
|
69 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java |
|
70 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates) |
|
71 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML |
|
72 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10 |
|
73 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl |
|
74 END |
|
75 }; |
|
76 |
|
77 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator) |
|
78 |
|
79 static UChar* copySpec(const UChar* spec) { |
|
80 int32_t len = 0; |
|
81 while (spec[len] != END) { |
|
82 ++len; |
|
83 } |
|
84 ++len; |
|
85 UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar)); |
|
86 // Check for memory allocation error. |
|
87 if (result != NULL) { |
|
88 uprv_memcpy(result, spec, len*sizeof(result[0])); |
|
89 } |
|
90 return result; |
|
91 } |
|
92 |
|
93 /** |
|
94 * Factory methods. Ignore the context. |
|
95 */ |
|
96 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) { |
|
97 return new UnescapeTransliterator(ID, SPEC_Unicode); |
|
98 } |
|
99 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) { |
|
100 return new UnescapeTransliterator(ID, SPEC_Java); |
|
101 } |
|
102 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) { |
|
103 return new UnescapeTransliterator(ID, SPEC_C); |
|
104 } |
|
105 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) { |
|
106 return new UnescapeTransliterator(ID, SPEC_XML); |
|
107 } |
|
108 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) { |
|
109 return new UnescapeTransliterator(ID, SPEC_XML10); |
|
110 } |
|
111 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) { |
|
112 return new UnescapeTransliterator(ID, SPEC_Perl); |
|
113 } |
|
114 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) { |
|
115 return new UnescapeTransliterator(ID, SPEC_Any); |
|
116 } |
|
117 |
|
118 /** |
|
119 * Registers standard variants with the system. Called by |
|
120 * Transliterator during initialization. |
|
121 */ |
|
122 void UnescapeTransliterator::registerIDs() { |
|
123 Token t = integerToken(0); |
|
124 |
|
125 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t); |
|
126 |
|
127 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t); |
|
128 |
|
129 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t); |
|
130 |
|
131 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t); |
|
132 |
|
133 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t); |
|
134 |
|
135 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t); |
|
136 |
|
137 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t); |
|
138 } |
|
139 |
|
140 /** |
|
141 * Constructor. Takes the encoded spec array. |
|
142 */ |
|
143 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID, |
|
144 const UChar *newSpec) : |
|
145 Transliterator(newID, NULL) |
|
146 { |
|
147 this->spec = copySpec(newSpec); |
|
148 } |
|
149 |
|
150 /** |
|
151 * Copy constructor. |
|
152 */ |
|
153 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) : |
|
154 Transliterator(o) { |
|
155 this->spec = copySpec(o.spec); |
|
156 } |
|
157 |
|
158 UnescapeTransliterator::~UnescapeTransliterator() { |
|
159 uprv_free(spec); |
|
160 } |
|
161 |
|
162 /** |
|
163 * Transliterator API. |
|
164 */ |
|
165 Transliterator* UnescapeTransliterator::clone() const { |
|
166 return new UnescapeTransliterator(*this); |
|
167 } |
|
168 |
|
169 /** |
|
170 * Implements {@link Transliterator#handleTransliterate}. |
|
171 */ |
|
172 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, |
|
173 UBool isIncremental) const { |
|
174 int32_t start = pos.start; |
|
175 int32_t limit = pos.limit; |
|
176 int32_t i, j, ipat; |
|
177 |
|
178 while (start < limit) { |
|
179 // Loop over the forms in spec[]. Exit this loop when we |
|
180 // match one of the specs. Exit the outer loop if a |
|
181 // partial match is detected and isIncremental is true. |
|
182 for (j=0, ipat=0; spec[ipat] != END; ++j) { |
|
183 |
|
184 // Read the header |
|
185 int32_t prefixLen = spec[ipat++]; |
|
186 int32_t suffixLen = spec[ipat++]; |
|
187 int8_t radix = (int8_t) spec[ipat++]; |
|
188 int32_t minDigits = spec[ipat++]; |
|
189 int32_t maxDigits = spec[ipat++]; |
|
190 |
|
191 // s is a copy of start that is advanced over the |
|
192 // characters as we parse them. |
|
193 int32_t s = start; |
|
194 UBool match = TRUE; |
|
195 |
|
196 for (i=0; i<prefixLen; ++i) { |
|
197 if (s >= limit) { |
|
198 if (i > 0) { |
|
199 // We've already matched a character. This is |
|
200 // a partial match, so we return if in |
|
201 // incremental mode. In non-incremental mode, |
|
202 // go to the next spec. |
|
203 if (isIncremental) { |
|
204 goto exit; |
|
205 } |
|
206 match = FALSE; |
|
207 break; |
|
208 } |
|
209 } |
|
210 UChar c = text.charAt(s++); |
|
211 if (c != spec[ipat + i]) { |
|
212 match = FALSE; |
|
213 break; |
|
214 } |
|
215 } |
|
216 |
|
217 if (match) { |
|
218 UChar32 u = 0; |
|
219 int32_t digitCount = 0; |
|
220 for (;;) { |
|
221 if (s >= limit) { |
|
222 // Check for partial match in incremental mode. |
|
223 if (s > start && isIncremental) { |
|
224 goto exit; |
|
225 } |
|
226 break; |
|
227 } |
|
228 UChar32 ch = text.char32At(s); |
|
229 int32_t digit = u_digit(ch, radix); |
|
230 if (digit < 0) { |
|
231 break; |
|
232 } |
|
233 s += U16_LENGTH(ch); |
|
234 u = (u * radix) + digit; |
|
235 if (++digitCount == maxDigits) { |
|
236 break; |
|
237 } |
|
238 } |
|
239 |
|
240 match = (digitCount >= minDigits); |
|
241 |
|
242 if (match) { |
|
243 for (i=0; i<suffixLen; ++i) { |
|
244 if (s >= limit) { |
|
245 // Check for partial match in incremental mode. |
|
246 if (s > start && isIncremental) { |
|
247 goto exit; |
|
248 } |
|
249 match = FALSE; |
|
250 break; |
|
251 } |
|
252 UChar c = text.charAt(s++); |
|
253 if (c != spec[ipat + prefixLen + i]) { |
|
254 match = FALSE; |
|
255 break; |
|
256 } |
|
257 } |
|
258 |
|
259 if (match) { |
|
260 // At this point, we have a match |
|
261 UnicodeString str(u); |
|
262 text.handleReplaceBetween(start, s, str); |
|
263 limit -= s - start - str.length(); |
|
264 // The following break statement leaves the |
|
265 // loop that is traversing the forms in |
|
266 // spec[]. We then parse the next input |
|
267 // character. |
|
268 break; |
|
269 } |
|
270 } |
|
271 } |
|
272 |
|
273 ipat += prefixLen + suffixLen; |
|
274 } |
|
275 |
|
276 if (start < limit) { |
|
277 start += U16_LENGTH(text.char32At(start)); |
|
278 } |
|
279 } |
|
280 |
|
281 exit: |
|
282 pos.contextLimit += limit - pos.limit; |
|
283 pos.limit = limit; |
|
284 pos.start = start; |
|
285 } |
|
286 |
|
287 U_NAMESPACE_END |
|
288 |
|
289 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
|
290 |
|
291 //eof |