| |
1 /* |
| |
2 ******************************************************************************* |
| |
3 * |
| |
4 * Copyright (C) 1999-2010, International Business Machines |
| |
5 * Corporation and others. All Rights Reserved. |
| |
6 * |
| |
7 ******************************************************************************* |
| |
8 * file name: unistr_cnv.cpp |
| |
9 * encoding: US-ASCII |
| |
10 * tab size: 8 (not used) |
| |
11 * indentation:2 |
| |
12 * |
| |
13 * created on: 2004aug19 |
| |
14 * created by: Markus W. Scherer |
| |
15 * |
| |
16 * Character conversion functions moved here from unistr.cpp |
| |
17 */ |
| |
18 |
| |
19 #include "unicode/utypes.h" |
| |
20 |
| |
21 #if !UCONFIG_NO_CONVERSION |
| |
22 |
| |
23 #include "unicode/putil.h" |
| |
24 #include "cstring.h" |
| |
25 #include "cmemory.h" |
| |
26 #include "unicode/ustring.h" |
| |
27 #include "unicode/unistr.h" |
| |
28 #include "unicode/ucnv.h" |
| |
29 #include "ucnv_imp.h" |
| |
30 #include "putilimp.h" |
| |
31 #include "ustr_cnv.h" |
| |
32 #include "ustr_imp.h" |
| |
33 |
| |
34 U_NAMESPACE_BEGIN |
| |
35 |
| |
36 //======================================== |
| |
37 // Constructors |
| |
38 //======================================== |
| |
39 |
| |
40 #if !U_CHARSET_IS_UTF8 |
| |
41 |
| |
42 UnicodeString::UnicodeString(const char *codepageData) |
| |
43 : fShortLength(0), |
| |
44 fFlags(kShortString) |
| |
45 { |
| |
46 if(codepageData != 0) { |
| |
47 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0); |
| |
48 } |
| |
49 } |
| |
50 |
| |
51 UnicodeString::UnicodeString(const char *codepageData, |
| |
52 int32_t dataLength) |
| |
53 : fShortLength(0), |
| |
54 fFlags(kShortString) |
| |
55 { |
| |
56 if(codepageData != 0) { |
| |
57 doCodepageCreate(codepageData, dataLength, 0); |
| |
58 } |
| |
59 } |
| |
60 |
| |
61 // else see unistr.cpp |
| |
62 #endif |
| |
63 |
| |
64 UnicodeString::UnicodeString(const char *codepageData, |
| |
65 const char *codepage) |
| |
66 : fShortLength(0), |
| |
67 fFlags(kShortString) |
| |
68 { |
| |
69 if(codepageData != 0) { |
| |
70 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage); |
| |
71 } |
| |
72 } |
| |
73 |
| |
74 UnicodeString::UnicodeString(const char *codepageData, |
| |
75 int32_t dataLength, |
| |
76 const char *codepage) |
| |
77 : fShortLength(0), |
| |
78 fFlags(kShortString) |
| |
79 { |
| |
80 if(codepageData != 0) { |
| |
81 doCodepageCreate(codepageData, dataLength, codepage); |
| |
82 } |
| |
83 } |
| |
84 |
| |
85 UnicodeString::UnicodeString(const char *src, int32_t srcLength, |
| |
86 UConverter *cnv, |
| |
87 UErrorCode &errorCode) |
| |
88 : fShortLength(0), |
| |
89 fFlags(kShortString) |
| |
90 { |
| |
91 if(U_SUCCESS(errorCode)) { |
| |
92 // check arguments |
| |
93 if(src==NULL) { |
| |
94 // treat as an empty string, do nothing more |
| |
95 } else if(srcLength<-1) { |
| |
96 errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| |
97 } else { |
| |
98 // get input length |
| |
99 if(srcLength==-1) { |
| |
100 srcLength=(int32_t)uprv_strlen(src); |
| |
101 } |
| |
102 if(srcLength>0) { |
| |
103 if(cnv!=0) { |
| |
104 // use the provided converter |
| |
105 ucnv_resetToUnicode(cnv); |
| |
106 doCodepageCreate(src, srcLength, cnv, errorCode); |
| |
107 } else { |
| |
108 // use the default converter |
| |
109 cnv=u_getDefaultConverter(&errorCode); |
| |
110 doCodepageCreate(src, srcLength, cnv, errorCode); |
| |
111 u_releaseDefaultConverter(cnv); |
| |
112 } |
| |
113 } |
| |
114 } |
| |
115 |
| |
116 if(U_FAILURE(errorCode)) { |
| |
117 setToBogus(); |
| |
118 } |
| |
119 } |
| |
120 } |
| |
121 |
| |
122 //======================================== |
| |
123 // Codeset conversion |
| |
124 //======================================== |
| |
125 |
| |
126 #if !U_CHARSET_IS_UTF8 |
| |
127 |
| |
128 int32_t |
| |
129 UnicodeString::extract(int32_t start, |
| |
130 int32_t length, |
| |
131 char *target, |
| |
132 uint32_t dstSize) const { |
| |
133 return extract(start, length, target, dstSize, 0); |
| |
134 } |
| |
135 |
| |
136 // else see unistr.cpp |
| |
137 #endif |
| |
138 |
| |
139 int32_t |
| |
140 UnicodeString::extract(int32_t start, |
| |
141 int32_t length, |
| |
142 char *target, |
| |
143 uint32_t dstSize, |
| |
144 const char *codepage) const |
| |
145 { |
| |
146 // if the arguments are illegal, then do nothing |
| |
147 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { |
| |
148 return 0; |
| |
149 } |
| |
150 |
| |
151 // pin the indices to legal values |
| |
152 pinIndices(start, length); |
| |
153 |
| |
154 // We need to cast dstSize to int32_t for all subsequent code. |
| |
155 // I don't know why the API was defined with uint32_t but we are stuck with it. |
| |
156 // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize |
| |
157 // as a limit in some functions, it may wrap around and yield a pointer |
| |
158 // that compares less-than target. |
| |
159 int32_t capacity; |
| |
160 if(dstSize < 0x7fffffff) { |
| |
161 // Assume that the capacity is real and a limit pointer won't wrap around. |
| |
162 capacity = (int32_t)dstSize; |
| |
163 } else { |
| |
164 // Pin the capacity so that a limit pointer does not wrap around. |
| |
165 char *targetLimit = (char *)U_MAX_PTR(target); |
| |
166 // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff |
| |
167 // greater than target and does not wrap around the top of the address space. |
| |
168 capacity = (int32_t)(targetLimit - target); |
| |
169 } |
| |
170 |
| |
171 // create the converter |
| |
172 UConverter *converter; |
| |
173 UErrorCode status = U_ZERO_ERROR; |
| |
174 |
| |
175 // just write the NUL if the string length is 0 |
| |
176 if(length == 0) { |
| |
177 return u_terminateChars(target, capacity, 0, &status); |
| |
178 } |
| |
179 |
| |
180 // if the codepage is the default, use our cache |
| |
181 // if it is an empty string, then use the "invariant character" conversion |
| |
182 if (codepage == 0) { |
| |
183 const char *defaultName = ucnv_getDefaultName(); |
| |
184 if(UCNV_FAST_IS_UTF8(defaultName)) { |
| |
185 return toUTF8(start, length, target, capacity); |
| |
186 } |
| |
187 converter = u_getDefaultConverter(&status); |
| |
188 } else if (*codepage == 0) { |
| |
189 // use the "invariant characters" conversion |
| |
190 int32_t destLength; |
| |
191 if(length <= capacity) { |
| |
192 destLength = length; |
| |
193 } else { |
| |
194 destLength = capacity; |
| |
195 } |
| |
196 u_UCharsToChars(getArrayStart() + start, target, destLength); |
| |
197 return u_terminateChars(target, capacity, length, &status); |
| |
198 } else { |
| |
199 converter = ucnv_open(codepage, &status); |
| |
200 } |
| |
201 |
| |
202 length = doExtract(start, length, target, capacity, converter, status); |
| |
203 |
| |
204 // close the converter |
| |
205 if (codepage == 0) { |
| |
206 u_releaseDefaultConverter(converter); |
| |
207 } else { |
| |
208 ucnv_close(converter); |
| |
209 } |
| |
210 |
| |
211 return length; |
| |
212 } |
| |
213 |
| |
214 int32_t |
| |
215 UnicodeString::extract(char *dest, int32_t destCapacity, |
| |
216 UConverter *cnv, |
| |
217 UErrorCode &errorCode) const |
| |
218 { |
| |
219 if(U_FAILURE(errorCode)) { |
| |
220 return 0; |
| |
221 } |
| |
222 |
| |
223 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { |
| |
224 errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| |
225 return 0; |
| |
226 } |
| |
227 |
| |
228 // nothing to do? |
| |
229 if(isEmpty()) { |
| |
230 return u_terminateChars(dest, destCapacity, 0, &errorCode); |
| |
231 } |
| |
232 |
| |
233 // get the converter |
| |
234 UBool isDefaultConverter; |
| |
235 if(cnv==0) { |
| |
236 isDefaultConverter=TRUE; |
| |
237 cnv=u_getDefaultConverter(&errorCode); |
| |
238 if(U_FAILURE(errorCode)) { |
| |
239 return 0; |
| |
240 } |
| |
241 } else { |
| |
242 isDefaultConverter=FALSE; |
| |
243 ucnv_resetFromUnicode(cnv); |
| |
244 } |
| |
245 |
| |
246 // convert |
| |
247 int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode); |
| |
248 |
| |
249 // release the converter |
| |
250 if(isDefaultConverter) { |
| |
251 u_releaseDefaultConverter(cnv); |
| |
252 } |
| |
253 |
| |
254 return len; |
| |
255 } |
| |
256 |
| |
257 int32_t |
| |
258 UnicodeString::doExtract(int32_t start, int32_t length, |
| |
259 char *dest, int32_t destCapacity, |
| |
260 UConverter *cnv, |
| |
261 UErrorCode &errorCode) const |
| |
262 { |
| |
263 if(U_FAILURE(errorCode)) { |
| |
264 if(destCapacity!=0) { |
| |
265 *dest=0; |
| |
266 } |
| |
267 return 0; |
| |
268 } |
| |
269 |
| |
270 const UChar *src=getArrayStart()+start, *srcLimit=src+length; |
| |
271 char *originalDest=dest; |
| |
272 const char *destLimit; |
| |
273 |
| |
274 if(destCapacity==0) { |
| |
275 destLimit=dest=0; |
| |
276 } else if(destCapacity==-1) { |
| |
277 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used. |
| |
278 destLimit=(char*)U_MAX_PTR(dest); |
| |
279 // for NUL-termination, translate into highest int32_t |
| |
280 destCapacity=0x7fffffff; |
| |
281 } else { |
| |
282 destLimit=dest+destCapacity; |
| |
283 } |
| |
284 |
| |
285 // perform the conversion |
| |
286 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); |
| |
287 length=(int32_t)(dest-originalDest); |
| |
288 |
| |
289 // if an overflow occurs, then get the preflighting length |
| |
290 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { |
| |
291 char buffer[1024]; |
| |
292 |
| |
293 destLimit=buffer+sizeof(buffer); |
| |
294 do { |
| |
295 dest=buffer; |
| |
296 errorCode=U_ZERO_ERROR; |
| |
297 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); |
| |
298 length+=(int32_t)(dest-buffer); |
| |
299 } while(errorCode==U_BUFFER_OVERFLOW_ERROR); |
| |
300 } |
| |
301 |
| |
302 return u_terminateChars(originalDest, destCapacity, length, &errorCode); |
| |
303 } |
| |
304 |
| |
305 void |
| |
306 UnicodeString::doCodepageCreate(const char *codepageData, |
| |
307 int32_t dataLength, |
| |
308 const char *codepage) |
| |
309 { |
| |
310 // if there's nothing to convert, do nothing |
| |
311 if(codepageData == 0 || dataLength == 0 || dataLength < -1) { |
| |
312 return; |
| |
313 } |
| |
314 if(dataLength == -1) { |
| |
315 dataLength = (int32_t)uprv_strlen(codepageData); |
| |
316 } |
| |
317 |
| |
318 UErrorCode status = U_ZERO_ERROR; |
| |
319 |
| |
320 // create the converter |
| |
321 // if the codepage is the default, use our cache |
| |
322 // if it is an empty string, then use the "invariant character" conversion |
| |
323 UConverter *converter; |
| |
324 if (codepage == 0) { |
| |
325 const char *defaultName = ucnv_getDefaultName(); |
| |
326 if(UCNV_FAST_IS_UTF8(defaultName)) { |
| |
327 setToUTF8(StringPiece(codepageData, dataLength)); |
| |
328 return; |
| |
329 } |
| |
330 converter = u_getDefaultConverter(&status); |
| |
331 } else if(*codepage == 0) { |
| |
332 // use the "invariant characters" conversion |
| |
333 if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) { |
| |
334 u_charsToUChars(codepageData, getArrayStart(), dataLength); |
| |
335 setLength(dataLength); |
| |
336 } else { |
| |
337 setToBogus(); |
| |
338 } |
| |
339 return; |
| |
340 } else { |
| |
341 converter = ucnv_open(codepage, &status); |
| |
342 } |
| |
343 |
| |
344 // if we failed, set the appropriate flags and return |
| |
345 if(U_FAILURE(status)) { |
| |
346 setToBogus(); |
| |
347 return; |
| |
348 } |
| |
349 |
| |
350 // perform the conversion |
| |
351 doCodepageCreate(codepageData, dataLength, converter, status); |
| |
352 if(U_FAILURE(status)) { |
| |
353 setToBogus(); |
| |
354 } |
| |
355 |
| |
356 // close the converter |
| |
357 if(codepage == 0) { |
| |
358 u_releaseDefaultConverter(converter); |
| |
359 } else { |
| |
360 ucnv_close(converter); |
| |
361 } |
| |
362 } |
| |
363 |
| |
364 void |
| |
365 UnicodeString::doCodepageCreate(const char *codepageData, |
| |
366 int32_t dataLength, |
| |
367 UConverter *converter, |
| |
368 UErrorCode &status) |
| |
369 { |
| |
370 if(U_FAILURE(status)) { |
| |
371 return; |
| |
372 } |
| |
373 |
| |
374 // set up the conversion parameters |
| |
375 const char *mySource = codepageData; |
| |
376 const char *mySourceEnd = mySource + dataLength; |
| |
377 UChar *array, *myTarget; |
| |
378 |
| |
379 // estimate the size needed: |
| |
380 int32_t arraySize; |
| |
381 if(dataLength <= US_STACKBUF_SIZE) { |
| |
382 // try to use the stack buffer |
| |
383 arraySize = US_STACKBUF_SIZE; |
| |
384 } else { |
| |
385 // 1.25 UChar's per source byte should cover most cases |
| |
386 arraySize = dataLength + (dataLength >> 2); |
| |
387 } |
| |
388 |
| |
389 // we do not care about the current contents |
| |
390 UBool doCopyArray = FALSE; |
| |
391 for(;;) { |
| |
392 if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) { |
| |
393 setToBogus(); |
| |
394 break; |
| |
395 } |
| |
396 |
| |
397 // perform the conversion |
| |
398 array = getArrayStart(); |
| |
399 myTarget = array + length(); |
| |
400 ucnv_toUnicode(converter, &myTarget, array + getCapacity(), |
| |
401 &mySource, mySourceEnd, 0, TRUE, &status); |
| |
402 |
| |
403 // update the conversion parameters |
| |
404 setLength((int32_t)(myTarget - array)); |
| |
405 |
| |
406 // allocate more space and copy data, if needed |
| |
407 if(status == U_BUFFER_OVERFLOW_ERROR) { |
| |
408 // reset the error code |
| |
409 status = U_ZERO_ERROR; |
| |
410 |
| |
411 // keep the previous conversion results |
| |
412 doCopyArray = TRUE; |
| |
413 |
| |
414 // estimate the new size needed, larger than before |
| |
415 // try 2 UChar's per remaining source byte |
| |
416 arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource)); |
| |
417 } else { |
| |
418 break; |
| |
419 } |
| |
420 } |
| |
421 } |
| |
422 |
| |
423 U_NAMESPACE_END |
| |
424 |
| |
425 #endif |