|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 1999-2010, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: unistr_cnv.cpp |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:2 |
|
12 * |
|
13 * created on: 2004aug19 |
|
14 * created by: Markus W. Scherer |
|
15 * |
|
16 * Character conversion functions moved here from unistr.cpp |
|
17 */ |
|
18 |
|
19 #include "unicode/utypes.h" |
|
20 |
|
21 #if !UCONFIG_NO_CONVERSION |
|
22 |
|
23 #include "unicode/putil.h" |
|
24 #include "cstring.h" |
|
25 #include "cmemory.h" |
|
26 #include "unicode/ustring.h" |
|
27 #include "unicode/unistr.h" |
|
28 #include "unicode/ucnv.h" |
|
29 #include "ucnv_imp.h" |
|
30 #include "putilimp.h" |
|
31 #include "ustr_cnv.h" |
|
32 #include "ustr_imp.h" |
|
33 |
|
34 U_NAMESPACE_BEGIN |
|
35 |
|
36 //======================================== |
|
37 // Constructors |
|
38 //======================================== |
|
39 |
|
40 #if !U_CHARSET_IS_UTF8 |
|
41 |
|
42 UnicodeString::UnicodeString(const char *codepageData) |
|
43 : fShortLength(0), |
|
44 fFlags(kShortString) |
|
45 { |
|
46 if(codepageData != 0) { |
|
47 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0); |
|
48 } |
|
49 } |
|
50 |
|
51 UnicodeString::UnicodeString(const char *codepageData, |
|
52 int32_t dataLength) |
|
53 : fShortLength(0), |
|
54 fFlags(kShortString) |
|
55 { |
|
56 if(codepageData != 0) { |
|
57 doCodepageCreate(codepageData, dataLength, 0); |
|
58 } |
|
59 } |
|
60 |
|
61 // else see unistr.cpp |
|
62 #endif |
|
63 |
|
64 UnicodeString::UnicodeString(const char *codepageData, |
|
65 const char *codepage) |
|
66 : fShortLength(0), |
|
67 fFlags(kShortString) |
|
68 { |
|
69 if(codepageData != 0) { |
|
70 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage); |
|
71 } |
|
72 } |
|
73 |
|
74 UnicodeString::UnicodeString(const char *codepageData, |
|
75 int32_t dataLength, |
|
76 const char *codepage) |
|
77 : fShortLength(0), |
|
78 fFlags(kShortString) |
|
79 { |
|
80 if(codepageData != 0) { |
|
81 doCodepageCreate(codepageData, dataLength, codepage); |
|
82 } |
|
83 } |
|
84 |
|
85 UnicodeString::UnicodeString(const char *src, int32_t srcLength, |
|
86 UConverter *cnv, |
|
87 UErrorCode &errorCode) |
|
88 : fShortLength(0), |
|
89 fFlags(kShortString) |
|
90 { |
|
91 if(U_SUCCESS(errorCode)) { |
|
92 // check arguments |
|
93 if(src==NULL) { |
|
94 // treat as an empty string, do nothing more |
|
95 } else if(srcLength<-1) { |
|
96 errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
97 } else { |
|
98 // get input length |
|
99 if(srcLength==-1) { |
|
100 srcLength=(int32_t)uprv_strlen(src); |
|
101 } |
|
102 if(srcLength>0) { |
|
103 if(cnv!=0) { |
|
104 // use the provided converter |
|
105 ucnv_resetToUnicode(cnv); |
|
106 doCodepageCreate(src, srcLength, cnv, errorCode); |
|
107 } else { |
|
108 // use the default converter |
|
109 cnv=u_getDefaultConverter(&errorCode); |
|
110 doCodepageCreate(src, srcLength, cnv, errorCode); |
|
111 u_releaseDefaultConverter(cnv); |
|
112 } |
|
113 } |
|
114 } |
|
115 |
|
116 if(U_FAILURE(errorCode)) { |
|
117 setToBogus(); |
|
118 } |
|
119 } |
|
120 } |
|
121 |
|
122 //======================================== |
|
123 // Codeset conversion |
|
124 //======================================== |
|
125 |
|
126 #if !U_CHARSET_IS_UTF8 |
|
127 |
|
128 int32_t |
|
129 UnicodeString::extract(int32_t start, |
|
130 int32_t length, |
|
131 char *target, |
|
132 uint32_t dstSize) const { |
|
133 return extract(start, length, target, dstSize, 0); |
|
134 } |
|
135 |
|
136 // else see unistr.cpp |
|
137 #endif |
|
138 |
|
139 int32_t |
|
140 UnicodeString::extract(int32_t start, |
|
141 int32_t length, |
|
142 char *target, |
|
143 uint32_t dstSize, |
|
144 const char *codepage) const |
|
145 { |
|
146 // if the arguments are illegal, then do nothing |
|
147 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { |
|
148 return 0; |
|
149 } |
|
150 |
|
151 // pin the indices to legal values |
|
152 pinIndices(start, length); |
|
153 |
|
154 // We need to cast dstSize to int32_t for all subsequent code. |
|
155 // I don't know why the API was defined with uint32_t but we are stuck with it. |
|
156 // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize |
|
157 // as a limit in some functions, it may wrap around and yield a pointer |
|
158 // that compares less-than target. |
|
159 int32_t capacity; |
|
160 if(dstSize < 0x7fffffff) { |
|
161 // Assume that the capacity is real and a limit pointer won't wrap around. |
|
162 capacity = (int32_t)dstSize; |
|
163 } else { |
|
164 // Pin the capacity so that a limit pointer does not wrap around. |
|
165 char *targetLimit = (char *)U_MAX_PTR(target); |
|
166 // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff |
|
167 // greater than target and does not wrap around the top of the address space. |
|
168 capacity = (int32_t)(targetLimit - target); |
|
169 } |
|
170 |
|
171 // create the converter |
|
172 UConverter *converter; |
|
173 UErrorCode status = U_ZERO_ERROR; |
|
174 |
|
175 // just write the NUL if the string length is 0 |
|
176 if(length == 0) { |
|
177 return u_terminateChars(target, capacity, 0, &status); |
|
178 } |
|
179 |
|
180 // if the codepage is the default, use our cache |
|
181 // if it is an empty string, then use the "invariant character" conversion |
|
182 if (codepage == 0) { |
|
183 const char *defaultName = ucnv_getDefaultName(); |
|
184 if(UCNV_FAST_IS_UTF8(defaultName)) { |
|
185 return toUTF8(start, length, target, capacity); |
|
186 } |
|
187 converter = u_getDefaultConverter(&status); |
|
188 } else if (*codepage == 0) { |
|
189 // use the "invariant characters" conversion |
|
190 int32_t destLength; |
|
191 if(length <= capacity) { |
|
192 destLength = length; |
|
193 } else { |
|
194 destLength = capacity; |
|
195 } |
|
196 u_UCharsToChars(getArrayStart() + start, target, destLength); |
|
197 return u_terminateChars(target, capacity, length, &status); |
|
198 } else { |
|
199 converter = ucnv_open(codepage, &status); |
|
200 } |
|
201 |
|
202 length = doExtract(start, length, target, capacity, converter, status); |
|
203 |
|
204 // close the converter |
|
205 if (codepage == 0) { |
|
206 u_releaseDefaultConverter(converter); |
|
207 } else { |
|
208 ucnv_close(converter); |
|
209 } |
|
210 |
|
211 return length; |
|
212 } |
|
213 |
|
214 int32_t |
|
215 UnicodeString::extract(char *dest, int32_t destCapacity, |
|
216 UConverter *cnv, |
|
217 UErrorCode &errorCode) const |
|
218 { |
|
219 if(U_FAILURE(errorCode)) { |
|
220 return 0; |
|
221 } |
|
222 |
|
223 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { |
|
224 errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
225 return 0; |
|
226 } |
|
227 |
|
228 // nothing to do? |
|
229 if(isEmpty()) { |
|
230 return u_terminateChars(dest, destCapacity, 0, &errorCode); |
|
231 } |
|
232 |
|
233 // get the converter |
|
234 UBool isDefaultConverter; |
|
235 if(cnv==0) { |
|
236 isDefaultConverter=TRUE; |
|
237 cnv=u_getDefaultConverter(&errorCode); |
|
238 if(U_FAILURE(errorCode)) { |
|
239 return 0; |
|
240 } |
|
241 } else { |
|
242 isDefaultConverter=FALSE; |
|
243 ucnv_resetFromUnicode(cnv); |
|
244 } |
|
245 |
|
246 // convert |
|
247 int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode); |
|
248 |
|
249 // release the converter |
|
250 if(isDefaultConverter) { |
|
251 u_releaseDefaultConverter(cnv); |
|
252 } |
|
253 |
|
254 return len; |
|
255 } |
|
256 |
|
257 int32_t |
|
258 UnicodeString::doExtract(int32_t start, int32_t length, |
|
259 char *dest, int32_t destCapacity, |
|
260 UConverter *cnv, |
|
261 UErrorCode &errorCode) const |
|
262 { |
|
263 if(U_FAILURE(errorCode)) { |
|
264 if(destCapacity!=0) { |
|
265 *dest=0; |
|
266 } |
|
267 return 0; |
|
268 } |
|
269 |
|
270 const UChar *src=getArrayStart()+start, *srcLimit=src+length; |
|
271 char *originalDest=dest; |
|
272 const char *destLimit; |
|
273 |
|
274 if(destCapacity==0) { |
|
275 destLimit=dest=0; |
|
276 } else if(destCapacity==-1) { |
|
277 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used. |
|
278 destLimit=(char*)U_MAX_PTR(dest); |
|
279 // for NUL-termination, translate into highest int32_t |
|
280 destCapacity=0x7fffffff; |
|
281 } else { |
|
282 destLimit=dest+destCapacity; |
|
283 } |
|
284 |
|
285 // perform the conversion |
|
286 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); |
|
287 length=(int32_t)(dest-originalDest); |
|
288 |
|
289 // if an overflow occurs, then get the preflighting length |
|
290 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { |
|
291 char buffer[1024]; |
|
292 |
|
293 destLimit=buffer+sizeof(buffer); |
|
294 do { |
|
295 dest=buffer; |
|
296 errorCode=U_ZERO_ERROR; |
|
297 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); |
|
298 length+=(int32_t)(dest-buffer); |
|
299 } while(errorCode==U_BUFFER_OVERFLOW_ERROR); |
|
300 } |
|
301 |
|
302 return u_terminateChars(originalDest, destCapacity, length, &errorCode); |
|
303 } |
|
304 |
|
305 void |
|
306 UnicodeString::doCodepageCreate(const char *codepageData, |
|
307 int32_t dataLength, |
|
308 const char *codepage) |
|
309 { |
|
310 // if there's nothing to convert, do nothing |
|
311 if(codepageData == 0 || dataLength == 0 || dataLength < -1) { |
|
312 return; |
|
313 } |
|
314 if(dataLength == -1) { |
|
315 dataLength = (int32_t)uprv_strlen(codepageData); |
|
316 } |
|
317 |
|
318 UErrorCode status = U_ZERO_ERROR; |
|
319 |
|
320 // create the converter |
|
321 // if the codepage is the default, use our cache |
|
322 // if it is an empty string, then use the "invariant character" conversion |
|
323 UConverter *converter; |
|
324 if (codepage == 0) { |
|
325 const char *defaultName = ucnv_getDefaultName(); |
|
326 if(UCNV_FAST_IS_UTF8(defaultName)) { |
|
327 setToUTF8(StringPiece(codepageData, dataLength)); |
|
328 return; |
|
329 } |
|
330 converter = u_getDefaultConverter(&status); |
|
331 } else if(*codepage == 0) { |
|
332 // use the "invariant characters" conversion |
|
333 if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) { |
|
334 u_charsToUChars(codepageData, getArrayStart(), dataLength); |
|
335 setLength(dataLength); |
|
336 } else { |
|
337 setToBogus(); |
|
338 } |
|
339 return; |
|
340 } else { |
|
341 converter = ucnv_open(codepage, &status); |
|
342 } |
|
343 |
|
344 // if we failed, set the appropriate flags and return |
|
345 if(U_FAILURE(status)) { |
|
346 setToBogus(); |
|
347 return; |
|
348 } |
|
349 |
|
350 // perform the conversion |
|
351 doCodepageCreate(codepageData, dataLength, converter, status); |
|
352 if(U_FAILURE(status)) { |
|
353 setToBogus(); |
|
354 } |
|
355 |
|
356 // close the converter |
|
357 if(codepage == 0) { |
|
358 u_releaseDefaultConverter(converter); |
|
359 } else { |
|
360 ucnv_close(converter); |
|
361 } |
|
362 } |
|
363 |
|
364 void |
|
365 UnicodeString::doCodepageCreate(const char *codepageData, |
|
366 int32_t dataLength, |
|
367 UConverter *converter, |
|
368 UErrorCode &status) |
|
369 { |
|
370 if(U_FAILURE(status)) { |
|
371 return; |
|
372 } |
|
373 |
|
374 // set up the conversion parameters |
|
375 const char *mySource = codepageData; |
|
376 const char *mySourceEnd = mySource + dataLength; |
|
377 UChar *array, *myTarget; |
|
378 |
|
379 // estimate the size needed: |
|
380 int32_t arraySize; |
|
381 if(dataLength <= US_STACKBUF_SIZE) { |
|
382 // try to use the stack buffer |
|
383 arraySize = US_STACKBUF_SIZE; |
|
384 } else { |
|
385 // 1.25 UChar's per source byte should cover most cases |
|
386 arraySize = dataLength + (dataLength >> 2); |
|
387 } |
|
388 |
|
389 // we do not care about the current contents |
|
390 UBool doCopyArray = FALSE; |
|
391 for(;;) { |
|
392 if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) { |
|
393 setToBogus(); |
|
394 break; |
|
395 } |
|
396 |
|
397 // perform the conversion |
|
398 array = getArrayStart(); |
|
399 myTarget = array + length(); |
|
400 ucnv_toUnicode(converter, &myTarget, array + getCapacity(), |
|
401 &mySource, mySourceEnd, 0, TRUE, &status); |
|
402 |
|
403 // update the conversion parameters |
|
404 setLength((int32_t)(myTarget - array)); |
|
405 |
|
406 // allocate more space and copy data, if needed |
|
407 if(status == U_BUFFER_OVERFLOW_ERROR) { |
|
408 // reset the error code |
|
409 status = U_ZERO_ERROR; |
|
410 |
|
411 // keep the previous conversion results |
|
412 doCopyArray = TRUE; |
|
413 |
|
414 // estimate the new size needed, larger than before |
|
415 // try 2 UChar's per remaining source byte |
|
416 arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource)); |
|
417 } else { |
|
418 break; |
|
419 } |
|
420 } |
|
421 } |
|
422 |
|
423 U_NAMESPACE_END |
|
424 |
|
425 #endif |