|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 1998-2011, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * |
|
9 * File ucbuf.c |
|
10 * |
|
11 * Modification History: |
|
12 * |
|
13 * Date Name Description |
|
14 * 05/10/01 Ram Creation. |
|
15 ******************************************************************************* |
|
16 */ |
|
17 |
|
18 #include "unicode/utypes.h" |
|
19 #include "unicode/putil.h" |
|
20 #include "unicode/uchar.h" |
|
21 #include "unicode/ucnv.h" |
|
22 #include "unicode/ucnv_err.h" |
|
23 #include "unicode/ustring.h" |
|
24 #include "unicode/utf16.h" |
|
25 #include "filestrm.h" |
|
26 #include "cstring.h" |
|
27 #include "cmemory.h" |
|
28 #include "ustrfmt.h" |
|
29 #include "ucbuf.h" |
|
30 #include <stdio.h> |
|
31 |
|
32 #if !UCONFIG_NO_CONVERSION |
|
33 |
|
34 |
|
35 #define MAX_IN_BUF 1000 |
|
36 #define MAX_U_BUF 1500 |
|
37 #define CONTEXT_LEN 20 |
|
38 |
|
39 struct UCHARBUF { |
|
40 UChar* buffer; |
|
41 UChar* currentPos; |
|
42 UChar* bufLimit; |
|
43 int32_t bufCapacity; |
|
44 int32_t remaining; |
|
45 int32_t signatureLength; |
|
46 FileStream* in; |
|
47 UConverter* conv; |
|
48 UBool showWarning; /* makes this API not produce any errors */ |
|
49 UBool isBuffered; |
|
50 }; |
|
51 |
|
52 U_CAPI UBool U_EXPORT2 |
|
53 ucbuf_autodetect_fs(FileStream* in, const char** cp, UConverter** conv, int32_t* signatureLength, UErrorCode* error){ |
|
54 char start[8]; |
|
55 int32_t numRead; |
|
56 |
|
57 UChar target[1]={ 0 }; |
|
58 UChar* pTarget; |
|
59 const char* pStart; |
|
60 |
|
61 /* read a few bytes */ |
|
62 numRead=T_FileStream_read(in, start, sizeof(start)); |
|
63 |
|
64 *cp = ucnv_detectUnicodeSignature(start, numRead, signatureLength, error); |
|
65 |
|
66 /* unread the bytes beyond what was consumed for U+FEFF */ |
|
67 T_FileStream_rewind(in); |
|
68 if (*signatureLength > 0) { |
|
69 T_FileStream_read(in, start, *signatureLength); |
|
70 } |
|
71 |
|
72 if(*cp==NULL){ |
|
73 *conv =NULL; |
|
74 return FALSE; |
|
75 } |
|
76 |
|
77 /* open the converter for the detected Unicode charset */ |
|
78 *conv = ucnv_open(*cp,error); |
|
79 |
|
80 /* convert and ignore initial U+FEFF, and the buffer overflow */ |
|
81 pTarget = target; |
|
82 pStart = start; |
|
83 ucnv_toUnicode(*conv, &pTarget, target+1, &pStart, start+*signatureLength, NULL, FALSE, error); |
|
84 *signatureLength = (int32_t)(pStart - start); |
|
85 if(*error==U_BUFFER_OVERFLOW_ERROR) { |
|
86 *error=U_ZERO_ERROR; |
|
87 } |
|
88 |
|
89 /* verify that we successfully read exactly U+FEFF */ |
|
90 if(U_SUCCESS(*error) && (pTarget!=(target+1) || target[0]!=0xfeff)) { |
|
91 *error=U_INTERNAL_PROGRAM_ERROR; |
|
92 } |
|
93 |
|
94 |
|
95 return TRUE; |
|
96 } |
|
97 static UBool ucbuf_isCPKnown(const char* cp){ |
|
98 if(ucnv_compareNames("UTF-8",cp)==0){ |
|
99 return TRUE; |
|
100 } |
|
101 if(ucnv_compareNames("UTF-16BE",cp)==0){ |
|
102 return TRUE; |
|
103 } |
|
104 if(ucnv_compareNames("UTF-16LE",cp)==0){ |
|
105 return TRUE; |
|
106 } |
|
107 if(ucnv_compareNames("UTF-16",cp)==0){ |
|
108 return TRUE; |
|
109 } |
|
110 if(ucnv_compareNames("UTF-32",cp)==0){ |
|
111 return TRUE; |
|
112 } |
|
113 if(ucnv_compareNames("UTF-32BE",cp)==0){ |
|
114 return TRUE; |
|
115 } |
|
116 if(ucnv_compareNames("UTF-32LE",cp)==0){ |
|
117 return TRUE; |
|
118 } |
|
119 if(ucnv_compareNames("SCSU",cp)==0){ |
|
120 return TRUE; |
|
121 } |
|
122 if(ucnv_compareNames("BOCU-1",cp)==0){ |
|
123 return TRUE; |
|
124 } |
|
125 if(ucnv_compareNames("UTF-7",cp)==0){ |
|
126 return TRUE; |
|
127 } |
|
128 return FALSE; |
|
129 } |
|
130 |
|
131 U_CAPI FileStream * U_EXPORT2 |
|
132 ucbuf_autodetect(const char* fileName, const char** cp,UConverter** conv, int32_t* signatureLength,UErrorCode* error){ |
|
133 FileStream* in=NULL; |
|
134 if(error==NULL || U_FAILURE(*error)){ |
|
135 return NULL; |
|
136 } |
|
137 if(conv==NULL || cp==NULL || fileName==NULL){ |
|
138 *error = U_ILLEGAL_ARGUMENT_ERROR; |
|
139 return NULL; |
|
140 } |
|
141 /* open the file */ |
|
142 in= T_FileStream_open(fileName,"rb"); |
|
143 |
|
144 if(in == NULL){ |
|
145 *error=U_FILE_ACCESS_ERROR; |
|
146 return NULL; |
|
147 } |
|
148 |
|
149 if(ucbuf_autodetect_fs(in,cp,conv,signatureLength,error)) { |
|
150 return in; |
|
151 } else { |
|
152 ucnv_close(*conv); |
|
153 *conv=NULL; |
|
154 T_FileStream_close(in); |
|
155 return NULL; |
|
156 } |
|
157 } |
|
158 |
|
159 /* fill the uchar buffer */ |
|
160 static UCHARBUF* |
|
161 ucbuf_fillucbuf( UCHARBUF* buf,UErrorCode* error){ |
|
162 UChar* pTarget=NULL; |
|
163 UChar* target=NULL; |
|
164 const char* source=NULL; |
|
165 char carr[MAX_IN_BUF] = {'\0'}; |
|
166 char* cbuf = carr; |
|
167 int32_t inputRead=0; |
|
168 int32_t outputWritten=0; |
|
169 int32_t offset=0; |
|
170 const char* sourceLimit =NULL; |
|
171 int32_t cbufSize=0; |
|
172 pTarget = buf->buffer; |
|
173 /* check if we arrived here without exhausting the buffer*/ |
|
174 if(buf->currentPos<buf->bufLimit){ |
|
175 offset = (int32_t)(buf->bufLimit-buf->currentPos); |
|
176 memmove(buf->buffer,buf->currentPos,offset* sizeof(UChar)); |
|
177 } |
|
178 |
|
179 #if DEBUG |
|
180 memset(pTarget+offset,0xff,sizeof(UChar)*(MAX_IN_BUF-offset)); |
|
181 #endif |
|
182 if(buf->isBuffered){ |
|
183 cbufSize = MAX_IN_BUF; |
|
184 /* read the file */ |
|
185 inputRead=T_FileStream_read(buf->in,cbuf,cbufSize-offset); |
|
186 buf->remaining-=inputRead; |
|
187 |
|
188 }else{ |
|
189 cbufSize = T_FileStream_size(buf->in); |
|
190 cbuf = (char*)uprv_malloc(cbufSize); |
|
191 if (cbuf == NULL) { |
|
192 *error = U_MEMORY_ALLOCATION_ERROR; |
|
193 return NULL; |
|
194 } |
|
195 inputRead= T_FileStream_read(buf->in,cbuf,cbufSize); |
|
196 buf->remaining-=inputRead; |
|
197 } |
|
198 |
|
199 /* just to be sure...*/ |
|
200 if ( 0 == inputRead ) |
|
201 buf->remaining = 0; |
|
202 |
|
203 target=pTarget; |
|
204 /* convert the bytes */ |
|
205 if(buf->conv){ |
|
206 /* set the callback to stop */ |
|
207 UConverterToUCallback toUOldAction ; |
|
208 void* toUOldContext; |
|
209 void* toUNewContext=NULL; |
|
210 ucnv_setToUCallBack(buf->conv, |
|
211 UCNV_TO_U_CALLBACK_STOP, |
|
212 toUNewContext, |
|
213 &toUOldAction, |
|
214 (const void**)&toUOldContext, |
|
215 error); |
|
216 /* since state is saved in the converter we add offset to source*/ |
|
217 target = pTarget+offset; |
|
218 source = cbuf; |
|
219 sourceLimit = source + inputRead; |
|
220 ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset), |
|
221 &source,sourceLimit,NULL, |
|
222 (UBool)(buf->remaining==0),error); |
|
223 |
|
224 if(U_FAILURE(*error)){ |
|
225 char context[CONTEXT_LEN+1]; |
|
226 char preContext[CONTEXT_LEN+1]; |
|
227 char postContext[CONTEXT_LEN+1]; |
|
228 int8_t len = CONTEXT_LEN; |
|
229 int32_t start=0; |
|
230 int32_t stop =0; |
|
231 int32_t pos =0; |
|
232 /* use erro1 to preserve the error code */ |
|
233 UErrorCode error1 =U_ZERO_ERROR; |
|
234 |
|
235 if( buf->showWarning==TRUE){ |
|
236 fprintf(stderr,"\n###WARNING: Encountered abnormal bytes while" |
|
237 " converting input stream to target encoding: %s\n", |
|
238 u_errorName(*error)); |
|
239 } |
|
240 |
|
241 |
|
242 /* now get the context chars */ |
|
243 ucnv_getInvalidChars(buf->conv,context,&len,&error1); |
|
244 context[len]= 0 ; /* null terminate the buffer */ |
|
245 |
|
246 pos = (int32_t)(source - cbuf - len); |
|
247 |
|
248 /* for pre-context */ |
|
249 start = (pos <=CONTEXT_LEN)? 0 : (pos - (CONTEXT_LEN-1)); |
|
250 stop = pos-len; |
|
251 |
|
252 memcpy(preContext,cbuf+start,stop-start); |
|
253 /* null terminate the buffer */ |
|
254 preContext[stop-start] = 0; |
|
255 |
|
256 /* for post-context */ |
|
257 start = pos+len; |
|
258 stop = (int32_t)(((pos+CONTEXT_LEN)<= (sourceLimit-cbuf) )? (pos+(CONTEXT_LEN-1)) : (sourceLimit-cbuf)); |
|
259 |
|
260 memcpy(postContext,source,stop-start); |
|
261 /* null terminate the buffer */ |
|
262 postContext[stop-start] = 0; |
|
263 |
|
264 if(buf->showWarning ==TRUE){ |
|
265 /* print out the context */ |
|
266 fprintf(stderr,"\tPre-context: %s\n",preContext); |
|
267 fprintf(stderr,"\tContext: %s\n",context); |
|
268 fprintf(stderr,"\tPost-context: %s\n", postContext); |
|
269 } |
|
270 |
|
271 /* reset the converter */ |
|
272 ucnv_reset(buf->conv); |
|
273 |
|
274 /* set the call back to substitute |
|
275 * and restart conversion |
|
276 */ |
|
277 ucnv_setToUCallBack(buf->conv, |
|
278 UCNV_TO_U_CALLBACK_SUBSTITUTE, |
|
279 toUNewContext, |
|
280 &toUOldAction, |
|
281 (const void**)&toUOldContext, |
|
282 &error1); |
|
283 |
|
284 /* reset source and target start positions */ |
|
285 target = pTarget+offset; |
|
286 source = cbuf; |
|
287 |
|
288 /* re convert */ |
|
289 ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset), |
|
290 &source,sourceLimit,NULL, |
|
291 (UBool)(buf->remaining==0),&error1); |
|
292 |
|
293 } |
|
294 outputWritten = (int32_t)(target - pTarget); |
|
295 |
|
296 |
|
297 #if DEBUG |
|
298 { |
|
299 int i; |
|
300 target = pTarget; |
|
301 for(i=0;i<numRead;i++){ |
|
302 /* printf("%c", (char)(*target++));*/ |
|
303 } |
|
304 } |
|
305 #endif |
|
306 |
|
307 }else{ |
|
308 u_charsToUChars(cbuf,target+offset,inputRead); |
|
309 outputWritten=((buf->remaining>cbufSize)? cbufSize:inputRead+offset); |
|
310 } |
|
311 buf->currentPos = pTarget; |
|
312 buf->bufLimit=pTarget+outputWritten; |
|
313 *buf->bufLimit=0; /*NUL terminate*/ |
|
314 if(cbuf!=carr){ |
|
315 uprv_free(cbuf); |
|
316 } |
|
317 return buf; |
|
318 } |
|
319 |
|
320 |
|
321 |
|
322 /* get a UChar from the stream*/ |
|
323 U_CAPI int32_t U_EXPORT2 |
|
324 ucbuf_getc(UCHARBUF* buf,UErrorCode* error){ |
|
325 if(error==NULL || U_FAILURE(*error)){ |
|
326 return FALSE; |
|
327 } |
|
328 if(buf->currentPos>=buf->bufLimit){ |
|
329 if(buf->remaining==0){ |
|
330 return U_EOF; |
|
331 } |
|
332 buf=ucbuf_fillucbuf(buf,error); |
|
333 if(U_FAILURE(*error)){ |
|
334 return U_EOF; |
|
335 } |
|
336 } |
|
337 |
|
338 return *(buf->currentPos++); |
|
339 } |
|
340 |
|
341 /* get a UChar32 from the stream*/ |
|
342 U_CAPI int32_t U_EXPORT2 |
|
343 ucbuf_getc32(UCHARBUF* buf,UErrorCode* error){ |
|
344 int32_t retVal = (int32_t)U_EOF; |
|
345 if(error==NULL || U_FAILURE(*error)){ |
|
346 return FALSE; |
|
347 } |
|
348 if(buf->currentPos+1>=buf->bufLimit){ |
|
349 if(buf->remaining==0){ |
|
350 return U_EOF; |
|
351 } |
|
352 buf=ucbuf_fillucbuf(buf,error); |
|
353 if(U_FAILURE(*error)){ |
|
354 return U_EOF; |
|
355 } |
|
356 } |
|
357 if(U16_IS_LEAD(*(buf->currentPos))){ |
|
358 retVal=U16_GET_SUPPLEMENTARY(buf->currentPos[0],buf->currentPos[1]); |
|
359 buf->currentPos+=2; |
|
360 }else{ |
|
361 retVal = *(buf->currentPos++); |
|
362 } |
|
363 return retVal; |
|
364 } |
|
365 |
|
366 /* u_unescapeAt() callback to return a UChar*/ |
|
367 static UChar U_CALLCONV |
|
368 _charAt(int32_t offset, void *context) { |
|
369 return ((UCHARBUF*) context)->currentPos[offset]; |
|
370 } |
|
371 |
|
372 /* getc and escape it */ |
|
373 U_CAPI int32_t U_EXPORT2 |
|
374 ucbuf_getcx32(UCHARBUF* buf,UErrorCode* error) { |
|
375 int32_t length; |
|
376 int32_t offset; |
|
377 UChar32 c32,c1,c2; |
|
378 if(error==NULL || U_FAILURE(*error)){ |
|
379 return FALSE; |
|
380 } |
|
381 /* Fill the buffer if it is empty */ |
|
382 if (buf->currentPos >=buf->bufLimit-2) { |
|
383 ucbuf_fillucbuf(buf,error); |
|
384 } |
|
385 |
|
386 /* Get the next character in the buffer */ |
|
387 if (buf->currentPos < buf->bufLimit) { |
|
388 c1 = *(buf->currentPos)++; |
|
389 } else { |
|
390 c1 = U_EOF; |
|
391 } |
|
392 |
|
393 c2 = *(buf->currentPos); |
|
394 |
|
395 /* If it isn't a backslash, return it */ |
|
396 if (c1 != 0x005C) { |
|
397 return c1; |
|
398 } |
|
399 |
|
400 /* Determine the amount of data in the buffer */ |
|
401 length = (int32_t)(buf->bufLimit - buf->currentPos); |
|
402 |
|
403 /* The longest escape sequence is \Uhhhhhhhh; make sure |
|
404 we have at least that many characters */ |
|
405 if (length < 10) { |
|
406 |
|
407 /* fill the buffer */ |
|
408 ucbuf_fillucbuf(buf,error); |
|
409 length = (int32_t)(buf->bufLimit - buf->buffer); |
|
410 } |
|
411 |
|
412 /* Process the escape */ |
|
413 offset = 0; |
|
414 c32 = u_unescapeAt(_charAt, &offset, length, (void*)buf); |
|
415 |
|
416 /* check if u_unescapeAt unescaped and converted |
|
417 * to c32 or not |
|
418 */ |
|
419 if(c32==0xFFFFFFFF){ |
|
420 if(buf->showWarning) { |
|
421 char context[CONTEXT_LEN+1]; |
|
422 int32_t len = CONTEXT_LEN; |
|
423 if(length < len) { |
|
424 len = length; |
|
425 } |
|
426 context[len]= 0 ; /* null terminate the buffer */ |
|
427 u_UCharsToChars( buf->currentPos, context, len); |
|
428 fprintf(stderr,"Bad escape: [%c%s]...\n", (int)c1, context); |
|
429 } |
|
430 *error= U_ILLEGAL_ESCAPE_SEQUENCE; |
|
431 return c1; |
|
432 }else if(c32!=c2 || (c32==0x0075 && c2==0x0075 && c1==0x005C) /* for \u0075 c2=0x0075 and c32==0x0075*/){ |
|
433 /* Update the current buffer position */ |
|
434 buf->currentPos += offset; |
|
435 }else{ |
|
436 /* unescaping failed so we just return |
|
437 * c1 and not consume the buffer |
|
438 * this is useful for rules with escapes |
|
439 * in resouce bundles |
|
440 * eg: \' \\ \" |
|
441 */ |
|
442 return c1; |
|
443 } |
|
444 |
|
445 return c32; |
|
446 } |
|
447 |
|
448 U_CAPI UCHARBUF* U_EXPORT2 |
|
449 ucbuf_open(const char* fileName,const char** cp,UBool showWarning, UBool buffered, UErrorCode* error){ |
|
450 |
|
451 FileStream* in = NULL; |
|
452 int32_t fileSize=0; |
|
453 const char* knownCp; |
|
454 if(error==NULL || U_FAILURE(*error)){ |
|
455 return NULL; |
|
456 } |
|
457 if(cp==NULL || fileName==NULL){ |
|
458 *error = U_ILLEGAL_ARGUMENT_ERROR; |
|
459 return FALSE; |
|
460 } |
|
461 if (!uprv_strcmp(fileName, "-")) { |
|
462 in = T_FileStream_stdin(); |
|
463 }else{ |
|
464 in = T_FileStream_open(fileName, "rb"); |
|
465 } |
|
466 |
|
467 if(in!=NULL){ |
|
468 UCHARBUF* buf =(UCHARBUF*) uprv_malloc(sizeof(UCHARBUF)); |
|
469 fileSize = T_FileStream_size(in); |
|
470 if(buf == NULL){ |
|
471 *error = U_MEMORY_ALLOCATION_ERROR; |
|
472 T_FileStream_close(in); |
|
473 return NULL; |
|
474 } |
|
475 buf->in=in; |
|
476 buf->conv=NULL; |
|
477 buf->showWarning = showWarning; |
|
478 buf->isBuffered = buffered; |
|
479 buf->signatureLength=0; |
|
480 if(*cp==NULL || **cp=='\0'){ |
|
481 /* don't have code page name... try to autodetect */ |
|
482 ucbuf_autodetect_fs(in,cp,&buf->conv,&buf->signatureLength,error); |
|
483 }else if(ucbuf_isCPKnown(*cp)){ |
|
484 /* discard BOM */ |
|
485 ucbuf_autodetect_fs(in,&knownCp,&buf->conv,&buf->signatureLength,error); |
|
486 } |
|
487 if(U_SUCCESS(*error) && buf->conv==NULL) { |
|
488 buf->conv=ucnv_open(*cp,error); |
|
489 } |
|
490 if(U_FAILURE(*error)){ |
|
491 ucnv_close(buf->conv); |
|
492 uprv_free(buf); |
|
493 T_FileStream_close(in); |
|
494 return NULL; |
|
495 } |
|
496 |
|
497 if((buf->conv==NULL) && (buf->showWarning==TRUE)){ |
|
498 fprintf(stderr,"###WARNING: No converter defined. Using codepage of system.\n"); |
|
499 } |
|
500 buf->remaining=fileSize-buf->signatureLength; |
|
501 if(buf->isBuffered){ |
|
502 buf->bufCapacity=MAX_U_BUF; |
|
503 }else{ |
|
504 buf->bufCapacity=buf->remaining+buf->signatureLength+1/*for terminating nul*/; |
|
505 } |
|
506 buf->buffer=(UChar*) uprv_malloc(U_SIZEOF_UCHAR * buf->bufCapacity ); |
|
507 if (buf->buffer == NULL) { |
|
508 *error = U_MEMORY_ALLOCATION_ERROR; |
|
509 ucbuf_close(buf); |
|
510 return NULL; |
|
511 } |
|
512 buf->currentPos=buf->buffer; |
|
513 buf->bufLimit=buf->buffer; |
|
514 if(U_FAILURE(*error)){ |
|
515 fprintf(stderr, "Could not open codepage [%s]: %s\n", *cp, u_errorName(*error)); |
|
516 ucbuf_close(buf); |
|
517 return NULL; |
|
518 } |
|
519 ucbuf_fillucbuf(buf,error); |
|
520 if(U_FAILURE(*error)){ |
|
521 ucbuf_close(buf); |
|
522 return NULL; |
|
523 } |
|
524 return buf; |
|
525 } |
|
526 *error =U_FILE_ACCESS_ERROR; |
|
527 return NULL; |
|
528 } |
|
529 |
|
530 |
|
531 |
|
532 /* TODO: this method will fail if at the |
|
533 * begining of buffer and the uchar to unget |
|
534 * is from the previous buffer. Need to implement |
|
535 * system to take care of that situation. |
|
536 */ |
|
537 U_CAPI void U_EXPORT2 |
|
538 ucbuf_ungetc(int32_t c,UCHARBUF* buf){ |
|
539 /* decrement currentPos pointer |
|
540 * if not at the begining of buffer |
|
541 */ |
|
542 if(buf->currentPos!=buf->buffer){ |
|
543 if(*(buf->currentPos-1)==c){ |
|
544 buf->currentPos--; |
|
545 } else { |
|
546 /* ungetc failed - did not match. */ |
|
547 } |
|
548 } else { |
|
549 /* ungetc failed - beginning of buffer. */ |
|
550 } |
|
551 } |
|
552 |
|
553 /* frees the resources of UChar* buffer */ |
|
554 static void |
|
555 ucbuf_closebuf(UCHARBUF* buf){ |
|
556 uprv_free(buf->buffer); |
|
557 buf->buffer = NULL; |
|
558 } |
|
559 |
|
560 /* close the buf and release resources*/ |
|
561 U_CAPI void U_EXPORT2 |
|
562 ucbuf_close(UCHARBUF* buf){ |
|
563 if(buf!=NULL){ |
|
564 if(buf->conv){ |
|
565 ucnv_close(buf->conv); |
|
566 } |
|
567 T_FileStream_close(buf->in); |
|
568 ucbuf_closebuf(buf); |
|
569 uprv_free(buf); |
|
570 } |
|
571 } |
|
572 |
|
573 /* rewind the buf and file stream */ |
|
574 U_CAPI void U_EXPORT2 |
|
575 ucbuf_rewind(UCHARBUF* buf,UErrorCode* error){ |
|
576 if(error==NULL || U_FAILURE(*error)){ |
|
577 return; |
|
578 } |
|
579 if(buf){ |
|
580 buf->currentPos=buf->buffer; |
|
581 buf->bufLimit=buf->buffer; |
|
582 T_FileStream_rewind(buf->in); |
|
583 buf->remaining=T_FileStream_size(buf->in)-buf->signatureLength; |
|
584 |
|
585 ucnv_resetToUnicode(buf->conv); |
|
586 if(buf->signatureLength>0) { |
|
587 UChar target[1]={ 0 }; |
|
588 UChar* pTarget; |
|
589 char start[8]; |
|
590 const char* pStart; |
|
591 int32_t numRead; |
|
592 |
|
593 /* read the signature bytes */ |
|
594 numRead=T_FileStream_read(buf->in, start, buf->signatureLength); |
|
595 |
|
596 /* convert and ignore initial U+FEFF, and the buffer overflow */ |
|
597 pTarget = target; |
|
598 pStart = start; |
|
599 ucnv_toUnicode(buf->conv, &pTarget, target+1, &pStart, start+numRead, NULL, FALSE, error); |
|
600 if(*error==U_BUFFER_OVERFLOW_ERROR) { |
|
601 *error=U_ZERO_ERROR; |
|
602 } |
|
603 |
|
604 /* verify that we successfully read exactly U+FEFF */ |
|
605 if(U_SUCCESS(*error) && (numRead!=buf->signatureLength || pTarget!=(target+1) || target[0]!=0xfeff)) { |
|
606 *error=U_INTERNAL_PROGRAM_ERROR; |
|
607 } |
|
608 } |
|
609 } |
|
610 } |
|
611 |
|
612 |
|
613 U_CAPI int32_t U_EXPORT2 |
|
614 ucbuf_size(UCHARBUF* buf){ |
|
615 if(buf){ |
|
616 if(buf->isBuffered){ |
|
617 return (T_FileStream_size(buf->in)-buf->signatureLength)/ucnv_getMinCharSize(buf->conv); |
|
618 }else{ |
|
619 return (int32_t)(buf->bufLimit - buf->buffer); |
|
620 } |
|
621 } |
|
622 return 0; |
|
623 } |
|
624 |
|
625 U_CAPI const UChar* U_EXPORT2 |
|
626 ucbuf_getBuffer(UCHARBUF* buf,int32_t* len,UErrorCode* error){ |
|
627 if(error==NULL || U_FAILURE(*error)){ |
|
628 return NULL; |
|
629 } |
|
630 if(buf==NULL || len==NULL){ |
|
631 *error = U_ILLEGAL_ARGUMENT_ERROR; |
|
632 return NULL; |
|
633 } |
|
634 *len = (int32_t)(buf->bufLimit - buf->buffer); |
|
635 return buf->buffer; |
|
636 } |
|
637 |
|
638 U_CAPI const char* U_EXPORT2 |
|
639 ucbuf_resolveFileName(const char* inputDir, const char* fileName, char* target, int32_t* len, UErrorCode* status){ |
|
640 int32_t requiredLen = 0; |
|
641 int32_t dirlen = 0; |
|
642 int32_t filelen = 0; |
|
643 if(status==NULL || U_FAILURE(*status)){ |
|
644 return NULL; |
|
645 } |
|
646 |
|
647 if(inputDir == NULL || fileName == NULL || len==NULL || (target==NULL && *len>0)){ |
|
648 *status = U_ILLEGAL_ARGUMENT_ERROR; |
|
649 return NULL; |
|
650 } |
|
651 |
|
652 |
|
653 dirlen = (int32_t)uprv_strlen(inputDir); |
|
654 filelen = (int32_t)uprv_strlen(fileName); |
|
655 if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) { |
|
656 requiredLen = dirlen + filelen + 2; |
|
657 if((*len < requiredLen) || target==NULL){ |
|
658 *len = requiredLen; |
|
659 *status = U_BUFFER_OVERFLOW_ERROR; |
|
660 return NULL; |
|
661 } |
|
662 |
|
663 target[0] = '\0'; |
|
664 /* |
|
665 * append the input dir to openFileName if the first char in |
|
666 * filename is not file seperation char and the last char input directory is not '.'. |
|
667 * This is to support : |
|
668 * genrb -s. /home/icu/data |
|
669 * genrb -s. icu/data |
|
670 * The user cannot mix notations like |
|
671 * genrb -s. /icu/data --- the absolute path specified. -s redundant |
|
672 * user should use |
|
673 * genrb -s. icu/data --- start from CWD and look in icu/data dir |
|
674 */ |
|
675 if( (fileName[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')){ |
|
676 uprv_strcpy(target, inputDir); |
|
677 target[dirlen] = U_FILE_SEP_CHAR; |
|
678 } |
|
679 target[dirlen + 1] = '\0'; |
|
680 } else { |
|
681 requiredLen = dirlen + filelen + 1; |
|
682 if((*len < requiredLen) || target==NULL){ |
|
683 *len = requiredLen; |
|
684 *status = U_BUFFER_OVERFLOW_ERROR; |
|
685 return NULL; |
|
686 } |
|
687 |
|
688 uprv_strcpy(target, inputDir); |
|
689 } |
|
690 |
|
691 uprv_strcat(target, fileName); |
|
692 return target; |
|
693 } |
|
694 /* |
|
695 * Unicode TR 13 says any of the below chars is |
|
696 * a new line char in a readline function in addition |
|
697 * to CR+LF combination which needs to be |
|
698 * handled seperately |
|
699 */ |
|
700 static UBool ucbuf_isCharNewLine(UChar c){ |
|
701 switch(c){ |
|
702 case 0x000A: /* LF */ |
|
703 case 0x000D: /* CR */ |
|
704 case 0x000C: /* FF */ |
|
705 case 0x0085: /* NEL */ |
|
706 case 0x2028: /* LS */ |
|
707 case 0x2029: /* PS */ |
|
708 return TRUE; |
|
709 default: |
|
710 return FALSE; |
|
711 } |
|
712 } |
|
713 |
|
714 U_CAPI const UChar* U_EXPORT2 |
|
715 ucbuf_readline(UCHARBUF* buf,int32_t* len,UErrorCode* err){ |
|
716 UChar* temp = buf->currentPos; |
|
717 UChar* savePos =NULL; |
|
718 UChar c=0x0000; |
|
719 if(buf->isBuffered){ |
|
720 /* The input is buffered we have to do more |
|
721 * for returning a pointer U_TRUNCATED_CHAR_FOUND |
|
722 */ |
|
723 for(;;){ |
|
724 c = *temp++; |
|
725 if(buf->remaining==0){ |
|
726 return NULL; /* end of file is reached return NULL */ |
|
727 } |
|
728 if(temp>=buf->bufLimit && buf->currentPos == buf->buffer){ |
|
729 *err= U_TRUNCATED_CHAR_FOUND; |
|
730 return NULL; |
|
731 }else{ |
|
732 ucbuf_fillucbuf(buf,err); |
|
733 if(U_FAILURE(*err)){ |
|
734 return NULL; |
|
735 } |
|
736 } |
|
737 /* |
|
738 * Accoding to TR 13 readLine functions must interpret |
|
739 * CR, CR+LF, LF, NEL, PS, LS or FF as line seperators |
|
740 */ |
|
741 /* Windows CR LF */ |
|
742 if(c ==0x0d && temp+1<=buf->bufLimit && *(temp+1) == 0x0a ){ |
|
743 *len = (int32_t)(temp++ - buf->currentPos); |
|
744 savePos = buf->currentPos; |
|
745 buf->currentPos = temp; |
|
746 return savePos; |
|
747 } |
|
748 /* else */ |
|
749 |
|
750 if (temp>=buf->bufLimit|| ucbuf_isCharNewLine(c)){ /* Unipad inserts 2028 line separators! */ |
|
751 *len = (int32_t)(temp - buf->currentPos); |
|
752 savePos = buf->currentPos; |
|
753 buf->currentPos = temp; |
|
754 return savePos; |
|
755 } |
|
756 } |
|
757 }else{ |
|
758 /* we know that all input is read into the internal |
|
759 * buffer so we can safely return pointers |
|
760 */ |
|
761 for(;;){ |
|
762 c = *temp++; |
|
763 |
|
764 if(buf->currentPos==buf->bufLimit){ |
|
765 return NULL; /* end of file is reached return NULL */ |
|
766 } |
|
767 /* Windows CR LF */ |
|
768 if(c ==0x0d && temp+1<=buf->bufLimit && *(temp+1) == 0x0a ){ |
|
769 *len = (int32_t)(temp++ - buf->currentPos); |
|
770 savePos = buf->currentPos; |
|
771 buf->currentPos = temp; |
|
772 return savePos; |
|
773 } |
|
774 /* else */ |
|
775 if (temp>=buf->bufLimit|| ucbuf_isCharNewLine(c)) { /* Unipad inserts 2028 line separators! */ |
|
776 *len = (int32_t)(temp - buf->currentPos); |
|
777 savePos = buf->currentPos; |
|
778 buf->currentPos = temp; |
|
779 return savePos; |
|
780 } |
|
781 } |
|
782 } |
|
783 /* not reached */ |
|
784 /* A compiler warning will appear if all paths don't contain a return statement. */ |
|
785 /* return NULL;*/ |
|
786 } |
|
787 #endif |