|
1 /***************************************************************************** |
|
2 * |
|
3 * Copyright (C) 1999-2013, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 * |
|
6 ******************************************************************************/ |
|
7 |
|
8 /* |
|
9 * uconv(1): an iconv(1)-like converter using ICU. |
|
10 * |
|
11 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se> |
|
12 * contributed in 1999. |
|
13 * |
|
14 * Conversion to the C conversion API and many improvements by |
|
15 * Yves Arrouye <yves@realnames.com>, current maintainer. |
|
16 * |
|
17 * Markus Scherer maintainer from 2003. |
|
18 * See source code repository history for changes. |
|
19 */ |
|
20 |
|
21 #include <unicode/utypes.h> |
|
22 #include <unicode/putil.h> |
|
23 #include <unicode/ucnv.h> |
|
24 #include <unicode/uenum.h> |
|
25 #include <unicode/unistr.h> |
|
26 #include <unicode/translit.h> |
|
27 #include <unicode/uset.h> |
|
28 #include <unicode/uclean.h> |
|
29 #include <unicode/utf16.h> |
|
30 |
|
31 #include <stdio.h> |
|
32 #include <errno.h> |
|
33 #include <string.h> |
|
34 #include <stdlib.h> |
|
35 |
|
36 #include "cmemory.h" |
|
37 #include "cstring.h" |
|
38 #include "ustrfmt.h" |
|
39 |
|
40 #include "unicode/uwmsg.h" |
|
41 |
|
42 U_NAMESPACE_USE |
|
43 |
|
44 #if U_PLATFORM_USES_ONLY_WIN32_API && !defined(__STRICT_ANSI__) |
|
45 #include <io.h> |
|
46 #include <fcntl.h> |
|
47 #if U_PLATFORM_USES_ONLY_WIN32_API |
|
48 #define USE_FILENO_BINARY_MODE 1 |
|
49 /* Windows likes to rename Unix-like functions */ |
|
50 #ifndef fileno |
|
51 #define fileno _fileno |
|
52 #endif |
|
53 #ifndef setmode |
|
54 #define setmode _setmode |
|
55 #endif |
|
56 #ifndef O_BINARY |
|
57 #define O_BINARY _O_BINARY |
|
58 #endif |
|
59 #endif |
|
60 #endif |
|
61 |
|
62 #ifdef UCONVMSG_LINK |
|
63 /* below from the README */ |
|
64 #include "unicode/utypes.h" |
|
65 #include "unicode/udata.h" |
|
66 U_CFUNC char uconvmsg_dat[]; |
|
67 #endif |
|
68 |
|
/* Number of elements in a true array (not a pointer), as an int32_t.
   The whole expansion is parenthesized so that the macro behaves as a
   single primary expression wherever it is used. */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

/* Default size, in bytes, of the conversion buffers. */
#define DEFAULT_BUFSZ 4096
/* Name of the resource bundle holding this tool's messages. */
#define UCONVMSG "uconvmsg"
|
73 |
|
74 static UResourceBundle *gBundle = 0; /* Bundle containing messages. */ |
|
75 |
|
76 /* |
|
77 * Initialize the message bundle so that message strings can be fetched |
|
78 * by u_wmsg(). |
|
79 * |
|
80 */ |
|
81 |
|
82 static void initMsg(const char *pname) { |
|
83 static int ps = 0; |
|
84 |
|
85 if (!ps) { |
|
86 char dataPath[2048]; /* XXX Sloppy: should be PATH_MAX. */ |
|
87 UErrorCode err = U_ZERO_ERROR; |
|
88 |
|
89 ps = 1; |
|
90 |
|
91 /* Set up our static data - if any */ |
|
92 #if defined(UCONVMSG_LINK) && U_PLATFORM != U_PF_OS390 /* On z/OS, this is failing. */ |
|
93 udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err); |
|
94 if (U_FAILURE(err)) { |
|
95 fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n", |
|
96 pname, u_errorName(err)); |
|
97 err = U_ZERO_ERROR; /* It may still fail */ |
|
98 } |
|
99 #endif |
|
100 |
|
101 /* Get messages. */ |
|
102 gBundle = u_wmsg_setPath(UCONVMSG, &err); |
|
103 if (U_FAILURE(err)) { |
|
104 fprintf(stderr, |
|
105 "%s: warning: couldn't open bundle %s: %s\n", |
|
106 pname, UCONVMSG, u_errorName(err)); |
|
107 #ifdef UCONVMSG_LINK |
|
108 fprintf(stderr, |
|
109 "%s: setAppData was called, internal data %s failed to load\n", |
|
110 pname, UCONVMSG); |
|
111 #endif |
|
112 |
|
113 err = U_ZERO_ERROR; |
|
114 /* that was try #1, try again with a path */ |
|
115 uprv_strcpy(dataPath, u_getDataDirectory()); |
|
116 uprv_strcat(dataPath, U_FILE_SEP_STRING); |
|
117 uprv_strcat(dataPath, UCONVMSG); |
|
118 |
|
119 gBundle = u_wmsg_setPath(dataPath, &err); |
|
120 if (U_FAILURE(err)) { |
|
121 fprintf(stderr, |
|
122 "%s: warning: still couldn't open bundle %s: %s\n", |
|
123 pname, dataPath, u_errorName(err)); |
|
124 fprintf(stderr, "%s: warning: messages will not be displayed\n", pname); |
|
125 } |
|
126 } |
|
127 } |
|
128 } |
|
129 |
|
130 /* Mapping of callback names to the callbacks passed to the converter |
|
131 API. */ |
|
132 |
|
133 static struct callback_ent { |
|
134 const char *name; |
|
135 UConverterFromUCallback fromu; |
|
136 const void *fromuctxt; |
|
137 UConverterToUCallback tou; |
|
138 const void *touctxt; |
|
139 } transcode_callbacks[] = { |
|
140 { "substitute", |
|
141 UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, |
|
142 UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 }, |
|
143 { "skip", |
|
144 UCNV_FROM_U_CALLBACK_SKIP, 0, |
|
145 UCNV_TO_U_CALLBACK_SKIP, 0 }, |
|
146 { "stop", |
|
147 UCNV_FROM_U_CALLBACK_STOP, 0, |
|
148 UCNV_TO_U_CALLBACK_STOP, 0 }, |
|
149 { "escape", |
|
150 UCNV_FROM_U_CALLBACK_ESCAPE, 0, |
|
151 UCNV_TO_U_CALLBACK_ESCAPE, 0}, |
|
152 { "escape-icu", |
|
153 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU, |
|
154 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU }, |
|
155 { "escape-java", |
|
156 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA, |
|
157 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA }, |
|
158 { "escape-c", |
|
159 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C, |
|
160 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C }, |
|
161 { "escape-xml", |
|
162 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, |
|
163 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX }, |
|
164 { "escape-xml-hex", |
|
165 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, |
|
166 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX }, |
|
167 { "escape-xml-dec", |
|
168 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, |
|
169 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC }, |
|
170 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE, |
|
171 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE } |
|
172 }; |
|
173 |
|
174 /* Return a pointer to a callback record given its name. */ |
|
175 |
|
176 static const struct callback_ent *findCallback(const char *name) { |
|
177 int i, count = |
|
178 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks); |
|
179 |
|
180 /* We'll do a linear search, there aren't many of them and bsearch() |
|
181 may not be that portable. */ |
|
182 |
|
183 for (i = 0; i < count; ++i) { |
|
184 if (!uprv_stricmp(name, transcode_callbacks[i].name)) { |
|
185 return &transcode_callbacks[i]; |
|
186 } |
|
187 } |
|
188 |
|
189 return 0; |
|
190 } |
|
191 |
|
192 /* Print converter information. If lookfor is set, only that converter will |
|
193 be printed, otherwise all converters will be printed. If canon is non |
|
194 zero, tags and aliases for each converter are printed too, in the format |
|
195 expected for convrters.txt(5). */ |
|
196 |
|
197 static int printConverters(const char *pname, const char *lookfor, |
|
198 UBool canon) |
|
199 { |
|
200 UErrorCode err = U_ZERO_ERROR; |
|
201 int32_t num; |
|
202 uint16_t num_stds; |
|
203 const char **stds; |
|
204 |
|
205 /* If there is a specified name, just handle that now. */ |
|
206 |
|
207 if (lookfor) { |
|
208 if (!canon) { |
|
209 printf("%s\n", lookfor); |
|
210 return 0; |
|
211 } else { |
|
212 /* Because we are printing a canonical name, we need the |
|
213 true converter name. We've done that already except for |
|
214 the default name (because we want to print the exact |
|
215 name one would get when calling ucnv_getDefaultName() |
|
216 in non-canon mode). But since we do not know at this |
|
217 point if we have the default name or something else, we |
|
218 need to normalize again to the canonical converter |
|
219 name. */ |
|
220 |
|
221 const char *truename = ucnv_getAlias(lookfor, 0, &err); |
|
222 if (U_SUCCESS(err)) { |
|
223 lookfor = truename; |
|
224 } else { |
|
225 err = U_ZERO_ERROR; |
|
226 } |
|
227 } |
|
228 } |
|
229 |
|
230 /* Print converter names. We come here for one of two reasons: we |
|
231 are printing all the names (lookfor was null), or we have a |
|
232 single converter to print but in canon mode, hence we need to |
|
233 get to it in order to print everything. */ |
|
234 |
|
235 num = ucnv_countAvailable(); |
|
236 if (num <= 0) { |
|
237 initMsg(pname); |
|
238 u_wmsg(stderr, "cantGetNames"); |
|
239 return -1; |
|
240 } |
|
241 if (lookfor) { |
|
242 num = 1; /* We know where we want to be. */ |
|
243 } |
|
244 |
|
245 num_stds = ucnv_countStandards(); |
|
246 stds = (const char **) uprv_malloc(num_stds * sizeof(*stds)); |
|
247 if (!stds) { |
|
248 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR)); |
|
249 return -1; |
|
250 } else { |
|
251 uint16_t s; |
|
252 |
|
253 if (canon) { |
|
254 printf("{ "); |
|
255 } |
|
256 for (s = 0; s < num_stds; ++s) { |
|
257 stds[s] = ucnv_getStandard(s, &err); |
|
258 if (canon) { |
|
259 printf("%s ", stds[s]); |
|
260 } |
|
261 if (U_FAILURE(err)) { |
|
262 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err)); |
|
263 goto error_cleanup; |
|
264 } |
|
265 } |
|
266 if (canon) { |
|
267 puts("}"); |
|
268 } |
|
269 } |
|
270 |
|
271 for (int32_t i = 0; i < num; i++) { |
|
272 const char *name; |
|
273 uint16_t num_aliases; |
|
274 |
|
275 /* Set the name either to what we are looking for, or |
|
276 to the current converter name. */ |
|
277 |
|
278 if (lookfor) { |
|
279 name = lookfor; |
|
280 } else { |
|
281 name = ucnv_getAvailableName(i); |
|
282 } |
|
283 |
|
284 /* Get all the aliases associated to the name. */ |
|
285 |
|
286 err = U_ZERO_ERROR; |
|
287 num_aliases = ucnv_countAliases(name, &err); |
|
288 if (U_FAILURE(err)) { |
|
289 printf("%s", name); |
|
290 |
|
291 UnicodeString str(name, ""); |
|
292 putchar('\t'); |
|
293 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(), |
|
294 u_wmsg_errorName(err)); |
|
295 goto error_cleanup; |
|
296 } else { |
|
297 uint16_t a, s, t; |
|
298 |
|
299 /* Write all the aliases and their tags. */ |
|
300 |
|
301 for (a = 0; a < num_aliases; ++a) { |
|
302 const char *alias = ucnv_getAlias(name, a, &err); |
|
303 |
|
304 if (U_FAILURE(err)) { |
|
305 UnicodeString str(name, ""); |
|
306 putchar('\t'); |
|
307 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(), |
|
308 u_wmsg_errorName(err)); |
|
309 goto error_cleanup; |
|
310 } |
|
311 |
|
312 /* Print the current alias so that it looks right. */ |
|
313 printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") , |
|
314 alias, |
|
315 (canon ? "" : " ")); |
|
316 |
|
317 /* Look (slowly, linear searching) for a tag. */ |
|
318 |
|
319 if (canon) { |
|
320 /* -1 to skip the last standard */ |
|
321 for (s = t = 0; s < num_stds-1; ++s) { |
|
322 UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err); |
|
323 if (U_SUCCESS(err)) { |
|
324 /* List the standard tags */ |
|
325 const char *standardName; |
|
326 UBool isFirst = TRUE; |
|
327 UErrorCode enumError = U_ZERO_ERROR; |
|
328 while ((standardName = uenum_next(nameEnum, NULL, &enumError))) { |
|
329 /* See if this alias is supported by this standard. */ |
|
330 if (!strcmp(standardName, alias)) { |
|
331 if (!t) { |
|
332 printf(" {"); |
|
333 t = 1; |
|
334 } |
|
335 /* Print a * after the default standard name */ |
|
336 printf(" %s%s", stds[s], (isFirst ? "*" : "")); |
|
337 } |
|
338 isFirst = FALSE; |
|
339 } |
|
340 } |
|
341 } |
|
342 if (t) { |
|
343 printf(" }"); |
|
344 } |
|
345 } |
|
346 /* Terminate this entry. */ |
|
347 if (canon) { |
|
348 puts(""); |
|
349 } |
|
350 |
|
351 /* Move on. */ |
|
352 } |
|
353 /* Terminate this entry. */ |
|
354 if (!canon) { |
|
355 puts(""); |
|
356 } |
|
357 } |
|
358 } |
|
359 |
|
360 /* Free temporary data. */ |
|
361 |
|
362 uprv_free(stds); |
|
363 |
|
364 /* Success. */ |
|
365 |
|
366 return 0; |
|
367 error_cleanup: |
|
368 uprv_free(stds); |
|
369 return -1; |
|
370 } |
|
371 |
|
372 /* Print all available transliterators. If canon is non zero, print |
|
373 one transliterator per line. */ |
|
374 |
|
375 static int printTransliterators(UBool canon) |
|
376 { |
|
377 #if UCONFIG_NO_TRANSLITERATION |
|
378 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n"); |
|
379 return 1; |
|
380 #else |
|
381 UErrorCode status = U_ZERO_ERROR; |
|
382 UEnumeration *ids = utrans_openIDs(&status); |
|
383 int32_t i, numtrans = uenum_count(ids, &status); |
|
384 |
|
385 char sepchar = canon ? '\n' : ' '; |
|
386 |
|
387 for (i = 0; U_SUCCESS(status)&& (i < numtrans); ++i) { |
|
388 int32_t len; |
|
389 const char *nextTrans = uenum_next(ids, &len, &status); |
|
390 |
|
391 printf("%s", nextTrans); |
|
392 if (i < numtrans - 1) { |
|
393 putchar(sepchar); |
|
394 } |
|
395 } |
|
396 |
|
397 uenum_close(ids); |
|
398 |
|
399 /* Add a terminating newline if needed. */ |
|
400 |
|
401 if (sepchar != '\n') { |
|
402 putchar('\n'); |
|
403 } |
|
404 |
|
405 /* Success. */ |
|
406 |
|
407 return 0; |
|
408 #endif |
|
409 } |
|
410 |
|
// Code point values that this file refers to by name.
enum {
    uSP  = 0x0020,  // space
    uCR  = 0x000d,  // carriage return
    uLF  = 0x000a,  // line feed
    uNL  = 0x0085,  // next line (NEL)
    uLS  = 0x2028,  // line separator
    uPS  = 0x2029,  // paragraph separator
    uSig = 0xfeff   // signature/BOM character
};
|
420 |
|
421 static inline int32_t |
|
422 getChunkLimit(const UnicodeString &prev, const UnicodeString &s) { |
|
423 // find one of |
|
424 // CR, LF, CRLF, NL, LS, PS |
|
425 // for paragraph ends (see UAX #13/Unicode 4) |
|
426 // and include it in the chunk |
|
427 // all of these characters are on the BMP |
|
428 // do not include FF or VT in case they are part of a paragraph |
|
429 // (important for bidi contexts) |
|
430 static const UChar paraEnds[] = { |
|
431 0xd, 0xa, 0x85, 0x2028, 0x2029 |
|
432 }; |
|
433 enum { |
|
434 iCR, iLF, iNL, iLS, iPS, iCount |
|
435 }; |
|
436 |
|
437 // first, see if there is a CRLF split between prev and s |
|
438 if (prev.endsWith(paraEnds + iCR, 1)) { |
|
439 if (s.startsWith(paraEnds + iLF, 1)) { |
|
440 return 1; // split CRLF, include the LF |
|
441 } else if (!s.isEmpty()) { |
|
442 return 0; // complete the last chunk |
|
443 } else { |
|
444 return -1; // wait for actual further contents to arrive |
|
445 } |
|
446 } |
|
447 |
|
448 const UChar *u = s.getBuffer(), *limit = u + s.length(); |
|
449 UChar c; |
|
450 |
|
451 while (u < limit) { |
|
452 c = *u++; |
|
453 if ( |
|
454 ((c < uSP) && (c == uCR || c == uLF)) || |
|
455 (c == uNL) || |
|
456 ((c & uLS) == uLS) |
|
457 ) { |
|
458 if (c == uCR) { |
|
459 // check for CRLF |
|
460 if (u == limit) { |
|
461 return -1; // LF may be in the next chunk |
|
462 } else if (*u == uLF) { |
|
463 ++u; // include the LF in this chunk |
|
464 } |
|
465 } |
|
466 return (int32_t)(u - s.getBuffer()); |
|
467 } |
|
468 } |
|
469 |
|
470 return -1; // continue collecting the chunk |
|
471 } |
|
472 |
|
// How a converter deals with the U+FEFF Unicode signature character (BOM);
// these are the values returned by cnvSigType().
enum {
    CNV_NO_FEFF   = 0,  // cannot convert the U+FEFF Unicode signature character (BOM)
    CNV_WITH_FEFF = 1,  // can convert the U+FEFF signature character
    CNV_ADDS_FEFF = 2   // automatically adds/detects the U+FEFF signature character
};
|
478 |
|
479 static inline UChar |
|
480 nibbleToHex(uint8_t n) { |
|
481 n &= 0xf; |
|
482 return |
|
483 n <= 9 ? |
|
484 (UChar)(0x30 + n) : |
|
485 (UChar)((0x61 - 10) + n); |
|
486 } |
|
487 |
|
488 // check the converter's Unicode signature properties; |
|
489 // the fromUnicode side of the converter must be in its initial state |
|
490 // and will be reset again if it was used |
|
491 static int32_t |
|
492 cnvSigType(UConverter *cnv) { |
|
493 UErrorCode err; |
|
494 int32_t result; |
|
495 |
|
496 // test if the output charset can convert U+FEFF |
|
497 USet *set = uset_open(1, 0); |
|
498 err = U_ZERO_ERROR; |
|
499 ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err); |
|
500 if (U_SUCCESS(err) && uset_contains(set, uSig)) { |
|
501 result = CNV_WITH_FEFF; |
|
502 } else { |
|
503 result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted |
|
504 } |
|
505 uset_close(set); |
|
506 |
|
507 if (result == CNV_WITH_FEFF) { |
|
508 // test if the output charset emits a signature anyway |
|
509 const UChar a[1] = { 0x61 }; // "a" |
|
510 const UChar *in; |
|
511 |
|
512 char buffer[20]; |
|
513 char *out; |
|
514 |
|
515 in = a; |
|
516 out = buffer; |
|
517 err = U_ZERO_ERROR; |
|
518 ucnv_fromUnicode(cnv, |
|
519 &out, buffer + sizeof(buffer), |
|
520 &in, a + 1, |
|
521 NULL, TRUE, &err); |
|
522 ucnv_resetFromUnicode(cnv); |
|
523 |
|
524 if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) && |
|
525 U_SUCCESS(err) |
|
526 ) { |
|
527 result = CNV_ADDS_FEFF; |
|
528 } |
|
529 } |
|
530 |
|
531 return result; |
|
532 } |
|
533 |
|
534 class ConvertFile { |
|
535 public: |
|
536 ConvertFile() : |
|
537 buf(NULL), outbuf(NULL), fromoffsets(NULL), |
|
538 bufsz(0), signature(0) {} |
|
539 |
|
540 void |
|
541 setBufferSize(size_t bufferSize) { |
|
542 bufsz = bufferSize; |
|
543 |
|
544 buf = new char[2 * bufsz]; |
|
545 outbuf = buf + bufsz; |
|
546 |
|
547 // +1 for an added U+FEFF in the intermediate Unicode buffer |
|
548 fromoffsets = new int32_t[bufsz + 1]; |
|
549 } |
|
550 |
|
551 ~ConvertFile() { |
|
552 delete [] buf; |
|
553 delete [] fromoffsets; |
|
554 } |
|
555 |
|
556 UBool convertFile(const char *pname, |
|
557 const char *fromcpage, |
|
558 UConverterToUCallback toucallback, |
|
559 const void *touctxt, |
|
560 const char *tocpage, |
|
561 UConverterFromUCallback fromucallback, |
|
562 const void *fromuctxt, |
|
563 UBool fallback, |
|
564 const char *translit, |
|
565 const char *infilestr, |
|
566 FILE * outfile, int verbose); |
|
567 private: |
|
568 friend int main(int argc, char **argv); |
|
569 |
|
570 char *buf, *outbuf; |
|
571 int32_t *fromoffsets; |
|
572 |
|
573 size_t bufsz; |
|
574 int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character |
|
575 }; |
|
576 |
|
577 // Convert a file from one encoding to another |
|
578 UBool |
|
579 ConvertFile::convertFile(const char *pname, |
|
580 const char *fromcpage, |
|
581 UConverterToUCallback toucallback, |
|
582 const void *touctxt, |
|
583 const char *tocpage, |
|
584 UConverterFromUCallback fromucallback, |
|
585 const void *fromuctxt, |
|
586 UBool fallback, |
|
587 const char *translit, |
|
588 const char *infilestr, |
|
589 FILE * outfile, int verbose) |
|
590 { |
|
591 FILE *infile; |
|
592 UBool ret = TRUE; |
|
593 UConverter *convfrom = 0; |
|
594 UConverter *convto = 0; |
|
595 UErrorCode err = U_ZERO_ERROR; |
|
596 UBool flush; |
|
597 UBool closeFile = FALSE; |
|
598 const char *cbufp, *prevbufp; |
|
599 char *bufp; |
|
600 |
|
601 uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */ |
|
602 |
|
603 const UChar *unibuf, *unibufbp; |
|
604 UChar *unibufp; |
|
605 |
|
606 size_t rd, wr; |
|
607 |
|
608 #if !UCONFIG_NO_TRANSLITERATION |
|
609 Transliterator *t = 0; // Transliterator acting on Unicode data. |
|
610 UnicodeString chunk; // One chunk of the text being collected for transformation. |
|
611 #endif |
|
612 UnicodeString u; // String to do the transliteration. |
|
613 int32_t ulen; |
|
614 |
|
615 // use conversion offsets for error messages |
|
616 // unless a transliterator is used - |
|
617 // a text transformation will reorder characters in unpredictable ways |
|
618 UBool useOffsets = TRUE; |
|
619 |
|
620 // Open the correct input file or connect to stdin for reading input |
|
621 |
|
622 if (infilestr != 0 && strcmp(infilestr, "-")) { |
|
623 infile = fopen(infilestr, "rb"); |
|
624 if (infile == 0) { |
|
625 UnicodeString str1(infilestr, ""); |
|
626 str1.append((UChar32) 0); |
|
627 UnicodeString str2(strerror(errno), ""); |
|
628 str2.append((UChar32) 0); |
|
629 initMsg(pname); |
|
630 u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer()); |
|
631 return FALSE; |
|
632 } |
|
633 closeFile = TRUE; |
|
634 } else { |
|
635 infilestr = "-"; |
|
636 infile = stdin; |
|
637 #ifdef USE_FILENO_BINARY_MODE |
|
638 if (setmode(fileno(stdin), O_BINARY) == -1) { |
|
639 initMsg(pname); |
|
640 u_wmsg(stderr, "cantSetInBinMode"); |
|
641 return FALSE; |
|
642 } |
|
643 #endif |
|
644 } |
|
645 |
|
646 if (verbose) { |
|
647 fprintf(stderr, "%s:\n", infilestr); |
|
648 } |
|
649 |
|
650 #if !UCONFIG_NO_TRANSLITERATION |
|
651 // Create transliterator as needed. |
|
652 |
|
653 if (translit != NULL && *translit) { |
|
654 UParseError parse; |
|
655 UnicodeString str(translit), pestr; |
|
656 |
|
657 /* Create from rules or by ID as needed. */ |
|
658 |
|
659 parse.line = -1; |
|
660 |
|
661 if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) { |
|
662 t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err); |
|
663 } else { |
|
664 t = Transliterator::createInstance(translit, UTRANS_FORWARD, err); |
|
665 } |
|
666 |
|
667 if (U_FAILURE(err)) { |
|
668 str.append((UChar32) 0); |
|
669 initMsg(pname); |
|
670 |
|
671 if (parse.line >= 0) { |
|
672 UChar linebuf[20], offsetbuf[20]; |
|
673 uprv_itou(linebuf, 20, parse.line, 10, 0); |
|
674 uprv_itou(offsetbuf, 20, parse.offset, 10, 0); |
|
675 u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(), |
|
676 u_wmsg_errorName(err), linebuf, offsetbuf); |
|
677 } else { |
|
678 u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(), |
|
679 u_wmsg_errorName(err)); |
|
680 } |
|
681 |
|
682 if (t) { |
|
683 delete t; |
|
684 t = 0; |
|
685 } |
|
686 goto error_exit; |
|
687 } |
|
688 |
|
689 useOffsets = FALSE; |
|
690 } |
|
691 #endif |
|
692 |
|
693 // Create codepage converter. If the codepage or its aliases weren't |
|
694 // available, it returns NULL and a failure code. We also set the |
|
695 // callbacks, and return errors in the same way. |
|
696 |
|
697 convfrom = ucnv_open(fromcpage, &err); |
|
698 if (U_FAILURE(err)) { |
|
699 UnicodeString str(fromcpage, ""); |
|
700 initMsg(pname); |
|
701 u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(), |
|
702 u_wmsg_errorName(err)); |
|
703 goto error_exit; |
|
704 } |
|
705 ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err); |
|
706 if (U_FAILURE(err)) { |
|
707 initMsg(pname); |
|
708 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); |
|
709 goto error_exit; |
|
710 } |
|
711 |
|
712 convto = ucnv_open(tocpage, &err); |
|
713 if (U_FAILURE(err)) { |
|
714 UnicodeString str(tocpage, ""); |
|
715 initMsg(pname); |
|
716 u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(), |
|
717 u_wmsg_errorName(err)); |
|
718 goto error_exit; |
|
719 } |
|
720 ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err); |
|
721 if (U_FAILURE(err)) { |
|
722 initMsg(pname); |
|
723 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); |
|
724 goto error_exit; |
|
725 } |
|
726 ucnv_setFallback(convto, fallback); |
|
727 |
|
728 UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode; |
|
729 int8_t sig; |
|
730 |
|
731 // OK, we can convert now. |
|
732 sig = signature; |
|
733 rd = 0; |
|
734 |
|
735 do { |
|
736 willexit = FALSE; |
|
737 |
|
738 // input file offset at the beginning of the next buffer |
|
739 infoffset += rd; |
|
740 |
|
741 rd = fread(buf, 1, bufsz, infile); |
|
742 if (ferror(infile) != 0) { |
|
743 UnicodeString str(strerror(errno)); |
|
744 initMsg(pname); |
|
745 u_wmsg(stderr, "cantRead", str.getTerminatedBuffer()); |
|
746 goto error_exit; |
|
747 } |
|
748 |
|
749 // Convert the read buffer into the new encoding via Unicode. |
|
750 // After the call 'unibufp' will be placed behind the last |
|
751 // character that was converted in the 'unibuf'. |
|
752 // Also the 'cbufp' is positioned behind the last converted |
|
753 // character. |
|
754 // At the last conversion in the file, flush should be set to |
|
755 // true so that we get all characters converted. |
|
756 // |
|
757 // The converter must be flushed at the end of conversion so |
|
758 // that characters on hold also will be written. |
|
759 |
|
760 cbufp = buf; |
|
761 flush = (UBool)(rd != bufsz); |
|
762 |
|
763 // convert until the input is consumed |
|
764 do { |
|
765 // remember the start of the current byte-to-Unicode conversion |
|
766 prevbufp = cbufp; |
|
767 |
|
768 unibuf = unibufp = u.getBuffer((int32_t)bufsz); |
|
769 |
|
770 // Use bufsz instead of u.getCapacity() for the targetLimit |
|
771 // so that we don't overflow fromoffsets[]. |
|
772 ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp, |
|
773 buf + rd, useOffsets ? fromoffsets : NULL, flush, &err); |
|
774 |
|
775 ulen = (int32_t)(unibufp - unibuf); |
|
776 u.releaseBuffer(U_SUCCESS(err) ? ulen : 0); |
|
777 |
|
778 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done |
|
779 // converting all of the input bytes. |
|
780 // It works like this because ucnv_toUnicode() returns only under the |
|
781 // following conditions: |
|
782 // - an error occurred during conversion (an error code is set) |
|
783 // - the target buffer is filled (the error code indicates an overflow) |
|
784 // - the source is consumed |
|
785 // That is, if the error code does not indicate a failure, |
|
786 // not even an overflow, then the source must be consumed entirely. |
|
787 fromSawEndOfBytes = (UBool)U_SUCCESS(err); |
|
788 |
|
789 if (err == U_BUFFER_OVERFLOW_ERROR) { |
|
790 err = U_ZERO_ERROR; |
|
791 } else if (U_FAILURE(err)) { |
|
792 char pos[32], errorBytes[32]; |
|
793 int8_t i, length, errorLength; |
|
794 |
|
795 UErrorCode localError = U_ZERO_ERROR; |
|
796 errorLength = (int8_t)sizeof(errorBytes); |
|
797 ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError); |
|
798 if (U_FAILURE(localError) || errorLength == 0) { |
|
799 errorLength = 1; |
|
800 } |
|
801 |
|
802 // print the input file offset of the start of the error bytes: |
|
803 // input file offset of the current byte buffer + |
|
804 // length of the just consumed bytes - |
|
805 // length of the error bytes |
|
806 length = |
|
807 (int8_t)sprintf(pos, "%d", |
|
808 (int)(infoffset + (cbufp - buf) - errorLength)); |
|
809 |
|
810 // output the bytes that caused the error |
|
811 UnicodeString str; |
|
812 for (i = 0; i < errorLength; ++i) { |
|
813 if (i > 0) { |
|
814 str.append((UChar)uSP); |
|
815 } |
|
816 str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4)); |
|
817 str.append(nibbleToHex((uint8_t)errorBytes[i])); |
|
818 } |
|
819 |
|
820 initMsg(pname); |
|
821 u_wmsg(stderr, "problemCvtToU", |
|
822 UnicodeString(pos, length, "").getTerminatedBuffer(), |
|
823 str.getTerminatedBuffer(), |
|
824 u_wmsg_errorName(err)); |
|
825 |
|
826 willexit = TRUE; |
|
827 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */ |
|
828 } |
|
829 |
|
830 // Replaced a check for whether the input was consumed by |
|
831 // looping until it is; message key "premEndInput" now obsolete. |
|
832 |
|
833 if (ulen == 0) { |
|
834 continue; |
|
835 } |
|
836 |
|
837 // remove a U+FEFF Unicode signature character if requested |
|
838 if (sig < 0) { |
|
839 if (u.charAt(0) == uSig) { |
|
840 u.remove(0, 1); |
|
841 |
|
842 // account for the removed UChar and offset |
|
843 --ulen; |
|
844 |
|
845 if (useOffsets) { |
|
846 // remove an offset from fromoffsets[] as well |
|
847 // to keep the array parallel with the UChars |
|
848 memmove(fromoffsets, fromoffsets + 1, ulen * 4); |
|
849 } |
|
850 |
|
851 } |
|
852 sig = 0; |
|
853 } |
|
854 |
|
855 #if !UCONFIG_NO_TRANSLITERATION |
|
856 // Transliterate/transform if needed. |
|
857 |
|
858 // For transformation, we use chunking code - |
|
859 // collect Unicode input until, for example, an end-of-line, |
|
860 // then transform and output-convert that and continue collecting. |
|
861 // This makes the transformation result independent of the buffer size |
|
862 // while avoiding the slower keyboard mode. |
|
863 // The end-of-chunk characters are completely included in the |
|
864 // transformed string in case they are to be transformed themselves. |
|
865 if (t != NULL) { |
|
866 UnicodeString out; |
|
867 int32_t chunkLimit; |
|
868 |
|
869 do { |
|
870 chunkLimit = getChunkLimit(chunk, u); |
|
871 if (chunkLimit < 0 && flush && fromSawEndOfBytes) { |
|
872 // use all of the rest at the end of the text |
|
873 chunkLimit = u.length(); |
|
874 } |
|
875 if (chunkLimit >= 0) { |
|
876 // complete the chunk and transform it |
|
877 chunk.append(u, 0, chunkLimit); |
|
878 u.remove(0, chunkLimit); |
|
879 t->transliterate(chunk); |
|
880 |
|
881 // append the transformation result to the result and empty the chunk |
|
882 out.append(chunk); |
|
883 chunk.remove(); |
|
884 } else { |
|
885 // continue collecting the chunk |
|
886 chunk.append(u); |
|
887 break; |
|
888 } |
|
889 } while (!u.isEmpty()); |
|
890 |
|
891 u = out; |
|
892 ulen = u.length(); |
|
893 } |
|
894 #endif |
|
895 |
|
896 // add a U+FEFF Unicode signature character if requested |
|
897 // and possible/necessary |
|
898 if (sig > 0) { |
|
899 if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) { |
|
900 u.insert(0, (UChar)uSig); |
|
901 |
|
902 if (useOffsets) { |
|
903 // insert a pseudo-offset into fromoffsets[] as well |
|
904 // to keep the array parallel with the UChars |
|
905 memmove(fromoffsets + 1, fromoffsets, ulen * 4); |
|
906 fromoffsets[0] = -1; |
|
907 } |
|
908 |
|
909 // account for the additional UChar and offset |
|
910 ++ulen; |
|
911 } |
|
912 sig = 0; |
|
913 } |
|
914 |
|
915 // Convert the Unicode buffer into the destination codepage |
|
916 // Again 'bufp' will be placed behind the last converted character |
|
917 // And 'unibufp' will be placed behind the last converted unicode character |
|
918 // At the last conversion flush should be set to true to ensure that |
|
919 // all characters left get converted |
|
920 |
|
921 unibuf = unibufbp = u.getBuffer(); |
|
922 |
|
923 do { |
|
924 bufp = outbuf; |
|
925 |
|
926 // Use fromSawEndOfBytes in addition to the flush flag - |
|
927 // it indicates whether the intermediate Unicode string |
|
928 // contains the very last UChars for the very last input bytes. |
|
929 ucnv_fromUnicode(convto, &bufp, outbuf + bufsz, |
|
930 &unibufbp, |
|
931 unibuf + ulen, |
|
932 NULL, (UBool)(flush && fromSawEndOfBytes), &err); |
|
933 |
|
934 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done |
|
935 // converting all of the intermediate UChars. |
|
936 // See comment for fromSawEndOfBytes. |
|
937 toSawEndOfUnicode = (UBool)U_SUCCESS(err); |
|
938 |
|
939 if (err == U_BUFFER_OVERFLOW_ERROR) { |
|
940 err = U_ZERO_ERROR; |
|
941 } else if (U_FAILURE(err)) { |
|
942 UChar errorUChars[4]; |
|
943 const char *errtag; |
|
944 char pos[32]; |
|
945 UChar32 c; |
|
946 int8_t i, length, errorLength; |
|
947 |
|
948 UErrorCode localError = U_ZERO_ERROR; |
|
949 errorLength = (int8_t)LENGTHOF(errorUChars); |
|
950 ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError); |
|
951 if (U_FAILURE(localError) || errorLength == 0) { |
|
952 // need at least 1 so that we don't access beyond the length of fromoffsets[] |
|
953 errorLength = 1; |
|
954 } |
|
955 |
|
956 int32_t ferroffset; |
|
957 |
|
958 if (useOffsets) { |
|
959 // Unicode buffer offset of the start of the error UChars |
|
960 ferroffset = (int32_t)((unibufbp - unibuf) - errorLength); |
|
961 if (ferroffset < 0) { |
|
962 // approximation - the character started in the previous Unicode buffer |
|
963 ferroffset = 0; |
|
964 } |
|
965 |
|
966 // get the corresponding byte offset out of fromoffsets[] |
|
967 // go back if the offset is not known for some of the UChars |
|
968 int32_t fromoffset; |
|
969 do { |
|
970 fromoffset = fromoffsets[ferroffset]; |
|
971 } while (fromoffset < 0 && --ferroffset >= 0); |
|
972 |
|
973 // total input file offset = |
|
974 // input file offset of the current byte buffer + |
|
975 // byte buffer offset of where the current Unicode buffer is converted from + |
|
976 // fromoffsets[Unicode offset] |
|
977 ferroffset = infoffset + (prevbufp - buf) + fromoffset; |
|
978 errtag = "problemCvtFromU"; |
|
979 } else { |
|
980 // Do not use fromoffsets if (t != NULL) because the Unicode text may |
|
981 // be different from what the offsets refer to. |
|
982 |
|
983 // output file offset |
|
984 ferroffset = (int32_t)(outfoffset + (bufp - outbuf)); |
|
985 errtag = "problemCvtFromUOut"; |
|
986 } |
|
987 |
|
988 length = (int8_t)sprintf(pos, "%u", (int)ferroffset); |
|
989 |
|
990 // output the code points that caused the error |
|
991 UnicodeString str; |
|
992 for (i = 0; i < errorLength;) { |
|
993 if (i > 0) { |
|
994 str.append((UChar)uSP); |
|
995 } |
|
996 U16_NEXT(errorUChars, i, errorLength, c); |
|
997 if (c >= 0x100000) { |
|
998 str.append(nibbleToHex((uint8_t)(c >> 20))); |
|
999 } |
|
1000 if (c >= 0x10000) { |
|
1001 str.append(nibbleToHex((uint8_t)(c >> 16))); |
|
1002 } |
|
1003 str.append(nibbleToHex((uint8_t)(c >> 12))); |
|
1004 str.append(nibbleToHex((uint8_t)(c >> 8))); |
|
1005 str.append(nibbleToHex((uint8_t)(c >> 4))); |
|
1006 str.append(nibbleToHex((uint8_t)c)); |
|
1007 } |
|
1008 |
|
1009 initMsg(pname); |
|
1010 u_wmsg(stderr, errtag, |
|
1011 UnicodeString(pos, length, "").getTerminatedBuffer(), |
|
1012 str.getTerminatedBuffer(), |
|
1013 u_wmsg_errorName(err)); |
|
1014 u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer()); |
|
1015 |
|
1016 willexit = TRUE; |
|
1017 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */ |
|
1018 } |
|
1019 |
|
1020 // Replaced a check for whether the intermediate Unicode characters were all consumed by |
|
1021 // looping until they are; message key "premEnd" now obsolete. |
|
1022 |
|
1023 // Finally, write the converted buffer to the output file |
|
1024 size_t outlen = (size_t) (bufp - outbuf); |
|
1025 outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile)); |
|
1026 if (wr != outlen) { |
|
1027 UnicodeString str(strerror(errno)); |
|
1028 initMsg(pname); |
|
1029 u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer()); |
|
1030 willexit = TRUE; |
|
1031 } |
|
1032 |
|
1033 if (willexit) { |
|
1034 goto error_exit; |
|
1035 } |
|
1036 } while (!toSawEndOfUnicode); |
|
1037 } while (!fromSawEndOfBytes); |
|
1038 } while (!flush); // Stop when we have flushed the |
|
1039 // converters (this means that it's |
|
1040 // the end of output) |
|
1041 |
|
1042 goto normal_exit; |
|
1043 |
|
1044 error_exit: |
|
1045 ret = FALSE; |
|
1046 |
|
1047 normal_exit: |
|
1048 // Cleanup. |
|
1049 |
|
1050 ucnv_close(convfrom); |
|
1051 ucnv_close(convto); |
|
1052 |
|
1053 #if !UCONFIG_NO_TRANSLITERATION |
|
1054 delete t; |
|
1055 #endif |
|
1056 |
|
1057 if (closeFile) { |
|
1058 fclose(infile); |
|
1059 } |
|
1060 |
|
1061 return ret; |
|
1062 } |
|
1063 |
|
1064 static void usage(const char *pname, int ecode) { |
|
1065 const UChar *msg; |
|
1066 int32_t msgLen; |
|
1067 UErrorCode err = U_ZERO_ERROR; |
|
1068 FILE *fp = ecode ? stderr : stdout; |
|
1069 int res; |
|
1070 |
|
1071 initMsg(pname); |
|
1072 msg = |
|
1073 ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord", |
|
1074 &msgLen, &err); |
|
1075 UnicodeString upname(pname, (int32_t)(uprv_strlen(pname) + 1)); |
|
1076 UnicodeString mname(msg, msgLen + 1); |
|
1077 |
|
1078 res = u_wmsg(fp, "usage", mname.getBuffer(), upname.getBuffer()); |
|
1079 if (!ecode) { |
|
1080 if (!res) { |
|
1081 fputc('\n', fp); |
|
1082 } |
|
1083 if (!u_wmsg(fp, "help")) { |
|
1084 /* Now dump callbacks and finish. */ |
|
1085 |
|
1086 int i, count = |
|
1087 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks); |
|
1088 for (i = 0; i < count; ++i) { |
|
1089 fprintf(fp, " %s", transcode_callbacks[i].name); |
|
1090 } |
|
1091 fputc('\n', fp); |
|
1092 } |
|
1093 } |
|
1094 |
|
1095 exit(ecode); |
|
1096 } |
|
1097 |
|
1098 extern int |
|
1099 main(int argc, char **argv) |
|
1100 { |
|
1101 FILE *outfile; |
|
1102 int ret = 0; |
|
1103 |
|
1104 size_t bufsz = DEFAULT_BUFSZ; |
|
1105 |
|
1106 const char *fromcpage = 0; |
|
1107 const char *tocpage = 0; |
|
1108 const char *translit = 0; |
|
1109 const char *outfilestr = 0; |
|
1110 UBool fallback = FALSE; |
|
1111 |
|
1112 UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP; |
|
1113 const void *fromuctxt = 0; |
|
1114 UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP; |
|
1115 const void *touctxt = 0; |
|
1116 |
|
1117 char **iter, **remainArgv, **remainArgvLimit; |
|
1118 char **end = argv + argc; |
|
1119 |
|
1120 const char *pname; |
|
1121 |
|
1122 UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE; |
|
1123 const char *printName = 0; |
|
1124 |
|
1125 UBool verbose = FALSE; |
|
1126 UErrorCode status = U_ZERO_ERROR; |
|
1127 |
|
1128 ConvertFile cf; |
|
1129 |
|
1130 /* Initialize ICU */ |
|
1131 u_init(&status); |
|
1132 if (U_FAILURE(status)) { |
|
1133 fprintf(stderr, "%s: can not initialize ICU. status = %s\n", |
|
1134 argv[0], u_errorName(status)); |
|
1135 exit(1); |
|
1136 } |
|
1137 |
|
1138 // Get and prettify pname. |
|
1139 pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR); |
|
1140 #if U_PLATFORM_USES_ONLY_WIN32_API |
|
1141 if (!pname) { |
|
1142 pname = uprv_strrchr(*argv, '/'); |
|
1143 } |
|
1144 #endif |
|
1145 if (!pname) { |
|
1146 pname = *argv; |
|
1147 } else { |
|
1148 ++pname; |
|
1149 } |
|
1150 |
|
1151 // First, get the arguments from command-line |
|
1152 // to know the codepages to convert between |
|
1153 |
|
1154 remainArgv = remainArgvLimit = argv + 1; |
|
1155 for (iter = argv + 1; iter != end; iter++) { |
|
1156 // Check for from charset |
|
1157 if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) { |
|
1158 iter++; |
|
1159 if (iter != end) |
|
1160 fromcpage = *iter; |
|
1161 else |
|
1162 usage(pname, 1); |
|
1163 } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) { |
|
1164 iter++; |
|
1165 if (iter != end) |
|
1166 tocpage = *iter; |
|
1167 else |
|
1168 usage(pname, 1); |
|
1169 } else if (strcmp("-x", *iter) == 0) { |
|
1170 iter++; |
|
1171 if (iter != end) |
|
1172 translit = *iter; |
|
1173 else |
|
1174 usage(pname, 1); |
|
1175 } else if (!strcmp("--fallback", *iter)) { |
|
1176 fallback = TRUE; |
|
1177 } else if (!strcmp("--no-fallback", *iter)) { |
|
1178 fallback = FALSE; |
|
1179 } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) { |
|
1180 iter++; |
|
1181 if (iter != end) { |
|
1182 bufsz = atoi(*iter); |
|
1183 if ((int) bufsz <= 0) { |
|
1184 initMsg(pname); |
|
1185 UnicodeString str(*iter); |
|
1186 initMsg(pname); |
|
1187 u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer()); |
|
1188 return 3; |
|
1189 } |
|
1190 } else { |
|
1191 usage(pname, 1); |
|
1192 } |
|
1193 } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) { |
|
1194 if (printTranslits) { |
|
1195 usage(pname, 1); |
|
1196 } |
|
1197 printConvs = TRUE; |
|
1198 } else if (strcmp("--default-code", *iter) == 0) { |
|
1199 if (printTranslits) { |
|
1200 usage(pname, 1); |
|
1201 } |
|
1202 printName = ucnv_getDefaultName(); |
|
1203 } else if (strcmp("--list-code", *iter) == 0) { |
|
1204 if (printTranslits) { |
|
1205 usage(pname, 1); |
|
1206 } |
|
1207 |
|
1208 iter++; |
|
1209 if (iter != end) { |
|
1210 UErrorCode e = U_ZERO_ERROR; |
|
1211 printName = ucnv_getAlias(*iter, 0, &e); |
|
1212 if (U_FAILURE(e) || !printName) { |
|
1213 UnicodeString str(*iter); |
|
1214 initMsg(pname); |
|
1215 u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer()); |
|
1216 return 2; |
|
1217 } |
|
1218 } else |
|
1219 usage(pname, 1); |
|
1220 } else if (strcmp("--canon", *iter) == 0) { |
|
1221 printCanon = TRUE; |
|
1222 } else if (strcmp("-L", *iter) == 0 |
|
1223 || !strcmp("--list-transliterators", *iter)) { |
|
1224 if (printConvs) { |
|
1225 usage(pname, 1); |
|
1226 } |
|
1227 printTranslits = TRUE; |
|
1228 } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter) |
|
1229 || !strcmp("--help", *iter)) { |
|
1230 usage(pname, 0); |
|
1231 } else if (!strcmp("-c", *iter)) { |
|
1232 fromucallback = UCNV_FROM_U_CALLBACK_SKIP; |
|
1233 } else if (!strcmp("--to-callback", *iter)) { |
|
1234 iter++; |
|
1235 if (iter != end) { |
|
1236 const struct callback_ent *cbe = findCallback(*iter); |
|
1237 if (cbe) { |
|
1238 fromucallback = cbe->fromu; |
|
1239 fromuctxt = cbe->fromuctxt; |
|
1240 } else { |
|
1241 UnicodeString str(*iter); |
|
1242 initMsg(pname); |
|
1243 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); |
|
1244 return 4; |
|
1245 } |
|
1246 } else { |
|
1247 usage(pname, 1); |
|
1248 } |
|
1249 } else if (!strcmp("--from-callback", *iter)) { |
|
1250 iter++; |
|
1251 if (iter != end) { |
|
1252 const struct callback_ent *cbe = findCallback(*iter); |
|
1253 if (cbe) { |
|
1254 toucallback = cbe->tou; |
|
1255 touctxt = cbe->touctxt; |
|
1256 } else { |
|
1257 UnicodeString str(*iter); |
|
1258 initMsg(pname); |
|
1259 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); |
|
1260 return 4; |
|
1261 } |
|
1262 } else { |
|
1263 usage(pname, 1); |
|
1264 } |
|
1265 } else if (!strcmp("-i", *iter)) { |
|
1266 toucallback = UCNV_TO_U_CALLBACK_SKIP; |
|
1267 } else if (!strcmp("--callback", *iter)) { |
|
1268 iter++; |
|
1269 if (iter != end) { |
|
1270 const struct callback_ent *cbe = findCallback(*iter); |
|
1271 if (cbe) { |
|
1272 fromucallback = cbe->fromu; |
|
1273 fromuctxt = cbe->fromuctxt; |
|
1274 toucallback = cbe->tou; |
|
1275 touctxt = cbe->touctxt; |
|
1276 } else { |
|
1277 UnicodeString str(*iter); |
|
1278 initMsg(pname); |
|
1279 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); |
|
1280 return 4; |
|
1281 } |
|
1282 } else { |
|
1283 usage(pname, 1); |
|
1284 } |
|
1285 } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) { |
|
1286 verbose = FALSE; |
|
1287 } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) { |
|
1288 verbose = TRUE; |
|
1289 } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) { |
|
1290 printf("%s v2.1 ICU " U_ICU_VERSION "\n", pname); |
|
1291 return 0; |
|
1292 } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) { |
|
1293 ++iter; |
|
1294 if (iter != end && !outfilestr) { |
|
1295 outfilestr = *iter; |
|
1296 } else { |
|
1297 usage(pname, 1); |
|
1298 } |
|
1299 } else if (0 == strcmp("--add-signature", *iter)) { |
|
1300 cf.signature = 1; |
|
1301 } else if (0 == strcmp("--remove-signature", *iter)) { |
|
1302 cf.signature = -1; |
|
1303 } else if (**iter == '-' && (*iter)[1]) { |
|
1304 usage(pname, 1); |
|
1305 } else { |
|
1306 // move a non-option up in argv[] |
|
1307 *remainArgvLimit++ = *iter; |
|
1308 } |
|
1309 } |
|
1310 |
|
1311 if (printConvs || printName) { |
|
1312 return printConverters(pname, printName, printCanon) ? 2 : 0; |
|
1313 } else if (printTranslits) { |
|
1314 return printTransliterators(printCanon) ? 3 : 0; |
|
1315 } |
|
1316 |
|
1317 if (!fromcpage || !uprv_strcmp(fromcpage, "-")) { |
|
1318 fromcpage = ucnv_getDefaultName(); |
|
1319 } |
|
1320 if (!tocpage || !uprv_strcmp(tocpage, "-")) { |
|
1321 tocpage = ucnv_getDefaultName(); |
|
1322 } |
|
1323 |
|
1324 // Open the correct output file or connect to stdout for reading input |
|
1325 if (outfilestr != 0 && strcmp(outfilestr, "-")) { |
|
1326 outfile = fopen(outfilestr, "wb"); |
|
1327 if (outfile == 0) { |
|
1328 UnicodeString str1(outfilestr, ""); |
|
1329 UnicodeString str2(strerror(errno), ""); |
|
1330 initMsg(pname); |
|
1331 u_wmsg(stderr, "cantCreateOutputF", |
|
1332 str1.getBuffer(), str2.getBuffer()); |
|
1333 return 1; |
|
1334 } |
|
1335 } else { |
|
1336 outfilestr = "-"; |
|
1337 outfile = stdout; |
|
1338 #ifdef USE_FILENO_BINARY_MODE |
|
1339 if (setmode(fileno(outfile), O_BINARY) == -1) { |
|
1340 u_wmsg(stderr, "cantSetOutBinMode"); |
|
1341 exit(-1); |
|
1342 } |
|
1343 #endif |
|
1344 } |
|
1345 |
|
1346 /* Loop again on the arguments to find all the input files, and |
|
1347 convert them. */ |
|
1348 |
|
1349 cf.setBufferSize(bufsz); |
|
1350 |
|
1351 if(remainArgv < remainArgvLimit) { |
|
1352 for (iter = remainArgv; iter != remainArgvLimit; iter++) { |
|
1353 if (!cf.convertFile( |
|
1354 pname, fromcpage, toucallback, touctxt, tocpage, |
|
1355 fromucallback, fromuctxt, fallback, translit, *iter, |
|
1356 outfile, verbose) |
|
1357 ) { |
|
1358 goto error_exit; |
|
1359 } |
|
1360 } |
|
1361 } else { |
|
1362 if (!cf.convertFile( |
|
1363 pname, fromcpage, toucallback, touctxt, tocpage, |
|
1364 fromucallback, fromuctxt, fallback, translit, 0, |
|
1365 outfile, verbose) |
|
1366 ) { |
|
1367 goto error_exit; |
|
1368 } |
|
1369 } |
|
1370 |
|
1371 goto normal_exit; |
|
1372 error_exit: |
|
1373 #if !UCONFIG_NO_LEGACY_CONVERSION |
|
1374 ret = 1; |
|
1375 #else |
|
1376 fprintf(stderr, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n"); |
|
1377 #endif |
|
1378 normal_exit: |
|
1379 |
|
1380 if (outfile != stdout) { |
|
1381 fclose(outfile); |
|
1382 } |
|
1383 |
|
1384 u_cleanup(); |
|
1385 |
|
1386 return ret; |
|
1387 } |
|
1388 |
|
1389 |
|
1390 /* |
|
1391 * Hey, Emacs, please set the following: |
|
1392 * |
|
1393 * Local Variables: |
|
1394 * indent-tabs-mode: nil |
|
1395 * End: |
|
1396 * |
|
1397 */ |