1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/extra/uconv/uconv.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1397 @@ 1.4 +/***************************************************************************** 1.5 +* 1.6 +* Copyright (C) 1999-2013, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +* 1.9 +******************************************************************************/ 1.10 + 1.11 +/* 1.12 + * uconv(1): an iconv(1)-like converter using ICU. 1.13 + * 1.14 + * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se> 1.15 + * contributed in 1999. 1.16 + * 1.17 + * Conversion to the C conversion API and many improvements by 1.18 + * Yves Arrouye <yves@realnames.com>, current maintainer. 1.19 + * 1.20 + * Markus Scherer maintainer from 2003. 1.21 + * See source code repository history for changes. 1.22 + */ 1.23 + 1.24 +#include <unicode/utypes.h> 1.25 +#include <unicode/putil.h> 1.26 +#include <unicode/ucnv.h> 1.27 +#include <unicode/uenum.h> 1.28 +#include <unicode/unistr.h> 1.29 +#include <unicode/translit.h> 1.30 +#include <unicode/uset.h> 1.31 +#include <unicode/uclean.h> 1.32 +#include <unicode/utf16.h> 1.33 + 1.34 +#include <stdio.h> 1.35 +#include <errno.h> 1.36 +#include <string.h> 1.37 +#include <stdlib.h> 1.38 + 1.39 +#include "cmemory.h" 1.40 +#include "cstring.h" 1.41 +#include "ustrfmt.h" 1.42 + 1.43 +#include "unicode/uwmsg.h" 1.44 + 1.45 +U_NAMESPACE_USE 1.46 + 1.47 +#if U_PLATFORM_USES_ONLY_WIN32_API && !defined(__STRICT_ANSI__) 1.48 +#include <io.h> 1.49 +#include <fcntl.h> 1.50 +#if U_PLATFORM_USES_ONLY_WIN32_API 1.51 +#define USE_FILENO_BINARY_MODE 1 1.52 +/* Windows likes to rename Unix-like functions */ 1.53 +#ifndef fileno 1.54 +#define fileno _fileno 1.55 +#endif 1.56 +#ifndef setmode 1.57 +#define setmode _setmode 1.58 +#endif 1.59 +#ifndef O_BINARY 1.60 +#define O_BINARY _O_BINARY 1.61 +#endif 1.62 +#endif 1.63 +#endif 1.64 + 1.65 +#ifdef UCONVMSG_LINK 1.66 +/* below from the README */ 1.67 +#include "unicode/utypes.h" 1.68 +#include "unicode/udata.h" 1.69 +U_CFUNC char uconvmsg_dat[]; 1.70 +#endif 1.71 + 1.72 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.73 + 1.74 +#define DEFAULT_BUFSZ 4096 1.75 +#define UCONVMSG "uconvmsg" 1.76 + 1.77 +static UResourceBundle *gBundle = 0; /* Bundle containing messages. */ 1.78 + 1.79 +/* 1.80 + * Initialize the message bundle so that message strings can be fetched 1.81 + * by u_wmsg(). 1.82 + * 1.83 + */ 1.84 + 1.85 +static void initMsg(const char *pname) { 1.86 + static int ps = 0; 1.87 + 1.88 + if (!ps) { 1.89 + char dataPath[2048]; /* XXX Sloppy: should be PATH_MAX. */ 1.90 + UErrorCode err = U_ZERO_ERROR; 1.91 + 1.92 + ps = 1; 1.93 + 1.94 + /* Set up our static data - if any */ 1.95 +#if defined(UCONVMSG_LINK) && U_PLATFORM != U_PF_OS390 /* On z/OS, this is failing. */ 1.96 + udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err); 1.97 + if (U_FAILURE(err)) { 1.98 + fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n", 1.99 + pname, u_errorName(err)); 1.100 + err = U_ZERO_ERROR; /* It may still fail */ 1.101 + } 1.102 +#endif 1.103 + 1.104 + /* Get messages. */ 1.105 + gBundle = u_wmsg_setPath(UCONVMSG, &err); 1.106 + if (U_FAILURE(err)) { 1.107 + fprintf(stderr, 1.108 + "%s: warning: couldn't open bundle %s: %s\n", 1.109 + pname, UCONVMSG, u_errorName(err)); 1.110 +#ifdef UCONVMSG_LINK 1.111 + fprintf(stderr, 1.112 + "%s: setAppData was called, internal data %s failed to load\n", 1.113 + pname, UCONVMSG); 1.114 +#endif 1.115 + 1.116 + err = U_ZERO_ERROR; 1.117 + /* that was try #1, try again with a path */ 1.118 + uprv_strcpy(dataPath, u_getDataDirectory()); 1.119 + uprv_strcat(dataPath, U_FILE_SEP_STRING); 1.120 + uprv_strcat(dataPath, UCONVMSG); 1.121 + 1.122 + gBundle = u_wmsg_setPath(dataPath, &err); 1.123 + if (U_FAILURE(err)) { 1.124 + fprintf(stderr, 1.125 + "%s: warning: still couldn't open bundle %s: %s\n", 1.126 + pname, dataPath, u_errorName(err)); 1.127 + fprintf(stderr, "%s: warning: messages will not be displayed\n", pname); 1.128 + } 1.129 + } 1.130 + } 1.131 +} 1.132 + 1.133 +/* Mapping of callback names to the callbacks passed to the converter 1.134 + API. */ 1.135 + 1.136 +static struct callback_ent { 1.137 + const char *name; 1.138 + UConverterFromUCallback fromu; 1.139 + const void *fromuctxt; 1.140 + UConverterToUCallback tou; 1.141 + const void *touctxt; 1.142 +} transcode_callbacks[] = { 1.143 + { "substitute", 1.144 + UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 1.145 + UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 }, 1.146 + { "skip", 1.147 + UCNV_FROM_U_CALLBACK_SKIP, 0, 1.148 + UCNV_TO_U_CALLBACK_SKIP, 0 }, 1.149 + { "stop", 1.150 + UCNV_FROM_U_CALLBACK_STOP, 0, 1.151 + UCNV_TO_U_CALLBACK_STOP, 0 }, 1.152 + { "escape", 1.153 + UCNV_FROM_U_CALLBACK_ESCAPE, 0, 1.154 + UCNV_TO_U_CALLBACK_ESCAPE, 0}, 1.155 + { "escape-icu", 1.156 + UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU, 1.157 + UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU }, 1.158 + { "escape-java", 1.159 + UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA, 1.160 + UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA }, 1.161 + { "escape-c", 1.162 + UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C, 1.163 + UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C }, 1.164 + { "escape-xml", 1.165 + UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, 1.166 + UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX }, 1.167 + { "escape-xml-hex", 1.168 + UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, 1.169 + UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX }, 1.170 + { "escape-xml-dec", 1.171 + UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 1.172 + UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC }, 1.173 + { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE, 1.174 + UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE } 1.175 +}; 1.176 + 1.177 +/* Return a pointer to a callback record given its name. */ 1.178 + 1.179 +static const struct callback_ent *findCallback(const char *name) { 1.180 + int i, count = 1.181 + sizeof(transcode_callbacks) / sizeof(*transcode_callbacks); 1.182 + 1.183 + /* We'll do a linear search, there aren't many of them and bsearch() 1.184 + may not be that portable. */ 1.185 + 1.186 + for (i = 0; i < count; ++i) { 1.187 + if (!uprv_stricmp(name, transcode_callbacks[i].name)) { 1.188 + return &transcode_callbacks[i]; 1.189 + } 1.190 + } 1.191 + 1.192 + return 0; 1.193 +} 1.194 + 1.195 +/* Print converter information. If lookfor is set, only that converter will 1.196 + be printed, otherwise all converters will be printed. If canon is non 1.197 + zero, tags and aliases for each converter are printed too, in the format 1.198 + expected for convrters.txt(5). */ 1.199 + 1.200 +static int printConverters(const char *pname, const char *lookfor, 1.201 + UBool canon) 1.202 +{ 1.203 + UErrorCode err = U_ZERO_ERROR; 1.204 + int32_t num; 1.205 + uint16_t num_stds; 1.206 + const char **stds; 1.207 + 1.208 + /* If there is a specified name, just handle that now. */ 1.209 + 1.210 + if (lookfor) { 1.211 + if (!canon) { 1.212 + printf("%s\n", lookfor); 1.213 + return 0; 1.214 + } else { 1.215 + /* Because we are printing a canonical name, we need the 1.216 + true converter name. We've done that already except for 1.217 + the default name (because we want to print the exact 1.218 + name one would get when calling ucnv_getDefaultName() 1.219 + in non-canon mode). But since we do not know at this 1.220 + point if we have the default name or something else, we 1.221 + need to normalize again to the canonical converter 1.222 + name. */ 1.223 + 1.224 + const char *truename = ucnv_getAlias(lookfor, 0, &err); 1.225 + if (U_SUCCESS(err)) { 1.226 + lookfor = truename; 1.227 + } else { 1.228 + err = U_ZERO_ERROR; 1.229 + } 1.230 + } 1.231 + } 1.232 + 1.233 + /* Print converter names. We come here for one of two reasons: we 1.234 + are printing all the names (lookfor was null), or we have a 1.235 + single converter to print but in canon mode, hence we need to 1.236 + get to it in order to print everything. */ 1.237 + 1.238 + num = ucnv_countAvailable(); 1.239 + if (num <= 0) { 1.240 + initMsg(pname); 1.241 + u_wmsg(stderr, "cantGetNames"); 1.242 + return -1; 1.243 + } 1.244 + if (lookfor) { 1.245 + num = 1; /* We know where we want to be. */ 1.246 + } 1.247 + 1.248 + num_stds = ucnv_countStandards(); 1.249 + stds = (const char **) uprv_malloc(num_stds * sizeof(*stds)); 1.250 + if (!stds) { 1.251 + u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR)); 1.252 + return -1; 1.253 + } else { 1.254 + uint16_t s; 1.255 + 1.256 + if (canon) { 1.257 + printf("{ "); 1.258 + } 1.259 + for (s = 0; s < num_stds; ++s) { 1.260 + stds[s] = ucnv_getStandard(s, &err); 1.261 + if (canon) { 1.262 + printf("%s ", stds[s]); 1.263 + } 1.264 + if (U_FAILURE(err)) { 1.265 + u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err)); 1.266 + goto error_cleanup; 1.267 + } 1.268 + } 1.269 + if (canon) { 1.270 + puts("}"); 1.271 + } 1.272 + } 1.273 + 1.274 + for (int32_t i = 0; i < num; i++) { 1.275 + const char *name; 1.276 + uint16_t num_aliases; 1.277 + 1.278 + /* Set the name either to what we are looking for, or 1.279 + to the current converter name. */ 1.280 + 1.281 + if (lookfor) { 1.282 + name = lookfor; 1.283 + } else { 1.284 + name = ucnv_getAvailableName(i); 1.285 + } 1.286 + 1.287 + /* Get all the aliases associated to the name. */ 1.288 + 1.289 + err = U_ZERO_ERROR; 1.290 + num_aliases = ucnv_countAliases(name, &err); 1.291 + if (U_FAILURE(err)) { 1.292 + printf("%s", name); 1.293 + 1.294 + UnicodeString str(name, ""); 1.295 + putchar('\t'); 1.296 + u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(), 1.297 + u_wmsg_errorName(err)); 1.298 + goto error_cleanup; 1.299 + } else { 1.300 + uint16_t a, s, t; 1.301 + 1.302 + /* Write all the aliases and their tags. */ 1.303 + 1.304 + for (a = 0; a < num_aliases; ++a) { 1.305 + const char *alias = ucnv_getAlias(name, a, &err); 1.306 + 1.307 + if (U_FAILURE(err)) { 1.308 + UnicodeString str(name, ""); 1.309 + putchar('\t'); 1.310 + u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(), 1.311 + u_wmsg_errorName(err)); 1.312 + goto error_cleanup; 1.313 + } 1.314 + 1.315 + /* Print the current alias so that it looks right. */ 1.316 + printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") , 1.317 + alias, 1.318 + (canon ? "" : " ")); 1.319 + 1.320 + /* Look (slowly, linear searching) for a tag. */ 1.321 + 1.322 + if (canon) { 1.323 + /* -1 to skip the last standard */ 1.324 + for (s = t = 0; s < num_stds-1; ++s) { 1.325 + UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err); 1.326 + if (U_SUCCESS(err)) { 1.327 + /* List the standard tags */ 1.328 + const char *standardName; 1.329 + UBool isFirst = TRUE; 1.330 + UErrorCode enumError = U_ZERO_ERROR; 1.331 + while ((standardName = uenum_next(nameEnum, NULL, &enumError))) { 1.332 + /* See if this alias is supported by this standard. */ 1.333 + if (!strcmp(standardName, alias)) { 1.334 + if (!t) { 1.335 + printf(" {"); 1.336 + t = 1; 1.337 + } 1.338 + /* Print a * after the default standard name */ 1.339 + printf(" %s%s", stds[s], (isFirst ? "*" : "")); 1.340 + } 1.341 + isFirst = FALSE; 1.342 + } 1.343 + } 1.344 + } 1.345 + if (t) { 1.346 + printf(" }"); 1.347 + } 1.348 + } 1.349 + /* Terminate this entry. */ 1.350 + if (canon) { 1.351 + puts(""); 1.352 + } 1.353 + 1.354 + /* Move on. */ 1.355 + } 1.356 + /* Terminate this entry. */ 1.357 + if (!canon) { 1.358 + puts(""); 1.359 + } 1.360 + } 1.361 + } 1.362 + 1.363 + /* Free temporary data. */ 1.364 + 1.365 + uprv_free(stds); 1.366 + 1.367 + /* Success. */ 1.368 + 1.369 + return 0; 1.370 +error_cleanup: 1.371 + uprv_free(stds); 1.372 + return -1; 1.373 +} 1.374 + 1.375 +/* Print all available transliterators. If canon is non zero, print 1.376 + one transliterator per line. */ 1.377 + 1.378 +static int printTransliterators(UBool canon) 1.379 +{ 1.380 +#if UCONFIG_NO_TRANSLITERATION 1.381 + printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n"); 1.382 + return 1; 1.383 +#else 1.384 + UErrorCode status = U_ZERO_ERROR; 1.385 + UEnumeration *ids = utrans_openIDs(&status); 1.386 + int32_t i, numtrans = uenum_count(ids, &status); 1.387 + 1.388 + char sepchar = canon ? '\n' : ' '; 1.389 + 1.390 + for (i = 0; U_SUCCESS(status)&& (i < numtrans); ++i) { 1.391 + int32_t len; 1.392 + const char *nextTrans = uenum_next(ids, &len, &status); 1.393 + 1.394 + printf("%s", nextTrans); 1.395 + if (i < numtrans - 1) { 1.396 + putchar(sepchar); 1.397 + } 1.398 + } 1.399 + 1.400 + uenum_close(ids); 1.401 + 1.402 + /* Add a terminating newline if needed. */ 1.403 + 1.404 + if (sepchar != '\n') { 1.405 + putchar('\n'); 1.406 + } 1.407 + 1.408 + /* Success. */ 1.409 + 1.410 + return 0; 1.411 +#endif 1.412 +} 1.413 + 1.414 +enum { 1.415 + uSP = 0x20, // space 1.416 + uCR = 0xd, // carriage return 1.417 + uLF = 0xa, // line feed 1.418 + uNL = 0x85, // newline 1.419 + uLS = 0x2028, // line separator 1.420 + uPS = 0x2029, // paragraph separator 1.421 + uSig = 0xfeff // signature/BOM character 1.422 +}; 1.423 + 1.424 +static inline int32_t 1.425 +getChunkLimit(const UnicodeString &prev, const UnicodeString &s) { 1.426 + // find one of 1.427 + // CR, LF, CRLF, NL, LS, PS 1.428 + // for paragraph ends (see UAX #13/Unicode 4) 1.429 + // and include it in the chunk 1.430 + // all of these characters are on the BMP 1.431 + // do not include FF or VT in case they are part of a paragraph 1.432 + // (important for bidi contexts) 1.433 + static const UChar paraEnds[] = { 1.434 + 0xd, 0xa, 0x85, 0x2028, 0x2029 1.435 + }; 1.436 + enum { 1.437 + iCR, iLF, iNL, iLS, iPS, iCount 1.438 + }; 1.439 + 1.440 + // first, see if there is a CRLF split between prev and s 1.441 + if (prev.endsWith(paraEnds + iCR, 1)) { 1.442 + if (s.startsWith(paraEnds + iLF, 1)) { 1.443 + return 1; // split CRLF, include the LF 1.444 + } else if (!s.isEmpty()) { 1.445 + return 0; // complete the last chunk 1.446 + } else { 1.447 + return -1; // wait for actual further contents to arrive 1.448 + } 1.449 + } 1.450 + 1.451 + const UChar *u = s.getBuffer(), *limit = u + s.length(); 1.452 + UChar c; 1.453 + 1.454 + while (u < limit) { 1.455 + c = *u++; 1.456 + if ( 1.457 + ((c < uSP) && (c == uCR || c == uLF)) || 1.458 + (c == uNL) || 1.459 + ((c & uLS) == uLS) 1.460 + ) { 1.461 + if (c == uCR) { 1.462 + // check for CRLF 1.463 + if (u == limit) { 1.464 + return -1; // LF may be in the next chunk 1.465 + } else if (*u == uLF) { 1.466 + ++u; // include the LF in this chunk 1.467 + } 1.468 + } 1.469 + return (int32_t)(u - s.getBuffer()); 1.470 + } 1.471 + } 1.472 + 1.473 + return -1; // continue collecting the chunk 1.474 +} 1.475 + 1.476 +enum { 1.477 + CNV_NO_FEFF, // cannot convert the U+FEFF Unicode signature character (BOM) 1.478 + CNV_WITH_FEFF, // can convert the U+FEFF signature character 1.479 + CNV_ADDS_FEFF // automatically adds/detects the U+FEFF signature character 1.480 +}; 1.481 + 1.482 +static inline UChar 1.483 +nibbleToHex(uint8_t n) { 1.484 + n &= 0xf; 1.485 + return 1.486 + n <= 9 ? 1.487 + (UChar)(0x30 + n) : 1.488 + (UChar)((0x61 - 10) + n); 1.489 +} 1.490 + 1.491 +// check the converter's Unicode signature properties; 1.492 +// the fromUnicode side of the converter must be in its initial state 1.493 +// and will be reset again if it was used 1.494 +static int32_t 1.495 +cnvSigType(UConverter *cnv) { 1.496 + UErrorCode err; 1.497 + int32_t result; 1.498 + 1.499 + // test if the output charset can convert U+FEFF 1.500 + USet *set = uset_open(1, 0); 1.501 + err = U_ZERO_ERROR; 1.502 + ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err); 1.503 + if (U_SUCCESS(err) && uset_contains(set, uSig)) { 1.504 + result = CNV_WITH_FEFF; 1.505 + } else { 1.506 + result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted 1.507 + } 1.508 + uset_close(set); 1.509 + 1.510 + if (result == CNV_WITH_FEFF) { 1.511 + // test if the output charset emits a signature anyway 1.512 + const UChar a[1] = { 0x61 }; // "a" 1.513 + const UChar *in; 1.514 + 1.515 + char buffer[20]; 1.516 + char *out; 1.517 + 1.518 + in = a; 1.519 + out = buffer; 1.520 + err = U_ZERO_ERROR; 1.521 + ucnv_fromUnicode(cnv, 1.522 + &out, buffer + sizeof(buffer), 1.523 + &in, a + 1, 1.524 + NULL, TRUE, &err); 1.525 + ucnv_resetFromUnicode(cnv); 1.526 + 1.527 + if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) && 1.528 + U_SUCCESS(err) 1.529 + ) { 1.530 + result = CNV_ADDS_FEFF; 1.531 + } 1.532 + } 1.533 + 1.534 + return result; 1.535 +} 1.536 + 1.537 +class ConvertFile { 1.538 +public: 1.539 + ConvertFile() : 1.540 + buf(NULL), outbuf(NULL), fromoffsets(NULL), 1.541 + bufsz(0), signature(0) {} 1.542 + 1.543 + void 1.544 + setBufferSize(size_t bufferSize) { 1.545 + bufsz = bufferSize; 1.546 + 1.547 + buf = new char[2 * bufsz]; 1.548 + outbuf = buf + bufsz; 1.549 + 1.550 + // +1 for an added U+FEFF in the intermediate Unicode buffer 1.551 + fromoffsets = new int32_t[bufsz + 1]; 1.552 + } 1.553 + 1.554 + ~ConvertFile() { 1.555 + delete [] buf; 1.556 + delete [] fromoffsets; 1.557 + } 1.558 + 1.559 + UBool convertFile(const char *pname, 1.560 + const char *fromcpage, 1.561 + UConverterToUCallback toucallback, 1.562 + const void *touctxt, 1.563 + const char *tocpage, 1.564 + UConverterFromUCallback fromucallback, 1.565 + const void *fromuctxt, 1.566 + UBool fallback, 1.567 + const char *translit, 1.568 + const char *infilestr, 1.569 + FILE * outfile, int verbose); 1.570 +private: 1.571 + friend int main(int argc, char **argv); 1.572 + 1.573 + char *buf, *outbuf; 1.574 + int32_t *fromoffsets; 1.575 + 1.576 + size_t bufsz; 1.577 + int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character 1.578 +}; 1.579 + 1.580 +// Convert a file from one encoding to another 1.581 +UBool 1.582 +ConvertFile::convertFile(const char *pname, 1.583 + const char *fromcpage, 1.584 + UConverterToUCallback toucallback, 1.585 + const void *touctxt, 1.586 + const char *tocpage, 1.587 + UConverterFromUCallback fromucallback, 1.588 + const void *fromuctxt, 1.589 + UBool fallback, 1.590 + const char *translit, 1.591 + const char *infilestr, 1.592 + FILE * outfile, int verbose) 1.593 +{ 1.594 + FILE *infile; 1.595 + UBool ret = TRUE; 1.596 + UConverter *convfrom = 0; 1.597 + UConverter *convto = 0; 1.598 + UErrorCode err = U_ZERO_ERROR; 1.599 + UBool flush; 1.600 + UBool closeFile = FALSE; 1.601 + const char *cbufp, *prevbufp; 1.602 + char *bufp; 1.603 + 1.604 + uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */ 1.605 + 1.606 + const UChar *unibuf, *unibufbp; 1.607 + UChar *unibufp; 1.608 + 1.609 + size_t rd, wr; 1.610 + 1.611 +#if !UCONFIG_NO_TRANSLITERATION 1.612 + Transliterator *t = 0; // Transliterator acting on Unicode data. 1.613 + UnicodeString chunk; // One chunk of the text being collected for transformation. 1.614 +#endif 1.615 + UnicodeString u; // String to do the transliteration. 1.616 + int32_t ulen; 1.617 + 1.618 + // use conversion offsets for error messages 1.619 + // unless a transliterator is used - 1.620 + // a text transformation will reorder characters in unpredictable ways 1.621 + UBool useOffsets = TRUE; 1.622 + 1.623 + // Open the correct input file or connect to stdin for reading input 1.624 + 1.625 + if (infilestr != 0 && strcmp(infilestr, "-")) { 1.626 + infile = fopen(infilestr, "rb"); 1.627 + if (infile == 0) { 1.628 + UnicodeString str1(infilestr, ""); 1.629 + str1.append((UChar32) 0); 1.630 + UnicodeString str2(strerror(errno), ""); 1.631 + str2.append((UChar32) 0); 1.632 + initMsg(pname); 1.633 + u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer()); 1.634 + return FALSE; 1.635 + } 1.636 + closeFile = TRUE; 1.637 + } else { 1.638 + infilestr = "-"; 1.639 + infile = stdin; 1.640 +#ifdef USE_FILENO_BINARY_MODE 1.641 + if (setmode(fileno(stdin), O_BINARY) == -1) { 1.642 + initMsg(pname); 1.643 + u_wmsg(stderr, "cantSetInBinMode"); 1.644 + return FALSE; 1.645 + } 1.646 +#endif 1.647 + } 1.648 + 1.649 + if (verbose) { 1.650 + fprintf(stderr, "%s:\n", infilestr); 1.651 + } 1.652 + 1.653 +#if !UCONFIG_NO_TRANSLITERATION 1.654 + // Create transliterator as needed. 1.655 + 1.656 + if (translit != NULL && *translit) { 1.657 + UParseError parse; 1.658 + UnicodeString str(translit), pestr; 1.659 + 1.660 + /* Create from rules or by ID as needed. */ 1.661 + 1.662 + parse.line = -1; 1.663 + 1.664 + if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) { 1.665 + t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err); 1.666 + } else { 1.667 + t = Transliterator::createInstance(translit, UTRANS_FORWARD, err); 1.668 + } 1.669 + 1.670 + if (U_FAILURE(err)) { 1.671 + str.append((UChar32) 0); 1.672 + initMsg(pname); 1.673 + 1.674 + if (parse.line >= 0) { 1.675 + UChar linebuf[20], offsetbuf[20]; 1.676 + uprv_itou(linebuf, 20, parse.line, 10, 0); 1.677 + uprv_itou(offsetbuf, 20, parse.offset, 10, 0); 1.678 + u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(), 1.679 + u_wmsg_errorName(err), linebuf, offsetbuf); 1.680 + } else { 1.681 + u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(), 1.682 + u_wmsg_errorName(err)); 1.683 + } 1.684 + 1.685 + if (t) { 1.686 + delete t; 1.687 + t = 0; 1.688 + } 1.689 + goto error_exit; 1.690 + } 1.691 + 1.692 + useOffsets = FALSE; 1.693 + } 1.694 +#endif 1.695 + 1.696 + // Create codepage converter. If the codepage or its aliases weren't 1.697 + // available, it returns NULL and a failure code. We also set the 1.698 + // callbacks, and return errors in the same way. 1.699 + 1.700 + convfrom = ucnv_open(fromcpage, &err); 1.701 + if (U_FAILURE(err)) { 1.702 + UnicodeString str(fromcpage, ""); 1.703 + initMsg(pname); 1.704 + u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(), 1.705 + u_wmsg_errorName(err)); 1.706 + goto error_exit; 1.707 + } 1.708 + ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err); 1.709 + if (U_FAILURE(err)) { 1.710 + initMsg(pname); 1.711 + u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); 1.712 + goto error_exit; 1.713 + } 1.714 + 1.715 + convto = ucnv_open(tocpage, &err); 1.716 + if (U_FAILURE(err)) { 1.717 + UnicodeString str(tocpage, ""); 1.718 + initMsg(pname); 1.719 + u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(), 1.720 + u_wmsg_errorName(err)); 1.721 + goto error_exit; 1.722 + } 1.723 + ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err); 1.724 + if (U_FAILURE(err)) { 1.725 + initMsg(pname); 1.726 + u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); 1.727 + goto error_exit; 1.728 + } 1.729 + ucnv_setFallback(convto, fallback); 1.730 + 1.731 + UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode; 1.732 + int8_t sig; 1.733 + 1.734 + // OK, we can convert now. 1.735 + sig = signature; 1.736 + rd = 0; 1.737 + 1.738 + do { 1.739 + willexit = FALSE; 1.740 + 1.741 + // input file offset at the beginning of the next buffer 1.742 + infoffset += rd; 1.743 + 1.744 + rd = fread(buf, 1, bufsz, infile); 1.745 + if (ferror(infile) != 0) { 1.746 + UnicodeString str(strerror(errno)); 1.747 + initMsg(pname); 1.748 + u_wmsg(stderr, "cantRead", str.getTerminatedBuffer()); 1.749 + goto error_exit; 1.750 + } 1.751 + 1.752 + // Convert the read buffer into the new encoding via Unicode. 1.753 + // After the call 'unibufp' will be placed behind the last 1.754 + // character that was converted in the 'unibuf'. 1.755 + // Also the 'cbufp' is positioned behind the last converted 1.756 + // character. 1.757 + // At the last conversion in the file, flush should be set to 1.758 + // true so that we get all characters converted. 1.759 + // 1.760 + // The converter must be flushed at the end of conversion so 1.761 + // that characters on hold also will be written. 1.762 + 1.763 + cbufp = buf; 1.764 + flush = (UBool)(rd != bufsz); 1.765 + 1.766 + // convert until the input is consumed 1.767 + do { 1.768 + // remember the start of the current byte-to-Unicode conversion 1.769 + prevbufp = cbufp; 1.770 + 1.771 + unibuf = unibufp = u.getBuffer((int32_t)bufsz); 1.772 + 1.773 + // Use bufsz instead of u.getCapacity() for the targetLimit 1.774 + // so that we don't overflow fromoffsets[]. 1.775 + ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp, 1.776 + buf + rd, useOffsets ? fromoffsets : NULL, flush, &err); 1.777 + 1.778 + ulen = (int32_t)(unibufp - unibuf); 1.779 + u.releaseBuffer(U_SUCCESS(err) ? ulen : 0); 1.780 + 1.781 + // fromSawEndOfBytes indicates that ucnv_toUnicode() is done 1.782 + // converting all of the input bytes. 1.783 + // It works like this because ucnv_toUnicode() returns only under the 1.784 + // following conditions: 1.785 + // - an error occurred during conversion (an error code is set) 1.786 + // - the target buffer is filled (the error code indicates an overflow) 1.787 + // - the source is consumed 1.788 + // That is, if the error code does not indicate a failure, 1.789 + // not even an overflow, then the source must be consumed entirely. 1.790 + fromSawEndOfBytes = (UBool)U_SUCCESS(err); 1.791 + 1.792 + if (err == U_BUFFER_OVERFLOW_ERROR) { 1.793 + err = U_ZERO_ERROR; 1.794 + } else if (U_FAILURE(err)) { 1.795 + char pos[32], errorBytes[32]; 1.796 + int8_t i, length, errorLength; 1.797 + 1.798 + UErrorCode localError = U_ZERO_ERROR; 1.799 + errorLength = (int8_t)sizeof(errorBytes); 1.800 + ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError); 1.801 + if (U_FAILURE(localError) || errorLength == 0) { 1.802 + errorLength = 1; 1.803 + } 1.804 + 1.805 + // print the input file offset of the start of the error bytes: 1.806 + // input file offset of the current byte buffer + 1.807 + // length of the just consumed bytes - 1.808 + // length of the error bytes 1.809 + length = 1.810 + (int8_t)sprintf(pos, "%d", 1.811 + (int)(infoffset + (cbufp - buf) - errorLength)); 1.812 + 1.813 + // output the bytes that caused the error 1.814 + UnicodeString str; 1.815 + for (i = 0; i < errorLength; ++i) { 1.816 + if (i > 0) { 1.817 + str.append((UChar)uSP); 1.818 + } 1.819 + str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4)); 1.820 + str.append(nibbleToHex((uint8_t)errorBytes[i])); 1.821 + } 1.822 + 1.823 + initMsg(pname); 1.824 + u_wmsg(stderr, "problemCvtToU", 1.825 + UnicodeString(pos, length, "").getTerminatedBuffer(), 1.826 + str.getTerminatedBuffer(), 1.827 + u_wmsg_errorName(err)); 1.828 + 1.829 + willexit = TRUE; 1.830 + err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */ 1.831 + } 1.832 + 1.833 + // Replaced a check for whether the input was consumed by 1.834 + // looping until it is; message key "premEndInput" now obsolete. 1.835 + 1.836 + if (ulen == 0) { 1.837 + continue; 1.838 + } 1.839 + 1.840 + // remove a U+FEFF Unicode signature character if requested 1.841 + if (sig < 0) { 1.842 + if (u.charAt(0) == uSig) { 1.843 + u.remove(0, 1); 1.844 + 1.845 + // account for the removed UChar and offset 1.846 + --ulen; 1.847 + 1.848 + if (useOffsets) { 1.849 + // remove an offset from fromoffsets[] as well 1.850 + // to keep the array parallel with the UChars 1.851 + memmove(fromoffsets, fromoffsets + 1, ulen * 4); 1.852 + } 1.853 + 1.854 + } 1.855 + sig = 0; 1.856 + } 1.857 + 1.858 +#if !UCONFIG_NO_TRANSLITERATION 1.859 + // Transliterate/transform if needed. 1.860 + 1.861 + // For transformation, we use chunking code - 1.862 + // collect Unicode input until, for example, an end-of-line, 1.863 + // then transform and output-convert that and continue collecting. 1.864 + // This makes the transformation result independent of the buffer size 1.865 + // while avoiding the slower keyboard mode. 1.866 + // The end-of-chunk characters are completely included in the 1.867 + // transformed string in case they are to be transformed themselves. 1.868 + if (t != NULL) { 1.869 + UnicodeString out; 1.870 + int32_t chunkLimit; 1.871 + 1.872 + do { 1.873 + chunkLimit = getChunkLimit(chunk, u); 1.874 + if (chunkLimit < 0 && flush && fromSawEndOfBytes) { 1.875 + // use all of the rest at the end of the text 1.876 + chunkLimit = u.length(); 1.877 + } 1.878 + if (chunkLimit >= 0) { 1.879 + // complete the chunk and transform it 1.880 + chunk.append(u, 0, chunkLimit); 1.881 + u.remove(0, chunkLimit); 1.882 + t->transliterate(chunk); 1.883 + 1.884 + // append the transformation result to the result and empty the chunk 1.885 + out.append(chunk); 1.886 + chunk.remove(); 1.887 + } else { 1.888 + // continue collecting the chunk 1.889 + chunk.append(u); 1.890 + break; 1.891 + } 1.892 + } while (!u.isEmpty()); 1.893 + 1.894 + u = out; 1.895 + ulen = u.length(); 1.896 + } 1.897 +#endif 1.898 + 1.899 + // add a U+FEFF Unicode signature character if requested 1.900 + // and possible/necessary 1.901 + if (sig > 0) { 1.902 + if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) { 1.903 + u.insert(0, (UChar)uSig); 1.904 + 1.905 + if (useOffsets) { 1.906 + // insert a pseudo-offset into fromoffsets[] as well 1.907 + // to keep the array parallel with the UChars 1.908 + memmove(fromoffsets + 1, fromoffsets, ulen * 4); 1.909 + fromoffsets[0] = -1; 1.910 + } 1.911 + 1.912 + // account for the additional UChar and offset 1.913 + ++ulen; 1.914 + } 1.915 + sig = 0; 1.916 + } 1.917 + 1.918 + // Convert the Unicode buffer into the destination codepage 1.919 + // Again 'bufp' will be placed behind the last converted character 1.920 + // And 'unibufp' will be placed behind the last converted unicode character 1.921 + // At the last conversion flush should be set to true to ensure that 1.922 + // all characters left get converted 1.923 + 1.924 + unibuf = unibufbp = u.getBuffer(); 1.925 + 1.926 + do { 1.927 + bufp = outbuf; 1.928 + 1.929 + // Use fromSawEndOfBytes in addition to the flush flag - 1.930 + // it indicates whether the intermediate Unicode string 1.931 + // contains the very last UChars for the very last input bytes. 1.932 + ucnv_fromUnicode(convto, &bufp, outbuf + bufsz, 1.933 + &unibufbp, 1.934 + unibuf + ulen, 1.935 + NULL, (UBool)(flush && fromSawEndOfBytes), &err); 1.936 + 1.937 + // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done 1.938 + // converting all of the intermediate UChars. 1.939 + // See comment for fromSawEndOfBytes. 1.940 + toSawEndOfUnicode = (UBool)U_SUCCESS(err); 1.941 + 1.942 + if (err == U_BUFFER_OVERFLOW_ERROR) { 1.943 + err = U_ZERO_ERROR; 1.944 + } else if (U_FAILURE(err)) { 1.945 + UChar errorUChars[4]; 1.946 + const char *errtag; 1.947 + char pos[32]; 1.948 + UChar32 c; 1.949 + int8_t i, length, errorLength; 1.950 + 1.951 + UErrorCode localError = U_ZERO_ERROR; 1.952 + errorLength = (int8_t)LENGTHOF(errorUChars); 1.953 + ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError); 1.954 + if (U_FAILURE(localError) || errorLength == 0) { 1.955 + // need at least 1 so that we don't access beyond the length of fromoffsets[] 1.956 + errorLength = 1; 1.957 + } 1.958 + 1.959 + int32_t ferroffset; 1.960 + 1.961 + if (useOffsets) { 1.962 + // Unicode buffer offset of the start of the error UChars 1.963 + ferroffset = (int32_t)((unibufbp - unibuf) - errorLength); 1.964 + if (ferroffset < 0) { 1.965 + // approximation - the character started in the previous Unicode buffer 1.966 + ferroffset = 0; 1.967 + } 1.968 + 1.969 + // get the corresponding byte offset out of fromoffsets[] 1.970 + // go back if the offset is not known for some of the UChars 1.971 + int32_t fromoffset; 1.972 + do { 1.973 + fromoffset = fromoffsets[ferroffset]; 1.974 + } while (fromoffset < 0 && --ferroffset >= 0); 1.975 + 1.976 + // total input file offset = 1.977 + // input file offset of the current byte buffer + 1.978 + // byte buffer offset of where the current Unicode buffer is converted from + 1.979 + // fromoffsets[Unicode offset] 1.980 + ferroffset = infoffset + (prevbufp - buf) + fromoffset; 1.981 + errtag = "problemCvtFromU"; 1.982 + } else { 1.983 + // Do not use fromoffsets if (t != NULL) because the Unicode text may 1.984 + // be different from what the offsets refer to. 1.985 + 1.986 + // output file offset 1.987 + ferroffset = (int32_t)(outfoffset + (bufp - outbuf)); 1.988 + errtag = "problemCvtFromUOut"; 1.989 + } 1.990 + 1.991 + length = (int8_t)sprintf(pos, "%u", (int)ferroffset); 1.992 + 1.993 + // output the code points that caused the error 1.994 + UnicodeString str; 1.995 + for (i = 0; i < errorLength;) { 1.996 + if (i > 0) { 1.997 + str.append((UChar)uSP); 1.998 + } 1.999 + U16_NEXT(errorUChars, i, errorLength, c); 1.1000 + if (c >= 0x100000) { 1.1001 + str.append(nibbleToHex((uint8_t)(c >> 20))); 1.1002 + } 1.1003 + if (c >= 0x10000) { 1.1004 + str.append(nibbleToHex((uint8_t)(c >> 16))); 1.1005 + } 1.1006 + str.append(nibbleToHex((uint8_t)(c >> 12))); 1.1007 + str.append(nibbleToHex((uint8_t)(c >> 8))); 1.1008 + str.append(nibbleToHex((uint8_t)(c >> 4))); 1.1009 + str.append(nibbleToHex((uint8_t)c)); 1.1010 + } 1.1011 + 1.1012 + initMsg(pname); 1.1013 + u_wmsg(stderr, errtag, 1.1014 + UnicodeString(pos, length, "").getTerminatedBuffer(), 1.1015 + str.getTerminatedBuffer(), 1.1016 + u_wmsg_errorName(err)); 1.1017 + u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer()); 1.1018 + 1.1019 + willexit = TRUE; 1.1020 + err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */ 1.1021 + } 1.1022 + 1.1023 + // Replaced a check for whether the intermediate Unicode characters were all consumed by 1.1024 + // looping until they are; message key "premEnd" now obsolete. 1.1025 + 1.1026 + // Finally, write the converted buffer to the output file 1.1027 + size_t outlen = (size_t) (bufp - outbuf); 1.1028 + outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile)); 1.1029 + if (wr != outlen) { 1.1030 + UnicodeString str(strerror(errno)); 1.1031 + initMsg(pname); 1.1032 + u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer()); 1.1033 + willexit = TRUE; 1.1034 + } 1.1035 + 1.1036 + if (willexit) { 1.1037 + goto error_exit; 1.1038 + } 1.1039 + } while (!toSawEndOfUnicode); 1.1040 + } while (!fromSawEndOfBytes); 1.1041 + } while (!flush); // Stop when we have flushed the 1.1042 + // converters (this means that it's 1.1043 + // the end of output) 1.1044 + 1.1045 + goto normal_exit; 1.1046 + 1.1047 +error_exit: 1.1048 + ret = FALSE; 1.1049 + 1.1050 +normal_exit: 1.1051 + // Cleanup. 1.1052 + 1.1053 + ucnv_close(convfrom); 1.1054 + ucnv_close(convto); 1.1055 + 1.1056 +#if !UCONFIG_NO_TRANSLITERATION 1.1057 + delete t; 1.1058 +#endif 1.1059 + 1.1060 + if (closeFile) { 1.1061 + fclose(infile); 1.1062 + } 1.1063 + 1.1064 + return ret; 1.1065 +} 1.1066 + 1.1067 +static void usage(const char *pname, int ecode) { 1.1068 + const UChar *msg; 1.1069 + int32_t msgLen; 1.1070 + UErrorCode err = U_ZERO_ERROR; 1.1071 + FILE *fp = ecode ? stderr : stdout; 1.1072 + int res; 1.1073 + 1.1074 + initMsg(pname); 1.1075 + msg = 1.1076 + ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord", 1.1077 + &msgLen, &err); 1.1078 + UnicodeString upname(pname, (int32_t)(uprv_strlen(pname) + 1)); 1.1079 + UnicodeString mname(msg, msgLen + 1); 1.1080 + 1.1081 + res = u_wmsg(fp, "usage", mname.getBuffer(), upname.getBuffer()); 1.1082 + if (!ecode) { 1.1083 + if (!res) { 1.1084 + fputc('\n', fp); 1.1085 + } 1.1086 + if (!u_wmsg(fp, "help")) { 1.1087 + /* Now dump callbacks and finish. */ 1.1088 + 1.1089 + int i, count = 1.1090 + sizeof(transcode_callbacks) / sizeof(*transcode_callbacks); 1.1091 + for (i = 0; i < count; ++i) { 1.1092 + fprintf(fp, " %s", transcode_callbacks[i].name); 1.1093 + } 1.1094 + fputc('\n', fp); 1.1095 + } 1.1096 + } 1.1097 + 1.1098 + exit(ecode); 1.1099 +} 1.1100 + 1.1101 +extern int 1.1102 +main(int argc, char **argv) 1.1103 +{ 1.1104 + FILE *outfile; 1.1105 + int ret = 0; 1.1106 + 1.1107 + size_t bufsz = DEFAULT_BUFSZ; 1.1108 + 1.1109 + const char *fromcpage = 0; 1.1110 + const char *tocpage = 0; 1.1111 + const char *translit = 0; 1.1112 + const char *outfilestr = 0; 1.1113 + UBool fallback = FALSE; 1.1114 + 1.1115 + UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP; 1.1116 + const void *fromuctxt = 0; 1.1117 + UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP; 1.1118 + const void *touctxt = 0; 1.1119 + 1.1120 + char **iter, **remainArgv, **remainArgvLimit; 1.1121 + char **end = argv + argc; 1.1122 + 1.1123 + const char *pname; 1.1124 + 1.1125 + UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE; 1.1126 + const char *printName = 0; 1.1127 + 1.1128 + UBool verbose = FALSE; 1.1129 + UErrorCode status = U_ZERO_ERROR; 1.1130 + 1.1131 + ConvertFile cf; 1.1132 + 1.1133 + /* Initialize ICU */ 1.1134 + u_init(&status); 1.1135 + if (U_FAILURE(status)) { 1.1136 + fprintf(stderr, "%s: can not initialize ICU. status = %s\n", 1.1137 + argv[0], u_errorName(status)); 1.1138 + exit(1); 1.1139 + } 1.1140 + 1.1141 + // Get and prettify pname. 1.1142 + pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR); 1.1143 +#if U_PLATFORM_USES_ONLY_WIN32_API 1.1144 + if (!pname) { 1.1145 + pname = uprv_strrchr(*argv, '/'); 1.1146 + } 1.1147 +#endif 1.1148 + if (!pname) { 1.1149 + pname = *argv; 1.1150 + } else { 1.1151 + ++pname; 1.1152 + } 1.1153 + 1.1154 + // First, get the arguments from command-line 1.1155 + // to know the codepages to convert between 1.1156 + 1.1157 + remainArgv = remainArgvLimit = argv + 1; 1.1158 + for (iter = argv + 1; iter != end; iter++) { 1.1159 + // Check for from charset 1.1160 + if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) { 1.1161 + iter++; 1.1162 + if (iter != end) 1.1163 + fromcpage = *iter; 1.1164 + else 1.1165 + usage(pname, 1); 1.1166 + } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) { 1.1167 + iter++; 1.1168 + if (iter != end) 1.1169 + tocpage = *iter; 1.1170 + else 1.1171 + usage(pname, 1); 1.1172 + } else if (strcmp("-x", *iter) == 0) { 1.1173 + iter++; 1.1174 + if (iter != end) 1.1175 + translit = *iter; 1.1176 + else 1.1177 + usage(pname, 1); 1.1178 + } else if (!strcmp("--fallback", *iter)) { 1.1179 + fallback = TRUE; 1.1180 + } else if (!strcmp("--no-fallback", *iter)) { 1.1181 + fallback = FALSE; 1.1182 + } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) { 1.1183 + iter++; 1.1184 + if (iter != end) { 1.1185 + bufsz = atoi(*iter); 1.1186 + if ((int) bufsz <= 0) { 1.1187 + initMsg(pname); 1.1188 + UnicodeString str(*iter); 1.1189 + initMsg(pname); 1.1190 + u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer()); 1.1191 + return 3; 1.1192 + } 1.1193 + } else { 1.1194 + usage(pname, 1); 1.1195 + } 1.1196 + } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) { 1.1197 + if (printTranslits) { 1.1198 + usage(pname, 1); 1.1199 + } 1.1200 + printConvs = TRUE; 1.1201 + } else if (strcmp("--default-code", *iter) == 0) { 1.1202 + if (printTranslits) { 1.1203 + usage(pname, 1); 1.1204 + } 1.1205 + printName = ucnv_getDefaultName(); 1.1206 + } else if (strcmp("--list-code", *iter) == 0) { 1.1207 + if (printTranslits) { 1.1208 + usage(pname, 1); 1.1209 + } 1.1210 + 1.1211 + iter++; 1.1212 + if (iter != end) { 1.1213 + UErrorCode e = U_ZERO_ERROR; 1.1214 + printName = ucnv_getAlias(*iter, 0, &e); 1.1215 + if (U_FAILURE(e) || !printName) { 1.1216 + UnicodeString str(*iter); 1.1217 + initMsg(pname); 1.1218 + u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer()); 1.1219 + return 2; 1.1220 + } 1.1221 + } else 1.1222 + usage(pname, 1); 1.1223 + } else if (strcmp("--canon", *iter) == 0) { 1.1224 + printCanon = TRUE; 1.1225 + } else if (strcmp("-L", *iter) == 0 1.1226 + || !strcmp("--list-transliterators", *iter)) { 1.1227 + if (printConvs) { 1.1228 + usage(pname, 1); 1.1229 + } 1.1230 + printTranslits = TRUE; 1.1231 + } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter) 1.1232 + || !strcmp("--help", *iter)) { 1.1233 + usage(pname, 0); 1.1234 + } else if (!strcmp("-c", *iter)) { 1.1235 + fromucallback = UCNV_FROM_U_CALLBACK_SKIP; 1.1236 + } else if (!strcmp("--to-callback", *iter)) { 1.1237 + iter++; 1.1238 + if (iter != end) { 1.1239 + const struct callback_ent *cbe = findCallback(*iter); 1.1240 + if (cbe) { 1.1241 + fromucallback = cbe->fromu; 1.1242 + fromuctxt = cbe->fromuctxt; 1.1243 + } else { 1.1244 + UnicodeString str(*iter); 1.1245 + initMsg(pname); 1.1246 + u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); 1.1247 + return 4; 1.1248 + } 1.1249 + } else { 1.1250 + usage(pname, 1); 1.1251 + } 1.1252 + } else if (!strcmp("--from-callback", *iter)) { 1.1253 + iter++; 1.1254 + if (iter != end) { 1.1255 + const struct callback_ent *cbe = findCallback(*iter); 1.1256 + if (cbe) { 1.1257 + toucallback = cbe->tou; 1.1258 + touctxt = cbe->touctxt; 1.1259 + } else { 1.1260 + UnicodeString str(*iter); 1.1261 + initMsg(pname); 1.1262 + u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); 1.1263 + return 4; 1.1264 + } 1.1265 + } else { 1.1266 + usage(pname, 1); 1.1267 + } 1.1268 + } else if (!strcmp("-i", *iter)) { 1.1269 + toucallback = UCNV_TO_U_CALLBACK_SKIP; 1.1270 + } else if (!strcmp("--callback", *iter)) { 1.1271 + iter++; 1.1272 + if (iter != end) { 1.1273 + const struct callback_ent *cbe = findCallback(*iter); 1.1274 + if (cbe) { 1.1275 + fromucallback = cbe->fromu; 1.1276 + fromuctxt = cbe->fromuctxt; 1.1277 + toucallback = cbe->tou; 1.1278 + touctxt = cbe->touctxt; 1.1279 + } else { 1.1280 + UnicodeString str(*iter); 1.1281 + initMsg(pname); 1.1282 + u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer()); 1.1283 + return 4; 1.1284 + } 1.1285 + } else { 1.1286 + usage(pname, 1); 1.1287 + } 1.1288 + } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) { 1.1289 + verbose = FALSE; 1.1290 + } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) { 1.1291 + verbose = TRUE; 1.1292 + } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) { 1.1293 + printf("%s v2.1 ICU " U_ICU_VERSION "\n", pname); 1.1294 + return 0; 1.1295 + } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) { 1.1296 + ++iter; 1.1297 + if (iter != end && !outfilestr) { 1.1298 + outfilestr = *iter; 1.1299 + } else { 1.1300 + usage(pname, 1); 1.1301 + } 1.1302 + } else if (0 == strcmp("--add-signature", *iter)) { 1.1303 + cf.signature = 1; 1.1304 + } else if (0 == strcmp("--remove-signature", *iter)) { 1.1305 + cf.signature = -1; 1.1306 + } else if (**iter == '-' && (*iter)[1]) { 1.1307 + usage(pname, 1); 1.1308 + } else { 1.1309 + // move a non-option up in argv[] 1.1310 + *remainArgvLimit++ = *iter; 1.1311 + } 1.1312 + } 1.1313 + 1.1314 + if (printConvs || printName) { 1.1315 + return printConverters(pname, printName, printCanon) ? 2 : 0; 1.1316 + } else if (printTranslits) { 1.1317 + return printTransliterators(printCanon) ? 3 : 0; 1.1318 + } 1.1319 + 1.1320 + if (!fromcpage || !uprv_strcmp(fromcpage, "-")) { 1.1321 + fromcpage = ucnv_getDefaultName(); 1.1322 + } 1.1323 + if (!tocpage || !uprv_strcmp(tocpage, "-")) { 1.1324 + tocpage = ucnv_getDefaultName(); 1.1325 + } 1.1326 + 1.1327 + // Open the correct output file or connect to stdout for reading input 1.1328 + if (outfilestr != 0 && strcmp(outfilestr, "-")) { 1.1329 + outfile = fopen(outfilestr, "wb"); 1.1330 + if (outfile == 0) { 1.1331 + UnicodeString str1(outfilestr, ""); 1.1332 + UnicodeString str2(strerror(errno), ""); 1.1333 + initMsg(pname); 1.1334 + u_wmsg(stderr, "cantCreateOutputF", 1.1335 + str1.getBuffer(), str2.getBuffer()); 1.1336 + return 1; 1.1337 + } 1.1338 + } else { 1.1339 + outfilestr = "-"; 1.1340 + outfile = stdout; 1.1341 +#ifdef USE_FILENO_BINARY_MODE 1.1342 + if (setmode(fileno(outfile), O_BINARY) == -1) { 1.1343 + u_wmsg(stderr, "cantSetOutBinMode"); 1.1344 + exit(-1); 1.1345 + } 1.1346 +#endif 1.1347 + } 1.1348 + 1.1349 + /* Loop again on the arguments to find all the input files, and 1.1350 + convert them. */ 1.1351 + 1.1352 + cf.setBufferSize(bufsz); 1.1353 + 1.1354 + if(remainArgv < remainArgvLimit) { 1.1355 + for (iter = remainArgv; iter != remainArgvLimit; iter++) { 1.1356 + if (!cf.convertFile( 1.1357 + pname, fromcpage, toucallback, touctxt, tocpage, 1.1358 + fromucallback, fromuctxt, fallback, translit, *iter, 1.1359 + outfile, verbose) 1.1360 + ) { 1.1361 + goto error_exit; 1.1362 + } 1.1363 + } 1.1364 + } else { 1.1365 + if (!cf.convertFile( 1.1366 + pname, fromcpage, toucallback, touctxt, tocpage, 1.1367 + fromucallback, fromuctxt, fallback, translit, 0, 1.1368 + outfile, verbose) 1.1369 + ) { 1.1370 + goto error_exit; 1.1371 + } 1.1372 + } 1.1373 + 1.1374 + goto normal_exit; 1.1375 +error_exit: 1.1376 +#if !UCONFIG_NO_LEGACY_CONVERSION 1.1377 + ret = 1; 1.1378 +#else 1.1379 + fprintf(stderr, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n"); 1.1380 +#endif 1.1381 +normal_exit: 1.1382 + 1.1383 + if (outfile != stdout) { 1.1384 + fclose(outfile); 1.1385 + } 1.1386 + 1.1387 + u_cleanup(); 1.1388 + 1.1389 + return ret; 1.1390 +} 1.1391 + 1.1392 + 1.1393 +/* 1.1394 + * Hey, Emacs, please set the following: 1.1395 + * 1.1396 + * Local Variables: 1.1397 + * indent-tabs-mode: nil 1.1398 + * End: 1.1399 + * 1.1400 + */