intl/icu/source/extra/uconv/uconv.cpp

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double-key logic when private browsing mode is
active or the privacy.thirdparty.isolate preference is set, and
implement it in GetCookieStringCommon and FindCookie, where it counts.
Some reservations remain about how to convince FindCookie users to test
the condition and pass a nullptr when disabling double-key logic.

michael@0 1 /*****************************************************************************
michael@0 2 *
michael@0 3 * Copyright (C) 1999-2013, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 *
michael@0 6 ******************************************************************************/
michael@0 7
michael@0 8 /*
michael@0 9 * uconv(1): an iconv(1)-like converter using ICU.
michael@0 10 *
michael@0 11 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se>
michael@0 12 * contributed in 1999.
michael@0 13 *
michael@0 14 * Conversion to the C conversion API and many improvements by
michael@0 15 * Yves Arrouye <yves@realnames.com>, current maintainer.
michael@0 16 *
michael@0 17 * Markus Scherer maintainer from 2003.
michael@0 18 * See source code repository history for changes.
michael@0 19 */
michael@0 20
michael@0 21 #include <unicode/utypes.h>
michael@0 22 #include <unicode/putil.h>
michael@0 23 #include <unicode/ucnv.h>
michael@0 24 #include <unicode/uenum.h>
michael@0 25 #include <unicode/unistr.h>
michael@0 26 #include <unicode/translit.h>
michael@0 27 #include <unicode/uset.h>
michael@0 28 #include <unicode/uclean.h>
michael@0 29 #include <unicode/utf16.h>
michael@0 30
michael@0 31 #include <stdio.h>
michael@0 32 #include <errno.h>
michael@0 33 #include <string.h>
michael@0 34 #include <stdlib.h>
michael@0 35
michael@0 36 #include "cmemory.h"
michael@0 37 #include "cstring.h"
michael@0 38 #include "ustrfmt.h"
michael@0 39
michael@0 40 #include "unicode/uwmsg.h"
michael@0 41
michael@0 42 U_NAMESPACE_USE
michael@0 43
michael@0 44 #if U_PLATFORM_USES_ONLY_WIN32_API && !defined(__STRICT_ANSI__)
michael@0 45 #include <io.h>
michael@0 46 #include <fcntl.h>
michael@0 47 #if U_PLATFORM_USES_ONLY_WIN32_API
michael@0 48 #define USE_FILENO_BINARY_MODE 1
michael@0 49 /* Windows likes to rename Unix-like functions */
michael@0 50 #ifndef fileno
michael@0 51 #define fileno _fileno
michael@0 52 #endif
michael@0 53 #ifndef setmode
michael@0 54 #define setmode _setmode
michael@0 55 #endif
michael@0 56 #ifndef O_BINARY
michael@0 57 #define O_BINARY _O_BINARY
michael@0 58 #endif
michael@0 59 #endif
michael@0 60 #endif
michael@0 61
michael@0 62 #ifdef UCONVMSG_LINK
michael@0 63 /* below from the README */
michael@0 64 #include "unicode/utypes.h"
michael@0 65 #include "unicode/udata.h"
michael@0 66 U_CFUNC char uconvmsg_dat[];
michael@0 67 #endif
michael@0 68
michael@0 69 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
michael@0 70
michael@0 71 #define DEFAULT_BUFSZ 4096
michael@0 72 #define UCONVMSG "uconvmsg"
michael@0 73
michael@0 74 static UResourceBundle *gBundle = 0; /* Bundle containing messages. */
michael@0 75
michael@0 76 /*
michael@0 77 * Initialize the message bundle so that message strings can be fetched
michael@0 78 * by u_wmsg().
michael@0 79 *
michael@0 80 */
michael@0 81
michael@0 82 static void initMsg(const char *pname) {
michael@0 83 static int ps = 0;
michael@0 84
michael@0 85 if (!ps) {
michael@0 86 char dataPath[2048]; /* XXX Sloppy: should be PATH_MAX. */
michael@0 87 UErrorCode err = U_ZERO_ERROR;
michael@0 88
michael@0 89 ps = 1;
michael@0 90
michael@0 91 /* Set up our static data - if any */
michael@0 92 #if defined(UCONVMSG_LINK) && U_PLATFORM != U_PF_OS390 /* On z/OS, this is failing. */
michael@0 93 udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err);
michael@0 94 if (U_FAILURE(err)) {
michael@0 95 fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
michael@0 96 pname, u_errorName(err));
michael@0 97 err = U_ZERO_ERROR; /* It may still fail */
michael@0 98 }
michael@0 99 #endif
michael@0 100
michael@0 101 /* Get messages. */
michael@0 102 gBundle = u_wmsg_setPath(UCONVMSG, &err);
michael@0 103 if (U_FAILURE(err)) {
michael@0 104 fprintf(stderr,
michael@0 105 "%s: warning: couldn't open bundle %s: %s\n",
michael@0 106 pname, UCONVMSG, u_errorName(err));
michael@0 107 #ifdef UCONVMSG_LINK
michael@0 108 fprintf(stderr,
michael@0 109 "%s: setAppData was called, internal data %s failed to load\n",
michael@0 110 pname, UCONVMSG);
michael@0 111 #endif
michael@0 112
michael@0 113 err = U_ZERO_ERROR;
michael@0 114 /* that was try #1, try again with a path */
michael@0 115 uprv_strcpy(dataPath, u_getDataDirectory());
michael@0 116 uprv_strcat(dataPath, U_FILE_SEP_STRING);
michael@0 117 uprv_strcat(dataPath, UCONVMSG);
michael@0 118
michael@0 119 gBundle = u_wmsg_setPath(dataPath, &err);
michael@0 120 if (U_FAILURE(err)) {
michael@0 121 fprintf(stderr,
michael@0 122 "%s: warning: still couldn't open bundle %s: %s\n",
michael@0 123 pname, dataPath, u_errorName(err));
michael@0 124 fprintf(stderr, "%s: warning: messages will not be displayed\n", pname);
michael@0 125 }
michael@0 126 }
michael@0 127 }
michael@0 128 }
michael@0 129
michael@0 130 /* Mapping of callback names to the callbacks passed to the converter
michael@0 131 API. */
michael@0 132
michael@0 133 static struct callback_ent {
michael@0 134 const char *name;
michael@0 135 UConverterFromUCallback fromu;
michael@0 136 const void *fromuctxt;
michael@0 137 UConverterToUCallback tou;
michael@0 138 const void *touctxt;
michael@0 139 } transcode_callbacks[] = {
michael@0 140 { "substitute",
michael@0 141 UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0,
michael@0 142 UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 },
michael@0 143 { "skip",
michael@0 144 UCNV_FROM_U_CALLBACK_SKIP, 0,
michael@0 145 UCNV_TO_U_CALLBACK_SKIP, 0 },
michael@0 146 { "stop",
michael@0 147 UCNV_FROM_U_CALLBACK_STOP, 0,
michael@0 148 UCNV_TO_U_CALLBACK_STOP, 0 },
michael@0 149 { "escape",
michael@0 150 UCNV_FROM_U_CALLBACK_ESCAPE, 0,
michael@0 151 UCNV_TO_U_CALLBACK_ESCAPE, 0},
michael@0 152 { "escape-icu",
michael@0 153 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU,
michael@0 154 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU },
michael@0 155 { "escape-java",
michael@0 156 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA,
michael@0 157 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA },
michael@0 158 { "escape-c",
michael@0 159 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C,
michael@0 160 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C },
michael@0 161 { "escape-xml",
michael@0 162 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
michael@0 163 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
michael@0 164 { "escape-xml-hex",
michael@0 165 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
michael@0 166 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
michael@0 167 { "escape-xml-dec",
michael@0 168 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC,
michael@0 169 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC },
michael@0 170 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE,
michael@0 171 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE }
michael@0 172 };
michael@0 173
michael@0 174 /* Return a pointer to a callback record given its name. */
michael@0 175
michael@0 176 static const struct callback_ent *findCallback(const char *name) {
michael@0 177 int i, count =
michael@0 178 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
michael@0 179
michael@0 180 /* We'll do a linear search, there aren't many of them and bsearch()
michael@0 181 may not be that portable. */
michael@0 182
michael@0 183 for (i = 0; i < count; ++i) {
michael@0 184 if (!uprv_stricmp(name, transcode_callbacks[i].name)) {
michael@0 185 return &transcode_callbacks[i];
michael@0 186 }
michael@0 187 }
michael@0 188
michael@0 189 return 0;
michael@0 190 }
michael@0 191
michael@0 192 /* Print converter information. If lookfor is set, only that converter will
michael@0 193 be printed, otherwise all converters will be printed. If canon is non
michael@0 194 zero, tags and aliases for each converter are printed too, in the format
michael@0 195 expected for convrters.txt(5). */
michael@0 196
michael@0 197 static int printConverters(const char *pname, const char *lookfor,
michael@0 198 UBool canon)
michael@0 199 {
michael@0 200 UErrorCode err = U_ZERO_ERROR;
michael@0 201 int32_t num;
michael@0 202 uint16_t num_stds;
michael@0 203 const char **stds;
michael@0 204
michael@0 205 /* If there is a specified name, just handle that now. */
michael@0 206
michael@0 207 if (lookfor) {
michael@0 208 if (!canon) {
michael@0 209 printf("%s\n", lookfor);
michael@0 210 return 0;
michael@0 211 } else {
michael@0 212 /* Because we are printing a canonical name, we need the
michael@0 213 true converter name. We've done that already except for
michael@0 214 the default name (because we want to print the exact
michael@0 215 name one would get when calling ucnv_getDefaultName()
michael@0 216 in non-canon mode). But since we do not know at this
michael@0 217 point if we have the default name or something else, we
michael@0 218 need to normalize again to the canonical converter
michael@0 219 name. */
michael@0 220
michael@0 221 const char *truename = ucnv_getAlias(lookfor, 0, &err);
michael@0 222 if (U_SUCCESS(err)) {
michael@0 223 lookfor = truename;
michael@0 224 } else {
michael@0 225 err = U_ZERO_ERROR;
michael@0 226 }
michael@0 227 }
michael@0 228 }
michael@0 229
michael@0 230 /* Print converter names. We come here for one of two reasons: we
michael@0 231 are printing all the names (lookfor was null), or we have a
michael@0 232 single converter to print but in canon mode, hence we need to
michael@0 233 get to it in order to print everything. */
michael@0 234
michael@0 235 num = ucnv_countAvailable();
michael@0 236 if (num <= 0) {
michael@0 237 initMsg(pname);
michael@0 238 u_wmsg(stderr, "cantGetNames");
michael@0 239 return -1;
michael@0 240 }
michael@0 241 if (lookfor) {
michael@0 242 num = 1; /* We know where we want to be. */
michael@0 243 }
michael@0 244
michael@0 245 num_stds = ucnv_countStandards();
michael@0 246 stds = (const char **) uprv_malloc(num_stds * sizeof(*stds));
michael@0 247 if (!stds) {
michael@0 248 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR));
michael@0 249 return -1;
michael@0 250 } else {
michael@0 251 uint16_t s;
michael@0 252
michael@0 253 if (canon) {
michael@0 254 printf("{ ");
michael@0 255 }
michael@0 256 for (s = 0; s < num_stds; ++s) {
michael@0 257 stds[s] = ucnv_getStandard(s, &err);
michael@0 258 if (canon) {
michael@0 259 printf("%s ", stds[s]);
michael@0 260 }
michael@0 261 if (U_FAILURE(err)) {
michael@0 262 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err));
michael@0 263 goto error_cleanup;
michael@0 264 }
michael@0 265 }
michael@0 266 if (canon) {
michael@0 267 puts("}");
michael@0 268 }
michael@0 269 }
michael@0 270
michael@0 271 for (int32_t i = 0; i < num; i++) {
michael@0 272 const char *name;
michael@0 273 uint16_t num_aliases;
michael@0 274
michael@0 275 /* Set the name either to what we are looking for, or
michael@0 276 to the current converter name. */
michael@0 277
michael@0 278 if (lookfor) {
michael@0 279 name = lookfor;
michael@0 280 } else {
michael@0 281 name = ucnv_getAvailableName(i);
michael@0 282 }
michael@0 283
michael@0 284 /* Get all the aliases associated to the name. */
michael@0 285
michael@0 286 err = U_ZERO_ERROR;
michael@0 287 num_aliases = ucnv_countAliases(name, &err);
michael@0 288 if (U_FAILURE(err)) {
michael@0 289 printf("%s", name);
michael@0 290
michael@0 291 UnicodeString str(name, "");
michael@0 292 putchar('\t');
michael@0 293 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
michael@0 294 u_wmsg_errorName(err));
michael@0 295 goto error_cleanup;
michael@0 296 } else {
michael@0 297 uint16_t a, s, t;
michael@0 298
michael@0 299 /* Write all the aliases and their tags. */
michael@0 300
michael@0 301 for (a = 0; a < num_aliases; ++a) {
michael@0 302 const char *alias = ucnv_getAlias(name, a, &err);
michael@0 303
michael@0 304 if (U_FAILURE(err)) {
michael@0 305 UnicodeString str(name, "");
michael@0 306 putchar('\t');
michael@0 307 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
michael@0 308 u_wmsg_errorName(err));
michael@0 309 goto error_cleanup;
michael@0 310 }
michael@0 311
michael@0 312 /* Print the current alias so that it looks right. */
michael@0 313 printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") ,
michael@0 314 alias,
michael@0 315 (canon ? "" : " "));
michael@0 316
michael@0 317 /* Look (slowly, linear searching) for a tag. */
michael@0 318
michael@0 319 if (canon) {
michael@0 320 /* -1 to skip the last standard */
michael@0 321 for (s = t = 0; s < num_stds-1; ++s) {
michael@0 322 UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err);
michael@0 323 if (U_SUCCESS(err)) {
michael@0 324 /* List the standard tags */
michael@0 325 const char *standardName;
michael@0 326 UBool isFirst = TRUE;
michael@0 327 UErrorCode enumError = U_ZERO_ERROR;
michael@0 328 while ((standardName = uenum_next(nameEnum, NULL, &enumError))) {
michael@0 329 /* See if this alias is supported by this standard. */
michael@0 330 if (!strcmp(standardName, alias)) {
michael@0 331 if (!t) {
michael@0 332 printf(" {");
michael@0 333 t = 1;
michael@0 334 }
michael@0 335 /* Print a * after the default standard name */
michael@0 336 printf(" %s%s", stds[s], (isFirst ? "*" : ""));
michael@0 337 }
michael@0 338 isFirst = FALSE;
michael@0 339 }
michael@0 340 }
michael@0 341 }
michael@0 342 if (t) {
michael@0 343 printf(" }");
michael@0 344 }
michael@0 345 }
michael@0 346 /* Terminate this entry. */
michael@0 347 if (canon) {
michael@0 348 puts("");
michael@0 349 }
michael@0 350
michael@0 351 /* Move on. */
michael@0 352 }
michael@0 353 /* Terminate this entry. */
michael@0 354 if (!canon) {
michael@0 355 puts("");
michael@0 356 }
michael@0 357 }
michael@0 358 }
michael@0 359
michael@0 360 /* Free temporary data. */
michael@0 361
michael@0 362 uprv_free(stds);
michael@0 363
michael@0 364 /* Success. */
michael@0 365
michael@0 366 return 0;
michael@0 367 error_cleanup:
michael@0 368 uprv_free(stds);
michael@0 369 return -1;
michael@0 370 }
michael@0 371
michael@0 372 /* Print all available transliterators. If canon is non zero, print
michael@0 373 one transliterator per line. */
michael@0 374
michael@0 375 static int printTransliterators(UBool canon)
michael@0 376 {
michael@0 377 #if UCONFIG_NO_TRANSLITERATION
michael@0 378 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
michael@0 379 return 1;
michael@0 380 #else
michael@0 381 UErrorCode status = U_ZERO_ERROR;
michael@0 382 UEnumeration *ids = utrans_openIDs(&status);
michael@0 383 int32_t i, numtrans = uenum_count(ids, &status);
michael@0 384
michael@0 385 char sepchar = canon ? '\n' : ' ';
michael@0 386
michael@0 387 for (i = 0; U_SUCCESS(status)&& (i < numtrans); ++i) {
michael@0 388 int32_t len;
michael@0 389 const char *nextTrans = uenum_next(ids, &len, &status);
michael@0 390
michael@0 391 printf("%s", nextTrans);
michael@0 392 if (i < numtrans - 1) {
michael@0 393 putchar(sepchar);
michael@0 394 }
michael@0 395 }
michael@0 396
michael@0 397 uenum_close(ids);
michael@0 398
michael@0 399 /* Add a terminating newline if needed. */
michael@0 400
michael@0 401 if (sepchar != '\n') {
michael@0 402 putchar('\n');
michael@0 403 }
michael@0 404
michael@0 405 /* Success. */
michael@0 406
michael@0 407 return 0;
michael@0 408 #endif
michael@0 409 }
michael@0 410
// Code points that get special treatment while converting:
// the space, the paragraph-end characters and the Unicode signature.
enum {
    uSP  = 0x0020,  // space
    uCR  = 0x000d,  // carriage return
    uLF  = 0x000a,  // line feed
    uNL  = 0x0085,  // newline
    uLS  = 0x2028,  // line separator
    uPS  = 0x2029,  // paragraph separator
    uSig = 0xfeff   // signature/BOM character
};
michael@0 420
michael@0 421 static inline int32_t
michael@0 422 getChunkLimit(const UnicodeString &prev, const UnicodeString &s) {
michael@0 423 // find one of
michael@0 424 // CR, LF, CRLF, NL, LS, PS
michael@0 425 // for paragraph ends (see UAX #13/Unicode 4)
michael@0 426 // and include it in the chunk
michael@0 427 // all of these characters are on the BMP
michael@0 428 // do not include FF or VT in case they are part of a paragraph
michael@0 429 // (important for bidi contexts)
michael@0 430 static const UChar paraEnds[] = {
michael@0 431 0xd, 0xa, 0x85, 0x2028, 0x2029
michael@0 432 };
michael@0 433 enum {
michael@0 434 iCR, iLF, iNL, iLS, iPS, iCount
michael@0 435 };
michael@0 436
michael@0 437 // first, see if there is a CRLF split between prev and s
michael@0 438 if (prev.endsWith(paraEnds + iCR, 1)) {
michael@0 439 if (s.startsWith(paraEnds + iLF, 1)) {
michael@0 440 return 1; // split CRLF, include the LF
michael@0 441 } else if (!s.isEmpty()) {
michael@0 442 return 0; // complete the last chunk
michael@0 443 } else {
michael@0 444 return -1; // wait for actual further contents to arrive
michael@0 445 }
michael@0 446 }
michael@0 447
michael@0 448 const UChar *u = s.getBuffer(), *limit = u + s.length();
michael@0 449 UChar c;
michael@0 450
michael@0 451 while (u < limit) {
michael@0 452 c = *u++;
michael@0 453 if (
michael@0 454 ((c < uSP) && (c == uCR || c == uLF)) ||
michael@0 455 (c == uNL) ||
michael@0 456 ((c & uLS) == uLS)
michael@0 457 ) {
michael@0 458 if (c == uCR) {
michael@0 459 // check for CRLF
michael@0 460 if (u == limit) {
michael@0 461 return -1; // LF may be in the next chunk
michael@0 462 } else if (*u == uLF) {
michael@0 463 ++u; // include the LF in this chunk
michael@0 464 }
michael@0 465 }
michael@0 466 return (int32_t)(u - s.getBuffer());
michael@0 467 }
michael@0 468 }
michael@0 469
michael@0 470 return -1; // continue collecting the chunk
michael@0 471 }
michael@0 472
// How a converter deals with the U+FEFF Unicode signature character (BOM).
enum {
    CNV_NO_FEFF,    // U+FEFF cannot be converted
    CNV_WITH_FEFF,  // U+FEFF can be converted
    CNV_ADDS_FEFF   // U+FEFF is added/detected automatically
};
michael@0 478
michael@0 479 static inline UChar
michael@0 480 nibbleToHex(uint8_t n) {
michael@0 481 n &= 0xf;
michael@0 482 return
michael@0 483 n <= 9 ?
michael@0 484 (UChar)(0x30 + n) :
michael@0 485 (UChar)((0x61 - 10) + n);
michael@0 486 }
michael@0 487
michael@0 488 // check the converter's Unicode signature properties;
michael@0 489 // the fromUnicode side of the converter must be in its initial state
michael@0 490 // and will be reset again if it was used
michael@0 491 static int32_t
michael@0 492 cnvSigType(UConverter *cnv) {
michael@0 493 UErrorCode err;
michael@0 494 int32_t result;
michael@0 495
michael@0 496 // test if the output charset can convert U+FEFF
michael@0 497 USet *set = uset_open(1, 0);
michael@0 498 err = U_ZERO_ERROR;
michael@0 499 ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err);
michael@0 500 if (U_SUCCESS(err) && uset_contains(set, uSig)) {
michael@0 501 result = CNV_WITH_FEFF;
michael@0 502 } else {
michael@0 503 result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted
michael@0 504 }
michael@0 505 uset_close(set);
michael@0 506
michael@0 507 if (result == CNV_WITH_FEFF) {
michael@0 508 // test if the output charset emits a signature anyway
michael@0 509 const UChar a[1] = { 0x61 }; // "a"
michael@0 510 const UChar *in;
michael@0 511
michael@0 512 char buffer[20];
michael@0 513 char *out;
michael@0 514
michael@0 515 in = a;
michael@0 516 out = buffer;
michael@0 517 err = U_ZERO_ERROR;
michael@0 518 ucnv_fromUnicode(cnv,
michael@0 519 &out, buffer + sizeof(buffer),
michael@0 520 &in, a + 1,
michael@0 521 NULL, TRUE, &err);
michael@0 522 ucnv_resetFromUnicode(cnv);
michael@0 523
michael@0 524 if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) &&
michael@0 525 U_SUCCESS(err)
michael@0 526 ) {
michael@0 527 result = CNV_ADDS_FEFF;
michael@0 528 }
michael@0 529 }
michael@0 530
michael@0 531 return result;
michael@0 532 }
michael@0 533
michael@0 534 class ConvertFile {
michael@0 535 public:
michael@0 536 ConvertFile() :
michael@0 537 buf(NULL), outbuf(NULL), fromoffsets(NULL),
michael@0 538 bufsz(0), signature(0) {}
michael@0 539
michael@0 540 void
michael@0 541 setBufferSize(size_t bufferSize) {
michael@0 542 bufsz = bufferSize;
michael@0 543
michael@0 544 buf = new char[2 * bufsz];
michael@0 545 outbuf = buf + bufsz;
michael@0 546
michael@0 547 // +1 for an added U+FEFF in the intermediate Unicode buffer
michael@0 548 fromoffsets = new int32_t[bufsz + 1];
michael@0 549 }
michael@0 550
michael@0 551 ~ConvertFile() {
michael@0 552 delete [] buf;
michael@0 553 delete [] fromoffsets;
michael@0 554 }
michael@0 555
michael@0 556 UBool convertFile(const char *pname,
michael@0 557 const char *fromcpage,
michael@0 558 UConverterToUCallback toucallback,
michael@0 559 const void *touctxt,
michael@0 560 const char *tocpage,
michael@0 561 UConverterFromUCallback fromucallback,
michael@0 562 const void *fromuctxt,
michael@0 563 UBool fallback,
michael@0 564 const char *translit,
michael@0 565 const char *infilestr,
michael@0 566 FILE * outfile, int verbose);
michael@0 567 private:
michael@0 568 friend int main(int argc, char **argv);
michael@0 569
michael@0 570 char *buf, *outbuf;
michael@0 571 int32_t *fromoffsets;
michael@0 572
michael@0 573 size_t bufsz;
michael@0 574 int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character
michael@0 575 };
michael@0 576
michael@0 577 // Convert a file from one encoding to another
michael@0 578 UBool
michael@0 579 ConvertFile::convertFile(const char *pname,
michael@0 580 const char *fromcpage,
michael@0 581 UConverterToUCallback toucallback,
michael@0 582 const void *touctxt,
michael@0 583 const char *tocpage,
michael@0 584 UConverterFromUCallback fromucallback,
michael@0 585 const void *fromuctxt,
michael@0 586 UBool fallback,
michael@0 587 const char *translit,
michael@0 588 const char *infilestr,
michael@0 589 FILE * outfile, int verbose)
michael@0 590 {
michael@0 591 FILE *infile;
michael@0 592 UBool ret = TRUE;
michael@0 593 UConverter *convfrom = 0;
michael@0 594 UConverter *convto = 0;
michael@0 595 UErrorCode err = U_ZERO_ERROR;
michael@0 596 UBool flush;
michael@0 597 UBool closeFile = FALSE;
michael@0 598 const char *cbufp, *prevbufp;
michael@0 599 char *bufp;
michael@0 600
michael@0 601 uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */
michael@0 602
michael@0 603 const UChar *unibuf, *unibufbp;
michael@0 604 UChar *unibufp;
michael@0 605
michael@0 606 size_t rd, wr;
michael@0 607
michael@0 608 #if !UCONFIG_NO_TRANSLITERATION
michael@0 609 Transliterator *t = 0; // Transliterator acting on Unicode data.
michael@0 610 UnicodeString chunk; // One chunk of the text being collected for transformation.
michael@0 611 #endif
michael@0 612 UnicodeString u; // String to do the transliteration.
michael@0 613 int32_t ulen;
michael@0 614
michael@0 615 // use conversion offsets for error messages
michael@0 616 // unless a transliterator is used -
michael@0 617 // a text transformation will reorder characters in unpredictable ways
michael@0 618 UBool useOffsets = TRUE;
michael@0 619
michael@0 620 // Open the correct input file or connect to stdin for reading input
michael@0 621
michael@0 622 if (infilestr != 0 && strcmp(infilestr, "-")) {
michael@0 623 infile = fopen(infilestr, "rb");
michael@0 624 if (infile == 0) {
michael@0 625 UnicodeString str1(infilestr, "");
michael@0 626 str1.append((UChar32) 0);
michael@0 627 UnicodeString str2(strerror(errno), "");
michael@0 628 str2.append((UChar32) 0);
michael@0 629 initMsg(pname);
michael@0 630 u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer());
michael@0 631 return FALSE;
michael@0 632 }
michael@0 633 closeFile = TRUE;
michael@0 634 } else {
michael@0 635 infilestr = "-";
michael@0 636 infile = stdin;
michael@0 637 #ifdef USE_FILENO_BINARY_MODE
michael@0 638 if (setmode(fileno(stdin), O_BINARY) == -1) {
michael@0 639 initMsg(pname);
michael@0 640 u_wmsg(stderr, "cantSetInBinMode");
michael@0 641 return FALSE;
michael@0 642 }
michael@0 643 #endif
michael@0 644 }
michael@0 645
michael@0 646 if (verbose) {
michael@0 647 fprintf(stderr, "%s:\n", infilestr);
michael@0 648 }
michael@0 649
michael@0 650 #if !UCONFIG_NO_TRANSLITERATION
michael@0 651 // Create transliterator as needed.
michael@0 652
michael@0 653 if (translit != NULL && *translit) {
michael@0 654 UParseError parse;
michael@0 655 UnicodeString str(translit), pestr;
michael@0 656
michael@0 657 /* Create from rules or by ID as needed. */
michael@0 658
michael@0 659 parse.line = -1;
michael@0 660
michael@0 661 if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
michael@0 662 t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err);
michael@0 663 } else {
michael@0 664 t = Transliterator::createInstance(translit, UTRANS_FORWARD, err);
michael@0 665 }
michael@0 666
michael@0 667 if (U_FAILURE(err)) {
michael@0 668 str.append((UChar32) 0);
michael@0 669 initMsg(pname);
michael@0 670
michael@0 671 if (parse.line >= 0) {
michael@0 672 UChar linebuf[20], offsetbuf[20];
michael@0 673 uprv_itou(linebuf, 20, parse.line, 10, 0);
michael@0 674 uprv_itou(offsetbuf, 20, parse.offset, 10, 0);
michael@0 675 u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(),
michael@0 676 u_wmsg_errorName(err), linebuf, offsetbuf);
michael@0 677 } else {
michael@0 678 u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(),
michael@0 679 u_wmsg_errorName(err));
michael@0 680 }
michael@0 681
michael@0 682 if (t) {
michael@0 683 delete t;
michael@0 684 t = 0;
michael@0 685 }
michael@0 686 goto error_exit;
michael@0 687 }
michael@0 688
michael@0 689 useOffsets = FALSE;
michael@0 690 }
michael@0 691 #endif
michael@0 692
michael@0 693 // Create codepage converter. If the codepage or its aliases weren't
michael@0 694 // available, it returns NULL and a failure code. We also set the
michael@0 695 // callbacks, and return errors in the same way.
michael@0 696
michael@0 697 convfrom = ucnv_open(fromcpage, &err);
michael@0 698 if (U_FAILURE(err)) {
michael@0 699 UnicodeString str(fromcpage, "");
michael@0 700 initMsg(pname);
michael@0 701 u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(),
michael@0 702 u_wmsg_errorName(err));
michael@0 703 goto error_exit;
michael@0 704 }
michael@0 705 ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err);
michael@0 706 if (U_FAILURE(err)) {
michael@0 707 initMsg(pname);
michael@0 708 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
michael@0 709 goto error_exit;
michael@0 710 }
michael@0 711
michael@0 712 convto = ucnv_open(tocpage, &err);
michael@0 713 if (U_FAILURE(err)) {
michael@0 714 UnicodeString str(tocpage, "");
michael@0 715 initMsg(pname);
michael@0 716 u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(),
michael@0 717 u_wmsg_errorName(err));
michael@0 718 goto error_exit;
michael@0 719 }
michael@0 720 ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err);
michael@0 721 if (U_FAILURE(err)) {
michael@0 722 initMsg(pname);
michael@0 723 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
michael@0 724 goto error_exit;
michael@0 725 }
michael@0 726 ucnv_setFallback(convto, fallback);
michael@0 727
michael@0 728 UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode;
michael@0 729 int8_t sig;
michael@0 730
michael@0 731 // OK, we can convert now.
michael@0 732 sig = signature;
michael@0 733 rd = 0;
michael@0 734
michael@0 735 do {
michael@0 736 willexit = FALSE;
michael@0 737
michael@0 738 // input file offset at the beginning of the next buffer
michael@0 739 infoffset += rd;
michael@0 740
michael@0 741 rd = fread(buf, 1, bufsz, infile);
michael@0 742 if (ferror(infile) != 0) {
michael@0 743 UnicodeString str(strerror(errno));
michael@0 744 initMsg(pname);
michael@0 745 u_wmsg(stderr, "cantRead", str.getTerminatedBuffer());
michael@0 746 goto error_exit;
michael@0 747 }
michael@0 748
michael@0 749 // Convert the read buffer into the new encoding via Unicode.
michael@0 750 // After the call 'unibufp' will be placed behind the last
michael@0 751 // character that was converted in the 'unibuf'.
michael@0 752 // Also the 'cbufp' is positioned behind the last converted
michael@0 753 // character.
michael@0 754 // At the last conversion in the file, flush should be set to
michael@0 755 // true so that we get all characters converted.
michael@0 756 //
michael@0 757 // The converter must be flushed at the end of conversion so
michael@0 758 // that characters on hold also will be written.
michael@0 759
michael@0 760 cbufp = buf;
michael@0 761 flush = (UBool)(rd != bufsz);
michael@0 762
michael@0 763 // convert until the input is consumed
michael@0 764 do {
michael@0 765 // remember the start of the current byte-to-Unicode conversion
michael@0 766 prevbufp = cbufp;
michael@0 767
michael@0 768 unibuf = unibufp = u.getBuffer((int32_t)bufsz);
michael@0 769
michael@0 770 // Use bufsz instead of u.getCapacity() for the targetLimit
michael@0 771 // so that we don't overflow fromoffsets[].
michael@0 772 ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp,
michael@0 773 buf + rd, useOffsets ? fromoffsets : NULL, flush, &err);
michael@0 774
michael@0 775 ulen = (int32_t)(unibufp - unibuf);
michael@0 776 u.releaseBuffer(U_SUCCESS(err) ? ulen : 0);
michael@0 777
michael@0 778 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
michael@0 779 // converting all of the input bytes.
michael@0 780 // It works like this because ucnv_toUnicode() returns only under the
michael@0 781 // following conditions:
michael@0 782 // - an error occurred during conversion (an error code is set)
michael@0 783 // - the target buffer is filled (the error code indicates an overflow)
michael@0 784 // - the source is consumed
michael@0 785 // That is, if the error code does not indicate a failure,
michael@0 786 // not even an overflow, then the source must be consumed entirely.
michael@0 787 fromSawEndOfBytes = (UBool)U_SUCCESS(err);
michael@0 788
michael@0 789 if (err == U_BUFFER_OVERFLOW_ERROR) {
michael@0 790 err = U_ZERO_ERROR;
michael@0 791 } else if (U_FAILURE(err)) {
michael@0 792 char pos[32], errorBytes[32];
michael@0 793 int8_t i, length, errorLength;
michael@0 794
michael@0 795 UErrorCode localError = U_ZERO_ERROR;
michael@0 796 errorLength = (int8_t)sizeof(errorBytes);
michael@0 797 ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError);
michael@0 798 if (U_FAILURE(localError) || errorLength == 0) {
michael@0 799 errorLength = 1;
michael@0 800 }
michael@0 801
michael@0 802 // print the input file offset of the start of the error bytes:
michael@0 803 // input file offset of the current byte buffer +
michael@0 804 // length of the just consumed bytes -
michael@0 805 // length of the error bytes
michael@0 806 length =
michael@0 807 (int8_t)sprintf(pos, "%d",
michael@0 808 (int)(infoffset + (cbufp - buf) - errorLength));
michael@0 809
michael@0 810 // output the bytes that caused the error
michael@0 811 UnicodeString str;
michael@0 812 for (i = 0; i < errorLength; ++i) {
michael@0 813 if (i > 0) {
michael@0 814 str.append((UChar)uSP);
michael@0 815 }
michael@0 816 str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4));
michael@0 817 str.append(nibbleToHex((uint8_t)errorBytes[i]));
michael@0 818 }
michael@0 819
michael@0 820 initMsg(pname);
michael@0 821 u_wmsg(stderr, "problemCvtToU",
michael@0 822 UnicodeString(pos, length, "").getTerminatedBuffer(),
michael@0 823 str.getTerminatedBuffer(),
michael@0 824 u_wmsg_errorName(err));
michael@0 825
michael@0 826 willexit = TRUE;
michael@0 827 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
michael@0 828 }
michael@0 829
michael@0 830 // Replaced a check for whether the input was consumed by
michael@0 831 // looping until it is; message key "premEndInput" now obsolete.
michael@0 832
michael@0 833 if (ulen == 0) {
michael@0 834 continue;
michael@0 835 }
michael@0 836
michael@0 837 // remove a U+FEFF Unicode signature character if requested
michael@0 838 if (sig < 0) {
michael@0 839 if (u.charAt(0) == uSig) {
michael@0 840 u.remove(0, 1);
michael@0 841
michael@0 842 // account for the removed UChar and offset
michael@0 843 --ulen;
michael@0 844
michael@0 845 if (useOffsets) {
michael@0 846 // remove an offset from fromoffsets[] as well
michael@0 847 // to keep the array parallel with the UChars
michael@0 848 memmove(fromoffsets, fromoffsets + 1, ulen * 4);
michael@0 849 }
michael@0 850
michael@0 851 }
michael@0 852 sig = 0;
michael@0 853 }
michael@0 854
michael@0 855 #if !UCONFIG_NO_TRANSLITERATION
michael@0 856 // Transliterate/transform if needed.
michael@0 857
michael@0 858 // For transformation, we use chunking code -
michael@0 859 // collect Unicode input until, for example, an end-of-line,
michael@0 860 // then transform and output-convert that and continue collecting.
michael@0 861 // This makes the transformation result independent of the buffer size
michael@0 862 // while avoiding the slower keyboard mode.
michael@0 863 // The end-of-chunk characters are completely included in the
michael@0 864 // transformed string in case they are to be transformed themselves.
michael@0 865 if (t != NULL) {
michael@0 866 UnicodeString out;
michael@0 867 int32_t chunkLimit;
michael@0 868
michael@0 869 do {
michael@0 870 chunkLimit = getChunkLimit(chunk, u);
michael@0 871 if (chunkLimit < 0 && flush && fromSawEndOfBytes) {
michael@0 872 // use all of the rest at the end of the text
michael@0 873 chunkLimit = u.length();
michael@0 874 }
michael@0 875 if (chunkLimit >= 0) {
michael@0 876 // complete the chunk and transform it
michael@0 877 chunk.append(u, 0, chunkLimit);
michael@0 878 u.remove(0, chunkLimit);
michael@0 879 t->transliterate(chunk);
michael@0 880
michael@0 881 // append the transformation result to the result and empty the chunk
michael@0 882 out.append(chunk);
michael@0 883 chunk.remove();
michael@0 884 } else {
michael@0 885 // continue collecting the chunk
michael@0 886 chunk.append(u);
michael@0 887 break;
michael@0 888 }
michael@0 889 } while (!u.isEmpty());
michael@0 890
michael@0 891 u = out;
michael@0 892 ulen = u.length();
michael@0 893 }
michael@0 894 #endif
michael@0 895
michael@0 896 // add a U+FEFF Unicode signature character if requested
michael@0 897 // and possible/necessary
michael@0 898 if (sig > 0) {
michael@0 899 if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) {
michael@0 900 u.insert(0, (UChar)uSig);
michael@0 901
michael@0 902 if (useOffsets) {
michael@0 903 // insert a pseudo-offset into fromoffsets[] as well
michael@0 904 // to keep the array parallel with the UChars
michael@0 905 memmove(fromoffsets + 1, fromoffsets, ulen * 4);
michael@0 906 fromoffsets[0] = -1;
michael@0 907 }
michael@0 908
michael@0 909 // account for the additional UChar and offset
michael@0 910 ++ulen;
michael@0 911 }
michael@0 912 sig = 0;
michael@0 913 }
michael@0 914
michael@0 915 // Convert the Unicode buffer into the destination codepage
michael@0 916 // Again 'bufp' will be placed behind the last converted character
michael@0 917 // And 'unibufp' will be placed behind the last converted unicode character
michael@0 918 // At the last conversion flush should be set to true to ensure that
michael@0 919 // all characters left get converted
michael@0 920
michael@0 921 unibuf = unibufbp = u.getBuffer();
michael@0 922
michael@0 923 do {
michael@0 924 bufp = outbuf;
michael@0 925
michael@0 926 // Use fromSawEndOfBytes in addition to the flush flag -
michael@0 927 // it indicates whether the intermediate Unicode string
michael@0 928 // contains the very last UChars for the very last input bytes.
michael@0 929 ucnv_fromUnicode(convto, &bufp, outbuf + bufsz,
michael@0 930 &unibufbp,
michael@0 931 unibuf + ulen,
michael@0 932 NULL, (UBool)(flush && fromSawEndOfBytes), &err);
michael@0 933
michael@0 934 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
michael@0 935 // converting all of the intermediate UChars.
michael@0 936 // See comment for fromSawEndOfBytes.
michael@0 937 toSawEndOfUnicode = (UBool)U_SUCCESS(err);
michael@0 938
michael@0 939 if (err == U_BUFFER_OVERFLOW_ERROR) {
michael@0 940 err = U_ZERO_ERROR;
michael@0 941 } else if (U_FAILURE(err)) {
michael@0 942 UChar errorUChars[4];
michael@0 943 const char *errtag;
michael@0 944 char pos[32];
michael@0 945 UChar32 c;
michael@0 946 int8_t i, length, errorLength;
michael@0 947
michael@0 948 UErrorCode localError = U_ZERO_ERROR;
michael@0 949 errorLength = (int8_t)LENGTHOF(errorUChars);
michael@0 950 ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError);
michael@0 951 if (U_FAILURE(localError) || errorLength == 0) {
michael@0 952 // need at least 1 so that we don't access beyond the length of fromoffsets[]
michael@0 953 errorLength = 1;
michael@0 954 }
michael@0 955
michael@0 956 int32_t ferroffset;
michael@0 957
michael@0 958 if (useOffsets) {
michael@0 959 // Unicode buffer offset of the start of the error UChars
michael@0 960 ferroffset = (int32_t)((unibufbp - unibuf) - errorLength);
michael@0 961 if (ferroffset < 0) {
michael@0 962 // approximation - the character started in the previous Unicode buffer
michael@0 963 ferroffset = 0;
michael@0 964 }
michael@0 965
michael@0 966 // get the corresponding byte offset out of fromoffsets[]
michael@0 967 // go back if the offset is not known for some of the UChars
michael@0 968 int32_t fromoffset;
michael@0 969 do {
michael@0 970 fromoffset = fromoffsets[ferroffset];
michael@0 971 } while (fromoffset < 0 && --ferroffset >= 0);
michael@0 972
michael@0 973 // total input file offset =
michael@0 974 // input file offset of the current byte buffer +
michael@0 975 // byte buffer offset of where the current Unicode buffer is converted from +
michael@0 976 // fromoffsets[Unicode offset]
michael@0 977 ferroffset = infoffset + (prevbufp - buf) + fromoffset;
michael@0 978 errtag = "problemCvtFromU";
michael@0 979 } else {
michael@0 980 // Do not use fromoffsets if (t != NULL) because the Unicode text may
michael@0 981 // be different from what the offsets refer to.
michael@0 982
michael@0 983 // output file offset
michael@0 984 ferroffset = (int32_t)(outfoffset + (bufp - outbuf));
michael@0 985 errtag = "problemCvtFromUOut";
michael@0 986 }
michael@0 987
michael@0 988 length = (int8_t)sprintf(pos, "%u", (int)ferroffset);
michael@0 989
michael@0 990 // output the code points that caused the error
michael@0 991 UnicodeString str;
michael@0 992 for (i = 0; i < errorLength;) {
michael@0 993 if (i > 0) {
michael@0 994 str.append((UChar)uSP);
michael@0 995 }
michael@0 996 U16_NEXT(errorUChars, i, errorLength, c);
michael@0 997 if (c >= 0x100000) {
michael@0 998 str.append(nibbleToHex((uint8_t)(c >> 20)));
michael@0 999 }
michael@0 1000 if (c >= 0x10000) {
michael@0 1001 str.append(nibbleToHex((uint8_t)(c >> 16)));
michael@0 1002 }
michael@0 1003 str.append(nibbleToHex((uint8_t)(c >> 12)));
michael@0 1004 str.append(nibbleToHex((uint8_t)(c >> 8)));
michael@0 1005 str.append(nibbleToHex((uint8_t)(c >> 4)));
michael@0 1006 str.append(nibbleToHex((uint8_t)c));
michael@0 1007 }
michael@0 1008
michael@0 1009 initMsg(pname);
michael@0 1010 u_wmsg(stderr, errtag,
michael@0 1011 UnicodeString(pos, length, "").getTerminatedBuffer(),
michael@0 1012 str.getTerminatedBuffer(),
michael@0 1013 u_wmsg_errorName(err));
michael@0 1014 u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer());
michael@0 1015
michael@0 1016 willexit = TRUE;
michael@0 1017 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
michael@0 1018 }
michael@0 1019
michael@0 1020 // Replaced a check for whether the intermediate Unicode characters were all consumed by
michael@0 1021 // looping until they are; message key "premEnd" now obsolete.
michael@0 1022
michael@0 1023 // Finally, write the converted buffer to the output file
michael@0 1024 size_t outlen = (size_t) (bufp - outbuf);
michael@0 1025 outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile));
michael@0 1026 if (wr != outlen) {
michael@0 1027 UnicodeString str(strerror(errno));
michael@0 1028 initMsg(pname);
michael@0 1029 u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer());
michael@0 1030 willexit = TRUE;
michael@0 1031 }
michael@0 1032
michael@0 1033 if (willexit) {
michael@0 1034 goto error_exit;
michael@0 1035 }
michael@0 1036 } while (!toSawEndOfUnicode);
michael@0 1037 } while (!fromSawEndOfBytes);
michael@0 1038 } while (!flush); // Stop when we have flushed the
michael@0 1039 // converters (this means that it's
michael@0 1040 // the end of output)
michael@0 1041
michael@0 1042 goto normal_exit;
michael@0 1043
michael@0 1044 error_exit:
michael@0 1045 ret = FALSE;
michael@0 1046
michael@0 1047 normal_exit:
michael@0 1048 // Cleanup.
michael@0 1049
michael@0 1050 ucnv_close(convfrom);
michael@0 1051 ucnv_close(convto);
michael@0 1052
michael@0 1053 #if !UCONFIG_NO_TRANSLITERATION
michael@0 1054 delete t;
michael@0 1055 #endif
michael@0 1056
michael@0 1057 if (closeFile) {
michael@0 1058 fclose(infile);
michael@0 1059 }
michael@0 1060
michael@0 1061 return ret;
michael@0 1062 }
michael@0 1063
michael@0 1064 static void usage(const char *pname, int ecode) {
michael@0 1065 const UChar *msg;
michael@0 1066 int32_t msgLen;
michael@0 1067 UErrorCode err = U_ZERO_ERROR;
michael@0 1068 FILE *fp = ecode ? stderr : stdout;
michael@0 1069 int res;
michael@0 1070
michael@0 1071 initMsg(pname);
michael@0 1072 msg =
michael@0 1073 ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord",
michael@0 1074 &msgLen, &err);
michael@0 1075 UnicodeString upname(pname, (int32_t)(uprv_strlen(pname) + 1));
michael@0 1076 UnicodeString mname(msg, msgLen + 1);
michael@0 1077
michael@0 1078 res = u_wmsg(fp, "usage", mname.getBuffer(), upname.getBuffer());
michael@0 1079 if (!ecode) {
michael@0 1080 if (!res) {
michael@0 1081 fputc('\n', fp);
michael@0 1082 }
michael@0 1083 if (!u_wmsg(fp, "help")) {
michael@0 1084 /* Now dump callbacks and finish. */
michael@0 1085
michael@0 1086 int i, count =
michael@0 1087 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
michael@0 1088 for (i = 0; i < count; ++i) {
michael@0 1089 fprintf(fp, " %s", transcode_callbacks[i].name);
michael@0 1090 }
michael@0 1091 fputc('\n', fp);
michael@0 1092 }
michael@0 1093 }
michael@0 1094
michael@0 1095 exit(ecode);
michael@0 1096 }
michael@0 1097
michael@0 1098 extern int
michael@0 1099 main(int argc, char **argv)
michael@0 1100 {
michael@0 1101 FILE *outfile;
michael@0 1102 int ret = 0;
michael@0 1103
michael@0 1104 size_t bufsz = DEFAULT_BUFSZ;
michael@0 1105
michael@0 1106 const char *fromcpage = 0;
michael@0 1107 const char *tocpage = 0;
michael@0 1108 const char *translit = 0;
michael@0 1109 const char *outfilestr = 0;
michael@0 1110 UBool fallback = FALSE;
michael@0 1111
michael@0 1112 UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP;
michael@0 1113 const void *fromuctxt = 0;
michael@0 1114 UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP;
michael@0 1115 const void *touctxt = 0;
michael@0 1116
michael@0 1117 char **iter, **remainArgv, **remainArgvLimit;
michael@0 1118 char **end = argv + argc;
michael@0 1119
michael@0 1120 const char *pname;
michael@0 1121
michael@0 1122 UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE;
michael@0 1123 const char *printName = 0;
michael@0 1124
michael@0 1125 UBool verbose = FALSE;
michael@0 1126 UErrorCode status = U_ZERO_ERROR;
michael@0 1127
michael@0 1128 ConvertFile cf;
michael@0 1129
michael@0 1130 /* Initialize ICU */
michael@0 1131 u_init(&status);
michael@0 1132 if (U_FAILURE(status)) {
michael@0 1133 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
michael@0 1134 argv[0], u_errorName(status));
michael@0 1135 exit(1);
michael@0 1136 }
michael@0 1137
michael@0 1138 // Get and prettify pname.
michael@0 1139 pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR);
michael@0 1140 #if U_PLATFORM_USES_ONLY_WIN32_API
michael@0 1141 if (!pname) {
michael@0 1142 pname = uprv_strrchr(*argv, '/');
michael@0 1143 }
michael@0 1144 #endif
michael@0 1145 if (!pname) {
michael@0 1146 pname = *argv;
michael@0 1147 } else {
michael@0 1148 ++pname;
michael@0 1149 }
michael@0 1150
michael@0 1151 // First, get the arguments from command-line
michael@0 1152 // to know the codepages to convert between
michael@0 1153
michael@0 1154 remainArgv = remainArgvLimit = argv + 1;
michael@0 1155 for (iter = argv + 1; iter != end; iter++) {
michael@0 1156 // Check for from charset
michael@0 1157 if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) {
michael@0 1158 iter++;
michael@0 1159 if (iter != end)
michael@0 1160 fromcpage = *iter;
michael@0 1161 else
michael@0 1162 usage(pname, 1);
michael@0 1163 } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) {
michael@0 1164 iter++;
michael@0 1165 if (iter != end)
michael@0 1166 tocpage = *iter;
michael@0 1167 else
michael@0 1168 usage(pname, 1);
michael@0 1169 } else if (strcmp("-x", *iter) == 0) {
michael@0 1170 iter++;
michael@0 1171 if (iter != end)
michael@0 1172 translit = *iter;
michael@0 1173 else
michael@0 1174 usage(pname, 1);
michael@0 1175 } else if (!strcmp("--fallback", *iter)) {
michael@0 1176 fallback = TRUE;
michael@0 1177 } else if (!strcmp("--no-fallback", *iter)) {
michael@0 1178 fallback = FALSE;
michael@0 1179 } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
michael@0 1180 iter++;
michael@0 1181 if (iter != end) {
michael@0 1182 bufsz = atoi(*iter);
michael@0 1183 if ((int) bufsz <= 0) {
michael@0 1184 initMsg(pname);
michael@0 1185 UnicodeString str(*iter);
michael@0 1186 initMsg(pname);
michael@0 1187 u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer());
michael@0 1188 return 3;
michael@0 1189 }
michael@0 1190 } else {
michael@0 1191 usage(pname, 1);
michael@0 1192 }
michael@0 1193 } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
michael@0 1194 if (printTranslits) {
michael@0 1195 usage(pname, 1);
michael@0 1196 }
michael@0 1197 printConvs = TRUE;
michael@0 1198 } else if (strcmp("--default-code", *iter) == 0) {
michael@0 1199 if (printTranslits) {
michael@0 1200 usage(pname, 1);
michael@0 1201 }
michael@0 1202 printName = ucnv_getDefaultName();
michael@0 1203 } else if (strcmp("--list-code", *iter) == 0) {
michael@0 1204 if (printTranslits) {
michael@0 1205 usage(pname, 1);
michael@0 1206 }
michael@0 1207
michael@0 1208 iter++;
michael@0 1209 if (iter != end) {
michael@0 1210 UErrorCode e = U_ZERO_ERROR;
michael@0 1211 printName = ucnv_getAlias(*iter, 0, &e);
michael@0 1212 if (U_FAILURE(e) || !printName) {
michael@0 1213 UnicodeString str(*iter);
michael@0 1214 initMsg(pname);
michael@0 1215 u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer());
michael@0 1216 return 2;
michael@0 1217 }
michael@0 1218 } else
michael@0 1219 usage(pname, 1);
michael@0 1220 } else if (strcmp("--canon", *iter) == 0) {
michael@0 1221 printCanon = TRUE;
michael@0 1222 } else if (strcmp("-L", *iter) == 0
michael@0 1223 || !strcmp("--list-transliterators", *iter)) {
michael@0 1224 if (printConvs) {
michael@0 1225 usage(pname, 1);
michael@0 1226 }
michael@0 1227 printTranslits = TRUE;
michael@0 1228 } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter)
michael@0 1229 || !strcmp("--help", *iter)) {
michael@0 1230 usage(pname, 0);
michael@0 1231 } else if (!strcmp("-c", *iter)) {
michael@0 1232 fromucallback = UCNV_FROM_U_CALLBACK_SKIP;
michael@0 1233 } else if (!strcmp("--to-callback", *iter)) {
michael@0 1234 iter++;
michael@0 1235 if (iter != end) {
michael@0 1236 const struct callback_ent *cbe = findCallback(*iter);
michael@0 1237 if (cbe) {
michael@0 1238 fromucallback = cbe->fromu;
michael@0 1239 fromuctxt = cbe->fromuctxt;
michael@0 1240 } else {
michael@0 1241 UnicodeString str(*iter);
michael@0 1242 initMsg(pname);
michael@0 1243 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
michael@0 1244 return 4;
michael@0 1245 }
michael@0 1246 } else {
michael@0 1247 usage(pname, 1);
michael@0 1248 }
michael@0 1249 } else if (!strcmp("--from-callback", *iter)) {
michael@0 1250 iter++;
michael@0 1251 if (iter != end) {
michael@0 1252 const struct callback_ent *cbe = findCallback(*iter);
michael@0 1253 if (cbe) {
michael@0 1254 toucallback = cbe->tou;
michael@0 1255 touctxt = cbe->touctxt;
michael@0 1256 } else {
michael@0 1257 UnicodeString str(*iter);
michael@0 1258 initMsg(pname);
michael@0 1259 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
michael@0 1260 return 4;
michael@0 1261 }
michael@0 1262 } else {
michael@0 1263 usage(pname, 1);
michael@0 1264 }
michael@0 1265 } else if (!strcmp("-i", *iter)) {
michael@0 1266 toucallback = UCNV_TO_U_CALLBACK_SKIP;
michael@0 1267 } else if (!strcmp("--callback", *iter)) {
michael@0 1268 iter++;
michael@0 1269 if (iter != end) {
michael@0 1270 const struct callback_ent *cbe = findCallback(*iter);
michael@0 1271 if (cbe) {
michael@0 1272 fromucallback = cbe->fromu;
michael@0 1273 fromuctxt = cbe->fromuctxt;
michael@0 1274 toucallback = cbe->tou;
michael@0 1275 touctxt = cbe->touctxt;
michael@0 1276 } else {
michael@0 1277 UnicodeString str(*iter);
michael@0 1278 initMsg(pname);
michael@0 1279 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
michael@0 1280 return 4;
michael@0 1281 }
michael@0 1282 } else {
michael@0 1283 usage(pname, 1);
michael@0 1284 }
michael@0 1285 } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
michael@0 1286 verbose = FALSE;
michael@0 1287 } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
michael@0 1288 verbose = TRUE;
michael@0 1289 } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
michael@0 1290 printf("%s v2.1 ICU " U_ICU_VERSION "\n", pname);
michael@0 1291 return 0;
michael@0 1292 } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
michael@0 1293 ++iter;
michael@0 1294 if (iter != end && !outfilestr) {
michael@0 1295 outfilestr = *iter;
michael@0 1296 } else {
michael@0 1297 usage(pname, 1);
michael@0 1298 }
michael@0 1299 } else if (0 == strcmp("--add-signature", *iter)) {
michael@0 1300 cf.signature = 1;
michael@0 1301 } else if (0 == strcmp("--remove-signature", *iter)) {
michael@0 1302 cf.signature = -1;
michael@0 1303 } else if (**iter == '-' && (*iter)[1]) {
michael@0 1304 usage(pname, 1);
michael@0 1305 } else {
michael@0 1306 // move a non-option up in argv[]
michael@0 1307 *remainArgvLimit++ = *iter;
michael@0 1308 }
michael@0 1309 }
michael@0 1310
michael@0 1311 if (printConvs || printName) {
michael@0 1312 return printConverters(pname, printName, printCanon) ? 2 : 0;
michael@0 1313 } else if (printTranslits) {
michael@0 1314 return printTransliterators(printCanon) ? 3 : 0;
michael@0 1315 }
michael@0 1316
michael@0 1317 if (!fromcpage || !uprv_strcmp(fromcpage, "-")) {
michael@0 1318 fromcpage = ucnv_getDefaultName();
michael@0 1319 }
michael@0 1320 if (!tocpage || !uprv_strcmp(tocpage, "-")) {
michael@0 1321 tocpage = ucnv_getDefaultName();
michael@0 1322 }
michael@0 1323
michael@0 1324 // Open the correct output file or connect to stdout for reading input
michael@0 1325 if (outfilestr != 0 && strcmp(outfilestr, "-")) {
michael@0 1326 outfile = fopen(outfilestr, "wb");
michael@0 1327 if (outfile == 0) {
michael@0 1328 UnicodeString str1(outfilestr, "");
michael@0 1329 UnicodeString str2(strerror(errno), "");
michael@0 1330 initMsg(pname);
michael@0 1331 u_wmsg(stderr, "cantCreateOutputF",
michael@0 1332 str1.getBuffer(), str2.getBuffer());
michael@0 1333 return 1;
michael@0 1334 }
michael@0 1335 } else {
michael@0 1336 outfilestr = "-";
michael@0 1337 outfile = stdout;
michael@0 1338 #ifdef USE_FILENO_BINARY_MODE
michael@0 1339 if (setmode(fileno(outfile), O_BINARY) == -1) {
michael@0 1340 u_wmsg(stderr, "cantSetOutBinMode");
michael@0 1341 exit(-1);
michael@0 1342 }
michael@0 1343 #endif
michael@0 1344 }
michael@0 1345
michael@0 1346 /* Loop again on the arguments to find all the input files, and
michael@0 1347 convert them. */
michael@0 1348
michael@0 1349 cf.setBufferSize(bufsz);
michael@0 1350
michael@0 1351 if(remainArgv < remainArgvLimit) {
michael@0 1352 for (iter = remainArgv; iter != remainArgvLimit; iter++) {
michael@0 1353 if (!cf.convertFile(
michael@0 1354 pname, fromcpage, toucallback, touctxt, tocpage,
michael@0 1355 fromucallback, fromuctxt, fallback, translit, *iter,
michael@0 1356 outfile, verbose)
michael@0 1357 ) {
michael@0 1358 goto error_exit;
michael@0 1359 }
michael@0 1360 }
michael@0 1361 } else {
michael@0 1362 if (!cf.convertFile(
michael@0 1363 pname, fromcpage, toucallback, touctxt, tocpage,
michael@0 1364 fromucallback, fromuctxt, fallback, translit, 0,
michael@0 1365 outfile, verbose)
michael@0 1366 ) {
michael@0 1367 goto error_exit;
michael@0 1368 }
michael@0 1369 }
michael@0 1370
michael@0 1371 goto normal_exit;
michael@0 1372 error_exit:
michael@0 1373 #if !UCONFIG_NO_LEGACY_CONVERSION
michael@0 1374 ret = 1;
michael@0 1375 #else
michael@0 1376 fprintf(stderr, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n");
michael@0 1377 #endif
michael@0 1378 normal_exit:
michael@0 1379
michael@0 1380 if (outfile != stdout) {
michael@0 1381 fclose(outfile);
michael@0 1382 }
michael@0 1383
michael@0 1384 u_cleanup();
michael@0 1385
michael@0 1386 return ret;
michael@0 1387 }
michael@0 1388
michael@0 1389
michael@0 1390 /*
michael@0 1391 * Hey, Emacs, please set the following:
michael@0 1392 *
michael@0 1393 * Local Variables:
michael@0 1394 * indent-tabs-mode: nil
michael@0 1395 * End:
michael@0 1396 *
michael@0 1397 */

mercurial