intl/icu/source/extra/uconv/uconv.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/extra/uconv/uconv.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1397 @@
     1.4 +/*****************************************************************************
     1.5 +*
     1.6 +*   Copyright (C) 1999-2013, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +*
     1.9 +******************************************************************************/
    1.10 +
    1.11 +/*
    1.12 + * uconv(1): an iconv(1)-like converter using ICU.
    1.13 + *
    1.14 + * Original code by Jonas Utterstr&#x00F6;m <jonas.utterstrom@vittran.norrnod.se>
    1.15 + * contributed in 1999.
    1.16 + *
    1.17 + * Conversion to the C conversion API and many improvements by
    1.18 + * Yves Arrouye <yves@realnames.com>, current maintainer.
    1.19 + *
    1.20 + * Markus Scherer maintainer from 2003.
    1.21 + * See source code repository history for changes.
    1.22 + */
    1.23 +
    1.24 +#include <unicode/utypes.h>
    1.25 +#include <unicode/putil.h>
    1.26 +#include <unicode/ucnv.h>
    1.27 +#include <unicode/uenum.h>
    1.28 +#include <unicode/unistr.h>
    1.29 +#include <unicode/translit.h>
    1.30 +#include <unicode/uset.h>
    1.31 +#include <unicode/uclean.h>
    1.32 +#include <unicode/utf16.h>
    1.33 +
    1.34 +#include <stdio.h>
    1.35 +#include <errno.h>
    1.36 +#include <string.h>
    1.37 +#include <stdlib.h>
    1.38 +
    1.39 +#include "cmemory.h"
    1.40 +#include "cstring.h"
    1.41 +#include "ustrfmt.h"
    1.42 +
    1.43 +#include "unicode/uwmsg.h"
    1.44 +
    1.45 +U_NAMESPACE_USE
    1.46 +
    1.47 +#if U_PLATFORM_USES_ONLY_WIN32_API && !defined(__STRICT_ANSI__)
    1.48 +#include <io.h>
    1.49 +#include <fcntl.h>
    1.50 +#if U_PLATFORM_USES_ONLY_WIN32_API
    1.51 +#define USE_FILENO_BINARY_MODE 1
    1.52 +/* Windows likes to rename Unix-like functions */
    1.53 +#ifndef fileno
    1.54 +#define fileno _fileno
    1.55 +#endif
    1.56 +#ifndef setmode
    1.57 +#define setmode _setmode
    1.58 +#endif
    1.59 +#ifndef O_BINARY
    1.60 +#define O_BINARY _O_BINARY
    1.61 +#endif
    1.62 +#endif
    1.63 +#endif
    1.64 +
    1.65 +#ifdef UCONVMSG_LINK
    1.66 +/* below from the README */
    1.67 +#include "unicode/utypes.h"
    1.68 +#include "unicode/udata.h"
    1.69 +U_CFUNC char uconvmsg_dat[];
    1.70 +#endif
    1.71 +
    1.72 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) /* element count of a real array, as int32_t */
    1.73 +
    1.74 +#define DEFAULT_BUFSZ   4096 /* presumably the default conversion buffer size; its use is outside this excerpt */
    1.75 +#define UCONVMSG "uconvmsg" /* bundle name passed to udata_setAppData() and u_wmsg_setPath() */
    1.76 +
    1.77 +static UResourceBundle *gBundle = 0;    /* Bundle containing messages; set by initMsg(). */
    1.78 +
    1.79 +/*
    1.80 + * Initialize the message bundle so that message strings can be fetched
    1.81 + * by u_wmsg(). Only the first call does any work; problems are reported
    1.82 + * as warnings on stderr, prefixed with the program name pname.
    1.83 + */
    1.84 +
    1.85 +static void initMsg(const char *pname) {
    1.86 +    static int ps = 0;          /* set once initialization has been attempted */
    1.87 +
    1.88 +    if (!ps) {
    1.89 +        char dataPath[2048];        /* XXX Sloppy: should be PATH_MAX. */
    1.90 +        UErrorCode err = U_ZERO_ERROR;
    1.91 +
    1.92 +        ps = 1;
    1.93 +
    1.94 +        /* Set up our static data - if any */
    1.95 +#if defined(UCONVMSG_LINK) && U_PLATFORM != U_PF_OS390 /* On z/OS, this is failing. */
    1.96 +        udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err);
    1.97 +        if (U_FAILURE(err)) {
    1.98 +          fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
    1.99 +                  pname, u_errorName(err));
   1.100 +          err = U_ZERO_ERROR; /* It may still fail */
   1.101 +        }
   1.102 +#endif
   1.103 +
   1.104 +        /* Get messages. */
   1.105 +        gBundle = u_wmsg_setPath(UCONVMSG, &err);
   1.106 +        if (U_FAILURE(err)) {
   1.107 +            fprintf(stderr,
   1.108 +                    "%s: warning: couldn't open bundle %s: %s\n",
   1.109 +                    pname, UCONVMSG, u_errorName(err));
   1.110 +#ifdef UCONVMSG_LINK
   1.111 +            fprintf(stderr,
   1.112 +                    "%s: setAppData was called, internal data %s failed to load\n",
   1.113 +                        pname, UCONVMSG);
   1.114 +#endif
   1.115 +            err = U_ZERO_ERROR;
   1.116 +            /* that was try #1, try again with a path; the directory copy is
   1.117 +               bounded so that the separator and bundle name always fit */
   1.118 +            dataPath[0] = 0;
   1.119 +            strncat(dataPath, u_getDataDirectory(), sizeof(dataPath) - 1 - strlen(U_FILE_SEP_STRING) - strlen(UCONVMSG));
   1.120 +            uprv_strcat(dataPath, U_FILE_SEP_STRING);
   1.121 +            uprv_strcat(dataPath, UCONVMSG);
   1.122 +            gBundle = u_wmsg_setPath(dataPath, &err);
   1.123 +            if (U_FAILURE(err)) {
   1.124 +                fprintf(stderr,
   1.125 +                    "%s: warning: still couldn't open bundle %s: %s\n",
   1.126 +                    pname, dataPath, u_errorName(err));
   1.127 +                fprintf(stderr, "%s: warning: messages will not be displayed\n", pname);
   1.128 +            }
   1.129 +        }
   1.130 +    }
   1.131 +}
   1.132 +
   1.133 +/* Table mapping callback names to the converter callbacks and context
   1.134 +   values for both conversion directions; searched by findCallback(). */
   1.135 +
   1.136 +static struct callback_ent {
   1.137 +    const char *name;                   /* matched case-insensitively by findCallback() */
   1.138 +    UConverterFromUCallback fromu;      /* callback for the from-Unicode direction */
   1.139 +    const void *fromuctxt;              /* context stored with fromu: 0 or a UCNV_ESCAPE_* value */
   1.140 +    UConverterToUCallback tou;          /* callback for the to-Unicode direction */
   1.141 +    const void *touctxt;                /* context stored with tou: 0 or a UCNV_ESCAPE_* value */
   1.142 +} transcode_callbacks[] = {
   1.143 +    { "substitute",
   1.144 +      UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0,
   1.145 +      UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 },
   1.146 +    { "skip",
   1.147 +      UCNV_FROM_U_CALLBACK_SKIP, 0,
   1.148 +      UCNV_TO_U_CALLBACK_SKIP, 0 },
   1.149 +    { "stop",
   1.150 +      UCNV_FROM_U_CALLBACK_STOP, 0,
   1.151 +      UCNV_TO_U_CALLBACK_STOP, 0 },
   1.152 +    { "escape",
   1.153 +      UCNV_FROM_U_CALLBACK_ESCAPE, 0,
   1.154 +      UCNV_TO_U_CALLBACK_ESCAPE, 0},
   1.155 +    { "escape-icu",
   1.156 +      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU,
   1.157 +      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU },
   1.158 +    { "escape-java",
   1.159 +      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA,
   1.160 +      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA },
   1.161 +    { "escape-c",
   1.162 +      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C,
   1.163 +      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C },
   1.164 +    { "escape-xml",                     /* same settings as "escape-xml-hex" */
   1.165 +      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
   1.166 +      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
   1.167 +    { "escape-xml-hex",
   1.168 +      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
   1.169 +      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
   1.170 +    { "escape-xml-dec",
   1.171 +      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC,
   1.172 +      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC },
   1.173 +    { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE,
   1.174 +      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE }
   1.175 +};
   1.176 +
   1.177 +/* Look up a callback record by name, ignoring case; returns 0 if unknown. */
   1.178 +
   1.179 +static const struct callback_ent *findCallback(const char *name) {
   1.180 +    const struct callback_ent *entry = transcode_callbacks;
   1.181 +    const struct callback_ent *const end = transcode_callbacks +
   1.182 +        sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
   1.183 +
   1.184 +    /* The table is small, so a plain front-to-back scan is good enough;
   1.185 +       the first matching entry wins. */
   1.186 +    while (entry != end) {
   1.187 +        if (uprv_stricmp(name, entry->name) == 0) {
   1.188 +            return entry;
   1.189 +        }
   1.190 +        ++entry;
   1.191 +    }
   1.192 +    return 0;
   1.193 +}
   1.194 +
   1.195 +/* Print converter information. If lookfor is set, only that converter is
   1.196 +   printed, otherwise all available converters are. If canon is nonzero, the
   1.197 +   standards and the tags of each alias are printed too, one alias per line,
   1.198 +   in the format of convrtrs.txt. Returns 0 on success, -1 on failure. */
   1.199 +
   1.200 +static int printConverters(const char *pname, const char *lookfor,
   1.201 +    UBool canon)
   1.202 +{
   1.203 +    UErrorCode err = U_ZERO_ERROR;
   1.204 +    int32_t num;
   1.205 +    uint16_t num_stds;
   1.206 +    const char **stds;
   1.207 +
   1.208 +    /* If there is a specified name, just handle that now. */
   1.209 +
   1.210 +    if (lookfor) {
   1.211 +        if (!canon) {
   1.212 +            printf("%s\n", lookfor);
   1.213 +            return 0;
   1.214 +        } else {
   1.215 +        /*  Because we are printing a canonical name, we need the
   1.216 +            true converter name. We've done that already except for
   1.217 +            the default name (because we want to print the exact
   1.218 +            name one would get when calling ucnv_getDefaultName()
   1.219 +            in non-canon mode). But since we do not know at this
   1.220 +            point if we have the default name or something else, we
   1.221 +            need to normalize again to the canonical converter
   1.222 +            name. */
   1.223 +
   1.224 +            const char *truename = ucnv_getAlias(lookfor, 0, &err);
   1.225 +            if (U_SUCCESS(err)) {
   1.226 +                lookfor = truename;
   1.227 +            } else {
   1.228 +                err = U_ZERO_ERROR;
   1.229 +            }
   1.230 +        }
   1.231 +    }
   1.232 +
   1.233 +    /* Print converter names. We come here for one of two reasons: we
   1.234 +       are printing all the names (lookfor was null), or we have a
   1.235 +       single converter to print but in canon mode, hence we need to
   1.236 +       get to it in order to print everything. */
   1.237 +
   1.238 +    num = ucnv_countAvailable();
   1.239 +    if (num <= 0) {
   1.240 +        initMsg(pname);
   1.241 +        u_wmsg(stderr, "cantGetNames");
   1.242 +        return -1;
   1.243 +    }
   1.244 +    if (lookfor) {
   1.245 +        num = 1;                /* We know where we want to be. */
   1.246 +    }
   1.247 +
   1.248 +    num_stds = ucnv_countStandards();
   1.249 +    stds = (const char **) uprv_malloc(num_stds * sizeof(*stds));
   1.250 +    if (!stds) {
   1.251 +        initMsg(pname);         /* make sure the message bundle is loaded */
   1.252 +        u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR));
   1.253 +        return -1;
   1.254 +    } else {
   1.255 +        uint16_t s;
   1.256 +        if (canon) {
   1.257 +            printf("{ ");
   1.258 +        }
   1.259 +        for (s = 0; s < num_stds; ++s) {
   1.260 +            stds[s] = ucnv_getStandard(s, &err);
   1.261 +            if (U_FAILURE(err)) {   /* checked first: do not print stds[s] after a failure */
   1.262 +                initMsg(pname);
   1.263 +                u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err));
   1.264 +                goto error_cleanup;
   1.265 +            }
   1.266 +            if (canon) {
   1.267 +                printf("%s ", stds[s]);
   1.268 +            }
   1.269 +        }
   1.270 +        if (canon) {
   1.271 +            puts("}");
   1.272 +        }
   1.273 +    }
   1.274 +
   1.275 +    for (int32_t i = 0; i < num; i++) {
   1.276 +        const char *name;
   1.277 +        uint16_t num_aliases;
   1.278 +
   1.279 +        /* Use the requested name, or else the i-th available converter. */
   1.280 +
   1.281 +        if (lookfor) {
   1.282 +            name = lookfor;
   1.283 +        } else {
   1.284 +            name = ucnv_getAvailableName(i);
   1.285 +        }
   1.286 +
   1.287 +        /* Get all the aliases associated to the name. */
   1.288 +
   1.289 +        err = U_ZERO_ERROR;
   1.290 +        num_aliases = ucnv_countAliases(name, &err);
   1.291 +        if (U_FAILURE(err)) {
   1.292 +            printf("%s", name);
   1.293 +            UnicodeString str(name, "");
   1.294 +            putchar('\t');
   1.295 +            initMsg(pname);
   1.296 +            u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
   1.297 +                u_wmsg_errorName(err));
   1.298 +            goto error_cleanup;
   1.299 +        } else {
   1.300 +            uint16_t a, s, t;
   1.301 +
   1.302 +            /* Write all the aliases and their tags. */
   1.303 +
   1.304 +            for (a = 0; a < num_aliases; ++a) {
   1.305 +                const char *alias = ucnv_getAlias(name, a, &err);
   1.306 +                if (U_FAILURE(err)) {
   1.307 +                    UnicodeString str(name, "");
   1.308 +                    putchar('\t');
   1.309 +                    initMsg(pname);
   1.310 +                    u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
   1.311 +                        u_wmsg_errorName(err));
   1.312 +                    goto error_cleanup;
   1.313 +                }
   1.314 +
   1.315 +                /* Print the current alias so that it looks right. */
   1.316 +                printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") ,
   1.317 +                                 alias,
   1.318 +                                 (canon ? "" : " "));
   1.319 +
   1.320 +                /* Look (slowly, linear searching) for a tag. */
   1.321 +                if (canon) {
   1.322 +                    /* -1 to skip the last standard */
   1.323 +                    for (s = t = 0; s < num_stds-1; ++s) {
   1.324 +                        UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err);
   1.325 +                        if (U_SUCCESS(err)) {
   1.326 +                            /* List the standard tags */
   1.327 +                            const char *standardName;
   1.328 +                            UBool isFirst = TRUE;
   1.329 +                            UErrorCode enumError = U_ZERO_ERROR;
   1.330 +                            while ((standardName = uenum_next(nameEnum, NULL, &enumError))) {
   1.331 +                                /* See if this alias is supported by this standard. */
   1.332 +                                if (!strcmp(standardName, alias)) {
   1.333 +                                    if (!t) {
   1.334 +                                        printf(" {");
   1.335 +                                        t = 1;
   1.336 +                                    }
   1.337 +                                    /* Print a * after the default standard name */
   1.338 +                                    printf(" %s%s", stds[s], (isFirst ? "*" : ""));
   1.339 +                                }
   1.340 +                                isFirst = FALSE;
   1.341 +                            }
   1.342 +                            uenum_close(nameEnum);  /* opened above; close it so it does not leak */
   1.343 +                        }
   1.344 +                    }
   1.345 +                    if (t) {
   1.346 +                        printf(" }");
   1.347 +                    }
   1.348 +                }
   1.349 +                /* Terminate this entry. */
   1.350 +                if (canon) {
   1.351 +                    puts("");
   1.352 +                }
   1.353 +
   1.354 +                /* Move on. */
   1.355 +            }
   1.356 +            /* Terminate this entry. */
   1.357 +            if (!canon) {
   1.358 +                puts("");
   1.359 +            }
   1.360 +        }
   1.361 +    }
   1.362 +
   1.363 +    /* Free temporary data. */
   1.364 +
   1.365 +    uprv_free(stds);
   1.366 +
   1.367 +    /* Success. */
   1.368 +
   1.369 +    return 0;
   1.370 +error_cleanup:
   1.371 +    uprv_free(stds);
   1.372 +    return -1;
   1.373 +}
   1.374 +
   1.375 +/* Print all available transliterator IDs, separated by spaces, or one per
   1.376 +   line if canon is nonzero. Returns 0 on success, nonzero otherwise. */
   1.377 +
   1.378 +static int printTransliterators(UBool canon)
   1.379 +{
   1.380 +#if UCONFIG_NO_TRANSLITERATION
   1.381 +    printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
   1.382 +    return 1;
   1.383 +#else
   1.384 +    UErrorCode status = U_ZERO_ERROR;
   1.385 +    UEnumeration *ids = utrans_openIDs(&status);
   1.386 +    int32_t i, numtrans = uenum_count(ids, &status);
   1.387 +
   1.388 +    char sepchar = canon ? '\n' : ' ';
   1.389 +
   1.390 +    for (i = 0; U_SUCCESS(status)&& (i < numtrans); ++i) {
   1.391 +        const char *nextTrans = uenum_next(ids, NULL, &status);
   1.392 +        if (U_FAILURE(status) || nextTrans == NULL) {
   1.393 +            break;      /* never pass a NULL string to printf("%s") */
   1.394 +        }
   1.395 +        printf("%s", nextTrans);
   1.396 +        if (i < numtrans - 1) {
   1.397 +            putchar(sepchar);
   1.398 +        }
   1.399 +    }
   1.400 +
   1.401 +    uenum_close(ids);
   1.402 +
   1.403 +    /* Add a terminating newline if needed. */
   1.404 +    if (sepchar != '\n') {
   1.405 +        putchar('\n');
   1.406 +    }
   1.407 +
   1.408 +    /* Report enumeration failures instead of claiming success. */
   1.409 +
   1.410 +    return U_FAILURE(status) ? 1 : 0;
   1.411 +#endif
   1.412 +}
   1.413 +
   1.414 +enum {
   1.415 +    uSP = 0x20,         // U+0020 space
   1.416 +    uCR = 0xd,          // U+000D carriage return
   1.417 +    uLF = 0xa,          // U+000A line feed
   1.418 +    uNL = 0x85,         // U+0085 next line (NEL)
   1.419 +    uLS = 0x2028,       // U+2028 line separator
   1.420 +    uPS = 0x2029,       // U+2029 paragraph separator
   1.421 +    uSig = 0xfeff       // U+FEFF signature/BOM character
   1.422 +};
   1.423 +
   1.424 +static inline int32_t
   1.425 +getChunkLimit(const UnicodeString &prev, const UnicodeString &s) {
   1.426 +    // Returns the length of the prefix of s that completes the current chunk,
   1.427 +    // or -1 if no paragraph end was found yet (keep collecting).
   1.428 +    // A chunk ends with one of CR, LF, CRLF, NL, LS, PS (see UAX #13/Unicode 4),
   1.429 +    // and the terminator is included in the chunk;
   1.430 +    // all of these characters are on the BMP.
   1.431 +    // FF and VT are not chunk ends in case they are part of a paragraph
   1.432 +    // (important for bidi contexts)
   1.433 +    static const UChar paraEnds[] = {
   1.434 +        0xd, 0xa, 0x85, 0x2028, 0x2029
   1.435 +    };
   1.436 +    enum {
   1.437 +        iCR, iLF, iNL, iLS, iPS, iCount
   1.438 +    };
   1.439 +
   1.440 +    // first, see if there is a CRLF split between prev and s
   1.441 +    if (prev.endsWith(paraEnds + iCR, 1)) {
   1.442 +        if (s.startsWith(paraEnds + iLF, 1)) {
   1.443 +            return 1; // split CRLF, include the LF
   1.444 +        } else if (!s.isEmpty()) {
   1.445 +            return 0; // complete the last chunk
   1.446 +        } else {
   1.447 +            return -1; // wait for actual further contents to arrive
   1.448 +        }
   1.449 +    }
   1.450 +
   1.451 +    const UChar *u = s.getBuffer(), *limit = u + s.length();
   1.452 +    UChar c;
   1.453 +
   1.454 +    while (u < limit) {
   1.455 +        c = *u++;
   1.456 +        if (
   1.457 +            ((c < uSP) && (c == uCR || c == uLF)) ||
   1.458 +            (c == uNL) ||
   1.459 +            (c == uLS || c == uPS)  // exactly LS or PS; a bit mask would also match e.g. U+20AC
   1.460 +        ) {
   1.461 +            if (c == uCR) {
   1.462 +                // check for CRLF
   1.463 +                if (u == limit) {
   1.464 +                    return -1; // LF may be in the next chunk
   1.465 +                } else if (*u == uLF) {
   1.466 +                    ++u; // include the LF in this chunk
   1.467 +                }
   1.468 +            }
   1.469 +            return (int32_t)(u - s.getBuffer());
   1.470 +        }
   1.471 +    }
   1.472 +
   1.473 +    return -1; // continue collecting the chunk
   1.474 +}
   1.475 +
   1.476 +enum {
   1.477 +    CNV_NO_FEFF,    // cnvSigType(): U+FEFF (BOM) cannot be converted, or the check failed
   1.478 +    CNV_WITH_FEFF,  // cnvSigType(): can convert the U+FEFF signature character
   1.479 +    CNV_ADDS_FEFF   // cnvSigType(): output for a plain "a" already carries a Unicode signature
   1.480 +};
   1.481 +
   1.482 +static inline UChar
   1.483 +nibbleToHex(uint8_t n) {
   1.484 +    // Lowercase hex digit for the low four bits of n:
   1.485 +    // 0..9 map to 0x30.. ('0'..'9'), 10..15 map to 0x61.. ('a'..'f').
   1.486 +    const uint8_t digit = (uint8_t)(n & 0xf);
   1.487 +    if (digit < 10) { return (UChar)(0x30 + digit); }
   1.488 +    return (UChar)(0x61 + (digit - 10));
   1.489 +}
   1.490 +
   1.491 +// Classify how a converter treats the U+FEFF Unicode signature (BOM);
   1.492 +// returns one of CNV_NO_FEFF, CNV_WITH_FEFF, CNV_ADDS_FEFF.
   1.493 +// The fromUnicode side must be in its initial state; it is reset after use.
   1.494 +static int32_t
   1.495 +cnvSigType(UConverter *cnv) {
   1.496 +    // Step 1: can the output charset convert U+FEFF at all?
   1.497 +    UErrorCode status = U_ZERO_ERROR;
   1.498 +    USet *roundtrip = uset_open(1, 0);
   1.499 +    ucnv_getUnicodeSet(cnv, roundtrip, UCNV_ROUNDTRIP_SET, &status);
   1.500 +    const UBool convertsSig =
   1.501 +        (UBool)(U_SUCCESS(status) && uset_contains(roundtrip, uSig));
   1.502 +    uset_close(roundtrip);
   1.503 +
   1.504 +    if (!convertsSig) {
   1.505 +        // an error occurred or U+FEFF cannot be converted
   1.506 +        return CNV_NO_FEFF;
   1.507 +    }
   1.508 +
   1.509 +    // Step 2: convert a lone "a" and check whether the converter put a
   1.510 +    // signature in front of it without being asked to.
   1.511 +    const UChar letterA[1] = { 0x61 };
   1.512 +    const UChar *src = letterA;
   1.513 +
   1.514 +    char bytes[20];
   1.515 +    char *dst = bytes;
   1.516 +
   1.517 +    status = U_ZERO_ERROR;
   1.518 +    ucnv_fromUnicode(cnv,
   1.519 +        &dst, bytes + sizeof(bytes),
   1.520 +        &src, letterA + 1,
   1.521 +        NULL, TRUE, &status);
   1.522 +
   1.523 +    // Put the fromUnicode side back into its initial state.
   1.524 +    ucnv_resetFromUnicode(cnv);
   1.525 +
   1.526 +    // status is carried over from the conversion: if that failed,
   1.527 +    // the combined test below cannot succeed.
   1.528 +    const char *sigName =
   1.529 +        ucnv_detectUnicodeSignature(bytes, (int32_t)(dst - bytes), NULL, &status);
   1.530 +    if (sigName != NULL && U_SUCCESS(status)) {
   1.531 +        return CNV_ADDS_FEFF;
   1.532 +    }
   1.533 +
   1.534 +    return CNV_WITH_FEFF;
   1.535 +}
   1.536 +
   1.537 +class ConvertFile {     // owns the byte buffer and offsets array used by convertFile()
   1.538 +public:
   1.539 +    ConvertFile() :
   1.540 +        buf(NULL), outbuf(NULL), fromoffsets(NULL),
   1.541 +        bufsz(0), signature(0) {}
   1.542 +
   1.543 +    void setBufferSize(size_t bufferSize) {
   1.544 +        delete [] buf;          // release buffers from an earlier call, if any,
   1.545 +        delete [] fromoffsets;  // so that calling this again does not leak
   1.546 +        buf = outbuf = NULL;
   1.547 +        fromoffsets = NULL;
   1.548 +        bufsz = bufferSize;
   1.549 +        buf = new char[2 * bufsz];
   1.550 +        outbuf = buf + bufsz;   // second half of the same allocation
   1.551 +        fromoffsets = new int32_t[bufsz + 1]; // +1 for an added U+FEFF in the intermediate Unicode buffer
   1.552 +    }
   1.553 +
   1.554 +    ~ConvertFile() {
   1.555 +        delete [] buf;          // outbuf points into buf and is not freed separately
   1.556 +        delete [] fromoffsets;
   1.557 +    }
   1.558 +
   1.559 +    UBool convertFile(const char *pname,
   1.560 +                      const char *fromcpage,
   1.561 +                      UConverterToUCallback toucallback,
   1.562 +                      const void *touctxt,
   1.563 +                      const char *tocpage,
   1.564 +                      UConverterFromUCallback fromucallback,
   1.565 +                      const void *fromuctxt,
   1.566 +                      UBool fallback,
   1.567 +                      const char *translit,
   1.568 +                      const char *infilestr,
   1.569 +                      FILE * outfile, int verbose);
   1.570 +private:
   1.571 +    friend int main(int argc, char **argv);     // grants main() access to the private members
   1.572 +
   1.573 +    char *buf, *outbuf;         // one allocation of 2 * bufsz chars; outbuf is its second half
   1.574 +    int32_t *fromoffsets;       // bufsz + 1 conversion offsets
   1.575 +
   1.576 +    size_t bufsz;               // size set by setBufferSize()
   1.577 +    int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character
   1.578 +};
   1.579 +
   1.580 +// Convert a file from one encoding to another
   1.581 +UBool
   1.582 +ConvertFile::convertFile(const char *pname,
   1.583 +                         const char *fromcpage,
   1.584 +                         UConverterToUCallback toucallback,
   1.585 +                         const void *touctxt,
   1.586 +                         const char *tocpage,
   1.587 +                         UConverterFromUCallback fromucallback,
   1.588 +                         const void *fromuctxt,
   1.589 +                         UBool fallback,
   1.590 +                         const char *translit,
   1.591 +                         const char *infilestr,
   1.592 +                         FILE * outfile, int verbose)
   1.593 +{
   1.594 +    FILE *infile;
   1.595 +    UBool ret = TRUE;
   1.596 +    UConverter *convfrom = 0;
   1.597 +    UConverter *convto = 0;
   1.598 +    UErrorCode err = U_ZERO_ERROR;
   1.599 +    UBool flush;
   1.600 +    UBool closeFile = FALSE;
   1.601 +    const char *cbufp, *prevbufp;
   1.602 +    char *bufp;
   1.603 +
   1.604 +    uint32_t infoffset = 0, outfoffset = 0;   /* Where we are in the file, for error reporting. */
   1.605 +
   1.606 +    const UChar *unibuf, *unibufbp;
   1.607 +    UChar *unibufp;
   1.608 +
   1.609 +    size_t rd, wr;
   1.610 +
   1.611 +#if !UCONFIG_NO_TRANSLITERATION
   1.612 +    Transliterator *t = 0;      // Transliterator acting on Unicode data.
   1.613 +    UnicodeString chunk;        // One chunk of the text being collected for transformation.
   1.614 +#endif
   1.615 +    UnicodeString u;            // String to do the transliteration.
   1.616 +    int32_t ulen;
   1.617 +
   1.618 +    // use conversion offsets for error messages
   1.619 +    // unless a transliterator is used -
   1.620 +    // a text transformation will reorder characters in unpredictable ways
   1.621 +    UBool useOffsets = TRUE;
   1.622 +
   1.623 +    // Open the correct input file or connect to stdin for reading input
   1.624 +
   1.625 +    if (infilestr != 0 && strcmp(infilestr, "-")) {
   1.626 +        infile = fopen(infilestr, "rb");
   1.627 +        if (infile == 0) {
   1.628 +            UnicodeString str1(infilestr, "");
   1.629 +            str1.append((UChar32) 0);
   1.630 +            UnicodeString str2(strerror(errno), "");
   1.631 +            str2.append((UChar32) 0);
   1.632 +            initMsg(pname);
   1.633 +            u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer());
   1.634 +            return FALSE;
   1.635 +        }
   1.636 +        closeFile = TRUE;
   1.637 +    } else {
   1.638 +        infilestr = "-";
   1.639 +        infile = stdin;
   1.640 +#ifdef USE_FILENO_BINARY_MODE
   1.641 +        if (setmode(fileno(stdin), O_BINARY) == -1) {
   1.642 +            initMsg(pname);
   1.643 +            u_wmsg(stderr, "cantSetInBinMode");
   1.644 +            return FALSE;
   1.645 +        }
   1.646 +#endif
   1.647 +    }
   1.648 +
   1.649 +    if (verbose) {
   1.650 +        fprintf(stderr, "%s:\n", infilestr);
   1.651 +    }
   1.652 +
   1.653 +#if !UCONFIG_NO_TRANSLITERATION
   1.654 +    // Create transliterator as needed.
   1.655 +
   1.656 +    if (translit != NULL && *translit) {
   1.657 +        UParseError parse;
   1.658 +        UnicodeString str(translit), pestr;
   1.659 +
   1.660 +        /* Create from rules or by ID as needed. */
   1.661 +
   1.662 +        parse.line = -1;
   1.663 +
   1.664 +        if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
   1.665 +            t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err);
   1.666 +        } else {
   1.667 +            t = Transliterator::createInstance(translit, UTRANS_FORWARD, err);
   1.668 +        }
   1.669 +
   1.670 +        if (U_FAILURE(err)) {
   1.671 +            str.append((UChar32) 0);
   1.672 +            initMsg(pname);
   1.673 +
   1.674 +            if (parse.line >= 0) {
   1.675 +                UChar linebuf[20], offsetbuf[20];
   1.676 +                uprv_itou(linebuf, 20, parse.line, 10, 0);
   1.677 +                uprv_itou(offsetbuf, 20, parse.offset, 10, 0);
   1.678 +                u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(),
   1.679 +                    u_wmsg_errorName(err), linebuf, offsetbuf);
   1.680 +            } else {
   1.681 +                u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(),
   1.682 +                    u_wmsg_errorName(err));
   1.683 +            }
   1.684 +
   1.685 +            if (t) {
   1.686 +                delete t;
   1.687 +                t = 0;
   1.688 +            }
   1.689 +            goto error_exit;
   1.690 +        }
   1.691 +
   1.692 +        useOffsets = FALSE;
   1.693 +    }
   1.694 +#endif
   1.695 +
   1.696 +    // Create codepage converter. If the codepage or its aliases weren't
   1.697 +    // available, it returns NULL and a failure code. We also set the
   1.698 +    // callbacks, and return errors in the same way.
   1.699 +
   1.700 +    convfrom = ucnv_open(fromcpage, &err);
   1.701 +    if (U_FAILURE(err)) {
   1.702 +        UnicodeString str(fromcpage, "");
   1.703 +        initMsg(pname);
   1.704 +        u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(),
   1.705 +            u_wmsg_errorName(err));
   1.706 +        goto error_exit;
   1.707 +    }
   1.708 +    ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err);
   1.709 +    if (U_FAILURE(err)) {
   1.710 +        initMsg(pname);
   1.711 +        u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
   1.712 +        goto error_exit;
   1.713 +    }
   1.714 +
   1.715 +    convto = ucnv_open(tocpage, &err);
   1.716 +    if (U_FAILURE(err)) {
   1.717 +        UnicodeString str(tocpage, "");
   1.718 +        initMsg(pname);
   1.719 +        u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(),
   1.720 +            u_wmsg_errorName(err));
   1.721 +        goto error_exit;
   1.722 +    }
   1.723 +    ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err);
   1.724 +    if (U_FAILURE(err)) {
   1.725 +        initMsg(pname);
   1.726 +        u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
   1.727 +        goto error_exit;
   1.728 +    }
   1.729 +    ucnv_setFallback(convto, fallback);
   1.730 +
   1.731 +    UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode;
   1.732 +    int8_t sig;
   1.733 +
   1.734 +    // OK, we can convert now.
   1.735 +    sig = signature;
   1.736 +    rd = 0;
   1.737 +
   1.738 +    do {
   1.739 +        willexit = FALSE;
   1.740 +
   1.741 +        // input file offset at the beginning of the next buffer
   1.742 +        infoffset += rd;
   1.743 +
   1.744 +        rd = fread(buf, 1, bufsz, infile);
   1.745 +        if (ferror(infile) != 0) {
   1.746 +            UnicodeString str(strerror(errno));
   1.747 +            initMsg(pname);
   1.748 +            u_wmsg(stderr, "cantRead", str.getTerminatedBuffer());
   1.749 +            goto error_exit;
   1.750 +        }
   1.751 +
   1.752 +        // Convert the read buffer into the new encoding via Unicode.
   1.753 +        // After the call 'unibufp' will be placed behind the last
   1.754 +        // character that was converted in the 'unibuf'.
   1.755 +        // Also the 'cbufp' is positioned behind the last converted
   1.756 +        // character.
   1.757 +        // At the last conversion in the file, flush should be set to
   1.758 +        // true so that we get all characters converted.
   1.759 +        //
   1.760 +        // The converter must be flushed at the end of conversion so
   1.761 +        // that characters on hold also will be written.
   1.762 +
   1.763 +        cbufp = buf;
   1.764 +        flush = (UBool)(rd != bufsz);
   1.765 +
   1.766 +        // convert until the input is consumed
   1.767 +        do {
   1.768 +            // remember the start of the current byte-to-Unicode conversion
   1.769 +            prevbufp = cbufp;
   1.770 +
   1.771 +            unibuf = unibufp = u.getBuffer((int32_t)bufsz);
   1.772 +
   1.773 +            // Use bufsz instead of u.getCapacity() for the targetLimit
   1.774 +            // so that we don't overflow fromoffsets[].
   1.775 +            ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp,
   1.776 +                buf + rd, useOffsets ? fromoffsets : NULL, flush, &err);
   1.777 +
   1.778 +            ulen = (int32_t)(unibufp - unibuf);
   1.779 +            u.releaseBuffer(U_SUCCESS(err) ? ulen : 0);
   1.780 +
   1.781 +            // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
   1.782 +            // converting all of the input bytes.
   1.783 +            // It works like this because ucnv_toUnicode() returns only under the
   1.784 +            // following conditions:
   1.785 +            // - an error occurred during conversion (an error code is set)
   1.786 +            // - the target buffer is filled (the error code indicates an overflow)
   1.787 +            // - the source is consumed
   1.788 +            // That is, if the error code does not indicate a failure,
   1.789 +            // not even an overflow, then the source must be consumed entirely.
   1.790 +            fromSawEndOfBytes = (UBool)U_SUCCESS(err);
   1.791 +
   1.792 +            if (err == U_BUFFER_OVERFLOW_ERROR) {
   1.793 +                err = U_ZERO_ERROR;
   1.794 +            } else if (U_FAILURE(err)) {
   1.795 +                char pos[32], errorBytes[32];
   1.796 +                int8_t i, length, errorLength;
   1.797 +
   1.798 +                UErrorCode localError = U_ZERO_ERROR;
   1.799 +                errorLength = (int8_t)sizeof(errorBytes);
   1.800 +                ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError);
   1.801 +                if (U_FAILURE(localError) || errorLength == 0) {
   1.802 +                    errorLength = 1;
   1.803 +                }
   1.804 +
   1.805 +                // print the input file offset of the start of the error bytes:
   1.806 +                // input file offset of the current byte buffer +
   1.807 +                // length of the just consumed bytes -
   1.808 +                // length of the error bytes
   1.809 +                length =
   1.810 +                    (int8_t)sprintf(pos, "%d",
   1.811 +                        (int)(infoffset + (cbufp - buf) - errorLength));
   1.812 +
   1.813 +                // output the bytes that caused the error
   1.814 +                UnicodeString str;
   1.815 +                for (i = 0; i < errorLength; ++i) {
   1.816 +                    if (i > 0) {
   1.817 +                        str.append((UChar)uSP);
   1.818 +                    }
   1.819 +                    str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4));
   1.820 +                    str.append(nibbleToHex((uint8_t)errorBytes[i]));
   1.821 +                }
   1.822 +
   1.823 +                initMsg(pname);
   1.824 +                u_wmsg(stderr, "problemCvtToU",
   1.825 +                        UnicodeString(pos, length, "").getTerminatedBuffer(),
   1.826 +                        str.getTerminatedBuffer(),
   1.827 +                        u_wmsg_errorName(err));
   1.828 +
   1.829 +                willexit = TRUE;
   1.830 +                err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
   1.831 +            }
   1.832 +
   1.833 +            // Replaced a check for whether the input was consumed by
   1.834 +            // looping until it is; message key "premEndInput" now obsolete.
   1.835 +
   1.836 +            if (ulen == 0) {
   1.837 +                continue;
   1.838 +            }
   1.839 +
   1.840 +            // remove a U+FEFF Unicode signature character if requested
   1.841 +            if (sig < 0) {
   1.842 +                if (u.charAt(0) == uSig) {
   1.843 +                    u.remove(0, 1);
   1.844 +
   1.845 +                    // account for the removed UChar and offset
   1.846 +                    --ulen;
   1.847 +
   1.848 +                    if (useOffsets) {
   1.849 +                        // remove an offset from fromoffsets[] as well
   1.850 +                        // to keep the array parallel with the UChars
   1.851 +                        memmove(fromoffsets, fromoffsets + 1, ulen * 4);
   1.852 +                    }
   1.853 +
   1.854 +                }
   1.855 +                sig = 0;
   1.856 +            }
   1.857 +
   1.858 +#if !UCONFIG_NO_TRANSLITERATION
   1.859 +            // Transliterate/transform if needed.
   1.860 +
   1.861 +            // For transformation, we use chunking code -
   1.862 +            // collect Unicode input until, for example, an end-of-line,
   1.863 +            // then transform and output-convert that and continue collecting.
   1.864 +            // This makes the transformation result independent of the buffer size
   1.865 +            // while avoiding the slower keyboard mode.
   1.866 +            // The end-of-chunk characters are completely included in the
   1.867 +            // transformed string in case they are to be transformed themselves.
   1.868 +            if (t != NULL) {
   1.869 +                UnicodeString out;
   1.870 +                int32_t chunkLimit;
   1.871 +
   1.872 +                do {
   1.873 +                    chunkLimit = getChunkLimit(chunk, u);
   1.874 +                    if (chunkLimit < 0 && flush && fromSawEndOfBytes) {
   1.875 +                        // use all of the rest at the end of the text
   1.876 +                        chunkLimit = u.length();
   1.877 +                    }
   1.878 +                    if (chunkLimit >= 0) {
   1.879 +                        // complete the chunk and transform it
   1.880 +                        chunk.append(u, 0, chunkLimit);
   1.881 +                        u.remove(0, chunkLimit);
   1.882 +                        t->transliterate(chunk);
   1.883 +
   1.884 +                        // append the transformation result to the result and empty the chunk
   1.885 +                        out.append(chunk);
   1.886 +                        chunk.remove();
   1.887 +                    } else {
   1.888 +                        // continue collecting the chunk
   1.889 +                        chunk.append(u);
   1.890 +                        break;
   1.891 +                    }
   1.892 +                } while (!u.isEmpty());
   1.893 +
   1.894 +                u = out;
   1.895 +                ulen = u.length();
   1.896 +            }
   1.897 +#endif
   1.898 +
   1.899 +            // add a U+FEFF Unicode signature character if requested
   1.900 +            // and possible/necessary
   1.901 +            if (sig > 0) {
   1.902 +                if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) {
   1.903 +                    u.insert(0, (UChar)uSig);
   1.904 +
   1.905 +                    if (useOffsets) {
   1.906 +                        // insert a pseudo-offset into fromoffsets[] as well
   1.907 +                        // to keep the array parallel with the UChars
   1.908 +                        memmove(fromoffsets + 1, fromoffsets, ulen * 4);
   1.909 +                        fromoffsets[0] = -1;
   1.910 +                    }
   1.911 +
   1.912 +                    // account for the additional UChar and offset
   1.913 +                    ++ulen;
   1.914 +                }
   1.915 +                sig = 0;
   1.916 +            }
   1.917 +
   1.918 +            // Convert the Unicode buffer into the destination codepage
   1.919 +            // Again 'bufp' will be placed behind the last converted character
   1.920 +            // And 'unibufp' will be placed behind the last converted unicode character
   1.921 +            // At the last conversion flush should be set to true to ensure that
   1.922 +            // all characters left get converted
   1.923 +
   1.924 +            unibuf = unibufbp = u.getBuffer();
   1.925 +
   1.926 +            do {
   1.927 +                bufp = outbuf;
   1.928 +
   1.929 +                // Use fromSawEndOfBytes in addition to the flush flag -
   1.930 +                // it indicates whether the intermediate Unicode string
   1.931 +                // contains the very last UChars for the very last input bytes.
   1.932 +                ucnv_fromUnicode(convto, &bufp, outbuf + bufsz,
   1.933 +                                 &unibufbp,
   1.934 +                                 unibuf + ulen,
   1.935 +                                 NULL, (UBool)(flush && fromSawEndOfBytes), &err);
   1.936 +
   1.937 +                // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
   1.938 +                // converting all of the intermediate UChars.
   1.939 +                // See comment for fromSawEndOfBytes.
   1.940 +                toSawEndOfUnicode = (UBool)U_SUCCESS(err);
   1.941 +
   1.942 +                if (err == U_BUFFER_OVERFLOW_ERROR) {
   1.943 +                    err = U_ZERO_ERROR;
   1.944 +                } else if (U_FAILURE(err)) {
   1.945 +                    UChar errorUChars[4];
   1.946 +                    const char *errtag;
   1.947 +                    char pos[32];
   1.948 +                    UChar32 c;
   1.949 +                    int8_t i, length, errorLength;
   1.950 +
   1.951 +                    UErrorCode localError = U_ZERO_ERROR;
   1.952 +                    errorLength = (int8_t)LENGTHOF(errorUChars);
   1.953 +                    ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError);
   1.954 +                    if (U_FAILURE(localError) || errorLength == 0) {
   1.955 +                        // need at least 1 so that we don't access beyond the length of fromoffsets[]
   1.956 +                        errorLength = 1;
   1.957 +                    }
   1.958 +
   1.959 +                    int32_t ferroffset;
   1.960 +
   1.961 +                    if (useOffsets) {
   1.962 +                        // Unicode buffer offset of the start of the error UChars
   1.963 +                        ferroffset = (int32_t)((unibufbp - unibuf) - errorLength);
   1.964 +                        if (ferroffset < 0) {
   1.965 +                            // approximation - the character started in the previous Unicode buffer
   1.966 +                            ferroffset = 0;
   1.967 +                        }
   1.968 +
   1.969 +                        // get the corresponding byte offset out of fromoffsets[]
   1.970 +                        // go back if the offset is not known for some of the UChars
   1.971 +                        int32_t fromoffset;
   1.972 +                        do {
   1.973 +                            fromoffset = fromoffsets[ferroffset];
   1.974 +                        } while (fromoffset < 0 && --ferroffset >= 0);
   1.975 +
   1.976 +                        // total input file offset =
   1.977 +                        // input file offset of the current byte buffer +
   1.978 +                        // byte buffer offset of where the current Unicode buffer is converted from +
   1.979 +                        // fromoffsets[Unicode offset]
   1.980 +                        ferroffset = infoffset + (prevbufp - buf) + fromoffset;
   1.981 +                        errtag = "problemCvtFromU";
   1.982 +                    } else {
   1.983 +                        // Do not use fromoffsets if (t != NULL) because the Unicode text may
   1.984 +                        // be different from what the offsets refer to.
   1.985 +
   1.986 +                        // output file offset
   1.987 +                        ferroffset = (int32_t)(outfoffset + (bufp - outbuf));
   1.988 +                        errtag = "problemCvtFromUOut";
   1.989 +                    }
   1.990 +
   1.991 +                    length = (int8_t)sprintf(pos, "%u", (int)ferroffset);
   1.992 +
   1.993 +                    // output the code points that caused the error
   1.994 +                    UnicodeString str;
   1.995 +                    for (i = 0; i < errorLength;) {
   1.996 +                        if (i > 0) {
   1.997 +                            str.append((UChar)uSP);
   1.998 +                        }
   1.999 +                        U16_NEXT(errorUChars, i, errorLength, c);
  1.1000 +                        if (c >= 0x100000) {
  1.1001 +                            str.append(nibbleToHex((uint8_t)(c >> 20)));
  1.1002 +                        }
  1.1003 +                        if (c >= 0x10000) {
  1.1004 +                            str.append(nibbleToHex((uint8_t)(c >> 16)));
  1.1005 +                        }
  1.1006 +                        str.append(nibbleToHex((uint8_t)(c >> 12)));
  1.1007 +                        str.append(nibbleToHex((uint8_t)(c >> 8)));
  1.1008 +                        str.append(nibbleToHex((uint8_t)(c >> 4)));
  1.1009 +                        str.append(nibbleToHex((uint8_t)c));
  1.1010 +                    }
  1.1011 +
  1.1012 +                    initMsg(pname);
  1.1013 +                    u_wmsg(stderr, errtag,
  1.1014 +                            UnicodeString(pos, length, "").getTerminatedBuffer(),
  1.1015 +                            str.getTerminatedBuffer(),
  1.1016 +                           u_wmsg_errorName(err));
  1.1017 +                    u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer());
  1.1018 +
  1.1019 +                    willexit = TRUE;
  1.1020 +                    err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
  1.1021 +                }
  1.1022 +
  1.1023 +                // Replaced a check for whether the intermediate Unicode characters were all consumed by
  1.1024 +                // looping until they are; message key "premEnd" now obsolete.
  1.1025 +
  1.1026 +                // Finally, write the converted buffer to the output file
  1.1027 +                size_t outlen = (size_t) (bufp - outbuf);
  1.1028 +                outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile));
  1.1029 +                if (wr != outlen) {
  1.1030 +                    UnicodeString str(strerror(errno));
  1.1031 +                    initMsg(pname);
  1.1032 +                    u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer());
  1.1033 +                    willexit = TRUE;
  1.1034 +                }
  1.1035 +
  1.1036 +                if (willexit) {
  1.1037 +                    goto error_exit;
  1.1038 +                }
  1.1039 +            } while (!toSawEndOfUnicode);
  1.1040 +        } while (!fromSawEndOfBytes);
  1.1041 +    } while (!flush);           // Stop when we have flushed the
  1.1042 +                                // converters (this means that it's
  1.1043 +                                // the end of output)
  1.1044 +
  1.1045 +    goto normal_exit;
  1.1046 +
  1.1047 +error_exit:
  1.1048 +    ret = FALSE;
  1.1049 +
  1.1050 +normal_exit:
  1.1051 +    // Cleanup.
  1.1052 +
  1.1053 +    ucnv_close(convfrom);
  1.1054 +    ucnv_close(convto);
  1.1055 +
  1.1056 +#if !UCONFIG_NO_TRANSLITERATION
  1.1057 +    delete t;
  1.1058 +#endif
  1.1059 +
  1.1060 +    if (closeFile) {
  1.1061 +        fclose(infile);
  1.1062 +    }
  1.1063 +
  1.1064 +    return ret;
  1.1065 +}
  1.1066 +
  1.1067 +// Print the usage line (to stderr if ecode != 0, otherwise to stdout, where
  1.1068 +// the help text and possibly the callback names follow), then exit(ecode).
  1.1069 +static void usage(const char *pname, int ecode) {
  1.1070 +    int32_t msgLen = 0;
  1.1071 +    UErrorCode err = U_ZERO_ERROR;
  1.1072 +    FILE *fp = ecode ? stderr : stdout;
  1.1073 +    initMsg(pname);
  1.1074 +    const UChar *msg = ures_getStringByKey(
  1.1075 +        gBundle, ecode ? "lcUsageWord" : "ucUsageWord", &msgLen, &err);
  1.1076 +    // If the lookup failed, do not build a string from msg/msgLen: use "".
  1.1077 +    UnicodeString mname;
  1.1078 +    if (U_SUCCESS(err) && msg != NULL) {
  1.1079 +        mname.setTo(msg, msgLen);
  1.1080 +    }
  1.1081 +    UnicodeString upname(pname, (int32_t)uprv_strlen(pname));
  1.1082 +    int res = u_wmsg(fp, "usage", mname.getTerminatedBuffer(),
  1.1083 +                     upname.getTerminatedBuffer());
  1.1084 +    if (!ecode) {
  1.1085 +        if (!res) {
  1.1086 +            fputc('\n', fp);
  1.1087 +        }
  1.1088 +        if (!u_wmsg(fp, "help")) {
  1.1089 +            /* Now dump callbacks and finish. */
  1.1090 +            const int count =
  1.1091 +                (int)(sizeof(transcode_callbacks) / sizeof(*transcode_callbacks));
  1.1092 +            for (int i = 0; i < count; ++i) {
  1.1093 +                fprintf(fp, " %s", transcode_callbacks[i].name);
  1.1094 +            }
  1.1095 +            fputc('\n', fp);
  1.1096 +        }
  1.1097 +    }
  1.1098 +    exit(ecode);
  1.1099 +}
  1.1100 +
  1.1101 +/*
  1.1102 + * Entry point: parse the options, serve the listing modes, open the output and
  1.1103 + * convert each non-option argument as an input file (with none, make a single
  1.1104 + * convertFile() call with a null file name). Returns 0 or an error code 1-4.
  1.1105 + */
  1.1106 +extern int
  1.1107 +main(int argc, char **argv)
  1.1108 +{
  1.1109 +    FILE *outfile;
  1.1110 +    int ret = 0;
  1.1111 +    size_t bufsz = DEFAULT_BUFSZ;
  1.1112 +
  1.1113 +    const char *fromcpage = 0;
  1.1114 +    const char *tocpage = 0;
  1.1115 +    const char *translit = 0;
  1.1116 +    const char *outfilestr = 0;
  1.1117 +    UBool fallback = FALSE;
  1.1118 +
  1.1119 +    UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP;
  1.1120 +    const void *fromuctxt = 0;
  1.1121 +    UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP;
  1.1122 +    const void *touctxt = 0;
  1.1123 +
  1.1124 +    char **iter, **remainArgv, **remainArgvLimit;
  1.1125 +    char **end = argv + argc;
  1.1126 +    const char *pname;
  1.1127 +
  1.1128 +    UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE;
  1.1129 +    const char *printName = 0;
  1.1130 +    UBool verbose = FALSE;
  1.1131 +    UErrorCode status = U_ZERO_ERROR;
  1.1132 +    ConvertFile cf;
  1.1133 +
  1.1134 +    /* Initialize ICU */
  1.1135 +    u_init(&status);
  1.1136 +    if (U_FAILURE(status)) {
  1.1137 +        fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
  1.1138 +            argv[0], u_errorName(status));
  1.1139 +        exit(1);
  1.1140 +    }
  1.1141 +
  1.1142 +    // Get and prettify pname.
  1.1143 +    pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR);
  1.1144 +#if U_PLATFORM_USES_ONLY_WIN32_API
  1.1145 +    if (!pname) {
  1.1146 +        pname = uprv_strrchr(*argv, '/');
  1.1147 +    }
  1.1148 +#endif
  1.1149 +    if (!pname) {
  1.1150 +        pname = *argv;
  1.1151 +    } else {
  1.1152 +        ++pname;
  1.1153 +    }
  1.1154 +
  1.1155 +    // First, get the arguments from command-line
  1.1156 +    // to know the codepages to convert between
  1.1157 +
  1.1158 +    remainArgv = remainArgvLimit = argv + 1;
  1.1159 +    for (iter = argv + 1; iter != end; iter++) {
  1.1160 +        // Check for from charset
  1.1161 +        if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) {
  1.1162 +            iter++;
  1.1163 +            if (iter != end)
  1.1164 +                fromcpage = *iter;
  1.1165 +            else
  1.1166 +                usage(pname, 1);
  1.1167 +        } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) {
  1.1168 +            iter++;
  1.1169 +            if (iter != end)
  1.1170 +                tocpage = *iter;
  1.1171 +            else
  1.1172 +                usage(pname, 1);
  1.1173 +        } else if (strcmp("-x", *iter) == 0) {
  1.1174 +            iter++;
  1.1175 +            if (iter != end)
  1.1176 +                translit = *iter;
  1.1177 +            else
  1.1178 +                usage(pname, 1);
  1.1179 +        } else if (!strcmp("--fallback", *iter)) {
  1.1180 +            fallback = TRUE;
  1.1181 +        } else if (!strcmp("--no-fallback", *iter)) {
  1.1182 +            fallback = FALSE;
  1.1183 +        } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
  1.1184 +            iter++;
  1.1185 +            if (iter != end) {
  1.1186 +                // strtol() + ERANGE check: unlike atoi(), overflowing input is detected.
  1.1187 +                errno = 0;
  1.1188 +                long blockSize = strtol(*iter, NULL, 10);
  1.1189 +                if (errno == ERANGE || blockSize <= 0 || blockSize > INT32_MAX) {
  1.1190 +                    UnicodeString str(*iter);
  1.1191 +                    initMsg(pname);
  1.1192 +                    u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer());
  1.1193 +                    return 3;
  1.1194 +                }
  1.1195 +                bufsz = (size_t)blockSize;
  1.1196 +            } else {
  1.1197 +                usage(pname, 1);
  1.1198 +            }
  1.1199 +        } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
  1.1200 +            if (printTranslits) {
  1.1201 +                usage(pname, 1);
  1.1202 +            }
  1.1203 +            printConvs = TRUE;
  1.1204 +        } else if (strcmp("--default-code", *iter) == 0) {
  1.1205 +            if (printTranslits) {
  1.1206 +                usage(pname, 1);
  1.1207 +            }
  1.1208 +            printName = ucnv_getDefaultName();
  1.1209 +        } else if (strcmp("--list-code", *iter) == 0) {
  1.1210 +            if (printTranslits) {
  1.1211 +                usage(pname, 1);
  1.1212 +            }
  1.1213 +
  1.1214 +            iter++;
  1.1215 +            if (iter != end) {
  1.1216 +                UErrorCode e = U_ZERO_ERROR;
  1.1217 +                printName = ucnv_getAlias(*iter, 0, &e);
  1.1218 +                if (U_FAILURE(e) || !printName) {
  1.1219 +                    UnicodeString str(*iter);
  1.1220 +                    initMsg(pname);
  1.1221 +                    u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer());
  1.1222 +                    return 2;
  1.1223 +                }
  1.1224 +            } else
  1.1225 +                usage(pname, 1);
  1.1226 +        } else if (strcmp("--canon", *iter) == 0) {
  1.1227 +            printCanon = TRUE;
  1.1228 +        } else if (strcmp("-L", *iter) == 0
  1.1229 +            || !strcmp("--list-transliterators", *iter)) {
  1.1230 +            if (printConvs) {
  1.1231 +                usage(pname, 1);
  1.1232 +            }
  1.1233 +            printTranslits = TRUE;
  1.1234 +        } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter)
  1.1235 +            || !strcmp("--help", *iter)) {
  1.1236 +            usage(pname, 0);
  1.1237 +        } else if (!strcmp("-c", *iter)) {
  1.1238 +            fromucallback = UCNV_FROM_U_CALLBACK_SKIP;
  1.1239 +        } else if (!strcmp("--to-callback", *iter)) {
  1.1240 +            // "to" is the conversion to the target codepage: the from-Unicode callback
  1.1241 +            iter++;
  1.1242 +            if (iter != end) {
  1.1243 +                const struct callback_ent *cbe = findCallback(*iter);
  1.1244 +                if (cbe) {
  1.1245 +                    fromucallback = cbe->fromu;
  1.1246 +                    fromuctxt = cbe->fromuctxt;
  1.1247 +                } else {
  1.1248 +                    UnicodeString str(*iter);
  1.1249 +                    initMsg(pname);
  1.1250 +                    u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
  1.1251 +                    return 4;
  1.1252 +                }
  1.1253 +            } else {
  1.1254 +                usage(pname, 1);
  1.1255 +            }
  1.1256 +        } else if (!strcmp("--from-callback", *iter)) {
  1.1257 +            // "from" is the conversion from the source codepage: the to-Unicode callback
  1.1258 +            iter++;
  1.1259 +            if (iter != end) {
  1.1260 +                const struct callback_ent *cbe = findCallback(*iter);
  1.1261 +                if (cbe) {
  1.1262 +                    toucallback = cbe->tou;
  1.1263 +                    touctxt = cbe->touctxt;
  1.1264 +                } else {
  1.1265 +                    UnicodeString str(*iter);
  1.1266 +                    initMsg(pname);
  1.1267 +                    u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
  1.1268 +                    return 4;
  1.1269 +                }
  1.1270 +            } else {
  1.1271 +                usage(pname, 1);
  1.1272 +            }
  1.1273 +        } else if (!strcmp("-i", *iter)) {
  1.1274 +            toucallback = UCNV_TO_U_CALLBACK_SKIP;
  1.1275 +        } else if (!strcmp("--callback", *iter)) {
  1.1276 +            iter++;
  1.1277 +            if (iter != end) {
  1.1278 +                const struct callback_ent *cbe = findCallback(*iter);
  1.1279 +                if (cbe) {
  1.1280 +                    fromucallback = cbe->fromu;
  1.1281 +                    fromuctxt = cbe->fromuctxt;
  1.1282 +                    toucallback = cbe->tou;
  1.1283 +                    touctxt = cbe->touctxt;
  1.1284 +                } else {
  1.1285 +                    UnicodeString str(*iter);
  1.1286 +                    initMsg(pname);
  1.1287 +                    u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
  1.1288 +                    return 4;
  1.1289 +                }
  1.1290 +            } else {
  1.1291 +                usage(pname, 1);
  1.1292 +            }
  1.1293 +        } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
  1.1294 +            verbose = FALSE;
  1.1295 +        } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
  1.1296 +            verbose = TRUE;
  1.1297 +        } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
  1.1298 +            printf("%s v2.1  ICU " U_ICU_VERSION "\n", pname);
  1.1299 +            return 0;
  1.1300 +        } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
  1.1301 +            ++iter;
  1.1302 +            if (iter != end && !outfilestr) {
  1.1303 +                outfilestr = *iter;
  1.1304 +            } else {
  1.1305 +                usage(pname, 1);
  1.1306 +            }
  1.1307 +        } else if (0 == strcmp("--add-signature", *iter)) {
  1.1308 +            cf.signature = 1;
  1.1309 +        } else if (0 == strcmp("--remove-signature", *iter)) {
  1.1310 +            cf.signature = -1;
  1.1311 +        } else if (**iter == '-' && (*iter)[1]) {
  1.1312 +            usage(pname, 1);
  1.1313 +        } else {
  1.1314 +            // move a non-option up in argv[]
  1.1315 +            *remainArgvLimit++ = *iter;
  1.1316 +        }
  1.1317 +    }
  1.1318 +
  1.1319 +    if (printConvs || printName) {
  1.1320 +        return printConverters(pname, printName, printCanon) ? 2 : 0;
  1.1321 +    } else if (printTranslits) {
  1.1322 +        return printTransliterators(printCanon) ? 3 : 0;
  1.1323 +    }
  1.1324 +
  1.1325 +    if (!fromcpage || !uprv_strcmp(fromcpage, "-")) {
  1.1326 +        fromcpage = ucnv_getDefaultName();
  1.1327 +    }
  1.1328 +    if (!tocpage || !uprv_strcmp(tocpage, "-")) {
  1.1329 +        tocpage = ucnv_getDefaultName();
  1.1330 +    }
  1.1331 +
  1.1332 +    // Open the requested output file, or write to stdout if none or "-" is given
  1.1333 +    if (outfilestr != 0 && strcmp(outfilestr, "-")) {
  1.1334 +        outfile = fopen(outfilestr, "wb");
  1.1335 +        if (outfile == 0) {
  1.1336 +            // Build the errno text first, before other calls can change errno.
  1.1337 +            UnicodeString str2(strerror(errno), "");
  1.1338 +            UnicodeString str1(outfilestr, "");
  1.1339 +            initMsg(pname);
  1.1340 +            u_wmsg(stderr, "cantCreateOutputF",
  1.1341 +                str1.getTerminatedBuffer(), str2.getTerminatedBuffer());
  1.1342 +            return 1;
  1.1343 +        }
  1.1344 +    } else {
  1.1345 +        outfilestr = "-";
  1.1346 +        outfile = stdout;
  1.1347 +#ifdef USE_FILENO_BINARY_MODE
  1.1348 +        if (setmode(fileno(outfile), O_BINARY) == -1) {
  1.1349 +            initMsg(pname);
  1.1350 +            u_wmsg(stderr, "cantSetOutBinMode");
  1.1351 +            exit(-1);
  1.1352 +        }
  1.1353 +#endif
  1.1354 +    }
  1.1355 +
  1.1356 +    // Convert each remaining (non-option) argument as an input file; with no
  1.1357 +    // such argument, make a single call with a null file name.
  1.1358 +    cf.setBufferSize(bufsz);
  1.1359 +    iter = remainArgv;
  1.1360 +    do {
  1.1361 +        char *infilestr = (iter != remainArgvLimit) ? *iter++ : 0;
  1.1362 +        if (!cf.convertFile(
  1.1363 +                pname, fromcpage, toucallback, touctxt, tocpage,
  1.1364 +                fromucallback, fromuctxt, fallback, translit, infilestr,
  1.1365 +                outfile, verbose)
  1.1366 +        ) {
  1.1367 +            goto error_exit;
  1.1368 +        }
  1.1369 +    } while (iter != remainArgvLimit);
  1.1370 +
  1.1371 +    goto normal_exit;
  1.1372 +error_exit:
  1.1373 +#if !UCONFIG_NO_LEGACY_CONVERSION
  1.1374 +    ret = 1;
  1.1375 +#else
  1.1376 +    fprintf(stderr, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n");
  1.1377 +#endif
  1.1378 +normal_exit:
  1.1379 +    // Close the output file, or flush stdout; unless an error is already recorded
  1.1380 +    // in ret, report a failure so that lost buffered output does not go unnoticed.
  1.1381 +    if ((outfile != stdout ? fclose(outfile) : fflush(outfile)) != 0 && ret == 0) {
  1.1382 +        UnicodeString str(strerror(errno));
  1.1383 +        initMsg(pname);
  1.1384 +        u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer());
  1.1385 +        ret = 1;
  1.1386 +    }
  1.1387 +
  1.1388 +    u_cleanup();
  1.1389 +    return ret;
  1.1390 +}
  1.1391 +
  1.1392 +
  1.1393 +/*
  1.1394 + * Hey, Emacs, please set the following:
  1.1395 + *
  1.1396 + * Local Variables:
  1.1397 + * indent-tabs-mode: nil
  1.1398 + * End:
  1.1399 + *
  1.1400 + */

mercurial