intl/icu/source/tools/genrb/read.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/tools/genrb/read.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,474 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 1998-2012, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*
    1.12 +* File read.c
    1.13 +*
    1.14 +* Modification History:
    1.15 +*
    1.16 +*   Date        Name        Description
    1.17 +*   05/26/99    stephen     Creation.
    1.18 +*   5/10/01     Ram         removed ustdio dependency
    1.19 +*******************************************************************************
    1.20 +*/
    1.21 +
    1.22 +#include "read.h"
    1.23 +#include "errmsg.h"
    1.24 +#include "unicode/ustring.h"
    1.25 +#include "unicode/utf16.h"
    1.26 +
    1.27 +#define OPENBRACE    0x007B /* U+007B left curly brace; returned by getNextToken as TOK_OPEN_BRACE */
    1.28 +#define CLOSEBRACE   0x007D /* U+007D right curly brace; returned by getNextToken as TOK_CLOSE_BRACE */
    1.29 +#define COMMA        0x002C /* U+002C comma; returned by getNextToken as TOK_COMMA */
    1.30 +#define QUOTE        0x0022 /* U+0022 double quote; opens and closes a quoted string */
    1.31 +#define ESCAPE       0x005C /* U+005C backslash; handed to unescape() by getStringToken */
    1.32 +#define SLASH        0x002F /* U+002F slash; first character of both comment openers */
    1.33 +#define ASTERISK     0x002A /* U+002A asterisk; second character of a block comment opener */
    1.34 +#define SPACE        0x0020 /* U+0020 space; inserted between merged string pieces */
    1.35 +#define COLON        0x003A /* U+003A colon; returned by getNextToken as TOK_COLON */
    1.36 +#define BADBOM       0xFFFE /* U+FFFE; getNextToken returns TOK_ERROR for it (presumably a byte-swapped U+FEFF BOM - confirm) */
    1.37 +#define CR           0x000D /* U+000D carriage return */
    1.38 +#define LF           0x000A /* U+000A line feed */
    1.39 +               
    1.40 +static int32_t lineCount; /* current line number: set to 1 by resetLineNumber(), incremented by isWhitespace(), isNewline() and getStringToken() */
    1.41 +
    1.42 +/* Prototypes of the file-local helpers defined further down */
    1.43 +static enum ETokenType getStringToken(UCHARBUF *buf,
    1.44 +                                      UChar32 initialChar,
    1.45 +                                      struct UString *token,
    1.46 +                                      UErrorCode *status);
    1.47 +
    1.48 +static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
    1.49 +static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
    1.50 +static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
    1.51 +static UBool   isWhitespace          (UChar32 c); /* side effect: increments lineCount for U+000A and U+2029 */
    1.52 +static UBool   isNewline             (UChar32 c); /* side effect: increments lineCount for U+000A and U+2029 */
    1.53 +
    1.54 +U_CFUNC void resetLineNumber(void) { /* Reset the file-wide line counter; (void) makes this a real prototype */
    1.55 +    lineCount = 1; /* line numbers are 1-based */
    1.56 +}
    1.57 +
    1.58 +/* Read and return the next token from the stream.  If the token is of
    1.59 +   type TOK_STRING, fill in the token parameter with the token.  If the
    1.60 +   token is TOK_ERROR, then the status parameter may contain the
    1.61 +   specific error.  At the end of the file TOK_EOF is returned,
    1.62 +   indicating that all tokens have been returned.  This function will
    1.63 +   never return TOK_STRING twice in a row; instead, multiple adjacent
    1.64 +   string tokens will be merged into one (with no intervening space
    1.65 +   when both are quoted; see getStringToken). */
    1.66 +U_CFUNC enum ETokenType
    1.67 +getNextToken(UCHARBUF* buf,
    1.68 +             struct UString *token,
    1.69 +             uint32_t *linenumber, /* out: linenumber of token */
    1.70 +             struct UString *comment,
    1.71 +             UErrorCode *status) {
    1.72 +    UChar32         first;
    1.73 +    enum ETokenType kind;
    1.74 +
    1.75 +    /* Nothing is read once an error has already been recorded. */
    1.76 +    if (U_FAILURE(*status)) {
    1.77 +        return TOK_ERROR;
    1.78 +    }
    1.79 +
    1.80 +    /* First significant character: whitespace is skipped and
    1.81 +       'comment' is handed on to getNextChar. */
    1.82 +    first = getNextChar(buf, TRUE, comment, status);
    1.83 +    if (U_FAILURE(*status)) {
    1.84 +        return TOK_ERROR;
    1.85 +    }
    1.86 +
    1.87 +    /* The single-character results below report this line. */
    1.88 +    *linenumber = lineCount;
    1.89 +    if (first == BADBOM) {
    1.90 +        return TOK_ERROR;
    1.91 +    } else if (first == OPENBRACE) {
    1.92 +        return TOK_OPEN_BRACE;
    1.93 +    } else if (first == CLOSEBRACE) {
    1.94 +        return TOK_CLOSE_BRACE;
    1.95 +    } else if (first == COMMA) {
    1.96 +        return TOK_COMMA;
    1.97 +    } else if (first == U_EOF) {
    1.98 +        return TOK_EOF;
    1.99 +    } else if (first == COLON) {
   1.100 +        return TOK_COLON;
   1.101 +    }
   1.102 +
   1.103 +    /* Anything else begins a string token.  The line is reported
   1.104 +       again afterwards, once the whole string has been consumed. */
   1.105 +    kind = getStringToken(buf, first, token, status);
   1.106 +    *linenumber = lineCount;
   1.107 +    return kind;
   1.108 +}
   1.109 +
   1.110 +/* Copy a string token into the given struct UString.  Upon entry, we
   1.111 +   have already read the first character of the string token, which is
   1.112 +   not a whitespace character (but may be a QUOTE or ESCAPE). This
   1.113 +   function reads all subsequent characters that belong with this
   1.114 +   string, and copies them into the token parameter. The other
   1.115 +   important, and slightly convoluted, purpose of this function is to
   1.116 +   merge adjacent strings.  It looks forward a bit, and if the next
   1.117 +   non-comment, non-whitespace item is a string, it reads it in as
   1.118 +   well.  If two adjacent strings are quoted, they are merged without
   1.119 +   intervening space.  Otherwise a single SPACE character is
   1.120 +   inserted. */
   1.121 +static enum ETokenType getStringToken(UCHARBUF* buf, /* in: character source */
   1.122 +                                      UChar32 initialChar, /* in: first character of the token, already read by the caller */
   1.123 +                                      struct UString *token, /* out: reset to length 0, then receives the merged string */
   1.124 +                                      UErrorCode *status) { /* in/out; the function returns TOK_STRING, TOK_EOF or TOK_ERROR */
   1.125 +    UBool    lastStringWasQuoted; /* TRUE while the most recently merged piece was a quoted one */
   1.126 +    UChar32  c; /* character currently being examined */
   1.127 +    UChar    target[3] = { '\0' }; /* scratch buffer passed (via pTarget) to U_APPEND_CHAR32 for one character */
   1.128 +    UChar    *pTarget   = target; /* cursor into target; reset to its start around every append */
   1.129 +    int      len=0; /* length passed to ustr_uscat; presumably set by U_APPEND_CHAR32 - confirm */
   1.130 +    UBool    isFollowingCharEscaped=FALSE; /* TRUE when the next character inside quotes must be taken literally */
   1.131 +    UBool    isNLUnescaped = FALSE; /* TRUE when unescape() just produced a CR or LF, which must not be counted as a line */
   1.132 +    UChar32  prevC=0; /* previous character seen inside the quoted string */
   1.133 +
   1.134 +    /* We are guaranteed on entry that initialChar is not a whitespace
   1.135 +       character. If we are at the EOF, or have some other problem, it
   1.136 +       doesn't matter; we still want to validly return the initialChar
   1.137 +       (if nothing else) as a string token. */
   1.138 +
   1.139 +    if (U_FAILURE(*status)) {
   1.140 +        return TOK_ERROR;
   1.141 +    }
   1.142 +
   1.143 +    /* setup */
   1.144 +    lastStringWasQuoted = FALSE;
   1.145 +    c = initialChar;
   1.146 +    ustr_setlen(token, 0, status); /* start from an empty token */
   1.147 +
   1.148 +    if (U_FAILURE(*status)) {
   1.149 +        return TOK_ERROR;
   1.150 +    }
   1.151 +
   1.152 +    for (;;) { /* one iteration per string piece that is merged into token */
   1.153 +        if (c == QUOTE) { /* quoted piece */
   1.154 +            if (!lastStringWasQuoted && token->fLength > 0) { /* previous piece was unquoted: separate with one SPACE */
   1.155 +                ustr_ucat(token, SPACE, status);
   1.156 +
   1.157 +                if (U_FAILURE(*status)) {
   1.158 +                    return TOK_ERROR;
   1.159 +                }
   1.160 +            }
   1.161 +
   1.162 +            lastStringWasQuoted = TRUE;
   1.163 +
   1.164 +            for (;;) { /* copy characters up to the closing QUOTE */
   1.165 +                c = ucbuf_getc(buf,status);
   1.166 +
   1.167 +                /* EOF reached before the closing QUOTE (unterminated quoted string) */
   1.168 +                if (c == U_EOF) {
   1.169 +                    return TOK_EOF;
   1.170 +                }
   1.171 +
   1.172 +                /* ucbuf_getc reported a failure */
   1.173 +                if (U_FAILURE(*status)) {
   1.174 +                    return TOK_ERROR;
   1.175 +                }
   1.176 +
   1.177 +                if (c == QUOTE && !isFollowingCharEscaped) { /* unescaped QUOTE closes this piece; it is not stored */
   1.178 +                    break;
   1.179 +                }
   1.180 +
   1.181 +                if (c == ESCAPE  && !isFollowingCharEscaped) {
   1.182 +                    pTarget = target;
   1.183 +                    c       = unescape(buf, status);
   1.184 +
   1.185 +                    if (c == U_ERR) {
   1.186 +                        return TOK_ERROR;
   1.187 +                    }
   1.188 +                    if(c == CR || c == LF){
   1.189 +                        isNLUnescaped = TRUE; /* this line break came out of unescape(), not straight from the input */
   1.190 +                    }
   1.191 +                }               
   1.192 +
   1.193 +                if(c==ESCAPE && !isFollowingCharEscaped){ /* unescape() handed back an ESCAPE */
   1.194 +                    isFollowingCharEscaped = TRUE; /* the ESCAPE is not stored; the next character is taken literally */
   1.195 +                }else{
   1.196 +                    U_APPEND_CHAR32(c, pTarget,len);
   1.197 +                    pTarget = target;
   1.198 +                    ustr_uscat(token, pTarget,len, status);
   1.199 +                    isFollowingCharEscaped = FALSE;
   1.200 +                    len=0;
   1.201 +                    if(c == CR || c == LF){
   1.202 +                        if(isNLUnescaped == FALSE && prevC!=CR){ /* count a literal line break, but not one that directly follows a CR */
   1.203 +                            lineCount++;
   1.204 +                        }
   1.205 +                        isNLUnescaped = FALSE;
   1.206 +                    }
   1.207 +                }
   1.208 +                
   1.209 +                if (U_FAILURE(*status)) {
   1.210 +                    return TOK_ERROR;
   1.211 +                }
   1.212 +                prevC = c;
   1.213 +            }
   1.214 +        } else { /* unquoted piece */
   1.215 +            if (token->fLength > 0) { /* an unquoted piece is always separated from earlier text by one SPACE */
   1.216 +                ustr_ucat(token, SPACE, status);
   1.217 +
   1.218 +                if (U_FAILURE(*status)) {
   1.219 +                    return TOK_ERROR;
   1.220 +                }
   1.221 +            }
   1.222 +            
   1.223 +            if(lastStringWasQuoted){
   1.224 +                if(getShowWarning()){
   1.225 +                    warning(lineCount, "Mixing quoted and unquoted strings");
   1.226 +                }
   1.227 +                if(isStrict()){
   1.228 +                    return TOK_ERROR;
   1.229 +                }
   1.230 +
   1.231 +            }
   1.232 +
   1.233 +            lastStringWasQuoted = FALSE;
   1.234 +            
   1.235 +            /* The lastStringWasQuoted check above catches an
   1.236 +             * unquoted piece that follows a quoted one: it
   1.237 +             * warns when getShowWarning() is set and returns
   1.238 +             * TOK_ERROR when isStrict() is set.
   1.239 +             */
   1.240 +
   1.241 +            if (c == ESCAPE) {
   1.242 +                pTarget = target;
   1.243 +                c       = unescape(buf, status);
   1.244 +
   1.245 +                /* EOF reached */
   1.246 +                if (c == U_EOF) { /* NOTE(review): the other two unescape() call sites test U_ERR instead - confirm which is intended */
   1.247 +                    return TOK_ERROR;
   1.248 +                }
   1.249 +            }
   1.250 +
   1.251 +            U_APPEND_CHAR32(c, pTarget,len);
   1.252 +            pTarget = target;
   1.253 +            ustr_uscat(token, pTarget,len, status);
   1.254 +            len=0;
   1.255 +            
   1.256 +            if (U_FAILURE(*status)) {
   1.257 +                return TOK_ERROR;
   1.258 +            }
   1.259 +
   1.260 +            for (;;) { /* copy the rest of the unquoted piece */
   1.261 +                /* DON'T skip whitespace */
   1.262 +                c = getNextChar(buf, FALSE, NULL, status);
   1.263 +
   1.264 +                /* EOF reached */
   1.265 +                if (c == U_EOF) {
   1.266 +                    ucbuf_ungetc(c, buf);
   1.267 +                    return TOK_STRING;
   1.268 +                }
   1.269 +
   1.270 +                if (U_FAILURE(*status)) { /* a failed read still returns TOK_STRING; status carries the error */
   1.271 +                    return TOK_STRING;
   1.272 +                }
   1.273 +
   1.274 +                if (c == QUOTE
   1.275 +                        || c == OPENBRACE
   1.276 +                        || c == CLOSEBRACE
   1.277 +                        || c == COMMA
   1.278 +                        || c == COLON) {
   1.279 +                    ucbuf_ungetc(c, buf); /* delimiter ends the piece; put it back for the code after this loop */
   1.280 +                    break;
   1.281 +                }
   1.282 +
   1.283 +                if (isWhitespace(c)) { /* whitespace ends the piece; isWhitespace also counts a line break */
   1.284 +                    break;
   1.285 +                }
   1.286 +
   1.287 +                if (c == ESCAPE) {
   1.288 +                    pTarget = target;
   1.289 +                    c       = unescape(buf, status);
   1.290 +
   1.291 +                    if (c == U_ERR) {
   1.292 +                        return TOK_ERROR;
   1.293 +                    }
   1.294 +                }
   1.295 +
   1.296 +                U_APPEND_CHAR32(c, pTarget,len);
   1.297 +                pTarget = target;
   1.298 +                ustr_uscat(token, pTarget,len, status);
   1.299 +                len=0;
   1.300 +                if (U_FAILURE(*status)) {
   1.301 +                    return TOK_ERROR;
   1.302 +                }
   1.303 +            }
   1.304 +        }
   1.305 +
   1.306 +        /* DO skip whitespace: look ahead for the start of another piece to merge */
   1.307 +        c = getNextChar(buf, TRUE, NULL, status);
   1.308 +
   1.309 +        if (U_FAILURE(*status)) {
   1.310 +            return TOK_STRING;
   1.311 +        }
   1.312 +
   1.313 +        if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) { /* NOTE(review): U_EOF is not tested here, so at end of input the loop runs again with c == U_EOF - confirm */
   1.314 +            ucbuf_ungetc(c, buf);
   1.315 +            return TOK_STRING;
   1.316 +        }
   1.317 +    }
   1.318 +}
   1.319 +
   1.320 +/* Retrieve the next character, skipping comments.  If skipwhite is
   1.321 +   true, whitespace is skipped as well. */
   1.322 +static UChar32 getNextChar(UCHARBUF* buf, /* in: character source */
   1.323 +                           UBool skipwhite, /* in: TRUE to skip whitespace in addition to comments */
   1.324 +                           struct UString *token, /* out: receives the text of a "/ * *" comment; NULL is passed on as is */
   1.325 +                           UErrorCode *status) {
   1.326 +    UChar32 c, c2, c3;
   1.327 +    /* Returns the next character that is not inside a comment (a lone SLASH is returned as is),
   1.328 +       or U_EOF at end of input or when status already holds a failure. */
   1.329 +    if (U_FAILURE(*status)) {
   1.330 +        return U_EOF;
   1.331 +    }
   1.332 +
   1.333 +    for (;;) {
   1.334 +        c = ucbuf_getc(buf,status);
   1.335 +        if (c == U_EOF) {
   1.336 +            return U_EOF;
   1.337 +        }
   1.338 +
   1.339 +        /* isWhitespace also counts the line breaks it sees. */
   1.340 +        if (skipwhite && isWhitespace(c)) {
   1.341 +            continue;
   1.342 +        }
   1.343 +        /* This also handles the get() failing case */
   1.344 +        if (c != SLASH) {
   1.345 +            return c;
   1.346 +        }
   1.347 +
   1.348 +        c = ucbuf_getc(buf,status); /* "/c" */
   1.349 +        if (c == U_EOF) {
   1.350 +            return U_EOF;
   1.351 +        }
   1.352 +        switch (c) {
   1.353 +        case SLASH:  /* "//" */
   1.354 +            seekUntilNewline(buf, NULL, status);
   1.355 +            break;
   1.356 +        case ASTERISK:  /* " / * " */
   1.357 +            c2 = ucbuf_getc(buf, status); /* "/ * c" */
   1.358 +            if (c2 != ASTERISK) {
   1.359 +                /* Ordinary comment: c2 belongs to its body, so put it back; the text is discarded. */
   1.360 +                ucbuf_ungetc(c2, buf);
   1.361 +                seekUntilEndOfComment(buf, NULL, status);
   1.362 +                break;
   1.363 +            }
   1.364 +            c3 = ucbuf_getc(buf, status); /* "/ * * c" */
   1.365 +            if (c3 != SLASH) {
   1.366 +                /* "/ * *" comment with a body: put c3 back and store the text in token. */
   1.367 +                ucbuf_ungetc(c3, buf);
   1.368 +                seekUntilEndOfComment(buf, token, status);
   1.369 +            } /* else "/ * * /" is an empty comment that has now been consumed completely */
   1.370 +            break;
   1.371 +        default:
   1.372 +            ucbuf_ungetc(c, buf); /* "/c" - put back the c */
   1.373 +            /* If get() failed this is a NOP */
   1.374 +            return SLASH;
   1.375 +        }
   1.376 +    }
   1.377 +}
   1.378 +
   1.379 +static void seekUntilNewline(UCHARBUF* buf, /* in: character source */
   1.380 +                             struct UString *token, /* out: receives the consumed characters; may be NULL */
   1.381 +                             UErrorCode *status) { /* in/out: nothing is read if already a failure */
   1.382 +    UChar32 c;
   1.383 +
   1.384 +    if (U_FAILURE(*status)) {
   1.385 +        return;
   1.386 +    }
   1.387 +    /* Consume up to and including the next newline; isNewline also counts the line. */
   1.388 +    do {
   1.389 +        c = ucbuf_getc(buf,status);
   1.390 +        /* add the char to token, but never the U_EOF sentinel */
   1.391 +        if (token != NULL && c != U_EOF) {
   1.392 +            ustr_u32cat(token, c, status);
   1.393 +        }
   1.394 +    } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
   1.395 +}
   1.396 +
   1.397 +static void seekUntilEndOfComment(UCHARBUF *buf, /* in: character source, positioned inside a block comment */
   1.398 +                                  struct UString *token, /* out: receives the comment text; may be NULL */
   1.399 +                                  UErrorCode *status) { /* in/out: set to U_INVALID_FORMAT_ERROR if the comment never ends */
   1.400 +    UChar32  c, d;
   1.401 +    uint32_t line;
   1.402 +
   1.403 +    if (U_FAILURE(*status)) {
   1.404 +        return;
   1.405 +    }
   1.406 +
   1.407 +    line = lineCount; /* remembered so the error below names the line where the scan started */
   1.408 +
   1.409 +    do {
   1.410 +        c = ucbuf_getc(buf, status);
   1.411 +        if (c == U_EOF) {
   1.412 +            break; /* unterminated: reported below; the sentinel is not added to token */
   1.413 +        }
   1.414 +        if (c == ASTERISK) {
   1.415 +            d = ucbuf_getc(buf, status);
   1.416 +            if (d == SLASH) {
   1.417 +                break; /* closing delimiter found; neither of its characters is stored */
   1.418 +            }
   1.419 +            ucbuf_ungetc(d, buf);
   1.420 +        }
   1.421 +        /* add the char to token */
   1.422 +        if (token != NULL) {
   1.423 +            ustr_u32cat(token, c, status);
   1.424 +        }
   1.425 +        /* increment the lineCount: isNewline is called only for that side effect */
   1.426 +        (void)isNewline(c);
   1.427 +    } while (*status == U_ZERO_ERROR);
   1.428 +
   1.429 +    /* Running out of input before the closing delimiter is an error. */
   1.430 +    if (c == U_EOF) {
   1.431 +        *status = U_INVALID_FORMAT_ERROR;
   1.432 +        error(line, "unterminated comment detected");
   1.433 +    }
   1.434 +}
   1.435 +
   1.436 +U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
   1.437 +    UChar32 decoded = U_EOF; /* what is returned when status already holds a failure */
   1.438 +
   1.439 +    if (!U_FAILURE(*status)) {
   1.440 +        /* Callers have consumed the ESCAPE already, but ucbuf_getcx32
   1.441 +         * needs to see it, so push it back before reading. */
   1.442 +        ucbuf_ungetc(ESCAPE, buf);
   1.443 +        decoded = ucbuf_getcx32(buf, status);
   1.444 +    }
   1.445 +    return decoded;
   1.446 +}
   1.447 +
   1.448 +static UBool isWhitespace(UChar32 c) {
   1.449 +    /* Line feed and U+2029 are whitespace that also ends a line,
   1.450 +       so seeing one advances the file-wide line counter. */
   1.451 +    if (c == 0x000A || c == 0x2029) {
   1.452 +        lineCount++;
   1.453 +        return TRUE;
   1.454 +    }
   1.455 +
   1.456 +    /* Carriage return, space, tab and U+FEFF are whitespace that
   1.457 +       leaves the line counter alone. */
   1.458 +    if (c == 0x000D || c == 0x0020 || c == 0x0009 || c == 0xFEFF) {
   1.459 +        return TRUE;
   1.460 +    }
   1.461 +
   1.462 +    return FALSE;
   1.463 +}
   1.464 +
   1.465 +static UBool isNewline(UChar32 c) {
   1.466 +    /* Line feed and U+2029 end a line and advance the line counter. */
   1.467 +    if (c == 0x000A || c == 0x2029) {
   1.468 +        lineCount++;
   1.469 +        return TRUE;
   1.470 +    }
   1.471 +    /* A carriage return is reported as a newline but is not counted. */
   1.472 +    if (c == 0x000D) {
   1.473 +        return TRUE;
   1.474 +    }
   1.475 +
   1.476 +    return FALSE;
   1.477 +}

mercurial