michael@0: /*
michael@0: *******************************************************************************
michael@0: *
michael@0: *   Copyright (C) 1998-2012, International Business Machines
michael@0: *   Corporation and others.  All Rights Reserved.
michael@0: *
michael@0: *******************************************************************************
michael@0: *
michael@0: * File read.c
michael@0: *
michael@0: * Modification History:
michael@0: *
michael@0: *   Date        Name        Description
michael@0: *   05/26/99    stephen     Creation.
michael@0: *   5/10/01     Ram         removed ustdio dependency
michael@0: *******************************************************************************
michael@0: */
michael@0: 
michael@0: #include "read.h"
michael@0: #include "errmsg.h"
michael@0: #include "unicode/ustring.h"
michael@0: #include "unicode/utf16.h"
michael@0: 
michael@0: #define OPENBRACE    0x007B /* U+007B LEFT CURLY BRACKET */
michael@0: #define CLOSEBRACE   0x007D /* U+007D RIGHT CURLY BRACKET */
michael@0: #define COMMA        0x002C /* U+002C COMMA */
michael@0: #define QUOTE        0x0022 /* U+0022 QUOTATION MARK; delimits quoted strings */
michael@0: #define ESCAPE       0x005C /* U+005C REVERSE SOLIDUS; introduces an escape sequence */
michael@0: #define SLASH        0x002F /* U+002F SOLIDUS; first character of a comment opener */
michael@0: #define ASTERISK     0x002A /* U+002A ASTERISK; second character of a block comment opener */
michael@0: #define SPACE        0x0020 /* U+0020 SPACE; inserted between merged string pieces */
michael@0: #define COLON        0x003A /* U+003A COLON */
michael@0: #define BADBOM       0xFFFE /* U+FFFE: a byte order mark (U+FEFF) seen with swapped bytes */
michael@0: #define CR           0x000D /* U+000D CARRIAGE RETURN */
michael@0: #define LF           0x000A /* U+000A LINE FEED */
michael@0:                
michael@0: static int32_t lineCount; /* current input line: set to 1 by resetLineNumber(), advanced by the scanning helpers in this file */
michael@0: 
michael@0: /* Prototypes of the file-local helpers that are defined further down.
michael@0:    isWhitespace() and isNewline() also advance lineCount as a side
michael@0:    effect, so they must be called once per character read. */
michael@0: static enum ETokenType getStringToken(UCHARBUF *buf,
michael@0:                                       UChar32 initialChar,
michael@0:                                       struct UString *token,
michael@0:                                       UErrorCode *status);
michael@0: 
michael@0: static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
michael@0: static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
michael@0: static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
michael@0: static UBool   isWhitespace          (UChar32 c);
michael@0: static UBool   isNewline             (UChar32 c);
michael@0: 
michael@0: /* Restart line counting for a new input: the line that is read next is
michael@0:    reported as line 1.  Takes no arguments and returns nothing. */
michael@0: U_CFUNC void resetLineNumber(void) {
michael@0:     lineCount = 1;
michael@0: }
michael@0: 
michael@0: /* Read and return the next token from the stream.
michael@0:  *
michael@0:  * Whitespace and comments in front of the token are skipped by
michael@0:  * getNextChar(), which also receives the comment parameter.  The first
michael@0:  * character after them selects the result:
michael@0:  *   - BADBOM gives TOK_ERROR (status is left untouched in this case),
michael@0:  *   - OPENBRACE, CLOSEBRACE, COMMA and COLON give TOK_OPEN_BRACE,
michael@0:  *     TOK_CLOSE_BRACE, TOK_COMMA and TOK_COLON,
michael@0:  *   - U_EOF gives TOK_EOF,
michael@0:  *   - any other character starts a string: getStringToken() fills the
michael@0:  *     token parameter and its result is returned.  Adjacent strings are
michael@0:  *     merged there, so two string tokens are never returned in a row.
michael@0:  *
michael@0:  * TOK_ERROR is also returned when status holds a failure on entry or
michael@0:  * after the first character has been read.
michael@0:  *
michael@0:  * linenumber receives the value of lineCount once the first character
michael@0:  * has been read, and again after a string token has been read in full.
michael@0:  */
michael@0: U_CFUNC enum ETokenType
michael@0: getNextToken(UCHARBUF* buf,
michael@0:              struct UString *token,
michael@0:              uint32_t *linenumber, /* out: linenumber of token */
michael@0:              struct UString *comment,
michael@0:              UErrorCode *status) {
michael@0:     UChar32         first;
michael@0:     enum ETokenType type;
michael@0: 
michael@0:     if (U_FAILURE(*status)) {
michael@0:         return TOK_ERROR;
michael@0:     }
michael@0: 
michael@0:     /* First significant character; leading whitespace is skipped. */
michael@0:     first = getNextChar(buf, TRUE, comment, status);
michael@0:     if (U_FAILURE(*status)) {
michael@0:         return TOK_ERROR;
michael@0:     }
michael@0: 
michael@0:     *linenumber = lineCount;
michael@0: 
michael@0:     switch (first) {
michael@0:     case BADBOM:
michael@0:         type = TOK_ERROR;
michael@0:         break;
michael@0:     case OPENBRACE:
michael@0:         type = TOK_OPEN_BRACE;
michael@0:         break;
michael@0:     case CLOSEBRACE:
michael@0:         type = TOK_CLOSE_BRACE;
michael@0:         break;
michael@0:     case COMMA:
michael@0:         type = TOK_COMMA;
michael@0:         break;
michael@0:     case U_EOF:
michael@0:         type = TOK_EOF;
michael@0:         break;
michael@0:     case COLON:
michael@0:         type = TOK_COLON;
michael@0:         break;
michael@0:     default:
michael@0:         /* Everything else is the start of a string token. */
michael@0:         type = getStringToken(buf, first, token, status);
michael@0:         /* Reading the string may have moved on to a later line. */
michael@0:         *linenumber = lineCount;
michael@0:         break;
michael@0:     }
michael@0: 
michael@0:     return type;
michael@0: }
michael@0: 
michael@0: /* Copy a string token into the given UnicodeString.  Upon entry, we
michael@0:    have already read the first character of the string token, which is
michael@0:    not a whitespace character (but may be a QUOTE or ESCAPE). This
michael@0:    function reads all subsequent characters that belong with this
michael@0:    string, and copy them into the token parameter. The other
michael@0:    important, and slightly convoluted purpose of this function is to
michael@0:    merge adjacent strings.  It looks forward a bit, and if the next
michael@0:    non comment, non whitespace item is a string, it reads it in as
michael@0:    well.  If two adjacent strings are quoted, they are merged without
michael@0:    intervening space.  Otherwise a single SPACE character is
michael@0:    inserted. */
michael@0: static enum ETokenType getStringToken(UCHARBUF* buf,
michael@0:                                       UChar32 initialChar,
michael@0:                                       struct UString *token,
michael@0:                                       UErrorCode *status) {
michael@0:     UBool    lastStringWasQuoted;
michael@0:     UChar32  c;
michael@0:     UChar    target[3] = { '\0' };
michael@0:     UChar    *pTarget   = target;
michael@0:     int      len=0;
michael@0:     UBool    isFollowingCharEscaped=FALSE;
michael@0:     UBool    isNLUnescaped = FALSE;
michael@0:     UChar32  prevC=0;
michael@0: 
michael@0:     /* We are guaranteed on entry that initialChar is not a whitespace
michael@0:        character. If we are at the EOF, or have some other problem, it
michael@0:        doesn't matter; we still want to validly return the initialChar
michael@0:        (if nothing else) as a string token. */
michael@0: 
michael@0:     if (U_FAILURE(*status)) {
michael@0:         return TOK_ERROR;
michael@0:     }
michael@0: 
michael@0:     /* setup */
michael@0:     lastStringWasQuoted = FALSE;
michael@0:     c = initialChar;
michael@0:     ustr_setlen(token, 0, status);
michael@0: 
michael@0:     if (U_FAILURE(*status)) {
michael@0:         return TOK_ERROR;
michael@0:     }
michael@0: 
michael@0:     for (;;) {
michael@0:         if (c == QUOTE) {
michael@0:             if (!lastStringWasQuoted && token->fLength > 0) {
michael@0:                 ustr_ucat(token, SPACE, status);
michael@0: 
michael@0:                 if (U_FAILURE(*status)) {
michael@0:                     return TOK_ERROR;
michael@0:                 }
michael@0:             }
michael@0: 
michael@0:             lastStringWasQuoted = TRUE;
michael@0: 
michael@0:             for (;;) {
michael@0:                 c = ucbuf_getc(buf,status);
michael@0: 
michael@0:                 /* EOF reached */
michael@0:                 if (c == U_EOF) {
michael@0:                     return TOK_EOF;
michael@0:                 }
michael@0: 
michael@0:                 /* Unterminated quoted strings */
michael@0:                 if (U_FAILURE(*status)) {
michael@0:                     return TOK_ERROR;
michael@0:                 }
michael@0: 
michael@0:                 if (c == QUOTE && !isFollowingCharEscaped) {
michael@0:                     break;
michael@0:                 }
michael@0: 
michael@0:                 if (c == ESCAPE  && !isFollowingCharEscaped) {
michael@0:                     pTarget = target;
michael@0:                     c       = unescape(buf, status);
michael@0: 
michael@0:                     if (c == U_ERR) {
michael@0:                         return TOK_ERROR;
michael@0:                     }
michael@0:                     if(c == CR || c == LF){
michael@0:                         isNLUnescaped = TRUE;
michael@0:                     }
michael@0:                 }               
michael@0: 
michael@0:                 if(c==ESCAPE && !isFollowingCharEscaped){
michael@0:                     isFollowingCharEscaped = TRUE;
michael@0:                 }else{
michael@0:                     U_APPEND_CHAR32(c, pTarget,len);
michael@0:                     pTarget = target;
michael@0:                     ustr_uscat(token, pTarget,len, status);
michael@0:                     isFollowingCharEscaped = FALSE;
michael@0:                     len=0;
michael@0:                     if(c == CR || c == LF){
michael@0:                         if(isNLUnescaped == FALSE && prevC!=CR){
michael@0:                             lineCount++;
michael@0:                         }
michael@0:                         isNLUnescaped = FALSE;
michael@0:                     }
michael@0:                 }
michael@0:                 
michael@0:                 if (U_FAILURE(*status)) {
michael@0:                     return TOK_ERROR;
michael@0:                 }
michael@0:                 prevC = c;
michael@0:             }
michael@0:         } else {
michael@0:             if (token->fLength > 0) {
michael@0:                 ustr_ucat(token, SPACE, status);
michael@0: 
michael@0:                 if (U_FAILURE(*status)) {
michael@0:                     return TOK_ERROR;
michael@0:                 }
michael@0:             }
michael@0:             
michael@0:             if(lastStringWasQuoted){
michael@0:                 if(getShowWarning()){
michael@0:                     warning(lineCount, "Mixing quoted and unquoted strings");
michael@0:                 }
michael@0:                 if(isStrict()){
michael@0:                     return TOK_ERROR;
michael@0:                 }
michael@0: 
michael@0:             }
michael@0: 
michael@0:             lastStringWasQuoted = FALSE;
michael@0:             
michael@0:             /* if we reach here we are mixing 
michael@0:              * quoted and unquoted strings
michael@0:              * warn in normal mode and error in
michael@0:              * pedantic mode
michael@0:              */
michael@0: 
michael@0:             if (c == ESCAPE) {
michael@0:                 pTarget = target;
michael@0:                 c       = unescape(buf, status);
michael@0: 
michael@0:                 /* EOF reached */
michael@0:                 if (c == U_EOF) {
michael@0:                     return TOK_ERROR;
michael@0:                 }
michael@0:             }
michael@0: 
michael@0:             U_APPEND_CHAR32(c, pTarget,len);
michael@0:             pTarget = target;
michael@0:             ustr_uscat(token, pTarget,len, status);
michael@0:             len=0;
michael@0:             
michael@0:             if (U_FAILURE(*status)) {
michael@0:                 return TOK_ERROR;
michael@0:             }
michael@0: 
michael@0:             for (;;) {
michael@0:                 /* DON'T skip whitespace */
michael@0:                 c = getNextChar(buf, FALSE, NULL, status);
michael@0: 
michael@0:                 /* EOF reached */
michael@0:                 if (c == U_EOF) {
michael@0:                     ucbuf_ungetc(c, buf);
michael@0:                     return TOK_STRING;
michael@0:                 }
michael@0: 
michael@0:                 if (U_FAILURE(*status)) {
michael@0:                     return TOK_STRING;
michael@0:                 }
michael@0: 
michael@0:                 if (c == QUOTE
michael@0:                         || c == OPENBRACE
michael@0:                         || c == CLOSEBRACE
michael@0:                         || c == COMMA
michael@0:                         || c == COLON) {
michael@0:                     ucbuf_ungetc(c, buf);
michael@0:                     break;
michael@0:                 }
michael@0: 
michael@0:                 if (isWhitespace(c)) {
michael@0:                     break;
michael@0:                 }
michael@0: 
michael@0:                 if (c == ESCAPE) {
michael@0:                     pTarget = target;
michael@0:                     c       = unescape(buf, status);
michael@0: 
michael@0:                     if (c == U_ERR) {
michael@0:                         return TOK_ERROR;
michael@0:                     }
michael@0:                 }
michael@0: 
michael@0:                 U_APPEND_CHAR32(c, pTarget,len);
michael@0:                 pTarget = target;
michael@0:                 ustr_uscat(token, pTarget,len, status);
michael@0:                 len=0;
michael@0:                 if (U_FAILURE(*status)) {
michael@0:                     return TOK_ERROR;
michael@0:                 }
michael@0:             }
michael@0:         }
michael@0: 
michael@0:         /* DO skip whitespace */
michael@0:         c = getNextChar(buf, TRUE, NULL, status);
michael@0: 
michael@0:         if (U_FAILURE(*status)) {
michael@0:             return TOK_STRING;
michael@0:         }
michael@0: 
michael@0:         if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
michael@0:             ucbuf_ungetc(c, buf);
michael@0:             return TOK_STRING;
michael@0:         }
michael@0:     }
michael@0: }
michael@0: 
michael@0: /* Retrieve the next character that is not part of a comment.
michael@0:  *
michael@0:  * buf        input buffer to read from.
michael@0:  * skipwhite  if true, characters accepted by isWhitespace() are skipped
michael@0:  *            as well.
michael@0:  * token      receives the text of a comment opened with "/ * *"; may be
michael@0:  *            NULL.  The text of all other comments is discarded.
michael@0:  * status     in/out error code.
michael@0:  *
michael@0:  * A "//" comment is skipped with seekUntilNewline(), a "/ *" comment
michael@0:  * with seekUntilEndOfComment().  A SLASH that opens neither kind of
michael@0:  * comment is returned as is, with the character after it pushed back.
michael@0:  *
michael@0:  * Returns U_EOF when status holds a failure on entry, at the end of the
michael@0:  * input, and when the input ends directly after a SLASH.
michael@0:  */
michael@0: static UChar32 getNextChar(UCHARBUF* buf,
michael@0:                            UBool skipwhite,
michael@0:                            struct UString *token,
michael@0:                            UErrorCode *status) {
michael@0:     UChar32 c, c2, c3;
michael@0: 
michael@0:     if (U_FAILURE(*status)) {
michael@0:         return U_EOF;
michael@0:     }
michael@0: 
michael@0:     for (;;) {
michael@0:         c = ucbuf_getc(buf,status);
michael@0: 
michael@0:         if (c == U_EOF) {
michael@0:             return U_EOF;
michael@0:         }
michael@0: 
michael@0:         if (skipwhite && isWhitespace(c)) {
michael@0:             continue;
michael@0:         }
michael@0: 
michael@0:         /* This also handles the get() failing case */
michael@0:         if (c != SLASH) {
michael@0:             return c;
michael@0:         }
michael@0: 
michael@0:         c = ucbuf_getc(buf,status); /* "/c" */
michael@0: 
michael@0:         if (c == U_EOF) {
michael@0:             return U_EOF;
michael@0:         }
michael@0: 
michael@0:         switch (c) {
michael@0:         case SLASH:  /* "//" */
michael@0:             seekUntilNewline(buf, NULL, status);
michael@0:             break;
michael@0: 
michael@0:         case ASTERISK:  /* " / * " */
michael@0:             c2 = ucbuf_getc(buf, status); /* "/ * c" */
michael@0:             if(c2 == ASTERISK){  /* "/ * *" */
michael@0:                 /* The second asterisk may already belong to the
michael@0:                    terminator of an empty comment, "/ * * /".  Look one
michael@0:                    character further before treating it as an opener,
michael@0:                    otherwise the closing SLASH is missed and everything
michael@0:                    up to the next terminator is swallowed. */
michael@0:                 c3 = ucbuf_getc(buf, status); /* "/ * * c" */
michael@0:                 if(c3 != SLASH){
michael@0:                     ucbuf_ungetc(c3, buf);
michael@0:                     /* parse multi-line comment and store it in token*/
michael@0:                     seekUntilEndOfComment(buf, token, status);
michael@0:                 }
michael@0:             } else {
michael@0:                 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *".  Include c2  back in buffer.  */
michael@0:                 seekUntilEndOfComment(buf, NULL, status);
michael@0:             }
michael@0:             break;
michael@0: 
michael@0:         default:
michael@0:             ucbuf_ungetc(c, buf); /* "/c" - put back the c */
michael@0:             /* If get() failed this is a NOP */
michael@0:             return SLASH;
michael@0:         }
michael@0: 
michael@0:     }
michael@0: }
michael@0: 
michael@0: /* Consume characters up to and including the next character accepted by
michael@0:  * isNewline(), or up to the end of the input.
michael@0:  *
michael@0:  * buf     input buffer to read from.
michael@0:  * token   if not NULL, every character consumed (the line break
michael@0:  *         included) is added to it with ustr_u32cat().
michael@0:  * status  in/out error code; nothing is read when it holds a failure on
michael@0:  *         entry, and reading stops as soon as it reports one.
michael@0:  *
michael@0:  * isNewline() advances lineCount as a side effect.
michael@0:  */
michael@0: static void seekUntilNewline(UCHARBUF* buf,
michael@0:                              struct UString *token,
michael@0:                              UErrorCode *status) {
michael@0:     UChar32 c;
michael@0: 
michael@0:     if (U_FAILURE(*status)) {
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     do {
michael@0:         c = ucbuf_getc(buf,status);
michael@0:         /* add the char to token; U_EOF is a sentinel, not a character */
michael@0:         if(token!=NULL && c != U_EOF){
michael@0:             ustr_u32cat(token, c, status);
michael@0:         }
michael@0:         /* Only a failure stops the scan.  A warning left in status by
michael@0:            earlier work must not cut the skipped line short. */
michael@0:     } while (!isNewline(c) && c != U_EOF && !U_FAILURE(*status));
michael@0: }
michael@0: 
michael@0: static void seekUntilEndOfComment(UCHARBUF *buf,
michael@0:                                   struct UString *token,
michael@0:                                   UErrorCode *status) {
michael@0:     UChar32  c, d;
michael@0:     uint32_t line;
michael@0: 
michael@0:     if (U_FAILURE(*status)) {
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     line = lineCount;
michael@0: 
michael@0:     do {
michael@0:         c = ucbuf_getc(buf, status);
michael@0: 
michael@0:         if (c == ASTERISK) {
michael@0:             d = ucbuf_getc(buf, status);
michael@0: 
michael@0:             if (d != SLASH) {
michael@0:                 ucbuf_ungetc(d, buf);
michael@0:             } else {
michael@0:                 break;
michael@0:             }
michael@0:         }
michael@0:         /* add the char to token */
michael@0:         if(token!=NULL){
michael@0:             ustr_u32cat(token, c, status);
michael@0:         }
michael@0:         /* increment the lineCount */
michael@0:         isNewline(c);
michael@0: 
michael@0:     } while (c != U_EOF && *status == U_ZERO_ERROR);
michael@0: 
michael@0:     if (c == U_EOF) {
michael@0:         *status = U_INVALID_FORMAT_ERROR;
michael@0:         error(line, "unterminated comment detected");
michael@0:     }
michael@0: }
michael@0: 
michael@0: /* Decode the escape sequence at the current position of buf.
michael@0:  *
michael@0:  * Callers invoke this after they have read the ESCAPE character.  The
michael@0:  * decoding is done by ucbuf_getcx32(), which is called with the read
michael@0:  * position on the ESCAPE, so the ESCAPE is pushed back first.
michael@0:  *
michael@0:  * Returns U_EOF without touching buf when status holds a failure on
michael@0:  * entry, otherwise whatever ucbuf_getcx32() returns.
michael@0:  */
michael@0: U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
michael@0:     UChar32 decoded = U_EOF;
michael@0: 
michael@0:     if (!U_FAILURE(*status)) {
michael@0:         /* hand the ESCAPE back so that the decoder sees it */
michael@0:         ucbuf_ungetc(ESCAPE, buf);
michael@0:         decoded = ucbuf_getcx32(buf, status);
michael@0:     }
michael@0: 
michael@0:     return decoded;
michael@0: }
michael@0: 
michael@0: /* Tell whether c separates tokens: '\n', 0x2029, '\r', ' ', '\t' or
michael@0:  * 0xFEFF.
michael@0:  *
michael@0:  * Side effect: '\n' and 0x2029 each advance lineCount by one; '\r' does
michael@0:  * not, so a "\r\n" pair is counted once.
michael@0:  */
michael@0: static UBool isWhitespace(UChar32 c) {
michael@0:     /* separators that also start a new line */
michael@0:     if (c == 0x000A || c == 0x2029) {
michael@0:         lineCount++;
michael@0:         return TRUE;
michael@0:     }
michael@0: 
michael@0:     /* separators that stay on the current line */
michael@0:     if (c == 0x000D || c == 0x0020 || c == 0x0009 || c == 0xFEFF) {
michael@0:         return TRUE;
michael@0:     }
michael@0: 
michael@0:     return FALSE;
michael@0: }
michael@0: 
michael@0: /* Tell whether c is a line break: '\n', 0x2029 or '\r'.
michael@0:  *
michael@0:  * Side effect: '\n' and 0x2029 each advance lineCount by one; '\r' is
michael@0:  * reported as a line break without being counted.
michael@0:  */
michael@0: static UBool isNewline(UChar32 c) {
michael@0:     if (c == 0x000A || c == 0x2029) {
michael@0:         lineCount++;
michael@0:         return TRUE;
michael@0:     }
michael@0: 
michael@0:     if (c == 0x000D) {
michael@0:         return TRUE;
michael@0:     }
michael@0: 
michael@0:     return FALSE;
michael@0: }