michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 1998-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * michael@0: * File read.c michael@0: * michael@0: * Modification History: michael@0: * michael@0: * Date Name Description michael@0: * 05/26/99 stephen Creation. michael@0: * 5/10/01 Ram removed ustdio dependency michael@0: ******************************************************************************* michael@0: */ michael@0: michael@0: #include "read.h" michael@0: #include "errmsg.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/utf16.h" michael@0: michael@0: #define OPENBRACE 0x007B michael@0: #define CLOSEBRACE 0x007D michael@0: #define COMMA 0x002C michael@0: #define QUOTE 0x0022 michael@0: #define ESCAPE 0x005C michael@0: #define SLASH 0x002F michael@0: #define ASTERISK 0x002A michael@0: #define SPACE 0x0020 michael@0: #define COLON 0x003A michael@0: #define BADBOM 0xFFFE michael@0: #define CR 0x000D michael@0: #define LF 0x000A michael@0: michael@0: static int32_t lineCount; michael@0: michael@0: /* Protos */ michael@0: static enum ETokenType getStringToken(UCHARBUF *buf, michael@0: UChar32 initialChar, michael@0: struct UString *token, michael@0: UErrorCode *status); michael@0: michael@0: static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status); michael@0: static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status); michael@0: static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status); michael@0: static UBool isWhitespace (UChar32 c); michael@0: static UBool isNewline (UChar32 c); michael@0: michael@0: U_CFUNC void resetLineNumber() { michael@0: lineCount = 1; michael@0: } michael@0: michael@0: /* Read and return the next token from the stream. If the token is of michael@0: type eString, fill in the token parameter with the token. If the michael@0: token is eError, then the status parameter will contain the michael@0: specific error. This will be eItemNotFound at the end of file, michael@0: indicating that all tokens have been returned. This method will michael@0: never return eString twice in a row; instead, multiple adjacent michael@0: string tokens will be merged into one, with no intervening michael@0: space. */ michael@0: U_CFUNC enum ETokenType michael@0: getNextToken(UCHARBUF* buf, michael@0: struct UString *token, michael@0: uint32_t *linenumber, /* out: linenumber of token */ michael@0: struct UString *comment, michael@0: UErrorCode *status) { michael@0: enum ETokenType result; michael@0: UChar32 c; michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return TOK_ERROR; michael@0: } michael@0: michael@0: /* Skip whitespace */ michael@0: c = getNextChar(buf, TRUE, comment, status); michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return TOK_ERROR; michael@0: } michael@0: michael@0: *linenumber = lineCount; michael@0: michael@0: switch(c) { michael@0: case BADBOM: michael@0: return TOK_ERROR; michael@0: case OPENBRACE: michael@0: return TOK_OPEN_BRACE; michael@0: case CLOSEBRACE: michael@0: return TOK_CLOSE_BRACE; michael@0: case COMMA: michael@0: return TOK_COMMA; michael@0: case U_EOF: michael@0: return TOK_EOF; michael@0: case COLON: michael@0: return TOK_COLON; michael@0: michael@0: default: michael@0: result = getStringToken(buf, c, token, status); michael@0: } michael@0: michael@0: *linenumber = lineCount; michael@0: return result; michael@0: } michael@0: michael@0: /* Copy a string token into the given UnicodeString. Upon entry, we michael@0: have already read the first character of the string token, which is michael@0: not a whitespace character (but may be a QUOTE or ESCAPE). This michael@0: function reads all subsequent characters that belong with this michael@0: string, and copy them into the token parameter. The other michael@0: important, and slightly convoluted purpose of this function is to michael@0: merge adjacent strings. It looks forward a bit, and if the next michael@0: non comment, non whitespace item is a string, it reads it in as michael@0: well. If two adjacent strings are quoted, they are merged without michael@0: intervening space. Otherwise a single SPACE character is michael@0: inserted. */ michael@0: static enum ETokenType getStringToken(UCHARBUF* buf, michael@0: UChar32 initialChar, michael@0: struct UString *token, michael@0: UErrorCode *status) { michael@0: UBool lastStringWasQuoted; michael@0: UChar32 c; michael@0: UChar target[3] = { '\0' }; michael@0: UChar *pTarget = target; michael@0: int len=0; michael@0: UBool isFollowingCharEscaped=FALSE; michael@0: UBool isNLUnescaped = FALSE; michael@0: UChar32 prevC=0; michael@0: michael@0: /* We are guaranteed on entry that initialChar is not a whitespace michael@0: character. If we are at the EOF, or have some other problem, it michael@0: doesn't matter; we still want to validly return the initialChar michael@0: (if nothing else) as a string token. */ michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return TOK_ERROR; michael@0: } michael@0: michael@0: /* setup */ michael@0: lastStringWasQuoted = FALSE; michael@0: c = initialChar; michael@0: ustr_setlen(token, 0, status); michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return TOK_ERROR; michael@0: } michael@0: michael@0: for (;;) { michael@0: if (c == QUOTE) { michael@0: if (!lastStringWasQuoted && token->fLength > 0) { michael@0: ustr_ucat(token, SPACE, status); michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return TOK_ERROR; michael@0: } michael@0: } michael@0: michael@0: lastStringWasQuoted = TRUE; michael@0: michael@0: for (;;) { michael@0: c = ucbuf_getc(buf,status); michael@0: michael@0: /* EOF reached */ michael@0: if (c == U_EOF) { michael@0: return TOK_EOF; michael@0: } michael@0: michael@0: /* Unterminated quoted strings */ michael@0: if (U_FAILURE(*status)) { michael@0: return TOK_ERROR; michael@0: } michael@0: michael@0: if (c == QUOTE && !isFollowingCharEscaped) { michael@0: break; michael@0: } michael@0: michael@0: if (c == ESCAPE && !isFollowingCharEscaped) { michael@0: pTarget = target; michael@0: c = unescape(buf, status); michael@0: michael@0: if (c == U_ERR) { michael@0: return TOK_ERROR; michael@0: } michael@0: if(c == CR || c == LF){ michael@0: isNLUnescaped = TRUE; michael@0: } michael@0: } michael@0: michael@0: if(c==ESCAPE && !isFollowingCharEscaped){ michael@0: isFollowingCharEscaped = TRUE; michael@0: }else{ michael@0: U_APPEND_CHAR32(c, pTarget,len); michael@0: pTarget = target; michael@0: ustr_uscat(token, pTarget,len, status); michael@0: isFollowingCharEscaped = FALSE; michael@0: len=0; michael@0: if(c == CR || c == LF){ michael@0: if(isNLUnescaped == FALSE && prevC!=CR){ michael@0: lineCount++; michael@0: } michael@0: isNLUnescaped = FALSE; michael@0: } michael@0: } michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return TOK_ERROR; michael@0: } michael@0: prevC = c; michael@0: } michael@0: } else { michael@0: if (token->fLength > 0) { michael@0: ustr_ucat(token, SPACE, status); michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return TOK_ERROR; michael@0: } michael@0: } michael@0: michael@0: if(lastStringWasQuoted){ michael@0: if(getShowWarning()){ michael@0: warning(lineCount, "Mixing quoted and unquoted strings"); michael@0: } michael@0: if(isStrict()){ michael@0: return TOK_ERROR; michael@0: } michael@0: michael@0: } michael@0: michael@0: lastStringWasQuoted = FALSE; michael@0: michael@0: /* if we reach here we are mixing michael@0: * quoted and unquoted strings michael@0: * warn in normal mode and error in michael@0: * pedantic mode michael@0: */ michael@0: michael@0: if (c == ESCAPE) { michael@0: pTarget = target; michael@0: c = unescape(buf, status); michael@0: michael@0: /* EOF reached */ michael@0: if (c == U_EOF) { michael@0: return TOK_ERROR; michael@0: } michael@0: } michael@0: michael@0: U_APPEND_CHAR32(c, pTarget,len); michael@0: pTarget = target; michael@0: ustr_uscat(token, pTarget,len, status); michael@0: len=0; michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return TOK_ERROR; michael@0: } michael@0: michael@0: for (;;) { michael@0: /* DON'T skip whitespace */ michael@0: c = getNextChar(buf, FALSE, NULL, status); michael@0: michael@0: /* EOF reached */ michael@0: if (c == U_EOF) { michael@0: ucbuf_ungetc(c, buf); michael@0: return TOK_STRING; michael@0: } michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return TOK_STRING; michael@0: } michael@0: michael@0: if (c == QUOTE michael@0: || c == OPENBRACE michael@0: || c == CLOSEBRACE michael@0: || c == COMMA michael@0: || c == COLON) { michael@0: ucbuf_ungetc(c, buf); michael@0: break; michael@0: } michael@0: michael@0: if (isWhitespace(c)) { michael@0: break; michael@0: } michael@0: michael@0: if (c == ESCAPE) { michael@0: pTarget = target; michael@0: c = unescape(buf, status); michael@0: michael@0: if (c == U_ERR) { michael@0: return TOK_ERROR; michael@0: } michael@0: } michael@0: michael@0: U_APPEND_CHAR32(c, pTarget,len); michael@0: pTarget = target; michael@0: ustr_uscat(token, pTarget,len, status); michael@0: len=0; michael@0: if (U_FAILURE(*status)) { michael@0: return TOK_ERROR; michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* DO skip whitespace */ michael@0: c = getNextChar(buf, TRUE, NULL, status); michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return TOK_STRING; michael@0: } michael@0: michael@0: if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) { michael@0: ucbuf_ungetc(c, buf); michael@0: return TOK_STRING; michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* Retrieve the next character. If skipwhite is michael@0: true, whitespace is skipped as well. */ michael@0: static UChar32 getNextChar(UCHARBUF* buf, michael@0: UBool skipwhite, michael@0: struct UString *token, michael@0: UErrorCode *status) { michael@0: UChar32 c, c2; michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return U_EOF; michael@0: } michael@0: michael@0: for (;;) { michael@0: c = ucbuf_getc(buf,status); michael@0: michael@0: if (c == U_EOF) { michael@0: return U_EOF; michael@0: } michael@0: michael@0: if (skipwhite && isWhitespace(c)) { michael@0: continue; michael@0: } michael@0: michael@0: /* This also handles the get() failing case */ michael@0: if (c != SLASH) { michael@0: return c; michael@0: } michael@0: michael@0: c = ucbuf_getc(buf,status); /* "/c" */ michael@0: michael@0: if (c == U_EOF) { michael@0: return U_EOF; michael@0: } michael@0: michael@0: switch (c) { michael@0: case SLASH: /* "//" */ michael@0: seekUntilNewline(buf, NULL, status); michael@0: break; michael@0: michael@0: case ASTERISK: /* " / * " */ michael@0: c2 = ucbuf_getc(buf, status); /* "/ * c" */ michael@0: if(c2 == ASTERISK){ /* "/ * *" */ michael@0: /* parse multi-line comment and store it in token*/ michael@0: seekUntilEndOfComment(buf, token, status); michael@0: } else { michael@0: ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */ michael@0: seekUntilEndOfComment(buf, NULL, status); michael@0: } michael@0: break; michael@0: michael@0: default: michael@0: ucbuf_ungetc(c, buf); /* "/c" - put back the c */ michael@0: /* If get() failed this is a NOP */ michael@0: return SLASH; michael@0: } michael@0: michael@0: } michael@0: } michael@0: michael@0: static void seekUntilNewline(UCHARBUF* buf, michael@0: struct UString *token, michael@0: UErrorCode *status) { michael@0: UChar32 c; michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return; michael@0: } michael@0: michael@0: do { michael@0: c = ucbuf_getc(buf,status); michael@0: /* add the char to token */ michael@0: if(token!=NULL){ michael@0: ustr_u32cat(token, c, status); michael@0: } michael@0: } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR); michael@0: } michael@0: michael@0: static void seekUntilEndOfComment(UCHARBUF *buf, michael@0: struct UString *token, michael@0: UErrorCode *status) { michael@0: UChar32 c, d; michael@0: uint32_t line; michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return; michael@0: } michael@0: michael@0: line = lineCount; michael@0: michael@0: do { michael@0: c = ucbuf_getc(buf, status); michael@0: michael@0: if (c == ASTERISK) { michael@0: d = ucbuf_getc(buf, status); michael@0: michael@0: if (d != SLASH) { michael@0: ucbuf_ungetc(d, buf); michael@0: } else { michael@0: break; michael@0: } michael@0: } michael@0: /* add the char to token */ michael@0: if(token!=NULL){ michael@0: ustr_u32cat(token, c, status); michael@0: } michael@0: /* increment the lineCount */ michael@0: isNewline(c); michael@0: michael@0: } while (c != U_EOF && *status == U_ZERO_ERROR); michael@0: michael@0: if (c == U_EOF) { michael@0: *status = U_INVALID_FORMAT_ERROR; michael@0: error(line, "unterminated comment detected"); michael@0: } michael@0: } michael@0: michael@0: U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) { michael@0: if (U_FAILURE(*status)) { michael@0: return U_EOF; michael@0: } michael@0: michael@0: /* We expect to be called after the ESCAPE has been seen, but michael@0: * u_fgetcx needs an ESCAPE to do its magic. */ michael@0: ucbuf_ungetc(ESCAPE, buf); michael@0: michael@0: return ucbuf_getcx32(buf, status); michael@0: } michael@0: michael@0: static UBool isWhitespace(UChar32 c) { michael@0: switch (c) { michael@0: /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */ michael@0: case 0x000A: michael@0: case 0x2029: michael@0: lineCount++; michael@0: case 0x000D: michael@0: case 0x0020: michael@0: case 0x0009: michael@0: case 0xFEFF: michael@0: return TRUE; michael@0: michael@0: default: michael@0: return FALSE; michael@0: } michael@0: } michael@0: michael@0: static UBool isNewline(UChar32 c) { michael@0: switch (c) { michael@0: /* '\n', '\r', 0x2029 */ michael@0: case 0x000A: michael@0: case 0x2029: michael@0: lineCount++; michael@0: case 0x000D: michael@0: return TRUE; michael@0: michael@0: default: michael@0: return FALSE; michael@0: } michael@0: }