1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/tools/genrb/read.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,474 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 1998-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* 1.12 +* File read.c 1.13 +* 1.14 +* Modification History: 1.15 +* 1.16 +* Date Name Description 1.17 +* 05/26/99 stephen Creation. 1.18 +* 5/10/01 Ram removed ustdio dependency 1.19 +******************************************************************************* 1.20 +*/ 1.21 + 1.22 +#include "read.h" 1.23 +#include "errmsg.h" 1.24 +#include "unicode/ustring.h" 1.25 +#include "unicode/utf16.h" 1.26 + 1.27 +#define OPENBRACE 0x007B 1.28 +#define CLOSEBRACE 0x007D 1.29 +#define COMMA 0x002C 1.30 +#define QUOTE 0x0022 1.31 +#define ESCAPE 0x005C 1.32 +#define SLASH 0x002F 1.33 +#define ASTERISK 0x002A 1.34 +#define SPACE 0x0020 1.35 +#define COLON 0x003A 1.36 +#define BADBOM 0xFFFE 1.37 +#define CR 0x000D 1.38 +#define LF 0x000A 1.39 + 1.40 +static int32_t lineCount; 1.41 + 1.42 +/* Protos */ 1.43 +static enum ETokenType getStringToken(UCHARBUF *buf, 1.44 + UChar32 initialChar, 1.45 + struct UString *token, 1.46 + UErrorCode *status); 1.47 + 1.48 +static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status); 1.49 +static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status); 1.50 +static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status); 1.51 +static UBool isWhitespace (UChar32 c); 1.52 +static UBool isNewline (UChar32 c); 1.53 + 1.54 +U_CFUNC void resetLineNumber() { 1.55 + lineCount = 1; 1.56 +} 1.57 + 1.58 +/* Read and return the next token from the stream. If the token is of 1.59 + type eString, fill in the token parameter with the token. If the 1.60 + token is eError, then the status parameter will contain the 1.61 + specific error. This will be eItemNotFound at the end of file, 1.62 + indicating that all tokens have been returned. This method will 1.63 + never return eString twice in a row; instead, multiple adjacent 1.64 + string tokens will be merged into one, with no intervening 1.65 + space. */ 1.66 +U_CFUNC enum ETokenType 1.67 +getNextToken(UCHARBUF* buf, 1.68 + struct UString *token, 1.69 + uint32_t *linenumber, /* out: linenumber of token */ 1.70 + struct UString *comment, 1.71 + UErrorCode *status) { 1.72 + enum ETokenType result; 1.73 + UChar32 c; 1.74 + 1.75 + if (U_FAILURE(*status)) { 1.76 + return TOK_ERROR; 1.77 + } 1.78 + 1.79 + /* Skip whitespace */ 1.80 + c = getNextChar(buf, TRUE, comment, status); 1.81 + 1.82 + if (U_FAILURE(*status)) { 1.83 + return TOK_ERROR; 1.84 + } 1.85 + 1.86 + *linenumber = lineCount; 1.87 + 1.88 + switch(c) { 1.89 + case BADBOM: 1.90 + return TOK_ERROR; 1.91 + case OPENBRACE: 1.92 + return TOK_OPEN_BRACE; 1.93 + case CLOSEBRACE: 1.94 + return TOK_CLOSE_BRACE; 1.95 + case COMMA: 1.96 + return TOK_COMMA; 1.97 + case U_EOF: 1.98 + return TOK_EOF; 1.99 + case COLON: 1.100 + return TOK_COLON; 1.101 + 1.102 + default: 1.103 + result = getStringToken(buf, c, token, status); 1.104 + } 1.105 + 1.106 + *linenumber = lineCount; 1.107 + return result; 1.108 +} 1.109 + 1.110 +/* Copy a string token into the given UnicodeString. Upon entry, we 1.111 + have already read the first character of the string token, which is 1.112 + not a whitespace character (but may be a QUOTE or ESCAPE). This 1.113 + function reads all subsequent characters that belong with this 1.114 + string, and copy them into the token parameter. The other 1.115 + important, and slightly convoluted purpose of this function is to 1.116 + merge adjacent strings. It looks forward a bit, and if the next 1.117 + non comment, non whitespace item is a string, it reads it in as 1.118 + well. If two adjacent strings are quoted, they are merged without 1.119 + intervening space. Otherwise a single SPACE character is 1.120 + inserted. */ 1.121 +static enum ETokenType getStringToken(UCHARBUF* buf, 1.122 + UChar32 initialChar, 1.123 + struct UString *token, 1.124 + UErrorCode *status) { 1.125 + UBool lastStringWasQuoted; 1.126 + UChar32 c; 1.127 + UChar target[3] = { '\0' }; 1.128 + UChar *pTarget = target; 1.129 + int len=0; 1.130 + UBool isFollowingCharEscaped=FALSE; 1.131 + UBool isNLUnescaped = FALSE; 1.132 + UChar32 prevC=0; 1.133 + 1.134 + /* We are guaranteed on entry that initialChar is not a whitespace 1.135 + character. If we are at the EOF, or have some other problem, it 1.136 + doesn't matter; we still want to validly return the initialChar 1.137 + (if nothing else) as a string token. */ 1.138 + 1.139 + if (U_FAILURE(*status)) { 1.140 + return TOK_ERROR; 1.141 + } 1.142 + 1.143 + /* setup */ 1.144 + lastStringWasQuoted = FALSE; 1.145 + c = initialChar; 1.146 + ustr_setlen(token, 0, status); 1.147 + 1.148 + if (U_FAILURE(*status)) { 1.149 + return TOK_ERROR; 1.150 + } 1.151 + 1.152 + for (;;) { 1.153 + if (c == QUOTE) { 1.154 + if (!lastStringWasQuoted && token->fLength > 0) { 1.155 + ustr_ucat(token, SPACE, status); 1.156 + 1.157 + if (U_FAILURE(*status)) { 1.158 + return TOK_ERROR; 1.159 + } 1.160 + } 1.161 + 1.162 + lastStringWasQuoted = TRUE; 1.163 + 1.164 + for (;;) { 1.165 + c = ucbuf_getc(buf,status); 1.166 + 1.167 + /* EOF reached */ 1.168 + if (c == U_EOF) { 1.169 + return TOK_EOF; 1.170 + } 1.171 + 1.172 + /* Unterminated quoted strings */ 1.173 + if (U_FAILURE(*status)) { 1.174 + return TOK_ERROR; 1.175 + } 1.176 + 1.177 + if (c == QUOTE && !isFollowingCharEscaped) { 1.178 + break; 1.179 + } 1.180 + 1.181 + if (c == ESCAPE && !isFollowingCharEscaped) { 1.182 + pTarget = target; 1.183 + c = unescape(buf, status); 1.184 + 1.185 + if (c == U_ERR) { 1.186 + return TOK_ERROR; 1.187 + } 1.188 + if(c == CR || c == LF){ 1.189 + isNLUnescaped = TRUE; 1.190 + } 1.191 + } 1.192 + 1.193 + if(c==ESCAPE && !isFollowingCharEscaped){ 1.194 + isFollowingCharEscaped = TRUE; 1.195 + }else{ 1.196 + U_APPEND_CHAR32(c, pTarget,len); 1.197 + pTarget = target; 1.198 + ustr_uscat(token, pTarget,len, status); 1.199 + isFollowingCharEscaped = FALSE; 1.200 + len=0; 1.201 + if(c == CR || c == LF){ 1.202 + if(isNLUnescaped == FALSE && prevC!=CR){ 1.203 + lineCount++; 1.204 + } 1.205 + isNLUnescaped = FALSE; 1.206 + } 1.207 + } 1.208 + 1.209 + if (U_FAILURE(*status)) { 1.210 + return TOK_ERROR; 1.211 + } 1.212 + prevC = c; 1.213 + } 1.214 + } else { 1.215 + if (token->fLength > 0) { 1.216 + ustr_ucat(token, SPACE, status); 1.217 + 1.218 + if (U_FAILURE(*status)) { 1.219 + return TOK_ERROR; 1.220 + } 1.221 + } 1.222 + 1.223 + if(lastStringWasQuoted){ 1.224 + if(getShowWarning()){ 1.225 + warning(lineCount, "Mixing quoted and unquoted strings"); 1.226 + } 1.227 + if(isStrict()){ 1.228 + return TOK_ERROR; 1.229 + } 1.230 + 1.231 + } 1.232 + 1.233 + lastStringWasQuoted = FALSE; 1.234 + 1.235 + /* if we reach here we are mixing 1.236 + * quoted and unquoted strings 1.237 + * warn in normal mode and error in 1.238 + * pedantic mode 1.239 + */ 1.240 + 1.241 + if (c == ESCAPE) { 1.242 + pTarget = target; 1.243 + c = unescape(buf, status); 1.244 + 1.245 + /* EOF reached */ 1.246 + if (c == U_EOF) { 1.247 + return TOK_ERROR; 1.248 + } 1.249 + } 1.250 + 1.251 + U_APPEND_CHAR32(c, pTarget,len); 1.252 + pTarget = target; 1.253 + ustr_uscat(token, pTarget,len, status); 1.254 + len=0; 1.255 + 1.256 + if (U_FAILURE(*status)) { 1.257 + return TOK_ERROR; 1.258 + } 1.259 + 1.260 + for (;;) { 1.261 + /* DON'T skip whitespace */ 1.262 + c = getNextChar(buf, FALSE, NULL, status); 1.263 + 1.264 + /* EOF reached */ 1.265 + if (c == U_EOF) { 1.266 + ucbuf_ungetc(c, buf); 1.267 + return TOK_STRING; 1.268 + } 1.269 + 1.270 + if (U_FAILURE(*status)) { 1.271 + return TOK_STRING; 1.272 + } 1.273 + 1.274 + if (c == QUOTE 1.275 + || c == OPENBRACE 1.276 + || c == CLOSEBRACE 1.277 + || c == COMMA 1.278 + || c == COLON) { 1.279 + ucbuf_ungetc(c, buf); 1.280 + break; 1.281 + } 1.282 + 1.283 + if (isWhitespace(c)) { 1.284 + break; 1.285 + } 1.286 + 1.287 + if (c == ESCAPE) { 1.288 + pTarget = target; 1.289 + c = unescape(buf, status); 1.290 + 1.291 + if (c == U_ERR) { 1.292 + return TOK_ERROR; 1.293 + } 1.294 + } 1.295 + 1.296 + U_APPEND_CHAR32(c, pTarget,len); 1.297 + pTarget = target; 1.298 + ustr_uscat(token, pTarget,len, status); 1.299 + len=0; 1.300 + if (U_FAILURE(*status)) { 1.301 + return TOK_ERROR; 1.302 + } 1.303 + } 1.304 + } 1.305 + 1.306 + /* DO skip whitespace */ 1.307 + c = getNextChar(buf, TRUE, NULL, status); 1.308 + 1.309 + if (U_FAILURE(*status)) { 1.310 + return TOK_STRING; 1.311 + } 1.312 + 1.313 + if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) { 1.314 + ucbuf_ungetc(c, buf); 1.315 + return TOK_STRING; 1.316 + } 1.317 + } 1.318 +} 1.319 + 1.320 +/* Retrieve the next character. If skipwhite is 1.321 + true, whitespace is skipped as well. */ 1.322 +static UChar32 getNextChar(UCHARBUF* buf, 1.323 + UBool skipwhite, 1.324 + struct UString *token, 1.325 + UErrorCode *status) { 1.326 + UChar32 c, c2; 1.327 + 1.328 + if (U_FAILURE(*status)) { 1.329 + return U_EOF; 1.330 + } 1.331 + 1.332 + for (;;) { 1.333 + c = ucbuf_getc(buf,status); 1.334 + 1.335 + if (c == U_EOF) { 1.336 + return U_EOF; 1.337 + } 1.338 + 1.339 + if (skipwhite && isWhitespace(c)) { 1.340 + continue; 1.341 + } 1.342 + 1.343 + /* This also handles the get() failing case */ 1.344 + if (c != SLASH) { 1.345 + return c; 1.346 + } 1.347 + 1.348 + c = ucbuf_getc(buf,status); /* "/c" */ 1.349 + 1.350 + if (c == U_EOF) { 1.351 + return U_EOF; 1.352 + } 1.353 + 1.354 + switch (c) { 1.355 + case SLASH: /* "//" */ 1.356 + seekUntilNewline(buf, NULL, status); 1.357 + break; 1.358 + 1.359 + case ASTERISK: /* " / * " */ 1.360 + c2 = ucbuf_getc(buf, status); /* "/ * c" */ 1.361 + if(c2 == ASTERISK){ /* "/ * *" */ 1.362 + /* parse multi-line comment and store it in token*/ 1.363 + seekUntilEndOfComment(buf, token, status); 1.364 + } else { 1.365 + ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */ 1.366 + seekUntilEndOfComment(buf, NULL, status); 1.367 + } 1.368 + break; 1.369 + 1.370 + default: 1.371 + ucbuf_ungetc(c, buf); /* "/c" - put back the c */ 1.372 + /* If get() failed this is a NOP */ 1.373 + return SLASH; 1.374 + } 1.375 + 1.376 + } 1.377 +} 1.378 + 1.379 +static void seekUntilNewline(UCHARBUF* buf, 1.380 + struct UString *token, 1.381 + UErrorCode *status) { 1.382 + UChar32 c; 1.383 + 1.384 + if (U_FAILURE(*status)) { 1.385 + return; 1.386 + } 1.387 + 1.388 + do { 1.389 + c = ucbuf_getc(buf,status); 1.390 + /* add the char to token */ 1.391 + if(token!=NULL){ 1.392 + ustr_u32cat(token, c, status); 1.393 + } 1.394 + } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR); 1.395 +} 1.396 + 1.397 +static void seekUntilEndOfComment(UCHARBUF *buf, 1.398 + struct UString *token, 1.399 + UErrorCode *status) { 1.400 + UChar32 c, d; 1.401 + uint32_t line; 1.402 + 1.403 + if (U_FAILURE(*status)) { 1.404 + return; 1.405 + } 1.406 + 1.407 + line = lineCount; 1.408 + 1.409 + do { 1.410 + c = ucbuf_getc(buf, status); 1.411 + 1.412 + if (c == ASTERISK) { 1.413 + d = ucbuf_getc(buf, status); 1.414 + 1.415 + if (d != SLASH) { 1.416 + ucbuf_ungetc(d, buf); 1.417 + } else { 1.418 + break; 1.419 + } 1.420 + } 1.421 + /* add the char to token */ 1.422 + if(token!=NULL){ 1.423 + ustr_u32cat(token, c, status); 1.424 + } 1.425 + /* increment the lineCount */ 1.426 + isNewline(c); 1.427 + 1.428 + } while (c != U_EOF && *status == U_ZERO_ERROR); 1.429 + 1.430 + if (c == U_EOF) { 1.431 + *status = U_INVALID_FORMAT_ERROR; 1.432 + error(line, "unterminated comment detected"); 1.433 + } 1.434 +} 1.435 + 1.436 +U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) { 1.437 + if (U_FAILURE(*status)) { 1.438 + return U_EOF; 1.439 + } 1.440 + 1.441 + /* We expect to be called after the ESCAPE has been seen, but 1.442 + * u_fgetcx needs an ESCAPE to do its magic. */ 1.443 + ucbuf_ungetc(ESCAPE, buf); 1.444 + 1.445 + return ucbuf_getcx32(buf, status); 1.446 +} 1.447 + 1.448 +static UBool isWhitespace(UChar32 c) { 1.449 + switch (c) { 1.450 + /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */ 1.451 + case 0x000A: 1.452 + case 0x2029: 1.453 + lineCount++; 1.454 + case 0x000D: 1.455 + case 0x0020: 1.456 + case 0x0009: 1.457 + case 0xFEFF: 1.458 + return TRUE; 1.459 + 1.460 + default: 1.461 + return FALSE; 1.462 + } 1.463 +} 1.464 + 1.465 +static UBool isNewline(UChar32 c) { 1.466 + switch (c) { 1.467 + /* '\n', '\r', 0x2029 */ 1.468 + case 0x000A: 1.469 + case 0x2029: 1.470 + lineCount++; 1.471 + case 0x000D: 1.472 + return TRUE; 1.473 + 1.474 + default: 1.475 + return FALSE; 1.476 + } 1.477 +}