intl/icu/source/tools/genrb/read.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 1998-2012, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 *
michael@0 9 * File read.c
michael@0 10 *
michael@0 11 * Modification History:
michael@0 12 *
michael@0 13 * Date Name Description
michael@0 14 * 05/26/99 stephen Creation.
michael@0 15 * 5/10/01 Ram removed ustdio dependency
michael@0 16 *******************************************************************************
michael@0 17 */
michael@0 18
michael@0 19 #include "read.h"
michael@0 20 #include "errmsg.h"
michael@0 21 #include "unicode/ustring.h"
michael@0 22 #include "unicode/utf16.h"
michael@0 23
michael@0 24 #define OPENBRACE 0x007B
michael@0 25 #define CLOSEBRACE 0x007D
michael@0 26 #define COMMA 0x002C
michael@0 27 #define QUOTE 0x0022
michael@0 28 #define ESCAPE 0x005C
michael@0 29 #define SLASH 0x002F
michael@0 30 #define ASTERISK 0x002A
michael@0 31 #define SPACE 0x0020
michael@0 32 #define COLON 0x003A
michael@0 33 #define BADBOM 0xFFFE
michael@0 34 #define CR 0x000D
michael@0 35 #define LF 0x000A
michael@0 36
michael@0 37 static int32_t lineCount;
michael@0 38
michael@0 39 /* Protos */
michael@0 40 static enum ETokenType getStringToken(UCHARBUF *buf,
michael@0 41 UChar32 initialChar,
michael@0 42 struct UString *token,
michael@0 43 UErrorCode *status);
michael@0 44
michael@0 45 static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
michael@0 46 static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status);
michael@0 47 static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
michael@0 48 static UBool isWhitespace (UChar32 c);
michael@0 49 static UBool isNewline (UChar32 c);
michael@0 50
michael@0 51 U_CFUNC void resetLineNumber() {
michael@0 52 lineCount = 1;
michael@0 53 }
michael@0 54
michael@0 55 /* Read and return the next token from the stream. If the token is of
michael@0 56 type eString, fill in the token parameter with the token. If the
michael@0 57 token is eError, then the status parameter will contain the
michael@0 58 specific error. This will be eItemNotFound at the end of file,
michael@0 59 indicating that all tokens have been returned. This method will
michael@0 60 never return eString twice in a row; instead, multiple adjacent
michael@0 61 string tokens will be merged into one, with no intervening
michael@0 62 space. */
michael@0 63 U_CFUNC enum ETokenType
michael@0 64 getNextToken(UCHARBUF* buf,
michael@0 65 struct UString *token,
michael@0 66 uint32_t *linenumber, /* out: linenumber of token */
michael@0 67 struct UString *comment,
michael@0 68 UErrorCode *status) {
michael@0 69 enum ETokenType result;
michael@0 70 UChar32 c;
michael@0 71
michael@0 72 if (U_FAILURE(*status)) {
michael@0 73 return TOK_ERROR;
michael@0 74 }
michael@0 75
michael@0 76 /* Skip whitespace */
michael@0 77 c = getNextChar(buf, TRUE, comment, status);
michael@0 78
michael@0 79 if (U_FAILURE(*status)) {
michael@0 80 return TOK_ERROR;
michael@0 81 }
michael@0 82
michael@0 83 *linenumber = lineCount;
michael@0 84
michael@0 85 switch(c) {
michael@0 86 case BADBOM:
michael@0 87 return TOK_ERROR;
michael@0 88 case OPENBRACE:
michael@0 89 return TOK_OPEN_BRACE;
michael@0 90 case CLOSEBRACE:
michael@0 91 return TOK_CLOSE_BRACE;
michael@0 92 case COMMA:
michael@0 93 return TOK_COMMA;
michael@0 94 case U_EOF:
michael@0 95 return TOK_EOF;
michael@0 96 case COLON:
michael@0 97 return TOK_COLON;
michael@0 98
michael@0 99 default:
michael@0 100 result = getStringToken(buf, c, token, status);
michael@0 101 }
michael@0 102
michael@0 103 *linenumber = lineCount;
michael@0 104 return result;
michael@0 105 }
michael@0 106
michael@0 107 /* Copy a string token into the given UnicodeString. Upon entry, we
michael@0 108 have already read the first character of the string token, which is
michael@0 109 not a whitespace character (but may be a QUOTE or ESCAPE). This
michael@0 110 function reads all subsequent characters that belong with this
michael@0 111 string, and copy them into the token parameter. The other
michael@0 112 important, and slightly convoluted purpose of this function is to
michael@0 113 merge adjacent strings. It looks forward a bit, and if the next
michael@0 114 non comment, non whitespace item is a string, it reads it in as
michael@0 115 well. If two adjacent strings are quoted, they are merged without
michael@0 116 intervening space. Otherwise a single SPACE character is
michael@0 117 inserted. */
michael@0 118 static enum ETokenType getStringToken(UCHARBUF* buf,
michael@0 119 UChar32 initialChar,
michael@0 120 struct UString *token,
michael@0 121 UErrorCode *status) {
michael@0 122 UBool lastStringWasQuoted;
michael@0 123 UChar32 c;
michael@0 124 UChar target[3] = { '\0' };
michael@0 125 UChar *pTarget = target;
michael@0 126 int len=0;
michael@0 127 UBool isFollowingCharEscaped=FALSE;
michael@0 128 UBool isNLUnescaped = FALSE;
michael@0 129 UChar32 prevC=0;
michael@0 130
michael@0 131 /* We are guaranteed on entry that initialChar is not a whitespace
michael@0 132 character. If we are at the EOF, or have some other problem, it
michael@0 133 doesn't matter; we still want to validly return the initialChar
michael@0 134 (if nothing else) as a string token. */
michael@0 135
michael@0 136 if (U_FAILURE(*status)) {
michael@0 137 return TOK_ERROR;
michael@0 138 }
michael@0 139
michael@0 140 /* setup */
michael@0 141 lastStringWasQuoted = FALSE;
michael@0 142 c = initialChar;
michael@0 143 ustr_setlen(token, 0, status);
michael@0 144
michael@0 145 if (U_FAILURE(*status)) {
michael@0 146 return TOK_ERROR;
michael@0 147 }
michael@0 148
michael@0 149 for (;;) {
michael@0 150 if (c == QUOTE) {
michael@0 151 if (!lastStringWasQuoted && token->fLength > 0) {
michael@0 152 ustr_ucat(token, SPACE, status);
michael@0 153
michael@0 154 if (U_FAILURE(*status)) {
michael@0 155 return TOK_ERROR;
michael@0 156 }
michael@0 157 }
michael@0 158
michael@0 159 lastStringWasQuoted = TRUE;
michael@0 160
michael@0 161 for (;;) {
michael@0 162 c = ucbuf_getc(buf,status);
michael@0 163
michael@0 164 /* EOF reached */
michael@0 165 if (c == U_EOF) {
michael@0 166 return TOK_EOF;
michael@0 167 }
michael@0 168
michael@0 169 /* Unterminated quoted strings */
michael@0 170 if (U_FAILURE(*status)) {
michael@0 171 return TOK_ERROR;
michael@0 172 }
michael@0 173
michael@0 174 if (c == QUOTE && !isFollowingCharEscaped) {
michael@0 175 break;
michael@0 176 }
michael@0 177
michael@0 178 if (c == ESCAPE && !isFollowingCharEscaped) {
michael@0 179 pTarget = target;
michael@0 180 c = unescape(buf, status);
michael@0 181
michael@0 182 if (c == U_ERR) {
michael@0 183 return TOK_ERROR;
michael@0 184 }
michael@0 185 if(c == CR || c == LF){
michael@0 186 isNLUnescaped = TRUE;
michael@0 187 }
michael@0 188 }
michael@0 189
michael@0 190 if(c==ESCAPE && !isFollowingCharEscaped){
michael@0 191 isFollowingCharEscaped = TRUE;
michael@0 192 }else{
michael@0 193 U_APPEND_CHAR32(c, pTarget,len);
michael@0 194 pTarget = target;
michael@0 195 ustr_uscat(token, pTarget,len, status);
michael@0 196 isFollowingCharEscaped = FALSE;
michael@0 197 len=0;
michael@0 198 if(c == CR || c == LF){
michael@0 199 if(isNLUnescaped == FALSE && prevC!=CR){
michael@0 200 lineCount++;
michael@0 201 }
michael@0 202 isNLUnescaped = FALSE;
michael@0 203 }
michael@0 204 }
michael@0 205
michael@0 206 if (U_FAILURE(*status)) {
michael@0 207 return TOK_ERROR;
michael@0 208 }
michael@0 209 prevC = c;
michael@0 210 }
michael@0 211 } else {
michael@0 212 if (token->fLength > 0) {
michael@0 213 ustr_ucat(token, SPACE, status);
michael@0 214
michael@0 215 if (U_FAILURE(*status)) {
michael@0 216 return TOK_ERROR;
michael@0 217 }
michael@0 218 }
michael@0 219
michael@0 220 if(lastStringWasQuoted){
michael@0 221 if(getShowWarning()){
michael@0 222 warning(lineCount, "Mixing quoted and unquoted strings");
michael@0 223 }
michael@0 224 if(isStrict()){
michael@0 225 return TOK_ERROR;
michael@0 226 }
michael@0 227
michael@0 228 }
michael@0 229
michael@0 230 lastStringWasQuoted = FALSE;
michael@0 231
michael@0 232 /* if we reach here we are mixing
michael@0 233 * quoted and unquoted strings
michael@0 234 * warn in normal mode and error in
michael@0 235 * pedantic mode
michael@0 236 */
michael@0 237
michael@0 238 if (c == ESCAPE) {
michael@0 239 pTarget = target;
michael@0 240 c = unescape(buf, status);
michael@0 241
michael@0 242 /* EOF reached */
michael@0 243 if (c == U_EOF) {
michael@0 244 return TOK_ERROR;
michael@0 245 }
michael@0 246 }
michael@0 247
michael@0 248 U_APPEND_CHAR32(c, pTarget,len);
michael@0 249 pTarget = target;
michael@0 250 ustr_uscat(token, pTarget,len, status);
michael@0 251 len=0;
michael@0 252
michael@0 253 if (U_FAILURE(*status)) {
michael@0 254 return TOK_ERROR;
michael@0 255 }
michael@0 256
michael@0 257 for (;;) {
michael@0 258 /* DON'T skip whitespace */
michael@0 259 c = getNextChar(buf, FALSE, NULL, status);
michael@0 260
michael@0 261 /* EOF reached */
michael@0 262 if (c == U_EOF) {
michael@0 263 ucbuf_ungetc(c, buf);
michael@0 264 return TOK_STRING;
michael@0 265 }
michael@0 266
michael@0 267 if (U_FAILURE(*status)) {
michael@0 268 return TOK_STRING;
michael@0 269 }
michael@0 270
michael@0 271 if (c == QUOTE
michael@0 272 || c == OPENBRACE
michael@0 273 || c == CLOSEBRACE
michael@0 274 || c == COMMA
michael@0 275 || c == COLON) {
michael@0 276 ucbuf_ungetc(c, buf);
michael@0 277 break;
michael@0 278 }
michael@0 279
michael@0 280 if (isWhitespace(c)) {
michael@0 281 break;
michael@0 282 }
michael@0 283
michael@0 284 if (c == ESCAPE) {
michael@0 285 pTarget = target;
michael@0 286 c = unescape(buf, status);
michael@0 287
michael@0 288 if (c == U_ERR) {
michael@0 289 return TOK_ERROR;
michael@0 290 }
michael@0 291 }
michael@0 292
michael@0 293 U_APPEND_CHAR32(c, pTarget,len);
michael@0 294 pTarget = target;
michael@0 295 ustr_uscat(token, pTarget,len, status);
michael@0 296 len=0;
michael@0 297 if (U_FAILURE(*status)) {
michael@0 298 return TOK_ERROR;
michael@0 299 }
michael@0 300 }
michael@0 301 }
michael@0 302
michael@0 303 /* DO skip whitespace */
michael@0 304 c = getNextChar(buf, TRUE, NULL, status);
michael@0 305
michael@0 306 if (U_FAILURE(*status)) {
michael@0 307 return TOK_STRING;
michael@0 308 }
michael@0 309
michael@0 310 if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
michael@0 311 ucbuf_ungetc(c, buf);
michael@0 312 return TOK_STRING;
michael@0 313 }
michael@0 314 }
michael@0 315 }
michael@0 316
michael@0 317 /* Retrieve the next character. If skipwhite is
michael@0 318 true, whitespace is skipped as well. */
michael@0 319 static UChar32 getNextChar(UCHARBUF* buf,
michael@0 320 UBool skipwhite,
michael@0 321 struct UString *token,
michael@0 322 UErrorCode *status) {
michael@0 323 UChar32 c, c2;
michael@0 324
michael@0 325 if (U_FAILURE(*status)) {
michael@0 326 return U_EOF;
michael@0 327 }
michael@0 328
michael@0 329 for (;;) {
michael@0 330 c = ucbuf_getc(buf,status);
michael@0 331
michael@0 332 if (c == U_EOF) {
michael@0 333 return U_EOF;
michael@0 334 }
michael@0 335
michael@0 336 if (skipwhite && isWhitespace(c)) {
michael@0 337 continue;
michael@0 338 }
michael@0 339
michael@0 340 /* This also handles the get() failing case */
michael@0 341 if (c != SLASH) {
michael@0 342 return c;
michael@0 343 }
michael@0 344
michael@0 345 c = ucbuf_getc(buf,status); /* "/c" */
michael@0 346
michael@0 347 if (c == U_EOF) {
michael@0 348 return U_EOF;
michael@0 349 }
michael@0 350
michael@0 351 switch (c) {
michael@0 352 case SLASH: /* "//" */
michael@0 353 seekUntilNewline(buf, NULL, status);
michael@0 354 break;
michael@0 355
michael@0 356 case ASTERISK: /* " / * " */
michael@0 357 c2 = ucbuf_getc(buf, status); /* "/ * c" */
michael@0 358 if(c2 == ASTERISK){ /* "/ * *" */
michael@0 359 /* parse multi-line comment and store it in token*/
michael@0 360 seekUntilEndOfComment(buf, token, status);
michael@0 361 } else {
michael@0 362 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */
michael@0 363 seekUntilEndOfComment(buf, NULL, status);
michael@0 364 }
michael@0 365 break;
michael@0 366
michael@0 367 default:
michael@0 368 ucbuf_ungetc(c, buf); /* "/c" - put back the c */
michael@0 369 /* If get() failed this is a NOP */
michael@0 370 return SLASH;
michael@0 371 }
michael@0 372
michael@0 373 }
michael@0 374 }
michael@0 375
michael@0 376 static void seekUntilNewline(UCHARBUF* buf,
michael@0 377 struct UString *token,
michael@0 378 UErrorCode *status) {
michael@0 379 UChar32 c;
michael@0 380
michael@0 381 if (U_FAILURE(*status)) {
michael@0 382 return;
michael@0 383 }
michael@0 384
michael@0 385 do {
michael@0 386 c = ucbuf_getc(buf,status);
michael@0 387 /* add the char to token */
michael@0 388 if(token!=NULL){
michael@0 389 ustr_u32cat(token, c, status);
michael@0 390 }
michael@0 391 } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
michael@0 392 }
michael@0 393
michael@0 394 static void seekUntilEndOfComment(UCHARBUF *buf,
michael@0 395 struct UString *token,
michael@0 396 UErrorCode *status) {
michael@0 397 UChar32 c, d;
michael@0 398 uint32_t line;
michael@0 399
michael@0 400 if (U_FAILURE(*status)) {
michael@0 401 return;
michael@0 402 }
michael@0 403
michael@0 404 line = lineCount;
michael@0 405
michael@0 406 do {
michael@0 407 c = ucbuf_getc(buf, status);
michael@0 408
michael@0 409 if (c == ASTERISK) {
michael@0 410 d = ucbuf_getc(buf, status);
michael@0 411
michael@0 412 if (d != SLASH) {
michael@0 413 ucbuf_ungetc(d, buf);
michael@0 414 } else {
michael@0 415 break;
michael@0 416 }
michael@0 417 }
michael@0 418 /* add the char to token */
michael@0 419 if(token!=NULL){
michael@0 420 ustr_u32cat(token, c, status);
michael@0 421 }
michael@0 422 /* increment the lineCount */
michael@0 423 isNewline(c);
michael@0 424
michael@0 425 } while (c != U_EOF && *status == U_ZERO_ERROR);
michael@0 426
michael@0 427 if (c == U_EOF) {
michael@0 428 *status = U_INVALID_FORMAT_ERROR;
michael@0 429 error(line, "unterminated comment detected");
michael@0 430 }
michael@0 431 }
michael@0 432
michael@0 433 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
michael@0 434 if (U_FAILURE(*status)) {
michael@0 435 return U_EOF;
michael@0 436 }
michael@0 437
michael@0 438 /* We expect to be called after the ESCAPE has been seen, but
michael@0 439 * u_fgetcx needs an ESCAPE to do its magic. */
michael@0 440 ucbuf_ungetc(ESCAPE, buf);
michael@0 441
michael@0 442 return ucbuf_getcx32(buf, status);
michael@0 443 }
michael@0 444
michael@0 445 static UBool isWhitespace(UChar32 c) {
michael@0 446 switch (c) {
michael@0 447 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
michael@0 448 case 0x000A:
michael@0 449 case 0x2029:
michael@0 450 lineCount++;
michael@0 451 case 0x000D:
michael@0 452 case 0x0020:
michael@0 453 case 0x0009:
michael@0 454 case 0xFEFF:
michael@0 455 return TRUE;
michael@0 456
michael@0 457 default:
michael@0 458 return FALSE;
michael@0 459 }
michael@0 460 }
michael@0 461
michael@0 462 static UBool isNewline(UChar32 c) {
michael@0 463 switch (c) {
michael@0 464 /* '\n', '\r', 0x2029 */
michael@0 465 case 0x000A:
michael@0 466 case 0x2029:
michael@0 467 lineCount++;
michael@0 468 case 0x000D:
michael@0 469 return TRUE;
michael@0 470
michael@0 471 default:
michael@0 472 return FALSE;
michael@0 473 }
michael@0 474 }

mercurial