|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 1998-2012, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * |
|
9 * File read.c |
|
10 * |
|
11 * Modification History: |
|
12 * |
|
13 * Date Name Description |
|
14 * 05/26/99 stephen Creation. |
|
15 * 5/10/01 Ram removed ustdio dependency |
|
16 ******************************************************************************* |
|
17 */ |
|
18 |
|
19 #include "read.h" |
|
20 #include "errmsg.h" |
|
21 #include "unicode/ustring.h" |
|
22 #include "unicode/utf16.h" |
|
23 |
|
24 #define OPENBRACE 0x007B |
|
25 #define CLOSEBRACE 0x007D |
|
26 #define COMMA 0x002C |
|
27 #define QUOTE 0x0022 |
|
28 #define ESCAPE 0x005C |
|
29 #define SLASH 0x002F |
|
30 #define ASTERISK 0x002A |
|
31 #define SPACE 0x0020 |
|
32 #define COLON 0x003A |
|
33 #define BADBOM 0xFFFE |
|
34 #define CR 0x000D |
|
35 #define LF 0x000A |
|
36 |
|
37 static int32_t lineCount; |
|
38 |
|
39 /* Protos */ |
|
40 static enum ETokenType getStringToken(UCHARBUF *buf, |
|
41 UChar32 initialChar, |
|
42 struct UString *token, |
|
43 UErrorCode *status); |
|
44 |
|
45 static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status); |
|
46 static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status); |
|
47 static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status); |
|
48 static UBool isWhitespace (UChar32 c); |
|
49 static UBool isNewline (UChar32 c); |
|
50 |
|
51 U_CFUNC void resetLineNumber() { |
|
52 lineCount = 1; |
|
53 } |
|
54 |
|
55 /* Read and return the next token from the stream. If the token is of |
|
56 type eString, fill in the token parameter with the token. If the |
|
57 token is eError, then the status parameter will contain the |
|
58 specific error. This will be eItemNotFound at the end of file, |
|
59 indicating that all tokens have been returned. This method will |
|
60 never return eString twice in a row; instead, multiple adjacent |
|
61 string tokens will be merged into one, with no intervening |
|
62 space. */ |
|
63 U_CFUNC enum ETokenType |
|
64 getNextToken(UCHARBUF* buf, |
|
65 struct UString *token, |
|
66 uint32_t *linenumber, /* out: linenumber of token */ |
|
67 struct UString *comment, |
|
68 UErrorCode *status) { |
|
69 enum ETokenType result; |
|
70 UChar32 c; |
|
71 |
|
72 if (U_FAILURE(*status)) { |
|
73 return TOK_ERROR; |
|
74 } |
|
75 |
|
76 /* Skip whitespace */ |
|
77 c = getNextChar(buf, TRUE, comment, status); |
|
78 |
|
79 if (U_FAILURE(*status)) { |
|
80 return TOK_ERROR; |
|
81 } |
|
82 |
|
83 *linenumber = lineCount; |
|
84 |
|
85 switch(c) { |
|
86 case BADBOM: |
|
87 return TOK_ERROR; |
|
88 case OPENBRACE: |
|
89 return TOK_OPEN_BRACE; |
|
90 case CLOSEBRACE: |
|
91 return TOK_CLOSE_BRACE; |
|
92 case COMMA: |
|
93 return TOK_COMMA; |
|
94 case U_EOF: |
|
95 return TOK_EOF; |
|
96 case COLON: |
|
97 return TOK_COLON; |
|
98 |
|
99 default: |
|
100 result = getStringToken(buf, c, token, status); |
|
101 } |
|
102 |
|
103 *linenumber = lineCount; |
|
104 return result; |
|
105 } |
|
106 |
|
107 /* Copy a string token into the given UnicodeString. Upon entry, we |
|
108 have already read the first character of the string token, which is |
|
109 not a whitespace character (but may be a QUOTE or ESCAPE). This |
|
110 function reads all subsequent characters that belong with this |
|
111 string, and copy them into the token parameter. The other |
|
112 important, and slightly convoluted purpose of this function is to |
|
113 merge adjacent strings. It looks forward a bit, and if the next |
|
114 non comment, non whitespace item is a string, it reads it in as |
|
115 well. If two adjacent strings are quoted, they are merged without |
|
116 intervening space. Otherwise a single SPACE character is |
|
117 inserted. */ |
|
118 static enum ETokenType getStringToken(UCHARBUF* buf, |
|
119 UChar32 initialChar, |
|
120 struct UString *token, |
|
121 UErrorCode *status) { |
|
122 UBool lastStringWasQuoted; |
|
123 UChar32 c; |
|
124 UChar target[3] = { '\0' }; |
|
125 UChar *pTarget = target; |
|
126 int len=0; |
|
127 UBool isFollowingCharEscaped=FALSE; |
|
128 UBool isNLUnescaped = FALSE; |
|
129 UChar32 prevC=0; |
|
130 |
|
131 /* We are guaranteed on entry that initialChar is not a whitespace |
|
132 character. If we are at the EOF, or have some other problem, it |
|
133 doesn't matter; we still want to validly return the initialChar |
|
134 (if nothing else) as a string token. */ |
|
135 |
|
136 if (U_FAILURE(*status)) { |
|
137 return TOK_ERROR; |
|
138 } |
|
139 |
|
140 /* setup */ |
|
141 lastStringWasQuoted = FALSE; |
|
142 c = initialChar; |
|
143 ustr_setlen(token, 0, status); |
|
144 |
|
145 if (U_FAILURE(*status)) { |
|
146 return TOK_ERROR; |
|
147 } |
|
148 |
|
149 for (;;) { |
|
150 if (c == QUOTE) { |
|
151 if (!lastStringWasQuoted && token->fLength > 0) { |
|
152 ustr_ucat(token, SPACE, status); |
|
153 |
|
154 if (U_FAILURE(*status)) { |
|
155 return TOK_ERROR; |
|
156 } |
|
157 } |
|
158 |
|
159 lastStringWasQuoted = TRUE; |
|
160 |
|
161 for (;;) { |
|
162 c = ucbuf_getc(buf,status); |
|
163 |
|
164 /* EOF reached */ |
|
165 if (c == U_EOF) { |
|
166 return TOK_EOF; |
|
167 } |
|
168 |
|
169 /* Unterminated quoted strings */ |
|
170 if (U_FAILURE(*status)) { |
|
171 return TOK_ERROR; |
|
172 } |
|
173 |
|
174 if (c == QUOTE && !isFollowingCharEscaped) { |
|
175 break; |
|
176 } |
|
177 |
|
178 if (c == ESCAPE && !isFollowingCharEscaped) { |
|
179 pTarget = target; |
|
180 c = unescape(buf, status); |
|
181 |
|
182 if (c == U_ERR) { |
|
183 return TOK_ERROR; |
|
184 } |
|
185 if(c == CR || c == LF){ |
|
186 isNLUnescaped = TRUE; |
|
187 } |
|
188 } |
|
189 |
|
190 if(c==ESCAPE && !isFollowingCharEscaped){ |
|
191 isFollowingCharEscaped = TRUE; |
|
192 }else{ |
|
193 U_APPEND_CHAR32(c, pTarget,len); |
|
194 pTarget = target; |
|
195 ustr_uscat(token, pTarget,len, status); |
|
196 isFollowingCharEscaped = FALSE; |
|
197 len=0; |
|
198 if(c == CR || c == LF){ |
|
199 if(isNLUnescaped == FALSE && prevC!=CR){ |
|
200 lineCount++; |
|
201 } |
|
202 isNLUnescaped = FALSE; |
|
203 } |
|
204 } |
|
205 |
|
206 if (U_FAILURE(*status)) { |
|
207 return TOK_ERROR; |
|
208 } |
|
209 prevC = c; |
|
210 } |
|
211 } else { |
|
212 if (token->fLength > 0) { |
|
213 ustr_ucat(token, SPACE, status); |
|
214 |
|
215 if (U_FAILURE(*status)) { |
|
216 return TOK_ERROR; |
|
217 } |
|
218 } |
|
219 |
|
220 if(lastStringWasQuoted){ |
|
221 if(getShowWarning()){ |
|
222 warning(lineCount, "Mixing quoted and unquoted strings"); |
|
223 } |
|
224 if(isStrict()){ |
|
225 return TOK_ERROR; |
|
226 } |
|
227 |
|
228 } |
|
229 |
|
230 lastStringWasQuoted = FALSE; |
|
231 |
|
232 /* if we reach here we are mixing |
|
233 * quoted and unquoted strings |
|
234 * warn in normal mode and error in |
|
235 * pedantic mode |
|
236 */ |
|
237 |
|
238 if (c == ESCAPE) { |
|
239 pTarget = target; |
|
240 c = unescape(buf, status); |
|
241 |
|
242 /* EOF reached */ |
|
243 if (c == U_EOF) { |
|
244 return TOK_ERROR; |
|
245 } |
|
246 } |
|
247 |
|
248 U_APPEND_CHAR32(c, pTarget,len); |
|
249 pTarget = target; |
|
250 ustr_uscat(token, pTarget,len, status); |
|
251 len=0; |
|
252 |
|
253 if (U_FAILURE(*status)) { |
|
254 return TOK_ERROR; |
|
255 } |
|
256 |
|
257 for (;;) { |
|
258 /* DON'T skip whitespace */ |
|
259 c = getNextChar(buf, FALSE, NULL, status); |
|
260 |
|
261 /* EOF reached */ |
|
262 if (c == U_EOF) { |
|
263 ucbuf_ungetc(c, buf); |
|
264 return TOK_STRING; |
|
265 } |
|
266 |
|
267 if (U_FAILURE(*status)) { |
|
268 return TOK_STRING; |
|
269 } |
|
270 |
|
271 if (c == QUOTE |
|
272 || c == OPENBRACE |
|
273 || c == CLOSEBRACE |
|
274 || c == COMMA |
|
275 || c == COLON) { |
|
276 ucbuf_ungetc(c, buf); |
|
277 break; |
|
278 } |
|
279 |
|
280 if (isWhitespace(c)) { |
|
281 break; |
|
282 } |
|
283 |
|
284 if (c == ESCAPE) { |
|
285 pTarget = target; |
|
286 c = unescape(buf, status); |
|
287 |
|
288 if (c == U_ERR) { |
|
289 return TOK_ERROR; |
|
290 } |
|
291 } |
|
292 |
|
293 U_APPEND_CHAR32(c, pTarget,len); |
|
294 pTarget = target; |
|
295 ustr_uscat(token, pTarget,len, status); |
|
296 len=0; |
|
297 if (U_FAILURE(*status)) { |
|
298 return TOK_ERROR; |
|
299 } |
|
300 } |
|
301 } |
|
302 |
|
303 /* DO skip whitespace */ |
|
304 c = getNextChar(buf, TRUE, NULL, status); |
|
305 |
|
306 if (U_FAILURE(*status)) { |
|
307 return TOK_STRING; |
|
308 } |
|
309 |
|
310 if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) { |
|
311 ucbuf_ungetc(c, buf); |
|
312 return TOK_STRING; |
|
313 } |
|
314 } |
|
315 } |
|
316 |
|
317 /* Retrieve the next character. If skipwhite is |
|
318 true, whitespace is skipped as well. */ |
|
319 static UChar32 getNextChar(UCHARBUF* buf, |
|
320 UBool skipwhite, |
|
321 struct UString *token, |
|
322 UErrorCode *status) { |
|
323 UChar32 c, c2; |
|
324 |
|
325 if (U_FAILURE(*status)) { |
|
326 return U_EOF; |
|
327 } |
|
328 |
|
329 for (;;) { |
|
330 c = ucbuf_getc(buf,status); |
|
331 |
|
332 if (c == U_EOF) { |
|
333 return U_EOF; |
|
334 } |
|
335 |
|
336 if (skipwhite && isWhitespace(c)) { |
|
337 continue; |
|
338 } |
|
339 |
|
340 /* This also handles the get() failing case */ |
|
341 if (c != SLASH) { |
|
342 return c; |
|
343 } |
|
344 |
|
345 c = ucbuf_getc(buf,status); /* "/c" */ |
|
346 |
|
347 if (c == U_EOF) { |
|
348 return U_EOF; |
|
349 } |
|
350 |
|
351 switch (c) { |
|
352 case SLASH: /* "//" */ |
|
353 seekUntilNewline(buf, NULL, status); |
|
354 break; |
|
355 |
|
356 case ASTERISK: /* " / * " */ |
|
357 c2 = ucbuf_getc(buf, status); /* "/ * c" */ |
|
358 if(c2 == ASTERISK){ /* "/ * *" */ |
|
359 /* parse multi-line comment and store it in token*/ |
|
360 seekUntilEndOfComment(buf, token, status); |
|
361 } else { |
|
362 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */ |
|
363 seekUntilEndOfComment(buf, NULL, status); |
|
364 } |
|
365 break; |
|
366 |
|
367 default: |
|
368 ucbuf_ungetc(c, buf); /* "/c" - put back the c */ |
|
369 /* If get() failed this is a NOP */ |
|
370 return SLASH; |
|
371 } |
|
372 |
|
373 } |
|
374 } |
|
375 |
|
376 static void seekUntilNewline(UCHARBUF* buf, |
|
377 struct UString *token, |
|
378 UErrorCode *status) { |
|
379 UChar32 c; |
|
380 |
|
381 if (U_FAILURE(*status)) { |
|
382 return; |
|
383 } |
|
384 |
|
385 do { |
|
386 c = ucbuf_getc(buf,status); |
|
387 /* add the char to token */ |
|
388 if(token!=NULL){ |
|
389 ustr_u32cat(token, c, status); |
|
390 } |
|
391 } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR); |
|
392 } |
|
393 |
|
394 static void seekUntilEndOfComment(UCHARBUF *buf, |
|
395 struct UString *token, |
|
396 UErrorCode *status) { |
|
397 UChar32 c, d; |
|
398 uint32_t line; |
|
399 |
|
400 if (U_FAILURE(*status)) { |
|
401 return; |
|
402 } |
|
403 |
|
404 line = lineCount; |
|
405 |
|
406 do { |
|
407 c = ucbuf_getc(buf, status); |
|
408 |
|
409 if (c == ASTERISK) { |
|
410 d = ucbuf_getc(buf, status); |
|
411 |
|
412 if (d != SLASH) { |
|
413 ucbuf_ungetc(d, buf); |
|
414 } else { |
|
415 break; |
|
416 } |
|
417 } |
|
418 /* add the char to token */ |
|
419 if(token!=NULL){ |
|
420 ustr_u32cat(token, c, status); |
|
421 } |
|
422 /* increment the lineCount */ |
|
423 isNewline(c); |
|
424 |
|
425 } while (c != U_EOF && *status == U_ZERO_ERROR); |
|
426 |
|
427 if (c == U_EOF) { |
|
428 *status = U_INVALID_FORMAT_ERROR; |
|
429 error(line, "unterminated comment detected"); |
|
430 } |
|
431 } |
|
432 |
|
433 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) { |
|
434 if (U_FAILURE(*status)) { |
|
435 return U_EOF; |
|
436 } |
|
437 |
|
438 /* We expect to be called after the ESCAPE has been seen, but |
|
439 * u_fgetcx needs an ESCAPE to do its magic. */ |
|
440 ucbuf_ungetc(ESCAPE, buf); |
|
441 |
|
442 return ucbuf_getcx32(buf, status); |
|
443 } |
|
444 |
|
445 static UBool isWhitespace(UChar32 c) { |
|
446 switch (c) { |
|
447 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */ |
|
448 case 0x000A: |
|
449 case 0x2029: |
|
450 lineCount++; |
|
451 case 0x000D: |
|
452 case 0x0020: |
|
453 case 0x0009: |
|
454 case 0xFEFF: |
|
455 return TRUE; |
|
456 |
|
457 default: |
|
458 return FALSE; |
|
459 } |
|
460 } |
|
461 |
|
462 static UBool isNewline(UChar32 c) { |
|
463 switch (c) { |
|
464 /* '\n', '\r', 0x2029 */ |
|
465 case 0x000A: |
|
466 case 0x2029: |
|
467 lineCount++; |
|
468 case 0x000D: |
|
469 return TRUE; |
|
470 |
|
471 default: |
|
472 return FALSE; |
|
473 } |
|
474 } |