|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2000-2012, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: uparse.c |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2000apr18 |
|
14 * created by: Markus W. Scherer |
|
15 * |
|
16 * This file provides a parser for files that are delimited by one single |
|
17 * character like ';' or TAB. Example: the Unicode Character Properties files |
|
18 * like UnicodeData.txt are semicolon-delimited. |
|
19 */ |
|
20 |
|
21 #include "unicode/utypes.h" |
|
22 #include "unicode/uchar.h" |
|
23 #include "unicode/ustring.h" |
|
24 #include "unicode/utf16.h" |
|
25 #include "cstring.h" |
|
26 #include "filestrm.h" |
|
27 #include "uparse.h" |
|
28 #include "ustr_imp.h" |
|
29 |
|
30 #include <stdio.h> |
|
31 |
|
32 U_CAPI const char * U_EXPORT2 |
|
33 u_skipWhitespace(const char *s) { |
|
34 while(U_IS_INV_WHITESPACE(*s)) { |
|
35 ++s; |
|
36 } |
|
37 return s; |
|
38 } |
|
39 |
|
40 U_CAPI char * U_EXPORT2 |
|
41 u_rtrim(char *s) { |
|
42 char *end=uprv_strchr(s, 0); |
|
43 while(s<end && U_IS_INV_WHITESPACE(*(end-1))) { |
|
44 *--end = 0; |
|
45 } |
|
46 return end; |
|
47 } |
|
48 |
|
/*
 * Recognize the prefix "# @missing:" at the start of a line, allowing any
 * amount of white space before and between its four tokens
 * ('#', '@', "missing", ':').
 *
 * Unicode 5.0 adds such lines in some data files to document
 * default property values.
 *
 * @param s NUL-terminated line.
 * @return The first non-whitespace character after the ':' if the whole
 *         prefix is present, otherwise the original pointer s.
 */
static const char *
getMissingLimit(const char *s) {
    const char *p;

    p=u_skipWhitespace(s);
    if(*p!='#') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(*p!='@') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(0!=strncmp(p, "missing", 7)) {
        return s;
    }

    /* 7 == length of "missing" */
    p=u_skipWhitespace(p+7);
    if(*p!=':') {
        return s;
    }

    return u_skipWhitespace(p+1);
}
|
71 |
|
72 U_CAPI void U_EXPORT2 |
|
73 u_parseDelimitedFile(const char *filename, char delimiter, |
|
74 char *fields[][2], int32_t fieldCount, |
|
75 UParseLineFn *lineFn, void *context, |
|
76 UErrorCode *pErrorCode) { |
|
77 FileStream *file; |
|
78 char line[300]; |
|
79 char *start, *limit; |
|
80 int32_t i, length; |
|
81 |
|
82 if(U_FAILURE(*pErrorCode)) { |
|
83 return; |
|
84 } |
|
85 |
|
86 if(fields==NULL || lineFn==NULL || fieldCount<=0) { |
|
87 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
88 return; |
|
89 } |
|
90 |
|
91 if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) { |
|
92 filename=NULL; |
|
93 file=T_FileStream_stdin(); |
|
94 } else { |
|
95 file=T_FileStream_open(filename, "r"); |
|
96 } |
|
97 if(file==NULL) { |
|
98 *pErrorCode=U_FILE_ACCESS_ERROR; |
|
99 return; |
|
100 } |
|
101 |
|
102 while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) { |
|
103 /* remove trailing newline characters */ |
|
104 length=(int32_t)(u_rtrim(line)-line); |
|
105 |
|
106 /* |
|
107 * detect a line with # @missing: |
|
108 * start parsing after that, or else from the beginning of the line |
|
109 * set the default warning for @missing lines |
|
110 */ |
|
111 start=(char *)getMissingLimit(line); |
|
112 if(start==line) { |
|
113 *pErrorCode=U_ZERO_ERROR; |
|
114 } else { |
|
115 *pErrorCode=U_USING_DEFAULT_WARNING; |
|
116 } |
|
117 |
|
118 /* skip this line if it is empty or a comment */ |
|
119 if(*start==0 || *start=='#') { |
|
120 continue; |
|
121 } |
|
122 |
|
123 /* remove in-line comments */ |
|
124 limit=uprv_strchr(start, '#'); |
|
125 if(limit!=NULL) { |
|
126 /* get white space before the pound sign */ |
|
127 while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) { |
|
128 --limit; |
|
129 } |
|
130 |
|
131 /* truncate the line */ |
|
132 *limit=0; |
|
133 } |
|
134 |
|
135 /* skip lines with only whitespace */ |
|
136 if(u_skipWhitespace(start)[0]==0) { |
|
137 continue; |
|
138 } |
|
139 |
|
140 /* for each field, call the corresponding field function */ |
|
141 for(i=0; i<fieldCount; ++i) { |
|
142 /* set the limit pointer of this field */ |
|
143 limit=start; |
|
144 while(*limit!=delimiter && *limit!=0) { |
|
145 ++limit; |
|
146 } |
|
147 |
|
148 /* set the field start and limit in the fields array */ |
|
149 fields[i][0]=start; |
|
150 fields[i][1]=limit; |
|
151 |
|
152 /* set start to the beginning of the next field, if any */ |
|
153 start=limit; |
|
154 if(*start!=0) { |
|
155 ++start; |
|
156 } else if(i+1<fieldCount) { |
|
157 *pErrorCode=U_PARSE_ERROR; |
|
158 limit=line+length; |
|
159 i=fieldCount; |
|
160 break; |
|
161 } |
|
162 } |
|
163 |
|
164 /* error in a field function? */ |
|
165 if(U_FAILURE(*pErrorCode)) { |
|
166 break; |
|
167 } |
|
168 |
|
169 /* call the field function */ |
|
170 lineFn(context, fields, fieldCount, pErrorCode); |
|
171 if(U_FAILURE(*pErrorCode)) { |
|
172 break; |
|
173 } |
|
174 } |
|
175 |
|
176 if(filename!=NULL) { |
|
177 T_FileStream_close(file); |
|
178 } |
|
179 } |
|
180 |
|
181 /* |
|
182 * parse a list of code points |
|
183 * store them as a UTF-32 string in dest[destCapacity] |
|
184 * return the number of code points |
|
185 */ |
|
186 U_CAPI int32_t U_EXPORT2 |
|
187 u_parseCodePoints(const char *s, |
|
188 uint32_t *dest, int32_t destCapacity, |
|
189 UErrorCode *pErrorCode) { |
|
190 char *end; |
|
191 uint32_t value; |
|
192 int32_t count; |
|
193 |
|
194 if(U_FAILURE(*pErrorCode)) { |
|
195 return 0; |
|
196 } |
|
197 if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) { |
|
198 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
199 return 0; |
|
200 } |
|
201 |
|
202 count=0; |
|
203 for(;;) { |
|
204 s=u_skipWhitespace(s); |
|
205 if(*s==';' || *s==0) { |
|
206 return count; |
|
207 } |
|
208 |
|
209 /* read one code point */ |
|
210 value=(uint32_t)uprv_strtoul(s, &end, 16); |
|
211 if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) { |
|
212 *pErrorCode=U_PARSE_ERROR; |
|
213 return 0; |
|
214 } |
|
215 |
|
216 /* append it to the destination array */ |
|
217 if(count<destCapacity) { |
|
218 dest[count++]=value; |
|
219 } else { |
|
220 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
221 } |
|
222 |
|
223 /* go to the following characters */ |
|
224 s=end; |
|
225 } |
|
226 } |
|
227 |
|
228 /* |
|
229 * parse a list of code points |
|
230 * store them as a string in dest[destCapacity] |
|
231 * set the first code point in *pFirst |
|
232 * @return The length of the string in numbers of UChars. |
|
233 */ |
|
234 U_CAPI int32_t U_EXPORT2 |
|
235 u_parseString(const char *s, |
|
236 UChar *dest, int32_t destCapacity, |
|
237 uint32_t *pFirst, |
|
238 UErrorCode *pErrorCode) { |
|
239 char *end; |
|
240 uint32_t value; |
|
241 int32_t destLength; |
|
242 |
|
243 if(U_FAILURE(*pErrorCode)) { |
|
244 return 0; |
|
245 } |
|
246 if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) { |
|
247 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
248 return 0; |
|
249 } |
|
250 |
|
251 if(pFirst!=NULL) { |
|
252 *pFirst=0xffffffff; |
|
253 } |
|
254 |
|
255 destLength=0; |
|
256 for(;;) { |
|
257 s=u_skipWhitespace(s); |
|
258 if(*s==';' || *s==0) { |
|
259 if(destLength<destCapacity) { |
|
260 dest[destLength]=0; |
|
261 } else if(destLength==destCapacity) { |
|
262 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; |
|
263 } else { |
|
264 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
|
265 } |
|
266 return destLength; |
|
267 } |
|
268 |
|
269 /* read one code point */ |
|
270 value=(uint32_t)uprv_strtoul(s, &end, 16); |
|
271 if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) { |
|
272 *pErrorCode=U_PARSE_ERROR; |
|
273 return 0; |
|
274 } |
|
275 |
|
276 /* store the first code point */ |
|
277 if(pFirst!=NULL) { |
|
278 *pFirst=value; |
|
279 pFirst=NULL; |
|
280 } |
|
281 |
|
282 /* append it to the destination array */ |
|
283 if((destLength+U16_LENGTH(value))<=destCapacity) { |
|
284 U16_APPEND_UNSAFE(dest, destLength, value); |
|
285 } else { |
|
286 destLength+=U16_LENGTH(value); |
|
287 } |
|
288 |
|
289 /* go to the following characters */ |
|
290 s=end; |
|
291 } |
|
292 } |
|
293 |
|
294 /* read a range like start or start..end */ |
|
295 U_CAPI int32_t U_EXPORT2 |
|
296 u_parseCodePointRangeAnyTerminator(const char *s, |
|
297 uint32_t *pStart, uint32_t *pEnd, |
|
298 const char **terminator, |
|
299 UErrorCode *pErrorCode) { |
|
300 char *end; |
|
301 uint32_t value; |
|
302 |
|
303 if(U_FAILURE(*pErrorCode)) { |
|
304 return 0; |
|
305 } |
|
306 if(s==NULL || pStart==NULL || pEnd==NULL) { |
|
307 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
308 return 0; |
|
309 } |
|
310 |
|
311 /* read the start code point */ |
|
312 s=u_skipWhitespace(s); |
|
313 value=(uint32_t)uprv_strtoul(s, &end, 16); |
|
314 if(end<=s || value>=0x110000) { |
|
315 *pErrorCode=U_PARSE_ERROR; |
|
316 return 0; |
|
317 } |
|
318 *pStart=*pEnd=value; |
|
319 |
|
320 /* is there a "..end"? */ |
|
321 s=u_skipWhitespace(end); |
|
322 if(*s!='.' || s[1]!='.') { |
|
323 *terminator=end; |
|
324 return 1; |
|
325 } |
|
326 s=u_skipWhitespace(s+2); |
|
327 |
|
328 /* read the end code point */ |
|
329 value=(uint32_t)uprv_strtoul(s, &end, 16); |
|
330 if(end<=s || value>=0x110000) { |
|
331 *pErrorCode=U_PARSE_ERROR; |
|
332 return 0; |
|
333 } |
|
334 *pEnd=value; |
|
335 |
|
336 /* is this a valid range? */ |
|
337 if(value<*pStart) { |
|
338 *pErrorCode=U_PARSE_ERROR; |
|
339 return 0; |
|
340 } |
|
341 |
|
342 *terminator=end; |
|
343 return value-*pStart+1; |
|
344 } |
|
345 |
|
346 U_CAPI int32_t U_EXPORT2 |
|
347 u_parseCodePointRange(const char *s, |
|
348 uint32_t *pStart, uint32_t *pEnd, |
|
349 UErrorCode *pErrorCode) { |
|
350 const char *terminator; |
|
351 int32_t rangeLength= |
|
352 u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode); |
|
353 if(U_SUCCESS(*pErrorCode)) { |
|
354 terminator=u_skipWhitespace(terminator); |
|
355 if(*terminator!=';' && *terminator!=0) { |
|
356 *pErrorCode=U_PARSE_ERROR; |
|
357 return 0; |
|
358 } |
|
359 } |
|
360 return rangeLength; |
|
361 } |
|
362 |
|
363 U_CAPI int32_t U_EXPORT2 |
|
364 u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) { |
|
365 const char *read = source; |
|
366 int32_t i = 0; |
|
367 unsigned int value = 0; |
|
368 if(sLen == -1) { |
|
369 sLen = (int32_t)strlen(source); |
|
370 } |
|
371 |
|
372 while(read < source+sLen) { |
|
373 sscanf(read, "%2x", &value); |
|
374 if(i < destCapacity) { |
|
375 dest[i] = (char)value; |
|
376 } |
|
377 i++; |
|
378 read += 2; |
|
379 } |
|
380 return u_terminateChars(dest, destCapacity, i, status); |
|
381 } |