1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/tools/toolutil/uparse.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,381 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2000-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: uparse.c 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2000apr18 1.17 +* created by: Markus W. Scherer 1.18 +* 1.19 +* This file provides a parser for files that are delimited by one single 1.20 +* character like ';' or TAB. Example: the Unicode Character Properties files 1.21 +* like UnicodeData.txt are semicolon-delimited. 1.22 +*/ 1.23 + 1.24 +#include "unicode/utypes.h" 1.25 +#include "unicode/uchar.h" 1.26 +#include "unicode/ustring.h" 1.27 +#include "unicode/utf16.h" 1.28 +#include "cstring.h" 1.29 +#include "filestrm.h" 1.30 +#include "uparse.h" 1.31 +#include "ustr_imp.h" 1.32 + 1.33 +#include <stdio.h> 1.34 + 1.35 +U_CAPI const char * U_EXPORT2 1.36 +u_skipWhitespace(const char *s) { 1.37 + while(U_IS_INV_WHITESPACE(*s)) { 1.38 + ++s; 1.39 + } 1.40 + return s; 1.41 +} 1.42 + 1.43 +U_CAPI char * U_EXPORT2 1.44 +u_rtrim(char *s) { 1.45 + char *end=uprv_strchr(s, 0); 1.46 + while(s<end && U_IS_INV_WHITESPACE(*(end-1))) { 1.47 + *--end = 0; 1.48 + } 1.49 + return end; 1.50 +} 1.51 + 1.52 +/* 1.53 + * If the string starts with # @missing: then return the pointer to the 1.54 + * following non-whitespace character. 1.55 + * Otherwise return the original pointer. 1.56 + * Unicode 5.0 adds such lines in some data files to document 1.57 + * default property values. 1.58 + * Poor man's regex for variable amounts of white space. 1.59 + */ 1.60 +static const char * 1.61 +getMissingLimit(const char *s) { 1.62 + const char *s0=s; 1.63 + if( 1.64 + *(s=u_skipWhitespace(s))=='#' && 1.65 + *(s=u_skipWhitespace(s+1))=='@' && 1.66 + 0==strncmp((s=u_skipWhitespace(s+1)), "missing", 7) && 1.67 + *(s=u_skipWhitespace(s+7))==':' 1.68 + ) { 1.69 + return u_skipWhitespace(s+1); 1.70 + } else { 1.71 + return s0; 1.72 + } 1.73 +} 1.74 + 1.75 +U_CAPI void U_EXPORT2 1.76 +u_parseDelimitedFile(const char *filename, char delimiter, 1.77 + char *fields[][2], int32_t fieldCount, 1.78 + UParseLineFn *lineFn, void *context, 1.79 + UErrorCode *pErrorCode) { 1.80 + FileStream *file; 1.81 + char line[300]; 1.82 + char *start, *limit; 1.83 + int32_t i, length; 1.84 + 1.85 + if(U_FAILURE(*pErrorCode)) { 1.86 + return; 1.87 + } 1.88 + 1.89 + if(fields==NULL || lineFn==NULL || fieldCount<=0) { 1.90 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.91 + return; 1.92 + } 1.93 + 1.94 + if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) { 1.95 + filename=NULL; 1.96 + file=T_FileStream_stdin(); 1.97 + } else { 1.98 + file=T_FileStream_open(filename, "r"); 1.99 + } 1.100 + if(file==NULL) { 1.101 + *pErrorCode=U_FILE_ACCESS_ERROR; 1.102 + return; 1.103 + } 1.104 + 1.105 + while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) { 1.106 + /* remove trailing newline characters */ 1.107 + length=(int32_t)(u_rtrim(line)-line); 1.108 + 1.109 + /* 1.110 + * detect a line with # @missing: 1.111 + * start parsing after that, or else from the beginning of the line 1.112 + * set the default warning for @missing lines 1.113 + */ 1.114 + start=(char *)getMissingLimit(line); 1.115 + if(start==line) { 1.116 + *pErrorCode=U_ZERO_ERROR; 1.117 + } else { 1.118 + *pErrorCode=U_USING_DEFAULT_WARNING; 1.119 + } 1.120 + 1.121 + /* skip this line if it is empty or a comment */ 1.122 + if(*start==0 || *start=='#') { 1.123 + continue; 1.124 + } 1.125 + 1.126 + /* remove in-line comments */ 1.127 + limit=uprv_strchr(start, '#'); 1.128 + if(limit!=NULL) { 1.129 + /* get white space before the pound sign */ 1.130 + while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) { 1.131 + --limit; 1.132 + } 1.133 + 1.134 + /* truncate the line */ 1.135 + *limit=0; 1.136 + } 1.137 + 1.138 + /* skip lines with only whitespace */ 1.139 + if(u_skipWhitespace(start)[0]==0) { 1.140 + continue; 1.141 + } 1.142 + 1.143 + /* for each field, call the corresponding field function */ 1.144 + for(i=0; i<fieldCount; ++i) { 1.145 + /* set the limit pointer of this field */ 1.146 + limit=start; 1.147 + while(*limit!=delimiter && *limit!=0) { 1.148 + ++limit; 1.149 + } 1.150 + 1.151 + /* set the field start and limit in the fields array */ 1.152 + fields[i][0]=start; 1.153 + fields[i][1]=limit; 1.154 + 1.155 + /* set start to the beginning of the next field, if any */ 1.156 + start=limit; 1.157 + if(*start!=0) { 1.158 + ++start; 1.159 + } else if(i+1<fieldCount) { 1.160 + *pErrorCode=U_PARSE_ERROR; 1.161 + limit=line+length; 1.162 + i=fieldCount; 1.163 + break; 1.164 + } 1.165 + } 1.166 + 1.167 + /* error in a field function? */ 1.168 + if(U_FAILURE(*pErrorCode)) { 1.169 + break; 1.170 + } 1.171 + 1.172 + /* call the field function */ 1.173 + lineFn(context, fields, fieldCount, pErrorCode); 1.174 + if(U_FAILURE(*pErrorCode)) { 1.175 + break; 1.176 + } 1.177 + } 1.178 + 1.179 + if(filename!=NULL) { 1.180 + T_FileStream_close(file); 1.181 + } 1.182 +} 1.183 + 1.184 +/* 1.185 + * parse a list of code points 1.186 + * store them as a UTF-32 string in dest[destCapacity] 1.187 + * return the number of code points 1.188 + */ 1.189 +U_CAPI int32_t U_EXPORT2 1.190 +u_parseCodePoints(const char *s, 1.191 + uint32_t *dest, int32_t destCapacity, 1.192 + UErrorCode *pErrorCode) { 1.193 + char *end; 1.194 + uint32_t value; 1.195 + int32_t count; 1.196 + 1.197 + if(U_FAILURE(*pErrorCode)) { 1.198 + return 0; 1.199 + } 1.200 + if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) { 1.201 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.202 + return 0; 1.203 + } 1.204 + 1.205 + count=0; 1.206 + for(;;) { 1.207 + s=u_skipWhitespace(s); 1.208 + if(*s==';' || *s==0) { 1.209 + return count; 1.210 + } 1.211 + 1.212 + /* read one code point */ 1.213 + value=(uint32_t)uprv_strtoul(s, &end, 16); 1.214 + if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) { 1.215 + *pErrorCode=U_PARSE_ERROR; 1.216 + return 0; 1.217 + } 1.218 + 1.219 + /* append it to the destination array */ 1.220 + if(count<destCapacity) { 1.221 + dest[count++]=value; 1.222 + } else { 1.223 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.224 + } 1.225 + 1.226 + /* go to the following characters */ 1.227 + s=end; 1.228 + } 1.229 +} 1.230 + 1.231 +/* 1.232 + * parse a list of code points 1.233 + * store them as a string in dest[destCapacity] 1.234 + * set the first code point in *pFirst 1.235 + * @return The length of the string in numbers of UChars. 1.236 + */ 1.237 +U_CAPI int32_t U_EXPORT2 1.238 +u_parseString(const char *s, 1.239 + UChar *dest, int32_t destCapacity, 1.240 + uint32_t *pFirst, 1.241 + UErrorCode *pErrorCode) { 1.242 + char *end; 1.243 + uint32_t value; 1.244 + int32_t destLength; 1.245 + 1.246 + if(U_FAILURE(*pErrorCode)) { 1.247 + return 0; 1.248 + } 1.249 + if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) { 1.250 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.251 + return 0; 1.252 + } 1.253 + 1.254 + if(pFirst!=NULL) { 1.255 + *pFirst=0xffffffff; 1.256 + } 1.257 + 1.258 + destLength=0; 1.259 + for(;;) { 1.260 + s=u_skipWhitespace(s); 1.261 + if(*s==';' || *s==0) { 1.262 + if(destLength<destCapacity) { 1.263 + dest[destLength]=0; 1.264 + } else if(destLength==destCapacity) { 1.265 + *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; 1.266 + } else { 1.267 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.268 + } 1.269 + return destLength; 1.270 + } 1.271 + 1.272 + /* read one code point */ 1.273 + value=(uint32_t)uprv_strtoul(s, &end, 16); 1.274 + if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) { 1.275 + *pErrorCode=U_PARSE_ERROR; 1.276 + return 0; 1.277 + } 1.278 + 1.279 + /* store the first code point */ 1.280 + if(pFirst!=NULL) { 1.281 + *pFirst=value; 1.282 + pFirst=NULL; 1.283 + } 1.284 + 1.285 + /* append it to the destination array */ 1.286 + if((destLength+U16_LENGTH(value))<=destCapacity) { 1.287 + U16_APPEND_UNSAFE(dest, destLength, value); 1.288 + } else { 1.289 + destLength+=U16_LENGTH(value); 1.290 + } 1.291 + 1.292 + /* go to the following characters */ 1.293 + s=end; 1.294 + } 1.295 +} 1.296 + 1.297 +/* read a range like start or start..end */ 1.298 +U_CAPI int32_t U_EXPORT2 1.299 +u_parseCodePointRangeAnyTerminator(const char *s, 1.300 + uint32_t *pStart, uint32_t *pEnd, 1.301 + const char **terminator, 1.302 + UErrorCode *pErrorCode) { 1.303 + char *end; 1.304 + uint32_t value; 1.305 + 1.306 + if(U_FAILURE(*pErrorCode)) { 1.307 + return 0; 1.308 + } 1.309 + if(s==NULL || pStart==NULL || pEnd==NULL) { 1.310 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.311 + return 0; 1.312 + } 1.313 + 1.314 + /* read the start code point */ 1.315 + s=u_skipWhitespace(s); 1.316 + value=(uint32_t)uprv_strtoul(s, &end, 16); 1.317 + if(end<=s || value>=0x110000) { 1.318 + *pErrorCode=U_PARSE_ERROR; 1.319 + return 0; 1.320 + } 1.321 + *pStart=*pEnd=value; 1.322 + 1.323 + /* is there a "..end"? */ 1.324 + s=u_skipWhitespace(end); 1.325 + if(*s!='.' || s[1]!='.') { 1.326 + *terminator=end; 1.327 + return 1; 1.328 + } 1.329 + s=u_skipWhitespace(s+2); 1.330 + 1.331 + /* read the end code point */ 1.332 + value=(uint32_t)uprv_strtoul(s, &end, 16); 1.333 + if(end<=s || value>=0x110000) { 1.334 + *pErrorCode=U_PARSE_ERROR; 1.335 + return 0; 1.336 + } 1.337 + *pEnd=value; 1.338 + 1.339 + /* is this a valid range? */ 1.340 + if(value<*pStart) { 1.341 + *pErrorCode=U_PARSE_ERROR; 1.342 + return 0; 1.343 + } 1.344 + 1.345 + *terminator=end; 1.346 + return value-*pStart+1; 1.347 +} 1.348 + 1.349 +U_CAPI int32_t U_EXPORT2 1.350 +u_parseCodePointRange(const char *s, 1.351 + uint32_t *pStart, uint32_t *pEnd, 1.352 + UErrorCode *pErrorCode) { 1.353 + const char *terminator; 1.354 + int32_t rangeLength= 1.355 + u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode); 1.356 + if(U_SUCCESS(*pErrorCode)) { 1.357 + terminator=u_skipWhitespace(terminator); 1.358 + if(*terminator!=';' && *terminator!=0) { 1.359 + *pErrorCode=U_PARSE_ERROR; 1.360 + return 0; 1.361 + } 1.362 + } 1.363 + return rangeLength; 1.364 +} 1.365 + 1.366 +U_CAPI int32_t U_EXPORT2 1.367 +u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) { 1.368 + const char *read = source; 1.369 + int32_t i = 0; 1.370 + unsigned int value = 0; 1.371 + if(sLen == -1) { 1.372 + sLen = (int32_t)strlen(source); 1.373 + } 1.374 + 1.375 + while(read < source+sLen) { 1.376 + sscanf(read, "%2x", &value); 1.377 + if(i < destCapacity) { 1.378 + dest[i] = (char)value; 1.379 + } 1.380 + i++; 1.381 + read += 2; 1.382 + } 1.383 + return u_terminateChars(dest, destCapacity, i, status); 1.384 +}