michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2000-2010, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: uparse.h michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2000apr18 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * This file provides a parser for files that are delimited by one single michael@0: * character like ';' or TAB. Example: the Unicode Character Properties files michael@0: * like UnicodeData.txt are semicolon-delimited. michael@0: */ michael@0: michael@0: #ifndef __UPARSE_H__ michael@0: #define __UPARSE_H__ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: /** michael@0: * Is c an invariant-character whitespace? michael@0: * @param c invariant character michael@0: */ michael@0: #define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n') michael@0: michael@0: U_CDECL_BEGIN michael@0: michael@0: /** michael@0: * Skip space ' ' and TAB '\t' characters. michael@0: * michael@0: * @param s Pointer to characters. michael@0: * @return Pointer to first character at or after s that is not a space or TAB. michael@0: */ michael@0: U_CAPI const char * U_EXPORT2 michael@0: u_skipWhitespace(const char *s); michael@0: michael@0: /** michael@0: * Trim whitespace (including line endings) from the end of the string. michael@0: * michael@0: * @param s Pointer to the string. michael@0: * @return Pointer to the new end of the string. michael@0: */ michael@0: U_CAPI char * U_EXPORT2 michael@0: u_rtrim(char *s); michael@0: michael@0: /** Function type for u_parseDelimitedFile(). */ michael@0: typedef void U_CALLCONV michael@0: UParseLineFn(void *context, michael@0: char *fields[][2], michael@0: int32_t fieldCount, michael@0: UErrorCode *pErrorCode); michael@0: michael@0: /** michael@0: * Parser for files that are similar to UnicodeData.txt: michael@0: * This function opens the file and reads it line by line. It skips empty lines michael@0: * and comment lines that start with a '#'. michael@0: * All other lines are separated into fields with one delimiter character michael@0: * (semicolon for Unicode Properties files) between two fields. The last field in michael@0: * a line does not need to be terminated with a delimiter. michael@0: * michael@0: * For each line, after segmenting it, a line function is called. michael@0: * It gets passed the array of field start and limit pointers that is michael@0: * passed into this parser and filled by it for each line. michael@0: * For each field i of the line, the start pointer in fields[i][0] michael@0: * points to the beginning of the field, while the limit pointer in fields[i][1] michael@0: * points behind the field, i.e., to the delimiter or the line end. michael@0: * michael@0: * The context parameter of the line function is michael@0: * the same as the one for the parse function. michael@0: * michael@0: * The line function may modify the contents of the fields including the michael@0: * limit characters. michael@0: * michael@0: * If the file cannot be opened, or there is a parsing error or a field function michael@0: * sets *pErrorCode, then the parser returns with *pErrorCode set to an error code. michael@0: */ michael@0: U_CAPI void U_EXPORT2 michael@0: u_parseDelimitedFile(const char *filename, char delimiter, michael@0: char *fields[][2], int32_t fieldCount, michael@0: UParseLineFn *lineFn, void *context, michael@0: UErrorCode *pErrorCode); michael@0: michael@0: /** michael@0: * Parse a string of code points like 0061 0308 0300. michael@0: * s must end with either ';' or NUL. michael@0: * michael@0: * @return Number of code points. michael@0: */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_parseCodePoints(const char *s, michael@0: uint32_t *dest, int32_t destCapacity, michael@0: UErrorCode *pErrorCode); michael@0: michael@0: /** michael@0: * Parse a list of code points like 0061 0308 0300 michael@0: * into a UChar * string. michael@0: * s must end with either ';' or NUL. michael@0: * michael@0: * Set the first code point in *pFirst. michael@0: * michael@0: * @param s Input char * string. michael@0: * @param dest Output string buffer. michael@0: * @param destCapacity Capacity of dest in numbers of UChars. michael@0: * @param pFirst If pFirst!=NULL the *pFirst will be set to the first michael@0: * code point in the string. michael@0: * @param pErrorCode ICU error code. michael@0: * @return The length of the string in numbers of UChars. michael@0: */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_parseString(const char *s, michael@0: UChar *dest, int32_t destCapacity, michael@0: uint32_t *pFirst, michael@0: UErrorCode *pErrorCode); michael@0: michael@0: /** michael@0: * Parse a code point range like michael@0: * 0085 or michael@0: * 4E00..9FA5. michael@0: * michael@0: * s must contain such a range and end with either ';' or NUL. michael@0: * michael@0: * @return Length of code point range, end-start+1 michael@0: */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_parseCodePointRange(const char *s, michael@0: uint32_t *pStart, uint32_t *pEnd, michael@0: UErrorCode *pErrorCode); michael@0: michael@0: /** michael@0: * Same as u_parseCodePointRange() but the range may be terminated by michael@0: * any character. The position of the terminating character is returned via michael@0: * the *terminator output parameter. michael@0: */ michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_parseCodePointRangeAnyTerminator(const char *s, michael@0: uint32_t *pStart, uint32_t *pEnd, michael@0: const char **terminator, michael@0: UErrorCode *pErrorCode); michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status); michael@0: michael@0: U_CDECL_END michael@0: michael@0: #endif