|
1 /* |
|
2 ******************************************************************************* |
|
3 * Copyright (C) 2011-2013, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ******************************************************************************* |
|
6 * file name: ppucd.h |
|
7 * encoding: US-ASCII |
|
8 * tab size: 8 (not used) |
|
9 * indentation:4 |
|
10 * |
|
11 * created on: 2011dec11 |
|
12 * created by: Markus W. Scherer |
|
13 */ |
|
14 |
|
15 #ifndef __PPUCD_H__ |
|
16 #define __PPUCD_H__ |
|
17 |
|
18 #include "unicode/utypes.h" |
|
19 #include "unicode/uniset.h" |
|
20 #include "unicode/unistr.h" |
|
21 |
|
22 #include <stdio.h> |
|
23 |
|
24 /** Additions to the uchar.h enum UProperty. */ |
|
25 enum { |
|
26 /** Name_Alias */ |
|
27 PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT, |
|
28 PPUCD_CONDITIONAL_CASE_MAPPINGS, |
|
29 PPUCD_TURKIC_CASE_FOLDING |
|
30 }; |
|
31 |
|
32 U_NAMESPACE_BEGIN |
|
33 |
|
34 class U_TOOLUTIL_API PropertyNames { |
|
35 public: |
|
36 virtual ~PropertyNames(); |
|
37 virtual int32_t getPropertyEnum(const char *name) const; |
|
38 virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const; |
|
39 }; |
|
40 |
|
41 struct U_TOOLUTIL_API UniProps { |
|
42 UniProps(); |
|
43 ~UniProps(); |
|
44 |
|
45 int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; } |
|
46 |
|
47 UChar32 start, end; |
|
48 UBool binProps[UCHAR_BINARY_LIMIT]; |
|
49 int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]; |
|
50 UVersionInfo age; |
|
51 UChar32 bmg, bpb; |
|
52 UChar32 scf, slc, stc, suc; |
|
53 int32_t digitValue; |
|
54 const char *numericValue; |
|
55 const char *name; |
|
56 const char *nameAlias; |
|
57 UnicodeString cf, lc, tc, uc; |
|
58 UnicodeSet scx; |
|
59 }; |
|
60 |
|
61 class U_TOOLUTIL_API PreparsedUCD { |
|
62 public: |
|
63 enum LineType { |
|
64 /** No line, end of file. */ |
|
65 NO_LINE, |
|
66 /** Empty line. (Might contain a comment.) */ |
|
67 EMPTY_LINE, |
|
68 |
|
69 /** ucd;6.1.0 */ |
|
70 UNICODE_VERSION_LINE, |
|
71 |
|
72 /** property;Binary;Alpha;Alphabetic */ |
|
73 PROPERTY_LINE, |
|
74 /** binary;N;No;F;False */ |
|
75 BINARY_LINE, |
|
76 /** value;gc;Zs;Space_Separator */ |
|
77 VALUE_LINE, |
|
78 |
|
79 /** defaults;0000..10FFFF;age=NA;bc=L;... */ |
|
80 DEFAULTS_LINE, |
|
81 /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */ |
|
82 BLOCK_LINE, |
|
83 /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */ |
|
84 CP_LINE, |
|
85 |
|
86 /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */ |
|
87 ALG_NAMES_RANGE_LINE, |
|
88 |
|
89 LINE_TYPE_COUNT |
|
90 }; |
|
91 |
|
92 /** |
|
93 * Constructor. |
|
94 * Prepare this object for a new, empty package. |
|
95 */ |
|
96 PreparsedUCD(const char *filename, UErrorCode &errorCode); |
|
97 |
|
98 /** Destructor. */ |
|
99 ~PreparsedUCD(); |
|
100 |
|
101 /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */ |
|
102 void setPropertyNames(const PropertyNames *pn) { pnames=pn; } |
|
103 |
|
104 /** |
|
105 * Reads a line from the preparsed UCD file. |
|
106 * Splits the line by replacing each ';' with a NUL. |
|
107 */ |
|
108 LineType readLine(UErrorCode &errorCode); |
|
109 |
|
110 /** Returns the number of the line read by readLine(). */ |
|
111 int32_t getLineNumber() const { return lineNumber; } |
|
112 |
|
113 /** Returns the line's next field, or NULL. */ |
|
114 const char *nextField(); |
|
115 |
|
116 /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */ |
|
117 const UVersionInfo &getUnicodeVersion() const { return ucdVersion; } |
|
118 |
|
119 /** Returns TRUE if the current line has property values. */ |
|
120 UBool lineHasPropertyValues() const { return DEFAULTS_LINE<=lineType && lineType<=CP_LINE; } |
|
121 |
|
122 /** |
|
123 * Parses properties from the current line. |
|
124 * Clears newValues and sets UProperty codes for property values mentioned |
|
125 * on the current line (as opposed to being inherited). |
|
126 * Returns a pointer to the filled-in UniProps, or NULL if something went wrong. |
|
127 * The returned UniProps are usable until the next line of the same type is read. |
|
128 */ |
|
129 const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode); |
|
130 |
|
131 /** |
|
132 * Returns the code point range for the current algnamesrange line. |
|
133 * Calls & parses nextField(). |
|
134 * Further nextField() calls will yield the range's type & prefix string. |
|
135 * Returns U_SUCCESS(errorCode). |
|
136 */ |
|
137 UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode); |
|
138 |
|
139 private: |
|
140 UBool isLineBufferAvailable(int32_t i) { |
|
141 return defaultLineIndex!=i && blockLineIndex!=i; |
|
142 } |
|
143 |
|
144 /** Resets the field iterator and returns the line's first field (the line type field). */ |
|
145 const char *firstField(); |
|
146 |
|
147 UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues, |
|
148 UErrorCode &errorCode); |
|
149 UChar32 parseCodePoint(const char *s, UErrorCode &errorCode); |
|
150 UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode); |
|
151 void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode); |
|
152 void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode); |
|
153 |
|
154 static const int32_t kNumLineBuffers=3; |
|
155 |
|
156 PropertyNames *icuPnames; // owned |
|
157 const PropertyNames *pnames; // aliased |
|
158 FILE *file; |
|
159 int32_t defaultLineIndex, blockLineIndex, lineIndex; |
|
160 int32_t lineNumber; |
|
161 LineType lineType; |
|
162 char *fieldLimit; |
|
163 char *lineLimit; |
|
164 |
|
165 UVersionInfo ucdVersion; |
|
166 UniProps defaultProps, blockProps, cpProps; |
|
167 // Multiple lines so that default and block properties can maintain pointers |
|
168 // into their line buffers. |
|
169 char lines[kNumLineBuffers][4096]; |
|
170 }; |
|
171 |
|
172 U_NAMESPACE_END |
|
173 |
|
174 #endif // __PPUCD_H__ |