|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2004-2005, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: xmlparser.h |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2004jul21 |
|
14 * created by: Andy Heninger |
|
15 * |
|
16 * Tiny XML parser using ICU and intended for use in ICU tests and in build tools. |
|
17 * Not suitable for production use. Not supported. |
|
18 * Not conformant. Not efficient. |
|
19 * But very small. |
|
20 */ |
|
21 |
|
22 #ifndef __XMLPARSER_H__ |
|
23 #define __XMLPARSER_H__ |
|
24 |
|
25 #include "unicode/uobject.h" |
|
26 #include "unicode/unistr.h" |
|
27 #include "unicode/regex.h" |
|
28 #include "uvector.h" |
|
29 #include "hash.h" |
|
30 |
|
31 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION |
|
32 |
|
/**
 * Type tag for a child node of a UXMLElement, as reported through the
 * type parameter of UXMLElement::getChild().
 */
enum UXMLNodeType {
    /** Node type string (text contents), stored as a UnicodeString. */
    UXML_NODE_TYPE_STRING,
    /** Node type element, stored as a UXMLElement. */
    UXML_NODE_TYPE_ELEMENT,
    /** Number of node types above; always the last enumerator, not a node type itself. */
    UXML_NODE_TYPE_COUNT
};
|
40 |
|
41 U_NAMESPACE_BEGIN |
|
42 |
|
43 class UXMLParser; |
|
44 |
|
45 /** |
|
46 * This class represents an element node in a parsed XML tree. |
|
47 */ |
|
48 class U_TOOLUTIL_API UXMLElement : public UObject { |
|
49 public: |
|
50 /** |
|
51 * Destructor. |
|
52 */ |
|
53 virtual ~UXMLElement(); |
|
54 |
|
55 /** |
|
56 * Get the tag name of this element. |
|
57 */ |
|
58 const UnicodeString &getTagName() const; |
|
59 /** |
|
60 * Get the text contents of the element. |
|
61 * Append the contents of all text child nodes. |
|
62 * @param recurse If TRUE, also recursively appends the contents of all |
|
63 * text child nodes of element children. |
|
64 * @return The text contents. |
|
65 */ |
|
66 UnicodeString getText(UBool recurse) const; |
|
67 /** |
|
68 * Get the number of attributes. |
|
69 */ |
|
70 int32_t countAttributes() const; |
|
71 /** |
|
72 * Get the i-th attribute. |
|
73 * @param i Index of the attribute. |
|
74 * @param name Output parameter, receives the attribute name. |
|
75 * @param value Output parameter, receives the attribute value. |
|
76 * @return A pointer to the attribute value (may be &value or a pointer to an |
|
77 * internal string object), or NULL if i is out of bounds. |
|
78 */ |
|
79 const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const; |
|
80 /** |
|
81 * Get the value of the attribute with the given name. |
|
82 * @param name Attribute name to be looked up. |
|
83 * @return A pointer to the attribute value, or NULL if this element |
|
84 * does not have this attribute. |
|
85 */ |
|
86 const UnicodeString *getAttribute(const UnicodeString &name) const; |
|
87 /** |
|
88 * Get the number of child nodes. |
|
89 */ |
|
90 int32_t countChildren() const; |
|
91 /** |
|
92 * Get the i-th child node. |
|
93 * @param i Index of the child node. |
|
94 * @param type The child node type. |
|
95 * @return A pointer to the child node object, or NULL if i is out of bounds. |
|
96 */ |
|
97 const UObject *getChild(int32_t i, UXMLNodeType &type) const; |
|
98 /** |
|
99 * Get the next child element node, skipping non-element child nodes. |
|
100 * @param i Enumeration index; initialize to 0 before getting the first child element. |
|
101 * @return A pointer to the next child element, or NULL if there is none. |
|
102 */ |
|
103 const UXMLElement *nextChildElement(int32_t &i) const; |
|
104 /** |
|
105 * Get the immediate child element with the given name. |
|
106 * If there are multiple child elements with this name, then return |
|
107 * the first one. |
|
108 * @param name Element name to be looked up. |
|
109 * @return A pointer to the element node, or NULL if this element |
|
110 * does not have this immediate child element. |
|
111 */ |
|
112 const UXMLElement *getChildElement(const UnicodeString &name) const; |
|
113 |
|
114 /** |
|
115 * ICU "poor man's RTTI", returns a UClassID for the actual class. |
|
116 */ |
|
117 virtual UClassID getDynamicClassID() const; |
|
118 |
|
119 /** |
|
120 * ICU "poor man's RTTI", returns a UClassID for this class. |
|
121 */ |
|
122 static UClassID U_EXPORT2 getStaticClassID(); |
|
123 |
|
124 private: |
|
125 // prevent default construction etc. |
|
126 UXMLElement(); |
|
127 UXMLElement(const UXMLElement &other); |
|
128 UXMLElement &operator=(const UXMLElement &other); |
|
129 |
|
130 void appendText(UnicodeString &text, UBool recurse) const; |
|
131 |
|
132 friend class UXMLParser; |
|
133 |
|
134 UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode); |
|
135 |
|
136 const UXMLParser *fParser; |
|
137 const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser) |
|
138 UnicodeString fContent; // The text content of this node. All element content is |
|
139 // concatenated even when there are intervening nested elements |
|
140 // (which doesn't happen with most xml files we care about) |
|
141 // Sections of content containing only white space are dropped, |
|
142 // which gets rid the bogus white space content from |
|
143 // elements which are primarily containers for nested elements. |
|
144 UVector fAttNames; // A vector containing the names of this element's attributes |
|
145 // The names are UnicodeString objects, owned by the UXMLParser. |
|
146 UVector fAttValues; // A vector containing the attribute values for |
|
147 // this element's attributes. The order is the same |
|
148 // as that of the attribute name vector. |
|
149 |
|
150 UVector fChildren; // The child nodes of this element (a Vector) |
|
151 |
|
152 UXMLElement *fParent; // A pointer to the parent element of this element. |
|
153 }; |
|
154 |
|
155 /** |
|
156 * A simple XML parser; it is neither efficient nor conformant and only useful for |
|
157 * restricted types of XML documents. |
|
158 * |
|
159 * The parse methods parse whole documents and return the parse trees via their |
|
160 * root elements. |
|
161 */ |
|
162 class U_TOOLUTIL_API UXMLParser : public UObject { |
|
163 public: |
|
164 /** |
|
165 * Create an XML parser. |
|
166 */ |
|
167 static UXMLParser *createParser(UErrorCode &errorCode); |
|
168 /** |
|
169 * Destructor. |
|
170 */ |
|
171 virtual ~UXMLParser(); |
|
172 |
|
173 /** |
|
174 * Parse an XML document, create the entire document tree, and |
|
175 * return a pointer to the root element of the parsed tree. |
|
176 * The caller must delete the element. |
|
177 */ |
|
178 UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode); |
|
179 /** |
|
180 * Parse an XML file, create the entire document tree, and |
|
181 * return a pointer to the root element of the parsed tree. |
|
182 * The caller must delete the element. |
|
183 */ |
|
184 UXMLElement *parseFile(const char *filename, UErrorCode &errorCode); |
|
185 |
|
186 /** |
|
187 * ICU "poor man's RTTI", returns a UClassID for the actual class. |
|
188 */ |
|
189 virtual UClassID getDynamicClassID() const; |
|
190 |
|
191 /** |
|
192 * ICU "poor man's RTTI", returns a UClassID for this class. |
|
193 */ |
|
194 static UClassID U_EXPORT2 getStaticClassID(); |
|
195 |
|
196 private: |
|
197 // prevent default construction etc. |
|
198 UXMLParser(); |
|
199 UXMLParser(const UXMLParser &other); |
|
200 UXMLParser &operator=(const UXMLParser &other); |
|
201 |
|
202 // constructor |
|
203 UXMLParser(UErrorCode &status); |
|
204 |
|
205 void parseMisc(UErrorCode &status); |
|
206 UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status); |
|
207 void error(const char *message, UErrorCode &status); |
|
208 UnicodeString scanContent(UErrorCode &status); |
|
209 void replaceCharRefs(UnicodeString &s, UErrorCode &status); |
|
210 |
|
211 const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode); |
|
212 public: |
|
213 // public for UXMLElement only |
|
214 const UnicodeString *findName(const UnicodeString &s) const; |
|
215 private: |
|
216 |
|
217 // There is one ICU regex matcher for each of the major XML syntax items |
|
218 // that are recognized. |
|
219 RegexMatcher mXMLDecl; |
|
220 RegexMatcher mXMLComment; |
|
221 RegexMatcher mXMLSP; |
|
222 RegexMatcher mXMLDoctype; |
|
223 RegexMatcher mXMLPI; |
|
224 RegexMatcher mXMLElemStart; |
|
225 RegexMatcher mXMLElemEnd; |
|
226 RegexMatcher mXMLElemEmpty; |
|
227 RegexMatcher mXMLCharData; |
|
228 RegexMatcher mAttrValue; |
|
229 RegexMatcher mAttrNormalizer; |
|
230 RegexMatcher mNewLineNormalizer; |
|
231 RegexMatcher mAmps; |
|
232 |
|
233 Hashtable fNames; // interned element/attribute name strings |
|
234 UStack fElementStack; // Stack holds the parent elements when nested |
|
235 // elements are being parsed. All items on this |
|
236 // stack are of type UXMLElement. |
|
237 int32_t fPos; // String index of the current scan position in |
|
238 // xml source (in fSrc). |
|
239 UnicodeString fOneLF; |
|
240 }; |
|
241 |
|
242 U_NAMESPACE_END |
|
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
|
244 |
|
245 #endif |