1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/tools/toolutil/xmlparser.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,245 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2004-2005, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: xmlparser.h 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2004jul21 1.17 +* created by: Andy Heninger 1.18 +* 1.19 +* Tiny XML parser using ICU and intended for use in ICU tests and in build tools. 1.20 +* Not suitable for production use. Not supported. 1.21 +* Not conformant. Not efficient. 1.22 +* But very small. 1.23 +*/ 1.24 + 1.25 +#ifndef __XMLPARSER_H__ 1.26 +#define __XMLPARSER_H__ 1.27 + 1.28 +#include "unicode/uobject.h" 1.29 +#include "unicode/unistr.h" 1.30 +#include "unicode/regex.h" 1.31 +#include "uvector.h" 1.32 +#include "hash.h" 1.33 + 1.34 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION 1.35 + 1.36 +enum UXMLNodeType { 1.37 + /** Node type string (text contents), stored as a UnicodeString. */ 1.38 + UXML_NODE_TYPE_STRING, 1.39 + /** Node type element, stored as a UXMLElement. */ 1.40 + UXML_NODE_TYPE_ELEMENT, 1.41 + UXML_NODE_TYPE_COUNT 1.42 +}; 1.43 + 1.44 +U_NAMESPACE_BEGIN 1.45 + 1.46 +class UXMLParser; 1.47 + 1.48 +/** 1.49 + * This class represents an element node in a parsed XML tree. 1.50 + */ 1.51 +class U_TOOLUTIL_API UXMLElement : public UObject { 1.52 +public: 1.53 + /** 1.54 + * Destructor. 1.55 + */ 1.56 + virtual ~UXMLElement(); 1.57 + 1.58 + /** 1.59 + * Get the tag name of this element. 1.60 + */ 1.61 + const UnicodeString &getTagName() const; 1.62 + /** 1.63 + * Get the text contents of the element. 1.64 + * Append the contents of all text child nodes. 1.65 + * @param recurse If TRUE, also recursively appends the contents of all 1.66 + * text child nodes of element children. 1.67 + * @return The text contents. 1.68 + */ 1.69 + UnicodeString getText(UBool recurse) const; 1.70 + /** 1.71 + * Get the number of attributes. 1.72 + */ 1.73 + int32_t countAttributes() const; 1.74 + /** 1.75 + * Get the i-th attribute. 1.76 + * @param i Index of the attribute. 1.77 + * @param name Output parameter, receives the attribute name. 1.78 + * @param value Output parameter, receives the attribute value. 1.79 + * @return A pointer to the attribute value (may be &value or a pointer to an 1.80 + * internal string object), or NULL if i is out of bounds. 1.81 + */ 1.82 + const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const; 1.83 + /** 1.84 + * Get the value of the attribute with the given name. 1.85 + * @param name Attribute name to be looked up. 1.86 + * @return A pointer to the attribute value, or NULL if this element 1.87 + * does not have this attribute. 1.88 + */ 1.89 + const UnicodeString *getAttribute(const UnicodeString &name) const; 1.90 + /** 1.91 + * Get the number of child nodes. 1.92 + */ 1.93 + int32_t countChildren() const; 1.94 + /** 1.95 + * Get the i-th child node. 1.96 + * @param i Index of the child node. 1.97 + * @param type The child node type. 1.98 + * @return A pointer to the child node object, or NULL if i is out of bounds. 1.99 + */ 1.100 + const UObject *getChild(int32_t i, UXMLNodeType &type) const; 1.101 + /** 1.102 + * Get the next child element node, skipping non-element child nodes. 1.103 + * @param i Enumeration index; initialize to 0 before getting the first child element. 1.104 + * @return A pointer to the next child element, or NULL if there is none. 1.105 + */ 1.106 + const UXMLElement *nextChildElement(int32_t &i) const; 1.107 + /** 1.108 + * Get the immediate child element with the given name. 1.109 + * If there are multiple child elements with this name, then return 1.110 + * the first one. 1.111 + * @param name Element name to be looked up. 1.112 + * @return A pointer to the element node, or NULL if this element 1.113 + * does not have this immediate child element. 1.114 + */ 1.115 + const UXMLElement *getChildElement(const UnicodeString &name) const; 1.116 + 1.117 + /** 1.118 + * ICU "poor man's RTTI", returns a UClassID for the actual class. 1.119 + */ 1.120 + virtual UClassID getDynamicClassID() const; 1.121 + 1.122 + /** 1.123 + * ICU "poor man's RTTI", returns a UClassID for this class. 1.124 + */ 1.125 + static UClassID U_EXPORT2 getStaticClassID(); 1.126 + 1.127 +private: 1.128 + // prevent default construction etc. 1.129 + UXMLElement(); 1.130 + UXMLElement(const UXMLElement &other); 1.131 + UXMLElement &operator=(const UXMLElement &other); 1.132 + 1.133 + void appendText(UnicodeString &text, UBool recurse) const; 1.134 + 1.135 + friend class UXMLParser; 1.136 + 1.137 + UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode); 1.138 + 1.139 + const UXMLParser *fParser; 1.140 + const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser) 1.141 + UnicodeString fContent; // The text content of this node. All element content is 1.142 + // concatenated even when there are intervening nested elements 1.143 + // (which doesn't happen with most xml files we care about) 1.144 + // Sections of content containing only white space are dropped, 1.145 + // which gets rid the bogus white space content from 1.146 + // elements which are primarily containers for nested elements. 1.147 + UVector fAttNames; // A vector containing the names of this element's attributes 1.148 + // The names are UnicodeString objects, owned by the UXMLParser. 1.149 + UVector fAttValues; // A vector containing the attribute values for 1.150 + // this element's attributes. The order is the same 1.151 + // as that of the attribute name vector. 1.152 + 1.153 + UVector fChildren; // The child nodes of this element (a Vector) 1.154 + 1.155 + UXMLElement *fParent; // A pointer to the parent element of this element. 1.156 +}; 1.157 + 1.158 +/** 1.159 + * A simple XML parser; it is neither efficient nor conformant and only useful for 1.160 + * restricted types of XML documents. 1.161 + * 1.162 + * The parse methods parse whole documents and return the parse trees via their 1.163 + * root elements. 1.164 + */ 1.165 +class U_TOOLUTIL_API UXMLParser : public UObject { 1.166 +public: 1.167 + /** 1.168 + * Create an XML parser. 1.169 + */ 1.170 + static UXMLParser *createParser(UErrorCode &errorCode); 1.171 + /** 1.172 + * Destructor. 1.173 + */ 1.174 + virtual ~UXMLParser(); 1.175 + 1.176 + /** 1.177 + * Parse an XML document, create the entire document tree, and 1.178 + * return a pointer to the root element of the parsed tree. 1.179 + * The caller must delete the element. 1.180 + */ 1.181 + UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode); 1.182 + /** 1.183 + * Parse an XML file, create the entire document tree, and 1.184 + * return a pointer to the root element of the parsed tree. 1.185 + * The caller must delete the element. 1.186 + */ 1.187 + UXMLElement *parseFile(const char *filename, UErrorCode &errorCode); 1.188 + 1.189 + /** 1.190 + * ICU "poor man's RTTI", returns a UClassID for the actual class. 1.191 + */ 1.192 + virtual UClassID getDynamicClassID() const; 1.193 + 1.194 + /** 1.195 + * ICU "poor man's RTTI", returns a UClassID for this class. 1.196 + */ 1.197 + static UClassID U_EXPORT2 getStaticClassID(); 1.198 + 1.199 +private: 1.200 + // prevent default construction etc. 1.201 + UXMLParser(); 1.202 + UXMLParser(const UXMLParser &other); 1.203 + UXMLParser &operator=(const UXMLParser &other); 1.204 + 1.205 + // constructor 1.206 + UXMLParser(UErrorCode &status); 1.207 + 1.208 + void parseMisc(UErrorCode &status); 1.209 + UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status); 1.210 + void error(const char *message, UErrorCode &status); 1.211 + UnicodeString scanContent(UErrorCode &status); 1.212 + void replaceCharRefs(UnicodeString &s, UErrorCode &status); 1.213 + 1.214 + const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode); 1.215 +public: 1.216 + // public for UXMLElement only 1.217 + const UnicodeString *findName(const UnicodeString &s) const; 1.218 +private: 1.219 + 1.220 + // There is one ICU regex matcher for each of the major XML syntax items 1.221 + // that are recognized. 1.222 + RegexMatcher mXMLDecl; 1.223 + RegexMatcher mXMLComment; 1.224 + RegexMatcher mXMLSP; 1.225 + RegexMatcher mXMLDoctype; 1.226 + RegexMatcher mXMLPI; 1.227 + RegexMatcher mXMLElemStart; 1.228 + RegexMatcher mXMLElemEnd; 1.229 + RegexMatcher mXMLElemEmpty; 1.230 + RegexMatcher mXMLCharData; 1.231 + RegexMatcher mAttrValue; 1.232 + RegexMatcher mAttrNormalizer; 1.233 + RegexMatcher mNewLineNormalizer; 1.234 + RegexMatcher mAmps; 1.235 + 1.236 + Hashtable fNames; // interned element/attribute name strings 1.237 + UStack fElementStack; // Stack holds the parent elements when nested 1.238 + // elements are being parsed. All items on this 1.239 + // stack are of type UXMLElement. 1.240 + int32_t fPos; // String index of the current scan position in 1.241 + // xml source (in fSrc). 1.242 + UnicodeString fOneLF; 1.243 +}; 1.244 + 1.245 +U_NAMESPACE_END 1.246 +#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 1.247 + 1.248 +#endif