michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2004-2005, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: xmlparser.h michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2004jul21 michael@0: * created by: Andy Heninger michael@0: * michael@0: * Tiny XML parser using ICU and intended for use in ICU tests and in build tools. michael@0: * Not suitable for production use. Not supported. michael@0: * Not conformant. Not efficient. michael@0: * But very small. michael@0: */ michael@0: michael@0: #ifndef __XMLPARSER_H__ michael@0: #define __XMLPARSER_H__ michael@0: michael@0: #include "unicode/uobject.h" michael@0: #include "unicode/unistr.h" michael@0: #include "unicode/regex.h" michael@0: #include "uvector.h" michael@0: #include "hash.h" michael@0: michael@0: #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION michael@0: michael@0: enum UXMLNodeType { michael@0: /** Node type string (text contents), stored as a UnicodeString. */ michael@0: UXML_NODE_TYPE_STRING, michael@0: /** Node type element, stored as a UXMLElement. */ michael@0: UXML_NODE_TYPE_ELEMENT, michael@0: UXML_NODE_TYPE_COUNT michael@0: }; michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: class UXMLParser; michael@0: michael@0: /** michael@0: * This class represents an element node in a parsed XML tree. michael@0: */ michael@0: class U_TOOLUTIL_API UXMLElement : public UObject { michael@0: public: michael@0: /** michael@0: * Destructor. michael@0: */ michael@0: virtual ~UXMLElement(); michael@0: michael@0: /** michael@0: * Get the tag name of this element. michael@0: */ michael@0: const UnicodeString &getTagName() const; michael@0: /** michael@0: * Get the text contents of the element. michael@0: * Append the contents of all text child nodes. michael@0: * @param recurse If TRUE, also recursively appends the contents of all michael@0: * text child nodes of element children. michael@0: * @return The text contents. michael@0: */ michael@0: UnicodeString getText(UBool recurse) const; michael@0: /** michael@0: * Get the number of attributes. michael@0: */ michael@0: int32_t countAttributes() const; michael@0: /** michael@0: * Get the i-th attribute. michael@0: * @param i Index of the attribute. michael@0: * @param name Output parameter, receives the attribute name. michael@0: * @param value Output parameter, receives the attribute value. michael@0: * @return A pointer to the attribute value (may be &value or a pointer to an michael@0: * internal string object), or NULL if i is out of bounds. michael@0: */ michael@0: const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const; michael@0: /** michael@0: * Get the value of the attribute with the given name. michael@0: * @param name Attribute name to be looked up. michael@0: * @return A pointer to the attribute value, or NULL if this element michael@0: * does not have this attribute. michael@0: */ michael@0: const UnicodeString *getAttribute(const UnicodeString &name) const; michael@0: /** michael@0: * Get the number of child nodes. michael@0: */ michael@0: int32_t countChildren() const; michael@0: /** michael@0: * Get the i-th child node. michael@0: * @param i Index of the child node. michael@0: * @param type The child node type. michael@0: * @return A pointer to the child node object, or NULL if i is out of bounds. michael@0: */ michael@0: const UObject *getChild(int32_t i, UXMLNodeType &type) const; michael@0: /** michael@0: * Get the next child element node, skipping non-element child nodes. michael@0: * @param i Enumeration index; initialize to 0 before getting the first child element. michael@0: * @return A pointer to the next child element, or NULL if there is none. michael@0: */ michael@0: const UXMLElement *nextChildElement(int32_t &i) const; michael@0: /** michael@0: * Get the immediate child element with the given name. michael@0: * If there are multiple child elements with this name, then return michael@0: * the first one. michael@0: * @param name Element name to be looked up. michael@0: * @return A pointer to the element node, or NULL if this element michael@0: * does not have this immediate child element. michael@0: */ michael@0: const UXMLElement *getChildElement(const UnicodeString &name) const; michael@0: michael@0: /** michael@0: * ICU "poor man's RTTI", returns a UClassID for the actual class. michael@0: */ michael@0: virtual UClassID getDynamicClassID() const; michael@0: michael@0: /** michael@0: * ICU "poor man's RTTI", returns a UClassID for this class. michael@0: */ michael@0: static UClassID U_EXPORT2 getStaticClassID(); michael@0: michael@0: private: michael@0: // prevent default construction etc. michael@0: UXMLElement(); michael@0: UXMLElement(const UXMLElement &other); michael@0: UXMLElement &operator=(const UXMLElement &other); michael@0: michael@0: void appendText(UnicodeString &text, UBool recurse) const; michael@0: michael@0: friend class UXMLParser; michael@0: michael@0: UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode); michael@0: michael@0: const UXMLParser *fParser; michael@0: const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser) michael@0: UnicodeString fContent; // The text content of this node. All element content is michael@0: // concatenated even when there are intervening nested elements michael@0: // (which doesn't happen with most xml files we care about) michael@0: // Sections of content containing only white space are dropped, michael@0: // which gets rid the bogus white space content from michael@0: // elements which are primarily containers for nested elements. michael@0: UVector fAttNames; // A vector containing the names of this element's attributes michael@0: // The names are UnicodeString objects, owned by the UXMLParser. michael@0: UVector fAttValues; // A vector containing the attribute values for michael@0: // this element's attributes. The order is the same michael@0: // as that of the attribute name vector. michael@0: michael@0: UVector fChildren; // The child nodes of this element (a Vector) michael@0: michael@0: UXMLElement *fParent; // A pointer to the parent element of this element. michael@0: }; michael@0: michael@0: /** michael@0: * A simple XML parser; it is neither efficient nor conformant and only useful for michael@0: * restricted types of XML documents. michael@0: * michael@0: * The parse methods parse whole documents and return the parse trees via their michael@0: * root elements. michael@0: */ michael@0: class U_TOOLUTIL_API UXMLParser : public UObject { michael@0: public: michael@0: /** michael@0: * Create an XML parser. michael@0: */ michael@0: static UXMLParser *createParser(UErrorCode &errorCode); michael@0: /** michael@0: * Destructor. michael@0: */ michael@0: virtual ~UXMLParser(); michael@0: michael@0: /** michael@0: * Parse an XML document, create the entire document tree, and michael@0: * return a pointer to the root element of the parsed tree. michael@0: * The caller must delete the element. michael@0: */ michael@0: UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode); michael@0: /** michael@0: * Parse an XML file, create the entire document tree, and michael@0: * return a pointer to the root element of the parsed tree. michael@0: * The caller must delete the element. michael@0: */ michael@0: UXMLElement *parseFile(const char *filename, UErrorCode &errorCode); michael@0: michael@0: /** michael@0: * ICU "poor man's RTTI", returns a UClassID for the actual class. michael@0: */ michael@0: virtual UClassID getDynamicClassID() const; michael@0: michael@0: /** michael@0: * ICU "poor man's RTTI", returns a UClassID for this class. michael@0: */ michael@0: static UClassID U_EXPORT2 getStaticClassID(); michael@0: michael@0: private: michael@0: // prevent default construction etc. michael@0: UXMLParser(); michael@0: UXMLParser(const UXMLParser &other); michael@0: UXMLParser &operator=(const UXMLParser &other); michael@0: michael@0: // constructor michael@0: UXMLParser(UErrorCode &status); michael@0: michael@0: void parseMisc(UErrorCode &status); michael@0: UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status); michael@0: void error(const char *message, UErrorCode &status); michael@0: UnicodeString scanContent(UErrorCode &status); michael@0: void replaceCharRefs(UnicodeString &s, UErrorCode &status); michael@0: michael@0: const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode); michael@0: public: michael@0: // public for UXMLElement only michael@0: const UnicodeString *findName(const UnicodeString &s) const; michael@0: private: michael@0: michael@0: // There is one ICU regex matcher for each of the major XML syntax items michael@0: // that are recognized. michael@0: RegexMatcher mXMLDecl; michael@0: RegexMatcher mXMLComment; michael@0: RegexMatcher mXMLSP; michael@0: RegexMatcher mXMLDoctype; michael@0: RegexMatcher mXMLPI; michael@0: RegexMatcher mXMLElemStart; michael@0: RegexMatcher mXMLElemEnd; michael@0: RegexMatcher mXMLElemEmpty; michael@0: RegexMatcher mXMLCharData; michael@0: RegexMatcher mAttrValue; michael@0: RegexMatcher mAttrNormalizer; michael@0: RegexMatcher mNewLineNormalizer; michael@0: RegexMatcher mAmps; michael@0: michael@0: Hashtable fNames; // interned element/attribute name strings michael@0: UStack fElementStack; // Stack holds the parent elements when nested michael@0: // elements are being parsed. All items on this michael@0: // stack are of type UXMLElement. michael@0: int32_t fPos; // String index of the current scan position in michael@0: // xml source (in fSrc). michael@0: UnicodeString fOneLF; michael@0: }; michael@0: michael@0: U_NAMESPACE_END michael@0: #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ michael@0: michael@0: #endif