intl/icu/source/tools/toolutil/xmlparser.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/tools/toolutil/xmlparser.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,245 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2004-2005, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  xmlparser.h
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2004jul21
    1.17 +*   created by: Andy Heninger
    1.18 +*
    1.19 +* Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
    1.20 +* Not suitable for production use. Not supported.
    1.21 +* Not conformant. Not efficient.
    1.22 +* But very small.
    1.23 +*/
    1.24 +
    1.25 +#ifndef __XMLPARSER_H__
    1.26 +#define __XMLPARSER_H__
    1.27 +
    1.28 +#include "unicode/uobject.h"
    1.29 +#include "unicode/unistr.h"
    1.30 +#include "unicode/regex.h"
    1.31 +#include "uvector.h"
    1.32 +#include "hash.h"
    1.33 +
    1.34 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
    1.35 +
    1.36 +enum UXMLNodeType {  /* Type tag for a child node; UXMLElement::getChild() takes one by reference. */
    1.37 +    /** Text node (string contents); the node object is stored as a UnicodeString. */
    1.38 +    UXML_NODE_TYPE_STRING,
    1.39 +    /** Element node; the node object is stored as a UXMLElement. */
    1.40 +    UXML_NODE_TYPE_ELEMENT,
    1.41 +    UXML_NODE_TYPE_COUNT  /* Not a node type: its value equals the number of node types listed above. */
    1.42 +};
    1.43 +
    1.44 +U_NAMESPACE_BEGIN
    1.45 +
    1.46 +class UXMLParser;
    1.47 +
    1.48 +/**
    1.49 + * An element node in a parsed XML tree, with a tag name, attributes, and child nodes.
    1.50 + */
    1.51 +class U_TOOLUTIL_API UXMLElement : public UObject {
    1.52 +public:
    1.53 +    /**
    1.54 +     * Destructor.
    1.55 +     */
    1.56 +    virtual ~UXMLElement();
    1.57 +
    1.58 +    /**
    1.59 +     * Get the tag name of this element.
    1.60 +     */
    1.61 +    const UnicodeString &getTagName() const;
    1.62 +    /**
    1.63 +     * Get the text contents of the element:
    1.64 +     * the contents of all text child nodes, appended to each other.
    1.65 +     * @param recurse If TRUE, also recursively appends the contents of all
    1.66 +     *        text child nodes of element children.
    1.67 +     * @return The text contents, returned by value.
    1.68 +     */
    1.69 +    UnicodeString getText(UBool recurse) const;
    1.70 +    /**
    1.71 +     * Get the number of attributes.
    1.72 +     */
    1.73 +    int32_t countAttributes() const;
    1.74 +    /**
    1.75 +     * Get the i-th attribute.
    1.76 +     * @param i Index of the attribute.
    1.77 +     * @param name Output parameter, receives the attribute name.
    1.78 +     * @param value Output parameter, receives the attribute value.
    1.79 +     * @return A pointer to the attribute value (may be &value or a pointer to an
    1.80 +     *         internal string object), or NULL if i is out of bounds.
    1.81 +     */
    1.82 +    const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
    1.83 +    /**
    1.84 +     * Get the value of the attribute with the given name.
    1.85 +     * @param name Attribute name to be looked up.
    1.86 +     * @return A pointer to the attribute value, or NULL if this element
    1.87 +     * does not have this attribute.
    1.88 +     */
    1.89 +    const UnicodeString *getAttribute(const UnicodeString &name) const;
    1.90 +    /**
    1.91 +     * Get the number of child nodes.
    1.92 +     */
    1.93 +    int32_t countChildren() const;
    1.94 +    /**
    1.95 +     * Get the i-th child node.
    1.96 +     * @param i Index of the child node.
    1.97 +     * @param type Output parameter, receives the child node type.
    1.98 +     * @return A pointer to the child node object (a UnicodeString or a UXMLElement, per type), or NULL if i is out of bounds.
    1.99 +     */
   1.100 +    const UObject *getChild(int32_t i, UXMLNodeType &type) const;
   1.101 +    /**
   1.102 +     * Get the next child element node, skipping non-element child nodes.
   1.103 +     * @param i Enumeration index (in/out); initialize to 0 before getting the first child element.
   1.104 +     * @return A pointer to the next child element, or NULL if there is none.
   1.105 +     */
   1.106 +    const UXMLElement *nextChildElement(int32_t &i) const;
   1.107 +    /**
   1.108 +     * Get the immediate child element with the given name.
   1.109 +     * If there are multiple child elements with this name, then return
   1.110 +     * the first one.
   1.111 +     * @param name Element name to be looked up.
   1.112 +     * @return A pointer to the element node, or NULL if this element
   1.113 +     * does not have this immediate child element.
   1.114 +     */
   1.115 +    const UXMLElement *getChildElement(const UnicodeString &name) const;
   1.116 +
   1.117 +    /**
   1.118 +     * ICU "poor man's RTTI", returns a UClassID for the actual class.
   1.119 +     */
   1.120 +    virtual UClassID getDynamicClassID() const;
   1.121 +
   1.122 +    /**
   1.123 +     * ICU "poor man's RTTI", returns a UClassID for this class.
   1.124 +     */
   1.125 +    static UClassID U_EXPORT2 getStaticClassID();
   1.126 +
   1.127 +private:
   1.128 +    // Declared private to prevent default construction, copy construction, and assignment by outside code.
   1.129 +    UXMLElement();
   1.130 +    UXMLElement(const UXMLElement &other);
   1.131 +    UXMLElement &operator=(const UXMLElement &other);
   1.132 +
   1.133 +    void appendText(UnicodeString &text, UBool recurse) const;  // NOTE(review): presumably the worker behind getText(); confirm in the implementation
   1.134 +
   1.135 +    friend class UXMLParser;  // gives UXMLParser access to the private members, including the constructor below
   1.136 +
   1.137 +    UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
   1.138 +
   1.139 +    const UXMLParser *fParser;  // NOTE(review): presumably the parser passed to the constructor; ownership is not visible here -- confirm
   1.140 +    const UnicodeString *fName;          // The tag name of this element (owned by the UXMLParser)
   1.141 +    UnicodeString       fContent;        // The text content of this node.  All element content is
   1.142 +                                         //   concatenated even when there are intervening nested elements
   1.143 +                                         //   (which doesn't happen with most xml files we care about)
   1.144 +                                         //   Sections of content containing only white space are dropped,
   1.145 +                                         //   which gets rid of the bogus white space content from
   1.146 +                                         //   elements which are primarily containers for nested elements.
   1.147 +    UVector             fAttNames;       // A vector containing the names of this element's attributes
   1.148 +                                         //    The names are UnicodeString objects, owned by the UXMLParser.
   1.149 +    UVector             fAttValues;      // A vector containing the attribute values for
   1.150 +                                         //    this element's attributes.  The order is the same
   1.151 +                                         //    as that of the attribute name vector.
   1.152 +
   1.153 +    UVector             fChildren;       // The child nodes of this element (a UVector)
   1.154 +
   1.155 +    UXMLElement        *fParent;         // A pointer to the parent element of this element.
   1.156 +};
   1.157 +
   1.158 +/**
   1.159 + * A simple XML parser; it is neither efficient nor conformant and only useful for
   1.160 + * restricted types of XML documents.
   1.161 + *
   1.162 + * The parse methods parse whole documents and return the parse trees via their
   1.163 + * root elements.
   1.164 + */
   1.165 +class U_TOOLUTIL_API UXMLParser : public UObject {
   1.166 +public:
   1.167 +    /**
   1.168 +     * Create an XML parser. All constructors are private, so this is the only public way to obtain one.
   1.169 +     */
   1.170 +    static UXMLParser *createParser(UErrorCode &errorCode);
   1.171 +    /**
   1.172 +     * Destructor.
   1.173 +     */
   1.174 +    virtual ~UXMLParser();
   1.175 +
   1.176 +    /**
   1.177 +     * Parse an XML document held in a string, create the entire document tree, and
   1.178 +     * return a pointer to the root element of the parsed tree.
   1.179 +     * The caller must delete the element.
   1.180 +     */
   1.181 +    UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
   1.182 +    /**
   1.183 +     * Parse the XML file with the given filename, create the entire document tree, and
   1.184 +     * return a pointer to the root element of the parsed tree.
   1.185 +     * The caller must delete the element.
   1.186 +     */
   1.187 +    UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
   1.188 +
   1.189 +    /**
   1.190 +     * ICU "poor man's RTTI", returns a UClassID for the actual class.
   1.191 +     */
   1.192 +    virtual UClassID getDynamicClassID() const;
   1.193 +
   1.194 +    /**
   1.195 +     * ICU "poor man's RTTI", returns a UClassID for this class.
   1.196 +     */
   1.197 +    static UClassID U_EXPORT2 getStaticClassID();
   1.198 +
   1.199 +private:
   1.200 +    // Declared private to prevent default construction, copy construction, and assignment by outside code.
   1.201 +    UXMLParser();
   1.202 +    UXMLParser(const UXMLParser &other);
   1.203 +    UXMLParser &operator=(const UXMLParser &other);
   1.204 +
   1.205 +    // constructor taking an error code (private; NOTE(review): presumably the one createParser() uses -- confirm)
   1.206 +    UXMLParser(UErrorCode &status);
   1.207 +
   1.208 +    void           parseMisc(UErrorCode &status);
   1.209 +    UXMLElement   *createElement(RegexMatcher &mEl, UErrorCode &status);
   1.210 +    void           error(const char *message, UErrorCode &status);
   1.211 +    UnicodeString  scanContent(UErrorCode &status);
   1.212 +    void           replaceCharRefs(UnicodeString &s, UErrorCode &status);
   1.213 +
   1.214 +    const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);  // NOTE(review): presumably adds s to fNames -- confirm
   1.215 +public:
   1.216 +    // public only so that UXMLElement can call it (UXMLElement is not declared a friend of this class)
   1.217 +    const UnicodeString *findName(const UnicodeString &s) const;
   1.218 +private:
   1.219 +
   1.220 +    // There is one ICU regex matcher for each of the major XML syntax items
   1.221 +    //  that are recognized.
   1.222 +    RegexMatcher mXMLDecl;
   1.223 +    RegexMatcher mXMLComment;
   1.224 +    RegexMatcher mXMLSP;
   1.225 +    RegexMatcher mXMLDoctype;
   1.226 +    RegexMatcher mXMLPI;
   1.227 +    RegexMatcher mXMLElemStart;
   1.228 +    RegexMatcher mXMLElemEnd;
   1.229 +    RegexMatcher mXMLElemEmpty;
   1.230 +    RegexMatcher mXMLCharData;
   1.231 +    RegexMatcher mAttrValue;
   1.232 +    RegexMatcher mAttrNormalizer;
   1.233 +    RegexMatcher mNewLineNormalizer;
   1.234 +    RegexMatcher mAmps;
   1.235 +
   1.236 +    Hashtable             fNames;           // interned element/attribute name strings
   1.237 +    UStack                fElementStack;    // Stack holds the parent elements when nested
   1.238 +                                            //    elements are being parsed.  All items on this
   1.239 +                                            //    stack are of type UXMLElement.
   1.240 +    int32_t               fPos;             // String index of the current scan position in the
   1.241 +                                            //    xml source ("fSrc" originally; no such member is declared in this class).
   1.242 +    UnicodeString         fOneLF;           // NOTE(review): name suggests a string holding one LF; confirm in the implementation
   1.243 +};
   1.244 +
   1.245 +U_NAMESPACE_END
   1.246 +#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
   1.247 +
   1.248 +#endif

mercurial