intl/icu/source/tools/toolutil/xmlparser.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2004-2005, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  xmlparser.h
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2004jul21
    14 *   created by: Andy Heninger
    15 *
    16 * Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
    17 * Not suitable for production use. Not supported.
    18 * Not conformant. Not efficient.
    19 * But very small.
    20 */
    22 #ifndef __XMLPARSER_H__
    23 #define __XMLPARSER_H__
    25 #include "unicode/uobject.h"
    26 #include "unicode/unistr.h"
    27 #include "unicode/regex.h"
    28 #include "uvector.h"
    29 #include "hash.h"
    31 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
    33 enum UXMLNodeType {  // Kinds of child nodes; reported through the type parameter of UXMLElement::getChild().
    34     /** Node type string (text contents), stored as a UnicodeString. */
    35     UXML_NODE_TYPE_STRING,
    36     /** Node type element, stored as a UXMLElement. */
    37     UXML_NODE_TYPE_ELEMENT,
    38     UXML_NODE_TYPE_COUNT  // Last enumerator, so its value equals the number of node types above; presumably a count sentinel, not a real node type.
    39 };
    41 U_NAMESPACE_BEGIN
    43 class UXMLParser;
    45 /**
    46  * This class represents an element node in a parsed XML tree.
    47  */
    48 class U_TOOLUTIL_API UXMLElement : public UObject {
    49 public:
    50     /**
    51      * Destructor.
    52      */
    53     virtual ~UXMLElement();
    55     /**
    56      * Get the tag name of this element.
    57      */
    58     const UnicodeString &getTagName() const;
    59     /**
    60      * Get the text contents of the element.
    61      * Append the contents of all text child nodes.
    62      * @param recurse If TRUE, also recursively appends the contents of all
    63      *        text child nodes of element children.
    64      * @return The text contents.
    65      */
    66     UnicodeString getText(UBool recurse) const;
    67     /**
    68      * Get the number of attributes.
    69      */
    70     int32_t countAttributes() const;
    71     /**
    72      * Get the i-th attribute.
    73      * @param i Index of the attribute.
    74      * @param name Output parameter, receives the attribute name.
    75      * @param value Output parameter, receives the attribute value.
    76      * @return A pointer to the attribute value (may be &value or a pointer to an
    77      *         internal string object), or NULL if i is out of bounds.
    78      */
    79     const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
    80     /**
    81      * Get the value of the attribute with the given name.
    82      * @param name Attribute name to be looked up.
    83      * @return A pointer to the attribute value, or NULL if this element
    84      * does not have this attribute.
    85      */
    86     const UnicodeString *getAttribute(const UnicodeString &name) const;
    87     /**
    88      * Get the number of child nodes.
    89      */
    90     int32_t countChildren() const;
    91     /**
    92      * Get the i-th child node.
    93      * @param i Index of the child node.
    94      * @param type Output parameter (non-const reference), receives the child node type.
    95      * @return A pointer to the child node object, or NULL if i is out of bounds.
    96      */
    97     const UObject *getChild(int32_t i, UXMLNodeType &type) const;
    98     /**
    99      * Get the next child element node, skipping non-element child nodes.
   100      * @param i Enumeration index (in/out, non-const reference); initialize to 0 before getting the first child element.
   101      * @return A pointer to the next child element, or NULL if there is none.
   102      */
   103     const UXMLElement *nextChildElement(int32_t &i) const;
   104     /**
   105      * Get the immediate child element with the given name.
   106      * If there are multiple child elements with this name, then return
   107      * the first one.
   108      * @param name Element name to be looked up.
   109      * @return A pointer to the element node, or NULL if this element
   110      * does not have this immediate child element.
   111      */
   112     const UXMLElement *getChildElement(const UnicodeString &name) const;
   114     /**
   115      * ICU "poor man's RTTI", returns a UClassID for the actual class.
   116      */
   117     virtual UClassID getDynamicClassID() const;
   119     /**
   120      * ICU "poor man's RTTI", returns a UClassID for this class.
   121      */
   122     static UClassID U_EXPORT2 getStaticClassID();
   124 private:
   125     // Declared private to prevent default construction, copying, and assignment by outside code.
   126     UXMLElement();
   127     UXMLElement(const UXMLElement &other);
   128     UXMLElement &operator=(const UXMLElement &other);
   130     void appendText(UnicodeString &text, UBool recurse) const;  // presumably the worker behind getText(); confirm in xmlparser.cpp
   132     friend class UXMLParser;  // grants UXMLParser access to the private members, including the constructor below
   134     UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
   136     const UXMLParser *fParser;           // presumably the parser passed to the constructor; confirm in xmlparser.cpp
   137     const UnicodeString *fName;          // The tag name of this element (owned by the UXMLParser)
   138     UnicodeString       fContent;        // The text content of this node.  All element content is
   139                                          //   concatenated even when there are intervening nested elements
   140                                          //   (which doesn't happen with most xml files we care about)
   141                                          //   Sections of content containing only white space are dropped,
   142                                          //   which gets rid of the bogus white space content from
   143                                          //   elements which are primarily containers for nested elements.
   144     UVector             fAttNames;       // A vector containing the names of this element's attributes
   145                                          //    The names are UnicodeString objects, owned by the UXMLParser.
   146     UVector             fAttValues;      // A vector containing the attribute values for
   147                                          //    this element's attributes.  The order is the same
   148                                          //    as that of the attribute name vector.
   150     UVector             fChildren;       // The child nodes of this element (a Vector)
   152     UXMLElement        *fParent;         // A pointer to the parent element of this element.
   153 };
   155 /**
   156  * A simple XML parser; it is neither efficient nor conformant and only useful for
   157  * restricted types of XML documents.
   158  *
   159  * The parse methods parse whole documents and return the parse trees via their
   160  * root elements.
   161  */
   162 class U_TOOLUTIL_API UXMLParser : public UObject {
   163 public:
   164     /**
   165      * Create an XML parser.
   166      * All constructors of this class are private, so this static factory is how callers obtain an instance.
   167      */
   167     static UXMLParser *createParser(UErrorCode &errorCode);
   168     /**
   169      * Destructor.
   170      */
   171     virtual ~UXMLParser();
   173     /**
   174      * Parse an XML document, create the entire document tree, and
   175      * return a pointer to the root element of the parsed tree.
   176      * The caller must delete the element.
   177      */
   178     UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
   179     /**
   180      * Parse an XML file, create the entire document tree, and
   181      * return a pointer to the root element of the parsed tree.
   182      * The caller must delete the element.
   183      */
   184     UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
   186     /**
   187      * ICU "poor man's RTTI", returns a UClassID for the actual class.
   188      */
   189     virtual UClassID getDynamicClassID() const;
   191     /**
   192      * ICU "poor man's RTTI", returns a UClassID for this class.
   193      */
   194     static UClassID U_EXPORT2 getStaticClassID();
   196 private:
   197     // Declared private to prevent default construction, copying, and assignment by outside code.
   198     UXMLParser();
   199     UXMLParser(const UXMLParser &other);
   200     UXMLParser &operator=(const UXMLParser &other);
   202     // Private constructor taking an error code; presumably the one used by createParser() (confirm in xmlparser.cpp).
   203     UXMLParser(UErrorCode &status);
   205     void           parseMisc(UErrorCode &status);
   206     UXMLElement   *createElement(RegexMatcher &mEl, UErrorCode &status);
   207     void           error(const char *message, UErrorCode &status);
   208     UnicodeString  scanContent(UErrorCode &status);
   209     void           replaceCharRefs(UnicodeString &s, UErrorCode &status);
   211     const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
   212 public:
   213     // public for UXMLElement only
   214     const UnicodeString *findName(const UnicodeString &s) const;
   215 private:
   217     // There is one ICU regex matcher for each of the major XML syntax items
   218     //  that are recognized.
   219     RegexMatcher mXMLDecl;
   220     RegexMatcher mXMLComment;
   221     RegexMatcher mXMLSP;
   222     RegexMatcher mXMLDoctype;
   223     RegexMatcher mXMLPI;
   224     RegexMatcher mXMLElemStart;
   225     RegexMatcher mXMLElemEnd;
   226     RegexMatcher mXMLElemEmpty;
   227     RegexMatcher mXMLCharData;
   228     RegexMatcher mAttrValue;
   229     RegexMatcher mAttrNormalizer;
   230     RegexMatcher mNewLineNormalizer;
   231     RegexMatcher mAmps;
   233     Hashtable             fNames;           // interned element/attribute name strings
   234     UStack                fElementStack;    // Stack holds the parent elements when nested
   235                                             //    elements are being parsed.  All items on this
   236                                             //    stack are of type UXMLElement.
   237     int32_t               fPos;             // String index of the current scan position in
   238                                             //    the xml source. NOTE(review): this comment used to say "in fSrc", but no fSrc member is declared in this class.
   239     UnicodeString         fOneLF;           // presumably a string holding a single line feed; confirm usage in xmlparser.cpp
   240 };
   242 U_NAMESPACE_END
   243 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
   245 #endif

mercurial