intl/icu/source/tools/toolutil/xmlparser.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2004-2005, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: xmlparser.h
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2004jul21
michael@0 14 * created by: Andy Heninger
michael@0 15 *
michael@0 16 * Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
michael@0 17 * Not suitable for production use. Not supported.
michael@0 18 * Not conformant. Not efficient.
michael@0 19 * But very small.
michael@0 20 */
michael@0 21
michael@0 22 #ifndef __XMLPARSER_H__
michael@0 23 #define __XMLPARSER_H__
michael@0 24
michael@0 25 #include "unicode/uobject.h"
michael@0 26 #include "unicode/unistr.h"
michael@0 27 #include "unicode/regex.h"
michael@0 28 #include "uvector.h"
michael@0 29 #include "hash.h"
michael@0 30
michael@0 31 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
michael@0 32
michael@0 33 enum UXMLNodeType { // Kind of a child node, as reported through the 'type' argument of UXMLElement::getChild().
michael@0 34 /** Node type string (text contents), stored as a UnicodeString. */
michael@0 35 UXML_NODE_TYPE_STRING,
michael@0 36 /** Node type element, stored as a UXMLElement. */
michael@0 37 UXML_NODE_TYPE_ELEMENT,
michael@0 38 UXML_NODE_TYPE_COUNT // Last enumerator: its value equals the number of node types listed above.
michael@0 39 };
michael@0 40
michael@0 41 U_NAMESPACE_BEGIN
michael@0 42
michael@0 43 class UXMLParser;
michael@0 44
michael@0 45 /**
michael@0 46 * This class represents an element node in a parsed XML tree. All of its constructors are private and UXMLParser is a friend, so callers cannot construct elements directly.
michael@0 47 */
michael@0 48 class U_TOOLUTIL_API UXMLElement : public UObject {
michael@0 49 public:
michael@0 50 /**
michael@0 51 * Destructor.
michael@0 52 */
michael@0 53 virtual ~UXMLElement();
michael@0 54
michael@0 55 /**
michael@0 56 * Get the tag name of this element.
michael@0 57 */
michael@0 58 const UnicodeString &getTagName() const;
michael@0 59 /**
michael@0 60 * Get the text contents of the element.
michael@0 61 * The result is the concatenation of the contents of all text child nodes.
michael@0 62 * @param recurse If TRUE, also recursively appends the contents of all
michael@0 63 * text child nodes of element children.
michael@0 64 * @return The text contents.
michael@0 65 */
michael@0 66 UnicodeString getText(UBool recurse) const;
michael@0 67 /**
michael@0 68 * Get the number of attributes.
michael@0 69 */
michael@0 70 int32_t countAttributes() const;
michael@0 71 /**
michael@0 72 * Get the i-th attribute.
michael@0 73 * @param i Index of the attribute.
michael@0 74 * @param name Output parameter, receives the attribute name.
michael@0 75 * @param value Output parameter, receives the attribute value.
michael@0 76 * @return A pointer to the attribute value (may be &value or a pointer to an
michael@0 77 * internal string object), or NULL if i is out of bounds.
michael@0 78 */
michael@0 79 const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
michael@0 80 /**
michael@0 81 * Get the value of the attribute with the given name.
michael@0 82 * @param name Attribute name to be looked up.
michael@0 83 * @return A pointer to the attribute value, or NULL if this element
michael@0 84 * does not have this attribute.
michael@0 85 */
michael@0 86 const UnicodeString *getAttribute(const UnicodeString &name) const;
michael@0 87 /**
michael@0 88 * Get the number of child nodes.
michael@0 89 */
michael@0 90 int32_t countChildren() const;
michael@0 91 /**
michael@0 92 * Get the i-th child node.
michael@0 93 * @param i Index of the child node.
michael@0 94 * @param type Output parameter (non-const reference), receives the child node type.
michael@0 95 * @return A pointer to the child node object, or NULL if i is out of bounds.
michael@0 96 */
michael@0 97 const UObject *getChild(int32_t i, UXMLNodeType &type) const;
michael@0 98 /**
michael@0 99 * Get the next child element node, skipping non-element child nodes.
michael@0 100 * @param i Enumeration index (in/out, passed by reference); initialize to 0 before getting the first child element.
michael@0 101 * @return A pointer to the next child element, or NULL if there is none.
michael@0 102 */
michael@0 103 const UXMLElement *nextChildElement(int32_t &i) const;
michael@0 104 /**
michael@0 105 * Get the immediate child element with the given name.
michael@0 106 * If there are multiple child elements with this name, then return
michael@0 107 * the first one.
michael@0 108 * @param name Element name to be looked up.
michael@0 109 * @return A pointer to the element node, or NULL if this element
michael@0 110 * does not have this immediate child element.
michael@0 111 */
michael@0 112 const UXMLElement *getChildElement(const UnicodeString &name) const;
michael@0 113
michael@0 114 /**
michael@0 115 * ICU "poor man's RTTI", returns a UClassID for the actual class.
michael@0 116 */
michael@0 117 virtual UClassID getDynamicClassID() const;
michael@0 118
michael@0 119 /**
michael@0 120 * ICU "poor man's RTTI", returns a UClassID for this class.
michael@0 121 */
michael@0 122 static UClassID U_EXPORT2 getStaticClassID();
michael@0 123
michael@0 124 private:
michael@0 125 // Declared private to prevent default construction, copy construction and assignment.
michael@0 126 UXMLElement();
michael@0 127 UXMLElement(const UXMLElement &other);
michael@0 128 UXMLElement &operator=(const UXMLElement &other);
michael@0 129
michael@0 130 void appendText(UnicodeString &text, UBool recurse) const; // NOTE(review): presumably the helper behind getText(); confirm in xmlparser.cpp
michael@0 131
michael@0 132 friend class UXMLParser;
michael@0 133 // Private constructor: reachable only from this class and from its friend UXMLParser.
michael@0 134 UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
michael@0 135
michael@0 136 const UXMLParser *fParser; // NOTE(review): presumably the parser handed to the constructor; confirm in xmlparser.cpp
michael@0 137 const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser)
michael@0 138 UnicodeString fContent; // The text content of this node. All element content is
michael@0 139 // concatenated even when there are intervening nested elements
michael@0 140 // (which doesn't happen with most xml files we care about)
michael@0 141 // Sections of content containing only white space are dropped,
michael@0 142 // which gets rid of the bogus white space content from
michael@0 143 // elements which are primarily containers for nested elements.
michael@0 144 UVector fAttNames; // A vector containing the names of this element's attributes
michael@0 145 // The names are UnicodeString objects, owned by the UXMLParser.
michael@0 146 UVector fAttValues; // A vector containing the attribute values for
michael@0 147 // this element's attributes. The order is the same
michael@0 148 // as that of the attribute name vector.
michael@0 149
michael@0 150 UVector fChildren; // The child nodes of this element (a UVector)
michael@0 151
michael@0 152 UXMLElement *fParent; // A pointer to the parent element of this element.
michael@0 153 };
michael@0 154
michael@0 155 /**
michael@0 156 * A simple XML parser; it is neither efficient nor conformant and only useful for
michael@0 157 * restricted types of XML documents.
michael@0 158 *
michael@0 159 * The parse methods parse whole documents and return the parse trees via their
michael@0 160 * root elements.
michael@0 161 */
michael@0 162 class U_TOOLUTIL_API UXMLParser : public UObject {
michael@0 163 public:
michael@0 164 /**
michael@0 165 * Create an XML parser. This static factory is the only public way to obtain one; all constructors are private.
michael@0 166 */
michael@0 167 static UXMLParser *createParser(UErrorCode &errorCode);
michael@0 168 /**
michael@0 169 * Destructor.
michael@0 170 */
michael@0 171 virtual ~UXMLParser();
michael@0 172
michael@0 173 /**
michael@0 174 * Parse an XML document, create the entire document tree, and
michael@0 175 * return a pointer to the root element of the parsed tree.
michael@0 176 * The caller must delete the element.
michael@0 177 */
michael@0 178 UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
michael@0 179 /**
michael@0 180 * Parse an XML file, create the entire document tree, and
michael@0 181 * return a pointer to the root element of the parsed tree.
michael@0 182 * The caller must delete the element.
michael@0 183 */
michael@0 184 UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
michael@0 185
michael@0 186 /**
michael@0 187 * ICU "poor man's RTTI", returns a UClassID for the actual class.
michael@0 188 */
michael@0 189 virtual UClassID getDynamicClassID() const;
michael@0 190
michael@0 191 /**
michael@0 192 * ICU "poor man's RTTI", returns a UClassID for this class.
michael@0 193 */
michael@0 194 static UClassID U_EXPORT2 getStaticClassID();
michael@0 195
michael@0 196 private:
michael@0 197 // Declared private to prevent default construction, copy construction and assignment.
michael@0 198 UXMLParser();
michael@0 199 UXMLParser(const UXMLParser &other);
michael@0 200 UXMLParser &operator=(const UXMLParser &other);
michael@0 201
michael@0 202 // Constructor taking an error code; private, so code outside this class cannot call it directly.
michael@0 203 UXMLParser(UErrorCode &status);
michael@0 204
michael@0 205 void parseMisc(UErrorCode &status);
michael@0 206 UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status);
michael@0 207 void error(const char *message, UErrorCode &status);
michael@0 208 UnicodeString scanContent(UErrorCode &status);
michael@0 209 void replaceCharRefs(UnicodeString &s, UErrorCode &status);
michael@0 210
michael@0 211 const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
michael@0 212 public:
michael@0 213 // Public only so that UXMLElement can call it; not meant for other callers.
michael@0 214 const UnicodeString *findName(const UnicodeString &s) const;
michael@0 215 private:
michael@0 216
michael@0 217 // There is one ICU regex matcher for each of the major XML syntax items
michael@0 218 // that are recognized.
michael@0 219 RegexMatcher mXMLDecl;
michael@0 220 RegexMatcher mXMLComment;
michael@0 221 RegexMatcher mXMLSP;
michael@0 222 RegexMatcher mXMLDoctype;
michael@0 223 RegexMatcher mXMLPI;
michael@0 224 RegexMatcher mXMLElemStart;
michael@0 225 RegexMatcher mXMLElemEnd;
michael@0 226 RegexMatcher mXMLElemEmpty;
michael@0 227 RegexMatcher mXMLCharData;
michael@0 228 RegexMatcher mAttrValue;
michael@0 229 RegexMatcher mAttrNormalizer;
michael@0 230 RegexMatcher mNewLineNormalizer;
michael@0 231 RegexMatcher mAmps;
michael@0 232
michael@0 233 Hashtable fNames; // interned element/attribute name strings
michael@0 234 UStack fElementStack; // Stack holds the parent elements when nested
michael@0 235 // elements are being parsed. All items on this
michael@0 236 // stack are of type UXMLElement.
michael@0 237 int32_t fPos; // String index of the current scan position in the
michael@0 238 // xml source (this comment used to cite "fSrc", which is not a member of this class).
michael@0 239 UnicodeString fOneLF; // NOTE(review): presumably a string holding a single line feed, used for newline normalization; confirm in xmlparser.cpp
michael@0 240 };
michael@0 241
michael@0 242 U_NAMESPACE_END
michael@0 243 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
michael@0 244
michael@0 245 #endif

mercurial