The Tor Browser: comparison intl/icu/source/tools/toolutil/xmlparser.cpp

--1:000000000000
+:d010cea549ad
+/*
+*******************************************************************************
+*
+*   Copyright (C) 2004-2010, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
+*******************************************************************************
+*   file name:  xmlparser.cpp
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2004jul21
+*   created by: Andy Heninger
+*/
+#include <stdio.h>
+#include "unicode/uchar.h"
+#include "unicode/ucnv.h"
+#include "unicode/regex.h"
+#include "filestrm.h"
+#include "xmlparser.h"
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
+// character constants
+// Code points of the characters that the parser code refers to by value.
+enum {
+x_QUOT=0x22,
+x_AMP=0x26,
+x_APOS=0x27,
+x_LT=0x3c,
+x_GT=0x3e,
+x_l=0x6c
+};
+// Regex set matching one XML white space character (space, tab, CR, LF).
+#define  XML_SPACES "[ \\u0009\\u000d\\u000a]"
+// XML #4
+// Regex set for a character that may start an XML name.
+#define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
+"[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
+"[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
+"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"
+//  XML #5
+// Regex set for a character that may appear after the first character of an XML name.
+#define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
+//  XML #6
+// Regex for a complete XML name: one start character followed by any number of name characters.
+#define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
+U_NAMESPACE_BEGIN
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
+//
+//   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
+//                             used for parsing.
+//
+//   Every matcher member is built from a pattern string, flags 0 and the caller's
+//   status; fNames and fElementStack are constructed with the same status.
+//   NOTE(review): presumably any construction failure is reported through status - confirm.
+//
+UXMLParser::UXMLParser(UErrorCode &status) :
+//  XML Declaration.  XML Production #23.
+//      example:  "<?xml version=1.0 encoding="utf-16" ?>
+//      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
+//            allow for a possible leading BOM.
+mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
+//  XML Comment   production #15
+//     example:  "<!-- whatever -->
+//       note, does not detect an illegal "--" within comments
+mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
+//  XML Spaces
+//      production [3]
+mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
+//  XML Doctype decl  production #28
+//     example   "<!DOCTYPE foo SYSTEM "somewhere" >
+//       or      "<!DOCTYPE foo [internal dtd]>
+//    TODO:  we don't actually parse the DOCTYPE or internal subsets.
+//           Some internal dtd subsets could confuse this simple-minded
+//           attempt at skipping over them, specifically, occurrences
+//           of closing square brackets.  These could appear in comments,
+//           or in parameter entity declarations, for example.
+mXMLDoctype(UnicodeString(
+"(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
+), 0, status),
+//  XML PI     production #16
+//     example   "<?target stuff?>
+mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
+//  XML Element Start   Productions #40, #41
+//          example   <foo att1='abc'  att2="d e f" >
+//      capture #1:  the tag name
+//
+mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
+"(?:"
+XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
+"(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
+")*"                                                             //   * for zero or more attributes.
+XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"
+//  XML Element End     production #42
+//     example   </foo>
+//      capture #1:  the tag name
+mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
+// XML Element Empty    production #44
+//     example   <foo att1="abc"   att2="d e f" />
+//      Same pattern as the element start above, except that it ends in "/>".
+mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
+"(?:"
+XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
+"(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
+")*"                                                             //   * for zero or more attributes.
+XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"
+// XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
+mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
+// Attribute name = "value".  XML Productions 10, 40/41
+//  Capture group 1 is name,
+//                2 is the attribute value, including the quotes.
+//
+//   Note that attributes are scanned twice.  The first time is with
+//        the regex for an entire element start.  There, the attributes
+//        are checked syntactically, but not separated out one by one.
+//        Here, we match a single attribute, and make its name and
+//        attribute value available to the parser code.
+mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
+"((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
+// Matches a single XML white space character; used for attribute value normalization.
+mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
+// Match any of the new-line sequences in content.
+//   All are changed to \u000a.
+mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
+// & char references
+//   We will figure out what we've got based on which capture group has content.
+//   The last one is a catchall for unrecognized entity references..
+//   Note: unlike the other patterns, this string is built without the (-1, US_INV) arguments.
+//             1     2     3      4      5           6                    7          8
+mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
+0, status),
+fNames(status),
+fElementStack(status),
+fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
+{
+}
+// Factory for UXMLParser objects.
+// Returns NULL without constructing anything when errorCode already holds a
+// failure; otherwise returns a newly allocated parser built with errorCode.
+UXMLParser *
+UXMLParser::createParser(UErrorCode &errorCode) {
+    if (U_SUCCESS(errorCode)) {
+        return new UXMLParser(errorCode);
+    }
+    return NULL;
+}
+// Destructor.  The body is empty; members are released by their own destructors.
+UXMLParser::~UXMLParser() {}
+// Reads the file named filename, converts its bytes to Unicode and passes the
+// resulting text to parse().
+//
+// The charset is chosen as follows:
+//   1. a Unicode signature at the start of the file;
+//   2. otherwise the encoding="..." attribute of the XML declaration, found by
+//      decoding the first block of the file as ISO-8859-1;
+//   3. otherwise UTF-8 (also used when the file has no XML declaration at all).
+//
+// Returns the result of parse() on success, or NULL with errorCode set to a
+// failure code.  Does nothing if errorCode is already a failure on entry.
+UXMLElement *
+UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
+    char bytes[4096], charsetBuffer[100];
+    FileStream *f;
+    const char *charset, *pb;
+    UnicodeString src;
+    UConverter *cnv;
+    UChar *buffer, *pu;
+    int32_t fileLength, bytesLength, length, capacity;
+    UBool flush;
+    if(U_FAILURE(errorCode)) {
+        return NULL;
+    }
+    f=T_FileStream_open(filename, "rb");
+    if(f==NULL) {
+        errorCode=U_FILE_ACCESS_ERROR;
+        return NULL;
+    }
+    bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
+    if(bytesLength<(int32_t)sizeof(bytes)) {
+        // we have already read the entire file
+        fileLength=bytesLength;
+    } else {
+        // get the file length
+        fileLength=T_FileStream_size(f);
+    }
+    /*
+    * get the charset:
+    * 1. Unicode signature
+    * 2. treat as ISO-8859-1 and read XML encoding="charset"
+    * 3. default to UTF-8
+    */
+    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
+    if(U_SUCCESS(errorCode) && charset!=NULL) {
+        // open converter according to Unicode signature
+        cnv=ucnv_open(charset, &errorCode);
+    } else {
+        // read as Latin-1 and parse the XML declaration and encoding
+        cnv=ucnv_open("ISO-8859-1", &errorCode);
+        if(U_FAILURE(errorCode)) {
+            // unexpected error opening Latin-1 converter
+            goto exit;
+        }
+        buffer=src.getBuffer(bytesLength);
+        if(buffer==NULL) {
+            // unexpected failure to reserve some string capacity
+            errorCode=U_MEMORY_ALLOCATION_ERROR;
+            goto exit;
+        }
+        pb=bytes;
+        pu=buffer;
+        ucnv_toUnicode(
+            cnv,
+            &pu, buffer+src.getCapacity(),
+            &pb, bytes+bytesLength,
+            NULL, TRUE, &errorCode);
+        src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
+        ucnv_close(cnv);
+        cnv=NULL;
+        if(U_FAILURE(errorCode)) {
+            // unexpected error in conversion from Latin-1
+            src.remove();
+            goto exit;
+        }
+        // parse XML declaration, if there is one
+        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
+            int32_t declEnd=mXMLDecl.end(errorCode);
+            // go beyond <?xml
+            int32_t pos=src.indexOf((UChar)x_l)+1;
+            mAttrValue.reset(src);
+            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute of the declaration.
+                UnicodeString attName  = mAttrValue.group(1, errorCode);
+                UnicodeString attValue = mAttrValue.group(2, errorCode);
+                // Trim the quotes from the att value.  These are left over from the original regex
+                //   that parsed the attribute, which couldn't conveniently strip them.
+                attValue.remove(0,1);                    // one char from the beginning
+                attValue.truncate(attValue.length()-1);  // and one from the end.
+                if(attName==UNICODE_STRING("encoding", 8)) {
+                    length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
+                    if(length>=(int32_t)sizeof(charsetBuffer)) {
+                        // The name did not fit, so charsetBuffer is not NUL-terminated.
+                        // Report it the same way as a converter that cannot be opened
+                        // instead of letting ucnv_open() read past the buffer.
+                        errorCode=U_FILE_ACCESS_ERROR;
+                        goto exit;
+                    }
+                    charset=charsetBuffer;
+                    break;
+                }
+                pos = mAttrValue.end(2, errorCode);
+            }
+        }
+        // Open the real converter whether or not an XML declaration was found;
+        // without this a file with neither signature nor declaration had no converter.
+        if(charset==NULL) {
+            // default to UTF-8
+            charset="UTF-8";
+        }
+        cnv=ucnv_open(charset, &errorCode);
+    }
+    if(U_FAILURE(errorCode)) {
+        // unable to open the converter
+        goto exit;
+    }
+    // convert the file contents
+    capacity=fileLength;        // estimated capacity
+    src.getBuffer(capacity);
+    src.releaseBuffer(0);       // zero length
+    flush=FALSE;
+    for(;;) {
+        // convert contents of bytes[bytesLength]
+        pb=bytes;
+        for(;;) {
+            length=src.length();
+            buffer=src.getBuffer(capacity);
+            if(buffer==NULL) {
+                // unexpected failure to reserve some string capacity
+                errorCode=U_MEMORY_ALLOCATION_ERROR;
+                goto exit;
+            }
+            pu=buffer+length;
+            // Pass the flush flag so that the final, empty call really flushes the converter.
+            ucnv_toUnicode(
+                cnv, &pu, buffer+src.getCapacity(),
+                &pb, bytes+bytesLength,
+                NULL, flush, &errorCode);
+            // On overflow the output written so far is valid and must be kept;
+            // only a real conversion error discards it.
+            src.releaseBuffer(
+                (U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) ? (int32_t)(pu-buffer) : 0);
+            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
+                errorCode=U_ZERO_ERROR;
+                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
+            } else {
+                break;
+            }
+        }
+        if(U_FAILURE(errorCode)) {
+            break; // conversion error
+        }
+        if(flush) {
+            break; // completely converted the file
+        }
+        // read next block
+        bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
+        if(bytesLength==0) {
+            // reached end of file, convert once more to flush the converter
+            flush=TRUE;
+        }
+    };
+exit:
+    ucnv_close(cnv);
+    T_FileStream_close(f);
+    if(U_SUCCESS(errorCode)) {
+        return parse(src, errorCode);
+    } else {
+        return NULL;
+    }
+}
+// Parses the XML document in src and returns its root element.
+//
+// The document is consumed in this order: optional XML declaration, "misc",
+// optional DOCTYPE, "misc", the root element with all nested content, "misc".
+// Anything left over after that is an error.
+//
+// Returns the root element on success.  On any error a message is printed by
+// error(), status is set to a failure code, the partial tree is deleted and
+// NULL is returned.  Does nothing if status is already a failure on entry.
+UXMLElement *
+UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
+    if(U_FAILURE(status)) {
+        return NULL;
+    }
+    UXMLElement   *root = NULL;
+    fPos = 0; // TODO use just a local pos variable and pass it into functions
+    // where necessary?
+    // Drop entries that an earlier, failed parse may have left on the stack;
+    // they would point into a tree that has already been deleted.
+    fElementStack.removeAllElements();
+    // set all matchers to work on the input string
+    mXMLDecl.reset(src);
+    mXMLComment.reset(src);
+    mXMLSP.reset(src);
+    mXMLDoctype.reset(src);
+    mXMLPI.reset(src);
+    mXMLElemStart.reset(src);
+    mXMLElemEnd.reset(src);
+    mXMLElemEmpty.reset(src);
+    mXMLCharData.reset(src);
+    mAttrValue.reset(src);
+    mAttrNormalizer.reset(src);
+    mNewLineNormalizer.reset(src);
+    mAmps.reset(src);
+    // Consume the XML Declaration, if present.
+    if (mXMLDecl.lookingAt(fPos, status)) {
+        fPos = mXMLDecl.end(status);
+    }
+    // Consume "misc" [XML production 27] appearing before DocType
+    parseMisc(status);
+    // Consume a DocType declaration, if present.
+    if (mXMLDoctype.lookingAt(fPos, status)) {
+        fPos = mXMLDoctype.end(status);
+    }
+    // Consume additional "misc" [XML production 27] appearing after the DocType
+    parseMisc(status);
+    // Get the root element
+    if (mXMLElemEmpty.lookingAt(fPos, status)) {
+        // Root is an empty element (no nested elements or content)
+        root = createElement(mXMLElemEmpty, status);
+        fPos = mXMLElemEmpty.end(status);
+    } else {
+        if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
+            error("Root Element expected", status);
+            goto errorExit;
+        }
+        root = createElement(mXMLElemStart, status);
+        UXMLElement  *el = root;
+        //
+        // This is the loop that consumes the root element of the document,
+        //      including all nested content.   Nested elements are handled by
+        //      explicit pushes/pops of the element stack; there is no recursion
+        //      in the control flow of this code.
+        //      "el" always refers to the current element, the one to which content
+        //      is being added.  It is above the top of the element stack.
+        for (;;) {
+            // Nested Element Start
+            if (mXMLElemStart.lookingAt(fPos, status)) {
+                UXMLElement *t = createElement(mXMLElemStart, status);
+                el->fChildren.addElement(t, status);
+                t->fParent = el;
+                fElementStack.push(el, status);
+                el = t;
+                continue;
+            }
+            // Text Content.  String is concatenated onto the current node's content,
+            //                but only if it contains something other than spaces.
+            UnicodeString s = scanContent(status);
+            if (s.length() > 0) {
+                mXMLSP.reset(s);
+                if (mXMLSP.matches(status) == FALSE) {
+                    // This chunk of text contains something other than just
+                    //  white space. Make a child node for it.
+                    replaceCharRefs(s, status);
+                    el->fChildren.addElement(s.clone(), status);
+                }
+                mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
+                continue;
+            }
+            // Comments.  Discard.
+            if (mXMLComment.lookingAt(fPos, status)) {
+                fPos = mXMLComment.end(status);
+                continue;
+            }
+            // PIs.  Discard.
+            if (mXMLPI.lookingAt(fPos, status)) {
+                fPos = mXMLPI.end(status);
+                continue;
+            }
+            // Element End
+            if (mXMLElemEnd.lookingAt(fPos, status)) {
+                fPos = mXMLElemEnd.end(0, status);
+                const UnicodeString name = mXMLElemEnd.group(1, status);
+                if (name != *el->fName) {
+                    error("Element start / end tag mismatch", status);
+                    goto errorExit;
+                }
+                if (fElementStack.empty()) {
+                    // Close of the root element.  We're done with the doc.
+                    el = NULL;
+                    break;
+                }
+                el = (UXMLElement *)fElementStack.pop();
+                continue;
+            }
+            // Empty Element.  Stored as a child of the current element, but not stacked.
+            if (mXMLElemEmpty.lookingAt(fPos, status)) {
+                UXMLElement *t = createElement(mXMLElemEmpty, status);
+                el->fChildren.addElement(t, status);
+                t->fParent = el;   // same parent link as for elements opened by a start tag
+                continue;
+            }
+            // Hit something within the document that doesn't match anything.
+            //   It's an error.
+            error("Unrecognized markup", status);
+            break;
+        }
+        if (el != NULL || !fElementStack.empty()) {
+            // We bailed out early, for some reason.
+            error("Root element not closed.", status);
+            goto errorExit;
+        }
+    }
+    // Root Element parse is complete.
+    // Consume the annoying xml "Misc" that can appear at the end of the doc.
+    parseMisc(status);
+    // We should have reached the end of the input
+    if (fPos != src.length()) {
+        error("Extra content at the end of the document", status);
+        goto errorExit;
+    }
+    // Never hand back a tree together with a failure code.
+    if (U_FAILURE(status)) {
+        goto errorExit;
+    }
+    // Success!
+    return root;
+errorExit:
+    // The stack only holds pointers into the tree that is about to be deleted.
+    fElementStack.removeAllElements();
+    delete root;
+    return NULL;
+}
+//
+//  createElement
+//      Called right after mEl has matched an element start tag or an empty
+//      element tag.  Builds the UXMLElement for that tag, stores its attributes
+//      (names interned, values normalized), and moves fPos to the end of the tag.
+//
+UXMLElement *
+UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
+    // Capture group 1 of the tag matcher is the element's name.
+    UXMLElement *element = new UXMLElement(this, intern(mEl.group(1, status), status), status);
+    // Attributes begin right after the tag name; each pass handles one of them
+    // and then continues from the end of its quoted value.
+    for (int32_t attrPos = mEl.end(1, status);
+         mAttrValue.lookingAt(attrPos, status);
+         attrPos = mAttrValue.end(2, status)) {
+        UnicodeString name  = mAttrValue.group(1, status);
+        UnicodeString value = mAttrValue.group(2, status);
+        // Group 2 still carries the surrounding quote characters; drop the
+        // first and the last character of the value.
+        value.remove(0,1);
+        value.truncate(value.length()-1);
+        // XML attribute value normalization, see
+        // http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
+        // A non-validating parser treats every attribute as CDATA.
+        // First: turn every new-line sequence into a single line feed.
+        mNewLineNormalizer.reset(value);
+        value = mNewLineNormalizer.replaceAll(fOneLF, status);
+        // Second: turn every XML white space character into a plain \u0020 space.
+        mAttrNormalizer.reset(value);
+        UnicodeString space((UChar)0x0020);
+        value = mAttrNormalizer.replaceAll(space, status);
+        // Third: substitute character entities.
+        replaceCharRefs(value, status);
+        // Record the interned name and a heap copy of the value on the element.
+        element->fAttNames.addElement((void *)intern(name, status), status);
+        element->fAttValues.addElement(value.clone(), status);
+    }
+    // The whole tag has been consumed.
+    fPos = mEl.end(0, status);
+    return element;
+}
+//
+//  parseMisc
+//     Skip over XML "Misc" [production #27]: any mix of processing
+//     instructions, white space and comments starting at fPos.
+//     Misc may follow the document element, so the end of the input is
+//     checked before every match attempt.
+//
+void
+UXMLParser::parseMisc(UErrorCode &status)  {
+    while (fPos < mXMLPI.input().length()) {
+        if (mXMLPI.lookingAt(fPos, status)) {
+            fPos = mXMLPI.end(status);
+        } else if (mXMLSP.lookingAt(fPos, status)) {
+            fPos = mXMLSP.end(status);
+        } else if (mXMLComment.lookingAt(fPos, status)) {
+            fPos = mXMLComment.end(status);
+        } else {
+            // Not misc; leave fPos where it is.
+            break;
+        }
+    }
+}
+//
+//  scanContent
+//     Take the character data at fPos (everything up to the next '<'),
+//     normalize its new-lines to line feeds, and advance fPos past it.
+//     Returns an empty string when the character data matcher does not match.
+//
+UnicodeString
+UXMLParser::scanContent(UErrorCode &status) {
+    UnicodeString  text;
+    if (!mXMLCharData.lookingAt(fPos, status)) {
+        return text;
+    }
+    text = mXMLCharData.group((int32_t)0, status);
+    // New-line normalization comes before any char ref substitution.
+    mNewLineNormalizer.reset(text);
+    text = mNewLineNormalizer.replaceAll(fOneLF, status);
+    // TODO:  handle CDATA
+    fPos = mXMLCharData.end(0, status);
+    return text;
+}
+//
+//   replaceCharRefs
+//
+//      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
+//       with the corresponding actual character.
+//
+//      s is modified in place.  Unrecognized "&x" sequences are copied through
+//      unchanged.  A numeric reference above U+10FFFF produces no output.
+//      Nothing is done when status is already a failure on entry.
+//
+//      The output is assembled by hand rather than with appendReplacement(),
+//      because appendReplacement() gives '$' and '\' in the replacement text a
+//      special meaning, which would break references such as &#36; and &#92;.
+//
+void
+UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    UnicodeString result;
+    UnicodeString replacement;
+    int32_t i;
+    int32_t copiedUpTo = 0;    // index in s of the first char not yet copied to result
+    mAmps.reset(s);
+    // See the initialization for the regex matcher mAmps.
+    //    Which entity we've matched is determined by which capture group has content,
+    //      which is flagged by start() of that group not being -1.
+    while (mAmps.find()) {
+        if (mAmps.start(1, status) != -1) {
+            replacement.setTo((UChar)x_AMP);
+        } else if (mAmps.start(2, status) != -1) {
+            replacement.setTo((UChar)x_LT);
+        } else if (mAmps.start(3, status) != -1) {
+            replacement.setTo((UChar)x_GT);
+        } else if (mAmps.start(4, status) != -1) {
+            replacement.setTo((UChar)x_APOS);
+        } else if (mAmps.start(5, status) != -1) {
+            replacement.setTo((UChar)x_QUOT);
+        } else if (mAmps.start(6, status) != -1 || mAmps.start(7, status) != -1) {
+            // Numeric reference: group 6 holds hex digits, group 7 decimal digits.
+            const UBool isHex = (mAmps.start(6, status) != -1);
+            const int8_t radix = isHex ? 16 : 10;
+            UnicodeString digits = mAmps.group(isHex ? 6 : 7, status);
+            // Accumulate unsigned: eight hex digits do not fit a signed 32-bit value.
+            uint32_t val = 0;
+            for (i=0; i<digits.length(); i++) {
+                val = val*(uint32_t)radix + (uint32_t)u_digit(digits.charAt(i), radix);
+            }
+            // TODO:  some verification that the character is valid
+            if (val <= 0x10ffff) {
+                replacement.setTo((UChar32)val);
+            } else {
+                replacement.remove();   // not a code point: contributes nothing
+            }
+        } else {
+            // An unrecognized &entity;  Leave it alone.
+            //  TODO:  check that it really looks like an entity, and is not some
+            //         random & in the text.
+            replacement = mAmps.group((int32_t)0, status);
+        }
+        if (U_FAILURE(status)) {
+            return;     // leave s untouched rather than storing a partial result
+        }
+        // Copy the text between the previous match and this one, then the replacement.
+        const int32_t matchStart = mAmps.start(status);
+        result.append(s, copiedUpTo, matchStart - copiedUpTo);
+        result.append(replacement);
+        copiedUpTo = mAmps.end(status);
+    }
+    // Copy whatever follows the last match.
+    result.append(s, copiedUpTo, s.length() - copiedUpTo);
+    s = result;
+}
+void
+UXMLParser::error(const char *message, UErrorCode &status) {
+// TODO:  something better here...
+const UnicodeString &src=mXMLDecl.input();
+int  line = 0;
+int  ci = 0;
+while (ci < fPos && ci>=0) {
+ci = src.indexOf((UChar)0x0a, ci+1);
+line++;
+}
+fprintf(stderr, "Error: %s at line %d\n", message, line);
+if (U_SUCCESS(status)) {
+status = U_PARSE_ERROR;
+}
+}
+// intern strings like in Java
+// Returns the pointer to the key stored in fNames that equals s, adding s to
+// fNames first if it is not there yet.  Equal names therefore always yield the
+// same pointer, which lets callers compare names by pointer.
+// Returns NULL, with errorCode set to a failure, if the name could not be stored.
+const UnicodeString *
+UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
+    const UHashElement *he=fNames.find(s);
+    if(he==NULL) {
+        // add this new name
+        // The stored value is never read; a non-zero value is used because
+        // storing 0 with puti can mean "remove the key" in some uhash versions.
+        fNames.puti(s, 1, errorCode);
+        he=fNames.find(s);
+        if(he==NULL) {
+            // the insertion failed; do not dereference the missing element
+            if(U_SUCCESS(errorCode)) {
+                errorCode=U_MEMORY_ALLOCATION_ERROR;
+            }
+            return NULL;
+        }
+    }
+    // known (or newly added) name, return its hashed key pointer
+    return (const UnicodeString *)he->key.pointer;
+}
+// Looks s up in fNames without adding it.
+// Returns the stored key pointer for a known name, or NULL for an unknown one.
+const UnicodeString *
+UXMLParser::findName(const UnicodeString &s) const {
+    const UHashElement *entry=fNames.find(s);
+    return entry==NULL ? NULL : (const UnicodeString *)entry->key.pointer;
+}
+// UXMLElement ------------------------------------------------------------- ***
+// Constructor.  Stores the parser and name pointers as given (no copies are made),
+// constructs the attribute and child vectors with errorCode, and starts with no parent.
+UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
+fParser(parser),
+fName(name),
+fAttNames(errorCode),
+fAttValues(errorCode),
+fChildren(errorCode),
+fParent(NULL)
+{
+}
+// Destructor.  Deletes the attribute values and the child nodes, each from the
+// last entry to the first.  Attribute names belong to the UXMLParser and are
+// left alone.
+UXMLElement::~UXMLElement() {
+    int32_t idx = fAttValues.size();
+    while (--idx >= 0) {
+        delete (UObject *)fAttValues.elementAt(idx);
+    }
+    idx = fChildren.size();
+    while (--idx >= 0) {
+        delete (UObject *)fChildren.elementAt(idx);
+    }
+}
+// Returns a reference to this element's tag name (the string fName points to).
+const UnicodeString &
+UXMLElement::getTagName() const {
+return *fName;
+}
+// Returns the text of this element as a new string, collected by appendText();
+// recurse is passed through to appendText() unchanged.
+UnicodeString
+UXMLElement::getText(UBool recurse) const {
+    UnicodeString collected;
+    appendText(collected, recurse);
+    return collected;
+}
+// Appends the text children of this element to text, in child order.
+// When recurse is true, the text of child elements is appended as well, at the
+// position of each child element.
+void
+UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
+    const int32_t childCount=fChildren.size();
+    for(int32_t idx=0; idx<childCount; ++idx) {
+        const UObject *child=(const UObject *)fChildren.elementAt(idx);
+        const UnicodeString *str=dynamic_cast<const UnicodeString *>(child);
+        if(str!=NULL) {
+            text.append(*str);
+            continue;
+        }
+        if(recurse) {
+            // not a string, so it must be a UXMLElement
+            ((const UXMLElement *)child)->appendText(text, recurse);
+        }
+    }
+}
+// Returns the number of attributes stored on this element (the size of fAttNames).
+int32_t
+UXMLElement::countAttributes() const {
+return fAttNames.size();
+}
+// Copies the name and value of the i-th attribute into name and value.
+// Returns a pointer to value, or NULL (leaving both outputs untouched) when i
+// is out of range.
+const UnicodeString *
+UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
+    if(i<0 || i>=fAttNames.size()) {
+        return NULL;
+    }
+    const UnicodeString *storedName=(const UnicodeString *)fAttNames.elementAt(i);
+    name.setTo(*storedName);
+    const UnicodeString *storedValue=(const UnicodeString *)fAttValues.elementAt(i);
+    value.setTo(*storedValue);
+    return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
+}
+// Returns a pointer to the stored value of the first attribute called name,
+// or NULL when this element has no such attribute.
+// Names are matched by their interned pointer, not by string contents.
+const UnicodeString *
+UXMLElement::getAttribute(const UnicodeString &name) const {
+    const UnicodeString *interned=fParser->findName(name);
+    if(interned==NULL) {
+        return NULL; // the parser has never seen this name at all
+    }
+    const int32_t attrCount=fAttNames.size();
+    int32_t idx=0;
+    while(idx<attrCount) {
+        if((const UnicodeString *)fAttNames.elementAt(idx)==interned) {
+            return (const UnicodeString *)fAttValues.elementAt(idx);
+        }
+        ++idx;
+    }
+    return NULL;
+}
+// Returns the number of child nodes of this element (the size of fChildren).
+int32_t
+UXMLElement::countChildren() const {
+return fChildren.size();
+}
+// Returns the i-th child node and reports its kind in type:
+// UXML_NODE_TYPE_ELEMENT for a UXMLElement, UXML_NODE_TYPE_STRING otherwise.
+// Returns NULL, leaving type untouched, when i is out of range.
+const UObject *
+UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
+    if(i<0 || i>=fChildren.size()) {
+        return NULL;
+    }
+    const UObject *child=(const UObject *)fChildren.elementAt(i);
+    const UBool isElement=(dynamic_cast<const UXMLElement *>(child)!=NULL);
+    type= isElement ? UXML_NODE_TYPE_ELEMENT : UXML_NODE_TYPE_STRING;
+    return child;
+}
+// Returns the first child element at index i or later and leaves i just past
+// it, so that repeated calls walk through all child elements.
+// Returns NULL when i is negative (i is then untouched) or when no child
+// element is left; in the latter case i ends up at the child count.
+const UXMLElement *
+UXMLElement::nextChildElement(int32_t &i) const {
+    if(i<0) {
+        return NULL;
+    }
+    for(const int32_t childCount=fChildren.size(); i<childCount; ) {
+        const UObject *child=(const UObject *)fChildren.elementAt(i);
+        ++i;    // step past this child whether or not it is an element
+        const UXMLElement *asElement=dynamic_cast<const UXMLElement *>(child);
+        if(asElement!=NULL) {
+            return asElement;
+        }
+    }
+    return NULL;
+}
+// Returns the first child element whose tag name is name, or NULL when there
+// is none.  Names are matched by their interned pointer, not by string contents.
+const UXMLElement *
+UXMLElement::getChildElement(const UnicodeString &name) const {
+    const UnicodeString *interned=fParser->findName(name);
+    if(interned==NULL) {
+        return NULL; // the parser has never seen this name at all
+    }
+    const int32_t childCount=fChildren.size();
+    for(int32_t idx=0; idx<childCount; ++idx) {
+        const UObject *child=(const UObject *)fChildren.elementAt(idx);
+        const UXMLElement *asElement=dynamic_cast<const UXMLElement *>(child);
+        if(asElement==NULL) {
+            continue;   // a text node
+        }
+        if(asElement->fName==interned) {
+            return asElement;
+        }
+    }
+    return NULL;
+}
+U_NAMESPACE_END
+#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */

The Tor Browser / file comparison

comparison: intl/icu/source/tools/toolutil/xmlparser.cpp

intl/icu/source/tools/toolutil/xmlparser.cpp