intl/icu/source/tools/toolutil/xmlparser.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2004-2010, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  xmlparser.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2004jul21
    14 *   created by: Andy Heninger
    15 */
    17 #include <stdio.h>
    18 #include "unicode/uchar.h"
    19 #include "unicode/ucnv.h"
    20 #include "unicode/regex.h"
    21 #include "filestrm.h"
    22 #include "xmlparser.h"
    24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
    26 // Character constants: code points of the few characters this file needs to
       // name explicitly.  x_l is used by parseFile() to step past the "<?xml"
       // prefix; the others are the replacement characters produced by
       // replaceCharRefs() for the predefined entities.
    27 enum {
    28     x_QUOT=0x22,     // quotation mark, for &quot;
    29     x_AMP=0x26,      // ampersand, for &amp;
    30     x_APOS=0x27,     // apostrophe, for &apos;
    31     x_LT=0x3c,       // less-than sign, for &lt;
    32     x_GT=0x3e,       // greater-than sign, for &gt;
    33     x_l=0x6c         // lower case letter l
    34 };
    // Regular expression fragments, as narrow string literals, that are pasted
    // together (by string literal concatenation) into the patterns compiled by
    // the UXMLParser constructor.
    //
    // XML_SPACES: a set matching one XML white space character
    //             (space, tab, carriage return, line feed).
    36 #define  XML_SPACES "[ \\u0009\\u000d\\u000a]"
    38 // XML #4: a set matching one character that may start an XML name.
    39 #define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
    40                     "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
    41                     "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
    42                     "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"
    44 //  XML #5: a set matching one character that may appear inside an XML name;
       //          the name start characters plus '-', '.', digits and a few others.
    45 #define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
    47 //  XML #6: a complete XML name, one start character followed by zero or
       //          more name characters.  Contains no capture group of its own.
    48 #define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
    50 U_NAMESPACE_BEGIN
       // NOTE(review): these macros come from ICU headers that are not part of this
       //               file; presumably they supply the class ID (RTTI) members
       //               for the two classes -- confirm against unicode/uobject.h.
    52 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
    53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
    55 //
    56 //   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
    57 //                             used for parsing.
       //
       //   Every matcher is built from a pattern string, flags of 0 and the shared
       //   status argument, so the caller has to inspect status afterwards to find
       //   out whether all of the patterns were accepted.
    58 //
    59 UXMLParser::UXMLParser(UErrorCode &status) :
    60       //  XML Declaration.  XML Production #23.
    61       //      example:  <?xml version="1.0" encoding="utf-16" ?>
    62       //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
    63       //            allow for a possible leading BOM.
    64       mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
    66       //  XML Comment   production #15
    67       //     example:  <!-- whatever -->
    68       //       note, does not detect an illegal "--" within comments
    69       mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
    71       //  XML Spaces
    72       //      production [3]
    73       mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
    75       //  XML Doctype decl  production #28
    76       //     example   <!DOCTYPE foo SYSTEM "somewhere" >
    77       //       or      <!DOCTYPE foo [internal dtd]>
    78       //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
    79       //           Some internal dtd subsets could confuse this simple-minded
    80       //           attempt at skipping over them, specifically, occurrences
    81       //           of closing square brackets.  These could appear in comments,
    82       //           or in parameter entity declarations, for example.
    83       mXMLDoctype(UnicodeString(
    84            "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
    85            ), 0, status),
    87       //  XML PI     production #16
    88       //     example   <?target stuff?>
    89       mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
    91       //  XML Element Start   Productions #40, #41
    92       //          example   <foo att1='abc'  att2="d e f" >
    93       //      capture #1:  the tag name
    94       //
    95       mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
    96           "(?:" 
    97                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
    98                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
    99           ")*"                                                             //   * for zero or more attributes.
   100           XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"
   102       //  XML Element End     production #42
   103       //     example   </foo>
             //      capture #1:  the tag name
   104       mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
   106       // XML Element Empty    production #44
   107       //     example   <foo att1="abc"   att2="d e f" />
             //      capture #1:  the tag name
             //     Same pattern as the element start, except that it ends with "/>".
   108       mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
   109           "(?:" 
   110                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
   111                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
   112           ")*"                                                             //   * for zero or more attributes.
   113           XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"
   116       // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
   117       mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
   119       // Attribute name = "value".  XML Productions 10, 40/41
   120       //  Capture group 1 is name,
   121       //                2 is the attribute value, including the quotes.
   122       //
   123       //   Note that attributes are scanned twice.  The first time is with
   124       //        the regex for an entire element start.  There, the attributes
   125       //        are checked syntactically, but not separated out one by one.
   126       //        Here, we match a single attribute, and make its name and
   127       //        attribute value available to the parser code.
   128       mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
   129          "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
             // Match a single XML white space character.  createElement() uses this
             //   to turn every white space character of an attribute value into a
             //   plain space.
   132       mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
   134       // Match any of the new-line sequences in content.
   135       //   All are changed to \u000a.
   136       mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
   138       // & char references
   139       //   We will figure out what we've got based on which capture group has content.
   140       //   The last one is a catchall for unrecognized entity references..
             //   NOTE(review): unlike the patterns above, this UnicodeString is built
             //                 without the -1, US_INV arguments -- confirm that this
             //                 difference is intended.
   141       //             1     2     3      4      5           6                    7          8
   142       mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
   143                 0, status),
             // Table of the interned element and attribute names; see intern().
   145       fNames(status),
             // Stack of the enclosing elements while parse() is inside nested content.
   146       fElementStack(status),
   147       fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
   148       {
   149       }
   151 UXMLParser *
   152 UXMLParser::createParser(UErrorCode &errorCode) {
   153     if (U_FAILURE(errorCode)) {
   154         return NULL;
   155     } else {
   156         return new UXMLParser(errorCode);
   157     }
   158 }
   // Destructor.  The body is empty: there is no explicit cleanup here, the
   // data members are released by their own destructors.
   160 UXMLParser::~UXMLParser() {}
   162 UXMLElement *
   163 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
   164     char bytes[4096], charsetBuffer[100];
   165     FileStream *f;
   166     const char *charset, *pb;
   167     UnicodeString src;
   168     UConverter *cnv;
   169     UChar *buffer, *pu;
   170     int32_t fileLength, bytesLength, length, capacity;
   171     UBool flush;
   173     if(U_FAILURE(errorCode)) {
   174         return NULL;
   175     }
   177     f=T_FileStream_open(filename, "rb");
   178     if(f==NULL) {
   179         errorCode=U_FILE_ACCESS_ERROR;
   180         return NULL;
   181     }
   183     bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
   184     if(bytesLength<(int32_t)sizeof(bytes)) {
   185         // we have already read the entire file
   186         fileLength=bytesLength;
   187     } else {
   188         // get the file length
   189         fileLength=T_FileStream_size(f);
   190     }
   192     /*
   193      * get the charset:
   194      * 1. Unicode signature
   195      * 2. treat as ISO-8859-1 and read XML encoding="charser"
   196      * 3. default to UTF-8
   197      */
   198     charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
   199     if(U_SUCCESS(errorCode) && charset!=NULL) {
   200         // open converter according to Unicode signature
   201         cnv=ucnv_open(charset, &errorCode);
   202     } else {
   203         // read as Latin-1 and parse the XML declaration and encoding
   204         cnv=ucnv_open("ISO-8859-1", &errorCode);
   205         if(U_FAILURE(errorCode)) {
   206             // unexpected error opening Latin-1 converter
   207             goto exit;
   208         }
   210         buffer=src.getBuffer(bytesLength);
   211         if(buffer==NULL) {
   212             // unexpected failure to reserve some string capacity
   213             errorCode=U_MEMORY_ALLOCATION_ERROR;
   214             goto exit;
   215         }
   216         pb=bytes;
   217         pu=buffer;
   218         ucnv_toUnicode(
   219             cnv,
   220             &pu, buffer+src.getCapacity(),
   221             &pb, bytes+bytesLength,
   222             NULL, TRUE, &errorCode);
   223         src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
   224         ucnv_close(cnv);
   225         cnv=NULL;
   226         if(U_FAILURE(errorCode)) {
   227             // unexpected error in conversion from Latin-1
   228             src.remove();
   229             goto exit;
   230         }
   232         // parse XML declaration
   233         if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
   234             int32_t declEnd=mXMLDecl.end(errorCode);
   235             // go beyond <?xml
   236             int32_t pos=src.indexOf((UChar)x_l)+1;
   238             mAttrValue.reset(src);
   239             while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
   240                 UnicodeString attName  = mAttrValue.group(1, errorCode);
   241                 UnicodeString attValue = mAttrValue.group(2, errorCode);
   243                 // Trim the quotes from the att value.  These are left over from the original regex
   244                 //   that parsed the attribue, which couldn't conveniently strip them.
   245                 attValue.remove(0,1);                    // one char from the beginning
   246                 attValue.truncate(attValue.length()-1);  // and one from the end.
   248                 if(attName==UNICODE_STRING("encoding", 8)) {
   249                     length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
   250                     charset=charsetBuffer;
   251                     break;
   252                 }
   253                 pos = mAttrValue.end(2, errorCode);
   254             }
   256             if(charset==NULL) {
   257                 // default to UTF-8
   258                 charset="UTF-8";
   259             }
   260             cnv=ucnv_open(charset, &errorCode);
   261         }
   262     }
   264     if(U_FAILURE(errorCode)) {
   265         // unable to open the converter
   266         goto exit;
   267     }
   269     // convert the file contents
   270     capacity=fileLength;        // estimated capacity
   271     src.getBuffer(capacity);
   272     src.releaseBuffer(0);       // zero length
   273     flush=FALSE;
   274     for(;;) {
   275         // convert contents of bytes[bytesLength]
   276         pb=bytes;
   277         for(;;) {
   278             length=src.length();
   279             buffer=src.getBuffer(capacity);
   280             if(buffer==NULL) {
   281                 // unexpected failure to reserve some string capacity
   282                 errorCode=U_MEMORY_ALLOCATION_ERROR;
   283                 goto exit;
   284             }
   286             pu=buffer+length;
   287             ucnv_toUnicode(
   288                 cnv, &pu, buffer+src.getCapacity(),
   289                 &pb, bytes+bytesLength,
   290                 NULL, FALSE, &errorCode);
   291             src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
   292             if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
   293                 errorCode=U_ZERO_ERROR;
   294                 capacity=(3*src.getCapacity())/2; // increase capacity by 50%
   295             } else {
   296                 break;
   297             }
   298         }
   300         if(U_FAILURE(errorCode)) {
   301             break; // conversion error
   302         }
   304         if(flush) {
   305             break; // completely converted the file
   306         }
   308         // read next block
   309         bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
   310         if(bytesLength==0) {
   311             // reached end of file, convert once more to flush the converter
   312             flush=TRUE;
   313         }
   314     };
   316 exit:
   317     ucnv_close(cnv);
   318     T_FileStream_close(f);
   320     if(U_SUCCESS(errorCode)) {
   321         return parse(src, errorCode);
   322     } else {
   323         return NULL;
   324     }
   325 }
   327 UXMLElement *
   328 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
   329     if(U_FAILURE(status)) {
   330         return NULL;
   331     }
   333     UXMLElement   *root = NULL;
   334     fPos = 0; // TODO use just a local pos variable and pass it into functions
   335               // where necessary?
   337     // set all matchers to work on the input string
   338     mXMLDecl.reset(src);
   339     mXMLComment.reset(src);
   340     mXMLSP.reset(src);
   341     mXMLDoctype.reset(src);
   342     mXMLPI.reset(src);
   343     mXMLElemStart.reset(src);
   344     mXMLElemEnd.reset(src);
   345     mXMLElemEmpty.reset(src);
   346     mXMLCharData.reset(src);
   347     mAttrValue.reset(src);
   348     mAttrNormalizer.reset(src);
   349     mNewLineNormalizer.reset(src);
   350     mAmps.reset(src);
   352     // Consume the XML Declaration, if present.
   353     if (mXMLDecl.lookingAt(fPos, status)) {
   354         fPos = mXMLDecl.end(status);
   355     }
   357     // Consume "misc" [XML production 27] appearing before DocType
   358     parseMisc(status);
   360     // Consume a DocType declaration, if present.
   361     if (mXMLDoctype.lookingAt(fPos, status)) {
   362         fPos = mXMLDoctype.end(status);
   363     }
   365     // Consume additional "misc" [XML production 27] appearing after the DocType
   366     parseMisc(status);
   368     // Get the root element
   369     if (mXMLElemEmpty.lookingAt(fPos, status)) {
   370         // Root is an empty element (no nested elements or content)
   371         root = createElement(mXMLElemEmpty, status);
   372         fPos = mXMLElemEmpty.end(status);
   373     } else {
   374         if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
   375             error("Root Element expected", status);
   376             goto errorExit;
   377         }
   378         root = createElement(mXMLElemStart, status);
   379         UXMLElement  *el = root;
   381         //
   382         // This is the loop that consumes the root element of the document,
   383         //      including all nested content.   Nested elements are handled by
   384         //      explicit pushes/pops of the element stack; there is no recursion
   385         //      in the control flow of this code.
   386         //      "el" always refers to the current element, the one to which content
   387         //      is being added.  It is above the top of the element stack.
   388         for (;;) {
   389             // Nested Element Start
   390             if (mXMLElemStart.lookingAt(fPos, status)) {
   391                 UXMLElement *t = createElement(mXMLElemStart, status);
   392                 el->fChildren.addElement(t, status);
   393                 t->fParent = el;
   394                 fElementStack.push(el, status);
   395                 el = t;
   396                 continue;
   397             }
   399             // Text Content.  String is concatenated onto the current node's content,
   400             //                but only if it contains something other than spaces.
   401             UnicodeString s = scanContent(status);
   402             if (s.length() > 0) {
   403                 mXMLSP.reset(s);
   404                 if (mXMLSP.matches(status) == FALSE) {
   405                     // This chunk of text contains something other than just
   406                     //  white space. Make a child node for it.
   407                     replaceCharRefs(s, status);
   408                     el->fChildren.addElement(s.clone(), status);
   409                 }
   410                 mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
   411                 continue;
   412             }
   414             // Comments.  Discard.
   415             if (mXMLComment.lookingAt(fPos, status)) {
   416                 fPos = mXMLComment.end(status);
   417                 continue;
   418             }
   420             // PIs.  Discard.
   421             if (mXMLPI.lookingAt(fPos, status)) {
   422                 fPos = mXMLPI.end(status);
   423                 continue;
   424             }
   426             // Element End
   427             if (mXMLElemEnd.lookingAt(fPos, status)) {
   428                 fPos = mXMLElemEnd.end(0, status);
   429                 const UnicodeString name = mXMLElemEnd.group(1, status);
   430                 if (name != *el->fName) {
   431                     error("Element start / end tag mismatch", status);
   432                     goto errorExit;
   433                 }
   434                 if (fElementStack.empty()) {
   435                     // Close of the root element.  We're done with the doc.
   436                     el = NULL;
   437                     break;
   438                 }
   439                 el = (UXMLElement *)fElementStack.pop();
   440                 continue;
   441             }
   443             // Empty Element.  Stored as a child of the current element, but not stacked.
   444             if (mXMLElemEmpty.lookingAt(fPos, status)) {
   445                 UXMLElement *t = createElement(mXMLElemEmpty, status);
   446                 el->fChildren.addElement(t, status);
   447                 continue;
   448             }
   450             // Hit something within the document that doesn't match anything.
   451             //   It's an error.
   452             error("Unrecognized markup", status);
   453             break;
   454         }
   456         if (el != NULL || !fElementStack.empty()) {
   457             // We bailed out early, for some reason.
   458             error("Root element not closed.", status);
   459             goto errorExit;
   460         }
   461     }
   463     // Root Element parse is complete.
   464     // Consume the annoying xml "Misc" that can appear at the end of the doc.
   465     parseMisc(status);
   467     // We should have reached the end of the input
   468     if (fPos != src.length()) {
   469         error("Extra content at the end of the document", status);
   470         goto errorExit;
   471     }
   473     // Success!
   474     return root;
   476 errorExit:
   477     delete root;
   478     return NULL;
   479 }
   481 //
   482 //  createElement
   483 //      We've just matched an element start tag.  Create and fill in a UXMLElement object
   484 //      for it.
   485 //
   486 UXMLElement *
   487 UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
   488     // First capture group is the element's name.
   489     UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
   491     // Scan for attributes.
   492     int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name
   494     while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
   495         UnicodeString attName  = mAttrValue.group(1, status);
   496         UnicodeString attValue = mAttrValue.group(2, status);
   498         // Trim the quotes from the att value.  These are left over from the original regex
   499         //   that parsed the attribue, which couldn't conveniently strip them.
   500         attValue.remove(0,1);                    // one char from the beginning
   501         attValue.truncate(attValue.length()-1);  // and one from the end.
   503         // XML Attribue value normalization. 
   504         // This is one of the really screwy parts of the XML spec.
   505         // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
   506         // Note that non-validating parsers must treat all entities as type CDATA
   507         //   which simplifies things some.
   509         // Att normalization step 1:  normalize any newlines in the attribute value
   510         mNewLineNormalizer.reset(attValue);
   511         attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
   513         // Next change all xml white space chars to plain \u0020 spaces.
   514         mAttrNormalizer.reset(attValue);
   515         UnicodeString oneSpace((UChar)0x0020);
   516         attValue = mAttrNormalizer.replaceAll(oneSpace, status);
   518         // Replace character entities.
   519         replaceCharRefs(attValue, status);
   521         // Save the attribute name and value in our document structure.
   522         el->fAttNames.addElement((void *)intern(attName, status), status);
   523         el->fAttValues.addElement(attValue.clone(), status);
   524         pos = mAttrValue.end(2, status);
   525     }
   526     fPos = mEl.end(0, status);
   527     return el;
   528 }
   530 //
   531 //  parseMisc
   532 //     Consume XML "Misc" [production #27]
   533 //        which is any combination of space, PI and comments
   534 //      Need to watch end-of-input because xml MISC stuff is allowed after
   535 //        the document element, so we WILL scan off the end in this function
   536 //
   537 void
   538 UXMLParser::parseMisc(UErrorCode &status)  {
   539     for (;;) {
   540         if (fPos >= mXMLPI.input().length()) {
   541             break;
   542         }
   543         if (mXMLPI.lookingAt(fPos, status)) {
   544             fPos = mXMLPI.end(status);
   545             continue;
   546         }
   547         if (mXMLSP.lookingAt(fPos, status)) {
   548             fPos = mXMLSP.end(status);
   549             continue;
   550         }
   551         if (mXMLComment.lookingAt(fPos, status)) {
   552             fPos = mXMLComment.end(status);
   553             continue;
   554         }
   555         break;
   556     }
   557 }
   559 //
   560 //  Scan for document content.
   561 //
   562 UnicodeString
   563 UXMLParser::scanContent(UErrorCode &status) {
   564     UnicodeString  result;
   565     if (mXMLCharData.lookingAt(fPos, status)) {
   566         result = mXMLCharData.group((int32_t)0, status);
   567         // Normalize the new-lines.  (Before char ref substitution)
   568         mNewLineNormalizer.reset(result);
   569         result = mNewLineNormalizer.replaceAll(fOneLF, status);
   571         // TODO:  handle CDATA
   572         fPos = mXMLCharData.end(0, status);
   573     }
   575     return result;
   576 }
   578 //
   579 //   replaceCharRefs
   580 //
   581 //      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
   582 //       with the corresponding actual character.
   583 //
   584 void
   585 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
   586     UnicodeString result;
   587     UnicodeString replacement;
   588     int     i;
   590     mAmps.reset(s);
   591     // See the initialization for the regex matcher mAmps.
   592     //    Which entity we've matched is determined by which capture group has content,
   593     //      which is flaged by start() of that group not being -1.
   594     while (mAmps.find()) {
   595         if (mAmps.start(1, status) != -1) {
   596             replacement.setTo((UChar)x_AMP);
   597         } else if (mAmps.start(2, status) != -1) {
   598             replacement.setTo((UChar)x_LT);
   599         } else if (mAmps.start(3, status) != -1) {
   600             replacement.setTo((UChar)x_GT);
   601         } else if (mAmps.start(4, status) != -1) {
   602             replacement.setTo((UChar)x_APOS);
   603         } else if (mAmps.start(5, status) != -1) {
   604             replacement.setTo((UChar)x_QUOT);
   605         } else if (mAmps.start(6, status) != -1) {
   606             UnicodeString hexString = mAmps.group(6, status);
   607             UChar32 val = 0;
   608             for (i=0; i<hexString.length(); i++) {
   609                 val = (val << 4) + u_digit(hexString.charAt(i), 16);
   610             }
   611             // TODO:  some verification that the character is valid
   612             replacement.setTo(val);
   613         } else if (mAmps.start(7, status) != -1) {
   614             UnicodeString decimalString = mAmps.group(7, status);
   615             UChar32 val = 0;
   616             for (i=0; i<decimalString.length(); i++) {
   617                 val = val*10 + u_digit(decimalString.charAt(i), 10);
   618             }
   619             // TODO:  some verification that the character is valid
   620             replacement.setTo(val);
   621         } else {
   622             // An unrecognized &entity;  Leave it alone.
   623             //  TODO:  check that it really looks like an entity, and is not some
   624             //         random & in the text.
   625             replacement = mAmps.group((int32_t)0, status);
   626         }
   627         mAmps.appendReplacement(result, replacement, status);
   628     }
   629     mAmps.appendTail(result);
   630     s = result;
   631 }
   633 void
   634 UXMLParser::error(const char *message, UErrorCode &status) {
   635     // TODO:  something better here...
   636     const UnicodeString &src=mXMLDecl.input();
   637     int  line = 0;
   638     int  ci = 0;
   639     while (ci < fPos && ci>=0) {
   640         ci = src.indexOf((UChar)0x0a, ci+1);
   641         line++;
   642     }
   643     fprintf(stderr, "Error: %s at line %d\n", message, line);
   644     if (U_SUCCESS(status)) {
   645         status = U_PARSE_ERROR;
   646     }
   647 }
   649 // intern strings like in Java
   651 const UnicodeString *
   652 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
   653     const UHashElement *he=fNames.find(s);
   654     if(he!=NULL) {
   655         // already a known name, return its hashed key pointer
   656         return (const UnicodeString *)he->key.pointer;
   657     } else {
   658         // add this new name and return its hashed key pointer
   659         fNames.puti(s, 0, errorCode);
   660         he=fNames.find(s);
   661         return (const UnicodeString *)he->key.pointer;
   662     }
   663 }
   665 const UnicodeString *
   666 UXMLParser::findName(const UnicodeString &s) const {
   667     const UHashElement *he=fNames.find(s);
   668     if(he!=NULL) {
   669         // a known name, return its hashed key pointer
   670         return (const UnicodeString *)he->key.pointer;
   671     } else {
   672         // unknown name
   673         return NULL;
   674     }
   675 }
   677 // UXMLElement ------------------------------------------------------------- ***
   679 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
   680    fParser(parser),
   681    fName(name),
   682    fAttNames(errorCode),
   683    fAttValues(errorCode),
   684    fChildren(errorCode),
   685    fParent(NULL)
   686 {
   687 }
   689 UXMLElement::~UXMLElement() {
   690     int   i;
   691     // attribute names are owned by the UXMLParser, don't delete them here
   692     for (i=fAttValues.size()-1; i>=0; i--) {
   693         delete (UObject *)fAttValues.elementAt(i);
   694     }
   695     for (i=fChildren.size()-1; i>=0; i--) {
   696         delete (UObject *)fChildren.elementAt(i);
   697     }
   698 }
   700 const UnicodeString &
   701 UXMLElement::getTagName() const {
   702     return *fName;
   703 }
   705 UnicodeString
   706 UXMLElement::getText(UBool recurse) const {
   707     UnicodeString text;
   708     appendText(text, recurse);
   709     return text;
   710 }
   712 void
   713 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
   714     const UObject *node;
   715     int32_t i, count=fChildren.size();
   716     for(i=0; i<count; ++i) {
   717         node=(const UObject *)fChildren.elementAt(i);
   718         const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
   719         if(s!=NULL) {
   720             text.append(*s);
   721         } else if(recurse) /* must be a UXMLElement */ {
   722             ((const UXMLElement *)node)->appendText(text, recurse);
   723         }
   724     }
   725 }
   727 int32_t
   728 UXMLElement::countAttributes() const {
   729     return fAttNames.size();
   730 }
   732 const UnicodeString *
   733 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
   734     if(0<=i && i<fAttNames.size()) {
   735         name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
   736         value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
   737         return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
   738     } else {
   739         return NULL;
   740     }
   741 }
   743 const UnicodeString *
   744 UXMLElement::getAttribute(const UnicodeString &name) const {
   745     // search for the attribute name by comparing the interned pointer,
   746     // not the string contents
   747     const UnicodeString *p=fParser->findName(name);
   748     if(p==NULL) {
   749         return NULL; // no such attribute seen by the parser at all
   750     }
   752     int32_t i, count=fAttNames.size();
   753     for(i=0; i<count; ++i) {
   754         if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
   755             return (const UnicodeString *)fAttValues.elementAt(i);
   756         }
   757     }
   758     return NULL;
   759 }
   761 int32_t
   762 UXMLElement::countChildren() const {
   763     return fChildren.size();
   764 }
   766 const UObject *
   767 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
   768     if(0<=i && i<fChildren.size()) {
   769         const UObject *node=(const UObject *)fChildren.elementAt(i);
   770         if(dynamic_cast<const UXMLElement *>(node)!=NULL) {
   771             type=UXML_NODE_TYPE_ELEMENT;
   772         } else {
   773             type=UXML_NODE_TYPE_STRING;
   774         }
   775         return node;
   776     } else {
   777         return NULL;
   778     }
   779 }
   781 const UXMLElement *
   782 UXMLElement::nextChildElement(int32_t &i) const {
   783     if(i<0) {
   784         return NULL;
   785     }
   787     const UObject *node;
   788     int32_t count=fChildren.size();
   789     while(i<count) {
   790         node=(const UObject *)fChildren.elementAt(i++);
   791         const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
   792         if(elem!=NULL) {
   793             return elem;
   794         }
   795     }
   796     return NULL;
   797 }
   799 const UXMLElement *
   800 UXMLElement::getChildElement(const UnicodeString &name) const {
   801     // search for the element name by comparing the interned pointer,
   802     // not the string contents
   803     const UnicodeString *p=fParser->findName(name);
   804     if(p==NULL) {
   805         return NULL; // no such element seen by the parser at all
   806     }
   808     const UObject *node;
   809     int32_t i, count=fChildren.size();
   810     for(i=0; i<count; ++i) {
   811         node=(const UObject *)fChildren.elementAt(i);
   812         const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
   813         if(elem!=NULL) {
   814             if(p==elem->fName) {
   815                 return elem;
   816             }
   817         }
   818     }
   819     return NULL;
   820 }
   822 U_NAMESPACE_END
   824 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */

mercurial