The Tor Browser: intl/icu/source/tools/toolutil/xmlparser.cpp@6474c204b198

Cloned from the upstream tor-browser origin at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /*

     2 *******************************************************************************

3 *

     4 *   Copyright (C) 2004-2010, International Business Machines

     5 *   Corporation and others.  All Rights Reserved.

6 *

     7 *******************************************************************************

     8 *   file name:  xmlparser.cpp

     9 *   encoding:   US-ASCII

    10 *   tab size:   8 (not used)

    11 *   indentation:4

    12 *

    13 *   created on: 2004jul21

    14 *   created by: Andy Heninger

    15 */

    17 #include <stdio.h>

    18 #include "unicode/uchar.h"

    19 #include "unicode/ucnv.h"

    20 #include "unicode/regex.h"

    21 #include "filestrm.h"

    22 #include "xmlparser.h"

    24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION

    26 // character constants
    //    Code unit values for the ASCII characters that the parser code refers to
    //    by name (x_l is used in parseFile(), the others in replaceCharRefs()).

    27 enum {

    28     x_QUOT=0x22,   // '"'  quotation mark

    29     x_AMP=0x26,    // '&'  ampersand

    30     x_APOS=0x27,   // '\'' apostrophe

    31     x_LT=0x3c,     // '<'  less-than sign

    32     x_GT=0x3e,     // '>'  greater-than sign

    33     x_l=0x6c       // 'l'  lower case L; parseFile() searches for it to step past "<?xml"

    34 };

    // Regex set matching one XML white space character: space, tab (U+0009),
    //   carriage return (U+000D) or line feed (U+000A).
    36 #define  XML_SPACES "[ \\u0009\\u000d\\u000a]"

    38 // XML #4   Regex set of the characters that may start an XML name.

    39 #define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \

    40                     "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \

    41                     "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \

    42                     "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

    44 //  XML #5   Regex set of the characters that may continue an XML name:  every name start
    //           character plus '-', '.', the digits 0-9, U+00B7, U+0300-U+036F and U+203F-U+2040.

    45 #define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

    47 //  XML #6   Regex for a complete XML name:  one name start character followed by
    //           zero or more name characters.  Contains no capture group of its own.

    48 #define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"

    50 U_NAMESPACE_BEGIN

    52 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)

    53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)

    55 //

    56 //   UXMLParser constructor.   Mostly just initializes the ICU regexes that are

    57 //                             used for parsing.
       //
       //   The body is empty; all work happens in the member initializer list.
       //   Every regex matcher, the fNames table and the element stack are constructed
       //   with the caller's status, so a problem with any of them is reported
       //   through status.

    58 //

    59 UXMLParser::UXMLParser(UErrorCode &status) :

    60       //  XML Declaration.  XML Production #23.

    61       //      example:  "<?xml version=1.0 encoding="utf-16" ?>

    62       //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>

    63       //            allow for a possible leading BOM.

    64       mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),

    66       //  XML Comment   production #15

    67       //     example:  "<!-- whatever -->

    68       //       note, does not detect an illegal "--" within comments

    69       mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),

    71       //  XML Spaces

    72       //      production [3]
             //      One or more consecutive XML white space characters.

    73       mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),

    75       //  XML Doctype decl  production #28

    76       //     example   "<!DOCTYPE foo SYSTEM "somewhere" >

    77       //       or      "<!DOCTYPE foo [internal dtd]>

    78       //    TODO:  we don't actually parse the DOCTYPE or internal subsets.

    79       //           Some internal dtd subsets could confuse this simple-minded

    80       //           attempt at skipping over them, specifically, occurrences

    81       //           of closing square brackets.  These could appear in comments,

    82       //           or in parameter entity declarations, for example.

    83       mXMLDoctype(UnicodeString(

    84            "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV

    85            ), 0, status),

    87       //  XML PI     production #16

    88       //     example   "<?target stuff?>
             //     Note that this pattern also matches an XML declaration, which
             //       starts with "<?" and ends with "?>" as well.

    89       mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),

    91       //  XML Element Start   Productions #40, #41

    92       //          example   <foo att1='abc'  att2="d e f" >

    93       //      capture #1:  the tag name

    94       //

    95       mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"

    96           "(?:"

    97                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "

    98                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'

    99           ")*"                                                             //   * for zero or more attributes.

   100           XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"

   102       //  XML Element End     production #42

   103       //     example   </foo>
             //     capture #1:  the tag name

   104       mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),

   106       // XML Element Empty    production #44

   107       //     example   <foo att1="abc"   att2="d e f" />
             //     capture #1:  the tag name
             //     Same pattern as mXMLElemStart except that it must end with "/>".

   108       mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"

   109           "(?:"

   110                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "

   111                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'

   112           ")*"                                                             //   * for zero or more attributes.

   113           XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"

   116       // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
             //     The pattern can match the empty string.

   117       mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),

   119       // Attribute name = "value".  XML Productions 10, 40/41

   120       //  Capture group 1 is name,

   121       //                2 is the attribute value, including the quotes.

   122       //

   123       //   Note that attributes are scanned twice.  The first time is with

   124       //        the regex for an entire element start.  There, the attributes

   125       //        are checked syntactically, but not separated out one by one.

   126       //        Here, we match a single attribute, and make its name and

   127       //        attribute value available to the parser code.

   128       mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"

   129          "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),

             // Matches a single XML white space character.  createElement() uses it to
             //   replace each white space character in an attribute value with U+0020.
   132       mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),

   134       // Match any of the new-line sequences in content.

   135       //   All are changed to \u000a.

   136       mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),

   138       // & char references

   139       //   We will figure out what we've got based on which capture group has content.

   140       //   The last one is a catchall for unrecognized entity references..
             //   NOTE(review): unlike the other patterns, this string is constructed without
             //     US_INV - presumably deliberate; confirm before changing.

   141       //             1     2     3      4      5           6                    7          8

   142       mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),

   143                 0, status),

             // Table of interned element and attribute names; see intern() and findName().
   145       fNames(status),

             // Stack of the enclosing elements of the element currently being parsed; see parse().
   146       fElementStack(status),

   147       fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.

   148       {

   149       }

   151 UXMLParser *

   152 UXMLParser::createParser(UErrorCode &errorCode) {

   153     if (U_FAILURE(errorCode)) {

   154         return NULL;

   155     } else {

   156         return new UXMLParser(errorCode);

   157     }

   158 }

   160 UXMLParser::~UXMLParser() {}

   162 UXMLElement *

   163 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {

   164     char bytes[4096], charsetBuffer[100];

   165     FileStream *f;

   166     const char *charset, *pb;

   167     UnicodeString src;

   168     UConverter *cnv;

   169     UChar *buffer, *pu;

   170     int32_t fileLength, bytesLength, length, capacity;

   171     UBool flush;

   173     if(U_FAILURE(errorCode)) {

   174         return NULL;

   175     }

   177     f=T_FileStream_open(filename, "rb");

   178     if(f==NULL) {

   179         errorCode=U_FILE_ACCESS_ERROR;

   180         return NULL;

   181     }

   183     bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));

   184     if(bytesLength<(int32_t)sizeof(bytes)) {

   185         // we have already read the entire file

   186         fileLength=bytesLength;

   187     } else {

   188         // get the file length

   189         fileLength=T_FileStream_size(f);

   190     }

   192     /*

   193      * get the charset:

   194      * 1. Unicode signature

   195      * 2. treat as ISO-8859-1 and read XML encoding="charser"

   196      * 3. default to UTF-8

   197      */

   198     charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);

   199     if(U_SUCCESS(errorCode) && charset!=NULL) {

   200         // open converter according to Unicode signature

   201         cnv=ucnv_open(charset, &errorCode);

   202     } else {

   203         // read as Latin-1 and parse the XML declaration and encoding

   204         cnv=ucnv_open("ISO-8859-1", &errorCode);

   205         if(U_FAILURE(errorCode)) {

   206             // unexpected error opening Latin-1 converter

   207             goto exit;

   208         }

   210         buffer=src.getBuffer(bytesLength);

   211         if(buffer==NULL) {

   212             // unexpected failure to reserve some string capacity

   213             errorCode=U_MEMORY_ALLOCATION_ERROR;

   214             goto exit;

   215         }

   216         pb=bytes;

   217         pu=buffer;

   218         ucnv_toUnicode(

   219             cnv,

   220             &pu, buffer+src.getCapacity(),

   221             &pb, bytes+bytesLength,

   222             NULL, TRUE, &errorCode);

   223         src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);

   224         ucnv_close(cnv);

   225         cnv=NULL;

   226         if(U_FAILURE(errorCode)) {

   227             // unexpected error in conversion from Latin-1

   228             src.remove();

   229             goto exit;

   230         }

   232         // parse XML declaration

   233         if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {

   234             int32_t declEnd=mXMLDecl.end(errorCode);

   235             // go beyond <?xml

   236             int32_t pos=src.indexOf((UChar)x_l)+1;

   238             mAttrValue.reset(src);

   239             while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.

   240                 UnicodeString attName  = mAttrValue.group(1, errorCode);

   241                 UnicodeString attValue = mAttrValue.group(2, errorCode);

   243                 // Trim the quotes from the att value.  These are left over from the original regex

   244                 //   that parsed the attribue, which couldn't conveniently strip them.

   245                 attValue.remove(0,1);                    // one char from the beginning

   246                 attValue.truncate(attValue.length()-1);  // and one from the end.

   248                 if(attName==UNICODE_STRING("encoding", 8)) {

   249                     length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));

   250                     charset=charsetBuffer;

   251                     break;

   252                 }

   253                 pos = mAttrValue.end(2, errorCode);

   254             }

   256             if(charset==NULL) {

   257                 // default to UTF-8

   258                 charset="UTF-8";

   259             }

   260             cnv=ucnv_open(charset, &errorCode);

   261         }

   262     }

   264     if(U_FAILURE(errorCode)) {

   265         // unable to open the converter

   266         goto exit;

   267     }

   269     // convert the file contents

   270     capacity=fileLength;        // estimated capacity

   271     src.getBuffer(capacity);

   272     src.releaseBuffer(0);       // zero length

   273     flush=FALSE;

   274     for(;;) {

   275         // convert contents of bytes[bytesLength]

   276         pb=bytes;

   277         for(;;) {

   278             length=src.length();

   279             buffer=src.getBuffer(capacity);

   280             if(buffer==NULL) {

   281                 // unexpected failure to reserve some string capacity

   282                 errorCode=U_MEMORY_ALLOCATION_ERROR;

   283                 goto exit;

   284             }

   286             pu=buffer+length;

   287             ucnv_toUnicode(

   288                 cnv, &pu, buffer+src.getCapacity(),

   289                 &pb, bytes+bytesLength,

   290                 NULL, FALSE, &errorCode);

   291             src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);

   292             if(errorCode==U_BUFFER_OVERFLOW_ERROR) {

   293                 errorCode=U_ZERO_ERROR;

   294                 capacity=(3*src.getCapacity())/2; // increase capacity by 50%

   295             } else {

   296                 break;

   297             }

   298         }

   300         if(U_FAILURE(errorCode)) {

   301             break; // conversion error

   302         }

   304         if(flush) {

   305             break; // completely converted the file

   306         }

   308         // read next block

   309         bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));

   310         if(bytesLength==0) {

   311             // reached end of file, convert once more to flush the converter

   312             flush=TRUE;

   313         }

   314     };

   316 exit:

   317     ucnv_close(cnv);

   318     T_FileStream_close(f);

   320     if(U_SUCCESS(errorCode)) {

   321         return parse(src, errorCode);

   322     } else {

   323         return NULL;

   324     }

   325 }

   327 UXMLElement *

   328 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {

   329     if(U_FAILURE(status)) {

   330         return NULL;

   331     }

   333     UXMLElement   *root = NULL;

   334     fPos = 0; // TODO use just a local pos variable and pass it into functions

   335               // where necessary?

   337     // set all matchers to work on the input string

   338     mXMLDecl.reset(src);

   339     mXMLComment.reset(src);

   340     mXMLSP.reset(src);

   341     mXMLDoctype.reset(src);

   342     mXMLPI.reset(src);

   343     mXMLElemStart.reset(src);

   344     mXMLElemEnd.reset(src);

   345     mXMLElemEmpty.reset(src);

   346     mXMLCharData.reset(src);

   347     mAttrValue.reset(src);

   348     mAttrNormalizer.reset(src);

   349     mNewLineNormalizer.reset(src);

   350     mAmps.reset(src);

   352     // Consume the XML Declaration, if present.

   353     if (mXMLDecl.lookingAt(fPos, status)) {

   354         fPos = mXMLDecl.end(status);

   355     }

   357     // Consume "misc" [XML production 27] appearing before DocType

   358     parseMisc(status);

   360     // Consume a DocType declaration, if present.

   361     if (mXMLDoctype.lookingAt(fPos, status)) {

   362         fPos = mXMLDoctype.end(status);

   363     }

   365     // Consume additional "misc" [XML production 27] appearing after the DocType

   366     parseMisc(status);

   368     // Get the root element

   369     if (mXMLElemEmpty.lookingAt(fPos, status)) {

   370         // Root is an empty element (no nested elements or content)

   371         root = createElement(mXMLElemEmpty, status);

   372         fPos = mXMLElemEmpty.end(status);

   373     } else {

   374         if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {

   375             error("Root Element expected", status);

   376             goto errorExit;

   377         }

   378         root = createElement(mXMLElemStart, status);

   379         UXMLElement  *el = root;

   381         //

   382         // This is the loop that consumes the root element of the document,

   383         //      including all nested content.   Nested elements are handled by

   384         //      explicit pushes/pops of the element stack; there is no recursion

   385         //      in the control flow of this code.

   386         //      "el" always refers to the current element, the one to which content

   387         //      is being added.  It is above the top of the element stack.

   388         for (;;) {

   389             // Nested Element Start

   390             if (mXMLElemStart.lookingAt(fPos, status)) {

   391                 UXMLElement *t = createElement(mXMLElemStart, status);

   392                 el->fChildren.addElement(t, status);

   393                 t->fParent = el;

   394                 fElementStack.push(el, status);

   395                 el = t;

   396                 continue;

   397             }

   399             // Text Content.  String is concatenated onto the current node's content,

   400             //                but only if it contains something other than spaces.

   401             UnicodeString s = scanContent(status);

   402             if (s.length() > 0) {

   403                 mXMLSP.reset(s);

   404                 if (mXMLSP.matches(status) == FALSE) {

   405                     // This chunk of text contains something other than just

   406                     //  white space. Make a child node for it.

   407                     replaceCharRefs(s, status);

   408                     el->fChildren.addElement(s.clone(), status);

   409                 }

   410                 mXMLSP.reset(src);    // The matchers need to stay set to the main input string.

   411                 continue;

   412             }

   414             // Comments.  Discard.

   415             if (mXMLComment.lookingAt(fPos, status)) {

   416                 fPos = mXMLComment.end(status);

   417                 continue;

   418             }

   420             // PIs.  Discard.

   421             if (mXMLPI.lookingAt(fPos, status)) {

   422                 fPos = mXMLPI.end(status);

   423                 continue;

   424             }

   426             // Element End

   427             if (mXMLElemEnd.lookingAt(fPos, status)) {

   428                 fPos = mXMLElemEnd.end(0, status);

   429                 const UnicodeString name = mXMLElemEnd.group(1, status);

   430                 if (name != *el->fName) {

   431                     error("Element start / end tag mismatch", status);

   432                     goto errorExit;

   433                 }

   434                 if (fElementStack.empty()) {

   435                     // Close of the root element.  We're done with the doc.

   436                     el = NULL;

   437                     break;

   438                 }

   439                 el = (UXMLElement *)fElementStack.pop();

   440                 continue;

   441             }

   443             // Empty Element.  Stored as a child of the current element, but not stacked.

   444             if (mXMLElemEmpty.lookingAt(fPos, status)) {

   445                 UXMLElement *t = createElement(mXMLElemEmpty, status);

   446                 el->fChildren.addElement(t, status);

   447                 continue;

   448             }

   450             // Hit something within the document that doesn't match anything.

   451             //   It's an error.

   452             error("Unrecognized markup", status);

   453             break;

   454         }

   456         if (el != NULL || !fElementStack.empty()) {

   457             // We bailed out early, for some reason.

   458             error("Root element not closed.", status);

   459             goto errorExit;

   460         }

   461     }

   463     // Root Element parse is complete.

   464     // Consume the annoying xml "Misc" that can appear at the end of the doc.

   465     parseMisc(status);

   467     // We should have reached the end of the input

   468     if (fPos != src.length()) {

   469         error("Extra content at the end of the document", status);

   470         goto errorExit;

   471     }

   473     // Success!

   474     return root;

   476 errorExit:

   477     delete root;

   478     return NULL;

   479 }

   481 //

   482 //  createElement

   483 //      We've just matched an element start tag.  Create and fill in a UXMLElement object

   484 //      for it.

   485 //

   486 UXMLElement *

   487 UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {

   488     // First capture group is the element's name.

   489     UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);

   491     // Scan for attributes.

   492     int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name

   494     while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.

   495         UnicodeString attName  = mAttrValue.group(1, status);

   496         UnicodeString attValue = mAttrValue.group(2, status);

   498         // Trim the quotes from the att value.  These are left over from the original regex

   499         //   that parsed the attribue, which couldn't conveniently strip them.

   500         attValue.remove(0,1);                    // one char from the beginning

   501         attValue.truncate(attValue.length()-1);  // and one from the end.

   503         // XML Attribue value normalization.

   504         // This is one of the really screwy parts of the XML spec.

   505         // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize

   506         // Note that non-validating parsers must treat all entities as type CDATA

   507         //   which simplifies things some.

   509         // Att normalization step 1:  normalize any newlines in the attribute value

   510         mNewLineNormalizer.reset(attValue);

   511         attValue = mNewLineNormalizer.replaceAll(fOneLF, status);

   513         // Next change all xml white space chars to plain \u0020 spaces.

   514         mAttrNormalizer.reset(attValue);

   515         UnicodeString oneSpace((UChar)0x0020);

   516         attValue = mAttrNormalizer.replaceAll(oneSpace, status);

   518         // Replace character entities.

   519         replaceCharRefs(attValue, status);

   521         // Save the attribute name and value in our document structure.

   522         el->fAttNames.addElement((void *)intern(attName, status), status);

   523         el->fAttValues.addElement(attValue.clone(), status);

   524         pos = mAttrValue.end(2, status);

   525     }

   526     fPos = mEl.end(0, status);

   527     return el;

   528 }

   530 //

   531 //  parseMisc

   532 //     Consume XML "Misc" [production #27]

   533 //        which is any combination of space, PI and comments

   534 //      Need to watch end-of-input because xml MISC stuff is allowed after

   535 //        the document element, so we WILL scan off the end in this function

   536 //

   537 void

   538 UXMLParser::parseMisc(UErrorCode &status)  {

   539     for (;;) {

   540         if (fPos >= mXMLPI.input().length()) {

   541             break;

   542         }

   543         if (mXMLPI.lookingAt(fPos, status)) {

   544             fPos = mXMLPI.end(status);

   545             continue;

   546         }

   547         if (mXMLSP.lookingAt(fPos, status)) {

   548             fPos = mXMLSP.end(status);

   549             continue;

   550         }

   551         if (mXMLComment.lookingAt(fPos, status)) {

   552             fPos = mXMLComment.end(status);

   553             continue;

   554         }

   555         break;

   556     }

   557 }

   559 //

   560 //  Scan for document content.

   561 //

   562 UnicodeString

   563 UXMLParser::scanContent(UErrorCode &status) {

   564     UnicodeString  result;

   565     if (mXMLCharData.lookingAt(fPos, status)) {

   566         result = mXMLCharData.group((int32_t)0, status);

   567         // Normalize the new-lines.  (Before char ref substitution)

   568         mNewLineNormalizer.reset(result);

   569         result = mNewLineNormalizer.replaceAll(fOneLF, status);

   571         // TODO:  handle CDATA

   572         fPos = mXMLCharData.end(0, status);

   573     }

   575     return result;

   576 }

   578 //

   579 //   replaceCharRefs

   580 //

   581 //      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string

   582 //       with the corresponding actual character.

   583 //

   584 void

   585 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {

   586     UnicodeString result;

   587     UnicodeString replacement;

   588     int     i;

   590     mAmps.reset(s);

   591     // See the initialization for the regex matcher mAmps.

   592     //    Which entity we've matched is determined by which capture group has content,

   593     //      which is flaged by start() of that group not being -1.

   594     while (mAmps.find()) {

   595         if (mAmps.start(1, status) != -1) {

   596             replacement.setTo((UChar)x_AMP);

   597         } else if (mAmps.start(2, status) != -1) {

   598             replacement.setTo((UChar)x_LT);

   599         } else if (mAmps.start(3, status) != -1) {

   600             replacement.setTo((UChar)x_GT);

   601         } else if (mAmps.start(4, status) != -1) {

   602             replacement.setTo((UChar)x_APOS);

   603         } else if (mAmps.start(5, status) != -1) {

   604             replacement.setTo((UChar)x_QUOT);

   605         } else if (mAmps.start(6, status) != -1) {

   606             UnicodeString hexString = mAmps.group(6, status);

   607             UChar32 val = 0;

   608             for (i=0; i<hexString.length(); i++) {

   609                 val = (val << 4) + u_digit(hexString.charAt(i), 16);

   610             }

   611             // TODO:  some verification that the character is valid

   612             replacement.setTo(val);

   613         } else if (mAmps.start(7, status) != -1) {

   614             UnicodeString decimalString = mAmps.group(7, status);

   615             UChar32 val = 0;

   616             for (i=0; i<decimalString.length(); i++) {

   617                 val = val*10 + u_digit(decimalString.charAt(i), 10);

   618             }

   619             // TODO:  some verification that the character is valid

   620             replacement.setTo(val);

   621         } else {

   622             // An unrecognized &entity;  Leave it alone.

   623             //  TODO:  check that it really looks like an entity, and is not some

   624             //         random & in the text.

   625             replacement = mAmps.group((int32_t)0, status);

   626         }

   627         mAmps.appendReplacement(result, replacement, status);

   628     }

   629     mAmps.appendTail(result);

   630     s = result;

   631 }

   633 void

   634 UXMLParser::error(const char *message, UErrorCode &status) {

   635     // TODO:  something better here...

   636     const UnicodeString &src=mXMLDecl.input();

   637     int  line = 0;

   638     int  ci = 0;

   639     while (ci < fPos && ci>=0) {

   640         ci = src.indexOf((UChar)0x0a, ci+1);

   641         line++;

   642     }

   643     fprintf(stderr, "Error: %s at line %d\n", message, line);

   644     if (U_SUCCESS(status)) {

   645         status = U_PARSE_ERROR;

   646     }

   647 }

   649 // intern strings like in Java

   651 const UnicodeString *

   652 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {

   653     const UHashElement *he=fNames.find(s);

   654     if(he!=NULL) {

   655         // already a known name, return its hashed key pointer

   656         return (const UnicodeString *)he->key.pointer;

   657     } else {

   658         // add this new name and return its hashed key pointer

   659         fNames.puti(s, 0, errorCode);

   660         he=fNames.find(s);

   661         return (const UnicodeString *)he->key.pointer;

   662     }

   663 }

   665 const UnicodeString *

   666 UXMLParser::findName(const UnicodeString &s) const {

   667     const UHashElement *he=fNames.find(s);

   668     if(he!=NULL) {

   669         // a known name, return its hashed key pointer

   670         return (const UnicodeString *)he->key.pointer;

   671     } else {

   672         // unknown name

   673         return NULL;

   674     }

   675 }

   677 // UXMLElement ------------------------------------------------------------- ***

   679 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :

   680    fParser(parser),

   681    fName(name),

   682    fAttNames(errorCode),

   683    fAttValues(errorCode),

   684    fChildren(errorCode),

   685    fParent(NULL)

   686 {

   687 }

   689 UXMLElement::~UXMLElement() {

   690     int   i;

   691     // attribute names are owned by the UXMLParser, don't delete them here

   692     for (i=fAttValues.size()-1; i>=0; i--) {

   693         delete (UObject *)fAttValues.elementAt(i);

   694     }

   695     for (i=fChildren.size()-1; i>=0; i--) {

   696         delete (UObject *)fChildren.elementAt(i);

   697     }

   698 }

   700 const UnicodeString &

   701 UXMLElement::getTagName() const {

   702     return *fName;

   703 }

   705 UnicodeString

   706 UXMLElement::getText(UBool recurse) const {

   707     UnicodeString text;

   708     appendText(text, recurse);

   709     return text;

   710 }

   712 void

   713 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {

   714     const UObject *node;

   715     int32_t i, count=fChildren.size();

   716     for(i=0; i<count; ++i) {

   717         node=(const UObject *)fChildren.elementAt(i);

   718         const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);

   719         if(s!=NULL) {

   720             text.append(*s);

   721         } else if(recurse) /* must be a UXMLElement */ {

   722             ((const UXMLElement *)node)->appendText(text, recurse);

   723         }

   724     }

   725 }

   727 int32_t

   728 UXMLElement::countAttributes() const {

   729     return fAttNames.size();

   730 }

   732 const UnicodeString *

   733 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {

   734     if(0<=i && i<fAttNames.size()) {

   735         name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));

   736         value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));

   737         return &value; // or return (UnicodeString *)fAttValues.elementAt(i);

   738     } else {

   739         return NULL;

   740     }

   741 }

   743 const UnicodeString *

   744 UXMLElement::getAttribute(const UnicodeString &name) const {

   745     // search for the attribute name by comparing the interned pointer,

   746     // not the string contents

   747     const UnicodeString *p=fParser->findName(name);

   748     if(p==NULL) {

   749         return NULL; // no such attribute seen by the parser at all

   750     }

   752     int32_t i, count=fAttNames.size();

   753     for(i=0; i<count; ++i) {

   754         if(p==(const UnicodeString *)fAttNames.elementAt(i)) {

   755             return (const UnicodeString *)fAttValues.elementAt(i);

   756         }

   757     }

   758     return NULL;

   759 }

   761 int32_t

   762 UXMLElement::countChildren() const {

   763     return fChildren.size();

   764 }

   766 const UObject *

   767 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {

   768     if(0<=i && i<fChildren.size()) {

   769         const UObject *node=(const UObject *)fChildren.elementAt(i);

   770         if(dynamic_cast<const UXMLElement *>(node)!=NULL) {

   771             type=UXML_NODE_TYPE_ELEMENT;

   772         } else {

   773             type=UXML_NODE_TYPE_STRING;

   774         }

   775         return node;

   776     } else {

   777         return NULL;

   778     }

   779 }

   781 const UXMLElement *

   782 UXMLElement::nextChildElement(int32_t &i) const {

   783     if(i<0) {

   784         return NULL;

   785     }

   787     const UObject *node;

   788     int32_t count=fChildren.size();

   789     while(i<count) {

   790         node=(const UObject *)fChildren.elementAt(i++);

   791         const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);

   792         if(elem!=NULL) {

   793             return elem;

   794         }

   795     }

   796     return NULL;

   797 }

   799 const UXMLElement *

   800 UXMLElement::getChildElement(const UnicodeString &name) const {

   801     // search for the element name by comparing the interned pointer,

   802     // not the string contents

   803     const UnicodeString *p=fParser->findName(name);

   804     if(p==NULL) {

   805         return NULL; // no such element seen by the parser at all

   806     }

   808     const UObject *node;

   809     int32_t i, count=fChildren.size();

   810     for(i=0; i<count; ++i) {

   811         node=(const UObject *)fChildren.elementAt(i);

   812         const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);

   813         if(elem!=NULL) {

   814             if(p==elem->fName) {

   815                 return elem;

   816             }

   817         }

   818     }

   819     return NULL;

   820 }

   822 U_NAMESPACE_END

   824 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */

The Tor Browser / file revision

intl/icu/source/tools/toolutil/xmlparser.cpp@6474c204b198

intl/icu/source/tools/toolutil/xmlparser.cpp