intl/icu/source/tools/toolutil/xmlparser.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/tools/toolutil/xmlparser.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,825 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2004-2010, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  xmlparser.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2004jul21
    1.17 +*   created by: Andy Heninger
    1.18 +*/
    1.19 +
    1.20 +#include <stdio.h>
    1.21 +#include "unicode/uchar.h"
    1.22 +#include "unicode/ucnv.h"
    1.23 +#include "unicode/regex.h"
    1.24 +#include "filestrm.h"
    1.25 +#include "xmlparser.h"
    1.26 +
    1.27 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
    1.28 +
    1.29 +// Code point values of the ASCII characters that the parser refers to by name.
    1.30 +enum {
    1.31 +    x_QUOT=0x22,    // "  quotation mark
    1.32 +    x_AMP=0x26,     // &  ampersand
    1.33 +    x_APOS=0x27,    // '  apostrophe
    1.34 +    x_LT=0x3c,      // <  less-than sign
    1.35 +    x_GT=0x3e,      // >  greater-than sign
    1.36 +    x_l=0x6c        // l  lower-case letter L, the last letter of "<?xml"
    1.37 +};
    1.38 +
    1.39 +#define  XML_SPACES "[ \\u0009\\u000d\\u000a]"   // one XML white space character: space, tab, CR or LF
    1.40 +
    1.41 +// XML #4 - the characters that may start a name, written as an ICU regex set
    1.42 +#define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
    1.43 +                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
    1.44 +                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
    1.45 +                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"
    1.46 +
    1.47 +//  XML #5 - name characters: a name start character, or one of - . 0-9 U+00B7 U+0300..U+036F U+203F..U+2040
    1.48 +#define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
    1.49 +
    1.50 +//  XML #6 - a complete name: one name start character followed by any number of name characters
    1.51 +#define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
    1.52 +
    1.53 +U_NAMESPACE_BEGIN
    1.54 +
    1.55 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)     // presumably supplies the ICU class-ID (RTTI) functions for UXMLParser; the macro is defined outside this file
    1.56 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)    // same macro, applied to UXMLElement
    1.57 +
    1.58 +//
    1.59 +//   UXMLParser constructor.   Mostly just initializes the ICU regex matchers that
    1.60 +//                             are used for parsing; status is handed to each of them.
    1.61 +//
    1.62 +UXMLParser::UXMLParser(UErrorCode &status) :
    1.63 +      //  XML Declaration.  XML Production #23.
    1.64 +      //      example:  <?xml version="1.0" encoding="utf-16" ?>
    1.65 +      //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
    1.66 +      //            allow for a possible leading BOM.
    1.67 +      mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
    1.68 +      
    1.69 +      //  XML Comment   production #15
    1.70 +      //     example:  <!-- whatever -->
    1.71 +      //       note, does not detect an illegal "--" within comments
    1.72 +      mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
    1.73 +      
    1.74 +      //  XML Spaces
    1.75 +      //      production [3]
    1.76 +      mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
    1.77 +      
    1.78 +      //  XML Doctype decl  production #28
    1.79 +      //     example   <!DOCTYPE foo SYSTEM "somewhere" >
    1.80 +      //       or      <!DOCTYPE foo [internal dtd]>
    1.81 +      //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
    1.82 +      //           Some internal dtd subsets could confuse this simple-minded
    1.83 +      //           attempt at skipping over them, specifically, occurrences
    1.84 +      //           of closing square brackets.  These could appear in comments,
    1.85 +      //           or in parameter entity declarations, for example.
    1.86 +      mXMLDoctype(UnicodeString(
    1.87 +           "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
    1.88 +           ), 0, status),
    1.89 +      
    1.90 +      //  XML PI     production #16
    1.91 +      //     example   <?target stuff?>
    1.92 +      mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
    1.93 +      
    1.94 +      //  XML Element Start   Productions #40, #41
    1.95 +      //          example   <foo att1='abc'  att2="d e f" >
    1.96 +      //      capture #1:  the tag name
    1.97 +      //
    1.98 +      mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
    1.99 +          "(?:" 
   1.100 +                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
   1.101 +                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
   1.102 +          ")*"                                                             //   * for zero or more attributes.
   1.103 +          XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"
   1.104 +      
   1.105 +      //  XML Element End     production #42
   1.106 +      //     example   </foo>
   1.107 +      mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
   1.108 +      
   1.109 +      // XML Element Empty    production #44
   1.110 +      //     example   <foo att1="abc"   att2="d e f" />
   1.111 +      mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
   1.112 +          "(?:" 
   1.113 +                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
   1.114 +                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
   1.115 +          ")*"                                                             //   * for zero or more attributes.
   1.116 +          XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"
   1.117 +      
   1.118 +
   1.119 +      // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
   1.120 +      mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
   1.121 +
   1.122 +      // Attribute name = "value".  XML Productions 10, 40/41
   1.123 +      //  Capture group 1 is name,
   1.124 +      //                2 is the attribute value, including the quotes.
   1.125 +      //
   1.126 +      //   Note that attributes are scanned twice.  The first time is with
   1.127 +      //        the regex for an entire element start.  There, the attributes
   1.128 +      //        are checked syntactically, but not separated out one by one.
   1.129 +      //        Here, we match a single attribute, and make its name and
   1.130 +      //        attribute value available to the parser code.
   1.131 +      mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
   1.132 +         "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
   1.133 +
   1.134 +      // Matches one XML white space character; createElement() replaces each match in an attribute value with U+0020.
   1.135 +      mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
   1.136 +
   1.137 +      // Match any of the new-line sequences in content.
   1.138 +      //   All are changed to \u000a.
   1.139 +      mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
   1.140 +
   1.141 +      // & char references
   1.142 +      //   We will figure out what we've got based on which capture group has content.
   1.143 +      //   The last one is a catchall for unrecognized entity references.
   1.144 +      //             1     2     3      4      5           6                    7          8
   1.145 +      mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
   1.146 +                0, status),
   1.147 +
   1.148 +      fNames(status),
   1.149 +      fElementStack(status),
   1.150 +      fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
   1.151 +      {
   1.152 +      }
   1.153 +
   1.154 +UXMLParser *
   1.155 +UXMLParser::createParser(UErrorCode &errorCode) {
   1.156 +    // Nothing is constructed when the caller's error code already signals
   1.157 +    // a failure.  Otherwise the new parser is returned as is; any error the
   1.158 +    // constructor reports is left in errorCode for the caller to check.
   1.159 +    return U_FAILURE(errorCode) ? NULL
   1.160 +                                : new UXMLParser(errorCode);
   1.161 +}
   1.162 +
   1.163 +UXMLParser::~UXMLParser() {}   // empty body: nothing is released explicitly here
   1.164 +// Reads the file, converts it to Unicode and parses it.  Returns the root element, or NULL with errorCode set on failure.
   1.165 +UXMLElement *
   1.166 +UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
   1.167 +    char bytes[4096], charsetBuffer[100];
   1.168 +    FileStream *f;
   1.169 +    const char *charset, *pb;
   1.170 +    UnicodeString src;
   1.171 +    UConverter *cnv;
   1.172 +    UChar *buffer, *pu;
   1.173 +    int32_t fileLength, bytesLength, length, capacity;
   1.174 +    UBool flush;
   1.175 +
   1.176 +    if(U_FAILURE(errorCode)) {
   1.177 +        return NULL;
   1.178 +    }
   1.179 +
   1.180 +    f=T_FileStream_open(filename, "rb");
   1.181 +    if(f==NULL) {
   1.182 +        errorCode=U_FILE_ACCESS_ERROR;
   1.183 +        return NULL;
   1.184 +    }
   1.185 +
   1.186 +    bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
   1.187 +    if(bytesLength<(int32_t)sizeof(bytes)) {
   1.188 +        // we have already read the entire file
   1.189 +        fileLength=bytesLength;
   1.190 +    } else {
   1.191 +        // get the file length
   1.192 +        fileLength=T_FileStream_size(f);
   1.193 +    }
   1.194 +
   1.195 +    /*
   1.196 +     * get the charset, looking only at the first block of the file:
   1.197 +     * 1. Unicode signature
   1.198 +     * 2. treat as ISO-8859-1 and read XML encoding="charset"
   1.199 +     * 3. default to UTF-8 (also when there is no XML declaration at all)
   1.200 +     */
   1.201 +    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
   1.202 +    if(U_SUCCESS(errorCode) && charset!=NULL) {
   1.203 +        // open converter according to Unicode signature
   1.204 +        cnv=ucnv_open(charset, &errorCode);
   1.205 +    } else {
   1.206 +        // read as Latin-1 and parse the XML declaration and encoding
   1.207 +        cnv=ucnv_open("ISO-8859-1", &errorCode);
   1.208 +        if(U_FAILURE(errorCode)) {
   1.209 +            // unexpected error opening Latin-1 converter
   1.210 +            goto exit;
   1.211 +        }
   1.212 +
   1.213 +        buffer=src.getBuffer(bytesLength);
   1.214 +        if(buffer==NULL) {
   1.215 +            // unexpected failure to reserve some string capacity
   1.216 +            errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.217 +            goto exit;
   1.218 +        }
   1.219 +        pb=bytes;
   1.220 +        pu=buffer;
   1.221 +        ucnv_toUnicode(
   1.222 +            cnv,
   1.223 +            &pu, buffer+src.getCapacity(),
   1.224 +            &pb, bytes+bytesLength,
   1.225 +            NULL, TRUE, &errorCode);
   1.226 +        src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
   1.227 +        ucnv_close(cnv);
   1.228 +        cnv=NULL;
   1.229 +        if(U_FAILURE(errorCode)) {
   1.230 +            // unexpected error in conversion from Latin-1
   1.231 +            src.remove();
   1.232 +            goto exit;
   1.233 +        }
   1.234 +
   1.235 +        // parse XML declaration
   1.236 +        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
   1.237 +            int32_t declEnd=mXMLDecl.end(errorCode);
   1.238 +            // go beyond <?xml
   1.239 +            int32_t pos=src.indexOf((UChar)x_l)+1;
   1.240 +
   1.241 +            mAttrValue.reset(src);
   1.242 +            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute of the declaration.
   1.243 +                UnicodeString attName  = mAttrValue.group(1, errorCode);
   1.244 +                UnicodeString attValue = mAttrValue.group(2, errorCode);
   1.245 +                // Trim the quotes from the att value; capture group 2 of the regex includes them.
   1.246 +                attValue.remove(0,1);                    // one char from the beginning
   1.247 +                attValue.truncate(attValue.length()-1);  // and one from the end.
   1.248 +                if(attName==UNICODE_STRING("encoding", 8)) {
   1.249 +                    length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
   1.250 +                    if(length>=(int32_t)sizeof(charsetBuffer)) {
   1.251 +                        errorCode=U_ILLEGAL_ARGUMENT_ERROR;  // name did not fit, so the buffer is not NUL-terminated
   1.252 +                    }
   1.253 +                    charset=charsetBuffer;
   1.254 +                    break;
   1.255 +                }
   1.256 +                pos = mAttrValue.end(2, errorCode);
   1.257 +            }
   1.258 +        }
   1.259 +
   1.260 +        // No encoding attribute, or no XML declaration at all: default to UTF-8.
   1.261 +        if(charset==NULL) {
   1.262 +            charset="UTF-8";
   1.263 +        }
   1.264 +        cnv=ucnv_open(charset, &errorCode);
   1.265 +    }
   1.266 +
   1.267 +    if(U_FAILURE(errorCode)) {
   1.268 +        // unable to open the converter
   1.269 +        goto exit;
   1.270 +    }
   1.271 +
   1.272 +    // convert the file contents
   1.273 +    capacity=fileLength;        // estimated capacity
   1.274 +    src.getBuffer(capacity);
   1.275 +    src.releaseBuffer(0);       // zero length
   1.276 +    flush=FALSE;
   1.277 +    for(;;) {
   1.278 +        // convert contents of bytes[bytesLength]
   1.279 +        pb=bytes;
   1.280 +        for(;;) {
   1.281 +            length=src.length();
   1.282 +            buffer=src.getBuffer(capacity);
   1.283 +            if(buffer==NULL) {
   1.284 +                // unexpected failure to reserve some string capacity
   1.285 +                errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.286 +                goto exit;
   1.287 +            }
   1.288 +
   1.289 +            pu=buffer+length;
   1.290 +            ucnv_toUnicode(
   1.291 +                cnv, &pu, buffer+src.getCapacity(),
   1.292 +                &pb, bytes+bytesLength,
   1.293 +                NULL, flush, &errorCode);
   1.294 +            src.releaseBuffer((U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) ? (int32_t)(pu-buffer) : 0);  // keep the output on overflow
   1.295 +            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
   1.296 +                errorCode=U_ZERO_ERROR;
   1.297 +                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
   1.298 +            } else {
   1.299 +                break;
   1.300 +            }
   1.301 +        }
   1.302 +
   1.303 +        if(U_FAILURE(errorCode)) {
   1.304 +            break; // conversion error
   1.305 +        }
   1.306 +
   1.307 +        if(flush) {
   1.308 +            break; // completely converted the file
   1.309 +        }
   1.310 +
   1.311 +        // read next block
   1.312 +        bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
   1.313 +        if(bytesLength==0) {
   1.314 +            // reached end of file, convert once more to flush the converter
   1.315 +            flush=TRUE;
   1.316 +        }
   1.317 +    }
   1.318 +
   1.319 +exit:
   1.320 +    ucnv_close(cnv);
   1.321 +    T_FileStream_close(f);
   1.322 +
   1.323 +    if(U_SUCCESS(errorCode)) {
   1.324 +        return parse(src, errorCode);
   1.325 +    } else {
   1.326 +        return NULL;
   1.327 +    }
   1.328 +}
   1.329 +
   1.330 +UXMLElement *
   1.331 +UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
   1.332 +    if(U_FAILURE(status)) {
   1.333 +        return NULL;
   1.334 +    }
   1.335 +
   1.336 +    UXMLElement   *root = NULL;
   1.337 +    fPos = 0; // TODO use just a local pos variable and pass it into functions
   1.338 +              // where necessary?
   1.339 +
   1.340 +    // set all matchers to work on the input string
   1.341 +    mXMLDecl.reset(src);
   1.342 +    mXMLComment.reset(src);
   1.343 +    mXMLSP.reset(src);
   1.344 +    mXMLDoctype.reset(src);
   1.345 +    mXMLPI.reset(src);
   1.346 +    mXMLElemStart.reset(src);
   1.347 +    mXMLElemEnd.reset(src);
   1.348 +    mXMLElemEmpty.reset(src);
   1.349 +    mXMLCharData.reset(src);
   1.350 +    mAttrValue.reset(src);
   1.351 +    mAttrNormalizer.reset(src);
   1.352 +    mNewLineNormalizer.reset(src);
   1.353 +    mAmps.reset(src);
   1.354 +
   1.355 +    // Consume the XML Declaration, if present.
   1.356 +    if (mXMLDecl.lookingAt(fPos, status)) {
   1.357 +        fPos = mXMLDecl.end(status);
   1.358 +    }
   1.359 +
   1.360 +    // Consume "misc" [XML production 27] appearing before DocType
   1.361 +    parseMisc(status);
   1.362 +
   1.363 +    // Consume a DocType declaration, if present.
   1.364 +    if (mXMLDoctype.lookingAt(fPos, status)) {
   1.365 +        fPos = mXMLDoctype.end(status);
   1.366 +    }
   1.367 +
   1.368 +    // Consume additional "misc" [XML production 27] appearing after the DocType
   1.369 +    parseMisc(status);
   1.370 +
   1.371 +    // Get the root element
   1.372 +    if (mXMLElemEmpty.lookingAt(fPos, status)) {
   1.373 +        // Root is an empty element (no nested elements or content)
   1.374 +        root = createElement(mXMLElemEmpty, status);
   1.375 +        fPos = mXMLElemEmpty.end(status);
   1.376 +    } else {
   1.377 +        if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
   1.378 +            error("Root Element expected", status);
   1.379 +            goto errorExit;
   1.380 +        }
   1.381 +        root = createElement(mXMLElemStart, status);
   1.382 +        UXMLElement  *el = root;
   1.383 +
   1.384 +        //
   1.385 +        // This is the loop that consumes the root element of the document,
   1.386 +        //      including all nested content.   Nested elements are handled by
   1.387 +        //      explicit pushes/pops of the element stack; there is no recursion
   1.388 +        //      in the control flow of this code.
   1.389 +        //      "el" always refers to the current element, the one to which content
   1.390 +        //      is being added.  It is above the top of the element stack.
   1.391 +        for (;;) {
   1.392 +            // Nested Element Start
   1.393 +            if (mXMLElemStart.lookingAt(fPos, status)) {
   1.394 +                UXMLElement *t = createElement(mXMLElemStart, status);
   1.395 +                el->fChildren.addElement(t, status);
   1.396 +                t->fParent = el;
   1.397 +                fElementStack.push(el, status);
   1.398 +                el = t;
   1.399 +                continue;
   1.400 +            }
   1.401 +
   1.402 +            // Text Content.  String is concatenated onto the current node's content,
   1.403 +            //                but only if it contains something other than spaces.
   1.404 +            UnicodeString s = scanContent(status);
   1.405 +            if (s.length() > 0) {
   1.406 +                mXMLSP.reset(s);
   1.407 +                if (mXMLSP.matches(status) == FALSE) {
   1.408 +                    // This chunk of text contains something other than just
   1.409 +                    //  white space. Make a child node for it.
   1.410 +                    replaceCharRefs(s, status);
   1.411 +                    el->fChildren.addElement(s.clone(), status);
   1.412 +                }
   1.413 +                mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
   1.414 +                continue;
   1.415 +            }
   1.416 +
   1.417 +            // Comments.  Discard.
   1.418 +            if (mXMLComment.lookingAt(fPos, status)) {
   1.419 +                fPos = mXMLComment.end(status);
   1.420 +                continue;
   1.421 +            }
   1.422 +
   1.423 +            // PIs.  Discard.
   1.424 +            if (mXMLPI.lookingAt(fPos, status)) {
   1.425 +                fPos = mXMLPI.end(status);
   1.426 +                continue;
   1.427 +            }
   1.428 +
   1.429 +            // Element End
   1.430 +            if (mXMLElemEnd.lookingAt(fPos, status)) {
   1.431 +                fPos = mXMLElemEnd.end(0, status);
   1.432 +                const UnicodeString name = mXMLElemEnd.group(1, status);
   1.433 +                if (name != *el->fName) {
   1.434 +                    error("Element start / end tag mismatch", status);
   1.435 +                    goto errorExit;
   1.436 +                }
   1.437 +                if (fElementStack.empty()) {
   1.438 +                    // Close of the root element.  We're done with the doc.
   1.439 +                    el = NULL;
   1.440 +                    break;
   1.441 +                }
   1.442 +                el = (UXMLElement *)fElementStack.pop();
   1.443 +                continue;
   1.444 +            }
   1.445 +
   1.446 +            // Empty Element.  Stored as a child of the current element, but not stacked.
   1.447 +            if (mXMLElemEmpty.lookingAt(fPos, status)) {
   1.448 +                UXMLElement *t = createElement(mXMLElemEmpty, status);
   1.449 +                el->fChildren.addElement(t, status);
   1.450 +                continue;
   1.451 +            }
   1.452 +
   1.453 +            // Hit something within the document that doesn't match anything.
   1.454 +            //   It's an error.
   1.455 +            error("Unrecognized markup", status);
   1.456 +            break;
   1.457 +        }
   1.458 +
   1.459 +        if (el != NULL || !fElementStack.empty()) {
   1.460 +            // We bailed out early, for some reason.
   1.461 +            error("Root element not closed.", status);
   1.462 +            goto errorExit;
   1.463 +        }
   1.464 +    }
   1.465 +
   1.466 +    // Root Element parse is complete.
   1.467 +    // Consume the annoying xml "Misc" that can appear at the end of the doc.
   1.468 +    parseMisc(status);
   1.469 +
   1.470 +    // We should have reached the end of the input
   1.471 +    if (fPos != src.length()) {
   1.472 +        error("Extra content at the end of the document", status);
   1.473 +        goto errorExit;
   1.474 +    }
   1.475 +
   1.476 +    // Success!
   1.477 +    return root;
   1.478 +
   1.479 +errorExit:
   1.480 +    delete root;
   1.481 +    return NULL;
   1.482 +}
   1.483 +
   1.484 +//
   1.485 +//  createElement
   1.486 +//      An element start (or empty element) tag has just matched mEl.  Build the
   1.487 +//      UXMLElement for it, attributes included, and set fPos to the end of the tag.
   1.488 +//
   1.489 +UXMLElement *
   1.490 +UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
   1.491 +    // Capture group 1 of the tag regex is the element's name.
   1.492 +    UXMLElement *element = new UXMLElement(this, intern(mEl.group(1, status), status), status);
   1.493 +    const UnicodeString space((UChar)0x0020);
   1.494 +
   1.495 +    // Attributes, if there are any, begin right behind the tag name.  Each
   1.496 +    //   pass through the loop handles one attribute and then continues
   1.497 +    //   behind that attribute's value.
   1.498 +    int32_t   scanPos = mEl.end(1, status);
   1.499 +    for (; mAttrValue.lookingAt(scanPos, status); scanPos = mAttrValue.end(2, status)) {
   1.500 +        UnicodeString name  = mAttrValue.group(1, status);
   1.501 +        UnicodeString value = mAttrValue.group(2, status);
   1.502 +
   1.503 +        // Capture group 2 includes the quotes around the value; cut off the
   1.504 +        //   first and the last code unit.
   1.505 +        value.remove(0,1);
   1.506 +        value.truncate(value.length()-1);
   1.507 +
   1.508 +        // XML attribute value normalization, as described in
   1.509 +        //   http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
   1.510 +        // Non-validating parsers must treat all attributes as type CDATA,
   1.511 +        //   which keeps this to the three steps below.
   1.512 +
   1.513 +        // Step 1:  newline sequences are replaced by a single \u000a.
   1.514 +        mNewLineNormalizer.reset(value);
   1.515 +        value = mNewLineNormalizer.replaceAll(fOneLF, status);
   1.516 +
   1.517 +        // Step 2:  every xml white space char becomes a plain \u0020 space.
   1.518 +        mAttrNormalizer.reset(value);
   1.519 +        value = mAttrNormalizer.replaceAll(space, status);
   1.520 +
   1.521 +        // Step 3:  character and predefined entity references are replaced.
   1.522 +        replaceCharRefs(value, status);
   1.523 +
   1.524 +        // Store the pair: the name interned, the value as a heap clone.
   1.525 +        element->fAttNames.addElement((void *)intern(name, status), status);
   1.526 +        element->fAttValues.addElement(value.clone(), status);
   1.527 +    }
   1.528 +    // Parsing continues behind the complete tag.
   1.529 +    fPos = mEl.end(0, status);
   1.530 +    return element;
   1.531 +}
   1.532 +
   1.533 +//
   1.534 +//  parseMisc
   1.535 +//     Skip over XML "Misc" [production #27] starting at fPos:
   1.536 +//        any run of white space, processing instructions and comments.
   1.537 +//      Misc is also allowed after the document element, so the scan can
   1.538 +//        arrive at the very end of the input; the loop condition stops it there.
   1.539 +//
   1.540 +void
   1.541 +UXMLParser::parseMisc(UErrorCode &status)  {
   1.542 +    // The input of mXMLPI is the text being parsed; parse() sets every
   1.543 +    //   matcher to the same string.
   1.544 +    while (fPos < mXMLPI.input().length()) {
   1.545 +        if (mXMLPI.lookingAt(fPos, status)) {
   1.546 +            // processing instruction
   1.547 +            fPos = mXMLPI.end(status);
   1.548 +        } else if (mXMLSP.lookingAt(fPos, status)) {
   1.549 +            // white space
   1.550 +            fPos = mXMLSP.end(status);
   1.551 +        } else if (mXMLComment.lookingAt(fPos, status)) {
   1.552 +            // comment
   1.553 +            fPos = mXMLComment.end(status);
   1.554 +        } else {
   1.555 +            // Something that is not Misc starts here: leave it
   1.556 +            //   for the caller.
   1.557 +            break;
   1.558 +        }
   1.559 +    }
   1.560 +}
   1.561 +
   1.562 +//
   1.563 +//  Scan for document content: the character data from fPos up to the next '<',
   1.564 +//      with its line ends normalized.  fPos is moved past what was consumed.
   1.565 +UnicodeString
   1.566 +UXMLParser::scanContent(UErrorCode &status) {
   1.567 +    UnicodeString  text;
   1.568 +    if (!mXMLCharData.lookingAt(fPos, status)) {
   1.569 +        return text;     // no match: empty string, fPos unchanged
   1.570 +    }
   1.571 +    // Normalize the new-lines.  (Before char ref substitution)
   1.572 +    text = mXMLCharData.group((int32_t)0, status);
   1.573 +    mNewLineNormalizer.reset(text);
   1.574 +    text = mNewLineNormalizer.replaceAll(fOneLF, status);
   1.575 +
   1.576 +    // TODO:  handle CDATA
   1.577 +    fPos = mXMLCharData.end(0, status);
   1.578 +    return text;
   1.579 +}
   1.580 +
   1.581 +//
   1.582 +//   replaceCharRefs
   1.583 +//
   1.584 +//      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
   1.585 +//       with the corresponding actual character.  Other entities, and numeric
   1.586 +//       references that do not name a valid character, are copied unchanged.
   1.587 +void
   1.588 +UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
   1.589 +    UnicodeString result;
   1.590 +    UnicodeString replacement;
   1.591 +    int32_t copied = 0;     // everything in s before this index is already in result
   1.592 +
   1.593 +    mAmps.reset(s);
   1.594 +    // See the initialization for the regex matcher mAmps.
   1.595 +    //    Which entity we've matched is determined by which capture group has content,
   1.596 +    //      which is flagged by start() of that group not being -1.
   1.597 +    while (mAmps.find()) {
   1.598 +        if (mAmps.start(1, status) != -1) {
   1.599 +            replacement.setTo((UChar)x_AMP);
   1.600 +        } else if (mAmps.start(2, status) != -1) {
   1.601 +            replacement.setTo((UChar)x_LT);
   1.602 +        } else if (mAmps.start(3, status) != -1) {
   1.603 +            replacement.setTo((UChar)x_GT);
   1.604 +        } else if (mAmps.start(4, status) != -1) {
   1.605 +            replacement.setTo((UChar)x_APOS);
   1.606 +        } else if (mAmps.start(5, status) != -1) {
   1.607 +            replacement.setTo((UChar)x_QUOT);
   1.608 +        } else if (mAmps.start(6, status) != -1 || mAmps.start(7, status) != -1) {
   1.609 +            // Numeric reference: group 6 holds hexadecimal digits, group 7 decimal ones.
   1.610 +            const int32_t radix = (mAmps.start(6, status) != -1) ? 16 : 10;
   1.611 +            const UnicodeString digits = mAmps.group(radix == 16 ? 6 : 7, status);
   1.612 +            uint32_t val = 0;   // unsigned: eight hex digits do not fit a signed 32-bit UChar32
   1.613 +            for (int32_t i=0; i<digits.length(); i++) {
   1.614 +                val = val*(uint32_t)radix + (uint32_t)u_digit(digits.charAt(i), (int8_t)radix);
   1.615 +            }
   1.616 +            if (val <= 0x10ffff && !U_IS_SURROGATE(val)) {
   1.617 +                replacement.setTo((UChar32)val);
   1.618 +            } else {
   1.619 +                // Not a Unicode scalar value: keep the reference text unchanged.
   1.620 +                replacement = mAmps.group((int32_t)0, status);
   1.621 +            }
   1.622 +        } else {
   1.623 +            // An unrecognized &entity;  Leave it alone.
   1.624 +            //  TODO:  check that it really looks like an entity, and is not some
   1.625 +            //         random & in the text.
   1.626 +            replacement = mAmps.group((int32_t)0, status);
   1.627 +        }
   1.628 +        // Copy literally; appendReplacement() would treat '\' and '$' in the replacement as special.
   1.629 +        result.append(s, copied, mAmps.start(status) - copied).append(replacement);
   1.630 +        copied = mAmps.end(status);
   1.631 +    }
   1.632 +    result.append(s, copied, s.length() - copied);   // the text after the last match
   1.633 +    s = result;
   1.634 +}
   1.635 +// Prints message and the line number of fPos to stderr; sets status to U_PARSE_ERROR unless it already is a failure.
   1.636 +void
   1.637 +UXMLParser::error(const char *message, UErrorCode &status) {
   1.638 +    // TODO:  something better here...
   1.639 +    const UnicodeString &src=mXMLDecl.input();
   1.640 +    int  line = 1;      // 1-based: one more than the number of line feeds before fPos
   1.641 +    int32_t  ci = src.indexOf((UChar)0x0a);
   1.642 +    while (ci >= 0 && ci < fPos) {
   1.643 +        line++;
   1.644 +        ci = src.indexOf((UChar)0x0a, ci+1);
   1.645 +    }
   1.646 +    fprintf(stderr, "Error: %s at line %d\n", message, line);
   1.647 +    if (U_SUCCESS(status)) {
   1.648 +        status = U_PARSE_ERROR;
   1.649 +    }
   1.650 +}
   1.651 +
   1.652 +// intern strings like in Java: returns the parser's own copy of s (the key
   1.653 +// stored in fNames), adding it first if needed; NULL if it could not be added
   1.654 +const UnicodeString *
   1.655 +UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
   1.656 +    const UHashElement *he=fNames.find(s);
   1.657 +    if(he==NULL) {
   1.658 +        // add this new name and look up the element that now holds its key;
   1.659 +        // the stored integer is not read in the visible part of this file, it only has to be
   1.660 +        // non-zero (NOTE(review): newer uhash_puti() removes the key when 0 is stored - confirm)
   1.661 +        fNames.puti(s, 1, errorCode);
   1.662 +        he=fNames.find(s);
   1.663 +    }
   1.664 +    // he stays NULL if the name could not be added, e.g. errorCode was already a failure
   1.665 +    return he!=NULL ? (const UnicodeString *)he->key.pointer : NULL;
   1.666 +}
   1.667 +
   1.668 +const UnicodeString *
   1.669 +UXMLParser::findName(const UnicodeString &s) const {
   1.670 +    // Pure lookup in the name table: nothing is added for an unknown name.
   1.671 +    const UHashElement *entry=fNames.find(s);
   1.672 +    if(entry==NULL) {
   1.673 +        // unknown name
   1.674 +        return NULL;
   1.675 +    }
   1.676 +    // a known name, return its hashed key pointer
   1.677 +    return (const UnicodeString *)entry->key.pointer;
   1.678 +}
   1.679 +
   1.680 +// UXMLElement ------------------------------------------------------------- ***
   1.681 +
   1.682 +UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
   1.683 +   fParser(parser),          // the parser whose name table getAttribute(name) consults
   1.684 +   fName(name),              // tag name; UXMLParser::createElement() passes an interned string
   1.685 +   fAttNames(errorCode),     // attribute names (interned by the parser, not deleted by the element)
   1.686 +   fAttValues(errorCode),    // attribute values, deleted by the destructor
   1.687 +   fChildren(errorCode),     // child nodes: text strings and nested elements, deleted by the destructor
   1.688 +   fParent(NULL)             // filled in by UXMLParser::parse() when it attaches the element to a parent
   1.689 +{
   1.690 +}
   1.691 +
   1.692 +UXMLElement::~UXMLElement() {
   1.693 +    // Frees attribute values and child nodes, last one first; the names stay with the UXMLParser.
   1.694 +    int32_t   n;
   1.695 +    for (n=fAttValues.size(); n>0; n--) {
   1.696 +        delete (UObject *)fAttValues.elementAt(n-1);
   1.697 +    }
   1.698 +    for (n=fChildren.size(); n>0; n--) {
   1.699 +        delete (UObject *)fChildren.elementAt(n-1);
   1.700 +    }
   1.701 +}
   1.702 +
   1.703 +const UnicodeString &
   1.704 +UXMLElement::getTagName() const {
   1.705 +    return *fName;
   1.706 +}
   1.707 +
   1.708 +UnicodeString
   1.709 +UXMLElement::getText(UBool recurse) const {
   1.710 +    UnicodeString text;
   1.711 +    appendText(text, recurse);
   1.712 +    return text;
   1.713 +}
   1.714 +
   1.715 +void
   1.716 +UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
   1.717 +    const UObject *node;
   1.718 +    int32_t i, count=fChildren.size();
   1.719 +    for(i=0; i<count; ++i) {
   1.720 +        node=(const UObject *)fChildren.elementAt(i);
   1.721 +        const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
   1.722 +        if(s!=NULL) {
   1.723 +            text.append(*s);
   1.724 +        } else if(recurse) /* must be a UXMLElement */ {
   1.725 +            ((const UXMLElement *)node)->appendText(text, recurse);
   1.726 +        }
   1.727 +    }
   1.728 +}
   1.729 +
   1.730 +int32_t
   1.731 +UXMLElement::countAttributes() const {
   1.732 +    return fAttNames.size();
   1.733 +}
   1.734 +
   1.735 +const UnicodeString *
   1.736 +UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
   1.737 +    if(0<=i && i<fAttNames.size()) {
   1.738 +        name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
   1.739 +        value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
   1.740 +        return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
   1.741 +    } else {
   1.742 +        return NULL;
   1.743 +    }
   1.744 +}
   1.745 +
   1.746 +const UnicodeString *
   1.747 +UXMLElement::getAttribute(const UnicodeString &name) const {
   1.748 +    // search for the attribute name by comparing the interned pointer,
   1.749 +    // not the string contents
   1.750 +    const UnicodeString *p=fParser->findName(name);
   1.751 +    if(p==NULL) {
   1.752 +        return NULL; // no such attribute seen by the parser at all
   1.753 +    }
   1.754 +
   1.755 +    int32_t i, count=fAttNames.size();
   1.756 +    for(i=0; i<count; ++i) {
   1.757 +        if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
   1.758 +            return (const UnicodeString *)fAttValues.elementAt(i);
   1.759 +        }
   1.760 +    }
   1.761 +    return NULL;
   1.762 +}
   1.763 +
   1.764 +int32_t
   1.765 +UXMLElement::countChildren() const {
   1.766 +    return fChildren.size();
   1.767 +}
   1.768 +
   1.769 +const UObject *
   1.770 +UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
   1.771 +    if(0<=i && i<fChildren.size()) {
   1.772 +        const UObject *node=(const UObject *)fChildren.elementAt(i);
   1.773 +        if(dynamic_cast<const UXMLElement *>(node)!=NULL) {
   1.774 +            type=UXML_NODE_TYPE_ELEMENT;
   1.775 +        } else {
   1.776 +            type=UXML_NODE_TYPE_STRING;
   1.777 +        }
   1.778 +        return node;
   1.779 +    } else {
   1.780 +        return NULL;
   1.781 +    }
   1.782 +}
   1.783 +
   1.784 +const UXMLElement *
   1.785 +UXMLElement::nextChildElement(int32_t &i) const {
   1.786 +    if(i<0) {
   1.787 +        return NULL;
   1.788 +    }
   1.789 +
   1.790 +    const UObject *node;
   1.791 +    int32_t count=fChildren.size();
   1.792 +    while(i<count) {
   1.793 +        node=(const UObject *)fChildren.elementAt(i++);
   1.794 +        const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
   1.795 +        if(elem!=NULL) {
   1.796 +            return elem;
   1.797 +        }
   1.798 +    }
   1.799 +    return NULL;
   1.800 +}
   1.801 +
   1.802 +const UXMLElement *
   1.803 +UXMLElement::getChildElement(const UnicodeString &name) const {
   1.804 +    // search for the element name by comparing the interned pointer,
   1.805 +    // not the string contents
   1.806 +    const UnicodeString *p=fParser->findName(name);
   1.807 +    if(p==NULL) {
   1.808 +        return NULL; // no such element seen by the parser at all
   1.809 +    }
   1.810 +
   1.811 +    const UObject *node;
   1.812 +    int32_t i, count=fChildren.size();
   1.813 +    for(i=0; i<count; ++i) {
   1.814 +        node=(const UObject *)fChildren.elementAt(i);
   1.815 +        const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
   1.816 +        if(elem!=NULL) {
   1.817 +            if(p==elem->fName) {
   1.818 +                return elem;
   1.819 +            }
   1.820 +        }
   1.821 +    }
   1.822 +    return NULL;
   1.823 +}
   1.824 +
   1.825 +U_NAMESPACE_END
   1.826 +
   1.827 +#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
   1.828 +

mercurial