/*
*******************************************************************************
*
*   Copyright (C) 2004-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  xmlparser.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2004jul21
*   created by: Andy Heninger
*/

#include <stdio.h>
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/regex.h"
#include "filestrm.h"
#include "xmlparser.h"

// The whole implementation is compiled out when either regular expressions
// or charset conversion is configured away; both are used below.
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION

// Character constants (code point values) used when substituting the
// predefined entities and when skipping past "<?xml".
enum {
    x_QUOT=0x22,    // "
    x_AMP=0x26,     // &
    x_APOS=0x27,    // '
    x_LT=0x3c,      // <
    x_GT=0x3e,      // >
    x_l=0x6c        // lower-case l, the last letter of "<?xml"
};

// Regex set matching one XML white space character: space, tab, CR or LF.
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4
// Regex set for a character that may start an XML name.
#define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

// XML #5
// Regex set for a character that may appear after the first character of a
// name: every name start character plus '-', '.', digits and a few others.
#define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

// XML #6
// Regex for a complete XML name: one start character, then any number of
// name characters.  Not wrapped in a group; callers add their own.
#define XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"

U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)

//
//   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
//                             used for parsing.
1.61 +// 1.62 +UXMLParser::UXMLParser(UErrorCode &status) : 1.63 + // XML Declaration. XML Production #23. 1.64 + // example: "<?xml version=1.0 encoding="utf-16" ?> 1.65 + // This is a sloppy implementation - just look for the leading <?xml and the closing ?> 1.66 + // allow for a possible leading BOM. 1.67 + mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), 1.68 + 1.69 + // XML Comment production #15 1.70 + // example: "<!-- whatever --> 1.71 + // note, does not detect an illegal "--" within comments 1.72 + mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status), 1.73 + 1.74 + // XML Spaces 1.75 + // production [3] 1.76 + mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), 1.77 + 1.78 + // XML Doctype decl production #28 1.79 + // example "<!DOCTYPE foo SYSTEM "somewhere" > 1.80 + // or "<!DOCTYPE foo [internal dtd]> 1.81 + // TODO: we don't actually parse the DOCTYPE or internal subsets. 1.82 + // Some internal dtd subsets could confuse this simple-minded 1.83 + // attempt at skipping over them, specifically, occcurences 1.84 + // of closeing square brackets. These could appear in comments, 1.85 + // or in parameter entity declarations, for example. 1.86 + mXMLDoctype(UnicodeString( 1.87 + "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV 1.88 + ), 0, status), 1.89 + 1.90 + // XML PI production #16 1.91 + // example "<?target stuff?> 1.92 + mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), 1.93 + 1.94 + // XML Element Start Productions #40, #41 1.95 + // example <foo att1='abc' att2="d e f" > 1.96 + // capture #1: the tag name 1.97 + // 1.98 + mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" 1.99 + "(?:" 1.100 + XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " 1.101 + "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' 1.102 + ")*" // * for zero or more attributes. 
1.103 + XML_SPACES "*?>", -1, US_INV), 0, status), // match " >" 1.104 + 1.105 + // XML Element End production #42 1.106 + // example </foo> 1.107 + mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status), 1.108 + 1.109 + // XML Element Empty production #44 1.110 + // example <foo att1="abc" att2="d e f" /> 1.111 + mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" 1.112 + "(?:" 1.113 + XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " 1.114 + "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' 1.115 + ")*" // * for zero or more attributes. 1.116 + XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />" 1.117 + 1.118 + 1.119 + // XMLCharData. Everything but '<'. Note that & will be dealt with later. 1.120 + mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), 1.121 + 1.122 + // Attribute name = "value". XML Productions 10, 40/41 1.123 + // Capture group 1 is name, 1.124 + // 2 is the attribute value, including the quotes. 1.125 + // 1.126 + // Note that attributes are scanned twice. The first time is with 1.127 + // the regex for an entire element start. There, the attributes 1.128 + // are checked syntactically, but not separted out one by one. 1.129 + // Here, we match a single attribute, and make its name and 1.130 + // attribute value available to the parser code. 1.131 + mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" 1.132 + "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), 1.133 + 1.134 + 1.135 + mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), 1.136 + 1.137 + // Match any of the new-line sequences in content. 1.138 + // All are changed to \u000a. 
1.139 + mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), 1.140 + 1.141 + // & char references 1.142 + // We will figure out what we've got based on which capture group has content. 1.143 + // The last one is a catchall for unrecognized entity references.. 1.144 + // 1 2 3 4 5 6 7 8 1.145 + mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"), 1.146 + 0, status), 1.147 + 1.148 + fNames(status), 1.149 + fElementStack(status), 1.150 + fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization. 1.151 + { 1.152 + } 1.153 + 1.154 +UXMLParser * 1.155 +UXMLParser::createParser(UErrorCode &errorCode) { 1.156 + if (U_FAILURE(errorCode)) { 1.157 + return NULL; 1.158 + } else { 1.159 + return new UXMLParser(errorCode); 1.160 + } 1.161 +} 1.162 + 1.163 +UXMLParser::~UXMLParser() {} 1.164 + 1.165 +UXMLElement * 1.166 +UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { 1.167 + char bytes[4096], charsetBuffer[100]; 1.168 + FileStream *f; 1.169 + const char *charset, *pb; 1.170 + UnicodeString src; 1.171 + UConverter *cnv; 1.172 + UChar *buffer, *pu; 1.173 + int32_t fileLength, bytesLength, length, capacity; 1.174 + UBool flush; 1.175 + 1.176 + if(U_FAILURE(errorCode)) { 1.177 + return NULL; 1.178 + } 1.179 + 1.180 + f=T_FileStream_open(filename, "rb"); 1.181 + if(f==NULL) { 1.182 + errorCode=U_FILE_ACCESS_ERROR; 1.183 + return NULL; 1.184 + } 1.185 + 1.186 + bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); 1.187 + if(bytesLength<(int32_t)sizeof(bytes)) { 1.188 + // we have already read the entire file 1.189 + fileLength=bytesLength; 1.190 + } else { 1.191 + // get the file length 1.192 + fileLength=T_FileStream_size(f); 1.193 + } 1.194 + 1.195 + /* 1.196 + * get the charset: 1.197 + * 1. Unicode signature 1.198 + * 2. treat as ISO-8859-1 and read XML encoding="charser" 1.199 + * 3. 
default to UTF-8 1.200 + */ 1.201 + charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode); 1.202 + if(U_SUCCESS(errorCode) && charset!=NULL) { 1.203 + // open converter according to Unicode signature 1.204 + cnv=ucnv_open(charset, &errorCode); 1.205 + } else { 1.206 + // read as Latin-1 and parse the XML declaration and encoding 1.207 + cnv=ucnv_open("ISO-8859-1", &errorCode); 1.208 + if(U_FAILURE(errorCode)) { 1.209 + // unexpected error opening Latin-1 converter 1.210 + goto exit; 1.211 + } 1.212 + 1.213 + buffer=src.getBuffer(bytesLength); 1.214 + if(buffer==NULL) { 1.215 + // unexpected failure to reserve some string capacity 1.216 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.217 + goto exit; 1.218 + } 1.219 + pb=bytes; 1.220 + pu=buffer; 1.221 + ucnv_toUnicode( 1.222 + cnv, 1.223 + &pu, buffer+src.getCapacity(), 1.224 + &pb, bytes+bytesLength, 1.225 + NULL, TRUE, &errorCode); 1.226 + src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); 1.227 + ucnv_close(cnv); 1.228 + cnv=NULL; 1.229 + if(U_FAILURE(errorCode)) { 1.230 + // unexpected error in conversion from Latin-1 1.231 + src.remove(); 1.232 + goto exit; 1.233 + } 1.234 + 1.235 + // parse XML declaration 1.236 + if(mXMLDecl.reset(src).lookingAt(0, errorCode)) { 1.237 + int32_t declEnd=mXMLDecl.end(errorCode); 1.238 + // go beyond <?xml 1.239 + int32_t pos=src.indexOf((UChar)x_l)+1; 1.240 + 1.241 + mAttrValue.reset(src); 1.242 + while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element. 1.243 + UnicodeString attName = mAttrValue.group(1, errorCode); 1.244 + UnicodeString attValue = mAttrValue.group(2, errorCode); 1.245 + 1.246 + // Trim the quotes from the att value. These are left over from the original regex 1.247 + // that parsed the attribue, which couldn't conveniently strip them. 1.248 + attValue.remove(0,1); // one char from the beginning 1.249 + attValue.truncate(attValue.length()-1); // and one from the end. 
1.250 + 1.251 + if(attName==UNICODE_STRING("encoding", 8)) { 1.252 + length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer)); 1.253 + charset=charsetBuffer; 1.254 + break; 1.255 + } 1.256 + pos = mAttrValue.end(2, errorCode); 1.257 + } 1.258 + 1.259 + if(charset==NULL) { 1.260 + // default to UTF-8 1.261 + charset="UTF-8"; 1.262 + } 1.263 + cnv=ucnv_open(charset, &errorCode); 1.264 + } 1.265 + } 1.266 + 1.267 + if(U_FAILURE(errorCode)) { 1.268 + // unable to open the converter 1.269 + goto exit; 1.270 + } 1.271 + 1.272 + // convert the file contents 1.273 + capacity=fileLength; // estimated capacity 1.274 + src.getBuffer(capacity); 1.275 + src.releaseBuffer(0); // zero length 1.276 + flush=FALSE; 1.277 + for(;;) { 1.278 + // convert contents of bytes[bytesLength] 1.279 + pb=bytes; 1.280 + for(;;) { 1.281 + length=src.length(); 1.282 + buffer=src.getBuffer(capacity); 1.283 + if(buffer==NULL) { 1.284 + // unexpected failure to reserve some string capacity 1.285 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.286 + goto exit; 1.287 + } 1.288 + 1.289 + pu=buffer+length; 1.290 + ucnv_toUnicode( 1.291 + cnv, &pu, buffer+src.getCapacity(), 1.292 + &pb, bytes+bytesLength, 1.293 + NULL, FALSE, &errorCode); 1.294 + src.releaseBuffer(U_SUCCESS(errorCode) ? 
(int32_t)(pu-buffer) : 0); 1.295 + if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 1.296 + errorCode=U_ZERO_ERROR; 1.297 + capacity=(3*src.getCapacity())/2; // increase capacity by 50% 1.298 + } else { 1.299 + break; 1.300 + } 1.301 + } 1.302 + 1.303 + if(U_FAILURE(errorCode)) { 1.304 + break; // conversion error 1.305 + } 1.306 + 1.307 + if(flush) { 1.308 + break; // completely converted the file 1.309 + } 1.310 + 1.311 + // read next block 1.312 + bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); 1.313 + if(bytesLength==0) { 1.314 + // reached end of file, convert once more to flush the converter 1.315 + flush=TRUE; 1.316 + } 1.317 + }; 1.318 + 1.319 +exit: 1.320 + ucnv_close(cnv); 1.321 + T_FileStream_close(f); 1.322 + 1.323 + if(U_SUCCESS(errorCode)) { 1.324 + return parse(src, errorCode); 1.325 + } else { 1.326 + return NULL; 1.327 + } 1.328 +} 1.329 + 1.330 +UXMLElement * 1.331 +UXMLParser::parse(const UnicodeString &src, UErrorCode &status) { 1.332 + if(U_FAILURE(status)) { 1.333 + return NULL; 1.334 + } 1.335 + 1.336 + UXMLElement *root = NULL; 1.337 + fPos = 0; // TODO use just a local pos variable and pass it into functions 1.338 + // where necessary? 1.339 + 1.340 + // set all matchers to work on the input string 1.341 + mXMLDecl.reset(src); 1.342 + mXMLComment.reset(src); 1.343 + mXMLSP.reset(src); 1.344 + mXMLDoctype.reset(src); 1.345 + mXMLPI.reset(src); 1.346 + mXMLElemStart.reset(src); 1.347 + mXMLElemEnd.reset(src); 1.348 + mXMLElemEmpty.reset(src); 1.349 + mXMLCharData.reset(src); 1.350 + mAttrValue.reset(src); 1.351 + mAttrNormalizer.reset(src); 1.352 + mNewLineNormalizer.reset(src); 1.353 + mAmps.reset(src); 1.354 + 1.355 + // Consume the XML Declaration, if present. 1.356 + if (mXMLDecl.lookingAt(fPos, status)) { 1.357 + fPos = mXMLDecl.end(status); 1.358 + } 1.359 + 1.360 + // Consume "misc" [XML production 27] appearing before DocType 1.361 + parseMisc(status); 1.362 + 1.363 + // Consume a DocType declaration, if present. 
1.364 + if (mXMLDoctype.lookingAt(fPos, status)) { 1.365 + fPos = mXMLDoctype.end(status); 1.366 + } 1.367 + 1.368 + // Consume additional "misc" [XML production 27] appearing after the DocType 1.369 + parseMisc(status); 1.370 + 1.371 + // Get the root element 1.372 + if (mXMLElemEmpty.lookingAt(fPos, status)) { 1.373 + // Root is an empty element (no nested elements or content) 1.374 + root = createElement(mXMLElemEmpty, status); 1.375 + fPos = mXMLElemEmpty.end(status); 1.376 + } else { 1.377 + if (mXMLElemStart.lookingAt(fPos, status) == FALSE) { 1.378 + error("Root Element expected", status); 1.379 + goto errorExit; 1.380 + } 1.381 + root = createElement(mXMLElemStart, status); 1.382 + UXMLElement *el = root; 1.383 + 1.384 + // 1.385 + // This is the loop that consumes the root element of the document, 1.386 + // including all nested content. Nested elements are handled by 1.387 + // explicit pushes/pops of the element stack; there is no recursion 1.388 + // in the control flow of this code. 1.389 + // "el" always refers to the current element, the one to which content 1.390 + // is being added. It is above the top of the element stack. 1.391 + for (;;) { 1.392 + // Nested Element Start 1.393 + if (mXMLElemStart.lookingAt(fPos, status)) { 1.394 + UXMLElement *t = createElement(mXMLElemStart, status); 1.395 + el->fChildren.addElement(t, status); 1.396 + t->fParent = el; 1.397 + fElementStack.push(el, status); 1.398 + el = t; 1.399 + continue; 1.400 + } 1.401 + 1.402 + // Text Content. String is concatenated onto the current node's content, 1.403 + // but only if it contains something other than spaces. 1.404 + UnicodeString s = scanContent(status); 1.405 + if (s.length() > 0) { 1.406 + mXMLSP.reset(s); 1.407 + if (mXMLSP.matches(status) == FALSE) { 1.408 + // This chunk of text contains something other than just 1.409 + // white space. Make a child node for it. 
1.410 + replaceCharRefs(s, status); 1.411 + el->fChildren.addElement(s.clone(), status); 1.412 + } 1.413 + mXMLSP.reset(src); // The matchers need to stay set to the main input string. 1.414 + continue; 1.415 + } 1.416 + 1.417 + // Comments. Discard. 1.418 + if (mXMLComment.lookingAt(fPos, status)) { 1.419 + fPos = mXMLComment.end(status); 1.420 + continue; 1.421 + } 1.422 + 1.423 + // PIs. Discard. 1.424 + if (mXMLPI.lookingAt(fPos, status)) { 1.425 + fPos = mXMLPI.end(status); 1.426 + continue; 1.427 + } 1.428 + 1.429 + // Element End 1.430 + if (mXMLElemEnd.lookingAt(fPos, status)) { 1.431 + fPos = mXMLElemEnd.end(0, status); 1.432 + const UnicodeString name = mXMLElemEnd.group(1, status); 1.433 + if (name != *el->fName) { 1.434 + error("Element start / end tag mismatch", status); 1.435 + goto errorExit; 1.436 + } 1.437 + if (fElementStack.empty()) { 1.438 + // Close of the root element. We're done with the doc. 1.439 + el = NULL; 1.440 + break; 1.441 + } 1.442 + el = (UXMLElement *)fElementStack.pop(); 1.443 + continue; 1.444 + } 1.445 + 1.446 + // Empty Element. Stored as a child of the current element, but not stacked. 1.447 + if (mXMLElemEmpty.lookingAt(fPos, status)) { 1.448 + UXMLElement *t = createElement(mXMLElemEmpty, status); 1.449 + el->fChildren.addElement(t, status); 1.450 + continue; 1.451 + } 1.452 + 1.453 + // Hit something within the document that doesn't match anything. 1.454 + // It's an error. 1.455 + error("Unrecognized markup", status); 1.456 + break; 1.457 + } 1.458 + 1.459 + if (el != NULL || !fElementStack.empty()) { 1.460 + // We bailed out early, for some reason. 1.461 + error("Root element not closed.", status); 1.462 + goto errorExit; 1.463 + } 1.464 + } 1.465 + 1.466 + // Root Element parse is complete. 1.467 + // Consume the annoying xml "Misc" that can appear at the end of the doc. 
1.468 + parseMisc(status); 1.469 + 1.470 + // We should have reached the end of the input 1.471 + if (fPos != src.length()) { 1.472 + error("Extra content at the end of the document", status); 1.473 + goto errorExit; 1.474 + } 1.475 + 1.476 + // Success! 1.477 + return root; 1.478 + 1.479 +errorExit: 1.480 + delete root; 1.481 + return NULL; 1.482 +} 1.483 + 1.484 +// 1.485 +// createElement 1.486 +// We've just matched an element start tag. Create and fill in a UXMLElement object 1.487 +// for it. 1.488 +// 1.489 +UXMLElement * 1.490 +UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { 1.491 + // First capture group is the element's name. 1.492 + UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status); 1.493 + 1.494 + // Scan for attributes. 1.495 + int32_t pos = mEl.end(1, status); // The position after the end of the tag name 1.496 + 1.497 + while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element. 1.498 + UnicodeString attName = mAttrValue.group(1, status); 1.499 + UnicodeString attValue = mAttrValue.group(2, status); 1.500 + 1.501 + // Trim the quotes from the att value. These are left over from the original regex 1.502 + // that parsed the attribue, which couldn't conveniently strip them. 1.503 + attValue.remove(0,1); // one char from the beginning 1.504 + attValue.truncate(attValue.length()-1); // and one from the end. 1.505 + 1.506 + // XML Attribue value normalization. 1.507 + // This is one of the really screwy parts of the XML spec. 1.508 + // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize 1.509 + // Note that non-validating parsers must treat all entities as type CDATA 1.510 + // which simplifies things some. 
1.511 + 1.512 + // Att normalization step 1: normalize any newlines in the attribute value 1.513 + mNewLineNormalizer.reset(attValue); 1.514 + attValue = mNewLineNormalizer.replaceAll(fOneLF, status); 1.515 + 1.516 + // Next change all xml white space chars to plain \u0020 spaces. 1.517 + mAttrNormalizer.reset(attValue); 1.518 + UnicodeString oneSpace((UChar)0x0020); 1.519 + attValue = mAttrNormalizer.replaceAll(oneSpace, status); 1.520 + 1.521 + // Replace character entities. 1.522 + replaceCharRefs(attValue, status); 1.523 + 1.524 + // Save the attribute name and value in our document structure. 1.525 + el->fAttNames.addElement((void *)intern(attName, status), status); 1.526 + el->fAttValues.addElement(attValue.clone(), status); 1.527 + pos = mAttrValue.end(2, status); 1.528 + } 1.529 + fPos = mEl.end(0, status); 1.530 + return el; 1.531 +} 1.532 + 1.533 +// 1.534 +// parseMisc 1.535 +// Consume XML "Misc" [production #27] 1.536 +// which is any combination of space, PI and comments 1.537 +// Need to watch end-of-input because xml MISC stuff is allowed after 1.538 +// the document element, so we WILL scan off the end in this function 1.539 +// 1.540 +void 1.541 +UXMLParser::parseMisc(UErrorCode &status) { 1.542 + for (;;) { 1.543 + if (fPos >= mXMLPI.input().length()) { 1.544 + break; 1.545 + } 1.546 + if (mXMLPI.lookingAt(fPos, status)) { 1.547 + fPos = mXMLPI.end(status); 1.548 + continue; 1.549 + } 1.550 + if (mXMLSP.lookingAt(fPos, status)) { 1.551 + fPos = mXMLSP.end(status); 1.552 + continue; 1.553 + } 1.554 + if (mXMLComment.lookingAt(fPos, status)) { 1.555 + fPos = mXMLComment.end(status); 1.556 + continue; 1.557 + } 1.558 + break; 1.559 + } 1.560 +} 1.561 + 1.562 +// 1.563 +// Scan for document content. 
1.564 +// 1.565 +UnicodeString 1.566 +UXMLParser::scanContent(UErrorCode &status) { 1.567 + UnicodeString result; 1.568 + if (mXMLCharData.lookingAt(fPos, status)) { 1.569 + result = mXMLCharData.group((int32_t)0, status); 1.570 + // Normalize the new-lines. (Before char ref substitution) 1.571 + mNewLineNormalizer.reset(result); 1.572 + result = mNewLineNormalizer.replaceAll(fOneLF, status); 1.573 + 1.574 + // TODO: handle CDATA 1.575 + fPos = mXMLCharData.end(0, status); 1.576 + } 1.577 + 1.578 + return result; 1.579 +} 1.580 + 1.581 +// 1.582 +// replaceCharRefs 1.583 +// 1.584 +// replace the char entities < & { ካ etc. in a string 1.585 +// with the corresponding actual character. 1.586 +// 1.587 +void 1.588 +UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { 1.589 + UnicodeString result; 1.590 + UnicodeString replacement; 1.591 + int i; 1.592 + 1.593 + mAmps.reset(s); 1.594 + // See the initialization for the regex matcher mAmps. 1.595 + // Which entity we've matched is determined by which capture group has content, 1.596 + // which is flaged by start() of that group not being -1. 
1.597 + while (mAmps.find()) { 1.598 + if (mAmps.start(1, status) != -1) { 1.599 + replacement.setTo((UChar)x_AMP); 1.600 + } else if (mAmps.start(2, status) != -1) { 1.601 + replacement.setTo((UChar)x_LT); 1.602 + } else if (mAmps.start(3, status) != -1) { 1.603 + replacement.setTo((UChar)x_GT); 1.604 + } else if (mAmps.start(4, status) != -1) { 1.605 + replacement.setTo((UChar)x_APOS); 1.606 + } else if (mAmps.start(5, status) != -1) { 1.607 + replacement.setTo((UChar)x_QUOT); 1.608 + } else if (mAmps.start(6, status) != -1) { 1.609 + UnicodeString hexString = mAmps.group(6, status); 1.610 + UChar32 val = 0; 1.611 + for (i=0; i<hexString.length(); i++) { 1.612 + val = (val << 4) + u_digit(hexString.charAt(i), 16); 1.613 + } 1.614 + // TODO: some verification that the character is valid 1.615 + replacement.setTo(val); 1.616 + } else if (mAmps.start(7, status) != -1) { 1.617 + UnicodeString decimalString = mAmps.group(7, status); 1.618 + UChar32 val = 0; 1.619 + for (i=0; i<decimalString.length(); i++) { 1.620 + val = val*10 + u_digit(decimalString.charAt(i), 10); 1.621 + } 1.622 + // TODO: some verification that the character is valid 1.623 + replacement.setTo(val); 1.624 + } else { 1.625 + // An unrecognized &entity; Leave it alone. 1.626 + // TODO: check that it really looks like an entity, and is not some 1.627 + // random & in the text. 1.628 + replacement = mAmps.group((int32_t)0, status); 1.629 + } 1.630 + mAmps.appendReplacement(result, replacement, status); 1.631 + } 1.632 + mAmps.appendTail(result); 1.633 + s = result; 1.634 +} 1.635 + 1.636 +void 1.637 +UXMLParser::error(const char *message, UErrorCode &status) { 1.638 + // TODO: something better here... 
1.639 + const UnicodeString &src=mXMLDecl.input(); 1.640 + int line = 0; 1.641 + int ci = 0; 1.642 + while (ci < fPos && ci>=0) { 1.643 + ci = src.indexOf((UChar)0x0a, ci+1); 1.644 + line++; 1.645 + } 1.646 + fprintf(stderr, "Error: %s at line %d\n", message, line); 1.647 + if (U_SUCCESS(status)) { 1.648 + status = U_PARSE_ERROR; 1.649 + } 1.650 +} 1.651 + 1.652 +// intern strings like in Java 1.653 + 1.654 +const UnicodeString * 1.655 +UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { 1.656 + const UHashElement *he=fNames.find(s); 1.657 + if(he!=NULL) { 1.658 + // already a known name, return its hashed key pointer 1.659 + return (const UnicodeString *)he->key.pointer; 1.660 + } else { 1.661 + // add this new name and return its hashed key pointer 1.662 + fNames.puti(s, 0, errorCode); 1.663 + he=fNames.find(s); 1.664 + return (const UnicodeString *)he->key.pointer; 1.665 + } 1.666 +} 1.667 + 1.668 +const UnicodeString * 1.669 +UXMLParser::findName(const UnicodeString &s) const { 1.670 + const UHashElement *he=fNames.find(s); 1.671 + if(he!=NULL) { 1.672 + // a known name, return its hashed key pointer 1.673 + return (const UnicodeString *)he->key.pointer; 1.674 + } else { 1.675 + // unknown name 1.676 + return NULL; 1.677 + } 1.678 +} 1.679 + 1.680 +// UXMLElement ------------------------------------------------------------- *** 1.681 + 1.682 +UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) : 1.683 + fParser(parser), 1.684 + fName(name), 1.685 + fAttNames(errorCode), 1.686 + fAttValues(errorCode), 1.687 + fChildren(errorCode), 1.688 + fParent(NULL) 1.689 +{ 1.690 +} 1.691 + 1.692 +UXMLElement::~UXMLElement() { 1.693 + int i; 1.694 + // attribute names are owned by the UXMLParser, don't delete them here 1.695 + for (i=fAttValues.size()-1; i>=0; i--) { 1.696 + delete (UObject *)fAttValues.elementAt(i); 1.697 + } 1.698 + for (i=fChildren.size()-1; i>=0; i--) { 1.699 + delete (UObject 
*)fChildren.elementAt(i); 1.700 + } 1.701 +} 1.702 + 1.703 +const UnicodeString & 1.704 +UXMLElement::getTagName() const { 1.705 + return *fName; 1.706 +} 1.707 + 1.708 +UnicodeString 1.709 +UXMLElement::getText(UBool recurse) const { 1.710 + UnicodeString text; 1.711 + appendText(text, recurse); 1.712 + return text; 1.713 +} 1.714 + 1.715 +void 1.716 +UXMLElement::appendText(UnicodeString &text, UBool recurse) const { 1.717 + const UObject *node; 1.718 + int32_t i, count=fChildren.size(); 1.719 + for(i=0; i<count; ++i) { 1.720 + node=(const UObject *)fChildren.elementAt(i); 1.721 + const UnicodeString *s=dynamic_cast<const UnicodeString *>(node); 1.722 + if(s!=NULL) { 1.723 + text.append(*s); 1.724 + } else if(recurse) /* must be a UXMLElement */ { 1.725 + ((const UXMLElement *)node)->appendText(text, recurse); 1.726 + } 1.727 + } 1.728 +} 1.729 + 1.730 +int32_t 1.731 +UXMLElement::countAttributes() const { 1.732 + return fAttNames.size(); 1.733 +} 1.734 + 1.735 +const UnicodeString * 1.736 +UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const { 1.737 + if(0<=i && i<fAttNames.size()) { 1.738 + name.setTo(*(const UnicodeString *)fAttNames.elementAt(i)); 1.739 + value.setTo(*(const UnicodeString *)fAttValues.elementAt(i)); 1.740 + return &value; // or return (UnicodeString *)fAttValues.elementAt(i); 1.741 + } else { 1.742 + return NULL; 1.743 + } 1.744 +} 1.745 + 1.746 +const UnicodeString * 1.747 +UXMLElement::getAttribute(const UnicodeString &name) const { 1.748 + // search for the attribute name by comparing the interned pointer, 1.749 + // not the string contents 1.750 + const UnicodeString *p=fParser->findName(name); 1.751 + if(p==NULL) { 1.752 + return NULL; // no such attribute seen by the parser at all 1.753 + } 1.754 + 1.755 + int32_t i, count=fAttNames.size(); 1.756 + for(i=0; i<count; ++i) { 1.757 + if(p==(const UnicodeString *)fAttNames.elementAt(i)) { 1.758 + return (const UnicodeString *)fAttValues.elementAt(i); 1.759 + 
} 1.760 + } 1.761 + return NULL; 1.762 +} 1.763 + 1.764 +int32_t 1.765 +UXMLElement::countChildren() const { 1.766 + return fChildren.size(); 1.767 +} 1.768 + 1.769 +const UObject * 1.770 +UXMLElement::getChild(int32_t i, UXMLNodeType &type) const { 1.771 + if(0<=i && i<fChildren.size()) { 1.772 + const UObject *node=(const UObject *)fChildren.elementAt(i); 1.773 + if(dynamic_cast<const UXMLElement *>(node)!=NULL) { 1.774 + type=UXML_NODE_TYPE_ELEMENT; 1.775 + } else { 1.776 + type=UXML_NODE_TYPE_STRING; 1.777 + } 1.778 + return node; 1.779 + } else { 1.780 + return NULL; 1.781 + } 1.782 +} 1.783 + 1.784 +const UXMLElement * 1.785 +UXMLElement::nextChildElement(int32_t &i) const { 1.786 + if(i<0) { 1.787 + return NULL; 1.788 + } 1.789 + 1.790 + const UObject *node; 1.791 + int32_t count=fChildren.size(); 1.792 + while(i<count) { 1.793 + node=(const UObject *)fChildren.elementAt(i++); 1.794 + const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); 1.795 + if(elem!=NULL) { 1.796 + return elem; 1.797 + } 1.798 + } 1.799 + return NULL; 1.800 +} 1.801 + 1.802 +const UXMLElement * 1.803 +UXMLElement::getChildElement(const UnicodeString &name) const { 1.804 + // search for the element name by comparing the interned pointer, 1.805 + // not the string contents 1.806 + const UnicodeString *p=fParser->findName(name); 1.807 + if(p==NULL) { 1.808 + return NULL; // no such element seen by the parser at all 1.809 + } 1.810 + 1.811 + const UObject *node; 1.812 + int32_t i, count=fChildren.size(); 1.813 + for(i=0; i<count; ++i) { 1.814 + node=(const UObject *)fChildren.elementAt(i); 1.815 + const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); 1.816 + if(elem!=NULL) { 1.817 + if(p==elem->fName) { 1.818 + return elem; 1.819 + } 1.820 + } 1.821 + } 1.822 + return NULL; 1.823 +} 1.824 + 1.825 +U_NAMESPACE_END 1.826 + 1.827 +#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 1.828 +