intl/icu/source/tools/toolutil/xmlparser.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2004-2010, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: xmlparser.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2004jul21
michael@0 14 * created by: Andy Heninger
michael@0 15 */
michael@0 16
michael@0 17 #include <stdio.h>
michael@0 18 #include "unicode/uchar.h"
michael@0 19 #include "unicode/ucnv.h"
michael@0 20 #include "unicode/regex.h"
michael@0 21 #include "filestrm.h"
michael@0 22 #include "xmlparser.h"
michael@0 23
michael@0 24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
michael@0 25
// Character constants: code points of the ASCII characters that the parser
// refers to by name.
enum {
    x_QUOT = 0x22,  // quotation mark
    x_AMP  = 0x26,  // ampersand
    x_APOS = 0x27,  // apostrophe
    x_LT   = 0x3c,  // less-than sign
    x_GT   = 0x3e,  // greater-than sign
    x_l    = 0x6c   // lowercase letter l
};
michael@0 35
// Regex set matching a single XML white space character
// (space, tab, carriage return, line feed).
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML production #4: the set of characters that may start a name.
#define XML_NAMESTARTCHAR \
    "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

// XML production #5: the set of characters that may continue a name.
#define XML_NAMECHAR \
    "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

// XML production #6: a complete name, one start char followed by name chars.
#define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
michael@0 49
michael@0 50 U_NAMESPACE_BEGIN
michael@0 51
michael@0 52 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
michael@0 53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
michael@0 54
michael@0 55 //
michael@0 56 // UXMLParser constructor. Mostly just initializes the ICU regexes that are
michael@0 57 // used for parsing.
michael@0 58 //
michael@0 59 UXMLParser::UXMLParser(UErrorCode &status) :
michael@0 60 // XML Declaration. XML Production #23.
michael@0 61 // example: "<?xml version=1.0 encoding="utf-16" ?>
michael@0 62 // This is a sloppy implementation - just look for the leading <?xml and the closing ?>
michael@0 63 // allow for a possible leading BOM.
michael@0 64 mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
michael@0 65
michael@0 66 // XML Comment production #15
michael@0 67 // example: "<!-- whatever -->
michael@0 68 // note, does not detect an illegal "--" within comments
michael@0 69 mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
michael@0 70
michael@0 71 // XML Spaces
michael@0 72 // production [3]
michael@0 73 mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
michael@0 74
michael@0 75 // XML Doctype decl production #28
michael@0 76 // example "<!DOCTYPE foo SYSTEM "somewhere" >
michael@0 77 // or "<!DOCTYPE foo [internal dtd]>
michael@0 78 // TODO: we don't actually parse the DOCTYPE or internal subsets.
michael@0 79 // Some internal dtd subsets could confuse this simple-minded
michael@0 80 // attempt at skipping over them, specifically, occcurences
michael@0 81 // of closeing square brackets. These could appear in comments,
michael@0 82 // or in parameter entity declarations, for example.
michael@0 83 mXMLDoctype(UnicodeString(
michael@0 84 "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
michael@0 85 ), 0, status),
michael@0 86
michael@0 87 // XML PI production #16
michael@0 88 // example "<?target stuff?>
michael@0 89 mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
michael@0 90
michael@0 91 // XML Element Start Productions #40, #41
michael@0 92 // example <foo att1='abc' att2="d e f" >
michael@0 93 // capture #1: the tag name
michael@0 94 //
michael@0 95 mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name"
michael@0 96 "(?:"
michael@0 97 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
michael@0 98 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
michael@0 99 ")*" // * for zero or more attributes.
michael@0 100 XML_SPACES "*?>", -1, US_INV), 0, status), // match " >"
michael@0 101
michael@0 102 // XML Element End production #42
michael@0 103 // example </foo>
michael@0 104 mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
michael@0 105
michael@0 106 // XML Element Empty production #44
michael@0 107 // example <foo att1="abc" att2="d e f" />
michael@0 108 mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name"
michael@0 109 "(?:"
michael@0 110 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
michael@0 111 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
michael@0 112 ")*" // * for zero or more attributes.
michael@0 113 XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />"
michael@0 114
michael@0 115
michael@0 116 // XMLCharData. Everything but '<'. Note that & will be dealt with later.
michael@0 117 mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
michael@0 118
michael@0 119 // Attribute name = "value". XML Productions 10, 40/41
michael@0 120 // Capture group 1 is name,
michael@0 121 // 2 is the attribute value, including the quotes.
michael@0 122 //
michael@0 123 // Note that attributes are scanned twice. The first time is with
michael@0 124 // the regex for an entire element start. There, the attributes
michael@0 125 // are checked syntactically, but not separted out one by one.
michael@0 126 // Here, we match a single attribute, and make its name and
michael@0 127 // attribute value available to the parser code.
michael@0 128 mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*"
michael@0 129 "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
michael@0 130
michael@0 131
michael@0 132 mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
michael@0 133
michael@0 134 // Match any of the new-line sequences in content.
michael@0 135 // All are changed to \u000a.
michael@0 136 mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
michael@0 137
michael@0 138 // & char references
michael@0 139 // We will figure out what we've got based on which capture group has content.
michael@0 140 // The last one is a catchall for unrecognized entity references..
michael@0 141 // 1 2 3 4 5 6 7 8
michael@0 142 mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
michael@0 143 0, status),
michael@0 144
michael@0 145 fNames(status),
michael@0 146 fElementStack(status),
michael@0 147 fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization.
michael@0 148 {
michael@0 149 }
michael@0 150
michael@0 151 UXMLParser *
michael@0 152 UXMLParser::createParser(UErrorCode &errorCode) {
michael@0 153 if (U_FAILURE(errorCode)) {
michael@0 154 return NULL;
michael@0 155 } else {
michael@0 156 return new UXMLParser(errorCode);
michael@0 157 }
michael@0 158 }
michael@0 159
michael@0 160 UXMLParser::~UXMLParser() {}
michael@0 161
michael@0 162 UXMLElement *
michael@0 163 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
michael@0 164 char bytes[4096], charsetBuffer[100];
michael@0 165 FileStream *f;
michael@0 166 const char *charset, *pb;
michael@0 167 UnicodeString src;
michael@0 168 UConverter *cnv;
michael@0 169 UChar *buffer, *pu;
michael@0 170 int32_t fileLength, bytesLength, length, capacity;
michael@0 171 UBool flush;
michael@0 172
michael@0 173 if(U_FAILURE(errorCode)) {
michael@0 174 return NULL;
michael@0 175 }
michael@0 176
michael@0 177 f=T_FileStream_open(filename, "rb");
michael@0 178 if(f==NULL) {
michael@0 179 errorCode=U_FILE_ACCESS_ERROR;
michael@0 180 return NULL;
michael@0 181 }
michael@0 182
michael@0 183 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
michael@0 184 if(bytesLength<(int32_t)sizeof(bytes)) {
michael@0 185 // we have already read the entire file
michael@0 186 fileLength=bytesLength;
michael@0 187 } else {
michael@0 188 // get the file length
michael@0 189 fileLength=T_FileStream_size(f);
michael@0 190 }
michael@0 191
michael@0 192 /*
michael@0 193 * get the charset:
michael@0 194 * 1. Unicode signature
michael@0 195 * 2. treat as ISO-8859-1 and read XML encoding="charser"
michael@0 196 * 3. default to UTF-8
michael@0 197 */
michael@0 198 charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
michael@0 199 if(U_SUCCESS(errorCode) && charset!=NULL) {
michael@0 200 // open converter according to Unicode signature
michael@0 201 cnv=ucnv_open(charset, &errorCode);
michael@0 202 } else {
michael@0 203 // read as Latin-1 and parse the XML declaration and encoding
michael@0 204 cnv=ucnv_open("ISO-8859-1", &errorCode);
michael@0 205 if(U_FAILURE(errorCode)) {
michael@0 206 // unexpected error opening Latin-1 converter
michael@0 207 goto exit;
michael@0 208 }
michael@0 209
michael@0 210 buffer=src.getBuffer(bytesLength);
michael@0 211 if(buffer==NULL) {
michael@0 212 // unexpected failure to reserve some string capacity
michael@0 213 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 214 goto exit;
michael@0 215 }
michael@0 216 pb=bytes;
michael@0 217 pu=buffer;
michael@0 218 ucnv_toUnicode(
michael@0 219 cnv,
michael@0 220 &pu, buffer+src.getCapacity(),
michael@0 221 &pb, bytes+bytesLength,
michael@0 222 NULL, TRUE, &errorCode);
michael@0 223 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
michael@0 224 ucnv_close(cnv);
michael@0 225 cnv=NULL;
michael@0 226 if(U_FAILURE(errorCode)) {
michael@0 227 // unexpected error in conversion from Latin-1
michael@0 228 src.remove();
michael@0 229 goto exit;
michael@0 230 }
michael@0 231
michael@0 232 // parse XML declaration
michael@0 233 if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
michael@0 234 int32_t declEnd=mXMLDecl.end(errorCode);
michael@0 235 // go beyond <?xml
michael@0 236 int32_t pos=src.indexOf((UChar)x_l)+1;
michael@0 237
michael@0 238 mAttrValue.reset(src);
michael@0 239 while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element.
michael@0 240 UnicodeString attName = mAttrValue.group(1, errorCode);
michael@0 241 UnicodeString attValue = mAttrValue.group(2, errorCode);
michael@0 242
michael@0 243 // Trim the quotes from the att value. These are left over from the original regex
michael@0 244 // that parsed the attribue, which couldn't conveniently strip them.
michael@0 245 attValue.remove(0,1); // one char from the beginning
michael@0 246 attValue.truncate(attValue.length()-1); // and one from the end.
michael@0 247
michael@0 248 if(attName==UNICODE_STRING("encoding", 8)) {
michael@0 249 length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
michael@0 250 charset=charsetBuffer;
michael@0 251 break;
michael@0 252 }
michael@0 253 pos = mAttrValue.end(2, errorCode);
michael@0 254 }
michael@0 255
michael@0 256 if(charset==NULL) {
michael@0 257 // default to UTF-8
michael@0 258 charset="UTF-8";
michael@0 259 }
michael@0 260 cnv=ucnv_open(charset, &errorCode);
michael@0 261 }
michael@0 262 }
michael@0 263
michael@0 264 if(U_FAILURE(errorCode)) {
michael@0 265 // unable to open the converter
michael@0 266 goto exit;
michael@0 267 }
michael@0 268
michael@0 269 // convert the file contents
michael@0 270 capacity=fileLength; // estimated capacity
michael@0 271 src.getBuffer(capacity);
michael@0 272 src.releaseBuffer(0); // zero length
michael@0 273 flush=FALSE;
michael@0 274 for(;;) {
michael@0 275 // convert contents of bytes[bytesLength]
michael@0 276 pb=bytes;
michael@0 277 for(;;) {
michael@0 278 length=src.length();
michael@0 279 buffer=src.getBuffer(capacity);
michael@0 280 if(buffer==NULL) {
michael@0 281 // unexpected failure to reserve some string capacity
michael@0 282 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 283 goto exit;
michael@0 284 }
michael@0 285
michael@0 286 pu=buffer+length;
michael@0 287 ucnv_toUnicode(
michael@0 288 cnv, &pu, buffer+src.getCapacity(),
michael@0 289 &pb, bytes+bytesLength,
michael@0 290 NULL, FALSE, &errorCode);
michael@0 291 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
michael@0 292 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
michael@0 293 errorCode=U_ZERO_ERROR;
michael@0 294 capacity=(3*src.getCapacity())/2; // increase capacity by 50%
michael@0 295 } else {
michael@0 296 break;
michael@0 297 }
michael@0 298 }
michael@0 299
michael@0 300 if(U_FAILURE(errorCode)) {
michael@0 301 break; // conversion error
michael@0 302 }
michael@0 303
michael@0 304 if(flush) {
michael@0 305 break; // completely converted the file
michael@0 306 }
michael@0 307
michael@0 308 // read next block
michael@0 309 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
michael@0 310 if(bytesLength==0) {
michael@0 311 // reached end of file, convert once more to flush the converter
michael@0 312 flush=TRUE;
michael@0 313 }
michael@0 314 };
michael@0 315
michael@0 316 exit:
michael@0 317 ucnv_close(cnv);
michael@0 318 T_FileStream_close(f);
michael@0 319
michael@0 320 if(U_SUCCESS(errorCode)) {
michael@0 321 return parse(src, errorCode);
michael@0 322 } else {
michael@0 323 return NULL;
michael@0 324 }
michael@0 325 }
michael@0 326
michael@0 327 UXMLElement *
michael@0 328 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
michael@0 329 if(U_FAILURE(status)) {
michael@0 330 return NULL;
michael@0 331 }
michael@0 332
michael@0 333 UXMLElement *root = NULL;
michael@0 334 fPos = 0; // TODO use just a local pos variable and pass it into functions
michael@0 335 // where necessary?
michael@0 336
michael@0 337 // set all matchers to work on the input string
michael@0 338 mXMLDecl.reset(src);
michael@0 339 mXMLComment.reset(src);
michael@0 340 mXMLSP.reset(src);
michael@0 341 mXMLDoctype.reset(src);
michael@0 342 mXMLPI.reset(src);
michael@0 343 mXMLElemStart.reset(src);
michael@0 344 mXMLElemEnd.reset(src);
michael@0 345 mXMLElemEmpty.reset(src);
michael@0 346 mXMLCharData.reset(src);
michael@0 347 mAttrValue.reset(src);
michael@0 348 mAttrNormalizer.reset(src);
michael@0 349 mNewLineNormalizer.reset(src);
michael@0 350 mAmps.reset(src);
michael@0 351
michael@0 352 // Consume the XML Declaration, if present.
michael@0 353 if (mXMLDecl.lookingAt(fPos, status)) {
michael@0 354 fPos = mXMLDecl.end(status);
michael@0 355 }
michael@0 356
michael@0 357 // Consume "misc" [XML production 27] appearing before DocType
michael@0 358 parseMisc(status);
michael@0 359
michael@0 360 // Consume a DocType declaration, if present.
michael@0 361 if (mXMLDoctype.lookingAt(fPos, status)) {
michael@0 362 fPos = mXMLDoctype.end(status);
michael@0 363 }
michael@0 364
michael@0 365 // Consume additional "misc" [XML production 27] appearing after the DocType
michael@0 366 parseMisc(status);
michael@0 367
michael@0 368 // Get the root element
michael@0 369 if (mXMLElemEmpty.lookingAt(fPos, status)) {
michael@0 370 // Root is an empty element (no nested elements or content)
michael@0 371 root = createElement(mXMLElemEmpty, status);
michael@0 372 fPos = mXMLElemEmpty.end(status);
michael@0 373 } else {
michael@0 374 if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
michael@0 375 error("Root Element expected", status);
michael@0 376 goto errorExit;
michael@0 377 }
michael@0 378 root = createElement(mXMLElemStart, status);
michael@0 379 UXMLElement *el = root;
michael@0 380
michael@0 381 //
michael@0 382 // This is the loop that consumes the root element of the document,
michael@0 383 // including all nested content. Nested elements are handled by
michael@0 384 // explicit pushes/pops of the element stack; there is no recursion
michael@0 385 // in the control flow of this code.
michael@0 386 // "el" always refers to the current element, the one to which content
michael@0 387 // is being added. It is above the top of the element stack.
michael@0 388 for (;;) {
michael@0 389 // Nested Element Start
michael@0 390 if (mXMLElemStart.lookingAt(fPos, status)) {
michael@0 391 UXMLElement *t = createElement(mXMLElemStart, status);
michael@0 392 el->fChildren.addElement(t, status);
michael@0 393 t->fParent = el;
michael@0 394 fElementStack.push(el, status);
michael@0 395 el = t;
michael@0 396 continue;
michael@0 397 }
michael@0 398
michael@0 399 // Text Content. String is concatenated onto the current node's content,
michael@0 400 // but only if it contains something other than spaces.
michael@0 401 UnicodeString s = scanContent(status);
michael@0 402 if (s.length() > 0) {
michael@0 403 mXMLSP.reset(s);
michael@0 404 if (mXMLSP.matches(status) == FALSE) {
michael@0 405 // This chunk of text contains something other than just
michael@0 406 // white space. Make a child node for it.
michael@0 407 replaceCharRefs(s, status);
michael@0 408 el->fChildren.addElement(s.clone(), status);
michael@0 409 }
michael@0 410 mXMLSP.reset(src); // The matchers need to stay set to the main input string.
michael@0 411 continue;
michael@0 412 }
michael@0 413
michael@0 414 // Comments. Discard.
michael@0 415 if (mXMLComment.lookingAt(fPos, status)) {
michael@0 416 fPos = mXMLComment.end(status);
michael@0 417 continue;
michael@0 418 }
michael@0 419
michael@0 420 // PIs. Discard.
michael@0 421 if (mXMLPI.lookingAt(fPos, status)) {
michael@0 422 fPos = mXMLPI.end(status);
michael@0 423 continue;
michael@0 424 }
michael@0 425
michael@0 426 // Element End
michael@0 427 if (mXMLElemEnd.lookingAt(fPos, status)) {
michael@0 428 fPos = mXMLElemEnd.end(0, status);
michael@0 429 const UnicodeString name = mXMLElemEnd.group(1, status);
michael@0 430 if (name != *el->fName) {
michael@0 431 error("Element start / end tag mismatch", status);
michael@0 432 goto errorExit;
michael@0 433 }
michael@0 434 if (fElementStack.empty()) {
michael@0 435 // Close of the root element. We're done with the doc.
michael@0 436 el = NULL;
michael@0 437 break;
michael@0 438 }
michael@0 439 el = (UXMLElement *)fElementStack.pop();
michael@0 440 continue;
michael@0 441 }
michael@0 442
michael@0 443 // Empty Element. Stored as a child of the current element, but not stacked.
michael@0 444 if (mXMLElemEmpty.lookingAt(fPos, status)) {
michael@0 445 UXMLElement *t = createElement(mXMLElemEmpty, status);
michael@0 446 el->fChildren.addElement(t, status);
michael@0 447 continue;
michael@0 448 }
michael@0 449
michael@0 450 // Hit something within the document that doesn't match anything.
michael@0 451 // It's an error.
michael@0 452 error("Unrecognized markup", status);
michael@0 453 break;
michael@0 454 }
michael@0 455
michael@0 456 if (el != NULL || !fElementStack.empty()) {
michael@0 457 // We bailed out early, for some reason.
michael@0 458 error("Root element not closed.", status);
michael@0 459 goto errorExit;
michael@0 460 }
michael@0 461 }
michael@0 462
michael@0 463 // Root Element parse is complete.
michael@0 464 // Consume the annoying xml "Misc" that can appear at the end of the doc.
michael@0 465 parseMisc(status);
michael@0 466
michael@0 467 // We should have reached the end of the input
michael@0 468 if (fPos != src.length()) {
michael@0 469 error("Extra content at the end of the document", status);
michael@0 470 goto errorExit;
michael@0 471 }
michael@0 472
michael@0 473 // Success!
michael@0 474 return root;
michael@0 475
michael@0 476 errorExit:
michael@0 477 delete root;
michael@0 478 return NULL;
michael@0 479 }
michael@0 480
michael@0 481 //
michael@0 482 // createElement
michael@0 483 // We've just matched an element start tag. Create and fill in a UXMLElement object
michael@0 484 // for it.
michael@0 485 //
michael@0 486 UXMLElement *
michael@0 487 UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) {
michael@0 488 // First capture group is the element's name.
michael@0 489 UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
michael@0 490
michael@0 491 // Scan for attributes.
michael@0 492 int32_t pos = mEl.end(1, status); // The position after the end of the tag name
michael@0 493
michael@0 494 while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element.
michael@0 495 UnicodeString attName = mAttrValue.group(1, status);
michael@0 496 UnicodeString attValue = mAttrValue.group(2, status);
michael@0 497
michael@0 498 // Trim the quotes from the att value. These are left over from the original regex
michael@0 499 // that parsed the attribue, which couldn't conveniently strip them.
michael@0 500 attValue.remove(0,1); // one char from the beginning
michael@0 501 attValue.truncate(attValue.length()-1); // and one from the end.
michael@0 502
michael@0 503 // XML Attribue value normalization.
michael@0 504 // This is one of the really screwy parts of the XML spec.
michael@0 505 // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
michael@0 506 // Note that non-validating parsers must treat all entities as type CDATA
michael@0 507 // which simplifies things some.
michael@0 508
michael@0 509 // Att normalization step 1: normalize any newlines in the attribute value
michael@0 510 mNewLineNormalizer.reset(attValue);
michael@0 511 attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
michael@0 512
michael@0 513 // Next change all xml white space chars to plain \u0020 spaces.
michael@0 514 mAttrNormalizer.reset(attValue);
michael@0 515 UnicodeString oneSpace((UChar)0x0020);
michael@0 516 attValue = mAttrNormalizer.replaceAll(oneSpace, status);
michael@0 517
michael@0 518 // Replace character entities.
michael@0 519 replaceCharRefs(attValue, status);
michael@0 520
michael@0 521 // Save the attribute name and value in our document structure.
michael@0 522 el->fAttNames.addElement((void *)intern(attName, status), status);
michael@0 523 el->fAttValues.addElement(attValue.clone(), status);
michael@0 524 pos = mAttrValue.end(2, status);
michael@0 525 }
michael@0 526 fPos = mEl.end(0, status);
michael@0 527 return el;
michael@0 528 }
michael@0 529
michael@0 530 //
michael@0 531 // parseMisc
michael@0 532 // Consume XML "Misc" [production #27]
michael@0 533 // which is any combination of space, PI and comments
michael@0 534 // Need to watch end-of-input because xml MISC stuff is allowed after
michael@0 535 // the document element, so we WILL scan off the end in this function
michael@0 536 //
michael@0 537 void
michael@0 538 UXMLParser::parseMisc(UErrorCode &status) {
michael@0 539 for (;;) {
michael@0 540 if (fPos >= mXMLPI.input().length()) {
michael@0 541 break;
michael@0 542 }
michael@0 543 if (mXMLPI.lookingAt(fPos, status)) {
michael@0 544 fPos = mXMLPI.end(status);
michael@0 545 continue;
michael@0 546 }
michael@0 547 if (mXMLSP.lookingAt(fPos, status)) {
michael@0 548 fPos = mXMLSP.end(status);
michael@0 549 continue;
michael@0 550 }
michael@0 551 if (mXMLComment.lookingAt(fPos, status)) {
michael@0 552 fPos = mXMLComment.end(status);
michael@0 553 continue;
michael@0 554 }
michael@0 555 break;
michael@0 556 }
michael@0 557 }
michael@0 558
michael@0 559 //
michael@0 560 // Scan for document content.
michael@0 561 //
michael@0 562 UnicodeString
michael@0 563 UXMLParser::scanContent(UErrorCode &status) {
michael@0 564 UnicodeString result;
michael@0 565 if (mXMLCharData.lookingAt(fPos, status)) {
michael@0 566 result = mXMLCharData.group((int32_t)0, status);
michael@0 567 // Normalize the new-lines. (Before char ref substitution)
michael@0 568 mNewLineNormalizer.reset(result);
michael@0 569 result = mNewLineNormalizer.replaceAll(fOneLF, status);
michael@0 570
michael@0 571 // TODO: handle CDATA
michael@0 572 fPos = mXMLCharData.end(0, status);
michael@0 573 }
michael@0 574
michael@0 575 return result;
michael@0 576 }
michael@0 577
michael@0 578 //
michael@0 579 // replaceCharRefs
michael@0 580 //
michael@0 581 // replace the char entities &lt; &amp; &#123; &#x12ab; etc. in a string
michael@0 582 // with the corresponding actual character.
michael@0 583 //
michael@0 584 void
michael@0 585 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
michael@0 586 UnicodeString result;
michael@0 587 UnicodeString replacement;
michael@0 588 int i;
michael@0 589
michael@0 590 mAmps.reset(s);
michael@0 591 // See the initialization for the regex matcher mAmps.
michael@0 592 // Which entity we've matched is determined by which capture group has content,
michael@0 593 // which is flaged by start() of that group not being -1.
michael@0 594 while (mAmps.find()) {
michael@0 595 if (mAmps.start(1, status) != -1) {
michael@0 596 replacement.setTo((UChar)x_AMP);
michael@0 597 } else if (mAmps.start(2, status) != -1) {
michael@0 598 replacement.setTo((UChar)x_LT);
michael@0 599 } else if (mAmps.start(3, status) != -1) {
michael@0 600 replacement.setTo((UChar)x_GT);
michael@0 601 } else if (mAmps.start(4, status) != -1) {
michael@0 602 replacement.setTo((UChar)x_APOS);
michael@0 603 } else if (mAmps.start(5, status) != -1) {
michael@0 604 replacement.setTo((UChar)x_QUOT);
michael@0 605 } else if (mAmps.start(6, status) != -1) {
michael@0 606 UnicodeString hexString = mAmps.group(6, status);
michael@0 607 UChar32 val = 0;
michael@0 608 for (i=0; i<hexString.length(); i++) {
michael@0 609 val = (val << 4) + u_digit(hexString.charAt(i), 16);
michael@0 610 }
michael@0 611 // TODO: some verification that the character is valid
michael@0 612 replacement.setTo(val);
michael@0 613 } else if (mAmps.start(7, status) != -1) {
michael@0 614 UnicodeString decimalString = mAmps.group(7, status);
michael@0 615 UChar32 val = 0;
michael@0 616 for (i=0; i<decimalString.length(); i++) {
michael@0 617 val = val*10 + u_digit(decimalString.charAt(i), 10);
michael@0 618 }
michael@0 619 // TODO: some verification that the character is valid
michael@0 620 replacement.setTo(val);
michael@0 621 } else {
michael@0 622 // An unrecognized &entity; Leave it alone.
michael@0 623 // TODO: check that it really looks like an entity, and is not some
michael@0 624 // random & in the text.
michael@0 625 replacement = mAmps.group((int32_t)0, status);
michael@0 626 }
michael@0 627 mAmps.appendReplacement(result, replacement, status);
michael@0 628 }
michael@0 629 mAmps.appendTail(result);
michael@0 630 s = result;
michael@0 631 }
michael@0 632
michael@0 633 void
michael@0 634 UXMLParser::error(const char *message, UErrorCode &status) {
michael@0 635 // TODO: something better here...
michael@0 636 const UnicodeString &src=mXMLDecl.input();
michael@0 637 int line = 0;
michael@0 638 int ci = 0;
michael@0 639 while (ci < fPos && ci>=0) {
michael@0 640 ci = src.indexOf((UChar)0x0a, ci+1);
michael@0 641 line++;
michael@0 642 }
michael@0 643 fprintf(stderr, "Error: %s at line %d\n", message, line);
michael@0 644 if (U_SUCCESS(status)) {
michael@0 645 status = U_PARSE_ERROR;
michael@0 646 }
michael@0 647 }
michael@0 648
michael@0 649 // intern strings like in Java
michael@0 650
michael@0 651 const UnicodeString *
michael@0 652 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
michael@0 653 const UHashElement *he=fNames.find(s);
michael@0 654 if(he!=NULL) {
michael@0 655 // already a known name, return its hashed key pointer
michael@0 656 return (const UnicodeString *)he->key.pointer;
michael@0 657 } else {
michael@0 658 // add this new name and return its hashed key pointer
michael@0 659 fNames.puti(s, 0, errorCode);
michael@0 660 he=fNames.find(s);
michael@0 661 return (const UnicodeString *)he->key.pointer;
michael@0 662 }
michael@0 663 }
michael@0 664
michael@0 665 const UnicodeString *
michael@0 666 UXMLParser::findName(const UnicodeString &s) const {
michael@0 667 const UHashElement *he=fNames.find(s);
michael@0 668 if(he!=NULL) {
michael@0 669 // a known name, return its hashed key pointer
michael@0 670 return (const UnicodeString *)he->key.pointer;
michael@0 671 } else {
michael@0 672 // unknown name
michael@0 673 return NULL;
michael@0 674 }
michael@0 675 }
michael@0 676
michael@0 677 // UXMLElement ------------------------------------------------------------- ***
michael@0 678
michael@0 679 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
michael@0 680 fParser(parser),
michael@0 681 fName(name),
michael@0 682 fAttNames(errorCode),
michael@0 683 fAttValues(errorCode),
michael@0 684 fChildren(errorCode),
michael@0 685 fParent(NULL)
michael@0 686 {
michael@0 687 }
michael@0 688
michael@0 689 UXMLElement::~UXMLElement() {
michael@0 690 int i;
michael@0 691 // attribute names are owned by the UXMLParser, don't delete them here
michael@0 692 for (i=fAttValues.size()-1; i>=0; i--) {
michael@0 693 delete (UObject *)fAttValues.elementAt(i);
michael@0 694 }
michael@0 695 for (i=fChildren.size()-1; i>=0; i--) {
michael@0 696 delete (UObject *)fChildren.elementAt(i);
michael@0 697 }
michael@0 698 }
michael@0 699
michael@0 700 const UnicodeString &
michael@0 701 UXMLElement::getTagName() const {
michael@0 702 return *fName;
michael@0 703 }
michael@0 704
michael@0 705 UnicodeString
michael@0 706 UXMLElement::getText(UBool recurse) const {
michael@0 707 UnicodeString text;
michael@0 708 appendText(text, recurse);
michael@0 709 return text;
michael@0 710 }
michael@0 711
michael@0 712 void
michael@0 713 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
michael@0 714 const UObject *node;
michael@0 715 int32_t i, count=fChildren.size();
michael@0 716 for(i=0; i<count; ++i) {
michael@0 717 node=(const UObject *)fChildren.elementAt(i);
michael@0 718 const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
michael@0 719 if(s!=NULL) {
michael@0 720 text.append(*s);
michael@0 721 } else if(recurse) /* must be a UXMLElement */ {
michael@0 722 ((const UXMLElement *)node)->appendText(text, recurse);
michael@0 723 }
michael@0 724 }
michael@0 725 }
michael@0 726
michael@0 727 int32_t
michael@0 728 UXMLElement::countAttributes() const {
michael@0 729 return fAttNames.size();
michael@0 730 }
michael@0 731
michael@0 732 const UnicodeString *
michael@0 733 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
michael@0 734 if(0<=i && i<fAttNames.size()) {
michael@0 735 name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
michael@0 736 value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
michael@0 737 return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
michael@0 738 } else {
michael@0 739 return NULL;
michael@0 740 }
michael@0 741 }
michael@0 742
michael@0 743 const UnicodeString *
michael@0 744 UXMLElement::getAttribute(const UnicodeString &name) const {
michael@0 745 // search for the attribute name by comparing the interned pointer,
michael@0 746 // not the string contents
michael@0 747 const UnicodeString *p=fParser->findName(name);
michael@0 748 if(p==NULL) {
michael@0 749 return NULL; // no such attribute seen by the parser at all
michael@0 750 }
michael@0 751
michael@0 752 int32_t i, count=fAttNames.size();
michael@0 753 for(i=0; i<count; ++i) {
michael@0 754 if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
michael@0 755 return (const UnicodeString *)fAttValues.elementAt(i);
michael@0 756 }
michael@0 757 }
michael@0 758 return NULL;
michael@0 759 }
michael@0 760
michael@0 761 int32_t
michael@0 762 UXMLElement::countChildren() const {
michael@0 763 return fChildren.size();
michael@0 764 }
michael@0 765
michael@0 766 const UObject *
michael@0 767 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
michael@0 768 if(0<=i && i<fChildren.size()) {
michael@0 769 const UObject *node=(const UObject *)fChildren.elementAt(i);
michael@0 770 if(dynamic_cast<const UXMLElement *>(node)!=NULL) {
michael@0 771 type=UXML_NODE_TYPE_ELEMENT;
michael@0 772 } else {
michael@0 773 type=UXML_NODE_TYPE_STRING;
michael@0 774 }
michael@0 775 return node;
michael@0 776 } else {
michael@0 777 return NULL;
michael@0 778 }
michael@0 779 }
michael@0 780
michael@0 781 const UXMLElement *
michael@0 782 UXMLElement::nextChildElement(int32_t &i) const {
michael@0 783 if(i<0) {
michael@0 784 return NULL;
michael@0 785 }
michael@0 786
michael@0 787 const UObject *node;
michael@0 788 int32_t count=fChildren.size();
michael@0 789 while(i<count) {
michael@0 790 node=(const UObject *)fChildren.elementAt(i++);
michael@0 791 const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
michael@0 792 if(elem!=NULL) {
michael@0 793 return elem;
michael@0 794 }
michael@0 795 }
michael@0 796 return NULL;
michael@0 797 }
michael@0 798
michael@0 799 const UXMLElement *
michael@0 800 UXMLElement::getChildElement(const UnicodeString &name) const {
michael@0 801 // search for the element name by comparing the interned pointer,
michael@0 802 // not the string contents
michael@0 803 const UnicodeString *p=fParser->findName(name);
michael@0 804 if(p==NULL) {
michael@0 805 return NULL; // no such element seen by the parser at all
michael@0 806 }
michael@0 807
michael@0 808 const UObject *node;
michael@0 809 int32_t i, count=fChildren.size();
michael@0 810 for(i=0; i<count; ++i) {
michael@0 811 node=(const UObject *)fChildren.elementAt(i);
michael@0 812 const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
michael@0 813 if(elem!=NULL) {
michael@0 814 if(p==elem->fName) {
michael@0 815 return elem;
michael@0 816 }
michael@0 817 }
michael@0 818 }
michael@0 819 return NULL;
michael@0 820 }
michael@0 821
michael@0 822 U_NAMESPACE_END
michael@0 823
michael@0 824 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
michael@0 825

mercurial