Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2004-2010, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ******************************************************************************* |
michael@0 | 8 | * file name: xmlparser.cpp |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 2004jul21 |
michael@0 | 14 | * created by: Andy Heninger |
michael@0 | 15 | */ |
michael@0 | 16 | |
michael@0 | 17 | #include <stdio.h> |
michael@0 | 18 | #include "unicode/uchar.h" |
michael@0 | 19 | #include "unicode/ucnv.h" |
michael@0 | 20 | #include "unicode/regex.h" |
michael@0 | 21 | #include "filestrm.h" |
michael@0 | 22 | #include "xmlparser.h" |
michael@0 | 23 | |
michael@0 | 24 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION |
michael@0 | 25 | |
michael@0 | 26 | // Code points of the ASCII characters that the parser refers to by name. |
michael@0 | 27 | enum { |
michael@0 | 28 | x_QUOT=0x22, |
michael@0 | 29 | x_AMP=0x26, |
michael@0 | 30 | x_APOS=0x27, |
michael@0 | 31 | x_LT=0x3c, |
michael@0 | 32 | x_GT=0x3e, |
michael@0 | 33 | x_l=0x6c |
michael@0 | 34 | }; |
michael@0 | 35 | |
michael@0 | 36 | #define XML_SPACES "[ \\u0009\\u000d\\u000a]" |
michael@0 | 37 | |
michael@0 | 38 | // XML #4: NameStartChar, written as a regex set of the allowed ranges. |
michael@0 | 39 | #define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \ |
michael@0 | 40 | "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \ |
michael@0 | 41 | "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \ |
michael@0 | 42 | "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" |
michael@0 | 43 | |
michael@0 | 44 | // XML #5: NameChar, i.e. NameStartChar plus '-', '.', digits, U+00B7, U+0300-036F and U+203F-2040. |
michael@0 | 45 | #define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]" |
michael@0 | 46 | |
michael@0 | 47 | // XML #6: Name, i.e. one NameStartChar followed by zero or more NameChars. |
michael@0 | 48 | #define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*" |
michael@0 | 49 | |
michael@0 | 50 | U_NAMESPACE_BEGIN |
michael@0 | 51 | |
michael@0 | 52 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser) |
michael@0 | 53 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement) |
michael@0 | 54 | |
michael@0 | 55 | // |
michael@0 | 56 | // UXMLParser constructor. Mostly just initializes the ICU regexes that are |
michael@0 | 57 | // used for parsing. Every matcher, fNames and fElementStack is handed status. |
michael@0 | 58 | // |
michael@0 | 59 | UXMLParser::UXMLParser(UErrorCode &status) : |
michael@0 | 60 | // XML Declaration. XML Production #23. |
michael@0 | 61 | // example: <?xml version="1.0" encoding="utf-16" ?> |
michael@0 | 62 | // This is a sloppy implementation - just look for the leading <?xml and the closing ?> |
michael@0 | 63 | // allow for a possible leading BOM. |
michael@0 | 64 | mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), |
michael@0 | 65 | |
michael@0 | 66 | // XML Comment production #15 |
michael@0 | 67 | // example: <!-- whatever --> |
michael@0 | 68 | // note, does not detect an illegal "--" within comments |
michael@0 | 69 | mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status), |
michael@0 | 70 | |
michael@0 | 71 | // XML Spaces |
michael@0 | 72 | // production [3] |
michael@0 | 73 | mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), |
michael@0 | 74 | |
michael@0 | 75 | // XML Doctype decl production #28 |
michael@0 | 76 | // example: <!DOCTYPE foo SYSTEM "somewhere" > |
michael@0 | 77 | // or: <!DOCTYPE foo [internal dtd]> |
michael@0 | 78 | // TODO: we don't actually parse the DOCTYPE or internal subsets. |
michael@0 | 79 | // Some internal dtd subsets could confuse this simple-minded |
michael@0 | 80 | // attempt at skipping over them, specifically, occurrences |
michael@0 | 81 | // of closing square brackets. These could appear in comments, |
michael@0 | 82 | // or in parameter entity declarations, for example. |
michael@0 | 83 | mXMLDoctype(UnicodeString( |
michael@0 | 84 | "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV |
michael@0 | 85 | ), 0, status), |
michael@0 | 86 | |
michael@0 | 87 | // XML PI production #16 |
michael@0 | 88 | // example: <?target stuff?> |
michael@0 | 89 | mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), |
michael@0 | 90 | |
michael@0 | 91 | // XML Element Start Productions #40, #41 |
michael@0 | 92 | // example <foo att1='abc' att2="d e f" > |
michael@0 | 93 | // capture #1: the tag name |
michael@0 | 94 | // |
michael@0 | 95 | mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" |
michael@0 | 96 | "(?:" |
michael@0 | 97 | XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " |
michael@0 | 98 | "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' |
michael@0 | 99 | ")*" // * for zero or more attributes. |
michael@0 | 100 | XML_SPACES "*?>", -1, US_INV), 0, status), // match " >" |
michael@0 | 101 | |
michael@0 | 102 | // XML Element End production #42 |
michael@0 | 103 | // example </foo> |
michael@0 | 104 | mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status), |
michael@0 | 105 | |
michael@0 | 106 | // XML Element Empty production #44 |
michael@0 | 107 | // example <foo att1="abc" att2="d e f" /> |
michael@0 | 108 | mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" |
michael@0 | 109 | "(?:" |
michael@0 | 110 | XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " |
michael@0 | 111 | "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' |
michael@0 | 112 | ")*" // * for zero or more attributes. |
michael@0 | 113 | XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />" |
michael@0 | 114 | |
michael@0 | 115 | |
michael@0 | 116 | // XMLCharData. Everything but '<'. Note that & will be dealt with later. |
michael@0 | 117 | mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), |
michael@0 | 118 | |
michael@0 | 119 | // Attribute name = "value". XML Productions 10, 40/41 |
michael@0 | 120 | // Capture group 1 is name, |
michael@0 | 121 | // 2 is the attribute value, including the quotes. |
michael@0 | 122 | // |
michael@0 | 123 | // Note that attributes are scanned twice. The first time is with |
michael@0 | 124 | // the regex for an entire element start. There, the attributes |
michael@0 | 125 | // are checked syntactically, but not separated out one by one. |
michael@0 | 126 | // Here, we match a single attribute, and make its name and |
michael@0 | 127 | // attribute value available to the parser code. |
michael@0 | 128 | mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" |
michael@0 | 129 | "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), |
michael@0 | 130 | |
michael@0 | 131 | |
michael@0 | 132 | mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), |
michael@0 | 133 | |
michael@0 | 134 | // Match any of the new-line sequences in content. |
michael@0 | 135 | // All are changed to \u000a. |
michael@0 | 136 | mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), |
michael@0 | 137 | |
michael@0 | 138 | // & char references |
michael@0 | 139 | // We will figure out what we've got based on which capture group has content. |
michael@0 | 140 | // The last one is a catchall for unrecognized entity references. |
michael@0 | 141 | // Capture groups: 1=amp; 2=lt; 3=gt; 4=apos; 5=quot; 6=hex digits 7=decimal digits 8=catchall |
michael@0 | 142 | mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"), |
michael@0 | 143 | 0, status), |
michael@0 | 144 | |
michael@0 | 145 | fNames(status), |
michael@0 | 146 | fElementStack(status), |
michael@0 | 147 | fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization. |
michael@0 | 148 | { |
michael@0 | 149 | } |
michael@0 | 150 | |
michael@0 | 151 | UXMLParser * |
michael@0 | 152 | UXMLParser::createParser(UErrorCode &errorCode) { |
michael@0 | 153 | if (U_FAILURE(errorCode)) { |
michael@0 | 154 | return NULL; |
michael@0 | 155 | } else { |
michael@0 | 156 | return new UXMLParser(errorCode); |
michael@0 | 157 | } |
michael@0 | 158 | } |
michael@0 | 159 | |
michael@0 | 160 | UXMLParser::~UXMLParser() {} |
michael@0 | 161 | |
michael@0 | 162 | UXMLElement * |
michael@0 | 163 | UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { |
michael@0 | 164 | char bytes[4096], charsetBuffer[100]; |
michael@0 | 165 | FileStream *f; |
michael@0 | 166 | const char *charset, *pb; |
michael@0 | 167 | UnicodeString src; |
michael@0 | 168 | UConverter *cnv; |
michael@0 | 169 | UChar *buffer, *pu; |
michael@0 | 170 | int32_t fileLength, bytesLength, length, capacity; |
michael@0 | 171 | UBool flush; |
michael@0 | 172 | |
michael@0 | 173 | if(U_FAILURE(errorCode)) { |
michael@0 | 174 | return NULL; |
michael@0 | 175 | } |
michael@0 | 176 | |
michael@0 | 177 | f=T_FileStream_open(filename, "rb"); |
michael@0 | 178 | if(f==NULL) { |
michael@0 | 179 | errorCode=U_FILE_ACCESS_ERROR; |
michael@0 | 180 | return NULL; |
michael@0 | 181 | } |
michael@0 | 182 | |
michael@0 | 183 | bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); |
michael@0 | 184 | if(bytesLength<(int32_t)sizeof(bytes)) { |
michael@0 | 185 | // we have already read the entire file |
michael@0 | 186 | fileLength=bytesLength; |
michael@0 | 187 | } else { |
michael@0 | 188 | // get the file length |
michael@0 | 189 | fileLength=T_FileStream_size(f); |
michael@0 | 190 | } |
michael@0 | 191 | |
michael@0 | 192 | /* |
michael@0 | 193 | * get the charset: |
michael@0 | 194 | * 1. Unicode signature |
michael@0 | 195 | * 2. treat as ISO-8859-1 and read XML encoding="charser" |
michael@0 | 196 | * 3. default to UTF-8 |
michael@0 | 197 | */ |
michael@0 | 198 | charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode); |
michael@0 | 199 | if(U_SUCCESS(errorCode) && charset!=NULL) { |
michael@0 | 200 | // open converter according to Unicode signature |
michael@0 | 201 | cnv=ucnv_open(charset, &errorCode); |
michael@0 | 202 | } else { |
michael@0 | 203 | // read as Latin-1 and parse the XML declaration and encoding |
michael@0 | 204 | cnv=ucnv_open("ISO-8859-1", &errorCode); |
michael@0 | 205 | if(U_FAILURE(errorCode)) { |
michael@0 | 206 | // unexpected error opening Latin-1 converter |
michael@0 | 207 | goto exit; |
michael@0 | 208 | } |
michael@0 | 209 | |
michael@0 | 210 | buffer=src.getBuffer(bytesLength); |
michael@0 | 211 | if(buffer==NULL) { |
michael@0 | 212 | // unexpected failure to reserve some string capacity |
michael@0 | 213 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 214 | goto exit; |
michael@0 | 215 | } |
michael@0 | 216 | pb=bytes; |
michael@0 | 217 | pu=buffer; |
michael@0 | 218 | ucnv_toUnicode( |
michael@0 | 219 | cnv, |
michael@0 | 220 | &pu, buffer+src.getCapacity(), |
michael@0 | 221 | &pb, bytes+bytesLength, |
michael@0 | 222 | NULL, TRUE, &errorCode); |
michael@0 | 223 | src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); |
michael@0 | 224 | ucnv_close(cnv); |
michael@0 | 225 | cnv=NULL; |
michael@0 | 226 | if(U_FAILURE(errorCode)) { |
michael@0 | 227 | // unexpected error in conversion from Latin-1 |
michael@0 | 228 | src.remove(); |
michael@0 | 229 | goto exit; |
michael@0 | 230 | } |
michael@0 | 231 | |
michael@0 | 232 | // parse XML declaration |
michael@0 | 233 | if(mXMLDecl.reset(src).lookingAt(0, errorCode)) { |
michael@0 | 234 | int32_t declEnd=mXMLDecl.end(errorCode); |
michael@0 | 235 | // go beyond <?xml |
michael@0 | 236 | int32_t pos=src.indexOf((UChar)x_l)+1; |
michael@0 | 237 | |
michael@0 | 238 | mAttrValue.reset(src); |
michael@0 | 239 | while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element. |
michael@0 | 240 | UnicodeString attName = mAttrValue.group(1, errorCode); |
michael@0 | 241 | UnicodeString attValue = mAttrValue.group(2, errorCode); |
michael@0 | 242 | |
michael@0 | 243 | // Trim the quotes from the att value. These are left over from the original regex |
michael@0 | 244 | // that parsed the attribue, which couldn't conveniently strip them. |
michael@0 | 245 | attValue.remove(0,1); // one char from the beginning |
michael@0 | 246 | attValue.truncate(attValue.length()-1); // and one from the end. |
michael@0 | 247 | |
michael@0 | 248 | if(attName==UNICODE_STRING("encoding", 8)) { |
michael@0 | 249 | length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer)); |
michael@0 | 250 | charset=charsetBuffer; |
michael@0 | 251 | break; |
michael@0 | 252 | } |
michael@0 | 253 | pos = mAttrValue.end(2, errorCode); |
michael@0 | 254 | } |
michael@0 | 255 | |
michael@0 | 256 | if(charset==NULL) { |
michael@0 | 257 | // default to UTF-8 |
michael@0 | 258 | charset="UTF-8"; |
michael@0 | 259 | } |
michael@0 | 260 | cnv=ucnv_open(charset, &errorCode); |
michael@0 | 261 | } |
michael@0 | 262 | } |
michael@0 | 263 | |
michael@0 | 264 | if(U_FAILURE(errorCode)) { |
michael@0 | 265 | // unable to open the converter |
michael@0 | 266 | goto exit; |
michael@0 | 267 | } |
michael@0 | 268 | |
michael@0 | 269 | // convert the file contents |
michael@0 | 270 | capacity=fileLength; // estimated capacity |
michael@0 | 271 | src.getBuffer(capacity); |
michael@0 | 272 | src.releaseBuffer(0); // zero length |
michael@0 | 273 | flush=FALSE; |
michael@0 | 274 | for(;;) { |
michael@0 | 275 | // convert contents of bytes[bytesLength] |
michael@0 | 276 | pb=bytes; |
michael@0 | 277 | for(;;) { |
michael@0 | 278 | length=src.length(); |
michael@0 | 279 | buffer=src.getBuffer(capacity); |
michael@0 | 280 | if(buffer==NULL) { |
michael@0 | 281 | // unexpected failure to reserve some string capacity |
michael@0 | 282 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 283 | goto exit; |
michael@0 | 284 | } |
michael@0 | 285 | |
michael@0 | 286 | pu=buffer+length; |
michael@0 | 287 | ucnv_toUnicode( |
michael@0 | 288 | cnv, &pu, buffer+src.getCapacity(), |
michael@0 | 289 | &pb, bytes+bytesLength, |
michael@0 | 290 | NULL, FALSE, &errorCode); |
michael@0 | 291 | src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); |
michael@0 | 292 | if(errorCode==U_BUFFER_OVERFLOW_ERROR) { |
michael@0 | 293 | errorCode=U_ZERO_ERROR; |
michael@0 | 294 | capacity=(3*src.getCapacity())/2; // increase capacity by 50% |
michael@0 | 295 | } else { |
michael@0 | 296 | break; |
michael@0 | 297 | } |
michael@0 | 298 | } |
michael@0 | 299 | |
michael@0 | 300 | if(U_FAILURE(errorCode)) { |
michael@0 | 301 | break; // conversion error |
michael@0 | 302 | } |
michael@0 | 303 | |
michael@0 | 304 | if(flush) { |
michael@0 | 305 | break; // completely converted the file |
michael@0 | 306 | } |
michael@0 | 307 | |
michael@0 | 308 | // read next block |
michael@0 | 309 | bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); |
michael@0 | 310 | if(bytesLength==0) { |
michael@0 | 311 | // reached end of file, convert once more to flush the converter |
michael@0 | 312 | flush=TRUE; |
michael@0 | 313 | } |
michael@0 | 314 | }; |
michael@0 | 315 | |
michael@0 | 316 | exit: |
michael@0 | 317 | ucnv_close(cnv); |
michael@0 | 318 | T_FileStream_close(f); |
michael@0 | 319 | |
michael@0 | 320 | if(U_SUCCESS(errorCode)) { |
michael@0 | 321 | return parse(src, errorCode); |
michael@0 | 322 | } else { |
michael@0 | 323 | return NULL; |
michael@0 | 324 | } |
michael@0 | 325 | } |
michael@0 | 326 | |
michael@0 | 327 | UXMLElement * |
michael@0 | 328 | UXMLParser::parse(const UnicodeString &src, UErrorCode &status) { |
michael@0 | 329 | if(U_FAILURE(status)) { |
michael@0 | 330 | return NULL; |
michael@0 | 331 | } |
michael@0 | 332 | |
michael@0 | 333 | UXMLElement *root = NULL; |
michael@0 | 334 | fPos = 0; // TODO use just a local pos variable and pass it into functions |
michael@0 | 335 | // where necessary? |
michael@0 | 336 | |
michael@0 | 337 | // set all matchers to work on the input string |
michael@0 | 338 | mXMLDecl.reset(src); |
michael@0 | 339 | mXMLComment.reset(src); |
michael@0 | 340 | mXMLSP.reset(src); |
michael@0 | 341 | mXMLDoctype.reset(src); |
michael@0 | 342 | mXMLPI.reset(src); |
michael@0 | 343 | mXMLElemStart.reset(src); |
michael@0 | 344 | mXMLElemEnd.reset(src); |
michael@0 | 345 | mXMLElemEmpty.reset(src); |
michael@0 | 346 | mXMLCharData.reset(src); |
michael@0 | 347 | mAttrValue.reset(src); |
michael@0 | 348 | mAttrNormalizer.reset(src); |
michael@0 | 349 | mNewLineNormalizer.reset(src); |
michael@0 | 350 | mAmps.reset(src); |
michael@0 | 351 | |
michael@0 | 352 | // Consume the XML Declaration, if present. |
michael@0 | 353 | if (mXMLDecl.lookingAt(fPos, status)) { |
michael@0 | 354 | fPos = mXMLDecl.end(status); |
michael@0 | 355 | } |
michael@0 | 356 | |
michael@0 | 357 | // Consume "misc" [XML production 27] appearing before DocType |
michael@0 | 358 | parseMisc(status); |
michael@0 | 359 | |
michael@0 | 360 | // Consume a DocType declaration, if present. |
michael@0 | 361 | if (mXMLDoctype.lookingAt(fPos, status)) { |
michael@0 | 362 | fPos = mXMLDoctype.end(status); |
michael@0 | 363 | } |
michael@0 | 364 | |
michael@0 | 365 | // Consume additional "misc" [XML production 27] appearing after the DocType |
michael@0 | 366 | parseMisc(status); |
michael@0 | 367 | |
michael@0 | 368 | // Get the root element |
michael@0 | 369 | if (mXMLElemEmpty.lookingAt(fPos, status)) { |
michael@0 | 370 | // Root is an empty element (no nested elements or content) |
michael@0 | 371 | root = createElement(mXMLElemEmpty, status); |
michael@0 | 372 | fPos = mXMLElemEmpty.end(status); |
michael@0 | 373 | } else { |
michael@0 | 374 | if (mXMLElemStart.lookingAt(fPos, status) == FALSE) { |
michael@0 | 375 | error("Root Element expected", status); |
michael@0 | 376 | goto errorExit; |
michael@0 | 377 | } |
michael@0 | 378 | root = createElement(mXMLElemStart, status); |
michael@0 | 379 | UXMLElement *el = root; |
michael@0 | 380 | |
michael@0 | 381 | // |
michael@0 | 382 | // This is the loop that consumes the root element of the document, |
michael@0 | 383 | // including all nested content. Nested elements are handled by |
michael@0 | 384 | // explicit pushes/pops of the element stack; there is no recursion |
michael@0 | 385 | // in the control flow of this code. |
michael@0 | 386 | // "el" always refers to the current element, the one to which content |
michael@0 | 387 | // is being added. It is above the top of the element stack. |
michael@0 | 388 | for (;;) { |
michael@0 | 389 | // Nested Element Start |
michael@0 | 390 | if (mXMLElemStart.lookingAt(fPos, status)) { |
michael@0 | 391 | UXMLElement *t = createElement(mXMLElemStart, status); |
michael@0 | 392 | el->fChildren.addElement(t, status); |
michael@0 | 393 | t->fParent = el; |
michael@0 | 394 | fElementStack.push(el, status); |
michael@0 | 395 | el = t; |
michael@0 | 396 | continue; |
michael@0 | 397 | } |
michael@0 | 398 | |
michael@0 | 399 | // Text Content. String is concatenated onto the current node's content, |
michael@0 | 400 | // but only if it contains something other than spaces. |
michael@0 | 401 | UnicodeString s = scanContent(status); |
michael@0 | 402 | if (s.length() > 0) { |
michael@0 | 403 | mXMLSP.reset(s); |
michael@0 | 404 | if (mXMLSP.matches(status) == FALSE) { |
michael@0 | 405 | // This chunk of text contains something other than just |
michael@0 | 406 | // white space. Make a child node for it. |
michael@0 | 407 | replaceCharRefs(s, status); |
michael@0 | 408 | el->fChildren.addElement(s.clone(), status); |
michael@0 | 409 | } |
michael@0 | 410 | mXMLSP.reset(src); // The matchers need to stay set to the main input string. |
michael@0 | 411 | continue; |
michael@0 | 412 | } |
michael@0 | 413 | |
michael@0 | 414 | // Comments. Discard. |
michael@0 | 415 | if (mXMLComment.lookingAt(fPos, status)) { |
michael@0 | 416 | fPos = mXMLComment.end(status); |
michael@0 | 417 | continue; |
michael@0 | 418 | } |
michael@0 | 419 | |
michael@0 | 420 | // PIs. Discard. |
michael@0 | 421 | if (mXMLPI.lookingAt(fPos, status)) { |
michael@0 | 422 | fPos = mXMLPI.end(status); |
michael@0 | 423 | continue; |
michael@0 | 424 | } |
michael@0 | 425 | |
michael@0 | 426 | // Element End |
michael@0 | 427 | if (mXMLElemEnd.lookingAt(fPos, status)) { |
michael@0 | 428 | fPos = mXMLElemEnd.end(0, status); |
michael@0 | 429 | const UnicodeString name = mXMLElemEnd.group(1, status); |
michael@0 | 430 | if (name != *el->fName) { |
michael@0 | 431 | error("Element start / end tag mismatch", status); |
michael@0 | 432 | goto errorExit; |
michael@0 | 433 | } |
michael@0 | 434 | if (fElementStack.empty()) { |
michael@0 | 435 | // Close of the root element. We're done with the doc. |
michael@0 | 436 | el = NULL; |
michael@0 | 437 | break; |
michael@0 | 438 | } |
michael@0 | 439 | el = (UXMLElement *)fElementStack.pop(); |
michael@0 | 440 | continue; |
michael@0 | 441 | } |
michael@0 | 442 | |
michael@0 | 443 | // Empty Element. Stored as a child of the current element, but not stacked. |
michael@0 | 444 | if (mXMLElemEmpty.lookingAt(fPos, status)) { |
michael@0 | 445 | UXMLElement *t = createElement(mXMLElemEmpty, status); |
michael@0 | 446 | el->fChildren.addElement(t, status); |
michael@0 | 447 | continue; |
michael@0 | 448 | } |
michael@0 | 449 | |
michael@0 | 450 | // Hit something within the document that doesn't match anything. |
michael@0 | 451 | // It's an error. |
michael@0 | 452 | error("Unrecognized markup", status); |
michael@0 | 453 | break; |
michael@0 | 454 | } |
michael@0 | 455 | |
michael@0 | 456 | if (el != NULL || !fElementStack.empty()) { |
michael@0 | 457 | // We bailed out early, for some reason. |
michael@0 | 458 | error("Root element not closed.", status); |
michael@0 | 459 | goto errorExit; |
michael@0 | 460 | } |
michael@0 | 461 | } |
michael@0 | 462 | |
michael@0 | 463 | // Root Element parse is complete. |
michael@0 | 464 | // Consume the annoying xml "Misc" that can appear at the end of the doc. |
michael@0 | 465 | parseMisc(status); |
michael@0 | 466 | |
michael@0 | 467 | // We should have reached the end of the input |
michael@0 | 468 | if (fPos != src.length()) { |
michael@0 | 469 | error("Extra content at the end of the document", status); |
michael@0 | 470 | goto errorExit; |
michael@0 | 471 | } |
michael@0 | 472 | |
michael@0 | 473 | // Success! |
michael@0 | 474 | return root; |
michael@0 | 475 | |
michael@0 | 476 | errorExit: |
michael@0 | 477 | delete root; |
michael@0 | 478 | return NULL; |
michael@0 | 479 | } |
michael@0 | 480 | |
michael@0 | 481 | // |
michael@0 | 482 | // createElement |
michael@0 | 483 | // We've just matched an element start tag. Create and fill in a UXMLElement object |
michael@0 | 484 | // for it. |
michael@0 | 485 | // |
michael@0 | 486 | UXMLElement * |
michael@0 | 487 | UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { |
michael@0 | 488 | // First capture group is the element's name. |
michael@0 | 489 | UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status); |
michael@0 | 490 | |
michael@0 | 491 | // Scan for attributes. |
michael@0 | 492 | int32_t pos = mEl.end(1, status); // The position after the end of the tag name |
michael@0 | 493 | |
michael@0 | 494 | while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element. |
michael@0 | 495 | UnicodeString attName = mAttrValue.group(1, status); |
michael@0 | 496 | UnicodeString attValue = mAttrValue.group(2, status); |
michael@0 | 497 | |
michael@0 | 498 | // Trim the quotes from the att value. These are left over from the original regex |
michael@0 | 499 | // that parsed the attribue, which couldn't conveniently strip them. |
michael@0 | 500 | attValue.remove(0,1); // one char from the beginning |
michael@0 | 501 | attValue.truncate(attValue.length()-1); // and one from the end. |
michael@0 | 502 | |
michael@0 | 503 | // XML Attribue value normalization. |
michael@0 | 504 | // This is one of the really screwy parts of the XML spec. |
michael@0 | 505 | // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize |
michael@0 | 506 | // Note that non-validating parsers must treat all entities as type CDATA |
michael@0 | 507 | // which simplifies things some. |
michael@0 | 508 | |
michael@0 | 509 | // Att normalization step 1: normalize any newlines in the attribute value |
michael@0 | 510 | mNewLineNormalizer.reset(attValue); |
michael@0 | 511 | attValue = mNewLineNormalizer.replaceAll(fOneLF, status); |
michael@0 | 512 | |
michael@0 | 513 | // Next change all xml white space chars to plain \u0020 spaces. |
michael@0 | 514 | mAttrNormalizer.reset(attValue); |
michael@0 | 515 | UnicodeString oneSpace((UChar)0x0020); |
michael@0 | 516 | attValue = mAttrNormalizer.replaceAll(oneSpace, status); |
michael@0 | 517 | |
michael@0 | 518 | // Replace character entities. |
michael@0 | 519 | replaceCharRefs(attValue, status); |
michael@0 | 520 | |
michael@0 | 521 | // Save the attribute name and value in our document structure. |
michael@0 | 522 | el->fAttNames.addElement((void *)intern(attName, status), status); |
michael@0 | 523 | el->fAttValues.addElement(attValue.clone(), status); |
michael@0 | 524 | pos = mAttrValue.end(2, status); |
michael@0 | 525 | } |
michael@0 | 526 | fPos = mEl.end(0, status); |
michael@0 | 527 | return el; |
michael@0 | 528 | } |
michael@0 | 529 | |
michael@0 | 530 | // |
michael@0 | 531 | // parseMisc |
michael@0 | 532 | // Consume XML "Misc" [production #27] |
michael@0 | 533 | // which is any combination of space, PI and comments |
michael@0 | 534 | // Need to watch end-of-input because xml MISC stuff is allowed after |
michael@0 | 535 | // the document element, so we WILL scan off the end in this function |
michael@0 | 536 | // |
michael@0 | 537 | void |
michael@0 | 538 | UXMLParser::parseMisc(UErrorCode &status) { |
michael@0 | 539 | for (;;) { |
michael@0 | 540 | if (fPos >= mXMLPI.input().length()) { |
michael@0 | 541 | break; |
michael@0 | 542 | } |
michael@0 | 543 | if (mXMLPI.lookingAt(fPos, status)) { |
michael@0 | 544 | fPos = mXMLPI.end(status); |
michael@0 | 545 | continue; |
michael@0 | 546 | } |
michael@0 | 547 | if (mXMLSP.lookingAt(fPos, status)) { |
michael@0 | 548 | fPos = mXMLSP.end(status); |
michael@0 | 549 | continue; |
michael@0 | 550 | } |
michael@0 | 551 | if (mXMLComment.lookingAt(fPos, status)) { |
michael@0 | 552 | fPos = mXMLComment.end(status); |
michael@0 | 553 | continue; |
michael@0 | 554 | } |
michael@0 | 555 | break; |
michael@0 | 556 | } |
michael@0 | 557 | } |
michael@0 | 558 | |
michael@0 | 559 | // |
michael@0 | 560 | // Scan for document content. |
michael@0 | 561 | // |
michael@0 | 562 | UnicodeString |
michael@0 | 563 | UXMLParser::scanContent(UErrorCode &status) { |
michael@0 | 564 | UnicodeString result; |
michael@0 | 565 | if (mXMLCharData.lookingAt(fPos, status)) { |
michael@0 | 566 | result = mXMLCharData.group((int32_t)0, status); |
michael@0 | 567 | // Normalize the new-lines. (Before char ref substitution) |
michael@0 | 568 | mNewLineNormalizer.reset(result); |
michael@0 | 569 | result = mNewLineNormalizer.replaceAll(fOneLF, status); |
michael@0 | 570 | |
michael@0 | 571 | // TODO: handle CDATA |
michael@0 | 572 | fPos = mXMLCharData.end(0, status); |
michael@0 | 573 | } |
michael@0 | 574 | |
michael@0 | 575 | return result; |
michael@0 | 576 | } |
michael@0 | 577 | |
michael@0 | 578 | // |
michael@0 | 579 | // replaceCharRefs |
michael@0 | 580 | // |
michael@0 | 581 | // replace the char entities < & { ካ etc. in a string |
michael@0 | 582 | // with the corresponding actual character. |
michael@0 | 583 | // |
michael@0 | 584 | void |
michael@0 | 585 | UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { |
michael@0 | 586 | UnicodeString result; |
michael@0 | 587 | UnicodeString replacement; |
michael@0 | 588 | int i; |
michael@0 | 589 | |
michael@0 | 590 | mAmps.reset(s); |
michael@0 | 591 | // See the initialization for the regex matcher mAmps. |
michael@0 | 592 | // Which entity we've matched is determined by which capture group has content, |
michael@0 | 593 | // which is flaged by start() of that group not being -1. |
michael@0 | 594 | while (mAmps.find()) { |
michael@0 | 595 | if (mAmps.start(1, status) != -1) { |
michael@0 | 596 | replacement.setTo((UChar)x_AMP); |
michael@0 | 597 | } else if (mAmps.start(2, status) != -1) { |
michael@0 | 598 | replacement.setTo((UChar)x_LT); |
michael@0 | 599 | } else if (mAmps.start(3, status) != -1) { |
michael@0 | 600 | replacement.setTo((UChar)x_GT); |
michael@0 | 601 | } else if (mAmps.start(4, status) != -1) { |
michael@0 | 602 | replacement.setTo((UChar)x_APOS); |
michael@0 | 603 | } else if (mAmps.start(5, status) != -1) { |
michael@0 | 604 | replacement.setTo((UChar)x_QUOT); |
michael@0 | 605 | } else if (mAmps.start(6, status) != -1) { |
michael@0 | 606 | UnicodeString hexString = mAmps.group(6, status); |
michael@0 | 607 | UChar32 val = 0; |
michael@0 | 608 | for (i=0; i<hexString.length(); i++) { |
michael@0 | 609 | val = (val << 4) + u_digit(hexString.charAt(i), 16); |
michael@0 | 610 | } |
michael@0 | 611 | // TODO: some verification that the character is valid |
michael@0 | 612 | replacement.setTo(val); |
michael@0 | 613 | } else if (mAmps.start(7, status) != -1) { |
michael@0 | 614 | UnicodeString decimalString = mAmps.group(7, status); |
michael@0 | 615 | UChar32 val = 0; |
michael@0 | 616 | for (i=0; i<decimalString.length(); i++) { |
michael@0 | 617 | val = val*10 + u_digit(decimalString.charAt(i), 10); |
michael@0 | 618 | } |
michael@0 | 619 | // TODO: some verification that the character is valid |
michael@0 | 620 | replacement.setTo(val); |
michael@0 | 621 | } else { |
michael@0 | 622 | // An unrecognized &entity; Leave it alone. |
michael@0 | 623 | // TODO: check that it really looks like an entity, and is not some |
michael@0 | 624 | // random & in the text. |
michael@0 | 625 | replacement = mAmps.group((int32_t)0, status); |
michael@0 | 626 | } |
michael@0 | 627 | mAmps.appendReplacement(result, replacement, status); |
michael@0 | 628 | } |
michael@0 | 629 | mAmps.appendTail(result); |
michael@0 | 630 | s = result; |
michael@0 | 631 | } |
michael@0 | 632 | |
michael@0 | 633 | void |
michael@0 | 634 | UXMLParser::error(const char *message, UErrorCode &status) { |
michael@0 | 635 | // TODO: something better here... |
michael@0 | 636 | const UnicodeString &src=mXMLDecl.input(); |
michael@0 | 637 | int line = 0; |
michael@0 | 638 | int ci = 0; |
michael@0 | 639 | while (ci < fPos && ci>=0) { |
michael@0 | 640 | ci = src.indexOf((UChar)0x0a, ci+1); |
michael@0 | 641 | line++; |
michael@0 | 642 | } |
michael@0 | 643 | fprintf(stderr, "Error: %s at line %d\n", message, line); |
michael@0 | 644 | if (U_SUCCESS(status)) { |
michael@0 | 645 | status = U_PARSE_ERROR; |
michael@0 | 646 | } |
michael@0 | 647 | } |
michael@0 | 648 | |
michael@0 | 649 | // intern strings like in Java |
michael@0 | 650 | |
michael@0 | 651 | const UnicodeString * |
michael@0 | 652 | UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { |
michael@0 | 653 | const UHashElement *he=fNames.find(s); |
michael@0 | 654 | if(he!=NULL) { |
michael@0 | 655 | // already a known name, return its hashed key pointer |
michael@0 | 656 | return (const UnicodeString *)he->key.pointer; |
michael@0 | 657 | } else { |
michael@0 | 658 | // add this new name and return its hashed key pointer |
michael@0 | 659 | fNames.puti(s, 0, errorCode); |
michael@0 | 660 | he=fNames.find(s); |
michael@0 | 661 | return (const UnicodeString *)he->key.pointer; |
michael@0 | 662 | } |
michael@0 | 663 | } |
michael@0 | 664 | |
michael@0 | 665 | const UnicodeString * |
michael@0 | 666 | UXMLParser::findName(const UnicodeString &s) const { |
michael@0 | 667 | const UHashElement *he=fNames.find(s); |
michael@0 | 668 | if(he!=NULL) { |
michael@0 | 669 | // a known name, return its hashed key pointer |
michael@0 | 670 | return (const UnicodeString *)he->key.pointer; |
michael@0 | 671 | } else { |
michael@0 | 672 | // unknown name |
michael@0 | 673 | return NULL; |
michael@0 | 674 | } |
michael@0 | 675 | } |
michael@0 | 676 | |
michael@0 | 677 | // UXMLElement ------------------------------------------------------------- *** |
michael@0 | 678 | |
michael@0 | 679 | UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) : |
michael@0 | 680 | fParser(parser), |
michael@0 | 681 | fName(name), |
michael@0 | 682 | fAttNames(errorCode), |
michael@0 | 683 | fAttValues(errorCode), |
michael@0 | 684 | fChildren(errorCode), |
michael@0 | 685 | fParent(NULL) |
michael@0 | 686 | { |
michael@0 | 687 | } |
michael@0 | 688 | |
michael@0 | 689 | UXMLElement::~UXMLElement() { |
michael@0 | 690 | int i; |
michael@0 | 691 | // attribute names are owned by the UXMLParser, don't delete them here |
michael@0 | 692 | for (i=fAttValues.size()-1; i>=0; i--) { |
michael@0 | 693 | delete (UObject *)fAttValues.elementAt(i); |
michael@0 | 694 | } |
michael@0 | 695 | for (i=fChildren.size()-1; i>=0; i--) { |
michael@0 | 696 | delete (UObject *)fChildren.elementAt(i); |
michael@0 | 697 | } |
michael@0 | 698 | } |
michael@0 | 699 | |
michael@0 | 700 | const UnicodeString & |
michael@0 | 701 | UXMLElement::getTagName() const { |
michael@0 | 702 | return *fName; |
michael@0 | 703 | } |
michael@0 | 704 | |
michael@0 | 705 | UnicodeString |
michael@0 | 706 | UXMLElement::getText(UBool recurse) const { |
michael@0 | 707 | UnicodeString text; |
michael@0 | 708 | appendText(text, recurse); |
michael@0 | 709 | return text; |
michael@0 | 710 | } |
michael@0 | 711 | |
michael@0 | 712 | void |
michael@0 | 713 | UXMLElement::appendText(UnicodeString &text, UBool recurse) const { |
michael@0 | 714 | const UObject *node; |
michael@0 | 715 | int32_t i, count=fChildren.size(); |
michael@0 | 716 | for(i=0; i<count; ++i) { |
michael@0 | 717 | node=(const UObject *)fChildren.elementAt(i); |
michael@0 | 718 | const UnicodeString *s=dynamic_cast<const UnicodeString *>(node); |
michael@0 | 719 | if(s!=NULL) { |
michael@0 | 720 | text.append(*s); |
michael@0 | 721 | } else if(recurse) /* must be a UXMLElement */ { |
michael@0 | 722 | ((const UXMLElement *)node)->appendText(text, recurse); |
michael@0 | 723 | } |
michael@0 | 724 | } |
michael@0 | 725 | } |
michael@0 | 726 | |
michael@0 | 727 | int32_t |
michael@0 | 728 | UXMLElement::countAttributes() const { |
michael@0 | 729 | return fAttNames.size(); |
michael@0 | 730 | } |
michael@0 | 731 | |
michael@0 | 732 | const UnicodeString * |
michael@0 | 733 | UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const { |
michael@0 | 734 | if(0<=i && i<fAttNames.size()) { |
michael@0 | 735 | name.setTo(*(const UnicodeString *)fAttNames.elementAt(i)); |
michael@0 | 736 | value.setTo(*(const UnicodeString *)fAttValues.elementAt(i)); |
michael@0 | 737 | return &value; // or return (UnicodeString *)fAttValues.elementAt(i); |
michael@0 | 738 | } else { |
michael@0 | 739 | return NULL; |
michael@0 | 740 | } |
michael@0 | 741 | } |
michael@0 | 742 | |
michael@0 | 743 | const UnicodeString * |
michael@0 | 744 | UXMLElement::getAttribute(const UnicodeString &name) const { |
michael@0 | 745 | // search for the attribute name by comparing the interned pointer, |
michael@0 | 746 | // not the string contents |
michael@0 | 747 | const UnicodeString *p=fParser->findName(name); |
michael@0 | 748 | if(p==NULL) { |
michael@0 | 749 | return NULL; // no such attribute seen by the parser at all |
michael@0 | 750 | } |
michael@0 | 751 | |
michael@0 | 752 | int32_t i, count=fAttNames.size(); |
michael@0 | 753 | for(i=0; i<count; ++i) { |
michael@0 | 754 | if(p==(const UnicodeString *)fAttNames.elementAt(i)) { |
michael@0 | 755 | return (const UnicodeString *)fAttValues.elementAt(i); |
michael@0 | 756 | } |
michael@0 | 757 | } |
michael@0 | 758 | return NULL; |
michael@0 | 759 | } |
michael@0 | 760 | |
michael@0 | 761 | int32_t |
michael@0 | 762 | UXMLElement::countChildren() const { |
michael@0 | 763 | return fChildren.size(); |
michael@0 | 764 | } |
michael@0 | 765 | |
michael@0 | 766 | const UObject * |
michael@0 | 767 | UXMLElement::getChild(int32_t i, UXMLNodeType &type) const { |
michael@0 | 768 | if(0<=i && i<fChildren.size()) { |
michael@0 | 769 | const UObject *node=(const UObject *)fChildren.elementAt(i); |
michael@0 | 770 | if(dynamic_cast<const UXMLElement *>(node)!=NULL) { |
michael@0 | 771 | type=UXML_NODE_TYPE_ELEMENT; |
michael@0 | 772 | } else { |
michael@0 | 773 | type=UXML_NODE_TYPE_STRING; |
michael@0 | 774 | } |
michael@0 | 775 | return node; |
michael@0 | 776 | } else { |
michael@0 | 777 | return NULL; |
michael@0 | 778 | } |
michael@0 | 779 | } |
michael@0 | 780 | |
michael@0 | 781 | const UXMLElement * |
michael@0 | 782 | UXMLElement::nextChildElement(int32_t &i) const { |
michael@0 | 783 | if(i<0) { |
michael@0 | 784 | return NULL; |
michael@0 | 785 | } |
michael@0 | 786 | |
michael@0 | 787 | const UObject *node; |
michael@0 | 788 | int32_t count=fChildren.size(); |
michael@0 | 789 | while(i<count) { |
michael@0 | 790 | node=(const UObject *)fChildren.elementAt(i++); |
michael@0 | 791 | const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); |
michael@0 | 792 | if(elem!=NULL) { |
michael@0 | 793 | return elem; |
michael@0 | 794 | } |
michael@0 | 795 | } |
michael@0 | 796 | return NULL; |
michael@0 | 797 | } |
michael@0 | 798 | |
michael@0 | 799 | const UXMLElement * |
michael@0 | 800 | UXMLElement::getChildElement(const UnicodeString &name) const { |
michael@0 | 801 | // search for the element name by comparing the interned pointer, |
michael@0 | 802 | // not the string contents |
michael@0 | 803 | const UnicodeString *p=fParser->findName(name); |
michael@0 | 804 | if(p==NULL) { |
michael@0 | 805 | return NULL; // no such element seen by the parser at all |
michael@0 | 806 | } |
michael@0 | 807 | |
michael@0 | 808 | const UObject *node; |
michael@0 | 809 | int32_t i, count=fChildren.size(); |
michael@0 | 810 | for(i=0; i<count; ++i) { |
michael@0 | 811 | node=(const UObject *)fChildren.elementAt(i); |
michael@0 | 812 | const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); |
michael@0 | 813 | if(elem!=NULL) { |
michael@0 | 814 | if(p==elem->fName) { |
michael@0 | 815 | return elem; |
michael@0 | 816 | } |
michael@0 | 817 | } |
michael@0 | 818 | } |
michael@0 | 819 | return NULL; |
michael@0 | 820 | } |
michael@0 | 821 | |
michael@0 | 822 | U_NAMESPACE_END |
michael@0 | 823 | |
michael@0 | 824 | #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ |
michael@0 | 825 |