michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2004-2010, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: xmlparser.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2004jul21 michael@0: * created by: Andy Heninger michael@0: */ michael@0: michael@0: #include michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/ucnv.h" michael@0: #include "unicode/regex.h" michael@0: #include "filestrm.h" michael@0: #include "xmlparser.h" michael@0: michael@0: #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION michael@0: michael@0: // character constants michael@0: enum { michael@0: x_QUOT=0x22, michael@0: x_AMP=0x26, michael@0: x_APOS=0x27, michael@0: x_LT=0x3c, michael@0: x_GT=0x3e, michael@0: x_l=0x6c michael@0: }; michael@0: michael@0: #define XML_SPACES "[ \\u0009\\u000d\\u000a]" michael@0: michael@0: // XML #4 michael@0: #define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \ michael@0: "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \ michael@0: "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \ michael@0: "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" michael@0: michael@0: // XML #5 michael@0: #define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]" michael@0: michael@0: // XML #6 michael@0: #define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser) michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement) michael@0: michael@0: // michael@0: // UXMLParser constructor. Mostly just initializes the ICU regexes that are michael@0: // used for parsing. michael@0: // michael@0: UXMLParser::UXMLParser(UErrorCode &status) : michael@0: // XML Declaration. XML Production #23. michael@0: // example: " michael@0: // This is a sloppy implementation - just look for the leading michael@0: // allow for a possible leading BOM. michael@0: mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), michael@0: michael@0: // XML Comment production #15 michael@0: // example: " michael@0: // note, does not detect an illegal "--" within comments michael@0: mXMLComment(UnicodeString("(?s)", -1, US_INV), 0, status), michael@0: michael@0: // XML Spaces michael@0: // production [3] michael@0: mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), michael@0: michael@0: // XML Doctype decl production #28 michael@0: // example " michael@0: // or " michael@0: // TODO: we don't actually parse the DOCTYPE or internal subsets. michael@0: // Some internal dtd subsets could confuse this simple-minded michael@0: // attempt at skipping over them, specifically, occcurences michael@0: // of closeing square brackets. These could appear in comments, michael@0: // or in parameter entity declarations, for example. michael@0: mXMLDoctype(UnicodeString( michael@0: "(?s)|\\[.*?\\].*?>)", -1, US_INV michael@0: ), 0, status), michael@0: michael@0: // XML PI production #16 michael@0: // example " michael@0: mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), michael@0: michael@0: // XML Element Start Productions #40, #41 michael@0: // example michael@0: // capture #1: the tag name michael@0: // michael@0: mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "", -1, US_INV), 0, status), // match " >" michael@0: michael@0: // XML Element End production #42 michael@0: // example michael@0: mXMLElemEnd (UnicodeString("", -1, US_INV), 0, status), michael@0: michael@0: // XML Element Empty production #44 michael@0: // example michael@0: mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "", -1, US_INV), 0, status), // match " />" michael@0: michael@0: michael@0: // XMLCharData. Everything but '<'. Note that & will be dealt with later. michael@0: mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), michael@0: michael@0: // Attribute name = "value". XML Productions 10, 40/41 michael@0: // Capture group 1 is name, michael@0: // 2 is the attribute value, including the quotes. michael@0: // michael@0: // Note that attributes are scanned twice. The first time is with michael@0: // the regex for an entire element start. There, the attributes michael@0: // are checked syntactically, but not separted out one by one. michael@0: // Here, we match a single attribute, and make its name and michael@0: // attribute value available to the parser code. michael@0: mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" michael@0: "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), michael@0: michael@0: michael@0: mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), michael@0: michael@0: // Match any of the new-line sequences in content. michael@0: // All are changed to \u000a. michael@0: mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), michael@0: michael@0: // & char references michael@0: // We will figure out what we've got based on which capture group has content. michael@0: // The last one is a catchall for unrecognized entity references.. michael@0: // 1 2 3 4 5 6 7 8 michael@0: mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"), michael@0: 0, status), michael@0: michael@0: fNames(status), michael@0: fElementStack(status), michael@0: fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization. michael@0: { michael@0: } michael@0: michael@0: UXMLParser * michael@0: UXMLParser::createParser(UErrorCode &errorCode) { michael@0: if (U_FAILURE(errorCode)) { michael@0: return NULL; michael@0: } else { michael@0: return new UXMLParser(errorCode); michael@0: } michael@0: } michael@0: michael@0: UXMLParser::~UXMLParser() {} michael@0: michael@0: UXMLElement * michael@0: UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { michael@0: char bytes[4096], charsetBuffer[100]; michael@0: FileStream *f; michael@0: const char *charset, *pb; michael@0: UnicodeString src; michael@0: UConverter *cnv; michael@0: UChar *buffer, *pu; michael@0: int32_t fileLength, bytesLength, length, capacity; michael@0: UBool flush; michael@0: michael@0: if(U_FAILURE(errorCode)) { michael@0: return NULL; michael@0: } michael@0: michael@0: f=T_FileStream_open(filename, "rb"); michael@0: if(f==NULL) { michael@0: errorCode=U_FILE_ACCESS_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); michael@0: if(bytesLength<(int32_t)sizeof(bytes)) { michael@0: // we have already read the entire file michael@0: fileLength=bytesLength; michael@0: } else { michael@0: // get the file length michael@0: fileLength=T_FileStream_size(f); michael@0: } michael@0: michael@0: /* michael@0: * get the charset: michael@0: * 1. Unicode signature michael@0: * 2. treat as ISO-8859-1 and read XML encoding="charser" michael@0: * 3. default to UTF-8 michael@0: */ michael@0: charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode); michael@0: if(U_SUCCESS(errorCode) && charset!=NULL) { michael@0: // open converter according to Unicode signature michael@0: cnv=ucnv_open(charset, &errorCode); michael@0: } else { michael@0: // read as Latin-1 and parse the XML declaration and encoding michael@0: cnv=ucnv_open("ISO-8859-1", &errorCode); michael@0: if(U_FAILURE(errorCode)) { michael@0: // unexpected error opening Latin-1 converter michael@0: goto exit; michael@0: } michael@0: michael@0: buffer=src.getBuffer(bytesLength); michael@0: if(buffer==NULL) { michael@0: // unexpected failure to reserve some string capacity michael@0: errorCode=U_MEMORY_ALLOCATION_ERROR; michael@0: goto exit; michael@0: } michael@0: pb=bytes; michael@0: pu=buffer; michael@0: ucnv_toUnicode( michael@0: cnv, michael@0: &pu, buffer+src.getCapacity(), michael@0: &pb, bytes+bytesLength, michael@0: NULL, TRUE, &errorCode); michael@0: src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); michael@0: ucnv_close(cnv); michael@0: cnv=NULL; michael@0: if(U_FAILURE(errorCode)) { michael@0: // unexpected error in conversion from Latin-1 michael@0: src.remove(); michael@0: goto exit; michael@0: } michael@0: michael@0: // parse XML declaration michael@0: if(mXMLDecl.reset(src).lookingAt(0, errorCode)) { michael@0: int32_t declEnd=mXMLDecl.end(errorCode); michael@0: // go beyond fChildren.addElement(t, status); michael@0: t->fParent = el; michael@0: fElementStack.push(el, status); michael@0: el = t; michael@0: continue; michael@0: } michael@0: michael@0: // Text Content. String is concatenated onto the current node's content, michael@0: // but only if it contains something other than spaces. michael@0: UnicodeString s = scanContent(status); michael@0: if (s.length() > 0) { michael@0: mXMLSP.reset(s); michael@0: if (mXMLSP.matches(status) == FALSE) { michael@0: // This chunk of text contains something other than just michael@0: // white space. Make a child node for it. michael@0: replaceCharRefs(s, status); michael@0: el->fChildren.addElement(s.clone(), status); michael@0: } michael@0: mXMLSP.reset(src); // The matchers need to stay set to the main input string. michael@0: continue; michael@0: } michael@0: michael@0: // Comments. Discard. michael@0: if (mXMLComment.lookingAt(fPos, status)) { michael@0: fPos = mXMLComment.end(status); michael@0: continue; michael@0: } michael@0: michael@0: // PIs. Discard. michael@0: if (mXMLPI.lookingAt(fPos, status)) { michael@0: fPos = mXMLPI.end(status); michael@0: continue; michael@0: } michael@0: michael@0: // Element End michael@0: if (mXMLElemEnd.lookingAt(fPos, status)) { michael@0: fPos = mXMLElemEnd.end(0, status); michael@0: const UnicodeString name = mXMLElemEnd.group(1, status); michael@0: if (name != *el->fName) { michael@0: error("Element start / end tag mismatch", status); michael@0: goto errorExit; michael@0: } michael@0: if (fElementStack.empty()) { michael@0: // Close of the root element. We're done with the doc. michael@0: el = NULL; michael@0: break; michael@0: } michael@0: el = (UXMLElement *)fElementStack.pop(); michael@0: continue; michael@0: } michael@0: michael@0: // Empty Element. Stored as a child of the current element, but not stacked. michael@0: if (mXMLElemEmpty.lookingAt(fPos, status)) { michael@0: UXMLElement *t = createElement(mXMLElemEmpty, status); michael@0: el->fChildren.addElement(t, status); michael@0: continue; michael@0: } michael@0: michael@0: // Hit something within the document that doesn't match anything. michael@0: // It's an error. michael@0: error("Unrecognized markup", status); michael@0: break; michael@0: } michael@0: michael@0: if (el != NULL || !fElementStack.empty()) { michael@0: // We bailed out early, for some reason. michael@0: error("Root element not closed.", status); michael@0: goto errorExit; michael@0: } michael@0: } michael@0: michael@0: // Root Element parse is complete. michael@0: // Consume the annoying xml "Misc" that can appear at the end of the doc. michael@0: parseMisc(status); michael@0: michael@0: // We should have reached the end of the input michael@0: if (fPos != src.length()) { michael@0: error("Extra content at the end of the document", status); michael@0: goto errorExit; michael@0: } michael@0: michael@0: // Success! michael@0: return root; michael@0: michael@0: errorExit: michael@0: delete root; michael@0: return NULL; michael@0: } michael@0: michael@0: // michael@0: // createElement michael@0: // We've just matched an element start tag. Create and fill in a UXMLElement object michael@0: // for it. michael@0: // michael@0: UXMLElement * michael@0: UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { michael@0: // First capture group is the element's name. michael@0: UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status); michael@0: michael@0: // Scan for attributes. michael@0: int32_t pos = mEl.end(1, status); // The position after the end of the tag name michael@0: michael@0: while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element. michael@0: UnicodeString attName = mAttrValue.group(1, status); michael@0: UnicodeString attValue = mAttrValue.group(2, status); michael@0: michael@0: // Trim the quotes from the att value. These are left over from the original regex michael@0: // that parsed the attribue, which couldn't conveniently strip them. michael@0: attValue.remove(0,1); // one char from the beginning michael@0: attValue.truncate(attValue.length()-1); // and one from the end. michael@0: michael@0: // XML Attribue value normalization. michael@0: // This is one of the really screwy parts of the XML spec. michael@0: // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize michael@0: // Note that non-validating parsers must treat all entities as type CDATA michael@0: // which simplifies things some. michael@0: michael@0: // Att normalization step 1: normalize any newlines in the attribute value michael@0: mNewLineNormalizer.reset(attValue); michael@0: attValue = mNewLineNormalizer.replaceAll(fOneLF, status); michael@0: michael@0: // Next change all xml white space chars to plain \u0020 spaces. michael@0: mAttrNormalizer.reset(attValue); michael@0: UnicodeString oneSpace((UChar)0x0020); michael@0: attValue = mAttrNormalizer.replaceAll(oneSpace, status); michael@0: michael@0: // Replace character entities. michael@0: replaceCharRefs(attValue, status); michael@0: michael@0: // Save the attribute name and value in our document structure. michael@0: el->fAttNames.addElement((void *)intern(attName, status), status); michael@0: el->fAttValues.addElement(attValue.clone(), status); michael@0: pos = mAttrValue.end(2, status); michael@0: } michael@0: fPos = mEl.end(0, status); michael@0: return el; michael@0: } michael@0: michael@0: // michael@0: // parseMisc michael@0: // Consume XML "Misc" [production #27] michael@0: // which is any combination of space, PI and comments michael@0: // Need to watch end-of-input because xml MISC stuff is allowed after michael@0: // the document element, so we WILL scan off the end in this function michael@0: // michael@0: void michael@0: UXMLParser::parseMisc(UErrorCode &status) { michael@0: for (;;) { michael@0: if (fPos >= mXMLPI.input().length()) { michael@0: break; michael@0: } michael@0: if (mXMLPI.lookingAt(fPos, status)) { michael@0: fPos = mXMLPI.end(status); michael@0: continue; michael@0: } michael@0: if (mXMLSP.lookingAt(fPos, status)) { michael@0: fPos = mXMLSP.end(status); michael@0: continue; michael@0: } michael@0: if (mXMLComment.lookingAt(fPos, status)) { michael@0: fPos = mXMLComment.end(status); michael@0: continue; michael@0: } michael@0: break; michael@0: } michael@0: } michael@0: michael@0: // michael@0: // Scan for document content. michael@0: // michael@0: UnicodeString michael@0: UXMLParser::scanContent(UErrorCode &status) { michael@0: UnicodeString result; michael@0: if (mXMLCharData.lookingAt(fPos, status)) { michael@0: result = mXMLCharData.group((int32_t)0, status); michael@0: // Normalize the new-lines. (Before char ref substitution) michael@0: mNewLineNormalizer.reset(result); michael@0: result = mNewLineNormalizer.replaceAll(fOneLF, status); michael@0: michael@0: // TODO: handle CDATA michael@0: fPos = mXMLCharData.end(0, status); michael@0: } michael@0: michael@0: return result; michael@0: } michael@0: michael@0: // michael@0: // replaceCharRefs michael@0: // michael@0: // replace the char entities < & { ካ etc. in a string michael@0: // with the corresponding actual character. michael@0: // michael@0: void michael@0: UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { michael@0: UnicodeString result; michael@0: UnicodeString replacement; michael@0: int i; michael@0: michael@0: mAmps.reset(s); michael@0: // See the initialization for the regex matcher mAmps. michael@0: // Which entity we've matched is determined by which capture group has content, michael@0: // which is flaged by start() of that group not being -1. michael@0: while (mAmps.find()) { michael@0: if (mAmps.start(1, status) != -1) { michael@0: replacement.setTo((UChar)x_AMP); michael@0: } else if (mAmps.start(2, status) != -1) { michael@0: replacement.setTo((UChar)x_LT); michael@0: } else if (mAmps.start(3, status) != -1) { michael@0: replacement.setTo((UChar)x_GT); michael@0: } else if (mAmps.start(4, status) != -1) { michael@0: replacement.setTo((UChar)x_APOS); michael@0: } else if (mAmps.start(5, status) != -1) { michael@0: replacement.setTo((UChar)x_QUOT); michael@0: } else if (mAmps.start(6, status) != -1) { michael@0: UnicodeString hexString = mAmps.group(6, status); michael@0: UChar32 val = 0; michael@0: for (i=0; i=0) { michael@0: ci = src.indexOf((UChar)0x0a, ci+1); michael@0: line++; michael@0: } michael@0: fprintf(stderr, "Error: %s at line %d\n", message, line); michael@0: if (U_SUCCESS(status)) { michael@0: status = U_PARSE_ERROR; michael@0: } michael@0: } michael@0: michael@0: // intern strings like in Java michael@0: michael@0: const UnicodeString * michael@0: UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { michael@0: const UHashElement *he=fNames.find(s); michael@0: if(he!=NULL) { michael@0: // already a known name, return its hashed key pointer michael@0: return (const UnicodeString *)he->key.pointer; michael@0: } else { michael@0: // add this new name and return its hashed key pointer michael@0: fNames.puti(s, 0, errorCode); michael@0: he=fNames.find(s); michael@0: return (const UnicodeString *)he->key.pointer; michael@0: } michael@0: } michael@0: michael@0: const UnicodeString * michael@0: UXMLParser::findName(const UnicodeString &s) const { michael@0: const UHashElement *he=fNames.find(s); michael@0: if(he!=NULL) { michael@0: // a known name, return its hashed key pointer michael@0: return (const UnicodeString *)he->key.pointer; michael@0: } else { michael@0: // unknown name michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: // UXMLElement ------------------------------------------------------------- *** michael@0: michael@0: UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) : michael@0: fParser(parser), michael@0: fName(name), michael@0: fAttNames(errorCode), michael@0: fAttValues(errorCode), michael@0: fChildren(errorCode), michael@0: fParent(NULL) michael@0: { michael@0: } michael@0: michael@0: UXMLElement::~UXMLElement() { michael@0: int i; michael@0: // attribute names are owned by the UXMLParser, don't delete them here michael@0: for (i=fAttValues.size()-1; i>=0; i--) { michael@0: delete (UObject *)fAttValues.elementAt(i); michael@0: } michael@0: for (i=fChildren.size()-1; i>=0; i--) { michael@0: delete (UObject *)fChildren.elementAt(i); michael@0: } michael@0: } michael@0: michael@0: const UnicodeString & michael@0: UXMLElement::getTagName() const { michael@0: return *fName; michael@0: } michael@0: michael@0: UnicodeString michael@0: UXMLElement::getText(UBool recurse) const { michael@0: UnicodeString text; michael@0: appendText(text, recurse); michael@0: return text; michael@0: } michael@0: michael@0: void michael@0: UXMLElement::appendText(UnicodeString &text, UBool recurse) const { michael@0: const UObject *node; michael@0: int32_t i, count=fChildren.size(); michael@0: for(i=0; i(node); michael@0: if(s!=NULL) { michael@0: text.append(*s); michael@0: } else if(recurse) /* must be a UXMLElement */ { michael@0: ((const UXMLElement *)node)->appendText(text, recurse); michael@0: } michael@0: } michael@0: } michael@0: michael@0: int32_t michael@0: UXMLElement::countAttributes() const { michael@0: return fAttNames.size(); michael@0: } michael@0: michael@0: const UnicodeString * michael@0: UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const { michael@0: if(0<=i && ifindName(name); michael@0: if(p==NULL) { michael@0: return NULL; // no such attribute seen by the parser at all michael@0: } michael@0: michael@0: int32_t i, count=fAttNames.size(); michael@0: for(i=0; i(node)!=NULL) { michael@0: type=UXML_NODE_TYPE_ELEMENT; michael@0: } else { michael@0: type=UXML_NODE_TYPE_STRING; michael@0: } michael@0: return node; michael@0: } else { michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: const UXMLElement * michael@0: UXMLElement::nextChildElement(int32_t &i) const { michael@0: if(i<0) { michael@0: return NULL; michael@0: } michael@0: michael@0: const UObject *node; michael@0: int32_t count=fChildren.size(); michael@0: while(i(node); michael@0: if(elem!=NULL) { michael@0: return elem; michael@0: } michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: const UXMLElement * michael@0: UXMLElement::getChildElement(const UnicodeString &name) const { michael@0: // search for the element name by comparing the interned pointer, michael@0: // not the string contents michael@0: const UnicodeString *p=fParser->findName(name); michael@0: if(p==NULL) { michael@0: return NULL; // no such element seen by the parser at all michael@0: } michael@0: michael@0: const UObject *node; michael@0: int32_t i, count=fChildren.size(); michael@0: for(i=0; i(node); michael@0: if(elem!=NULL) { michael@0: if(p==elem->fName) { michael@0: return elem; michael@0: } michael@0: } michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ michael@0: