|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2004-2010, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: xmlparser.cpp |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2004jul21 |
|
14 * created by: Andy Heninger |
|
15 */ |
|
16 |
|
17 #include <stdio.h> |
|
18 #include "unicode/uchar.h" |
|
19 #include "unicode/ucnv.h" |
|
20 #include "unicode/regex.h" |
|
21 #include "filestrm.h" |
|
22 #include "xmlparser.h" |
|
23 |
|
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION |
|
25 |
|
// character constants
// UTF-16 code unit values of the ASCII characters the parser refers to by value.
enum {
    x_QUOT=0x22,    // '"'  quotation mark
    x_AMP=0x26,     // '&'  ampersand
    x_APOS=0x27,    // '\'' apostrophe
    x_LT=0x3c,      // '<'  less-than sign
    x_GT=0x3e,      // '>'  greater-than sign
    x_l=0x6c        // 'l'  lowercase L; parseFile() searches for it to step past "<?xml"
};
|
35 |
|
// Regular expression fragments, written as invariant-character C strings.
// The doubled backslashes survive C string processing as single backslashes,
// so that the \uhhhh and \Uhhhhhhhh escapes are seen by the regex compiler.

// Set of the XML white space characters: space, tab (U+0009),
// carriage return (U+000D) and line feed (U+000A).
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4
// Set of the characters that may begin an XML name (NameStartChar).
#define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

// XML #5
// Set of the characters that may continue an XML name (NameChar): every name
// start character plus '-', '.', the digits, U+00B7 and two further ranges.
#define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

// XML #6
// A complete XML name: one name start character followed by any number of
// name characters.  This fragment contains no capture group of its own.
#define XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
|
49 |
|
50 U_NAMESPACE_BEGIN |
|
51 |
|
// Instantiate ICU's RTTI implementation macro for the two classes
// whose member functions are defined in this file.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
|
54 |
|
55 // |
|
56 // UXMLParser constructor. Mostly just initializes the ICU regexes that are |
|
57 // used for parsing. |
|
58 // |
|
59 UXMLParser::UXMLParser(UErrorCode &status) : |
|
60 // XML Declaration. XML Production #23. |
|
61 // example: "<?xml version=1.0 encoding="utf-16" ?> |
|
62 // This is a sloppy implementation - just look for the leading <?xml and the closing ?> |
|
63 // allow for a possible leading BOM. |
|
64 mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), |
|
65 |
|
66 // XML Comment production #15 |
|
67 // example: "<!-- whatever --> |
|
68 // note, does not detect an illegal "--" within comments |
|
69 mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status), |
|
70 |
|
71 // XML Spaces |
|
72 // production [3] |
|
73 mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), |
|
74 |
|
75 // XML Doctype decl production #28 |
|
76 // example "<!DOCTYPE foo SYSTEM "somewhere" > |
|
77 // or "<!DOCTYPE foo [internal dtd]> |
|
78 // TODO: we don't actually parse the DOCTYPE or internal subsets. |
|
79 // Some internal dtd subsets could confuse this simple-minded |
|
80 // attempt at skipping over them, specifically, occcurences |
|
81 // of closeing square brackets. These could appear in comments, |
|
82 // or in parameter entity declarations, for example. |
|
83 mXMLDoctype(UnicodeString( |
|
84 "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV |
|
85 ), 0, status), |
|
86 |
|
87 // XML PI production #16 |
|
88 // example "<?target stuff?> |
|
89 mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), |
|
90 |
|
91 // XML Element Start Productions #40, #41 |
|
92 // example <foo att1='abc' att2="d e f" > |
|
93 // capture #1: the tag name |
|
94 // |
|
95 mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" |
|
96 "(?:" |
|
97 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " |
|
98 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' |
|
99 ")*" // * for zero or more attributes. |
|
100 XML_SPACES "*?>", -1, US_INV), 0, status), // match " >" |
|
101 |
|
102 // XML Element End production #42 |
|
103 // example </foo> |
|
104 mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status), |
|
105 |
|
106 // XML Element Empty production #44 |
|
107 // example <foo att1="abc" att2="d e f" /> |
|
108 mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" |
|
109 "(?:" |
|
110 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " |
|
111 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' |
|
112 ")*" // * for zero or more attributes. |
|
113 XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />" |
|
114 |
|
115 |
|
116 // XMLCharData. Everything but '<'. Note that & will be dealt with later. |
|
117 mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), |
|
118 |
|
119 // Attribute name = "value". XML Productions 10, 40/41 |
|
120 // Capture group 1 is name, |
|
121 // 2 is the attribute value, including the quotes. |
|
122 // |
|
123 // Note that attributes are scanned twice. The first time is with |
|
124 // the regex for an entire element start. There, the attributes |
|
125 // are checked syntactically, but not separted out one by one. |
|
126 // Here, we match a single attribute, and make its name and |
|
127 // attribute value available to the parser code. |
|
128 mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" |
|
129 "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), |
|
130 |
|
131 |
|
132 mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), |
|
133 |
|
134 // Match any of the new-line sequences in content. |
|
135 // All are changed to \u000a. |
|
136 mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), |
|
137 |
|
138 // & char references |
|
139 // We will figure out what we've got based on which capture group has content. |
|
140 // The last one is a catchall for unrecognized entity references.. |
|
141 // 1 2 3 4 5 6 7 8 |
|
142 mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"), |
|
143 0, status), |
|
144 |
|
145 fNames(status), |
|
146 fElementStack(status), |
|
147 fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization. |
|
148 { |
|
149 } |
|
150 |
|
151 UXMLParser * |
|
152 UXMLParser::createParser(UErrorCode &errorCode) { |
|
153 if (U_FAILURE(errorCode)) { |
|
154 return NULL; |
|
155 } else { |
|
156 return new UXMLParser(errorCode); |
|
157 } |
|
158 } |
|
159 |
|
160 UXMLParser::~UXMLParser() {} |
|
161 |
|
162 UXMLElement * |
|
163 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { |
|
164 char bytes[4096], charsetBuffer[100]; |
|
165 FileStream *f; |
|
166 const char *charset, *pb; |
|
167 UnicodeString src; |
|
168 UConverter *cnv; |
|
169 UChar *buffer, *pu; |
|
170 int32_t fileLength, bytesLength, length, capacity; |
|
171 UBool flush; |
|
172 |
|
173 if(U_FAILURE(errorCode)) { |
|
174 return NULL; |
|
175 } |
|
176 |
|
177 f=T_FileStream_open(filename, "rb"); |
|
178 if(f==NULL) { |
|
179 errorCode=U_FILE_ACCESS_ERROR; |
|
180 return NULL; |
|
181 } |
|
182 |
|
183 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); |
|
184 if(bytesLength<(int32_t)sizeof(bytes)) { |
|
185 // we have already read the entire file |
|
186 fileLength=bytesLength; |
|
187 } else { |
|
188 // get the file length |
|
189 fileLength=T_FileStream_size(f); |
|
190 } |
|
191 |
|
192 /* |
|
193 * get the charset: |
|
194 * 1. Unicode signature |
|
195 * 2. treat as ISO-8859-1 and read XML encoding="charser" |
|
196 * 3. default to UTF-8 |
|
197 */ |
|
198 charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode); |
|
199 if(U_SUCCESS(errorCode) && charset!=NULL) { |
|
200 // open converter according to Unicode signature |
|
201 cnv=ucnv_open(charset, &errorCode); |
|
202 } else { |
|
203 // read as Latin-1 and parse the XML declaration and encoding |
|
204 cnv=ucnv_open("ISO-8859-1", &errorCode); |
|
205 if(U_FAILURE(errorCode)) { |
|
206 // unexpected error opening Latin-1 converter |
|
207 goto exit; |
|
208 } |
|
209 |
|
210 buffer=src.getBuffer(bytesLength); |
|
211 if(buffer==NULL) { |
|
212 // unexpected failure to reserve some string capacity |
|
213 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
214 goto exit; |
|
215 } |
|
216 pb=bytes; |
|
217 pu=buffer; |
|
218 ucnv_toUnicode( |
|
219 cnv, |
|
220 &pu, buffer+src.getCapacity(), |
|
221 &pb, bytes+bytesLength, |
|
222 NULL, TRUE, &errorCode); |
|
223 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); |
|
224 ucnv_close(cnv); |
|
225 cnv=NULL; |
|
226 if(U_FAILURE(errorCode)) { |
|
227 // unexpected error in conversion from Latin-1 |
|
228 src.remove(); |
|
229 goto exit; |
|
230 } |
|
231 |
|
232 // parse XML declaration |
|
233 if(mXMLDecl.reset(src).lookingAt(0, errorCode)) { |
|
234 int32_t declEnd=mXMLDecl.end(errorCode); |
|
235 // go beyond <?xml |
|
236 int32_t pos=src.indexOf((UChar)x_l)+1; |
|
237 |
|
238 mAttrValue.reset(src); |
|
239 while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element. |
|
240 UnicodeString attName = mAttrValue.group(1, errorCode); |
|
241 UnicodeString attValue = mAttrValue.group(2, errorCode); |
|
242 |
|
243 // Trim the quotes from the att value. These are left over from the original regex |
|
244 // that parsed the attribue, which couldn't conveniently strip them. |
|
245 attValue.remove(0,1); // one char from the beginning |
|
246 attValue.truncate(attValue.length()-1); // and one from the end. |
|
247 |
|
248 if(attName==UNICODE_STRING("encoding", 8)) { |
|
249 length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer)); |
|
250 charset=charsetBuffer; |
|
251 break; |
|
252 } |
|
253 pos = mAttrValue.end(2, errorCode); |
|
254 } |
|
255 |
|
256 if(charset==NULL) { |
|
257 // default to UTF-8 |
|
258 charset="UTF-8"; |
|
259 } |
|
260 cnv=ucnv_open(charset, &errorCode); |
|
261 } |
|
262 } |
|
263 |
|
264 if(U_FAILURE(errorCode)) { |
|
265 // unable to open the converter |
|
266 goto exit; |
|
267 } |
|
268 |
|
269 // convert the file contents |
|
270 capacity=fileLength; // estimated capacity |
|
271 src.getBuffer(capacity); |
|
272 src.releaseBuffer(0); // zero length |
|
273 flush=FALSE; |
|
274 for(;;) { |
|
275 // convert contents of bytes[bytesLength] |
|
276 pb=bytes; |
|
277 for(;;) { |
|
278 length=src.length(); |
|
279 buffer=src.getBuffer(capacity); |
|
280 if(buffer==NULL) { |
|
281 // unexpected failure to reserve some string capacity |
|
282 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
283 goto exit; |
|
284 } |
|
285 |
|
286 pu=buffer+length; |
|
287 ucnv_toUnicode( |
|
288 cnv, &pu, buffer+src.getCapacity(), |
|
289 &pb, bytes+bytesLength, |
|
290 NULL, FALSE, &errorCode); |
|
291 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); |
|
292 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { |
|
293 errorCode=U_ZERO_ERROR; |
|
294 capacity=(3*src.getCapacity())/2; // increase capacity by 50% |
|
295 } else { |
|
296 break; |
|
297 } |
|
298 } |
|
299 |
|
300 if(U_FAILURE(errorCode)) { |
|
301 break; // conversion error |
|
302 } |
|
303 |
|
304 if(flush) { |
|
305 break; // completely converted the file |
|
306 } |
|
307 |
|
308 // read next block |
|
309 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); |
|
310 if(bytesLength==0) { |
|
311 // reached end of file, convert once more to flush the converter |
|
312 flush=TRUE; |
|
313 } |
|
314 }; |
|
315 |
|
316 exit: |
|
317 ucnv_close(cnv); |
|
318 T_FileStream_close(f); |
|
319 |
|
320 if(U_SUCCESS(errorCode)) { |
|
321 return parse(src, errorCode); |
|
322 } else { |
|
323 return NULL; |
|
324 } |
|
325 } |
|
326 |
|
327 UXMLElement * |
|
328 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) { |
|
329 if(U_FAILURE(status)) { |
|
330 return NULL; |
|
331 } |
|
332 |
|
333 UXMLElement *root = NULL; |
|
334 fPos = 0; // TODO use just a local pos variable and pass it into functions |
|
335 // where necessary? |
|
336 |
|
337 // set all matchers to work on the input string |
|
338 mXMLDecl.reset(src); |
|
339 mXMLComment.reset(src); |
|
340 mXMLSP.reset(src); |
|
341 mXMLDoctype.reset(src); |
|
342 mXMLPI.reset(src); |
|
343 mXMLElemStart.reset(src); |
|
344 mXMLElemEnd.reset(src); |
|
345 mXMLElemEmpty.reset(src); |
|
346 mXMLCharData.reset(src); |
|
347 mAttrValue.reset(src); |
|
348 mAttrNormalizer.reset(src); |
|
349 mNewLineNormalizer.reset(src); |
|
350 mAmps.reset(src); |
|
351 |
|
352 // Consume the XML Declaration, if present. |
|
353 if (mXMLDecl.lookingAt(fPos, status)) { |
|
354 fPos = mXMLDecl.end(status); |
|
355 } |
|
356 |
|
357 // Consume "misc" [XML production 27] appearing before DocType |
|
358 parseMisc(status); |
|
359 |
|
360 // Consume a DocType declaration, if present. |
|
361 if (mXMLDoctype.lookingAt(fPos, status)) { |
|
362 fPos = mXMLDoctype.end(status); |
|
363 } |
|
364 |
|
365 // Consume additional "misc" [XML production 27] appearing after the DocType |
|
366 parseMisc(status); |
|
367 |
|
368 // Get the root element |
|
369 if (mXMLElemEmpty.lookingAt(fPos, status)) { |
|
370 // Root is an empty element (no nested elements or content) |
|
371 root = createElement(mXMLElemEmpty, status); |
|
372 fPos = mXMLElemEmpty.end(status); |
|
373 } else { |
|
374 if (mXMLElemStart.lookingAt(fPos, status) == FALSE) { |
|
375 error("Root Element expected", status); |
|
376 goto errorExit; |
|
377 } |
|
378 root = createElement(mXMLElemStart, status); |
|
379 UXMLElement *el = root; |
|
380 |
|
381 // |
|
382 // This is the loop that consumes the root element of the document, |
|
383 // including all nested content. Nested elements are handled by |
|
384 // explicit pushes/pops of the element stack; there is no recursion |
|
385 // in the control flow of this code. |
|
386 // "el" always refers to the current element, the one to which content |
|
387 // is being added. It is above the top of the element stack. |
|
388 for (;;) { |
|
389 // Nested Element Start |
|
390 if (mXMLElemStart.lookingAt(fPos, status)) { |
|
391 UXMLElement *t = createElement(mXMLElemStart, status); |
|
392 el->fChildren.addElement(t, status); |
|
393 t->fParent = el; |
|
394 fElementStack.push(el, status); |
|
395 el = t; |
|
396 continue; |
|
397 } |
|
398 |
|
399 // Text Content. String is concatenated onto the current node's content, |
|
400 // but only if it contains something other than spaces. |
|
401 UnicodeString s = scanContent(status); |
|
402 if (s.length() > 0) { |
|
403 mXMLSP.reset(s); |
|
404 if (mXMLSP.matches(status) == FALSE) { |
|
405 // This chunk of text contains something other than just |
|
406 // white space. Make a child node for it. |
|
407 replaceCharRefs(s, status); |
|
408 el->fChildren.addElement(s.clone(), status); |
|
409 } |
|
410 mXMLSP.reset(src); // The matchers need to stay set to the main input string. |
|
411 continue; |
|
412 } |
|
413 |
|
414 // Comments. Discard. |
|
415 if (mXMLComment.lookingAt(fPos, status)) { |
|
416 fPos = mXMLComment.end(status); |
|
417 continue; |
|
418 } |
|
419 |
|
420 // PIs. Discard. |
|
421 if (mXMLPI.lookingAt(fPos, status)) { |
|
422 fPos = mXMLPI.end(status); |
|
423 continue; |
|
424 } |
|
425 |
|
426 // Element End |
|
427 if (mXMLElemEnd.lookingAt(fPos, status)) { |
|
428 fPos = mXMLElemEnd.end(0, status); |
|
429 const UnicodeString name = mXMLElemEnd.group(1, status); |
|
430 if (name != *el->fName) { |
|
431 error("Element start / end tag mismatch", status); |
|
432 goto errorExit; |
|
433 } |
|
434 if (fElementStack.empty()) { |
|
435 // Close of the root element. We're done with the doc. |
|
436 el = NULL; |
|
437 break; |
|
438 } |
|
439 el = (UXMLElement *)fElementStack.pop(); |
|
440 continue; |
|
441 } |
|
442 |
|
443 // Empty Element. Stored as a child of the current element, but not stacked. |
|
444 if (mXMLElemEmpty.lookingAt(fPos, status)) { |
|
445 UXMLElement *t = createElement(mXMLElemEmpty, status); |
|
446 el->fChildren.addElement(t, status); |
|
447 continue; |
|
448 } |
|
449 |
|
450 // Hit something within the document that doesn't match anything. |
|
451 // It's an error. |
|
452 error("Unrecognized markup", status); |
|
453 break; |
|
454 } |
|
455 |
|
456 if (el != NULL || !fElementStack.empty()) { |
|
457 // We bailed out early, for some reason. |
|
458 error("Root element not closed.", status); |
|
459 goto errorExit; |
|
460 } |
|
461 } |
|
462 |
|
463 // Root Element parse is complete. |
|
464 // Consume the annoying xml "Misc" that can appear at the end of the doc. |
|
465 parseMisc(status); |
|
466 |
|
467 // We should have reached the end of the input |
|
468 if (fPos != src.length()) { |
|
469 error("Extra content at the end of the document", status); |
|
470 goto errorExit; |
|
471 } |
|
472 |
|
473 // Success! |
|
474 return root; |
|
475 |
|
476 errorExit: |
|
477 delete root; |
|
478 return NULL; |
|
479 } |
|
480 |
|
481 // |
|
482 // createElement |
|
483 // We've just matched an element start tag. Create and fill in a UXMLElement object |
|
484 // for it. |
|
485 // |
|
486 UXMLElement * |
|
487 UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { |
|
488 // First capture group is the element's name. |
|
489 UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status); |
|
490 |
|
491 // Scan for attributes. |
|
492 int32_t pos = mEl.end(1, status); // The position after the end of the tag name |
|
493 |
|
494 while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element. |
|
495 UnicodeString attName = mAttrValue.group(1, status); |
|
496 UnicodeString attValue = mAttrValue.group(2, status); |
|
497 |
|
498 // Trim the quotes from the att value. These are left over from the original regex |
|
499 // that parsed the attribue, which couldn't conveniently strip them. |
|
500 attValue.remove(0,1); // one char from the beginning |
|
501 attValue.truncate(attValue.length()-1); // and one from the end. |
|
502 |
|
503 // XML Attribue value normalization. |
|
504 // This is one of the really screwy parts of the XML spec. |
|
505 // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize |
|
506 // Note that non-validating parsers must treat all entities as type CDATA |
|
507 // which simplifies things some. |
|
508 |
|
509 // Att normalization step 1: normalize any newlines in the attribute value |
|
510 mNewLineNormalizer.reset(attValue); |
|
511 attValue = mNewLineNormalizer.replaceAll(fOneLF, status); |
|
512 |
|
513 // Next change all xml white space chars to plain \u0020 spaces. |
|
514 mAttrNormalizer.reset(attValue); |
|
515 UnicodeString oneSpace((UChar)0x0020); |
|
516 attValue = mAttrNormalizer.replaceAll(oneSpace, status); |
|
517 |
|
518 // Replace character entities. |
|
519 replaceCharRefs(attValue, status); |
|
520 |
|
521 // Save the attribute name and value in our document structure. |
|
522 el->fAttNames.addElement((void *)intern(attName, status), status); |
|
523 el->fAttValues.addElement(attValue.clone(), status); |
|
524 pos = mAttrValue.end(2, status); |
|
525 } |
|
526 fPos = mEl.end(0, status); |
|
527 return el; |
|
528 } |
|
529 |
|
530 // |
|
531 // parseMisc |
|
532 // Consume XML "Misc" [production #27] |
|
533 // which is any combination of space, PI and comments |
|
534 // Need to watch end-of-input because xml MISC stuff is allowed after |
|
535 // the document element, so we WILL scan off the end in this function |
|
536 // |
|
537 void |
|
538 UXMLParser::parseMisc(UErrorCode &status) { |
|
539 for (;;) { |
|
540 if (fPos >= mXMLPI.input().length()) { |
|
541 break; |
|
542 } |
|
543 if (mXMLPI.lookingAt(fPos, status)) { |
|
544 fPos = mXMLPI.end(status); |
|
545 continue; |
|
546 } |
|
547 if (mXMLSP.lookingAt(fPos, status)) { |
|
548 fPos = mXMLSP.end(status); |
|
549 continue; |
|
550 } |
|
551 if (mXMLComment.lookingAt(fPos, status)) { |
|
552 fPos = mXMLComment.end(status); |
|
553 continue; |
|
554 } |
|
555 break; |
|
556 } |
|
557 } |
|
558 |
|
559 // |
|
560 // Scan for document content. |
|
561 // |
|
562 UnicodeString |
|
563 UXMLParser::scanContent(UErrorCode &status) { |
|
564 UnicodeString result; |
|
565 if (mXMLCharData.lookingAt(fPos, status)) { |
|
566 result = mXMLCharData.group((int32_t)0, status); |
|
567 // Normalize the new-lines. (Before char ref substitution) |
|
568 mNewLineNormalizer.reset(result); |
|
569 result = mNewLineNormalizer.replaceAll(fOneLF, status); |
|
570 |
|
571 // TODO: handle CDATA |
|
572 fPos = mXMLCharData.end(0, status); |
|
573 } |
|
574 |
|
575 return result; |
|
576 } |
|
577 |
|
578 // |
|
579 // replaceCharRefs |
|
580 // |
|
581 // replace the char entities < & { ካ etc. in a string |
|
582 // with the corresponding actual character. |
|
583 // |
|
584 void |
|
585 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { |
|
586 UnicodeString result; |
|
587 UnicodeString replacement; |
|
588 int i; |
|
589 |
|
590 mAmps.reset(s); |
|
591 // See the initialization for the regex matcher mAmps. |
|
592 // Which entity we've matched is determined by which capture group has content, |
|
593 // which is flaged by start() of that group not being -1. |
|
594 while (mAmps.find()) { |
|
595 if (mAmps.start(1, status) != -1) { |
|
596 replacement.setTo((UChar)x_AMP); |
|
597 } else if (mAmps.start(2, status) != -1) { |
|
598 replacement.setTo((UChar)x_LT); |
|
599 } else if (mAmps.start(3, status) != -1) { |
|
600 replacement.setTo((UChar)x_GT); |
|
601 } else if (mAmps.start(4, status) != -1) { |
|
602 replacement.setTo((UChar)x_APOS); |
|
603 } else if (mAmps.start(5, status) != -1) { |
|
604 replacement.setTo((UChar)x_QUOT); |
|
605 } else if (mAmps.start(6, status) != -1) { |
|
606 UnicodeString hexString = mAmps.group(6, status); |
|
607 UChar32 val = 0; |
|
608 for (i=0; i<hexString.length(); i++) { |
|
609 val = (val << 4) + u_digit(hexString.charAt(i), 16); |
|
610 } |
|
611 // TODO: some verification that the character is valid |
|
612 replacement.setTo(val); |
|
613 } else if (mAmps.start(7, status) != -1) { |
|
614 UnicodeString decimalString = mAmps.group(7, status); |
|
615 UChar32 val = 0; |
|
616 for (i=0; i<decimalString.length(); i++) { |
|
617 val = val*10 + u_digit(decimalString.charAt(i), 10); |
|
618 } |
|
619 // TODO: some verification that the character is valid |
|
620 replacement.setTo(val); |
|
621 } else { |
|
622 // An unrecognized &entity; Leave it alone. |
|
623 // TODO: check that it really looks like an entity, and is not some |
|
624 // random & in the text. |
|
625 replacement = mAmps.group((int32_t)0, status); |
|
626 } |
|
627 mAmps.appendReplacement(result, replacement, status); |
|
628 } |
|
629 mAmps.appendTail(result); |
|
630 s = result; |
|
631 } |
|
632 |
|
633 void |
|
634 UXMLParser::error(const char *message, UErrorCode &status) { |
|
635 // TODO: something better here... |
|
636 const UnicodeString &src=mXMLDecl.input(); |
|
637 int line = 0; |
|
638 int ci = 0; |
|
639 while (ci < fPos && ci>=0) { |
|
640 ci = src.indexOf((UChar)0x0a, ci+1); |
|
641 line++; |
|
642 } |
|
643 fprintf(stderr, "Error: %s at line %d\n", message, line); |
|
644 if (U_SUCCESS(status)) { |
|
645 status = U_PARSE_ERROR; |
|
646 } |
|
647 } |
|
648 |
|
649 // intern strings like in Java |
|
650 |
|
651 const UnicodeString * |
|
652 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { |
|
653 const UHashElement *he=fNames.find(s); |
|
654 if(he!=NULL) { |
|
655 // already a known name, return its hashed key pointer |
|
656 return (const UnicodeString *)he->key.pointer; |
|
657 } else { |
|
658 // add this new name and return its hashed key pointer |
|
659 fNames.puti(s, 0, errorCode); |
|
660 he=fNames.find(s); |
|
661 return (const UnicodeString *)he->key.pointer; |
|
662 } |
|
663 } |
|
664 |
|
665 const UnicodeString * |
|
666 UXMLParser::findName(const UnicodeString &s) const { |
|
667 const UHashElement *he=fNames.find(s); |
|
668 if(he!=NULL) { |
|
669 // a known name, return its hashed key pointer |
|
670 return (const UnicodeString *)he->key.pointer; |
|
671 } else { |
|
672 // unknown name |
|
673 return NULL; |
|
674 } |
|
675 } |
|
676 |
|
677 // UXMLElement ------------------------------------------------------------- *** |
|
678 |
|
679 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) : |
|
680 fParser(parser), |
|
681 fName(name), |
|
682 fAttNames(errorCode), |
|
683 fAttValues(errorCode), |
|
684 fChildren(errorCode), |
|
685 fParent(NULL) |
|
686 { |
|
687 } |
|
688 |
|
689 UXMLElement::~UXMLElement() { |
|
690 int i; |
|
691 // attribute names are owned by the UXMLParser, don't delete them here |
|
692 for (i=fAttValues.size()-1; i>=0; i--) { |
|
693 delete (UObject *)fAttValues.elementAt(i); |
|
694 } |
|
695 for (i=fChildren.size()-1; i>=0; i--) { |
|
696 delete (UObject *)fChildren.elementAt(i); |
|
697 } |
|
698 } |
|
699 |
|
700 const UnicodeString & |
|
701 UXMLElement::getTagName() const { |
|
702 return *fName; |
|
703 } |
|
704 |
|
705 UnicodeString |
|
706 UXMLElement::getText(UBool recurse) const { |
|
707 UnicodeString text; |
|
708 appendText(text, recurse); |
|
709 return text; |
|
710 } |
|
711 |
|
712 void |
|
713 UXMLElement::appendText(UnicodeString &text, UBool recurse) const { |
|
714 const UObject *node; |
|
715 int32_t i, count=fChildren.size(); |
|
716 for(i=0; i<count; ++i) { |
|
717 node=(const UObject *)fChildren.elementAt(i); |
|
718 const UnicodeString *s=dynamic_cast<const UnicodeString *>(node); |
|
719 if(s!=NULL) { |
|
720 text.append(*s); |
|
721 } else if(recurse) /* must be a UXMLElement */ { |
|
722 ((const UXMLElement *)node)->appendText(text, recurse); |
|
723 } |
|
724 } |
|
725 } |
|
726 |
|
727 int32_t |
|
728 UXMLElement::countAttributes() const { |
|
729 return fAttNames.size(); |
|
730 } |
|
731 |
|
732 const UnicodeString * |
|
733 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const { |
|
734 if(0<=i && i<fAttNames.size()) { |
|
735 name.setTo(*(const UnicodeString *)fAttNames.elementAt(i)); |
|
736 value.setTo(*(const UnicodeString *)fAttValues.elementAt(i)); |
|
737 return &value; // or return (UnicodeString *)fAttValues.elementAt(i); |
|
738 } else { |
|
739 return NULL; |
|
740 } |
|
741 } |
|
742 |
|
743 const UnicodeString * |
|
744 UXMLElement::getAttribute(const UnicodeString &name) const { |
|
745 // search for the attribute name by comparing the interned pointer, |
|
746 // not the string contents |
|
747 const UnicodeString *p=fParser->findName(name); |
|
748 if(p==NULL) { |
|
749 return NULL; // no such attribute seen by the parser at all |
|
750 } |
|
751 |
|
752 int32_t i, count=fAttNames.size(); |
|
753 for(i=0; i<count; ++i) { |
|
754 if(p==(const UnicodeString *)fAttNames.elementAt(i)) { |
|
755 return (const UnicodeString *)fAttValues.elementAt(i); |
|
756 } |
|
757 } |
|
758 return NULL; |
|
759 } |
|
760 |
|
761 int32_t |
|
762 UXMLElement::countChildren() const { |
|
763 return fChildren.size(); |
|
764 } |
|
765 |
|
766 const UObject * |
|
767 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const { |
|
768 if(0<=i && i<fChildren.size()) { |
|
769 const UObject *node=(const UObject *)fChildren.elementAt(i); |
|
770 if(dynamic_cast<const UXMLElement *>(node)!=NULL) { |
|
771 type=UXML_NODE_TYPE_ELEMENT; |
|
772 } else { |
|
773 type=UXML_NODE_TYPE_STRING; |
|
774 } |
|
775 return node; |
|
776 } else { |
|
777 return NULL; |
|
778 } |
|
779 } |
|
780 |
|
781 const UXMLElement * |
|
782 UXMLElement::nextChildElement(int32_t &i) const { |
|
783 if(i<0) { |
|
784 return NULL; |
|
785 } |
|
786 |
|
787 const UObject *node; |
|
788 int32_t count=fChildren.size(); |
|
789 while(i<count) { |
|
790 node=(const UObject *)fChildren.elementAt(i++); |
|
791 const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); |
|
792 if(elem!=NULL) { |
|
793 return elem; |
|
794 } |
|
795 } |
|
796 return NULL; |
|
797 } |
|
798 |
|
799 const UXMLElement * |
|
800 UXMLElement::getChildElement(const UnicodeString &name) const { |
|
801 // search for the element name by comparing the interned pointer, |
|
802 // not the string contents |
|
803 const UnicodeString *p=fParser->findName(name); |
|
804 if(p==NULL) { |
|
805 return NULL; // no such element seen by the parser at all |
|
806 } |
|
807 |
|
808 const UObject *node; |
|
809 int32_t i, count=fChildren.size(); |
|
810 for(i=0; i<count; ++i) { |
|
811 node=(const UObject *)fChildren.elementAt(i); |
|
812 const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); |
|
813 if(elem!=NULL) { |
|
814 if(p==elem->fName) { |
|
815 return elem; |
|
816 } |
|
817 } |
|
818 } |
|
819 return NULL; |
|
820 } |
|
821 |
|
822 U_NAMESPACE_END |
|
823 |
|
824 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ |
|
825 |