Teuchos - Trilinos Tools Package Version of the Day
Loading...
Searching...
No Matches
Teuchos_XMLParser.cpp
1// @HEADER
2// ***********************************************************************
3//
4// Teuchos: Common Tools Package
5// Copyright (2004) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ***********************************************************************
40// @HEADER
41
42// BUGS: There is a bug in Teuchos_XMLObjectImplem.cpp, line 82
43// when printing attribute values, one must check if the value contains quote
44// or apost;
45// a quot'd attval cannot contain literal quot
46// a apos'd attval cannot contain literal apos
47// either they have to be matched appropriately or (easier) all quot and apos must
48// be replaced by &quot; and &apos;
49
50#include "Teuchos_XMLParser.hpp"
52#include "Teuchos_Assert.hpp"
53#include <stack>
54
55using namespace Teuchos;
56
57// this parser currently does not support:
58// * processing instructions
59// * XML schemas
60// * CDATA sections...see http://www.w3.org/TR/2004/REC-xml-20040204/#dt-cdsection
61// * full Unicode support (we read unsigned bytes, so we get only 0x00 through 0xFF)
62//
63// it tolerates (read: ignores) xml declarations, at any point in the file where a tag would be valid
64//
65// it currently does support:
66// * comments
67// * empty element tags, e.g. <hello />
68// * entity references: &amp; &lt; &gt; &apos; &quot;
69// * numeric character references: &#32;
70// * exception/error handling on parse errors
71
72
73/* From the W3C XML 1.0 Third Edition
74 http://www.w3.org/TR/2004/REC-xml-20040204/
75
76 The following productions specify well-formed XML documents.
77 These have been reduced to the subset that this parser is expected to support.
78
79 element ::= EmptyElemTag
80 | STag content ETag
81 STag ::= '<' Name (S Attribute)* S? '>'
82 Attribute ::= Name Eq AttValue
83 ETag ::= '</' Name S? '>'
84 content ::= CharData? ((element | Reference | CDSect | Comment) CharData?)*
85 EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
86
87 AttValue ::= '"' ([^<&"] | Reference)* '"'
88 | "'" ([^<&'] | Reference)* "'"
89
90 CharRef ::= '&#' [0-9]+ ';'
91 EntityRef ::= '&' Name ';'
92 Reference ::= EntityRef | CharRef
93
94 #x20 (space)
95 #x9 (horizontal tab)
96 #xD (carriage return)
97 #xA (new line, new line line feed)
98
99 S ::= (#x20 | #x9 | #xD | #xA)+
100 Eq ::= S? '=' S?
101 NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
102 Name ::= (Letter | '_' | ':') (NameChar)*
103
104 Letter ::= [#x0041-#x005A] | [#x0061-#x007A]
105 | [#x00C0-#x00D6] | [#x00D8-#x00F6]
106 | [#x00F8-#x00FF]
107 Digit ::= [#x0030-#x0039]
108
109 Char ::= #x9 | #xA | #xD | [#x20-#xFF]
110 CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
111 that is, some string of characters not containing '<' or '&' or ']]>'
112 Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
113 that is, '<!--' txt '-->', where txt does not contain '--'
114
115 CDSect ::= CDStart CData CDEnd
116 CDStart ::= '<![CDATA['
117 CData ::= (Char* - (Char* ']]>' Char*))
118 CDEnd ::= ']]>'
119
120 document ::= prolog element Misc*
121 prolog ::= XMLDecl? Misc*
122 XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
123 Misc ::= Comment | S
124
125 VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
126 Eq ::= S? '=' S?
127 VersionNum ::= '1.' [0-9]+
128 Misc ::= Comment | S
129
130
131
132*/
133
// Parse-error helper: hands the condition to TEUCHOS_TEST_FOR_EXCEPTION with
// std::runtime_error as the exception type and prefixes the message with the
// parser's current line number. Because the expansion reads `_lineNo`, it can
// only be used where that name is in scope (i.e. inside XMLParser members).
#define XMLPARSER_TFE( cond , msg ) \
  TEUCHOS_TEST_FOR_EXCEPTION( cond, std::runtime_error, \
    "XML parse error at line " << _lineNo << ": " << msg )
136
138{
139
141
142 _entities.clear();
143 _entities["apos"] = "'";
144 _entities["quot"] = "\"";
145 _entities["lt"] = "<";
146 _entities["gt"] = ">";
147 _entities["amp"] = "&";
148
149 bool done = false;
150 int curopen = 0; // number of currently open tags, or "do we process character data?"
151 bool gotRoot = false;
152 std::stack<long> tagLineStarts;
153 std::stack<string> tags;
154
155 while (!done) {
156
157 std::string tag, cdata;
158 unsigned char c1, c2;
159 Teuchos::map<std::string,string> attrs;
160
161 // Consume any whitespace
162 if (curopen == 0) {
163 // this will leave a lookahead in c1
164 c1 = '\0';
165 if ( getSpace(c1) ) {
166 done = true;
167 break;
168 }
169 }
170 else {
171 // need to manually lookahead
172 if (_is->readBytes(&c1,1) < 1) {
173 done = true;
174 break;
175 }
176 if (c1 == '\n') ++_lineNo; // a newline while processing character data; not an error
177 }
178
179 if (c1 == '<') {
180 // determine if it is a STag/EmptyElemTag or ETag or Comment
181 // get lookahead
182 XMLPARSER_TFE( _is->readBytes(&c2,1) < 1 , "stream ended in tag begin/end");
183
184 if (c2 == '/') {
185 // we have: </
186 // try to get an ETag
187 getETag(tag);
188 // have to check whether we have an enclosing, otherwise tags and tagLineStarts have no top()
189 XMLPARSER_TFE( curopen == 0, "document not well-formed: encountered end element '" << tag << "' while not enclosed." );
190 XMLPARSER_TFE( handler->endElement(tag)!=0, "document not well-formed: end element tag = '" << tag << "'"
191 << " did not match start element '" << tags.top()
192 << "' from line " << tagLineStarts.top() );
193 curopen--;
194 tagLineStarts.pop();
195 tags.pop();
196 }
197 else if (isLetter(c2) || c2==':' || c2=='_') {
198 // it looks like a STag or an EmptyElemTag
199 bool emptytag;
200 tagLineStarts.push(_lineNo);
201 getSTag(c2, tag, attrs, emptytag);
202 tags.push(tag);
203 handler->startElement(tag,attrs);
204 if (curopen == 0) {
205 XMLPARSER_TFE(gotRoot == true, "document not well-formed: more than one root element specified" );
206 gotRoot = true;
207 }
208 curopen++;
209 if (emptytag) {
210 // we just opened this tag, so we should not have any trouble closing it
211 XMLPARSER_TFE( handler->endElement(tag)!=0, "unknown failure from handler while processing tag '" << tag << "'" );
212 curopen--;
213 tagLineStarts.pop();
214 tags.pop();
215 }
216 }
217 else if (c2 == '?') {
218 // it is starting to look like an xml declaration
219 XMLPARSER_TFE( assertChar('x') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
220 XMLPARSER_TFE( assertChar('m') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
221 XMLPARSER_TFE( assertChar('l') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
222 ignoreXMLDeclaration();
223 }
224 else if (c2 == '!') {
225 // it is starting to look like a comment; we need '--'
226 // if we don't get this, it means
227 // * the document is not well-formed
228 // * the document employs a feature not supported by this parser,
229 // e.g. <!ELEMENT... <!ATTLIST... <!DOCTYPE... <![CDATA[...
230 XMLPARSER_TFE( assertChar('-') != 0 , "element not well-formed or exploits unsupported feature" );
231 XMLPARSER_TFE( assertChar('-') != 0 , "element not well-formed or exploits unsupported feature" );
232 getComment(_lineNo);
233 }
234 else {
235 XMLPARSER_TFE(true, "element not well-formed or exploits unsupported feature" );
236 }
237 }
238 else if ( (curopen > 0) && (c1 == '&') ) {
239 std::string chars = "";
240 getReference(chars);
241 handler->characters(chars);
242 }
243 else if ( (curopen > 0) ) {
244 std::string chars = "";
245 chars.push_back(c1);
246 handler->characters(chars);
247 }
248 else {
249 XMLPARSER_TFE(1 , "document not well-formed: character data outside of an enclosing tag");
250 }
251 }
252
253 XMLPARSER_TFE( curopen != 0 , "file ended before closing element '" << tags.top() << "' from line " << tagLineStarts.top() );
254
255 return handler->getObject();
256
257}
258
259
260void XMLParser::getETag(std::string &tag)
261{
262 /* Recall from the specification:
263 ETag ::= '</' Name S? '>'
264 Name ::= (Letter | '_' | ':') (NameChar)*
265
266 We have already consumed: </
267 */
268
269 bool tagover = false;
270 unsigned char c;
271 // clear tag
272 tag = "";
273 XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before end element was terminated");
274 XMLPARSER_TFE( !isLetter(c) && c!='_' && c!=':' , "tag not well-formed");
275 tag.push_back(c);
276 while (1) {
277 XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before end element was terminated");
278 if ( isNameChar(c) ) {
279 if (tagover) {
280 XMLPARSER_TFE(1, "end element not well-formed: expected '>'");
281 }
282 tag.push_back(c);
283 }
284 else if (isSpace(c)) {
285 // mark the end of the tag and consume the whitespace
286 // if it is ia newline, it isn't an error
287 if (c == '\n') ++_lineNo;
288 tagover = true;
289 }
290 else if (c == '>') {
291 break;
292 }
293 else {
294 XMLPARSER_TFE(1, "end element not well-formed");
295 }
296 }
297}
298
299
300void XMLParser::getSTag(unsigned char lookahead, std::string &tag, Teuchos::map<std::string,string> &attrs, bool &emptytag)
301{
302
303 /* Recall from the specification:
304
305 STag ::= '<' Name (S Attribute)* S? '>'
306 EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
307 Name ::= (Letter | '_' | ':') (NameChar)*
308 NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
309
310 S ::= (#x20 | #x9 | #xD | #xA)+
311 Attribute ::= Name Eq AttValue
312 Eq ::= S? '=' S?
313 AttValue ::= '"' ([^<&"] | Reference)* '"'
314 | "'" ([^<&'] | Reference)* "'"
315 Reference ::= EntityRef | CharRef
316 CharRef ::= '&#' [0-9]+ ';'
317 EntityRef ::= '&' Name ';'
318
319 We have already consumed: <lookahead
320 */
321
322 unsigned char c;
323 attrs.clear();
324
325 tag = lookahead;
326 // get the rest of the tag: (NameChar)*
327 while (1) {
328 XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before start element was terminated");
329 if (isNameChar(c)) {
330 tag.push_back(c);
331 }
332 else {
333 break;
334 }
335 }
336
337 // after the name: should be one of the following
338 // (S Attribute) | S? '>' | S? '/>'
339 do {
340
341 bool hadspace = false;
342
343 // if space, consume the whitespace
344 if ( isSpace(c) ) {
345 hadspace = true;
346 XMLPARSER_TFE( getSpace(c)!=0, "EOF before start element was terminated");
347 }
348
349 // now, either Attribute | '>' | '/>'
350 if ( (isLetter(c) || c=='_' || c==':') && hadspace ) {
351
352 // Attribute
353 // get attribute name, starting with contents of c
354 std::string attname, attval;
355 attname = c;
356 do {
357 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
358 if ( isNameChar(c) ) {
359 attname.push_back(c);
360 }
361 else if ( isSpace(c) || c=='=' ) {
362 break;
363 }
364 else {
365 XMLPARSER_TFE(1, "attribute not well-formed: expected whitespace or '='");
366 }
367 } while (1);
368
369 // if whitespace, consume it
370 if (isSpace(c)) {
371 getSpace(c);
372 }
373 // should be on '='
374 if (c != '=') {
375 XMLPARSER_TFE(1, "attribute not well-formed: expected '='");
376 }
377
378 // get any whitespace following the '='
379 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
380 if (isSpace(c)) {
381 getSpace(c);
382 }
383
384 // now get the quoted attribute value
385 bool apost;
386 attval = "";
387 if (c == '\'') {
388 apost = true;
389 }
390 else if (c == '\"') {
391 apost = false;
392 }
393 else {
394 XMLPARSER_TFE(1, "attribute value must be quoted with either ''' or '\"'");
395 }
396 do {
397 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
398 if (apost && c=='\'') {
399 // end of attval
400 break;
401 }
402 else if (!apost && c=='\"') {
403 // end of attval
404 break;
405 }
406 else if ( c == '&' ) {
407 // finish: need to add support for Reference
408 std::string refstr;
409 getReference(refstr);
410 attval += refstr;
411 }
412 else if ( c!='<' ) {
413 // valid character for attval
414 attval.push_back(c);
415 }
416 else {
417 XMLPARSER_TFE(1, "invalid character in attribute value");
418 }
419 } while(1);
420
421 // add attribute to list
422 XMLPARSER_TFE( attrs.find(attname) != attrs.end() , "cannot have two attributes with the same name");
423 attrs[attname] = attval;
424 }
425 else if (c == '>') {
426 emptytag = false;
427 break;
428 }
429 else if (c == '/') {
430 XMLPARSER_TFE(assertChar('>')!=0, "empty element tag not well-formed: expected '>'");
431 emptytag = true;
432 break;
433 }
434 else {
435 XMLPARSER_TFE(1, "start element not well-formed: invalid character");
436 }
437
438 // get next char
439 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
440
441 } while(1);
442}
443
444
445void XMLParser::getComment(long /* startLine */)
446{
447 /* Recall from the specification:
448 Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
449 that is, '<!--' txt '-->', where txt does not contain '--'
450 We have already consumed: <!--
451
452 Be wary here of the fact that c=='-' implies isChar(c)
453 */
454 unsigned char c;
455 while (1) {
456 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating comment begun at line " << _lineNo );
457 if (c == '\n') ++_lineNo;
458 // if we have a -
459 if (c=='-') {
460 // then it must be the end of the comment or be a Char
461 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating comment begun at line " << _lineNo );
462 if (c == '\n') ++_lineNo;
463 if (c=='-') {
464 // this had better be leading to the end of the comment
465 XMLPARSER_TFE( assertChar('>')!=0, "comment not well-formed: missing expected '>' at line " << _lineNo );
466 break;
467 }
468 else if (!isChar(c)) {
469 XMLPARSER_TFE(1, "comment not well-formed: invalid character at line " << _lineNo );
470 }
471 }
472 else if (!isChar(c)) {
473 XMLPARSER_TFE(1, "comment not well-formed: invalid character at line " << _lineNo );
474 }
475 }
476}
477
478
479void XMLParser::getReference(std::string &refstr) {
480 // finish: does CharRef support only dec, or hex as well?
481 unsigned char c;
482 unsigned int num, base;
483 refstr = "";
484 // none of these bytes read are allowed to be a newline, so don't do any incrementing of _lineNo
485 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
486 if (c == '#') {
487 // get a CharRef
488 // CharRef ::= '&#' [0-9]+ ';'
489 // | '&#x' [0-9]+ ';'
490 // get first number
491 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
492 if (c == 'x') {
493 base = 16;
494 num = 0;
495 }
496 else if ('0' <= c && c <= '9') {
497 base = 10;
498 num = c - '0';
499 }
500 else {
501 XMLPARSER_TFE(1, "invalid character in character reference: expected 'x' or [0-9]");
502 }
503
504 do {
505 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
506 XMLPARSER_TFE( c != ';' && !('0' <= c && c <= '9') , "invalid character in character reference: expected [0-9] or ';'");
507 if (c == ';') {
508 break;
509 }
510 num = num*base + (c-'0');
511 } while (1);
512 XMLPARSER_TFE(num > 0xFF, "character reference value out of range");
513 refstr.push_back( (unsigned char)num );
514 }
515 else if (isLetter(c) || c=='_' || c==':') {
516 // get an EntityRef
517 // EntityRef ::= '&' Name ';'
518 std::string entname = "";
519 entname.push_back(c);
520 do {
521 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
522 if (c==';') {
523 break;
524 }
525 else if ( isLetter(c) || ('0' <= c && c <= '9')
526 || c=='.' || c=='-' || c=='_' || c==':'
527 || c==0xB7 ) {
528 entname.push_back(c);
529 }
530 else {
531 XMLPARSER_TFE(1, "entity reference not well-formed: invalid character");
532 }
533 } while (1);
534 XMLPARSER_TFE( _entities.find(entname) == _entities.end(), "entity reference not well-formed: undefined entity");
535 refstr = _entities[entname];
536 }
537 else {
538 XMLPARSER_TFE(1, "reference not well-formed: expected name or '#'");
539 }
540}
541
542
543int XMLParser::getSpace(unsigned char &lookahead) {
544 // if space, consume the whitespace
545 do {
546 if (lookahead == '\n') ++_lineNo;
547 if (_is->readBytes(&lookahead,1) < 1) {
548 return 1; // inform caller that we reached the end
549 }
550 }
551 while (isSpace(lookahead));
552 return 0;
553}
554
555
556bool XMLParser::isLetter(unsigned char c) {
557 if ( (0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) ||
558 (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) ||
559 (0xF8 <= c) /* unsigned char must be <= 0xFF */ )
560 {
561 return true;
562 }
563 return false;
564}
565
566
567bool XMLParser::isNameChar(unsigned char c) {
568 if ( isLetter(c) || ('0' <= c && c <= '9') ||
569 c=='.' || c=='-' || c=='_' || c==':' || c==0xB7 )
570 {
571 return true;
572 }
573 return false;
574}
575
576
577bool XMLParser::isSpace(unsigned char c) {
578 if ( c==0x20 || c==0x9 || c==0xD || c==0xA )
579 {
580 return true;
581 }
582 return false;
583}
584
585
586bool XMLParser::isChar(unsigned char c) {
587 if ( c==0x9 || c==0xA || c==0xD || 0x20 <= c) { // unsigned char must be <= 0xFF
588 return true;
589 }
590 return false;
591}
592
593
594int XMLParser::assertChar(unsigned char cexp)
595{
596 // pull the next character off the stream and verify that it is what is expected
597 // if not, return an error to the caller
598 unsigned char c;
599 // don't worry about newlines; assertChar is always wrapped in TEST_FOR_EXCEPTION, so we don't want to advance the line counter
600 if (_is->readBytes(&c,1) < 1) {
601 return 1;
602 }
603 if (c != cexp) {
604 return 2;
605 }
606 return 0;
607}
608
609void XMLParser::ignoreXMLDeclaration()
610{
611 /* Be a little lax on the spec here; read until we get to '?', then assert '>'
612 We have already consumed: <xml
613 */
614 unsigned char c;
615 while (1) {
616 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating XML declaration begun at line " << _lineNo );
617 if (c == '\n') ++_lineNo;
618 // if we have a -
619 if (c=='?') {
620 // this had better be leading to the end of the declaration
621 XMLPARSER_TFE( assertChar('>')!=0, "XML declaration not well-formed: missing expected '>' at line " << _lineNo );
622 break;
623 }
624 }
625}
Defines a class for assembling an XMLObject from XML input.
A class providing a simple XML parser. Methods can be overloaded to exploit external XML parsing libr...
Smart reference counting pointer class for automatic garbage collection.
TreeBuildingXMLHandler assembles a XMLObject from your XML input.
Representation of an XML data tree. XMLObject is a ref-counted handle to a XMLObjectImplem object,...
XMLObject parse()
Consume the XMLInputStream to build an XMLObject.
The Teuchos namespace contains all of the classes, structs and enums used by Teuchos,...
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Deprecated.