comparison orange/xml/PhobosXML.d @ 9:99c52d46822a

Serialization works now with D2, deserialization still doesn't work
author Jacob Carlborg <doob@me.com>
date Sat, 24 Jul 2010 18:58:18 +0200
parents
children d17ae98330bf
comparison
equal deleted inserted replaced
8:613a0bb20207 9:99c52d46822a
1 // Written in the D programming language.
2
3 /**
4 Classes and functions for creating and parsing XML
5
6 The basic architecture of this module is that there are standalone functions,
7 classes for constructing an XML document from scratch (Tag, Element and
8 Document), and also classes for parsing a pre-existing XML file (ElementParser
9 and DocumentParser). The parsing classes <i>may</i> be used to build a
10 Document, but that is not their primary purpose. The handling capabilities of
11 DocumentParser and ElementParser are sufficiently customizable that you can
12 make them do pretty much whatever you want.
13
14 Example: This example creates a DOM (Document Object Model) tree
15 from an XML file.
16 ------------------------------------------------------------------------------
17 import std.xml;
18 import std.stdio;
19 import std.string;
20
21 // books.xml is used in various samples throughout the Microsoft XML Core
22 // Services (MSXML) SDK.
23 //
24 // See http://msdn2.microsoft.com/en-us/library/ms762271(VS.85).aspx
25
26 void main()
27 {
28 string s = cast(string)std.file.read("books.xml");
29
30 // Check for well-formedness
31 check(s);
32
33 // Make a DOM tree
34 auto doc = new Document(s);
35
36 // Plain-print it
37 writeln(doc);
38 }
39 ------------------------------------------------------------------------------
40
41 Example: This example does much the same thing, except that the file is
42 deconstructed and reconstructed by hand. This is more work, but the
43 techniques involved offer vastly more power.
44 ------------------------------------------------------------------------------
45 import std.xml;
46 import std.stdio;
47 import std.string;
48
49 struct Book
50 {
51 string id;
52 string author;
53 string title;
54 string genre;
55 string price;
56 string pubDate;
57 string description;
58 }
59
60 void main()
61 {
62 string s = cast(string)std.file.read("books.xml");
63
64 // Check for well-formedness
65 check(s);
66
67 // Take it apart
68 Book[] books;
69
70 auto xml = new DocumentParser(s);
71 xml.onStartTag["book"] = (ElementParser xml)
72 {
73 Book book;
74 book.id = xml.tag.attr["id"];
75
76 xml.onEndTag["author"] = (in Element e) { book.author = e.text; };
77 xml.onEndTag["title"] = (in Element e) { book.title = e.text; };
78 xml.onEndTag["genre"] = (in Element e) { book.genre = e.text; };
79 xml.onEndTag["price"] = (in Element e) { book.price = e.text; };
80 xml.onEndTag["publish-date"] = (in Element e) { book.pubDate = e.text; };
81 xml.onEndTag["description"] = (in Element e) { book.description = e.text; };
82
83 xml.parse();
84
85 books ~= book;
86 };
87 xml.parse();
88
89 // Put it back together again;
90 auto doc = new Document(new Tag("catalog"));
91 foreach(book;books)
92 {
93 auto element = new Element("book");
94 element.tag.attr["id"] = book.id;
95
96 element ~= new Element("author", book.author);
97 element ~= new Element("title", book.title);
98 element ~= new Element("genre", book.genre);
99 element ~= new Element("price", book.price);
100 element ~= new Element("publish-date",book.pubDate);
101 element ~= new Element("description", book.description);
102
103 doc ~= element;
104 }
105
106 // Pretty-print it
107 writefln(join(doc.pretty(3),"\n"));
108 }
109 -------------------------------------------------------------------------------
110 Macros:
111 WIKI=Phobos/StdXml
112
113 Copyright: Copyright Janice Caron 2008 - 2009.
114 License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
115 Authors: Janice Caron
116
117 Copyright Janice Caron 2008 - 2009.
118 Distributed under the Boost Software License, Version 1.0.
119 (See accompanying file LICENSE_1_0.txt or copy at
120 http://www.boost.org/LICENSE_1_0.txt)
121 */
122 module orange.xml.PhobosXML;
123
124 version (Tango) {}
125 else
126 version = Phobos;
127
128 version (Phobos):
129
130 import std.array;
131 import std.string;
132 import std.encoding;
133 import orange.util.io;
134
135 immutable cdata = "<![CDATA[";
136
/**
 * A single name/value attribute, exposed as an Element so that it can be
 * handled through the same interface as other nodes.
 */
final class Attribute : Element
{
    private alias string tstring;

    private tstring name_;  // name exactly as given to the constructor
    private tstring value_; // value exactly as given to the constructor

    /**
     * Builds an attribute from a name/value pair.
     *
     * The name is also handed to the Element constructor, so the underlying
     * element's tag carries the attribute's name.
     *
     * Params:
     *      name = the attribute name
     *      value = the attribute value
     */
    this (tstring name, tstring value)
    {
        super(name);
        this.name_ = name;
        this.value_ = value;
    }

    /// Returns the name this attribute was constructed with.
    tstring name ()
    {
        return name_;
    }

    /// Returns the value this attribute was constructed with.
    tstring value ()
    {
        return value_;
    }
}
160
161 /*struct TagProxy
162 {
163 private alias string tstring;
164 private tstring name_;
165
166 private static TagProxy opCall (tstring name)
167 {
168 TagProxy tp;
169 tp.name_ = name;
170
171 return tp;
172 }
173
174 tstring name ()
175 {
176 return name_;
177 }
178 }*/
179
/**
 * Tests whether a code point is a legal XML character (production 2, Char).
 *
 * Accepted are tab, line feed, carriage return, U+0020 .. U+D7FF,
 * U+E000 .. U+FFFD and U+10000 .. U+10FFFF. Everything else (other control
 * characters, surrogates, U+FFFE, U+FFFF and values above U+10FFFF) is
 * rejected.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 *
 * Returns: true if c is a legal XML character
 */
bool isChar(dchar c) // rule 2
{
    // Below the surrogates: space and above, plus the three permitted
    // control characters.
    if (c <= 0xD7FF)
        return c >= 0x20 || c == 0xA || c == 0x9 || c == 0xD;

    // Surrogates (U+D800 .. U+DFFF) and anything beyond the Unicode range.
    if (c < 0xE000 || c > 0x10FFFF)
        return false;

    // Within this range the mask only matches U+FFFE and U+FFFF.
    return (c & 0x1FFFFE) != 0xFFFE;
}
211
unittest
{
    //  const CharTable=[0x9,0x9,0xA,0xA,0xD,0xD,0x20,0xD7FF,0xE000,0xFFFD,
    //        0x10000,0x10FFFF];

    // Boundary values on either side of every legal range.
    uint[] legal   = [0x9, 0xA, 0xD, 0x20, 0x4A /* 'J' */, 0xD7FF, 0xE000,
                      0xFFFD, 0x10000, 0x10FFFF];
    uint[] illegal = [0x8, 0xB, 0xC, 0xE, 0x1F, 0xD800, 0xDFFF, 0xFFFE,
                      0xFFFF, 0x110000];

    foreach (code; legal)
        assert( isChar(cast(dchar)code));

    foreach (code; illegal)
        assert(!isChar(cast(dchar)code));

    // NOTE(review): CharTable only appears as the comment above in this part
    // of the file - confirm it is declared somewhere before enabling this
    // debug identifier.
    debug (stdxml_TestHardcodedChecks)
    {
        foreach (c; 0 .. dchar.max + 1)
            assert(isChar(c) == lookup(CharTable, c));
    }
}
243
/**
 * Tests whether a code point is XML whitespace.
 *
 * XML recognises exactly four whitespace characters: space (U+0020), tab
 * (U+0009), line feed (U+000A) and carriage return (U+000D).
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 *
 * Returns: true if c is one of the four XML whitespace characters
 */
bool isSpace(dchar c)
{
    switch (c)
    {
        case '\u0020':
        case '\u0009':
        case '\u000A':
        case '\u000D':
            return true;
        default:
            return false;
    }
}
259
/**
 * Tests whether a code point is a digit according to the XML standard.
 *
 * ASCII '0' .. '9' are answered directly; every other code point is looked
 * up in DigitTable.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 *
 * Returns: true if c is an XML digit
 */
bool isDigit(dchar c)
{
    // Fast path for the ASCII digits; the table is only consulted otherwise.
    bool asciiDigit = c >= 0x0030 && c <= 0x0039;
    return asciiDigit || lookup(DigitTable,c);
}
275
unittest
{
    // Exhaustive cross-check of isDigit against DigitTable; only compiled
    // when the stdxml_TestHardcodedChecks debug identifier is set.
    debug (stdxml_TestHardcodedChecks)
    {
        foreach (code; 0 .. dchar.max + 1)
        {
            assert(isDigit(code) == lookup(DigitTable, code));
        }
    }
}
284
/**
 * Tests whether a code point is a letter according to the XML standard
 * (production 84): a letter is either an ideographic character or a base
 * character.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 *
 * Returns: true if c is an XML letter
 */
bool isLetter(dchar c) // rule 84
{
    // The ideographic test is tried first; the base-character test only
    // runs when it fails.
    if (isIdeographic(c))
        return true;

    return isBaseChar(c);
}
297
/**
 * Tests whether a code point is an ideographic character according to the
 * XML standard: U+3007, U+3021 .. U+3029 or U+4E00 .. U+9FA5.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 *
 * Returns: true if c is an XML ideographic character
 */
bool isIdeographic(dchar c)
{
    return c == 0x3007
        || (c >= 0x3021 && c <= 0x3029)
        || (c >= 0x4E00 && c <= 0x9FA5);
}
317
unittest
{
    // The single code point and both ends of each accepted range.
    dchar[] boundaries = ['\u4E00', '\u9FA5', '\u3007', '\u3021', '\u3029'];

    foreach (code; boundaries)
        assert(isIdeographic(code));

    // Exhaustive cross-check against IdeographicTable; only compiled when
    // the stdxml_TestHardcodedChecks debug identifier is set.
    debug (stdxml_TestHardcodedChecks)
    {
        foreach (c; 0 .. dchar.max + 1)
            assert(isIdeographic(c) == lookup(IdeographicTable, c));
    }
}
332
/**
 * Tests whether a code point is a base character according to the XML
 * standard. The answer is whatever lookup reports for BaseCharTable.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 *
 * Returns: true if c is an XML base character
 */
bool isBaseChar(dchar c)
{
    auto listed = lookup(BaseCharTable,c);
    return listed;
}
346
/**
 * Tests whether a code point is a combining character according to the XML
 * standard. The answer is whatever lookup reports for CombiningCharTable.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 *
 * Returns: true if c is an XML combining character
 */
bool isCombiningChar(dchar c)
{
    auto listed = lookup(CombiningCharTable,c);
    return listed;
}
360
/**
 * Tests whether a code point is an extender according to the XML standard.
 * The answer is whatever lookup reports for ExtenderTable.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 *
 * Returns: true if c is an XML extender
 */
bool isExtender(dchar c)
{
    auto listed = lookup(ExtenderTable,c);
    return listed;
}
373
/**
 * Escapes the five characters that have predefined XML entities.
 *
 * Ampersand, double quote, apostrophe, less-than and greater-than are
 * replaced by &amp;amp;, &amp;quot;, &amp;apos;, &amp;lt; and &amp;gt;
 * respectively; decode() performs the reverse mapping. These functions are
 * provided for convenience only. You do not need to use them when using the
 * std.xml classes, because then all the encoding and decoding will be done
 * for you automatically.
 *
 * If the string contains none of those characters, the original string is
 * returned unchanged.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *      s = The string to be encoded
 *      buffer = (optional) storage handed to the appender that collects the
 *               output; its length is reset to zero before use
 *
 * Returns: The encoded string
 *
 * Examples:
 * --------------
 * writefln(encode("a > b")); // writes "a &gt; b"
 * --------------
 */
S encode(S)(S s, S buffer = null)
{
    string entity;
    size_t copiedUpTo; // index of the first input character not yet emitted
    if (buffer) buffer.length = 0;
    auto sink = appender(&buffer);

    foreach (index, ch; s)
    {
        // Choose the replacement; ordinary characters are skipped here and
        // copied later as part of a larger slice.
        if (ch == '&') entity = "&amp;";
        else if (ch == '"') entity = "&quot;";
        else if (ch == '\'') entity = "&apos;";
        else if (ch == '<') entity = "&lt;";
        else if (ch == '>') entity = "&gt;";
        else continue;

        // Emit the untouched run before this character, then the entity.
        sink.put(s[copiedUpTo .. index]);
        sink.put(entity);
        copiedUpTo = index + 1;
    }

    // Nothing was written, so nothing needed escaping.
    if (!sink.data) return s;
    sink.put(s[copiedUpTo .. $]);
    return sink.data;
}
426
unittest
{
    // A string with nothing to escape comes back as the very same slice.
    assert(encode("hello") is "hello");

    // One case per predefined entity: plain[i] must encode to escaped[i].
    string[] plain   = ["a > b", "a < b", "don't", "\"hi\"", "cat & dog"];
    string[] escaped = ["a &gt; b", "a &lt; b", "don&apos;t",
                        "&quot;hi&quot;", "cat &amp; dog"];

    foreach (n, text; plain)
        assert(encode(text) == escaped[n], encode(text));
}
436
/**
 * Mode to use for decoding.
 */
enum DecodeMode
{
    NONE,   /// Do not decode
    LOOSE,  /// Decode, but ignore errors
    STRICT  /// Decode, and throw exception on error
}
448
/**
 * Decodes a string by unescaping all predefined XML entities.
 *
 * encode() escapes certain characters (ampersand, quote, apostrophe, less-than
 * and greater-than), and similarly, decode() unescapes them. These functions
 * are provided for convenience only. You do not need to use them when using
 * the std.xml classes, because then all the encoding and decoding will be done
 * for you automatically.
 *
 * This function decodes the entities &amp;amp;, &amp;quot;, &amp;apos;,
 * &amp;lt; and &amp;gt;,
 * as well as decimal and hexadecimal entities such as &amp;#x20AC;
 *
 * If the string does not contain an ampersand, the original will be returned.
 *
 * Note that the "mode" parameter can be one of DecodeMode.NONE (do not
 * decode), DecodeMode.LOOSE (decode, but ignore errors), or DecodeMode.STRICT
 * (decode, and throw a DecodeException in the event of an error).
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *      s = The string to be decoded
 *      mode = (optional) Mode to use for decoding. (Defaults to LOOSE).
 *
 * Throws: DecodeException if mode == DecodeMode.STRICT and decode fails
 *
 * Returns: The decoded string
 *
 * Examples:
 * --------------
 * writefln(decode("a &gt; b")); // writes "a > b"
 * --------------
 */
string decode(string s, DecodeMode mode=DecodeMode.LOOSE)
{
    if (mode == DecodeMode.NONE) return s;

    // The buffer stays empty until the first '&' is met, so input without
    // any ampersand is returned as-is without copying.
    char[] buffer;

    // size_t rather than int: the index is compared with s.length and
    // assigned from a size_t expression below, which is a narrowing
    // conversion wherever size_t is wider than int.
    for (size_t i = 0; i < s.length; ++i)
    {
        char c = s[i];
        if (c != '&')
        {
            if (buffer.length != 0) buffer ~= c;
        }
        else
        {
            if (buffer.length == 0)
            {
                // First ampersand: start the copy with everything before it.
                buffer = s[0 .. i].dup;
            }

            string rest = s[i .. $];

            if (startsWith(rest,"&#"))
            {
                try
                {
                    dchar d;
                    string t = rest;
                    checkCharRef(t, d);
                    char[4] temp;
                    buffer ~= temp[0 .. std.utf.encode(temp, d)];
                    // t now starts after the reference; leave i on the
                    // reference's last character so that ++i moves past it.
                    i = s.length - t.length - 1;
                }
                catch(Err e)
                {
                    if (mode == DecodeMode.STRICT)
                        throw new DecodeException("Unescaped &");
                    // LOOSE: keep the ampersand literally and carry on.
                    buffer ~= '&';
                }
            }
            // For named entities i is advanced to the entity's final ';'.
            else if (startsWith(rest,"&amp;" )) { buffer ~= '&';  i += 4; }
            else if (startsWith(rest,"&quot;")) { buffer ~= '"';  i += 5; }
            else if (startsWith(rest,"&apos;")) { buffer ~= '\''; i += 5; }
            else if (startsWith(rest,"&lt;"  )) { buffer ~= '<';  i += 3; }
            else if (startsWith(rest,"&gt;"  )) { buffer ~= '>';  i += 3; }
            else
            {
                if (mode == DecodeMode.STRICT)
                    throw new DecodeException("Unescaped &");
                buffer ~= '&';
            }
        }
    }
    return (buffer.length == 0) ? s : cast(string)buffer;
}
534
unittest
{
    // Passes only if strict decoding of s throws a DecodeException.
    void assertNot(string s)
    {
        bool rejected = false;
        try { decode(s,DecodeMode.STRICT); }
        catch (DecodeException e) { rejected = true; }
        assert(rejected,s);
    }

    // Assert that things that should work, do
    assert(decode("hello", DecodeMode.STRICT) is "hello");

    string[] wellFormed = ["a &gt; b", "a &lt; b", "don&apos;t",
                           "&quot;hi&quot;", "cat &amp; dog", "&#42;",
                           "&#x2A;"];
    string[] decoded    = ["a > b", "a < b", "don't", "\"hi\"",
                           "cat & dog", "*", "*"];

    foreach (n, text; wellFormed)
        assert(decode(text, DecodeMode.STRICT) == decoded[n]);

    // Malformed input passes through unchanged in LOOSE mode ...
    string[] malformed = ["cat & dog", "a &gt b", "&#;", "&#x;", "&#2G;",
                          "&#x2G;"];

    foreach (text; malformed)
        assert(decode(text, DecodeMode.LOOSE) == text);

    // ... and is rejected in STRICT mode.
    // Assert that things that shouldn't work, don't
    foreach (text; malformed)
        assertNot(text);
}
569
570 /**
571 * Class representing an XML document.
572 *
573 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
574 *
575 */
class Document : Element
{
    /**
     * Contains all text which occurs before the root element.
     * Defaults to &lt;?xml version="1.0"?&gt;
     */
    string prolog = "<?xml version=\"1.0\"?>";

    /**
     * Contains all text which occurs after the root element.
     * Defaults to the empty string
     */
    string epilog;

    /**
     * Constructs a Document by parsing XML text.
     *
     * This function creates a complete DOM (Document Object Model) tree.
     *
     * The input to this function MUST be valid XML.
     * This is enforced by DocumentParser's in contract.
     *
     * Params:
     *      s = the complete XML text.
     */
    this(string s)
    in
    {
        assert(s.length != 0);
    }
    body
    {
        auto parser = new DocumentParser(s);
        string rootTagText = parser.tag.tagString;

        this(parser.tag);

        // The root tag's text is a slice of s, so the pointer difference is
        // the length of everything that precedes the root element.
        prolog = s[0 .. rootTagText.ptr - s.ptr];

        parse(parser);

        // Whatever the parser still refers to once the root element has
        // been parsed becomes the epilog.
        epilog = *parser.s;
    }

    /**
     * Constructs a Document from a Tag.
     *
     * Params:
     *      tag = the start tag of the document.
     */
    this(const(Tag) tag)
    {
        super(tag);
    }

    const
    {
        /**
         * Compares two Documents for equality: prolog, element content and
         * epilog must all match.
         *
         * Examples:
         * --------------
         * Document d1,d2;
         * if (d1 == d2) { }
         * --------------
         */
        override bool opEquals(Object o)
        {
            const doc = toType!(const Document)(o);

            if (prolog != doc.prolog) return false;
            if (super != cast(const Element)doc) return false;
            if (epilog != doc.epilog) return false;
            return true;
        }

        /**
         * Compares two Documents, ordering by prolog, then by element
         * content, then by epilog.
         *
         * You should rarely need to call this function. It exists so that
         * Documents can be used as associative array keys.
         *
         * Examples:
         * --------------
         * Document d1,d2;
         * if (d1 < d2) { }
         * --------------
         */
        override int opCmp(Object o)
        {
            const doc = toType!(const Document)(o);

            if (prolog != doc.prolog)
                return prolog < doc.prolog ? -1 : 1;

            if (super != cast(const Element)doc)
                return super < cast(const Element)doc ? -1 : 1;

            if (epilog != doc.epilog)
                return epilog < doc.epilog ? -1 : 1;

            return 0;
        }

        /**
         * Returns the hash of a Document
         *
         * You should rarely need to call this function. It exists so that
         * Documents can be used as associative array keys.
         */
        override hash_t toHash()
        {
            // Chain the hashes from the inside out: element, epilog, prolog.
            auto elementHash = super.toHash;
            auto withEpilog = hash(epilog, elementHash);
            return hash(prolog, withEpilog);
        }

        /**
         * Returns the string representation of a Document. (That is, the
         * complete XML of a document).
         */
        override string toString()
        {
            return (prolog ~ super.toString) ~ epilog;
        }
    }
}
694
695 /**
696 * Class representing an XML element.
697 *
698 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
699 */
class Element : Item
{
    Tag tag;                      /// The start tag of the element
    Item[] items;                 /// The element's items
    Text[] texts;                 /// The element's text items
    CData[] cdatas;               /// The element's CData items
    Comment[] comments;           /// The element's comments
    ProcessingInstruction[] pis;  /// The element's processing instructions
    Element[] elements;           /// The element's child elements
    Element parent_;              /// The element this one was appended to, or null

    /**
     * Constructs an Element given a name and a string to be used as a Text
     * interior.
     *
     * Params:
     *      name = the name of the element.
     *      interior = (optional) the string interior.
     *
     * Examples:
     * -------------------------------------------------------
     * auto element = new Element("title","Serenity")
     *     // constructs the element <title>Serenity</title>
     * -------------------------------------------------------
     */
    this(string name, string interior=null)
    {
        this(new Tag(name));
        if (interior.length != 0) opCatAssign(new Text(interior));
    }

    /**
     * Constructs an Element from a Tag.
     *
     * The element receives its own copy of the tag (name, attributes and
     * raw tag text). The copy always starts out as an EMPTY tag; it is
     * switched to START when a non-empty item is appended.
     *
     * Params:
     *      tag = the start or empty tag of the element.
     */
    this(const(Tag) tag_)
    {
        this.tag = new Tag(tag_.name);
        tag.type = TagType.EMPTY;
        foreach(k,v;tag_.attr) tag.attr[k] = v;
        tag.tagString = tag_.tagString;
    }

    /// Returns the parent element, or null if there is none.
    Element parent ()
    {
        return parent_;
    }

    /// Sets the parent element and returns it.
    Element parent (Element parent)
    {
        return parent_ = parent;
    }

    /// Returns the element's tag name.
    string name ()
    {
        return tag.name;
    }

    /// Returns the element's decoded text; see text() for the conditions.
    string value ()
    {
        return text;
    }

    alias elements children;

    /**
     * Returns the tag's attributes as freshly created Attribute objects, in
     * the iteration order of the attribute associative array.
     */
    Attribute[] attributes ()
    {
        // Fill the array by index instead of allocating it, discarding it
        // with a [0 .. 0] slice and then appending.
        auto attrs = new Attribute[tag.attr.length];
        size_t next = 0;

        foreach (k, v ; tag.attr)
            attrs[next++] = new Attribute(k, v);

        return attrs;
    }

    /// Returns this element itself.
    Element query ()
    {
        return this;
    }

    /**
     * Sets an attribute on this element's tag.
     *
     * Params:
     *      prefix = not used by this implementation
     *      name = the attribute name
     *      value = (optional) the attribute value
     *
     * Returns: this element, so that calls can be chained
     */
    Element attribute (string prefix, string name, string value = null)
    {
        tag.attr[name] = value;

        return this;
    }

    /**
     * Append a text item to the interior of this element
     *
     * Params:
     *      item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * element ~= new Text("hello");
     * --------------
     */
    void opCatAssign(Text item)
    {
        texts ~= item;
        appendItem(item);
    }

    /**
     * Append a CData item to the interior of this element
     *
     * Params:
     *      item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * element ~= new CData("hello");
     * --------------
     */
    void opCatAssign(CData item)
    {
        cdatas ~= item;
        appendItem(item);
    }

    /**
     * Append a comment to the interior of this element
     *
     * Params:
     *      item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * element ~= new Comment("hello");
     * --------------
     */
    void opCatAssign(Comment item)
    {
        comments ~= item;
        appendItem(item);
    }

    /**
     * Append a processing instruction to the interior of this element
     *
     * Params:
     *      item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * element ~= new ProcessingInstruction("hello");
     * --------------
     */
    void opCatAssign(ProcessingInstruction item)
    {
        pis ~= item;
        appendItem(item);
    }

    /**
     * Append a complete element to the interior of this element
     *
     * The appended element's parent is set to this element.
     *
     * Params:
     *      item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * Element other = new Element("br");
     * element ~= other;
     *    // appends element representing <br />
     * --------------
     */
    void opCatAssign(Element item)
    {
        elements ~= item;
        // Record the parent link; without it parent() stays null for every
        // appended (and therefore every parsed) child.
        item.parent_ = this;
        appendItem(item);
    }

    // Adds the item to the ordered item list and turns an EMPTY tag into a
    // START tag as soon as the element holds something non-empty.
    private void appendItem(Item item)
    {
        items ~= item;
        if (tag.type == TagType.EMPTY && !item.isEmptyXML)
            tag.type = TagType.START;
    }

    // Installs handlers on the parser that append every text, CDATA,
    // comment, processing instruction and (recursively parsed) child
    // element to this element, then runs the parser.
    private void parse(ElementParser xml)
    {
        xml.onText = (string s) { opCatAssign(new Text(s)); };
        xml.onCData = (string s) { opCatAssign(new CData(s)); };
        xml.onComment = (string s) { opCatAssign(new Comment(s)); };
        xml.onPI = (string s) { opCatAssign(new ProcessingInstruction(s)); };

        xml.onStartTag[null] = (ElementParser xml)
        {
            auto e = new Element(xml.tag);
            e.parse(xml);
            opCatAssign(e);
        };

        xml.parse();
    }

    /**
     * Compares two Elements for equality
     *
     * Two elements are equal when they hold the same number of items and
     * the items are pairwise equal.
     * NOTE(review): the tags are not compared here although toHash mixes in
     * the tag's hash - confirm whether that is intended.
     *
     * Examples:
     * --------------
     * Element e1,e2;
     * if (e1 == e2) { }
     * --------------
     */
    override bool opEquals(Object o)
    {
        const element = toType!(const Element)(o);
        // size_t, not uint: array lengths do not fit uint where size_t is
        // 64 bits wide.
        size_t len = items.length;
        if (len != element.items.length) return false;
        for (size_t i=0; i<len; ++i)
        {
            if (!items[i].opEquals(element.items[i])) return false;
        }
        return true;
    }

    /**
     * Compares two Elements
     *
     * Items are compared pairwise; the first differing pair decides. If one
     * item list is a prefix of the other, the shorter one orders first.
     *
     * You should rarely need to call this function. It exists so that Elements
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * Element e1,e2;
     * if (e1 < e2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const element = toType!(const Element)(o);
        for (size_t i=0; ; ++i)
        {
            if (i == items.length && i == element.items.length) return 0;
            if (i == items.length) return -1;
            if (i == element.items.length) return 1;
            if (items[i] != element.items[i])
                return items[i].opCmp(element.items[i]);
        }
    }

    /**
     * Returns the hash of an Element
     *
     * You should rarely need to call this function. It exists so that Elements
     * can be used as associative array keys.
     */
    override hash_t toHash()
    {
        hash_t hash = tag.toHash;
        foreach(item;items) hash += item.toHash();
        return hash;
    }

    const
    {
        /**
         * Returns the decoded interior of an element.
         *
         * The element is assumed to contain text <i>only</i>. So, for
         * example, given XML such as "&lt;title&gt;Good &amp;amp;
         * Bad&lt;/title&gt;", will return "Good &amp; Bad".
         *
         * Params:
         *      mode = (optional) Mode to use for decoding. (Defaults to LOOSE).
         *
         * Throws: DecodeException if decode fails, or if the element holds
         * an item that is not a Text
         */
        string text(DecodeMode mode=DecodeMode.LOOSE)
        {
            string buffer;
            foreach(item;items)
            {
                Text t = cast(Text)item;
                if (t is null) throw new DecodeException(item.toString);
                buffer ~= decode(t.toString,mode);
            }
            return buffer;
        }

        /**
         * Returns an indented string representation of this item
         *
         * Params:
         *      indent = (optional) number of spaces by which to indent this
         *          element. Defaults to 2.
         */
        override string[] pretty(uint indent=2)
        {

            if (isEmptyXML || tag.isEmpty) return [ tag.toEmptyString ];

            // An element holding a single text item is kept on one line.
            if (items.length == 1)
            {
                Text t = cast(Text)(items[0]);
                if (t !is null)
                {
                    return [tag.toStartString ~ t.toString ~ tag.toEndString];
                }
            }

            // Otherwise every line produced by a child is right-justified
            // into a field that is indent characters wider than the line.
            string[] a = [ tag.toStartString ];
            foreach(item;items)
            {
                string[] b = item.pretty(indent);
                foreach(s;b)
                {
                    a ~= rjustify(s,s.length + indent);
                }
            }
            a ~= tag.toEndString;
            return a;
        }

        /**
         * Returns the string representation of an Element
         *
         * Examples:
         * --------------
         * auto element = new Element("br");
         * writefln(element.toString); // writes "<br />"
         * --------------
         */
        override string toString()
        {
            if (isEmptyXML || tag.isEmpty) return tag.toEmptyString;

            string buffer = tag.toStartString;
            foreach(item;items) { buffer ~= item.toString; }
            buffer ~= tag.toEndString;
            return buffer;
        }

        override bool isEmptyXML() { return false; } /// Returns false always
    }
}
1047
/**
 * Tag types.
 */
enum TagType
{
    START,  /// Used for start tags
    END,    /// Used for end tags
    EMPTY   /// Used for empty tags
}
1057
1058 /**
1059 * Class representing an XML tag.
1060 *
1061 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
1062 *
1063 * The class invariant guarantees
1064 * <ul>
1065 * <li> that $(B type) is a valid enum TagType value</li>
1066 * <li> that $(B name) consists of valid characters</li>
1067 * <li> that each attribute name consists of valid characters</li>
1068 * </ul>
1069 */
1070 class Tag
1071 {
1072 TagType type = TagType.START; /// Type of tag
1073 string name; /// Tag name
1074 string[string] attr; /// Associative array of attributes
1075 private string tagString;
1076
1077 invariant()
1078 {
1079 string s;
1080 string t;
1081
1082 assert(type == TagType.START
1083 || type == TagType.END
1084 || type == TagType.EMPTY);
1085
1086 s = name;
1087 try { checkName(s,t); }
1088 catch(Err e) { assert(false,"Invalid tag name:" ~ e.toString); }
1089
1090 foreach(k,v;attr)
1091 {
1092 s = k;
1093 try { checkName(s,t); }
1094 catch(Err e)
1095 { assert(false,"Invalid atrribute name:" ~ e.toString); }
1096 }
1097 }
1098
1099 /**
1100 * Constructs an instance of Tag with a specified name and type
1101 *
1102 * The constructor does not initialize the attributes. To initialize the
1103 * attributes, you access the $(B attr) member variable.
1104 *
1105 * Params:
1106 * name = the Tag's name
1107 * type = (optional) the Tag's type. If omitted, defaults to
1108 * TagType.START.
1109 *
1110 * Examples:
1111 * --------------
1112 * auto tag = new Tag("img",Tag.EMPTY);
1113 * tag.attr["src"] = "http://example.com/example.jpg";
1114 * --------------
1115 */
1116 this(string name, TagType type=TagType.START)
1117 {
1118 this.name = name;
1119 this.type = type;
1120 }
1121
1122 /* Private constructor (so don't ddoc this!)
1123 *
1124 * Constructs a Tag by parsing the string representation, e.g. "<html>".
1125 *
1126 * The string is passed by reference, and is advanced over all characters
1127 * consumed.
1128 *
1129 * The second parameter is a dummy parameter only, required solely to
1130 * distinguish this constructor from the public one.
1131 */
1132 private this(ref string s, bool dummy)
1133 {
1134 tagString = s;
1135 try
1136 {
1137 reqc(s,'<');
1138 if (optc(s,'/')) type = TagType.END;
1139 name = munch(s,"^/>"~whitespace);
1140 munch(s,whitespace);
1141 while(s.length > 0 && s[0] != '>' && s[0] != '/')
1142 {
1143 string key = munch(s,"^="~whitespace);
1144 munch(s,whitespace);
1145 reqc(s,'=');
1146 munch(s,whitespace);
1147 reqc(s,'"');
1148 string val = decode(munch(s,"^\""), DecodeMode.LOOSE);
1149 reqc(s,'"');
1150 munch(s,whitespace);
1151 attr[key] = val;
1152 }
1153 if (optc(s,'/'))
1154 {
1155 if (type == TagType.END) throw new TagException("");
1156 type = TagType.EMPTY;
1157 }
1158 reqc(s,'>');
1159 tagString.length = (s.ptr - tagString.ptr);
1160 }
1161 catch(XMLException e)
1162 {
1163 tagString.length = (s.ptr - tagString.ptr);
1164 throw new TagException(tagString);
1165 }
1166 }
1167
1168 const
1169 {
1170 /**
1171 * Compares two Tags for equality
1172 *
1173 * You should rarely need to call this function. It exists so that Tags
1174 * can be used as associative array keys.
1175 *
1176 * Examples:
1177 * --------------
1178 * Tag tag1,tag2
1179 * if (tag1 == tag2) { }
1180 * --------------
1181 */
1182 override bool opEquals(Object o)
1183 {
1184 const tag = toType!(const Tag)(o);
1185 return
1186 (name != tag.name) ? false : (
1187 (attr != tag.attr) ? false : (
1188 (type != tag.type) ? false : (
1189 true )));
1190 }
1191
1192 /**
1193 * Compares two Tags
1194 *
1195 * Examples:
1196 * --------------
1197 * Tag tag1,tag2
1198 * if (tag1 < tag2) { }
1199 * --------------
1200 */
1201 override int opCmp(Object o)
1202 {
1203 const tag = toType!(const Tag)(o);
1204 return
1205 ((name != tag.name) ? ( name < tag.name ? -1 : 1 ) :
1206 ((attr != tag.attr) ? ( attr < tag.attr ? -1 : 1 ) :
1207 ((type != tag.type) ? ( type < tag.type ? -1 : 1 ) :
1208 0 )));
1209 }
1210
1211 /**
1212 * Returns the hash of a Tag
1213 *
1214 * You should rarely need to call this function. It exists so that Tags
1215 * can be used as associative array keys.
1216 */
1217 override hash_t toHash()
1218 {
1219 hash_t hash = 0;
1220 foreach(dchar c;name) hash = hash * 11 + c;
1221 return hash;
1222 }
1223
1224 /**
1225 * Returns the string representation of a Tag
1226 *
1227 * Examples:
1228 * --------------
1229 * auto tag = new Tag("book",TagType.START);
1230 * writefln(tag.toString); // writes "<book>"
1231 * --------------
1232 */
1233 override string toString()
1234 {
1235 if (isEmpty) return toEmptyString();
1236 return (isEnd) ? toEndString() : toStartString();
1237 }
1238
private
{
    // Builds the common prefix of start and empty tags: "<name" followed by
    // one ` key="value"` pair per attribute, without the closing delimiter.
    string toNonEndString()
    {
        string result = "<" ~ name;
        foreach (key, val; attr)
        {
            // NOTE(review): the value is passed through decode() here; whether
            // that is right depends on how the parsing constructor (not
            // visible in this excerpt) stores attribute values -- confirm.
            result ~= format(" %s=\"%s\"", key, decode(val, DecodeMode.LOOSE));
        }
        return result;
    }

    // "<name attrs>"
    string toStartString()
    {
        return toNonEndString() ~ ">";
    }

    // "</name>" -- attributes are never written on an end tag.
    string toEndString()
    {
        return "</" ~ name ~ ">";
    }

    // "<name attrs />"
    string toEmptyString()
    {
        return toNonEndString() ~ " />";
    }
}
1255
1256 /**
1257 * Returns true if the Tag is a start tag
1258 *
1259 * Examples:
1260 * --------------
1261 * if (tag.isStart) { }
1262 * --------------
1263 */
bool isStart()
{
    // True exactly when this tag was created as TagType.START.
    return TagType.START == type;
}
1265
1266 /**
1267 * Returns true if the Tag is an end tag
1268 *
1269 * Examples:
1270 * --------------
1271 * if (tag.isEnd) { }
1272 * --------------
1273 */
bool isEnd()
{
    // True exactly when this tag was created as TagType.END.
    return TagType.END == type;
}
1275
1276 /**
1277 * Returns true if the Tag is an empty tag
1278 *
1279 * Examples:
1280 * --------------
1281 * if (tag.isEmpty) { }
1282 * --------------
1283 */
bool isEmpty()
{
    // True exactly when this tag was created as TagType.EMPTY.
    return TagType.EMPTY == type;
}
1285 }
1286 }
1287
1288 /**
1289 * Class representing a comment
1290 */
class Comment : Item
{
    private string content;

    /**
     * Construct a comment
     *
     * Params:
     *      content = the body of the comment
     *
     * Throws: CommentException if the comment body is illegal (contains "--"
     * or exactly equals "-")
     *
     * Examples:
     * --------------
     * auto item = new Comment("This is a comment");
     *    // constructs <!--This is a comment-->
     * --------------
     */
    this(string content)
    {
        // XML forbids "--" inside a comment body. The previous code searched
        // for "==" instead, so illegal bodies were accepted and harmless
        // ones containing "==" were rejected.
        if (content == "-" || content.indexOf("--") != -1)
            throw new CommentException(content);
        this.content = content;
    }

    /**
     * Compares two comments for equality
     *
     * Returns: true only if o is a Comment with identical content
     *
     * Examples:
     * --------------
     * Comment item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override bool opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Comment)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two comments
     *
     * You should rarely need to call this function. It exists so that Comments
     * can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to the ordering of the two contents.
     * If o is an Item that is not a Comment, 0 is returned (unchanged from
     * the previous behaviour).
     *
     * Examples:
     * --------------
     * Comment item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Comment)item;

        // The old expression `t !is null && (... ? -1 : 1 ...)` was evaluated
        // as a bool, so -1 collapsed to 1 and "less than" was never reported.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a Comment
     *
     * You should rarely need to call this function. It exists so that Comments
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this comment
     */
    override const string toString() { return "<!--" ~ content ~ "-->"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
1368
1369 /**
1370 * Class representing a Character Data section
1371 */
class CData : Item
{
    private string content;

    /**
     * Construct a character data section
     *
     * Params:
     *      content = the body of the character data segment
     *
     * Throws: CDataException if the segment body is illegal (contains "]]>")
     *
     * Examples:
     * --------------
     * auto item = new CData("<b>hello</b>");
     *    // constructs <![CDATA[<b>hello</b>]]>
     * --------------
     */
    this(string content)
    {
        // "]]>" would terminate the section early, so it cannot appear inside.
        if (content.indexOf("]]>") != -1) throw new CDataException(content);
        this.content = content;
    }

    /**
     * Compares two CDatas for equality
     *
     * Returns: true only if o is a CData with identical content
     *
     * Examples:
     * --------------
     * CData item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override bool opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(CData)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two CDatas
     *
     * You should rarely need to call this function. It exists so that CDatas
     * can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to the ordering of the two contents.
     * If o is an Item that is not a CData, 0 is returned (unchanged from
     * the previous behaviour).
     *
     * Examples:
     * --------------
     * CData item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(CData)item;

        // The old expression `t !is null && (... ? -1 : 1 ...)` was evaluated
        // as a bool, so -1 collapsed to 1 and "less than" was never reported.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a CData
     *
     * You should rarely need to call this function. It exists so that CDatas
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this CData section
     */
    override const string toString() { return cdata ~ content ~ "]]>"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
1447
1448 /**
1449 * Class representing a text (aka Parsed Character Data) section
1450 */
class Text : Item
{
    private string content;

    /**
     * Construct a text (aka PCData) section
     *
     * Params:
     *      content = the text. This function encodes the text before
     *      insertion, so it is safe to insert any text
     *
     * Examples:
     * --------------
     * auto item = new Text("a < b");
     *    // constructs a &lt; b
     * --------------
     */
    this(string content)
    {
        // The stored form is the encoded text, so toString() can return it
        // unchanged.
        this.content = encode(content);
    }

    /**
     * Compares two text sections for equality
     *
     * Returns: true only if o is a Text with identical (encoded) content
     *
     * Examples:
     * --------------
     * Text item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override bool opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Text)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two text sections
     *
     * You should rarely need to call this function. It exists so that Texts
     * can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to the ordering of the two contents.
     * If o is an Item that is not a Text, 0 is returned (unchanged from
     * the previous behaviour).
     *
     * Examples:
     * --------------
     * Text item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Text)item;

        // The old expression `t !is null && (... ? -1 : 1 ...)` was evaluated
        // as a bool, so -1 collapsed to 1 and "less than" was never reported.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a text section
     *
     * You should rarely need to call this function. It exists so that Texts
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this Text section
     */
    override const string toString() { return content; }

    /**
     * Returns true if the content is the empty string
     */
    override const bool isEmptyXML() { return content.length == 0; }
}
1527
1528 /**
1529 * Class representing an XML Instruction section
1530 */
class XMLInstruction : Item
{
    private string content;

    /**
     * Construct an XML Instruction section
     *
     * Params:
     *      content = the body of the instruction segment
     *
     * Throws: XIException if the segment body is illegal (contains ">")
     *
     * Examples:
     * --------------
     * auto item = new XMLInstruction("ATTLIST");
     *    // constructs <!ATTLIST>
     * --------------
     */
    this(string content)
    {
        // ">" would close the instruction early, so it cannot appear inside.
        if (content.indexOf(">") != -1) throw new XIException(content);
        this.content = content;
    }

    /**
     * Compares two XML instructions for equality
     *
     * Returns: true only if o is an XMLInstruction with identical content
     *
     * Examples:
     * --------------
     * XMLInstruction item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override bool opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(XMLInstruction)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two XML instructions
     *
     * You should rarely need to call this function. It exists so that
     * XMLInstructions can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to the ordering of the two contents.
     * If o is an Item that is not an XMLInstruction, 0 is returned
     * (unchanged from the previous behaviour).
     *
     * Examples:
     * --------------
     * XMLInstruction item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(XMLInstruction)item;

        // The old expression `t !is null && (... ? -1 : 1 ...)` was evaluated
        // as a bool, so -1 collapsed to 1 and "less than" was never reported.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of an XMLInstruction
     *
     * You should rarely need to call this function. It exists so that
     * XMLInstructions can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this XMLInstruction
     */
    override const string toString() { return "<!" ~ content ~ ">"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
1606
1607 /**
1608 * Class representing a Processing Instruction section
1609 */
class ProcessingInstruction : Item
{
    private string content;

    /**
     * Construct a Processing Instruction section
     *
     * Params:
     *      content = the body of the instruction segment
     *
     * Throws: PIException if the segment body is illegal (contains "?>")
     *
     * Examples:
     * --------------
     * auto item = new ProcessingInstruction("php");
     *    // constructs <?php?>
     * --------------
     */
    this(string content)
    {
        // "?>" would close the instruction early, so it cannot appear inside.
        if (content.indexOf("?>") != -1) throw new PIException(content);
        this.content = content;
    }

    /**
     * Compares two processing instructions for equality
     *
     * Returns: true only if o is a ProcessingInstruction with identical
     * content
     *
     * Examples:
     * --------------
     * ProcessingInstruction item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override bool opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(ProcessingInstruction)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two processing instructions
     *
     * You should rarely need to call this function. It exists so that
     * ProcessingInstructions can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to the ordering of the two contents.
     * If o is an Item that is not a ProcessingInstruction, 0 is returned
     * (unchanged from the previous behaviour).
     *
     * Examples:
     * --------------
     * ProcessingInstruction item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(ProcessingInstruction)item;

        // The old expression `t !is null && (... ? -1 : 1 ...)` was evaluated
        // as a bool, so -1 collapsed to 1 and "less than" was never reported.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a ProcessingInstruction
     *
     * You should rarely need to call this function. It exists so that
     * ProcessingInstructions can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this ProcessingInstruction
     */
    override const string toString() { return "<?" ~ content ~ "?>"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
1685
1686 /**
1687 * Abstract base class for XML items
1688 */
abstract class Item
{
    /// Equality test against another Item; implemented by each subclass
    abstract override bool opEquals(Object o);

    /// Ordering against another Item; implemented by each subclass
    abstract override int opCmp(Object o);

    /// Hash value of this item; implemented by each subclass
    abstract override hash_t toHash();

    /// XML text of this item; implemented by each subclass
    abstract override const string toString();

    /**
     * Returns an indented string representation of this item
     *
     * This base implementation yields the stripped toString() text as a
     * single line, or no lines at all when that text is empty. The indent
     * argument is not used here.
     *
     * Params:
     *      indent = number of spaces by which to indent child elements
     */
    const string[] pretty(uint indent)
    {
        string line = strip(toString());
        if (line.length == 0)
            return [];
        return [ line ];
    }

    /// Returns true if the item represents empty XML text
    abstract const bool isEmptyXML();
}
1718
1719 /**
1720 * Class for parsing an XML Document.
1721 *
1722 * This is a subclass of ElementParser. Most of the useful functions are
1723 * documented there.
1724 *
1725 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
1726 *
1727 * Bugs:
1728 * Currently only supports UTF documents.
1729 *
1730 * If there is an encoding attribute in the prolog, it is ignored.
1731 *
1732 */
class DocumentParser : ElementParser
{
    // Owned copy of the document text; the inherited cursor `s` points at it.
    string xmlText;

    /**
     * Constructs a DocumentParser.
     *
     * The input to this function MUST be valid XML.
     * This is enforced by the function's in contract.
     *
     * Params:
     *      xmlText_ = the entire XML document as text
     *
     */
    this(string xmlText_)
    in
    {
        assert(xmlText_.length != 0);
        try
        {
            // Run the well-formedness checker over the whole input
            check(xmlText_);
        }
        catch (CheckException e)
        {
            // Surface the checker's explanation through the assertion
            assert(false, "\n" ~ e.toString());
        }
    }
    body
    {
        xmlText = xmlText_;
        // The base-class constructor dereferences `s`, so the cursor must be
        // pointed at the text before super() runs.
        s = &xmlText;
        super();    // Initialize everything
        parse();    // Parse through the root tag (but not beyond)
    }
}
1770
1771 /**
1772 * Class for parsing an XML element.
1773 *
1774 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
1775 *
1776 * Note that you cannot construct instances of this class directly. You can
1777 * construct a DocumentParser (which is a subclass of ElementParser), but
1778 * otherwise, Instances of ElementParser will be created for you by the
1779 * library, and passed your way via onStartTag handlers.
1780 *
1781 */
class ElementParser
{
    alias void delegate(string) Handler;
    alias void delegate(in Element element) ElementHandler;
    alias void delegate(ElementParser parser) ParserHandler;

    private
    {
        Tag tag_;               // tag of the element being parsed
        string elementStart;    // remaining input as it was at construction
        string* s;              // shared cursor into the unparsed input

        Handler commentHandler = null;
        Handler cdataHandler = null;
        Handler xiHandler = null;
        Handler piHandler = null;
        Handler rawTextHandler = null;
        Handler textHandler = null;

        // Private constructor for start tags
        this(ElementParser parent)
        {
            s = parent.s;
            this();
            tag_ = parent.tag_;
        }

        // Private constructor for empty tags
        this(Tag tag, string* t)
        {
            s = t;
            this();
            tag_ = tag;
        }
    }

    /**
     * The Tag at the start of the element being parsed. You can read this to
     * determine the tag's name and attributes.
     */
    const const(Tag) tag() { return tag_; }

    /**
     * Register a handler which will be called whenever a start tag is
     * encountered which matches the specified name. You can also pass null as
     * the name, in which case the handler will be called for any unmatched
     * start tag.
     *
     * Examples:
     * --------------
     * // Call this function whenever a <podcast> start tag is encountered
     * onStartTag["podcast"] = (ElementParser xml)
     * {
     *     // Your code here
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     *
     * // call myEpisodeStartHandler (defined elsewhere) whenever an <episode>
     * // start tag is encountered
     * onStartTag["episode"] = &myEpisodeStartHandler;
     *
     * // call delegate dg for all other start tags
     * onStartTag[null] = dg;
     * --------------
     *
     * This library will supply your function with a new instance of
     * ElementParser, which may be used to parse inside the element whose
     * start tag was just found, or to identify the tag attributes of the
     * element, etc.
     *
     * Note that your function will be called for both start tags and empty
     * tags. That is, we make no distinction between &lt;br&gt;&lt;/br&gt;
     * and &lt;br/&gt;.
     */
    ParserHandler[string] onStartTag;

    /**
     * Register a handler which will be called whenever an end tag is
     * encountered which matches the specified name. You can also pass null as
     * the name, in which case the handler will be called for any unmatched
     * end tag.
     *
     * Examples:
     * --------------
     * // Call this function whenever a </podcast> end tag is encountered
     * onEndTag["podcast"] = (in Element e)
     * {
     *     // Your code here
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     *
     * // call myEpisodeEndHandler (defined elsewhere) whenever an </episode>
     * // end tag is encountered
     * onEndTag["episode"] = &myEpisodeEndHandler;
     *
     * // call delegate dg for all other end tags
     * onEndTag[null] = dg;
     * --------------
     *
     * Note that your function will be called for both end tags and empty
     * tags. That is, we make no distinction between &lt;br&gt;&lt;/br&gt;
     * and &lt;br/&gt;.
     */
    ElementHandler[string] onEndTag;

    protected this()
    {
        // Remember the input as it stood when parsing of this element began;
        // toString() reports the consumed prefix of it.
        elementStart = *s;
    }

    /**
     * Register a handler which will be called whenever text is encountered.
     *
     * Examples:
     * --------------
     * // Call this function whenever text is encountered
     * onText = (string s)
     * {
     *     // Your code here
     *
     *     // The passed parameter s will have been decoded by the time you see
     *     // it, and so may contain any character.
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     * --------------
     */
    void onText(Handler handler) { textHandler = handler; }

    /**
     * Register an alternative handler which will be called whenever text
     * is encountered. This differs from onText in that onText will decode
     * the text, whereas onTextRaw will not. This allows you to make design
     * choices, since onText will be more accurate, but slower, while
     * onTextRaw will be faster, but less accurate. Of course, you can
     * still call decode() within your handler, if you want, but you'd
     * probably want to use onTextRaw only in circumstances where you
     * know that decoding is unnecessary.
     *
     * If both a raw and a decoding text handler are registered, only the
     * raw handler is called.
     *
     * Examples:
     * --------------
     * // Call this function whenever text is encountered
     * onTextRaw = (string s)
     * {
     *     // Your code here
     *
     *     // The passed parameter s will NOT have been decoded.
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     * --------------
     */
    void onTextRaw(Handler handler) { rawTextHandler = handler; }

    /**
     * Register a handler which will be called whenever a character data
     * segment is encountered.
     *
     * Examples:
     * --------------
     * // Call this function whenever a CData section is encountered
     * onCData = (string s)
     * {
     *     // Your code here
     *
     *     // The passed parameter s does not include the opening <![CDATA[
     *     // nor closing ]]>
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     * --------------
     */
    void onCData(Handler handler) { cdataHandler = handler; }

    /**
     * Register a handler which will be called whenever a comment is
     * encountered.
     *
     * Examples:
     * --------------
     * // Call this function whenever a comment is encountered
     * onComment = (string s)
     * {
     *     // Your code here
     *
     *     // The passed parameter s does not include the opening <!-- nor
     *     // closing -->
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     * --------------
     */
    void onComment(Handler handler) { commentHandler = handler; }

    /**
     * Register a handler which will be called whenever a processing
     * instruction is encountered.
     *
     * Examples:
     * --------------
     * // Call this function whenever a processing instruction is encountered
     * onPI = (string s)
     * {
     *     // Your code here
     *
     *     // The passed parameter s does not include the opening <? nor
     *     // closing ?>
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     * --------------
     */
    void onPI(Handler handler) { piHandler = handler; }

    /**
     * Register a handler which will be called whenever an XML instruction is
     * encountered.
     *
     * Examples:
     * --------------
     * // Call this function whenever an XML instruction is encountered
     * // (Note: XML instructions may only occur preceding the root tag of a
     * // document).
     * onXI = (string s)
     * {
     *     // Your code here
     *
     *     // The passed parameter s does not include the opening <! nor
     *     // closing >
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     * --------------
     */
    void onXI(Handler handler) { xiHandler = handler; }

    /**
     * Parse an XML element.
     *
     * Parsing will continue until the end of the current element. Any items
     * encountered for which a handler has been registered will invoke that
     * handler.
     *
     * Throws: various kinds of XMLException
     */
    void parse()
    {
        string t;
        Tag root = tag_;
        Tag[string] startTags;
        if (tag_ !is null) startTags[tag_.name] = tag_;

        while(s.length != 0)
        {
            // The order of the tests matters: "<!--" and "<![CDATA[" must be
            // recognised before the more general "<!" prefix.
            if (startsWith(*s,"<!--"))
            {
                chop(*s,4);
                t = chop(*s,indexOf(*s,"-->"));
                if (commentHandler.funcptr !is null) commentHandler(t);
                chop(*s,3);
            }
            else if (startsWith(*s,"<![CDATA["))
            {
                chop(*s,9);
                t = chop(*s,indexOf(*s,"]]>"));
                if (cdataHandler.funcptr !is null) cdataHandler(t);
                chop(*s,3);
            }
            else if (startsWith(*s,"<!"))
            {
                chop(*s,2);
                t = chop(*s,indexOf(*s,">"));
                if (xiHandler.funcptr !is null) xiHandler(t);
                chop(*s,1);
            }
            else if (startsWith(*s,"<?"))
            {
                chop(*s,2);
                t = chop(*s,indexOf(*s,"?>"));
                if (piHandler.funcptr !is null) piHandler(t);
                chop(*s,2);
            }
            else if (startsWith(*s,"<"))
            {
                tag_ = new Tag(*s,true);
                if (root is null)
                    return; // Return to constructor of derived class

                if (tag_.isStart)
                {
                    startTags[tag_.name] = tag_;

                    auto parser = new ElementParser(this);

                    // Prefer the handler registered for this tag name and
                    // fall back to the catch-all registered under null.
                    auto handler = tag_.name in onStartTag;
                    if (handler !is null) (*handler)(parser);
                    else
                    {
                        handler = null in onStartTag;
                        if (handler !is null) (*handler)(parser);
                    }
                }
                else if (tag_.isEnd)
                {
                    auto startTag = startTags[tag_.name];
                    string text;

                    // The element text is the slice of input lying between
                    // the end of the start tag and the start of the end tag.
                    immutable(char)* p = startTag.tagString.ptr
                        + startTag.tagString.length;
                    immutable(char)* q = tag_.tagString.ptr;
                    text = decode(p[0..(q-p)], DecodeMode.LOOSE);

                    auto element = new Element(startTag);
                    if (text.length != 0) element ~= new Text(text);

                    auto handler = tag_.name in onEndTag;
                    if (handler !is null) (*handler)(element);
                    else
                    {
                        handler = null in onEndTag;
                        if (handler !is null) (*handler)(element);
                    }

                    // Reaching the end tag of our own element ends the parse.
                    if (tag_.name == root.name) return;
                }
                else if (tag_.isEmpty)
                {
                    Tag startTag = new Tag(tag_.name);

                    // FIX by hed010gy, for bug 2979
                    // http://d.puremagic.com/issues/show_bug.cgi?id=2979
                    if (tag_.attr.length > 0)
                        foreach(tn,tv; tag_.attr) startTag.attr[tn]=tv;
                    // END FIX

                    // Handle the pretend start tag
                    string s2;
                    auto parser = new ElementParser(startTag,&s2);
                    auto handler1 = startTag.name in onStartTag;
                    if (handler1 !is null) (*handler1)(parser);
                    else
                    {
                        handler1 = null in onStartTag;
                        if (handler1 !is null) (*handler1)(parser);
                    }

                    // Handle the pretend end tag
                    auto element = new Element(startTag);
                    auto handler2 = tag_.name in onEndTag;
                    if (handler2 !is null) (*handler2)(element);
                    else
                    {
                        handler2 = null in onEndTag;
                        if (handler2 !is null) (*handler2)(element);
                    }
                }
            }
            else
            {
                // Plain text up to the next tag. A raw handler, when present,
                // takes precedence over the decoding handler.
                t = chop(*s,indexOf(*s,"<"));
                if (rawTextHandler.funcptr !is null)
                    rawTextHandler(t);
                else if (textHandler.funcptr !is null)
                    textHandler(decode(t,DecodeMode.LOOSE));
            }
        }
    }

    /**
     * Returns that part of the element which has already been parsed
     */
    const override string toString()
    {
        // Array lengths are size_t. Storing the difference in an int narrows
        // it, which does not compile on 64-bit targets.
        size_t n = elementStart.length - s.length;
        return elementStart[0..n];
    }

}
2170
2171 private
2172 {
// Mixin template used by every check* function below. It must be mixed into
// a scope that has a `ref string s` in view: it snapshots s on entry and
// provides fail() overloads that restore s to that snapshot before throwing
// an Err labelled with `msg` (the name of the grammar rule being checked).
// The names `old` and `fail` are used directly by the functions that mix
// this in, so they are part of the template's contract.
template Check(string msg)
{
    // Input as it was when the enclosing check function started
    string old = s;

    // Rewind the input and report failure of this rule
    void fail()
    {
        s = old;
        throw new Err(s,msg);
    }

    // Rewind the input and report failure of this rule, passing along the
    // error raised by a nested rule
    void fail(Err e)
    {
        s = old;
        throw new Err(s,msg,e);
    }

    // Report failure with a specific explanation. The inner Err is built
    // from the current (not yet rewound) input, then wrapped via fail(Err).
    void fail(string msg2)
    {
        fail(new Err(s,msg2));
    }
}
2194
void checkMisc(ref string s) // rule 27
{
    mixin Check!("Misc");

    // Misc is a comment, a processing instruction, or whitespace.
    try
    {
        if (s.startsWith("<!--"))
            checkComment(s);
        else if (s.startsWith("<?"))
            checkPI(s);
        else
            checkSpace(s);
    }
    catch (Err e)
    {
        fail(e);
    }
}
2207
void checkDocument(ref string s) // rule 1
{
    mixin Check!("Document");

    // A document is a prolog, exactly one root element, then any number of
    // trailing Misc items.
    try
    {
        checkProlog(s);
        checkElement(s);
        star!(checkMisc)(s);
    }
    catch (Err e)
    {
        fail(e);
    }
}
2219
void checkChars(ref string s) // rule 2
{
    // TO DO - Fix std.utf stride and decode functions, then use those
    // instead

    mixin Check!("Chars");

    // Locate the first code point that is not a legal XML character.
    int badIndex = -1;
    dchar badChar;
    foreach (int i, dchar d; s)
    {
        if (isChar(d)) continue;
        badChar = d;
        badIndex = i;
        break;
    }

    // Every character was legal: nothing to report.
    if (badIndex == -1) return;

    // Advance to the offending character so the inner error is positioned
    // there, then fail (which rewinds s and throws).
    s = s[badIndex..$];
    fail(format("invalid character: U+%04X",badChar));
}
2244
void checkSpace(ref string s) // rule 3
{
    mixin Check!("Whitespace");

    // Consume any run of space, tab, line feed and carriage return.
    munch(s,"\u0020\u0009\u000A\u000D");

    // At least one whitespace character is required: if the slice is still
    // the very same one we started with, nothing was consumed.
    if (s is old)
    {
        fail();
    }
}
2251
void checkName(ref string s, out string name) // rule 5
{
    mixin Check!("Name");

    if (s.length == 0) fail();

    // Index of the first character that is not part of the name. It defaults
    // to the whole input so that a name running to the very end of s is
    // still returned. Previously it defaulted to 0, which produced an empty
    // name and consumed nothing in that case.
    size_t n = s.length;
    foreach(size_t i,dchar c;s)
    {
        // Characters allowed anywhere in a name
        if (c == '_' || c == ':' || isLetter(c)) continue;
        // Anything else is illegal as the first character
        if (i == 0) fail();
        // Characters allowed only after the first position
        if (c == '-' || c == '.' || isDigit(c)
            || isCombiningChar(c) || isExtender(c)) continue;
        n = i;
        break;
    }
    name = s[0..n];
    s = s[n..$];
}
2270
void checkAttValue(ref string s) // rule 10
{
    mixin Check!("AttValue");

    if (s.length == 0) fail();

    // The value must open with a double or single quote; the same character
    // closes it.
    char quote = s[0];
    if (quote != '\u0022' && quote != '\u0027')
        fail("attribute value requires quotes");
    s = s[1..$];

    while (true)
    {
        // Skip ordinary characters up to '<', '&' or the closing quote.
        munch(s,"^<&"~quote);
        if (s.length == 0) fail("unterminated attribute value");
        if (s[0] == '<') fail("< found in attribute value");
        if (s[0] == quote) break;
        // Only '&' remains possible here: it must start a valid reference.
        try
        {
            checkReference(s);
        }
        catch (Err e)
        {
            fail(e);
        }
    }

    // Consume the closing quote.
    s = s[1..$];
}
2290
void checkCharData(ref string s) // rule 14
{
    mixin Check!("CharData");

    // Consume one code unit at a time until markup ('<'), a reference ('&')
    // or the end of input is reached. The sequence "]]>" is not allowed in
    // character data.
    for (; s.length != 0; s = s[1..$])
    {
        if (s.startsWith("&") || s.startsWith("<"))
            break;
        if (s.startsWith("]]>"))
            fail("]]> found within char data");
    }
}
2303
void checkComment(ref string s) // rule 15
{
    mixin Check!("Comment");

    // Opening delimiter
    try
    {
        checkLiteral("<!--",s);
    }
    catch (Err e)
    {
        fail(e);
    }

    // The first "--" after the opener must be the start of the closing
    // "-->"; anything else is rejected by the literal check below.
    int dashes = s.indexOf("--");
    if (dashes == -1) fail("unterminated comment");
    s = s[dashes..$];

    try
    {
        checkLiteral("-->",s);
    }
    catch (Err e)
    {
        fail(e);
    }
}
2314
void checkPI(ref string s) // rule 16
{
    mixin Check!("PI");

    // A processing instruction opens with "<?" and runs to the next "?>".
    try
    {
        checkLiteral("<?",s);
        checkEnd("?>",s);
    }
    catch (Err e)
    {
        fail(e);
    }
}
2326
void checkCDSect(ref string s) // rule 18
{
    mixin Check!("CDSect");

    // A CDATA section opens with the module-level `cdata` marker and runs
    // to the next "]]>".
    try
    {
        checkLiteral(cdata,s);
        checkEnd("]]>",s);
    }
    catch (Err e)
    {
        fail(e);
    }
}
2338
void checkProlog(ref string s) // rule 22
{
    mixin Check!("Prolog");

    // prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
    try
    {
        // The XML declaration is optional in XML 1.0. Requiring it made
        // well-formed documents that start directly with the root element
        // (or a comment) fail the check.
        opt!(checkXMLDecl)(s);
        star!(checkMisc)(s);
        opt!(seq!(checkDocTypeDecl,star!(checkMisc)))(s);
    }
    catch(Err e) { fail(e); }
}
2351
void checkXMLDecl(ref string s) // rule 23
{
    mixin Check!("XMLDecl");

    // "<?xml" VersionInfo EncodingDecl? SDDecl? S? "?>"
    try
    {
        checkLiteral("<?xml",s);
        checkVersionInfo(s);            // mandatory
        opt!(checkEncodingDecl)(s);     // optional
        opt!(checkSDDecl)(s);           // optional
        opt!(checkSpace)(s);            // optional
        checkLiteral("?>",s);
    }
    catch (Err e)
    {
        fail(e);
    }
}
2367
void checkVersionInfo(ref string s) // rule 24
{
    mixin Check!("VersionInfo");

    // Whitespace, the word "version", '=', then a quoted version number.
    try
    {
        checkSpace(s);
        checkLiteral("version",s);
        checkEq(s);
        quoted!(checkVersionNum)(s);
    }
    catch (Err e)
    {
        fail(e);
    }
}
2381
void checkEq(ref string s) // rule 25
{
    mixin Check!("Eq");

    // An '=' sign, optionally surrounded by whitespace on either side.
    try
    {
        opt!(checkSpace)(s);
        checkLiteral("=",s);
        opt!(checkSpace)(s);
    }
    catch (Err e)
    {
        fail(e);
    }
}
2394
void checkVersionNum(ref string s) // rule 26
{
    mixin Check!("VersionNum");

    // Consume letters, digits, '_', '.', ':' and '-'.
    munch(s,"a-zA-Z0-9_.:-");

    // At least one such character is required: an untouched slice means
    // nothing was consumed.
    if (s is old)
    {
        fail();
    }
}
2402
void checkDocTypeDecl(ref string s) // rule 28
{
    mixin Check!("DocTypeDecl");

    try
    {
        checkLiteral("<!DOCTYPE",s);
        //
        // TO DO -- ensure DOCTYPE is well formed
        // (But not yet. That's one of our "future directions")
        //
        // For now the declaration body is not inspected: everything up to
        // the next '>' is accepted.
        checkEnd(">",s);
    }
    catch (Err e)
    {
        fail(e);
    }
}
2418
void checkSDDecl(ref string s) // rule 32
{
    mixin Check!("SDDecl");

    // Whitespace, the word "standalone", then '='.
    try
    {
        checkSpace(s);
        checkLiteral("standalone",s);
        checkEq(s);
    }
    catch (Err e)
    {
        fail(e);
    }

    // The value must be yes or no, quoted with matching single or double
    // quotes; `consumed` is the length of the quoted value.
    int consumed = 0;
    if (s.startsWith("'yes'") || s.startsWith("\"yes\""))
    {
        consumed = 5;
    }
    else if (s.startsWith("'no'" ) || s.startsWith("\"no\"" ))
    {
        consumed = 4;
    }
    else
    {
        fail("standalone attribute value must be 'yes', \"yes\","
            " 'no' or \"no\"");
    }
    s = s[consumed..$];
}
2438
void checkElement(ref string s) // rule 39
{
    mixin Check!("Element");

    string sname,ename,t;
    try
    {
        checkTag(s,t,sname);
    }
    catch (Err e)
    {
        fail(e);
    }

    // checkTag reports "STag" for a start tag; any other kind (an empty
    // element tag) has no content or end tag to verify.
    if (t != "STag") return;

    try
    {
        checkContent(s);
        // Remember where the end tag begins so that a name mismatch can be
        // reported at that position.
        t = s;
        checkETag(s,ename);
    }
    catch (Err e)
    {
        fail(e);
    }

    if (sname != ename)
    {
        s = t;
        fail("end tag name \"" ~ ename
            ~ "\" differs from start tag name \""~sname~"\"");
    }
}
2464
2465 // rules 40 and 44
void checkTag(ref string s, out string type, out string name)
{
    mixin Check!("Tag");

    try
    {
        // Assume a start tag until a trailing '/' shows otherwise.
        type = "STag";
        checkLiteral("<",s);
        checkName(s,name);
        star!(seq!(checkSpace,checkAttribute))(s);
        opt!(checkSpace)(s);

        // A '/' just before '>' marks an empty-element tag, which this
        // function reports with the type string "ETag".
        immutable bool selfClosing = s.length != 0 && s[0] == '/';
        if (selfClosing)
        {
            s = s[1..$];
            type = "ETag";
        }
        checkLiteral(">",s);
    }
    catch (Err e)
    {
        fail(e);
    }
}
2486
void checkAttribute(ref string s) // rule 41
{
    mixin Check!("Attribute");

    // Name '=' AttValue. The parsed name is only validated, not returned.
    try
    {
        string attrName;
        checkName(s,attrName);
        checkEq(s);
        checkAttValue(s);
    }
    catch (Err e)
    {
        fail(e);
    }
}
2500
void checkETag(ref string s, out string name) // rule 42
{
    mixin Check!("ETag");

    // "</" Name S? ">" -- the tag name is handed back through `name`.
    try
    {
        checkLiteral("</",s);
        checkName(s,name);
        opt!(checkSpace)(s);
        checkLiteral(">",s);
    }
    catch (Err e)
    {
        fail(e);
    }
}
2514
void checkContent(ref string s) // rule 43
{
    mixin Check!("Content");

    try
    {
        while (s.length != 0)
        {
            // Move the rewind point forward so that a failure is reported
            // at the item that caused it, not at the start of the content.
            old = s;

            // An end tag terminates the content and is left for the caller.
            if (s.startsWith("</")) break;

            // The more specific "<..." prefixes are tested before the
            // plain "<" that introduces a child element.
            if (s.startsWith("&"))
                checkReference(s);
            else if (s.startsWith("<!--"))
                checkComment(s);
            else if (s.startsWith("<?"))
                checkPI(s);
            else if (s.startsWith(cdata))
                checkCDSect(s);
            else if (s.startsWith("<"))
                checkElement(s);
            else
                checkCharData(s);
        }
    }
    catch (Err e)
    {
        fail(e);
    }
}
2535
// Checks a character reference, "&#" + decimal digits + ";" or
// "&#x" + hexadecimal digits + ";", and stores the referenced value in c.
//
// Params:
//   s = input; advanced past the reference on success
//   c = receives the numeric value of the reference
//
// Fails (via fail from mixin Check) when the reference is unterminated, has
// no digits, exceeds U+10FFFF, is rejected by isChar, or lacks the final ';'.
void checkCharRef(ref string s, out dchar c) // rule 66
{
    mixin Check!("CharRef");

    c = 0;
    try { checkLiteral("&#",s); } catch(Err e) { fail(e); }
    int radix = 10;
    if (s.length != 0 && s[0] == 'x')
    {
        s = s[1..$];
        radix = 16;
    }
    if (s.length == 0) fail("unterminated character reference");
    if (s[0] == ';')
        fail("character reference must have at least one digit");
    while (s.length != 0)
    {
        immutable char d = s[0];

        // Numeric value of d; 100 means "not a digit in any supported radix".
        int n = 100;
        if (d >= '0' && d <= '9') n = d - '0';
        else if (d >= 'a' && d <= 'f') n = d - 'a' + 10;
        else if (d >= 'A' && d <= 'F') n = d - 'A' + 10;

        if (n >= radix) break;
        c *= radix;
        c += n;

        // Reject the value as soon as it leaves the Unicode range.  Without
        // this, a long digit string overflows the 32-bit dchar and can wrap
        // around to a value that looks like a legal character.
        if (c > 0x10FFFF)
            fail("character reference is larger than U+10FFFF");

        s = s[1..$];
    }
    if (!isChar(c)) fail(format("U+%04X is not a legal character",c));
    if (s.length == 0 || s[0] != ';') fail("expected ;");
    else s = s[1..$];
}
2584
// Checks a reference: input starting with "&#" goes to checkCharRef,
// anything else to checkEntityRef.
void checkReference(ref string s) // rule 67
{
    mixin Check!("Reference");

    try
    {
        if (!s.startsWith("&#"))
        {
            checkEntityRef(s);
        }
        else
        {
            dchar ignored; // the decoded character is not needed here
            checkCharRef(s,ignored);
        }
    }
    catch(Err e) { fail(e); }
}
2597
// Checks an entity reference: "&", a name, then ";".
void checkEntityRef(ref string s) // rule 68
{
    mixin Check!("EntityRef");

    string entityName; // filled in by checkName, not used afterwards
    try
    {
        checkLiteral("&",s);
        checkName(s,entityName);
        checkLiteral(";",s);
    }
    catch(Err e)
    {
        fail(e);
    }
}
2611
// Checks an encoding name: at least one ASCII letter, followed by any run of
// ASCII letters, digits, '_', '.' and '-'.
void checkEncName(ref string s) // rule 81
{
    mixin Check!("EncName");

    // Consume the leading letters; old (declared by mixin Check, outside this
    // excerpt) is compared against s to detect that nothing was consumed.
    munch(s,"a-zA-Z");
    immutable bool consumedNothing = s is old;
    if (consumedNothing) fail();

    // Remaining name characters.
    munch(s,"a-zA-Z0-9_.-");
}
2620
// Checks an encoding declaration: checkSpace, the keyword "encoding",
// checkEq, then a quoted encoding name.
void checkEncodingDecl(ref string s) // rule 80
{
    mixin Check!("EncodingDecl");

    try
    {
        checkSpace(s);
        checkLiteral("encoding",s);
        checkEq(s);

        // The name may be wrapped in single or double quotes.
        quoted!(checkEncName)(s);
    }
    catch(Err e)
    {
        fail(e);
    }
}
2634
2635 // Helper functions
2636
// Requires s to begin with literal and advances s past it; otherwise fails
// with a message naming the expected literal.
void checkLiteral(string literal,ref string s)
{
    mixin Check!("Literal");

    immutable bool matches = s.startsWith(literal);
    if (!matches) fail("Expected literal \""~literal~"\"");

    s = s[literal.length..$];
}
2644
// Skips forward to the first occurrence of end and consumes it.
//
// Params:
//   end = terminator to search for
//   s   = input; on success it is advanced to just after the terminator
//
// Throws: Err when end does not occur anywhere in s (s is left untouched).
void checkEnd(string end,ref string s)
{
    // Deliberately no mixin Check here.

    // Keep the index in whatever type indexOf returns; forcing it into an int
    // truncates (or fails to compile) where indexOf yields a pointer-sized
    // value.
    auto n = s.indexOf(end);
    if (n == -1) throw new Err(s,"Unable to find terminating \""~end~"\"");
    s = s[n..$];
    checkLiteral(end,s);
}
2654
2655 // Metafunctions -- none of these use mixin Check
2656
// Runs f on s and ignores an Err thrown by it, making f optional.
void opt(alias f)(ref string s)
{
    try
    {
        f(s);
    }
    catch(Err e)
    {
        // f is optional, so its failure is intentionally discarded
    }
}
2661
// One or more repetitions of f: the first call must succeed (its Err
// propagates to the caller), further repetitions are handled by star.
void plus(alias f)(ref string s)
{
    // mandatory first occurrence
    f(s);

    // zero or more further occurrences
    star!(f)(s);
}
2667
// Zero or more repetitions of f: keeps calling f until s is empty or f
// throws an Err, which ends the repetition without being propagated.
void star(alias f)(ref string s)
{
    for (;;)
    {
        if (s.length == 0) return;

        try
        {
            f(s);
        }
        catch(Err e)
        {
            return;
        }
    }
}
2676
// Runs f on input wrapped in a matching pair of quotes.  A leading single
// quote selects single quotes; anything else requires double quotes.
void quoted(alias f)(ref string s)
{
    // Decide the quote character once; the same one must close the value.
    immutable string quote = s.startsWith("'") ? "'" : "\"";

    checkLiteral(quote,s);
    f(s);
    checkLiteral(quote,s);
}
2692
// Sequence: runs f and then g on the same input.  An Err from either call
// propagates to the caller.
void seq(alias f,alias g)(ref string s)
{
    f(s); // first part
    g(s); // second part, starting where f stopped
}
2698 }
2699
/**
 * Check an entire XML document for well-formedness
 *
 * Params:
 *      s = the document to be checked, passed as a string
 *
 * Throws: CheckException if the document is not well formed
 *
 * CheckException's toString() method will yield the complete hierarchy of
 * parse failure (the XML equivalent of a stack trace), giving the line and
 * column number of every failure at every level.
 */
void check(string s)
{
    // The check functions take s by reference and advance it while parsing.
    // Line and column numbers must be computed against the whole document,
    // so keep an untouched copy of the slice.  Passing the advanced s to
    // complete() made "Junk found after document" always report line 1,
    // column 1.
    string entire = s;

    try
    {
        checkChars(s);
        checkDocument(s);
        if (s.length != 0) throw new Err(s,"Junk found after document");
    }
    catch(Err e)
    {
        e.complete(entire);
        throw e;
    }
}
2726
// Disabled test (compiled out by version (none)): passes check() a catalog in
// which the second book opens <genre> but closes it with </genres>, and
// expects a CheckException whose text reports the mismatched end tag.
unittest
{
version (none) // WHY ARE WE NOT RUNNING THIS UNIT TEST?
{
try
{
check(q"[<?xml version="1.0"?>
<catalog>
<book id="bk101">
<author>Gambardella, Matthew</author>
<title>XML Developer's Guide</title>
<genre>Computer</genre>
<price>44.95</price>
<publish_date>2000-10-01</publish_date>
<description>An in-depth look at creating applications
with XML.</description>
</book>
<book id="bk102">
<author>Ralls, Kim</author>
<title>Midnight Rain</title>
<genre>Fantasy</genres>
<price>5.95</price>
<publish_date>2000-12-16</publish_date>
<description>A former architect battles corporate zombies,
an evil sorceress, and her own childhood to become queen
of the world.</description>
</book>
<book id="bk103">
<author>Corets, Eva</author>
<title>Maeve Ascendant</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2000-11-17</publish_date>
<description>After the collapse of a nanotechnology
society in England, the young survivors lay the
foundation for a new society.</description>
</book>
</catalog>
]");
// Reaching this line means check() accepted the malformed document.
assert(false);
}
catch(CheckException e)
{
// The report must mention both the wrong end tag and the start tag.
int n = e.toString().indexOf("end tag name \"genres\" differs"
" from start tag name \"genre\"");
assert(n != -1);
}
}
}
2776
// A small well-formed document containing a comment must pass check();
// a CheckException fails the test and shows its report.
unittest
{
    string document = q"EOS
<?xml version="1.0"?>
<set>
<one>A</one>
<!-- comment -->
<two>B</two>
</set>
EOS";

    try
    {
        check(document);
    }
    catch (CheckException failure)
    {
        assert(0, failure.toString());
    }
}
2796
// Parses a document whose attribute value and text both contain "&amp;" and
// verifies that the handlers see it decoded as "&".
unittest
{
    string document = q"EOS
<?xml version="1.0" encoding="utf-8"?> <Tests>
<Test thing="What &amp; Up">What &amp; Up Second</Test>
</Tests>
EOS";
    auto parser = new DocumentParser(document);

    // Attribute value as seen when the start tag is reported.
    parser.onStartTag["Test"] = (ElementParser element) {
        assert(element.tag.attr["thing"] == "What & Up");
    };

    // Element text as seen when the end tag is reported.
    parser.onEndTag["Test"] = (in Element closed) {
        assert(closed.text == "What & Up Second");
    };

    parser.parse();
}
2815
/** The base class for exceptions thrown by this module */
class XMLException : Exception
{
    /// Creates the exception with the given message.
    this(string msg)
    {
        super(msg);
    }
}
2818
2819 // Other exceptions
2820
/// Thrown during Comment constructor
class CommentException : XMLException
{
    // Private constructor; forwards the message to XMLException.
    private this(string msg)
    {
        super(msg);
    }
}
2824
/// Thrown during CData constructor
class CDataException : XMLException
{
    // Private constructor; forwards the message to XMLException.
    private this(string msg)
    {
        super(msg);
    }
}
2828
/// Thrown during XMLInstruction constructor
class XIException : XMLException
{
    // Private constructor; forwards the message to XMLException.
    private this(string msg)
    {
        super(msg);
    }
}
2832
/// Thrown during ProcessingInstruction constructor
class PIException : XMLException
{
    // Private constructor; forwards the message to XMLException.
    private this(string msg)
    {
        super(msg);
    }
}
2836
/// Thrown during Text constructor
class TextException : XMLException
{
    // Private constructor; forwards the message to XMLException.
    private this(string msg)
    {
        super(msg);
    }
}
2840
/// Thrown during decode()
class DecodeException : XMLException
{
    // Private constructor; forwards the message to XMLException.
    private this(string msg)
    {
        super(msg);
    }
}
2844
/// Thrown if comparing with wrong type
class InvalidTypeException : XMLException
{
    // Private constructor; forwards the message to XMLException.
    private this(string msg)
    {
        super(msg);
    }
}
2848
/// Thrown when parsing for Tags
class TagException : XMLException
{
    // Private constructor; forwards the message to XMLException.
    private this(string msg)
    {
        super(msg);
    }
}
2852
/**
 * Thrown during check()
 *
 * Instances are chained through err; toString() prints the chained
 * exception's report before this one's own line.
 */
class CheckException : XMLException
{
    CheckException err; /// Parent in hierarchy
    private string tail; // unparsed remainder of the input at the failure
    /**
     * Name of production rule which failed to parse,
     * or specific error message
     */
    string msg;
    uint line = 0; /// Line number at which parse failure occurred
    uint column = 0; /// Column number at which parse failure occurred

    // Params:
    //   tail = input remaining at the point of failure
    //   msg  = production name or specific error message
    //   err  = chained exception, or null
    private this(string tail,string msg,Err err=null)
    {
        // Hand the message to the base class too, so code that only knows
        // about Exception still sees it (the msg field declared above hides
        // the inherited one).
        super(msg);
        this.tail = tail;
        this.msg = msg;
        this.err = err;
    }

    // Computes line and column for this exception and every chained one.
    // entire must be the complete document that tail is a suffix of.
    private void complete(string entire)
    {
        // Everything before the failure position.
        string head = entire[0..$-tail.length];

        // Index just past the last newline in head (0 when there is none).
        auto n = head.lastIndexOf('\n') + 1;

        // Explicit casts: the counts are pointer-sized, the fields are uint.
        line = cast(uint)(head.count("\n") + 1);

        // Transcode to dstring so the column counts code points, not bytes.
        dstring t;
        transcode(head[n..$],t);
        column = cast(uint)(t.length + 1);

        if (err !is null) err.complete(entire);
    }

    // Returns the chained exception's report followed by this exception's own
    // line: "Line L, column C: msg" once complete() has run, otherwise just
    // msg.
    override const string toString()
    {
        string s;
        if (line != 0) s = format("Line %d, column %d: ",line,column);
        s ~= msg;
        s ~= '\n';
        if (err !is null) s = err.toString() ~ s;
        return s;
    }
}

private alias CheckException Err;
2899
2900 // Private helper functions
2901
2902 private
2903 {
// Casts o to T and returns the result; throws InvalidTypeException when the
// cast yields null.
T toType(T)(Object o)
{
    if (auto converted = cast(T) o)
        return converted;

    throw new InvalidTypeException("Attempt to compare a "
        ~ T.stringof ~ " with an instance of another type");
}
2914
// Splits the first n characters off the front of s.
//
// Params:
//   s = input; on return it holds what follows the removed prefix
//   n = number of leading chars to remove, or -1 for the whole string
//
// Returns: the removed prefix.
string chop(ref string s, int n)
{
    // Compute the split point as size_t; writing s.length back into the int
    // parameter is a narrowing conversion wherever size_t is wider than int.
    immutable size_t cut = (n == -1) ? s.length : n;

    string t = s[0..cut];
    s = s[cut..$];
    return t;
}
2922
// Optional character: when s starts with c, removes it and returns true;
// otherwise leaves s unchanged and returns false.
bool optc(ref string s, char c)
{
    if (s.length == 0 || s[0] != c)
        return false;

    s = s[1..$];
    return true;
}
2929
// Required character: removes c from the front of s, or throws a
// TagException (with an empty message) when s does not start with c.
void reqc(ref string s, char c)
{
    immutable bool present = s.length != 0 && s[0] == c;
    if (!present)
        throw new TagException("");

    s = s[1..$];
}
2935
// Folds the code points of s into a hash: for each decoded character the
// running value is multiplied by 11 and the code point is added.
// h is the starting value, which allows hashes to be chained.
hash_t hash(string s,hash_t h=0)
{
    hash_t acc = h;
    foreach (dchar codePoint; s)
    {
        acc *= 11;
        acc += codePoint;
    }
    return acc;
}
2941
// Definitions from the XML specification
//
// Each table is a flat array of integers read as consecutive pairs:
// lookup() treats (table[2k], table[2k+1]) as an inclusive [low, high] range
// and binary-searches those pairs.  Pairs must therefore stay in ascending
// order and every table must contain an even number of entries.

// Ranges matching the Char production of XML 1.0 (rule 2).
immutable CharTable=[0x9,0x9,0xA,0xA,0xD,0xD,0x20,0xD7FF,0xE000,0xFFFD,
0x10000,0x10FFFF];
// Presumably the BaseChar production (rule 85) -- verify against the spec.
immutable BaseCharTable=[0x0041,0x005A,0x0061,0x007A,0x00C0,0x00D6,0x00D8,
0x00F6,0x00F8,0x00FF,0x0100,0x0131,0x0134,0x013E,0x0141,0x0148,0x014A,
0x017E,0x0180,0x01C3,0x01CD,0x01F0,0x01F4,0x01F5,0x01FA,0x0217,0x0250,
0x02A8,0x02BB,0x02C1,0x0386,0x0386,0x0388,0x038A,0x038C,0x038C,0x038E,
0x03A1,0x03A3,0x03CE,0x03D0,0x03D6,0x03DA,0x03DA,0x03DC,0x03DC,0x03DE,
0x03DE,0x03E0,0x03E0,0x03E2,0x03F3,0x0401,0x040C,0x040E,0x044F,0x0451,
0x045C,0x045E,0x0481,0x0490,0x04C4,0x04C7,0x04C8,0x04CB,0x04CC,0x04D0,
0x04EB,0x04EE,0x04F5,0x04F8,0x04F9,0x0531,0x0556,0x0559,0x0559,0x0561,
0x0586,0x05D0,0x05EA,0x05F0,0x05F2,0x0621,0x063A,0x0641,0x064A,0x0671,
0x06B7,0x06BA,0x06BE,0x06C0,0x06CE,0x06D0,0x06D3,0x06D5,0x06D5,0x06E5,
0x06E6,0x0905,0x0939,0x093D,0x093D,0x0958,0x0961,0x0985,0x098C,0x098F,
0x0990,0x0993,0x09A8,0x09AA,0x09B0,0x09B2,0x09B2,0x09B6,0x09B9,0x09DC,
0x09DD,0x09DF,0x09E1,0x09F0,0x09F1,0x0A05,0x0A0A,0x0A0F,0x0A10,0x0A13,
0x0A28,0x0A2A,0x0A30,0x0A32,0x0A33,0x0A35,0x0A36,0x0A38,0x0A39,0x0A59,
0x0A5C,0x0A5E,0x0A5E,0x0A72,0x0A74,0x0A85,0x0A8B,0x0A8D,0x0A8D,0x0A8F,
0x0A91,0x0A93,0x0AA8,0x0AAA,0x0AB0,0x0AB2,0x0AB3,0x0AB5,0x0AB9,0x0ABD,
0x0ABD,0x0AE0,0x0AE0,0x0B05,0x0B0C,0x0B0F,0x0B10,0x0B13,0x0B28,0x0B2A,
0x0B30,0x0B32,0x0B33,0x0B36,0x0B39,0x0B3D,0x0B3D,0x0B5C,0x0B5D,0x0B5F,
0x0B61,0x0B85,0x0B8A,0x0B8E,0x0B90,0x0B92,0x0B95,0x0B99,0x0B9A,0x0B9C,
0x0B9C,0x0B9E,0x0B9F,0x0BA3,0x0BA4,0x0BA8,0x0BAA,0x0BAE,0x0BB5,0x0BB7,
0x0BB9,0x0C05,0x0C0C,0x0C0E,0x0C10,0x0C12,0x0C28,0x0C2A,0x0C33,0x0C35,
0x0C39,0x0C60,0x0C61,0x0C85,0x0C8C,0x0C8E,0x0C90,0x0C92,0x0CA8,0x0CAA,
0x0CB3,0x0CB5,0x0CB9,0x0CDE,0x0CDE,0x0CE0,0x0CE1,0x0D05,0x0D0C,0x0D0E,
0x0D10,0x0D12,0x0D28,0x0D2A,0x0D39,0x0D60,0x0D61,0x0E01,0x0E2E,0x0E30,
0x0E30,0x0E32,0x0E33,0x0E40,0x0E45,0x0E81,0x0E82,0x0E84,0x0E84,0x0E87,
0x0E88,0x0E8A,0x0E8A,0x0E8D,0x0E8D,0x0E94,0x0E97,0x0E99,0x0E9F,0x0EA1,
0x0EA3,0x0EA5,0x0EA5,0x0EA7,0x0EA7,0x0EAA,0x0EAB,0x0EAD,0x0EAE,0x0EB0,
0x0EB0,0x0EB2,0x0EB3,0x0EBD,0x0EBD,0x0EC0,0x0EC4,0x0F40,0x0F47,0x0F49,
0x0F69,0x10A0,0x10C5,0x10D0,0x10F6,0x1100,0x1100,0x1102,0x1103,0x1105,
0x1107,0x1109,0x1109,0x110B,0x110C,0x110E,0x1112,0x113C,0x113C,0x113E,
0x113E,0x1140,0x1140,0x114C,0x114C,0x114E,0x114E,0x1150,0x1150,0x1154,
0x1155,0x1159,0x1159,0x115F,0x1161,0x1163,0x1163,0x1165,0x1165,0x1167,
0x1167,0x1169,0x1169,0x116D,0x116E,0x1172,0x1173,0x1175,0x1175,0x119E,
0x119E,0x11A8,0x11A8,0x11AB,0x11AB,0x11AE,0x11AF,0x11B7,0x11B8,0x11BA,
0x11BA,0x11BC,0x11C2,0x11EB,0x11EB,0x11F0,0x11F0,0x11F9,0x11F9,0x1E00,
0x1E9B,0x1EA0,0x1EF9,0x1F00,0x1F15,0x1F18,0x1F1D,0x1F20,0x1F45,0x1F48,
0x1F4D,0x1F50,0x1F57,0x1F59,0x1F59,0x1F5B,0x1F5B,0x1F5D,0x1F5D,0x1F5F,
0x1F7D,0x1F80,0x1FB4,0x1FB6,0x1FBC,0x1FBE,0x1FBE,0x1FC2,0x1FC4,0x1FC6,
0x1FCC,0x1FD0,0x1FD3,0x1FD6,0x1FDB,0x1FE0,0x1FEC,0x1FF2,0x1FF4,0x1FF6,
0x1FFC,0x2126,0x2126,0x212A,0x212B,0x212E,0x212E,0x2180,0x2182,0x3041,
0x3094,0x30A1,0x30FA,0x3105,0x312C,0xAC00,0xD7A3];
// Presumably the Ideographic production (rule 86) -- verify against the spec.
immutable IdeographicTable=[0x3007,0x3007,0x3021,0x3029,0x4E00,0x9FA5];
// Presumably the CombiningChar production (rule 87) -- verify against the spec.
immutable CombiningCharTable=[0x0300,0x0345,0x0360,0x0361,0x0483,0x0486,
0x0591,0x05A1,0x05A3,0x05B9,0x05BB,0x05BD,0x05BF,0x05BF,0x05C1,0x05C2,
0x05C4,0x05C4,0x064B,0x0652,0x0670,0x0670,0x06D6,0x06DC,0x06DD,0x06DF,
0x06E0,0x06E4,0x06E7,0x06E8,0x06EA,0x06ED,0x0901,0x0903,0x093C,0x093C,
0x093E,0x094C,0x094D,0x094D,0x0951,0x0954,0x0962,0x0963,0x0981,0x0983,
0x09BC,0x09BC,0x09BE,0x09BE,0x09BF,0x09BF,0x09C0,0x09C4,0x09C7,0x09C8,
0x09CB,0x09CD,0x09D7,0x09D7,0x09E2,0x09E3,0x0A02,0x0A02,0x0A3C,0x0A3C,
0x0A3E,0x0A3E,0x0A3F,0x0A3F,0x0A40,0x0A42,0x0A47,0x0A48,0x0A4B,0x0A4D,
0x0A70,0x0A71,0x0A81,0x0A83,0x0ABC,0x0ABC,0x0ABE,0x0AC5,0x0AC7,0x0AC9,
0x0ACB,0x0ACD,0x0B01,0x0B03,0x0B3C,0x0B3C,0x0B3E,0x0B43,0x0B47,0x0B48,
0x0B4B,0x0B4D,0x0B56,0x0B57,0x0B82,0x0B83,0x0BBE,0x0BC2,0x0BC6,0x0BC8,
0x0BCA,0x0BCD,0x0BD7,0x0BD7,0x0C01,0x0C03,0x0C3E,0x0C44,0x0C46,0x0C48,
0x0C4A,0x0C4D,0x0C55,0x0C56,0x0C82,0x0C83,0x0CBE,0x0CC4,0x0CC6,0x0CC8,
0x0CCA,0x0CCD,0x0CD5,0x0CD6,0x0D02,0x0D03,0x0D3E,0x0D43,0x0D46,0x0D48,
0x0D4A,0x0D4D,0x0D57,0x0D57,0x0E31,0x0E31,0x0E34,0x0E3A,0x0E47,0x0E4E,
0x0EB1,0x0EB1,0x0EB4,0x0EB9,0x0EBB,0x0EBC,0x0EC8,0x0ECD,0x0F18,0x0F19,
0x0F35,0x0F35,0x0F37,0x0F37,0x0F39,0x0F39,0x0F3E,0x0F3E,0x0F3F,0x0F3F,
0x0F71,0x0F84,0x0F86,0x0F8B,0x0F90,0x0F95,0x0F97,0x0F97,0x0F99,0x0FAD,
0x0FB1,0x0FB7,0x0FB9,0x0FB9,0x20D0,0x20DC,0x20E1,0x20E1,0x302A,0x302F,
0x3099,0x3099,0x309A,0x309A];
// Presumably the Digit production (rule 88) -- verify against the spec.
immutable DigitTable=[0x0030,0x0039,0x0660,0x0669,0x06F0,0x06F9,0x0966,
0x096F,0x09E6,0x09EF,0x0A66,0x0A6F,0x0AE6,0x0AEF,0x0B66,0x0B6F,0x0BE7,
0x0BEF,0x0C66,0x0C6F,0x0CE6,0x0CEF,0x0D66,0x0D6F,0x0E50,0x0E59,0x0ED0,
0x0ED9,0x0F20,0x0F29];
// Presumably the Extender production (rule 89) -- verify against the spec.
immutable ExtenderTable=[0x00B7,0x00B7,0x02D0,0x02D0,0x02D1,0x02D1,0x0387,
0x0387,0x0640,0x0640,0x0E46,0x0E46,0x0EC6,0x0EC6,0x3005,0x3005,0x3031,
0x3035,0x309D,0x309E,0x30FC,0x30FE];
3014
// Binary search over a table of inclusive ranges.
//
// Params:
//   table = flat array of [low, high] pairs in ascending order; its length
//           must be even
//   c     = value to look for
//
// Returns: true when c lies inside one of the ranges, false otherwise.
bool lookup(const(int)[] table, int c)
{
    while (table.length != 0)
    {
        // Middle of the table rounded down to an even index, so that m
        // always addresses the low end of a pair.  Kept as size_t: array
        // lengths do not fit an int on every target.
        immutable size_t m = (table.length >> 1) & ~cast(size_t) 1;

        if (c < table[m])
        {
            // c can only be in a range before the middle pair
            table = table[0..m];
        }
        else if (c > table[m+1])
        {
            // c can only be in a range after the middle pair
            table = table[m+2..$];
        }
        else return true;
    }
    return false;
}
3032
// Builds a short printable preview of s: each char below 0x20 or above 0x7F
// is replaced by '.', and once 40 chars have been collected "___" is appended
// and the rest of s is ignored.
string startOf(string s)
{
    string preview;
    foreach (char unit; s)
    {
        immutable bool keep = unit >= 0x20 && unit <= 0x7F;
        preview ~= keep ? unit : '.';

        if (preview.length >= 40)
        {
            preview ~= "___";
            break;
        }
    }
    return preview;
}
3043
// Unconditionally throws an XMLException carrying s as its message.
void exit(string s=null)
{
    auto failure = new XMLException(s);
    throw failure;
}
3048 }
3049