Mercurial > projects > ldc
comparison dmd2/html.c @ 758:f04dde6e882c
Added initial D2 support, D2 frontend and changes to codegen to make things compile.
author | Tomas Lindquist Olsen <tomas.l.olsen@gmail.com> |
---|---|
date | Tue, 11 Nov 2008 01:38:48 +0100 |
parents | |
children | 638d16625da2 |
comparison
equal
deleted
inserted
replaced
757:2c730d530c98 | 758:f04dde6e882c |
---|---|
1 | |
2 // Copyright (c) 1999-2006 by Digital Mars | |
3 // All Rights Reserved | |
4 // written by Walter Bright | |
5 // http://www.digitalmars.com | |
6 // License for redistribution is by either the Artistic License | |
7 // in artistic.txt, or the GNU General Public License in gnu.txt. | |
8 // See the included readme.txt for details. | |
9 | |
10 | |
11 /* HTML parser | |
12 */ | |
13 | |
14 #include <stdio.h> | |
15 #include <string.h> | |
16 #include <ctype.h> | |
17 #include <stdarg.h> | |
18 #include <errno.h> | |
19 #include <wchar.h> | |
20 | |
21 #include "mars.h" | |
22 #include "html.h" | |
23 | |
24 #include <assert.h> | |
25 #include "root.h" | |
26 | |
27 extern int HtmlNamedEntity(unsigned char *p, int length); | |
28 | |
29 static int isLineSeparator(const unsigned char* p); | |
30 | |
31 /********************************** | |
32 * Determine if beginning of tag identifier | |
33 * or a continuation of a tag identifier. | |
34 */ | |
35 | |
/* Return nonzero if c may begin a tag identifier: a letter or '_'.
 * c must be representable as unsigned char (callers in this file pass
 * bytes read through an unsigned char pointer).
 * Declared static: the helper is private to this file, and a plain
 * `inline` definition provides no external definition under C99/C11
 * inline semantics, which breaks linking of unoptimised builds.
 */
static inline int istagstart(int c)
{
    return (isalpha(c) || c == '_');
}
40 | |
/* Return nonzero if c may continue a tag identifier: a letter, a digit
 * or '_'.
 * c must be representable as unsigned char (callers in this file pass
 * bytes read through an unsigned char pointer).
 * Declared static: the helper is private to this file, and a plain
 * `inline` definition provides no external definition under C99/C11
 * inline semantics, which breaks linking of unoptimised builds.
 */
static inline int istag(int c)
{
    return (isalnum(c) || c == '_');
}
45 | |
46 /********************************************** | |
47 */ | |
48 | |
49 Html::Html(const char *sourcename, unsigned char *base, unsigned length) | |
50 { | |
51 //printf("Html::Html()\n"); | |
52 this->sourcename = sourcename; | |
53 this->base = base; | |
54 p = base; | |
55 end = base + length; | |
56 linnum = 1; | |
57 dbuf = NULL; | |
58 inCode = 0; | |
59 } | |
60 | |
61 /********************************************** | |
62 * Print error & quit. | |
63 */ | |
64 | |
65 void Html::error(const char *format, ...) | |
66 { | |
67 if (!global.gag) | |
68 { | |
69 printf("%s(%d) : HTML Error: ", sourcename, linnum); | |
70 | |
71 va_list ap; | |
72 va_start(ap, format); | |
73 vprintf(format, ap); | |
74 va_end(ap); | |
75 | |
76 printf("\n"); | |
77 fflush(stdout); | |
78 } | |
79 | |
80 global.errors++; | |
81 } | |
82 | |
83 /********************************************** | |
84 * Extract all the code from an HTML file, | |
85 * concatenate it all together, and store in buf. | |
86 */ | |
87 | |
88 void Html::extractCode(OutBuffer *buf) | |
89 { | |
90 //printf("Html::extractCode()\n"); | |
91 dbuf = buf; // save for other routines | |
92 buf->reserve(end - p); | |
93 inCode = 0; | |
94 while (1) | |
95 { | |
96 //printf("p = %p, *p = x%x\n", p, *p); | |
97 switch (*p) | |
98 { | |
99 #if 0 // strings are not recognized outside of tags | |
100 case '"': | |
101 case '\'': | |
102 skipString(); | |
103 continue; | |
104 #endif | |
105 case '<': | |
106 if (p[1] == '!' && isCommentStart()) | |
107 { // Comments start with <!-- | |
108 scanComment(); | |
109 } | |
110 else if(p[1] == '!' && isCDATAStart()) | |
111 { | |
112 scanCDATA(); | |
113 } | |
114 else if (p[1] == '/' && istagstart(*skipWhite(p + 2))) | |
115 skipTag(); | |
116 else if (istagstart(*skipWhite(p + 1))) | |
117 skipTag(); | |
118 else | |
119 goto Ldefault; | |
120 continue; | |
121 | |
122 case 0: | |
123 case 0x1a: | |
124 break; // end of file | |
125 | |
126 case '&': | |
127 if (inCode) | |
128 { // Translate character entity into ascii for D parser | |
129 int c; | |
130 | |
131 c = charEntity(); | |
132 buf->writeUTF8(c); | |
133 } | |
134 else | |
135 p++; | |
136 continue; | |
137 | |
138 case '\r': | |
139 if (p[1] == '\n') | |
140 goto Ldefault; | |
141 case '\n': | |
142 linnum++; | |
143 // Always extract new lines, so that D lexer counts the | |
144 // lines right. | |
145 buf->writeByte(*p); | |
146 p++; | |
147 continue; | |
148 | |
149 default: | |
150 Ldefault: | |
151 if (inCode) | |
152 buf->writeByte(*p); | |
153 p++; | |
154 continue; | |
155 } | |
156 break; | |
157 } | |
158 buf->writeByte(0); // ending sentinel | |
159 //printf("D code is: '%s'\n", (char *)buf->data); | |
160 } | |
161 | |
162 /*********************************************** | |
163 * Scan to end of <> tag. | |
164 * Look for <code> and </code> tags to start/stop D processing. | |
165 * Input: | |
166 * p is on opening '<' of tag; it's already verified that | |
167 * it's a tag by lookahead | |
168 * Output: | |
169 * p is past closing '>' of tag | |
170 */ | |
171 | |
172 void Html::skipTag() | |
173 { | |
174 enum TagState // what parsing state we're in | |
175 { | |
176 TStagstart, // start of tag name | |
177 TStag, // in a tag name | |
178 TSrest, // following tag name | |
179 }; | |
180 enum TagState state = TStagstart; | |
181 int inot; | |
182 unsigned char *tagstart = NULL; | |
183 int taglen = 0; | |
184 | |
185 p++; | |
186 inot = 0; | |
187 if (*p == '/') | |
188 { inot = 1; | |
189 p++; | |
190 } | |
191 while (1) | |
192 { | |
193 switch (*p) | |
194 { | |
195 case '>': // found end of tag | |
196 p++; | |
197 break; | |
198 | |
199 case '"': | |
200 case '\'': | |
201 state = TSrest; | |
202 skipString(); | |
203 continue; | |
204 | |
205 case '<': | |
206 if (p[1] == '!' && isCommentStart()) | |
207 { // Comments start with <!-- | |
208 scanComment(); | |
209 } | |
210 else if (p[1] == '/' && istagstart(*skipWhite(p + 2))) | |
211 { error("nested tag"); | |
212 skipTag(); | |
213 } | |
214 else if (istagstart(*skipWhite(p + 1))) | |
215 { error("nested tag"); | |
216 skipTag(); | |
217 } | |
218 // Treat comments as if they were whitespace | |
219 state = TSrest; | |
220 continue; | |
221 | |
222 case 0: | |
223 case 0x1a: | |
224 error("end of file before end of tag"); | |
225 break; // end of file | |
226 | |
227 case '\r': | |
228 if (p[1] == '\n') | |
229 goto Ldefault; | |
230 case '\n': | |
231 linnum++; | |
232 // Always extract new lines, so that code lexer counts the | |
233 // lines right. | |
234 dbuf->writeByte(*p); | |
235 state = TSrest; // end of tag | |
236 p++; | |
237 continue; | |
238 | |
239 case ' ': | |
240 case '\t': | |
241 case '\f': | |
242 case '\v': | |
243 if (state == TStagstart) | |
244 { p++; | |
245 continue; | |
246 } | |
247 default: | |
248 Ldefault: | |
249 switch (state) | |
250 { | |
251 case TStagstart: // start of tag name | |
252 assert(istagstart(*p)); | |
253 state = TStag; | |
254 tagstart = p; | |
255 taglen = 0; | |
256 break; | |
257 | |
258 case TStag: | |
259 if (istag(*p)) | |
260 { // Continuing tag name | |
261 taglen++; | |
262 } | |
263 else | |
264 { // End of tag name | |
265 state = TSrest; | |
266 } | |
267 break; | |
268 | |
269 case TSrest: | |
270 break; | |
271 } | |
272 p++; | |
273 continue; | |
274 } | |
275 break; | |
276 } | |
277 | |
278 // See if we parsed a <code> or </code> tag | |
279 if (taglen && memicmp((char *) tagstart, (char *) "CODE", taglen) == 0 | |
280 && *(p - 2) != '/') // ignore "<code />" (XHTML) | |
281 { | |
282 if (inot) | |
283 { inCode--; | |
284 if (inCode < 0) | |
285 inCode = 0; // ignore extra </code>'s | |
286 } | |
287 else | |
288 inCode++; | |
289 } | |
290 } | |
291 | |
292 /*********************************************** | |
293 * Scan to end of attribute string. | |
294 */ | |
295 | |
296 void Html::skipString() | |
297 { | |
298 int tc = *p; | |
299 | |
300 while (1) | |
301 { | |
302 p++; | |
303 switch (*p) | |
304 { | |
305 case '"': | |
306 case '\'': | |
307 if (*p == tc) | |
308 { p++; | |
309 break; | |
310 } | |
311 continue; | |
312 | |
313 case '\r': | |
314 if (p[1] == '\n') | |
315 goto Ldefault; | |
316 case '\n': | |
317 linnum++; | |
318 // Always extract new lines, so that D lexer counts the | |
319 // lines right. | |
320 dbuf->writeByte(*p); | |
321 continue; | |
322 | |
323 case 0: | |
324 case 0x1a: | |
325 Leof: | |
326 error("end of file before closing %c of string", tc); | |
327 break; | |
328 | |
329 default: | |
330 Ldefault: | |
331 continue; | |
332 } | |
333 break; | |
334 } | |
335 } | |
336 | |
337 /********************************* | |
338 * If p points to any white space, skip it | |
339 * and return pointer just past it. | |
340 */ | |
341 | |
342 unsigned char *Html::skipWhite(unsigned char *q) | |
343 { | |
344 for (; 1; q++) | |
345 { | |
346 switch (*q) | |
347 { | |
348 case ' ': | |
349 case '\t': | |
350 case '\f': | |
351 case '\v': | |
352 case '\r': | |
353 case '\n': | |
354 continue; | |
355 | |
356 default: | |
357 break; | |
358 } | |
359 break; | |
360 } | |
361 return q; | |
362 } | |
363 | |
364 /*************************************************** | |
365 * Scan to end of comment. | |
366 * Comments are defined any of a number of ways. | |
367 * IE 5.0: <!-- followed by > | |
368 * "HTML The Definitive Guide": <!-- text with at least one space in it --> | |
369 * Netscape: <!-- --> comments nest | |
370 * w3c: whitespace can appear between -- and > of comment close | |
371 */ | |
372 | |
373 void Html::scanComment() | |
374 { | |
375 // Most of the complexity is dealing with the case that | |
376 // an arbitrary amount of whitespace can appear between | |
377 // the -- and the > of a comment close. | |
378 int scangt = 0; | |
379 | |
380 //printf("scanComment()\n"); | |
381 if (*p == '\n') | |
382 { linnum++; | |
383 // Always extract new lines, so that D lexer counts the | |
384 // lines right. | |
385 dbuf->writeByte(*p); | |
386 } | |
387 while (1) | |
388 { | |
389 //scangt = 1; // IE 5.0 compatibility | |
390 p++; | |
391 switch (*p) | |
392 { | |
393 case '-': | |
394 if (p[1] == '-') | |
395 { | |
396 if (p[2] == '>') // optimize for most common case | |
397 { | |
398 p += 3; | |
399 break; | |
400 } | |
401 p++; | |
402 scangt = 1; | |
403 } | |
404 else | |
405 scangt = 0; | |
406 continue; | |
407 | |
408 case '>': | |
409 if (scangt) | |
410 { // found --> | |
411 p++; | |
412 break; | |
413 } | |
414 continue; | |
415 | |
416 case ' ': | |
417 case '\t': | |
418 case '\f': | |
419 case '\v': | |
420 // skip white space | |
421 continue; | |
422 | |
423 case '\r': | |
424 if (p[1] == '\n') | |
425 goto Ldefault; | |
426 case '\n': | |
427 linnum++; // remember to count lines | |
428 // Always extract new lines, so that D lexer counts the | |
429 // lines right. | |
430 dbuf->writeByte(*p); | |
431 continue; | |
432 | |
433 case 0: | |
434 case 0x1a: | |
435 error("end of file before closing --> of comment"); | |
436 break; | |
437 | |
438 default: | |
439 Ldefault: | |
440 scangt = 0; // it's not --> | |
441 continue; | |
442 } | |
443 break; | |
444 } | |
445 //printf("*p = '%c'\n", *p); | |
446 } | |
447 | |
448 /******************************************** | |
449 * Determine if we are at the start of a comment. | |
450 * Input: | |
451 * p is on the opening '<' | |
452 * Returns: | |
453 * 0 if not start of a comment | |
454 * 1 if start of a comment, p is adjusted to point past -- | |
455 */ | |
456 | |
457 int Html::isCommentStart() | |
458 #ifdef __DMC__ | |
459 __out(result) | |
460 { | |
461 if (result == 0) | |
462 ; | |
463 else if (result == 1) | |
464 { | |
465 assert(p[-2] == '-' && p[-1] == '-'); | |
466 } | |
467 else | |
468 assert(0); | |
469 } | |
470 __body | |
471 #endif /* __DMC__ */ | |
472 { unsigned char *s; | |
473 | |
474 if (p[0] == '<' && p[1] == '!') | |
475 { | |
476 for (s = p + 2; 1; s++) | |
477 { | |
478 switch (*s) | |
479 { | |
480 case ' ': | |
481 case '\t': | |
482 case '\r': | |
483 case '\f': | |
484 case '\v': | |
485 // skip white space, even though spec says no | |
486 // white space is allowed | |
487 continue; | |
488 | |
489 case '-': | |
490 if (s[1] == '-') | |
491 { | |
492 p = s + 2; | |
493 return 1; | |
494 } | |
495 goto No; | |
496 | |
497 default: | |
498 goto No; | |
499 } | |
500 } | |
501 } | |
502 No: | |
503 return 0; | |
504 } | |
505 | |
506 int Html::isCDATAStart() | |
507 { | |
508 const char * CDATA_START_MARKER = "<![CDATA["; | |
509 size_t len = strlen(CDATA_START_MARKER); | |
510 | |
511 if (strncmp((char*)p, CDATA_START_MARKER, len) == 0) | |
512 { | |
513 p += len; | |
514 return 1; | |
515 } | |
516 else | |
517 { | |
518 return 0; | |
519 } | |
520 } | |
521 | |
522 void Html::scanCDATA() | |
523 { | |
524 while(*p && *p != 0x1A) | |
525 { | |
526 int lineSepLength = isLineSeparator(p); | |
527 if (lineSepLength>0) | |
528 { | |
529 /* Always extract new lines, so that D lexer counts the lines | |
530 * right. | |
531 */ | |
532 linnum++; | |
533 dbuf->writeUTF8('\n'); | |
534 p += lineSepLength; | |
535 continue; | |
536 } | |
537 else if (p[0] == ']' && p[1] == ']' && p[2] == '>') | |
538 { | |
539 /* end of CDATA section */ | |
540 p += 3; | |
541 return; | |
542 } | |
543 else if (inCode) | |
544 { | |
545 /* this CDATA section contains D code */ | |
546 dbuf->writeByte(*p); | |
547 } | |
548 | |
549 p++; | |
550 } | |
551 } | |
552 | |
553 /******************************************** | |
554 * Convert an HTML character entity into a character. | |
555 * Forms are: | |
556 * &name; named entity | |
557 * &#ddd; decimal | |
558 * &#xhhhh; hex | |
559 * Input: | |
560 * p is on the & | |
561 */ | |
562 | |
563 int Html::charEntity() | |
564 { int c = 0; | |
565 int v; | |
566 int hex; | |
567 unsigned char *pstart = p; | |
568 | |
569 //printf("Html::charEntity('%c')\n", *p); | |
570 if (p[1] == '#') | |
571 { | |
572 p++; | |
573 if (p[1] == 'x' || p[1] == 'X') | |
574 { p++; | |
575 hex = 1; | |
576 } | |
577 else | |
578 hex = 0; | |
579 if (p[1] == ';') | |
580 goto Linvalid; | |
581 while (1) | |
582 { | |
583 p++; | |
584 switch (*p) | |
585 { | |
586 case 0: | |
587 case 0x1a: | |
588 error("end of file before end of character entity"); | |
589 goto Lignore; | |
590 | |
591 case '\n': | |
592 case '\r': | |
593 case '<': // tag start | |
594 // Termination is assumed | |
595 break; | |
596 | |
597 case ';': | |
598 // Termination is explicit | |
599 p++; | |
600 break; | |
601 | |
602 case '0': case '1': case '2': case '3': case '4': | |
603 case '5': case '6': case '7': case '8': case '9': | |
604 v = *p - '0'; | |
605 goto Lvalue; | |
606 | |
607 case 'a': case 'b': case 'c': | |
608 case 'd': case 'e': case 'f': | |
609 if (!hex) | |
610 goto Linvalid; | |
611 v = (*p - 'a') + 10; | |
612 goto Lvalue; | |
613 | |
614 case 'A': case 'B': case 'C': | |
615 case 'D': case 'E': case 'F': | |
616 if (!hex) | |
617 goto Linvalid; | |
618 v = (*p - 'A') + 10; | |
619 goto Lvalue; | |
620 | |
621 Lvalue: | |
622 if (hex) | |
623 c = (c << 4) + v; | |
624 else | |
625 c = (c * 10) + v; | |
626 if (c > 0x10FFFF) | |
627 { | |
628 error("character entity out of range"); | |
629 goto Lignore; | |
630 } | |
631 continue; | |
632 | |
633 default: | |
634 Linvalid: | |
635 error("invalid numeric character reference"); | |
636 goto Lignore; | |
637 } | |
638 break; | |
639 } | |
640 } | |
641 else | |
642 { | |
643 // It's a named entity; gather all characters until ; | |
644 unsigned char *idstart = p + 1; | |
645 | |
646 while (1) | |
647 { | |
648 p++; | |
649 switch (*p) | |
650 { | |
651 case 0: | |
652 case 0x1a: | |
653 error("end of file before end of character entity"); | |
654 break; | |
655 | |
656 case '\n': | |
657 case '\r': | |
658 case '<': // tag start | |
659 // Termination is assumed | |
660 c = HtmlNamedEntity(idstart, p - idstart); | |
661 if (c == -1) | |
662 goto Lignore; | |
663 break; | |
664 | |
665 case ';': | |
666 // Termination is explicit | |
667 c = HtmlNamedEntity(idstart, p - idstart); | |
668 if (c == -1) | |
669 goto Lignore; | |
670 p++; | |
671 break; | |
672 | |
673 default: | |
674 continue; | |
675 } | |
676 break; | |
677 } | |
678 } | |
679 | |
680 // Kludge to convert non-breaking space to ascii space | |
681 if (c == 160) | |
682 c = ' '; | |
683 | |
684 return c; | |
685 | |
686 Lignore: | |
687 //printf("Lignore\n"); | |
688 p = pstart + 1; | |
689 return '&'; | |
690 } | |
691 | |
/**
 * Identify a line separator at p. Recognized are
 *      "\n"            Linux
 *      "\r", "\r\n"    Mac, DOS
 *      E2 80 A8/A9     Unicode line / paragraph separator
 *      C2 85           Next
 * Returns:
 *      0  if p is not on a line separator
 *      >0 the length in bytes of the separator
 * Note: input has to be UTF-8. Up to two bytes after p[0] are read,
 * but a byte is only read when every byte before it matched.
 */
static int isLineSeparator(const unsigned char* p)
{
    switch (p[0])
    {
        case '\n':
            return 1;

        case '\r':
            return (p[1] == '\n') ? 2 : 1;

        case 0xE2:
            if (p[1] == 0x80 && (p[2] == 0xA8 || p[2] == 0xA9))
                return 3;
            return 0;

        case 0xC2:
            return (p[1] == 0x85) ? 2 : 0;

        default:
            return 0;
    }
}
718 |