comparison dmd/html.c @ 1:c53b6e3fe49a trunk

[svn r5] Initial commit. Most things are very rough.
author lindquist
date Sat, 01 Sep 2007 21:43:27 +0200
parents
children
comparison
equal deleted inserted replaced
0:a9e71648e74d 1:c53b6e3fe49a
1
2 // Copyright (c) 1999-2006 by Digital Mars
3 // All Rights Reserved
4 // written by Walter Bright
5 // http://www.digitalmars.com
6 // License for redistribution is by either the Artistic License
7 // in artistic.txt, or the GNU General Public License in gnu.txt.
8 // See the included readme.txt for details.
9
10
11 /* HTML parser
12 */
13
14 #include <stdio.h>
15 #include <string.h>
16 #include <ctype.h>
17 #include <stdarg.h>
18 #include <errno.h>
19 #include <wchar.h>
20
21 #include "mars.h"
22 #include "html.h"
23
24 #include <assert.h>
25 #include "root.h"
26
27 extern int HtmlNamedEntity(unsigned char *p, int length);
28
29 static int isLineSeparator(const unsigned char* p);
30
/**********************************
 * Determine if c can begin a tag identifier:
 * a letter or an underscore.
 * Returns 1 if so, 0 otherwise.
 */

inline int istagstart(int c)
{
    if (c == '_')
        return 1;
    return isalpha(c) ? 1 : 0;
}
40
// Determine if c can continue a tag identifier:
// a letter, a digit or an underscore.
// Returns 1 if so, 0 otherwise.
inline int istag(int c)
{
    if (c == '_')
        return 1;
    return isalnum(c) ? 1 : 0;
}
45
46 /**********************************************
47 */
48
49 Html::Html(const char *sourcename, unsigned char *base, unsigned length)
50 {
51 //printf("Html::Html()\n");
52 this->sourcename = sourcename;
53 this->base = base;
54 p = base;
55 end = base + length;
56 linnum = 1;
57 dbuf = NULL;
58 inCode = 0;
59 }
60
61 /**********************************************
62 * Print error & quit.
63 */
64
65 void Html::error(const char *format, ...)
66 {
67 if (!global.gag)
68 {
69 printf("%s(%d) : HTML Error: ", sourcename, linnum);
70
71 va_list ap;
72 va_start(ap, format);
73 vprintf(format, ap);
74 va_end(ap);
75
76 printf("\n");
77 fflush(stdout);
78 }
79
80 global.errors++;
81 }
82
83 /**********************************************
84 * Extract all the code from an HTML file,
85 * concatenate it all together, and store in buf.
86 */
87
88 void Html::extractCode(OutBuffer *buf)
89 {
90 //printf("Html::extractCode()\n");
91 dbuf = buf; // save for other routines
92 buf->reserve(end - p);
93 inCode = 0;
94 while (1)
95 {
96 //printf("p = %p, *p = x%x\n", p, *p);
97 switch (*p)
98 {
99 #if 0 // strings are not recognized outside of tags
100 case '"':
101 case '\'':
102 skipString();
103 continue;
104 #endif
105 case '<':
106 if (p[1] == '!' && isCommentStart())
107 { // Comments start with <!--
108 scanComment();
109 }
110 else if(p[1] == '!' && isCDATAStart())
111 {
112 scanCDATA();
113 }
114 else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
115 skipTag();
116 else if (istagstart(*skipWhite(p + 1)))
117 skipTag();
118 else
119 goto Ldefault;
120 continue;
121
122 case 0:
123 case 0x1a:
124 break; // end of file
125
126 case '&':
127 if (inCode)
128 { // Translate character entity into ascii for D parser
129 int c;
130
131 c = charEntity();
132 buf->writeUTF8(c);
133 }
134 else
135 p++;
136 continue;
137
138 case '\r':
139 if (p[1] == '\n')
140 goto Ldefault;
141 case '\n':
142 linnum++;
143 // Always extract new lines, so that D lexer counts the
144 // lines right.
145 buf->writeByte(*p);
146 p++;
147 continue;
148
149 default:
150 Ldefault:
151 if (inCode)
152 buf->writeByte(*p);
153 p++;
154 continue;
155 }
156 break;
157 }
158 buf->writeByte(0); // ending sentinel
159 //printf("D code is: '%s'\n", (char *)buf->data);
160 }
161
162 /***********************************************
163 * Scan to end of <> tag.
164 * Look for <code> and </code> tags to start/stop D processing.
165 * Input:
166 * p is on opening '<' of tag; it's already verified that
167 * it's a tag by lookahead
168 * Output:
169 * p is past closing '>' of tag
170 */
171
172 void Html::skipTag()
173 {
174 enum TagState // what parsing state we're in
175 {
176 TStagstart, // start of tag name
177 TStag, // in a tag name
178 TSrest, // following tag name
179 };
180 enum TagState state = TStagstart;
181 int inot;
182 unsigned char *tagstart = NULL;
183 int taglen = 0;
184
185 p++;
186 inot = 0;
187 if (*p == '/')
188 { inot = 1;
189 p++;
190 }
191 while (1)
192 {
193 switch (*p)
194 {
195 case '>': // found end of tag
196 p++;
197 break;
198
199 case '"':
200 case '\'':
201 state = TSrest;
202 skipString();
203 continue;
204
205 case '<':
206 if (p[1] == '!' && isCommentStart())
207 { // Comments start with <!--
208 scanComment();
209 }
210 else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
211 { error("nested tag");
212 skipTag();
213 }
214 else if (istagstart(*skipWhite(p + 1)))
215 { error("nested tag");
216 skipTag();
217 }
218 // Treat comments as if they were whitespace
219 state = TSrest;
220 continue;
221
222 case 0:
223 case 0x1a:
224 error("end of file before end of tag");
225 break; // end of file
226
227 case '\r':
228 if (p[1] == '\n')
229 goto Ldefault;
230 case '\n':
231 linnum++;
232 // Always extract new lines, so that code lexer counts the
233 // lines right.
234 dbuf->writeByte(*p);
235 state = TSrest; // end of tag
236 p++;
237 continue;
238
239 case ' ':
240 case '\t':
241 case '\f':
242 case '\v':
243 if (state == TStagstart)
244 { p++;
245 continue;
246 }
247 default:
248 Ldefault:
249 switch (state)
250 {
251 case TStagstart: // start of tag name
252 assert(istagstart(*p));
253 state = TStag;
254 tagstart = p;
255 taglen = 0;
256 break;
257
258 case TStag:
259 if (istag(*p))
260 { // Continuing tag name
261 taglen++;
262 }
263 else
264 { // End of tag name
265 state = TSrest;
266 }
267 break;
268
269 case TSrest:
270 break;
271 }
272 p++;
273 continue;
274 }
275 break;
276 }
277
278 // See if we parsed a <code> or </code> tag
279 if (taglen && memicmp((char *) tagstart, (char *) "CODE", taglen) == 0
280 && *(p - 2) != '/') // ignore "<code />" (XHTML)
281 {
282 if (inot)
283 { inCode--;
284 if (inCode < 0)
285 inCode = 0; // ignore extra </code>'s
286 }
287 else
288 inCode++;
289 }
290 }
291
292 /***********************************************
293 * Scan to end of attribute string.
294 */
295
296 void Html::skipString()
297 {
298 int tc = *p;
299
300 while (1)
301 {
302 p++;
303 switch (*p)
304 {
305 case '"':
306 case '\'':
307 if (*p == tc)
308 { p++;
309 break;
310 }
311 continue;
312
313 case '\r':
314 if (p[1] == '\n')
315 goto Ldefault;
316 case '\n':
317 linnum++;
318 // Always extract new lines, so that D lexer counts the
319 // lines right.
320 dbuf->writeByte(*p);
321 continue;
322
323 case 0:
324 case 0x1a:
325 Leof:
326 error("end of file before closing %c of string", tc);
327 break;
328
329 default:
330 Ldefault:
331 continue;
332 }
333 break;
334 }
335 }
336
337 /*********************************
338 * If p points to any white space, skip it
339 * and return pointer just past it.
340 */
341
342 unsigned char *Html::skipWhite(unsigned char *q)
343 {
344 for (; 1; q++)
345 {
346 switch (*q)
347 {
348 case ' ':
349 case '\t':
350 case '\f':
351 case '\v':
352 case '\r':
353 case '\n':
354 continue;
355
356 default:
357 break;
358 }
359 break;
360 }
361 return q;
362 }
363
364 /***************************************************
365 * Scan to end of comment.
366 * Comments are defined any of a number of ways.
367 * IE 5.0: <!-- followed by >
368 * "HTML The Definitive Guide": <!-- text with at least one space in it -->
369 * Netscape: <!-- --> comments nest
370 * w3c: whitespace can appear between -- and > of comment close
371 */
372
373 void Html::scanComment()
374 {
375 // Most of the complexity is dealing with the case that
376 // an arbitrary amount of whitespace can appear between
377 // the -- and the > of a comment close.
378 int scangt = 0;
379
380 //printf("scanComment()\n");
381 if (*p == '\n')
382 { linnum++;
383 // Always extract new lines, so that D lexer counts the
384 // lines right.
385 dbuf->writeByte(*p);
386 }
387 while (1)
388 {
389 //scangt = 1; // IE 5.0 compatibility
390 p++;
391 switch (*p)
392 {
393 case '-':
394 if (p[1] == '-')
395 {
396 if (p[2] == '>') // optimize for most common case
397 {
398 p += 3;
399 break;
400 }
401 p++;
402 scangt = 1;
403 }
404 else
405 scangt = 0;
406 continue;
407
408 case '>':
409 if (scangt)
410 { // found -->
411 p++;
412 break;
413 }
414 continue;
415
416 case ' ':
417 case '\t':
418 case '\f':
419 case '\v':
420 // skip white space
421 continue;
422
423 case '\r':
424 if (p[1] == '\n')
425 goto Ldefault;
426 case '\n':
427 linnum++; // remember to count lines
428 // Always extract new lines, so that D lexer counts the
429 // lines right.
430 dbuf->writeByte(*p);
431 continue;
432
433 case 0:
434 case 0x1a:
435 error("end of file before closing --> of comment");
436 break;
437
438 default:
439 Ldefault:
440 scangt = 0; // it's not -->
441 continue;
442 }
443 break;
444 }
445 //printf("*p = '%c'\n", *p);
446 }
447
448 /********************************************
449 * Determine if we are at the start of a comment.
450 * Input:
451 * p is on the opening '<'
452 * Returns:
453 * 0 if not start of a comment
454 * 1 if start of a comment, p is adjusted to point past --
455 */
456
457 int Html::isCommentStart()
458 #ifdef __DMC__
459 __out(result)
460 {
461 if (result == 0)
462 ;
463 else if (result == 1)
464 {
465 assert(p[-2] == '-' && p[-1] == '-');
466 }
467 else
468 assert(0);
469 }
470 __body
471 #endif /* __DMC__ */
472 { unsigned char *s;
473
474 if (p[0] == '<' && p[1] == '!')
475 {
476 for (s = p + 2; 1; s++)
477 {
478 switch (*s)
479 {
480 case ' ':
481 case '\t':
482 case '\r':
483 case '\f':
484 case '\v':
485 // skip white space, even though spec says no
486 // white space is allowed
487 continue;
488
489 case '-':
490 if (s[1] == '-')
491 {
492 p = s + 2;
493 return 1;
494 }
495 goto No;
496
497 default:
498 goto No;
499 }
500 }
501 }
502 No:
503 return 0;
504 }
505
506 int Html::isCDATAStart()
507 {
508 const char * CDATA_START_MARKER = "<![CDATA[";
509 size_t len = strlen(CDATA_START_MARKER);
510
511 if (strncmp((char*)p, CDATA_START_MARKER, len) == 0)
512 {
513 p += len;
514 return 1;
515 }
516 else
517 {
518 return 0;
519 }
520 }
521
522 void Html::scanCDATA()
523 {
524 while(*p && *p != 0x1A)
525 {
526 int lineSepLength = isLineSeparator(p);
527 if (lineSepLength>0)
528 {
529 /* Always extract new lines, so that D lexer counts the lines
530 * right.
531 */
532 linnum++;
533 dbuf->writeUTF8('\n');
534 p += lineSepLength;
535 continue;
536 }
537 else if (p[0] == ']' && p[1] == ']' && p[2] == '>')
538 {
539 /* end of CDATA section */
540 p += 3;
541 return;
542 }
543 else if (inCode)
544 {
545 /* this CDATA section contains D code */
546 dbuf->writeByte(*p);
547 }
548
549 p++;
550 }
551 }
552
553 /********************************************
554 * Convert an HTML character entity into a character.
555 * Forms are:
556 * &name; named entity
557 * &#ddd; decimal
558 * &#xhhhh; hex
559 * Input:
560 * p is on the &
561 */
562
563 int Html::charEntity()
564 { int c = 0;
565 int v;
566 int hex;
567 unsigned char *pstart = p;
568
569 //printf("Html::charEntity('%c')\n", *p);
570 if (p[1] == '#')
571 {
572 p++;
573 if (p[1] == 'x' || p[1] == 'X')
574 { p++;
575 hex = 1;
576 }
577 else
578 hex = 0;
579 if (p[1] == ';')
580 goto Linvalid;
581 while (1)
582 {
583 p++;
584 switch (*p)
585 {
586 case 0:
587 case 0x1a:
588 error("end of file before end of character entity");
589 goto Lignore;
590
591 case '\n':
592 case '\r':
593 case '<': // tag start
594 // Termination is assumed
595 break;
596
597 case ';':
598 // Termination is explicit
599 p++;
600 break;
601
602 case '0': case '1': case '2': case '3': case '4':
603 case '5': case '6': case '7': case '8': case '9':
604 v = *p - '0';
605 goto Lvalue;
606
607 case 'a': case 'b': case 'c':
608 case 'd': case 'e': case 'f':
609 if (!hex)
610 goto Linvalid;
611 v = (*p - 'a') + 10;
612 goto Lvalue;
613
614 case 'A': case 'B': case 'C':
615 case 'D': case 'E': case 'F':
616 if (!hex)
617 goto Linvalid;
618 v = (*p - 'A') + 10;
619 goto Lvalue;
620
621 Lvalue:
622 if (hex)
623 c = (c << 4) + v;
624 else
625 c = (c * 10) + v;
626 if (c > 0x10FFFF)
627 {
628 error("character entity out of range");
629 goto Lignore;
630 }
631 continue;
632
633 default:
634 Linvalid:
635 error("invalid numeric character reference");
636 goto Lignore;
637 }
638 break;
639 }
640 }
641 else
642 {
643 // It's a named entity; gather all characters until ;
644 unsigned char *idstart = p + 1;
645
646 while (1)
647 {
648 p++;
649 switch (*p)
650 {
651 case 0:
652 case 0x1a:
653 error("end of file before end of character entity");
654 break;
655
656 case '\n':
657 case '\r':
658 case '<': // tag start
659 // Termination is assumed
660 c = HtmlNamedEntity(idstart, p - idstart);
661 if (c == -1)
662 goto Lignore;
663 break;
664
665 case ';':
666 // Termination is explicit
667 c = HtmlNamedEntity(idstart, p - idstart);
668 if (c == -1)
669 goto Lignore;
670 p++;
671 break;
672
673 default:
674 continue;
675 }
676 break;
677 }
678 }
679
680 // Kludge to convert non-breaking space to ascii space
681 if (c == 160)
682 c = ' ';
683
684 return c;
685
686 Lignore:
687 //printf("Lignore\n");
688 p = pstart + 1;
689 return '&';
690 }
691
/**
 * identify DOS, Linux, Mac, Next and Unicode line endings
 * 0 if this is no line separator
 * >0 the length of the separator in bytes
 * Note: input has to be UTF-8
 */
static int isLineSeparator(const unsigned char* p)
{
    switch (p[0])
    {
        case '\n':              // Linux
            return 1;

        case '\r':              // Dos ("\r\n") or Mac (lone "\r")
            return (p[1] == '\n') ? 2 : 1;

        case 0xE2:              // Unicode (line || paragraph sep.)
            if (p[1] == 0x80 && (p[2] == 0xA8 || p[2] == 0xA9))
                return 3;
            return 0;

        case 0xC2:              // Next
            return (p[1] == 0x85) ? 2 : 0;

        default:
            return 0;
    }
}
718