1
|
1
|
|
// Copyright (c) 1999-2006 by Digital Mars
// All Rights Reserved
// written by Walter Bright
// http://www.digitalmars.com
// License for redistribution is by either the Artistic License
// in artistic.txt, or the GNU General Public License in gnu.txt.
// See the included readme.txt for details.
|
|
9
|
|
10
|
|
/* HTML parser
 */
|
|
13
|
|
14 #include <stdio.h>
|
|
15 #include <string.h>
|
|
16 #include <ctype.h>
|
|
17 #include <stdarg.h>
|
|
18 #include <errno.h>
|
|
19 #include <wchar.h>
|
|
20
|
|
21 #include "mars.h"
|
|
22 #include "html.h"
|
|
23
|
|
24 #include <assert.h>
|
|
25 #include "root.h"
|
|
26
|
|
// Looks up a named character entity. Defined outside this file.
// charEntity() passes the entity name (the text after '&') and its length,
// and treats a return value of -1 as "not a recognized entity".
extern int HtmlNamedEntity(unsigned char *p, int length);

// Forward declaration; the definition appears later in this file.
static int isLineSeparator(const unsigned char* p);
|
|
30
|
|
/**********************************
 * Tag identifier classification.
 * istagstart(): can c begin a tag identifier (a letter or '_')?
 * Returns 1 if so, 0 otherwise.
 */

inline int istagstart(int c)
{
    return (c == '_' || isalpha(c)) ? 1 : 0;
}
|
|
40
|
|
// Can c continue a tag identifier (a letter, a digit or '_')?
// Returns 1 if so, 0 otherwise.
inline int istag(int c)
{
    return (c == '_' || isalnum(c)) ? 1 : 0;
}
|
|
45
|
|
46 /**********************************************
|
|
47 */
|
|
48
|
|
49 Html::Html(const char *sourcename, unsigned char *base, unsigned length)
|
|
50 {
|
|
51 //printf("Html::Html()\n");
|
|
52 this->sourcename = sourcename;
|
|
53 this->base = base;
|
|
54 p = base;
|
|
55 end = base + length;
|
|
56 linnum = 1;
|
|
57 dbuf = NULL;
|
|
58 inCode = 0;
|
|
59 }
|
|
60
|
|
61 /**********************************************
|
|
62 * Print error & quit.
|
|
63 */
|
|
64
|
|
65 void Html::error(const char *format, ...)
|
|
66 {
|
|
67 if (!global.gag)
|
|
68 {
|
|
69 printf("%s(%d) : HTML Error: ", sourcename, linnum);
|
|
70
|
|
71 va_list ap;
|
|
72 va_start(ap, format);
|
|
73 vprintf(format, ap);
|
|
74 va_end(ap);
|
|
75
|
|
76 printf("\n");
|
|
77 fflush(stdout);
|
|
78 }
|
|
79
|
|
80 global.errors++;
|
|
81 }
|
|
82
|
|
83 /**********************************************
|
|
84 * Extract all the code from an HTML file,
|
|
85 * concatenate it all together, and store in buf.
|
|
86 */
|
|
87
|
|
88 void Html::extractCode(OutBuffer *buf)
|
|
89 {
|
|
90 //printf("Html::extractCode()\n");
|
|
91 dbuf = buf; // save for other routines
|
|
92 buf->reserve(end - p);
|
|
93 inCode = 0;
|
|
94 while (1)
|
|
95 {
|
|
96 //printf("p = %p, *p = x%x\n", p, *p);
|
|
97 switch (*p)
|
|
98 {
|
|
99 #if 0 // strings are not recognized outside of tags
|
|
100 case '"':
|
|
101 case '\'':
|
|
102 skipString();
|
|
103 continue;
|
|
104 #endif
|
|
105 case '<':
|
|
106 if (p[1] == '!' && isCommentStart())
|
|
107 { // Comments start with <!--
|
|
108 scanComment();
|
|
109 }
|
|
110 else if(p[1] == '!' && isCDATAStart())
|
|
111 {
|
|
112 scanCDATA();
|
|
113 }
|
|
114 else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
|
|
115 skipTag();
|
|
116 else if (istagstart(*skipWhite(p + 1)))
|
|
117 skipTag();
|
|
118 else
|
|
119 goto Ldefault;
|
|
120 continue;
|
|
121
|
|
122 case 0:
|
|
123 case 0x1a:
|
|
124 break; // end of file
|
|
125
|
|
126 case '&':
|
|
127 if (inCode)
|
|
128 { // Translate character entity into ascii for D parser
|
|
129 int c;
|
|
130
|
|
131 c = charEntity();
|
|
132 buf->writeUTF8(c);
|
|
133 }
|
|
134 else
|
|
135 p++;
|
|
136 continue;
|
|
137
|
|
138 case '\r':
|
|
139 if (p[1] == '\n')
|
|
140 goto Ldefault;
|
|
141 case '\n':
|
|
142 linnum++;
|
|
143 // Always extract new lines, so that D lexer counts the
|
|
144 // lines right.
|
|
145 buf->writeByte(*p);
|
|
146 p++;
|
|
147 continue;
|
|
148
|
|
149 default:
|
|
150 Ldefault:
|
|
151 if (inCode)
|
|
152 buf->writeByte(*p);
|
|
153 p++;
|
|
154 continue;
|
|
155 }
|
|
156 break;
|
|
157 }
|
|
158 buf->writeByte(0); // ending sentinel
|
|
159 //printf("D code is: '%s'\n", (char *)buf->data);
|
|
160 }
|
|
161
|
|
162 /***********************************************
|
|
163 * Scan to end of <> tag.
|
|
164 * Look for <code> and </code> tags to start/stop D processing.
|
|
165 * Input:
|
|
166 * p is on opening '<' of tag; it's already verified that
|
|
167 * it's a tag by lookahead
|
|
168 * Output:
|
|
169 * p is past closing '>' of tag
|
|
170 */
|
|
171
|
|
172 void Html::skipTag()
|
|
173 {
|
|
174 enum TagState // what parsing state we're in
|
|
175 {
|
|
176 TStagstart, // start of tag name
|
|
177 TStag, // in a tag name
|
|
178 TSrest, // following tag name
|
|
179 };
|
|
180 enum TagState state = TStagstart;
|
|
181 int inot;
|
|
182 unsigned char *tagstart = NULL;
|
|
183 int taglen = 0;
|
|
184
|
|
185 p++;
|
|
186 inot = 0;
|
|
187 if (*p == '/')
|
|
188 { inot = 1;
|
|
189 p++;
|
|
190 }
|
|
191 while (1)
|
|
192 {
|
|
193 switch (*p)
|
|
194 {
|
|
195 case '>': // found end of tag
|
|
196 p++;
|
|
197 break;
|
|
198
|
|
199 case '"':
|
|
200 case '\'':
|
|
201 state = TSrest;
|
|
202 skipString();
|
|
203 continue;
|
|
204
|
|
205 case '<':
|
|
206 if (p[1] == '!' && isCommentStart())
|
|
207 { // Comments start with <!--
|
|
208 scanComment();
|
|
209 }
|
|
210 else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
|
|
211 { error("nested tag");
|
|
212 skipTag();
|
|
213 }
|
|
214 else if (istagstart(*skipWhite(p + 1)))
|
|
215 { error("nested tag");
|
|
216 skipTag();
|
|
217 }
|
|
218 // Treat comments as if they were whitespace
|
|
219 state = TSrest;
|
|
220 continue;
|
|
221
|
|
222 case 0:
|
|
223 case 0x1a:
|
|
224 error("end of file before end of tag");
|
|
225 break; // end of file
|
|
226
|
|
227 case '\r':
|
|
228 if (p[1] == '\n')
|
|
229 goto Ldefault;
|
|
230 case '\n':
|
|
231 linnum++;
|
|
232 // Always extract new lines, so that code lexer counts the
|
|
233 // lines right.
|
|
234 dbuf->writeByte(*p);
|
|
235 state = TSrest; // end of tag
|
|
236 p++;
|
|
237 continue;
|
|
238
|
|
239 case ' ':
|
|
240 case '\t':
|
|
241 case '\f':
|
|
242 case '\v':
|
|
243 if (state == TStagstart)
|
|
244 { p++;
|
|
245 continue;
|
|
246 }
|
|
247 default:
|
|
248 Ldefault:
|
|
249 switch (state)
|
|
250 {
|
|
251 case TStagstart: // start of tag name
|
|
252 assert(istagstart(*p));
|
|
253 state = TStag;
|
|
254 tagstart = p;
|
|
255 taglen = 0;
|
|
256 break;
|
|
257
|
|
258 case TStag:
|
|
259 if (istag(*p))
|
|
260 { // Continuing tag name
|
|
261 taglen++;
|
|
262 }
|
|
263 else
|
|
264 { // End of tag name
|
|
265 state = TSrest;
|
|
266 }
|
|
267 break;
|
|
268
|
|
269 case TSrest:
|
|
270 break;
|
|
271 }
|
|
272 p++;
|
|
273 continue;
|
|
274 }
|
|
275 break;
|
|
276 }
|
|
277
|
|
278 // See if we parsed a <code> or </code> tag
|
|
279 if (taglen && memicmp((char *) tagstart, (char *) "CODE", taglen) == 0
|
|
280 && *(p - 2) != '/') // ignore "<code />" (XHTML)
|
|
281 {
|
|
282 if (inot)
|
|
283 { inCode--;
|
|
284 if (inCode < 0)
|
|
285 inCode = 0; // ignore extra </code>'s
|
|
286 }
|
|
287 else
|
|
288 inCode++;
|
|
289 }
|
|
290 }
|
|
291
|
|
292 /***********************************************
|
|
293 * Scan to end of attribute string.
|
|
294 */
|
|
295
|
|
296 void Html::skipString()
|
|
297 {
|
|
298 int tc = *p;
|
|
299
|
|
300 while (1)
|
|
301 {
|
|
302 p++;
|
|
303 switch (*p)
|
|
304 {
|
|
305 case '"':
|
|
306 case '\'':
|
|
307 if (*p == tc)
|
|
308 { p++;
|
|
309 break;
|
|
310 }
|
|
311 continue;
|
|
312
|
|
313 case '\r':
|
|
314 if (p[1] == '\n')
|
|
315 goto Ldefault;
|
|
316 case '\n':
|
|
317 linnum++;
|
|
318 // Always extract new lines, so that D lexer counts the
|
|
319 // lines right.
|
|
320 dbuf->writeByte(*p);
|
|
321 continue;
|
|
322
|
|
323 case 0:
|
|
324 case 0x1a:
|
|
325 Leof:
|
|
326 error("end of file before closing %c of string", tc);
|
|
327 break;
|
|
328
|
|
329 default:
|
|
330 Ldefault:
|
|
331 continue;
|
|
332 }
|
|
333 break;
|
|
334 }
|
|
335 }
|
|
336
|
|
337 /*********************************
|
|
338 * If p points to any white space, skip it
|
|
339 * and return pointer just past it.
|
|
340 */
|
|
341
|
|
342 unsigned char *Html::skipWhite(unsigned char *q)
|
|
343 {
|
|
344 for (; 1; q++)
|
|
345 {
|
|
346 switch (*q)
|
|
347 {
|
|
348 case ' ':
|
|
349 case '\t':
|
|
350 case '\f':
|
|
351 case '\v':
|
|
352 case '\r':
|
|
353 case '\n':
|
|
354 continue;
|
|
355
|
|
356 default:
|
|
357 break;
|
|
358 }
|
|
359 break;
|
|
360 }
|
|
361 return q;
|
|
362 }
|
|
363
|
|
364 /***************************************************
|
|
365 * Scan to end of comment.
|
|
366 * Comments are defined any of a number of ways.
|
|
367 * IE 5.0: <!-- followed by >
|
|
368 * "HTML The Definitive Guide": <!-- text with at least one space in it -->
|
|
369 * Netscape: <!-- --> comments nest
|
|
370 * w3c: whitespace can appear between -- and > of comment close
|
|
371 */
|
|
372
|
|
373 void Html::scanComment()
|
|
374 {
|
|
375 // Most of the complexity is dealing with the case that
|
|
376 // an arbitrary amount of whitespace can appear between
|
|
377 // the -- and the > of a comment close.
|
|
378 int scangt = 0;
|
|
379
|
|
380 //printf("scanComment()\n");
|
|
381 if (*p == '\n')
|
|
382 { linnum++;
|
|
383 // Always extract new lines, so that D lexer counts the
|
|
384 // lines right.
|
|
385 dbuf->writeByte(*p);
|
|
386 }
|
|
387 while (1)
|
|
388 {
|
|
389 //scangt = 1; // IE 5.0 compatibility
|
|
390 p++;
|
|
391 switch (*p)
|
|
392 {
|
|
393 case '-':
|
|
394 if (p[1] == '-')
|
|
395 {
|
|
396 if (p[2] == '>') // optimize for most common case
|
|
397 {
|
|
398 p += 3;
|
|
399 break;
|
|
400 }
|
|
401 p++;
|
|
402 scangt = 1;
|
|
403 }
|
|
404 else
|
|
405 scangt = 0;
|
|
406 continue;
|
|
407
|
|
408 case '>':
|
|
409 if (scangt)
|
|
410 { // found -->
|
|
411 p++;
|
|
412 break;
|
|
413 }
|
|
414 continue;
|
|
415
|
|
416 case ' ':
|
|
417 case '\t':
|
|
418 case '\f':
|
|
419 case '\v':
|
|
420 // skip white space
|
|
421 continue;
|
|
422
|
|
423 case '\r':
|
|
424 if (p[1] == '\n')
|
|
425 goto Ldefault;
|
|
426 case '\n':
|
|
427 linnum++; // remember to count lines
|
|
428 // Always extract new lines, so that D lexer counts the
|
|
429 // lines right.
|
|
430 dbuf->writeByte(*p);
|
|
431 continue;
|
|
432
|
|
433 case 0:
|
|
434 case 0x1a:
|
|
435 error("end of file before closing --> of comment");
|
|
436 break;
|
|
437
|
|
438 default:
|
|
439 Ldefault:
|
|
440 scangt = 0; // it's not -->
|
|
441 continue;
|
|
442 }
|
|
443 break;
|
|
444 }
|
|
445 //printf("*p = '%c'\n", *p);
|
|
446 }
|
|
447
|
|
448 /********************************************
|
|
449 * Determine if we are at the start of a comment.
|
|
450 * Input:
|
|
451 * p is on the opening '<'
|
|
452 * Returns:
|
|
453 * 0 if not start of a comment
|
|
454 * 1 if start of a comment, p is adjusted to point past --
|
|
455 */
|
|
456
|
|
457 int Html::isCommentStart()
|
|
458 #ifdef __DMC__
|
|
459 __out(result)
|
|
460 {
|
|
461 if (result == 0)
|
|
462 ;
|
|
463 else if (result == 1)
|
|
464 {
|
|
465 assert(p[-2] == '-' && p[-1] == '-');
|
|
466 }
|
|
467 else
|
|
468 assert(0);
|
|
469 }
|
|
470 __body
|
|
471 #endif /* __DMC__ */
|
|
472 { unsigned char *s;
|
|
473
|
|
474 if (p[0] == '<' && p[1] == '!')
|
|
475 {
|
|
476 for (s = p + 2; 1; s++)
|
|
477 {
|
|
478 switch (*s)
|
|
479 {
|
|
480 case ' ':
|
|
481 case '\t':
|
|
482 case '\r':
|
|
483 case '\f':
|
|
484 case '\v':
|
|
485 // skip white space, even though spec says no
|
|
486 // white space is allowed
|
|
487 continue;
|
|
488
|
|
489 case '-':
|
|
490 if (s[1] == '-')
|
|
491 {
|
|
492 p = s + 2;
|
|
493 return 1;
|
|
494 }
|
|
495 goto No;
|
|
496
|
|
497 default:
|
|
498 goto No;
|
|
499 }
|
|
500 }
|
|
501 }
|
|
502 No:
|
|
503 return 0;
|
|
504 }
|
|
505
|
|
506 int Html::isCDATAStart()
|
|
507 {
|
|
508 const char * CDATA_START_MARKER = "<![CDATA[";
|
|
509 size_t len = strlen(CDATA_START_MARKER);
|
|
510
|
|
511 if (strncmp((char*)p, CDATA_START_MARKER, len) == 0)
|
|
512 {
|
|
513 p += len;
|
|
514 return 1;
|
|
515 }
|
|
516 else
|
|
517 {
|
|
518 return 0;
|
|
519 }
|
|
520 }
|
|
521
|
|
522 void Html::scanCDATA()
|
|
523 {
|
|
524 while(*p && *p != 0x1A)
|
|
525 {
|
|
526 int lineSepLength = isLineSeparator(p);
|
|
527 if (lineSepLength>0)
|
|
528 {
|
|
529 /* Always extract new lines, so that D lexer counts the lines
|
|
530 * right.
|
|
531 */
|
|
532 linnum++;
|
|
533 dbuf->writeUTF8('\n');
|
|
534 p += lineSepLength;
|
|
535 continue;
|
|
536 }
|
|
537 else if (p[0] == ']' && p[1] == ']' && p[2] == '>')
|
|
538 {
|
|
539 /* end of CDATA section */
|
|
540 p += 3;
|
|
541 return;
|
|
542 }
|
|
543 else if (inCode)
|
|
544 {
|
|
545 /* this CDATA section contains D code */
|
|
546 dbuf->writeByte(*p);
|
|
547 }
|
|
548
|
|
549 p++;
|
|
550 }
|
|
551 }
|
|
552
|
|
553 /********************************************
|
|
554 * Convert an HTML character entity into a character.
|
|
555 * Forms are:
|
|
556 * &name; named entity
|
|
557 * &#ddd; decimal
|
|
558 * &#xhhhh; hex
|
|
559 * Input:
|
|
560 * p is on the &
|
|
561 */
|
|
562
|
|
563 int Html::charEntity()
|
|
564 { int c = 0;
|
|
565 int v;
|
|
566 int hex;
|
|
567 unsigned char *pstart = p;
|
|
568
|
|
569 //printf("Html::charEntity('%c')\n", *p);
|
|
570 if (p[1] == '#')
|
|
571 {
|
|
572 p++;
|
|
573 if (p[1] == 'x' || p[1] == 'X')
|
|
574 { p++;
|
|
575 hex = 1;
|
|
576 }
|
|
577 else
|
|
578 hex = 0;
|
|
579 if (p[1] == ';')
|
|
580 goto Linvalid;
|
|
581 while (1)
|
|
582 {
|
|
583 p++;
|
|
584 switch (*p)
|
|
585 {
|
|
586 case 0:
|
|
587 case 0x1a:
|
|
588 error("end of file before end of character entity");
|
|
589 goto Lignore;
|
|
590
|
|
591 case '\n':
|
|
592 case '\r':
|
|
593 case '<': // tag start
|
|
594 // Termination is assumed
|
|
595 break;
|
|
596
|
|
597 case ';':
|
|
598 // Termination is explicit
|
|
599 p++;
|
|
600 break;
|
|
601
|
|
602 case '0': case '1': case '2': case '3': case '4':
|
|
603 case '5': case '6': case '7': case '8': case '9':
|
|
604 v = *p - '0';
|
|
605 goto Lvalue;
|
|
606
|
|
607 case 'a': case 'b': case 'c':
|
|
608 case 'd': case 'e': case 'f':
|
|
609 if (!hex)
|
|
610 goto Linvalid;
|
|
611 v = (*p - 'a') + 10;
|
|
612 goto Lvalue;
|
|
613
|
|
614 case 'A': case 'B': case 'C':
|
|
615 case 'D': case 'E': case 'F':
|
|
616 if (!hex)
|
|
617 goto Linvalid;
|
|
618 v = (*p - 'A') + 10;
|
|
619 goto Lvalue;
|
|
620
|
|
621 Lvalue:
|
|
622 if (hex)
|
|
623 c = (c << 4) + v;
|
|
624 else
|
|
625 c = (c * 10) + v;
|
|
626 if (c > 0x10FFFF)
|
|
627 {
|
|
628 error("character entity out of range");
|
|
629 goto Lignore;
|
|
630 }
|
|
631 continue;
|
|
632
|
|
633 default:
|
|
634 Linvalid:
|
|
635 error("invalid numeric character reference");
|
|
636 goto Lignore;
|
|
637 }
|
|
638 break;
|
|
639 }
|
|
640 }
|
|
641 else
|
|
642 {
|
|
643 // It's a named entity; gather all characters until ;
|
|
644 unsigned char *idstart = p + 1;
|
|
645
|
|
646 while (1)
|
|
647 {
|
|
648 p++;
|
|
649 switch (*p)
|
|
650 {
|
|
651 case 0:
|
|
652 case 0x1a:
|
|
653 error("end of file before end of character entity");
|
|
654 break;
|
|
655
|
|
656 case '\n':
|
|
657 case '\r':
|
|
658 case '<': // tag start
|
|
659 // Termination is assumed
|
|
660 c = HtmlNamedEntity(idstart, p - idstart);
|
|
661 if (c == -1)
|
|
662 goto Lignore;
|
|
663 break;
|
|
664
|
|
665 case ';':
|
|
666 // Termination is explicit
|
|
667 c = HtmlNamedEntity(idstart, p - idstart);
|
|
668 if (c == -1)
|
|
669 goto Lignore;
|
|
670 p++;
|
|
671 break;
|
|
672
|
|
673 default:
|
|
674 continue;
|
|
675 }
|
|
676 break;
|
|
677 }
|
|
678 }
|
|
679
|
|
680 // Kludge to convert non-breaking space to ascii space
|
|
681 if (c == 160)
|
|
682 c = ' ';
|
|
683
|
|
684 return c;
|
|
685
|
|
686 Lignore:
|
|
687 //printf("Lignore\n");
|
|
688 p = pstart + 1;
|
|
689 return '&';
|
|
690 }
|
|
691
|
|
/**
 * Identify DOS, Linux, Mac, Next and Unicode line endings at p.
 * Returns:
 *      0   if p is not on a line separator
 *      >0  the length in bytes of the separator
 * Note: input has to be UTF-8
 */
static int isLineSeparator(const unsigned char* p)
{
    switch (p[0])
    {
        case '\n':              // Linux
            return 1;

        case '\r':              // Mac (CR) & Dos (CR LF)
            return (p[1] == '\n') ? 2 : 1;

        case 0xE2:              // Unicode line (E2 80 A8) or paragraph (E2 80 A9) sep.
            if (p[1] == 0x80 && (p[2] == 0xA8 || p[2] == 0xA9))
                return 3;
            return 0;

        case 0xC2:              // Next (C2 85)
            return (p[1] == 0x85) ? 2 : 0;

        default:
            return 0;
    }
}
|
|
718
|