Mercurial > projects > ldc
view dmd2/html.c @ 1229:fafe7c8d6734
Now compiles again, and fixed type of the Interface[N] ClassInfo symbol.
author | Tomas Lindquist Olsen <tomas.l.olsen gmail.com> |
---|---|
date | Thu, 16 Apr 2009 11:01:49 +0200 |
parents | f04dde6e882c |
children | 638d16625da2 |
line wrap: on
line source
// Copyright (c) 1999-2006 by Digital Mars // All Rights Reserved // written by Walter Bright // http://www.digitalmars.com // License for redistribution is by either the Artistic License // in artistic.txt, or the GNU General Public License in gnu.txt. // See the included readme.txt for details. /* HTML parser */ #include <stdio.h> #include <string.h> #include <ctype.h> #include <stdarg.h> #include <errno.h> #include <wchar.h> #include "mars.h" #include "html.h" #include <assert.h> #include "root.h" extern int HtmlNamedEntity(unsigned char *p, int length); static int isLineSeparator(const unsigned char* p); /********************************** * Determine if beginning of tag identifier * or a continuation of a tag identifier. */ inline int istagstart(int c) { return (isalpha(c) || c == '_'); } inline int istag(int c) { return (isalnum(c) || c == '_'); } /********************************************** */ Html::Html(const char *sourcename, unsigned char *base, unsigned length) { //printf("Html::Html()\n"); this->sourcename = sourcename; this->base = base; p = base; end = base + length; linnum = 1; dbuf = NULL; inCode = 0; } /********************************************** * Print error & quit. */ void Html::error(const char *format, ...) { if (!global.gag) { printf("%s(%d) : HTML Error: ", sourcename, linnum); va_list ap; va_start(ap, format); vprintf(format, ap); va_end(ap); printf("\n"); fflush(stdout); } global.errors++; } /********************************************** * Extract all the code from an HTML file, * concatenate it all together, and store in buf. */ void Html::extractCode(OutBuffer *buf) { //printf("Html::extractCode()\n"); dbuf = buf; // save for other routines buf->reserve(end - p); inCode = 0; while (1) { //printf("p = %p, *p = x%x\n", p, *p); switch (*p) { #if 0 // strings are not recognized outside of tags case '"': case '\'': skipString(); continue; #endif case '<': if (p[1] == '!' 
&& isCommentStart()) { // Comments start with <!-- scanComment(); } else if(p[1] == '!' && isCDATAStart()) { scanCDATA(); } else if (p[1] == '/' && istagstart(*skipWhite(p + 2))) skipTag(); else if (istagstart(*skipWhite(p + 1))) skipTag(); else goto Ldefault; continue; case 0: case 0x1a: break; // end of file case '&': if (inCode) { // Translate character entity into ascii for D parser int c; c = charEntity(); buf->writeUTF8(c); } else p++; continue; case '\r': if (p[1] == '\n') goto Ldefault; case '\n': linnum++; // Always extract new lines, so that D lexer counts the // lines right. buf->writeByte(*p); p++; continue; default: Ldefault: if (inCode) buf->writeByte(*p); p++; continue; } break; } buf->writeByte(0); // ending sentinel //printf("D code is: '%s'\n", (char *)buf->data); } /*********************************************** * Scan to end of <> tag. * Look for <code> and </code> tags to start/stop D processing. * Input: * p is on opening '<' of tag; it's already verified that * it's a tag by lookahead * Output: * p is past closing '>' of tag */ void Html::skipTag() { enum TagState // what parsing state we're in { TStagstart, // start of tag name TStag, // in a tag name TSrest, // following tag name }; enum TagState state = TStagstart; int inot; unsigned char *tagstart = NULL; int taglen = 0; p++; inot = 0; if (*p == '/') { inot = 1; p++; } while (1) { switch (*p) { case '>': // found end of tag p++; break; case '"': case '\'': state = TSrest; skipString(); continue; case '<': if (p[1] == '!' 
&& isCommentStart()) { // Comments start with <!-- scanComment(); } else if (p[1] == '/' && istagstart(*skipWhite(p + 2))) { error("nested tag"); skipTag(); } else if (istagstart(*skipWhite(p + 1))) { error("nested tag"); skipTag(); } // Treat comments as if they were whitespace state = TSrest; continue; case 0: case 0x1a: error("end of file before end of tag"); break; // end of file case '\r': if (p[1] == '\n') goto Ldefault; case '\n': linnum++; // Always extract new lines, so that code lexer counts the // lines right. dbuf->writeByte(*p); state = TSrest; // end of tag p++; continue; case ' ': case '\t': case '\f': case '\v': if (state == TStagstart) { p++; continue; } default: Ldefault: switch (state) { case TStagstart: // start of tag name assert(istagstart(*p)); state = TStag; tagstart = p; taglen = 0; break; case TStag: if (istag(*p)) { // Continuing tag name taglen++; } else { // End of tag name state = TSrest; } break; case TSrest: break; } p++; continue; } break; } // See if we parsed a <code> or </code> tag if (taglen && memicmp((char *) tagstart, (char *) "CODE", taglen) == 0 && *(p - 2) != '/') // ignore "<code />" (XHTML) { if (inot) { inCode--; if (inCode < 0) inCode = 0; // ignore extra </code>'s } else inCode++; } } /*********************************************** * Scan to end of attribute string. */ void Html::skipString() { int tc = *p; while (1) { p++; switch (*p) { case '"': case '\'': if (*p == tc) { p++; break; } continue; case '\r': if (p[1] == '\n') goto Ldefault; case '\n': linnum++; // Always extract new lines, so that D lexer counts the // lines right. dbuf->writeByte(*p); continue; case 0: case 0x1a: Leof: error("end of file before closing %c of string", tc); break; default: Ldefault: continue; } break; } } /********************************* * If p points to any white space, skip it * and return pointer just past it. 
*/ unsigned char *Html::skipWhite(unsigned char *q) { for (; 1; q++) { switch (*q) { case ' ': case '\t': case '\f': case '\v': case '\r': case '\n': continue; default: break; } break; } return q; } /*************************************************** * Scan to end of comment. * Comments are defined any of a number of ways. * IE 5.0: <!-- followed by > * "HTML The Definitive Guide": <!-- text with at least one space in it --> * Netscape: <!-- --> comments nest * w3c: whitespace can appear between -- and > of comment close */ void Html::scanComment() { // Most of the complexity is dealing with the case that // an arbitrary amount of whitespace can appear between // the -- and the > of a comment close. int scangt = 0; //printf("scanComment()\n"); if (*p == '\n') { linnum++; // Always extract new lines, so that D lexer counts the // lines right. dbuf->writeByte(*p); } while (1) { //scangt = 1; // IE 5.0 compatibility p++; switch (*p) { case '-': if (p[1] == '-') { if (p[2] == '>') // optimize for most common case { p += 3; break; } p++; scangt = 1; } else scangt = 0; continue; case '>': if (scangt) { // found --> p++; break; } continue; case ' ': case '\t': case '\f': case '\v': // skip white space continue; case '\r': if (p[1] == '\n') goto Ldefault; case '\n': linnum++; // remember to count lines // Always extract new lines, so that D lexer counts the // lines right. dbuf->writeByte(*p); continue; case 0: case 0x1a: error("end of file before closing --> of comment"); break; default: Ldefault: scangt = 0; // it's not --> continue; } break; } //printf("*p = '%c'\n", *p); } /******************************************** * Determine if we are at the start of a comment. 
* Input: * p is on the opening '<' * Returns: * 0 if not start of a comment * 1 if start of a comment, p is adjusted to point past -- */ int Html::isCommentStart() #ifdef __DMC__ __out(result) { if (result == 0) ; else if (result == 1) { assert(p[-2] == '-' && p[-1] == '-'); } else assert(0); } __body #endif /* __DMC__ */ { unsigned char *s; if (p[0] == '<' && p[1] == '!') { for (s = p + 2; 1; s++) { switch (*s) { case ' ': case '\t': case '\r': case '\f': case '\v': // skip white space, even though spec says no // white space is allowed continue; case '-': if (s[1] == '-') { p = s + 2; return 1; } goto No; default: goto No; } } } No: return 0; } int Html::isCDATAStart() { const char * CDATA_START_MARKER = "<![CDATA["; size_t len = strlen(CDATA_START_MARKER); if (strncmp((char*)p, CDATA_START_MARKER, len) == 0) { p += len; return 1; } else { return 0; } } void Html::scanCDATA() { while(*p && *p != 0x1A) { int lineSepLength = isLineSeparator(p); if (lineSepLength>0) { /* Always extract new lines, so that D lexer counts the lines * right. */ linnum++; dbuf->writeUTF8('\n'); p += lineSepLength; continue; } else if (p[0] == ']' && p[1] == ']' && p[2] == '>') { /* end of CDATA section */ p += 3; return; } else if (inCode) { /* this CDATA section contains D code */ dbuf->writeByte(*p); } p++; } } /******************************************** * Convert an HTML character entity into a character. 
* Forms are: * &name; named entity * &#ddd; decimal * &#xhhhh; hex * Input: * p is on the & */ int Html::charEntity() { int c = 0; int v; int hex; unsigned char *pstart = p; //printf("Html::charEntity('%c')\n", *p); if (p[1] == '#') { p++; if (p[1] == 'x' || p[1] == 'X') { p++; hex = 1; } else hex = 0; if (p[1] == ';') goto Linvalid; while (1) { p++; switch (*p) { case 0: case 0x1a: error("end of file before end of character entity"); goto Lignore; case '\n': case '\r': case '<': // tag start // Termination is assumed break; case ';': // Termination is explicit p++; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': v = *p - '0'; goto Lvalue; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': if (!hex) goto Linvalid; v = (*p - 'a') + 10; goto Lvalue; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': if (!hex) goto Linvalid; v = (*p - 'A') + 10; goto Lvalue; Lvalue: if (hex) c = (c << 4) + v; else c = (c * 10) + v; if (c > 0x10FFFF) { error("character entity out of range"); goto Lignore; } continue; default: Linvalid: error("invalid numeric character reference"); goto Lignore; } break; } } else { // It's a named entity; gather all characters until ; unsigned char *idstart = p + 1; while (1) { p++; switch (*p) { case 0: case 0x1a: error("end of file before end of character entity"); break; case '\n': case '\r': case '<': // tag start // Termination is assumed c = HtmlNamedEntity(idstart, p - idstart); if (c == -1) goto Lignore; break; case ';': // Termination is explicit c = HtmlNamedEntity(idstart, p - idstart); if (c == -1) goto Lignore; p++; break; default: continue; } break; } } // Kludge to convert non-breaking space to ascii space if (c == 160) c = ' '; return c; Lignore: //printf("Lignore\n"); p = pstart + 1; return '&'; } /** * identify DOS, Linux, Mac, Next and Unicode line endings * 0 if this is no line separator * >0 the length of the separator * Note: input has to be UTF-8 */ 
static int isLineSeparator(const unsigned char* p)
{
    // Dispatch on the first byte; later bytes are only examined once the
    // preceding ones have matched.
    switch (p[0])
    {
        case '\n':              // Linux: LF
            return 1;

        case '\r':              // Dos: CR LF, Mac: lone CR
            return (p[1] == '\n') ? 2 : 1;

        case 0xE2:              // Unicode line / paragraph separator
            if (p[1] == 0x80 && (p[2] == 0xA8 || p[2] == 0xA9))
                return 3;
            return 0;

        case 0xC2:              // Next: NEL
            return (p[1] == 0x85) ? 2 : 0;

        default:
            return 0;
    }
}