diff dmd/html.c @ 1:c53b6e3fe49a trunk

[svn r5] Initial commit. Most things are very rough.
author lindquist
date Sat, 01 Sep 2007 21:43:27 +0200
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dmd/html.c	Sat Sep 01 21:43:27 2007 +0200
@@ -0,0 +1,718 @@
+// Copyright (c) 1999-2006 by Digital Mars
+// All Rights Reserved
+// written by Walter Bright
+// http://www.digitalmars.com
+// License for redistribution is by either the Artistic License
+// in artistic.txt, or the GNU General Public License in gnu.txt.
+// See the included readme.txt for details.
+/* HTML parser
+ */
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <wchar.h>
+#include "mars.h"
+#include "html.h"
+#include <assert.h>
+#include "root.h"
+extern int HtmlNamedEntity(unsigned char *p, int length);
+static int isLineSeparator(const unsigned char* p);
/**********************************
 * Determine if c can begin a tag identifier.
 * Input:
 *	c	character value; callers in this file pass a byte read
 *		through an unsigned char pointer
 * Returns:
 *	nonzero if c is a letter (per isalpha) or '_', 0 otherwise
 */
inline int istagstart(int c)
{
    return (isalpha(c) || c == '_');
}
/**********************************
 * Determine if c can continue a tag identifier.
 * Input:
 *	c	character value; callers in this file pass a byte read
 *		through an unsigned char pointer
 * Returns:
 *	nonzero if c is a letter or digit (per isalnum) or '_', 0 otherwise
 */
inline int istag(int c)
{
    return (isalnum(c) || c == '_');
}
+ */
+Html::Html(const char *sourcename, unsigned char *base, unsigned length)
+    //printf("Html::Html()\n");
+    this->sourcename = sourcename;
+    this->base = base;
+    p = base;
+    end = base + length;
+    linnum = 1;
+    dbuf = NULL;
+    inCode = 0;
+ * Print error & quit.
+ */
+void Html::error(const char *format, ...)
+    if (!global.gag)
+    {
+	printf("%s(%d) : HTML Error: ", sourcename, linnum);
+	va_list ap;
+	va_start(ap, format);
+	vprintf(format, ap);
+	va_end(ap);
+	printf("\n");
+	fflush(stdout);
+    }
+    global.errors++;
+ * Extract all the code from an HTML file,
+ * concatenate it all together, and store in buf.
+ */
+void Html::extractCode(OutBuffer *buf)
+    //printf("Html::extractCode()\n");
+    dbuf = buf;			// save for other routines
+    buf->reserve(end - p);
+    inCode = 0;
+    while (1)
+    {
+	//printf("p = %p, *p = x%x\n", p, *p);
+	switch (*p)
+	{
+#if 0 // strings are not recognized outside of tags
+	    case '"':
+	    case '\'':
+		skipString();
+		continue;
+	    case '<':
+		if (p[1] == '!' && isCommentStart())
+		{   // Comments start with <!--
+		    scanComment();
+		}
+  		else if(p[1] == '!' && isCDATAStart())
+  		{
+  		    scanCDATA();
+  		}
+		else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
+		    skipTag();
+		else if (istagstart(*skipWhite(p + 1)))
+		    skipTag();
+		else
+		    goto Ldefault;
+		continue;
+	    case 0:
+	    case 0x1a:
+		break;		// end of file
+	    case '&':
+		if (inCode)
+		{   // Translate character entity into ascii for D parser
+		    int c;
+		    c = charEntity();
+		    buf->writeUTF8(c);
+		}
+		else
+		    p++;
+		continue;
+	    case '\r':
+		if (p[1] == '\n')
+		    goto Ldefault;
+	    case '\n':
+		linnum++;
+		// Always extract new lines, so that D lexer counts the
+		// lines right.
+		buf->writeByte(*p);
+		p++;
+		continue;
+	    default:
+	    Ldefault:
+		if (inCode)
+		    buf->writeByte(*p);
+		p++;
+		continue;
+	}
+	break;
+    }
+    buf->writeByte(0);				// ending sentinel
+    //printf("D code is: '%s'\n", (char *)buf->data);
+ * Scan to end of <> tag.
+ * Look for <code> and </code> tags to start/stop D processing.
+ * Input:
+ *	p is on opening '<' of tag; it's already verified that
+ *	it's a tag by lookahead
+ * Output:
+ *	p is past closing '>' of tag
+ */
+void Html::skipTag()
+    enum TagState	// what parsing state we're in
+    {
+	TStagstart,	// start of tag name
+	TStag,		// in a tag name
+	TSrest,		// following tag name
+    };
+    enum TagState state = TStagstart;
+    int inot;
+    unsigned char *tagstart = NULL;
+    int taglen = 0;
+    p++;
+    inot = 0;
+    if (*p == '/')
+    {	inot = 1;
+	p++;
+    }
+    while (1)
+    {
+	switch (*p)
+	{
+	    case '>':		// found end of tag
+		p++;
+		break;
+	    case '"':
+	    case '\'':
+		state = TSrest;
+		skipString();
+		continue;
+	    case '<':
+		if (p[1] == '!' && isCommentStart())
+		{   // Comments start with <!--
+		    scanComment();
+		}
+		else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
+		{   error("nested tag");
+		    skipTag();
+		}
+		else if (istagstart(*skipWhite(p + 1)))
+		{   error("nested tag");
+		    skipTag();
+		}
+		// Treat comments as if they were whitespace
+		state = TSrest;
+		continue;
+	    case 0:
+	    case 0x1a:
+		error("end of file before end of tag");
+		break;		// end of file
+	    case '\r':
+		if (p[1] == '\n')
+		    goto Ldefault;
+	    case '\n':
+		linnum++;
+		// Always extract new lines, so that code lexer counts the
+		// lines right.
+		dbuf->writeByte(*p);
+		state = TSrest;			// end of tag
+		p++;
+		continue;
+	    case ' ':
+	    case '\t':
+	    case '\f':
+	    case '\v':
+		if (state == TStagstart)
+		{   p++;
+		    continue;
+		}
+	    default:
+	    Ldefault:
+		switch (state)
+		{
+		    case TStagstart:		// start of tag name
+			assert(istagstart(*p));
+			state = TStag;
+			tagstart = p;
+			taglen = 0;
+			break;
+		    case TStag:
+			if (istag(*p))
+			{   // Continuing tag name
+			    taglen++;
+			}
+			else
+			{   // End of tag name
+			    state = TSrest;
+			}
+			break;
+		    case TSrest:
+			break;
+		}
+		p++;
+		continue;
+	}
+	break;
+    }
+    // See if we parsed a <code> or </code> tag
+    if (taglen && memicmp((char *) tagstart, (char *) "CODE", taglen) == 0
+	&& *(p - 2) != '/') // ignore "<code />" (XHTML)
+    {
+	if (inot)
+	{   inCode--;
+	    if (inCode < 0)
+		inCode = 0;		// ignore extra </code>'s
+	}
+	else
+	    inCode++;
+    }
+ * Scan to end of attribute string.
+ */
+void Html::skipString()
+    int tc = *p;
+    while (1)
+    {
+	p++;
+	switch (*p)
+	{
+	    case '"':
+	    case '\'':
+		if (*p == tc)
+		{   p++;
+		    break;
+		}
+		continue;
+	    case '\r':
+		if (p[1] == '\n')
+		    goto Ldefault;
+	    case '\n':
+		linnum++;
+		// Always extract new lines, so that D lexer counts the
+		// lines right.
+		dbuf->writeByte(*p);
+		continue;
+	    case 0:
+	    case 0x1a:
+	    Leof:
+		error("end of file before closing %c of string", tc);
+		break;
+	    default:
+	    Ldefault:
+		continue;
+	}
+	break;
+    }
+ * If p points to any white space, skip it
+ * and return pointer just past it.
+ */
+unsigned char *Html::skipWhite(unsigned char *q)
+    for (; 1; q++)
+    {
+	switch (*q)
+	{
+	    case ' ':
+	    case '\t':
+	    case '\f':
+	    case '\v':
+	    case '\r':
+	    case '\n':
+		continue;
+	    default:
+		break;
+	}
+	break;
+    }
+    return q;
+ * Scan to end of comment.
+ * Comments are defined any of a number of ways.
+ * IE 5.0: <!-- followed by >
+ * "HTML The Definitive Guide": <!-- text with at least one space in it -->
+ * Netscape: <!-- --> comments nest
+ * w3c: whitespace can appear between -- and > of comment close
+ */
+void Html::scanComment()
+    // Most of the complexity is dealing with the case that
+    // an arbitrary amount of whitespace can appear between
+    // the -- and the > of a comment close.
+    int scangt = 0;
+    //printf("scanComment()\n");
+    if (*p == '\n')
+    {	linnum++;
+	// Always extract new lines, so that D lexer counts the
+	// lines right.
+	dbuf->writeByte(*p);
+    }
+    while (1)
+    {
+	//scangt = 1;			// IE 5.0 compatibility
+	p++;
+	switch (*p)
+	{
+	    case '-':
+		if (p[1] == '-')
+		{
+		    if (p[2] == '>')	// optimize for most common case
+		    {
+			p += 3;
+			break;
+		    }
+		    p++;
+		    scangt = 1;
+		}
+		else
+		    scangt = 0;
+		continue;
+	    case '>':
+		if (scangt)
+		{   // found -->
+		    p++;
+		    break;
+		}
+		continue;
+	    case ' ':
+	    case '\t':
+	    case '\f':
+	    case '\v':
+		// skip white space
+		continue;
+	    case '\r':
+		if (p[1] == '\n')
+		    goto Ldefault;
+	    case '\n':
+		linnum++;		// remember to count lines
+		// Always extract new lines, so that D lexer counts the
+		// lines right.
+		dbuf->writeByte(*p);
+		continue;
+	    case 0:
+	    case 0x1a:
+		error("end of file before closing --> of comment");
+		break;
+	    default:
+	    Ldefault:
+		scangt = 0;		// it's not -->
+		continue;
+	}
+	break;
+    }
+    //printf("*p = '%c'\n", *p);
+ * Determine if we are at the start of a comment.
+ * Input:
+ *	p is on the opening '<'
+ * Returns:
+ *	0 if not start of a comment
+ * 	1 if start of a comment, p is adjusted to point past --
+ */
+int Html::isCommentStart()
+#ifdef __DMC__
+    __out(result)
+    {
+	if (result == 0)
+	    ;
+	else if (result == 1)
+	{
+	    assert(p[-2] == '-' && p[-1] == '-');
+	}
+	else
+	    assert(0);
+    }
+    __body
+#endif /* __DMC__ */
+    {	unsigned char *s;
+	if (p[0] == '<' && p[1] == '!')
+	{
+	    for (s = p + 2; 1; s++)
+	    {
+		switch (*s)
+		{
+		    case ' ':
+		    case '\t':
+		    case '\r':
+		    case '\f':
+		    case '\v':
+			// skip white space, even though spec says no
+			// white space is allowed
+			continue;
+		    case '-':
+			if (s[1] == '-')
+			{
+			    p = s + 2;
+			    return 1;
+			}
+			goto No;
+		    default:
+			goto No;
+		}
+	    }
+	}
+    No:
+	return 0;
+    }
+int Html::isCDATAStart()
+    const char * CDATA_START_MARKER = "<![CDATA[";
+    size_t len = strlen(CDATA_START_MARKER);
+    if (strncmp((char*)p, CDATA_START_MARKER, len) == 0)
+    {
+	p += len;
+	return 1;
+    }
+    else
+    {
+	return 0;
+    }
+void Html::scanCDATA()
+    while(*p && *p != 0x1A)
+    {
+	int lineSepLength = isLineSeparator(p);
+	if (lineSepLength>0)
+	{
+	    /* Always extract new lines, so that D lexer counts the lines
+	     * right.
+	     */
+	    linnum++;
+	    dbuf->writeUTF8('\n');
+	    p += lineSepLength;
+	    continue;
+        }
+	else if (p[0] == ']' && p[1] == ']' && p[2] == '>')
+	{
+	    /* end of CDATA section */
+	    p += 3;
+	    return;
+	}
+	else if (inCode)
+	{
+	    /* this CDATA section contains D code */
+	    dbuf->writeByte(*p);
+	}
+	p++;
+    }
+ * Convert an HTML character entity into a character.
+ * Forms are:
+ *	&name;		named entity
+ *	&#ddd;		decimal
+ *	&#xhhhh;	hex
+ * Input:
+ *	p is on the &
+ */
+int Html::charEntity()
+{   int c = 0;
+    int v;
+    int hex;
+    unsigned char *pstart = p;
+    //printf("Html::charEntity('%c')\n", *p);
+    if (p[1] == '#')
+    {
+	p++;
+	if (p[1] == 'x' || p[1] == 'X')
+	{   p++;
+	    hex = 1;
+	}
+	else
+	    hex = 0;
+	if (p[1] == ';')
+	    goto Linvalid;
+	while (1)
+	{
+	    p++;
+	    switch (*p)
+	    {
+		case 0:
+		case 0x1a:
+		    error("end of file before end of character entity");
+		    goto Lignore;
+		case '\n':
+		case '\r':
+		case '<':	// tag start
+		    // Termination is assumed
+		    break;
+		case ';':
+		    // Termination is explicit
+		    p++;
+		    break;
+		case '0': case '1': case '2': case '3': case '4':
+		case '5': case '6': case '7': case '8': case '9':
+		    v = *p - '0';
+		    goto Lvalue;
+		case 'a': case 'b': case 'c':
+		case 'd': case 'e': case 'f':
+		    if (!hex)
+			goto Linvalid;
+		    v = (*p - 'a') + 10;
+		    goto Lvalue;
+		case 'A': case 'B': case 'C':
+		case 'D': case 'E': case 'F':
+		    if (!hex)
+			goto Linvalid;
+		    v = (*p - 'A') + 10;
+		    goto Lvalue;
+		Lvalue:
+		    if (hex)
+			c = (c << 4) + v;
+		    else
+			c = (c * 10) + v;
+		    if (c > 0x10FFFF)
+		    {
+			error("character entity out of range");
+			goto Lignore;
+		    }
+		    continue;
+		default:
+		Linvalid:
+		    error("invalid numeric character reference");
+		    goto Lignore;
+	    }
+	    break;
+	}
+    }
+    else
+    {
+	// It's a named entity; gather all characters until ;
+	unsigned char *idstart = p + 1;
+	while (1)
+	{
+	    p++;
+	    switch (*p)
+	    {
+		case 0:
+		case 0x1a:
+		    error("end of file before end of character entity");
+		    break;
+		case '\n':
+		case '\r':
+		case '<':	// tag start
+		    // Termination is assumed
+		    c = HtmlNamedEntity(idstart, p - idstart);
+		    if (c == -1)
+			goto Lignore;
+		    break;
+		case ';':
+		    // Termination is explicit
+		    c = HtmlNamedEntity(idstart, p - idstart);
+		    if (c == -1)
+			goto Lignore;
+		    p++;
+		    break;
+		default:
+		    continue;
+	    }
+	    break;
+	}
+    }
+    // Kludge to convert non-breaking space to ascii space
+    if (c == 160)
+	c = ' ';
+    return c;
+    //printf("Lignore\n");
+    p = pstart + 1;
+    return '&';
/**
 * identify DOS, Linux, Mac, Next and Unicode line endings
 * 0 if this is no line separator
 * >0 the length of the separator
 * Note: input has to be UTF-8
 *
 * Up to three bytes at p are examined, but each later byte is read only
 * when the earlier ones matched, so a buffer ending in a 0 byte is never
 * read past its terminator.
 */
static int isLineSeparator(const unsigned char* p)
{
    // Linux
    if( p[0]=='\n')
	return 1;

    // Mac & Dos
    if( p[0]=='\r')
	return (p[1]=='\n') ? 2 : 1;

    // Unicode (line || paragraph sep.): U+2028 / U+2029 encoded as UTF-8
    if( p[0]==0xE2 && p[1]==0x80 && (p[2]==0xA8 || p[2]==0xA9))
	return 3;

    // Next: U+0085 (NEL) encoded as UTF-8
    if( p[0]==0xC2 && p[1]==0x85)
	return 2;

    return 0;
}