projects/ldc: dmd/html.c comparison

comparison dmd/html.c @ 1:c53b6e3fe49a trunk

[svn r5] Initial commit. Most things are very rough.

author	lindquist
date	Sat, 01 Sep 2007 21:43:27 +0200
parents
children

comparison

equal deleted inserted replaced

-:a9e71648e74d
+:c53b6e3fe49a
+// Copyright (c) 1999-2006 by Digital Mars
+// All Rights Reserved
+// written by Walter Bright
+// http://www.digitalmars.com
+// License for redistribution is by either the Artistic License
+// in artistic.txt, or the GNU General Public License in gnu.txt.
+// See the included readme.txt for details.
+/* HTML parser
+*/
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <wchar.h>
+#include "mars.h"
+#include "html.h"
+#include <assert.h>
+#include "root.h"
+extern int HtmlNamedEntity(unsigned char *p, int length);
+static int isLineSeparator(const unsigned char* p);
+/**********************************
+* Determine if beginning of tag identifier
+* or a continuation of a tag identifier.
+*/
+// Returns 1 if c can begin a tag identifier (a letter per isalpha(), or
+// an underscore), 0 otherwise.
+inline int istagstart(int c)
+{
+    if (c == '_')
+        return 1;
+    return isalpha(c) ? 1 : 0;
+}
+// Returns 1 if c can continue a tag identifier (a letter or digit per
+// isalnum(), or an underscore), 0 otherwise.
+inline int istag(int c)
+{
+    if (c == '_')
+        return 1;
+    return isalnum(c) ? 1 : 0;
+}
+/**********************************************
+* Set up a parser over the length bytes that start at base.
+* sourcename is kept for the prefix of error messages.
+*/
+Html::Html(const char *sourcename, unsigned char *base, unsigned length)
+{
+    //printf("Html::Html()\n");
+    // Remember what is being parsed and where the input stops.
+    this->sourcename = sourcename;
+    this->base = base;
+    end = &base[length];
+    // Scanning begins on the first byte, which is on line 1.
+    p = base;
+    linnum = 1;
+    // Not inside a <code> section yet, and no output buffer attached.
+    inCode = 0;
+    dbuf = NULL;
+}
+/**********************************************
+* Report an HTML error and count it.
+* Unless global.gag is set, prints "sourcename(linnum) : HTML Error: "
+* followed by the printf-style message and a newline to stdout, then
+* flushes stdout.  global.errors is incremented in either case; the
+* function returns to its caller.
+*/
+void Html::error(const char *format, ...)
+{
+    if (global.gag)
+    {   // Output is suppressed, but the error still counts.
+        global.errors++;
+        return;
+    }
+    va_list args;
+    printf("%s(%d) : HTML Error: ", sourcename, linnum);
+    va_start(args, format);
+    vprintf(format, args);
+    va_end(args);
+    printf("\n");
+    fflush(stdout);
+    global.errors++;
+}
+/**********************************************
+* Extract all the code from an HTML file,
+* concatenate it all together, and store in buf.
+*
+* Scanning starts at p and stops at the first 0 or 0x1a byte.
+* Tags, comments and CDATA sections are handed to skipTag(),
+* scanComment() and scanCDATA().  Other bytes are copied to buf only
+* while inCode is nonzero; in that state a '&' is decoded with
+* charEntity() and written with writeUTF8().  A '\n', or a '\r' that is
+* not followed by '\n', is always copied and increments linnum.
+* buf is also kept in dbuf, which the other scanning routines write to.
+* A 0 byte is appended to buf as an ending sentinel.
+*/
+void Html::extractCode(OutBuffer *buf)
+{
+//printf("Html::extractCode()\n");
+dbuf = buf;			// save for other routines
+buf->reserve(end - p);		// sized by the input left to scan (presumably pre-allocates; see OutBuffer)
+inCode = 0;			// becomes nonzero once skipTag() has seen <code>
+while (1)
+{
+	//printf("p = %p, *p = x%x\n", p, *p);
+	switch (*p)
+	{
+#if 0 // strings are not recognized outside of tags
+	    case '"':
+	    case '\'':
+		skipString();
+		continue;
+#endif
+	    case '<':
+		if (p[1] == '!' && isCommentStart())
+		{   // Comments start with <!--
+		    scanComment();
+		}
+		else if(p[1] == '!' && isCDATAStart())
+		{
+		    scanCDATA();
+		}
+		else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))	// closing tag: </name
+		    skipTag();
+		else if (istagstart(*skipWhite(p + 1)))	// opening tag: <name
+		    skipTag();
+		else
+		    goto Ldefault;	// a '<' that starts nothing is ordinary text
+		continue;
+	    case 0:
+	    case 0x1a:
+		break;		// end of file
+	    case '&':
+		if (inCode)
+		{   // Translate character entity into ascii for D parser
+		    int c;
+		    c = charEntity();
+		    buf->writeUTF8(c);
+		}
+		else
+		    p++;	// outside code the '&' is simply stepped over
+		continue;
+	    case '\r':
+		if (p[1] == '\n')
+		    goto Ldefault;	// '\r' of "\r\n" is an ordinary byte; the '\n' is counted next
+	    case '\n':		// also reached by fall-through for a lone '\r'
+		linnum++;
+		// Always extract new lines, so that D lexer counts the
+		// lines right.
+		buf->writeByte(*p);
+		p++;
+		continue;
+	    default:
+	    Ldefault:
+		if (inCode)
+		    buf->writeByte(*p);
+		p++;
+		continue;
+	}
+	break;		// reached only from the end-of-file case
+}
+buf->writeByte(0);				// ending sentinel
+//printf("D code is: '%s'\n", (char *)buf->data);
+}
+/***********************************************
+* Scan to end of <> tag.
+* Look for <code> and </code> tags to start/stop D processing.
+* Input:
+*	p is on opening '<' of tag; it's already verified that
+*	it's a tag by lookahead
+* Output:
+*	p is past closing '>' of tag, or on the 0 / 0x1a byte if the
+*	input ended first (an error is reported in that case)
+*	inCode is incremented for <code> and decremented, never below
+*	zero, for </code>; "<code />" leaves it unchanged
+*	every line end inside the tag is copied to dbuf and counted
+*	in linnum
+*/
+void Html::skipTag()
+{
+    enum TagState       // what parsing state we're in
+    {
+        TStagstart,     // start of tag name
+        TStag,          // in a tag name
+        TSrest,         // following tag name
+    };
+    enum TagState state = TStagstart;
+    int inot = 0;                       // nonzero for a closing tag (</...>)
+    unsigned char *tagstart = NULL;     // first character of the tag name
+    int taglen = 0;                     // number of characters in the tag name
+    p++;                                // step over the opening '<'
+    if (*p == '/')
+    {
+        inot = 1;
+        p++;
+    }
+    while (1)
+    {
+        switch (*p)
+        {
+            case '>':           // found end of tag
+                p++;
+                break;
+            case '"':
+            case '\'':
+                state = TSrest;
+                skipString();
+                continue;
+            case '<':
+                if (p[1] == '!' && isCommentStart())
+                {   // Comments start with <!--
+                    scanComment();
+                }
+                else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
+                {   error("nested tag");
+                    skipTag();
+                }
+                else if (istagstart(*skipWhite(p + 1)))
+                {   error("nested tag");
+                    skipTag();
+                }
+                else
+                {   // This '<' starts neither a comment nor a tag, so nothing
+                    // above consumed it; step over it or the loop never ends.
+                    p++;
+                }
+                // Treat comments as if they were whitespace
+                state = TSrest;
+                continue;
+            case 0:
+            case 0x1a:
+                error("end of file before end of tag");
+                break;          // end of file
+            case '\r':
+                if (p[1] == '\n')
+                {
+                    if (state == TStagstart)
+                    {   // White space ahead of the tag name (the caller's
+                        // lookahead allows it); the '\n' is counted next.
+                        p++;
+                        continue;
+                    }
+                    goto Ldefault;
+                }
+                // a lone '\r' is a line end: fall through
+            case '\n':
+                linnum++;
+                // Always extract new lines, so that code lexer counts the
+                // lines right.
+                dbuf->writeByte(*p);
+                // A line end before the name is only white space; after
+                // the name has started it ends the name.
+                if (state != TStagstart)
+                    state = TSrest;
+                p++;
+                continue;
+            case ' ':
+            case '\t':
+            case '\f':
+            case '\v':
+                if (state == TStagstart)
+                {   p++;
+                    continue;
+                }
+                // otherwise handled like any other character: fall through
+            default:
+            Ldefault:
+                switch (state)
+                {
+                    case TStagstart:            // start of tag name
+                        assert(istagstart(*p));
+                        state = TStag;
+                        tagstart = p;
+                        taglen = 1;             // the first character counts too
+                        break;
+                    case TStag:
+                        if (istag(*p))
+                        {   // Continuing tag name
+                            taglen++;
+                        }
+                        else
+                        {   // End of tag name
+                            state = TSrest;
+                        }
+                        break;
+                    case TSrest:
+                        break;
+                }
+                p++;
+                continue;
+        }
+        break;
+    }
+    // See if we parsed a <code> or </code> tag.  The name has to be exactly
+    // four characters long: comparing only a prefix would also accept
+    // other tags such as <col>.
+    if (taglen == 4 && memicmp((char *) tagstart, (char *) "CODE", 4) == 0
+        && *(p - 2) != '/') // ignore "<code />" (XHTML)
+    {
+        if (inot)
+        {   inCode--;
+            if (inCode < 0)
+                inCode = 0;             // ignore extra </code>'s
+        }
+        else
+            inCode++;
+    }
+}
+/***********************************************
+* Skip a quoted attribute string.
+* Input:
+*	p is on the opening quote character
+* Output:
+*	p is past the matching closing quote, or on the 0 / 0x1a byte if
+*	the input ended first (an error is reported in that case)
+* Line ends inside the string are copied to dbuf and counted in linnum.
+*/
+void Html::skipString()
+{
+    const int tc = *p;          // the quote character that opened the string
+    for (;;)
+    {
+        p++;
+        const unsigned char ch = *p;
+        if (ch == '"' || ch == '\'')
+        {
+            if (ch != tc)
+                continue;       // the other kind of quote is ordinary text
+            p++;                // step past the closing quote
+            return;
+        }
+        if (ch == '\n' || (ch == '\r' && p[1] != '\n'))
+        {
+            // Always extract new lines, so that D lexer counts the
+            // lines right.  The '\r' of "\r\n" is not counted; its '\n' is.
+            linnum++;
+            dbuf->writeByte(*p);
+            continue;
+        }
+        if (ch == 0 || ch == 0x1a)
+        {
+            error("end of file before closing %c of string", tc);
+            return;
+        }
+    }
+}
+/*********************************
+* Skip any run of white space (space, tab, form feed, vertical tab,
+* carriage return, line feed) starting at q.
+* Returns:
+*	pointer to the first byte that is not white space; q itself
+*	if it does not point at white space
+*/
+unsigned char *Html::skipWhite(unsigned char *q)
+{
+    while (*q == ' '  || *q == '\t' || *q == '\f' ||
+           *q == '\v' || *q == '\r' || *q == '\n')
+    {
+        q++;
+    }
+    return q;
+}
+/***************************************************
+* Scan to end of comment.
+* Comments are defined any of a number of ways.
+* IE 5.0: <!-- followed by >
+* "HTML The Definitive Guide": <!-- text with at least one space in it -->
+* Netscape: <!-- --> comments nest
+* w3c: whitespace can appear between -- and > of comment close
+*
+* What this routine accepts as the close is "--", then any amount of
+* white space or line ends, then '>'.
+* Input:
+*	p as left by isCommentStart(), i.e. just past the opening "--".
+*	The loop advances p before it looks at a byte, so the byte p is
+*	on at entry is only checked for being '\n'.
+* Output:
+*	p is past the closing '>', or on the 0 / 0x1a byte if the input
+*	ended first (an error is reported in that case)
+* Line ends inside the comment are copied to dbuf and counted in linnum.
+*/
+void Html::scanComment()
+{
+// Most of the complexity is dealing with the case that
+// an arbitrary amount of whitespace can appear between
+// the -- and the > of a comment close.
+int scangt = 0;		// nonzero once "--" was seen and only white space or line ends followed
+//printf("scanComment()\n");
+if (*p == '\n')
+{	linnum++;
+	// Always extract new lines, so that D lexer counts the
+	// lines right.
+	dbuf->writeByte(*p);
+}
+while (1)
+{
+	//scangt = 1;			// IE 5.0 compatibility
+	p++;
+	switch (*p)
+	{
+	    case '-':
+		if (p[1] == '-')
+		{
+		    if (p[2] == '>')	// optimize for most common case
+		    {
+			p += 3;
+			break;
+		    }
+		    p++;	// onto the second '-'
+		    scangt = 1;
+		}
+		else
+		    scangt = 0;	// a lone '-' cancels a pending "--"
+		continue;
+	    case '>':
+		if (scangt)
+		{   // found -->
+		    p++;
+		    break;
+		}
+		continue;	// '>' without a pending "--" is still inside the comment
+	    case ' ':
+	    case '\t':
+	    case '\f':
+	    case '\v':
+		// skip white space
+		continue;
+	    case '\r':
+		if (p[1] == '\n')
+		    goto Ldefault;	// '\r' of "\r\n": not counted here; note Ldefault also clears scangt
+	    case '\n':		// also reached by fall-through for a lone '\r'
+		linnum++;		// remember to count lines
+		// Always extract new lines, so that D lexer counts the
+		// lines right.
+		dbuf->writeByte(*p);
+		continue;
+	    case 0:
+	    case 0x1a:
+		error("end of file before closing --> of comment");
+		break;
+	    default:
+	    Ldefault:
+		scangt = 0;		// it's not -->
+		continue;
+	}
+	break;		// reached from the comment-close and end-of-file cases
+}
+//printf("*p = '%c'\n", *p);
+}
+/********************************************
+* Test whether a comment opens at the current position.
+* A comment start is "<!", optional white space (space, tab, carriage
+* return, form feed, vertical tab -- but not line feed), then "--".
+* Input:
+*	p is on the opening '<'
+* Returns:
+*	0 if not start of a comment; p is left unchanged
+* 	1 if start of a comment; p is moved to just past the "--"
+*/
+int Html::isCommentStart()
+#ifdef __DMC__
+__out(result)
+{
+	if (result == 0)
+	    ;
+	else if (result == 1)
+	{
+	    assert(p[-2] == '-' && p[-1] == '-');
+	}
+	else
+	    assert(0);
+}
+__body
+#endif /* __DMC__ */
+{
+    if (p[0] != '<' || p[1] != '!')
+        return 0;
+    unsigned char *scan = p + 2;
+    // skip white space, even though spec says no
+    // white space is allowed
+    while (*scan == ' ' || *scan == '\t' || *scan == '\r' ||
+           *scan == '\f' || *scan == '\v')
+    {
+        scan++;
+    }
+    if (scan[0] != '-' || scan[1] != '-')
+        return 0;
+    p = scan + 2;
+    return 1;
+}
+/********************************************
+* Test whether a CDATA section opens at the current position.
+* Returns:
+*	0 if the text at p does not begin with "<![CDATA["; p is unchanged
+*	1 if it does; p is moved to just past the marker
+*/
+int Html::isCDATAStart()
+{
+    static const char marker[] = "<![CDATA[";
+    const size_t markerLen = sizeof(marker) - 1;   // without the terminating 0
+    if (strncmp((char*)p, marker, markerLen) != 0)
+        return 0;
+    p += markerLen;
+    return 1;
+}
+/********************************************
+* Scan the body of a CDATA section.
+* Input:
+*	p is just past the opening "<![CDATA["
+* Output:
+*	p is past the closing "]]>", or on the 0 / 0x1A byte if the input
+*	ended first (no error is reported for that)
+* Every line separator recognised by isLineSeparator() is written to dbuf
+* as '\n' and counted in linnum.  Other bytes are copied to dbuf only
+* while inCode is nonzero.
+*/
+void Html::scanCDATA()
+{
+    for (;;)
+    {
+        if (*p == 0 || *p == 0x1A)
+            return;             // input ended inside the section
+        const int sepLen = isLineSeparator(p);
+        if (sepLen > 0)
+        {
+            /* Always extract new lines, so that D lexer counts the lines
+             * right.
+             */
+            linnum++;
+            dbuf->writeUTF8('\n');
+            p += sepLen;
+            continue;
+        }
+        if (p[0] == ']' && p[1] == ']' && p[2] == '>')
+        {
+            /* end of CDATA section */
+            p += 3;
+            return;
+        }
+        if (inCode)
+        {
+            /* this CDATA section contains D code */
+            dbuf->writeByte(*p);
+        }
+        p++;
+    }
+}
+/********************************************
+* Convert an HTML character entity into a character.
+* Forms are:
+*	&name;		named entity
+*	&#ddd;		decimal
+*	&#xhhhh;	hex
+* Input:
+*	p is on the &
+* Returns:
+*	the decoded character, with 160 (non-breaking space) turned into
+*	' '; or '&' when the entity is invalid, out of range, or a name
+*	for which HtmlNamedEntity() returns -1
+* Output:
+*	on success p is past the entity: past the ';' if there was one,
+*	otherwise on the '\n', '\r' or '<' that ended it.
+*	on failure (the Lignore exit) p is just past the '&', so the
+*	caller goes on to treat the following bytes as ordinary text.
+* NOTE(review): a named entity cut off by end of file reports an error
+* but returns 0 with p on the end-of-file byte, and a numeric entity
+* with no digits that is ended by '\n', '\r' or '<' returns 0 without an
+* error -- confirm whether either is intended.
+*/
+int Html::charEntity()
+{   int c = 0;		// value decoded so far
+int v;			// value of the digit just read
+int hex;		// nonzero for the &#x...; form
+unsigned char *pstart = p;	// the '&', used to back out at Lignore
+//printf("Html::charEntity('%c')\n", *p);
+if (p[1] == '#')
+{
+	p++;		// onto the '#'
+	if (p[1] == 'x' || p[1] == 'X')
+	{   p++;
+	    hex = 1;
+	}
+	else
+	    hex = 0;
+	if (p[1] == ';')
+	    goto Linvalid;	// "&#;" or "&#x;": no digits at all
+	while (1)
+	{
+	    p++;
+	    switch (*p)
+	    {
+		case 0:
+		case 0x1a:
+		    error("end of file before end of character entity");
+		    goto Lignore;
+		case '\n':
+		case '\r':
+		case '<':	// tag start
+		    // Termination is assumed
+		    break;
+		case ';':
+		    // Termination is explicit
+		    p++;
+		    break;
+		case '0': case '1': case '2': case '3': case '4':
+		case '5': case '6': case '7': case '8': case '9':
+		    v = *p - '0';
+		    goto Lvalue;
+		case 'a': case 'b': case 'c':
+		case 'd': case 'e': case 'f':
+		    if (!hex)
+			goto Linvalid;
+		    v = (*p - 'a') + 10;
+		    goto Lvalue;
+		case 'A': case 'B': case 'C':
+		case 'D': case 'E': case 'F':
+		    if (!hex)
+			goto Linvalid;
+		    v = (*p - 'A') + 10;
+		    goto Lvalue;
+		Lvalue:
+		    if (hex)
+			c = (c << 4) + v;
+		    else
+			c = (c * 10) + v;
+		    if (c > 0x10FFFF)	// checked after every digit, so c stays bounded
+		    {
+			error("character entity out of range");
+			goto Lignore;
+		    }
+		    continue;
+		default:
+		Linvalid:
+		    error("invalid numeric character reference");
+		    goto Lignore;
+	    }
+	    break;	// reached from the two termination cases
+	}
+}
+else
+{
+	// It's a named entity; gather all characters until ;
+	unsigned char *idstart = p + 1;
+	while (1)
+	{
+	    p++;
+	    switch (*p)
+	    {
+		case 0:
+		case 0x1a:
+		    error("end of file before end of character entity");
+		    break;	// leaves the loop with c still 0
+		case '\n':
+		case '\r':
+		case '<':	// tag start
+		    // Termination is assumed
+		    c = HtmlNamedEntity(idstart, p - idstart);
+		    if (c == -1)	// handled as "no such entity"
+			goto Lignore;
+		    break;
+		case ';':
+		    // Termination is explicit
+		    c = HtmlNamedEntity(idstart, p - idstart);
+		    if (c == -1)	// handled as "no such entity"
+			goto Lignore;
+		    p++;
+		    break;
+		default:
+		    continue;
+	    }
+	    break;
+	}
+}
+// Kludge to convert non-breaking space to ascii space
+if (c == 160)
+	c = ' ';
+return c;
+Lignore:
+//printf("Lignore\n");
+p = pstart + 1;		// resume just past the '&'
+return '&';
+}
+/**
+* Recognise a line separator at p.  Input is expected to be UTF-8.
+* Handles "\n", "\r\n", a lone "\r", the UTF-8 encodings of U+2028 and
+* U+2029 (line and paragraph separator), and of U+0085 (next line).
+* Returns:
+*	0 if p is not on a line separator
+*	otherwise the length of the separator in bytes (1, 2 or 3)
+*/
+static int isLineSeparator(const unsigned char* p)
+{
+    switch (p[0])
+    {
+        case '\n':              // Linux
+            return 1;
+        case '\r':              // Mac & Dos
+            return (p[1] == '\n') ? 2 : 1;
+        case 0xE2:              // Unicode (line || paragraph sep.)
+            if (p[1] == 0x80 && (p[2] == 0xA8 || p[2] == 0xA9))
+                return 3;
+            return 0;
+        case 0xC2:              // Next
+            return (p[1] == 0x85) ? 2 : 0;
+        default:
+            return 0;
+    }
+}

Mercurial > projects > ldc

comparison dmd/html.c @ 1:c53b6e3fe49a trunk