view dmd/html.c @ 650:aa6a0b7968f7

Added test case for bug #100 Removed dubious check for not emitting static private global in other modules without access. This should be handled properly somewhere else, it's causing unresolved global errors for stuff that should work (in MiniD)
author Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date Sun, 05 Oct 2008 17:28:15 +0200
parents c53b6e3fe49a
children
line wrap: on
line source


// Copyright (c) 1999-2006 by Digital Mars
// All Rights Reserved
// written by Walter Bright
// http://www.digitalmars.com
// License for redistribution is by either the Artistic License
// in artistic.txt, or the GNU General Public License in gnu.txt.
// See the included readme.txt for details.


/* HTML parser
 */

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdarg.h>
#include <errno.h>
#include <wchar.h>

#include "mars.h"
#include "html.h"

#include <assert.h>
#include "root.h"

extern int HtmlNamedEntity(unsigned char *p, int length);

static int isLineSeparator(const unsigned char* p);

/**********************************
 * Determine if beginning of tag identifier
 * or a continuation of a tag identifier.
 */

inline int istagstart(int c)
{
    return (isalpha(c) || c == '_');
}

inline int istag(int c)
{
    return (isalnum(c) || c == '_');
}

/**********************************************
 */

Html::Html(const char *sourcename, unsigned char *base, unsigned length)
{
    //printf("Html::Html()\n");
    this->sourcename = sourcename;
    this->base = base;
    p = base;
    end = base + length;
    linnum = 1;
    dbuf = NULL;
    inCode = 0;
}

/**********************************************
 * Print error & quit.
 */

void Html::error(const char *format, ...)
{
    if (!global.gag)
    {
	printf("%s(%d) : HTML Error: ", sourcename, linnum);

	va_list ap;
	va_start(ap, format);
	vprintf(format, ap);
	va_end(ap);

	printf("\n");
	fflush(stdout);
    }

    global.errors++;
}

/**********************************************
 * Extract all the code from an HTML file,
 * concatenate it all together, and store in buf.
 */

void Html::extractCode(OutBuffer *buf)
{
    //printf("Html::extractCode()\n");
    dbuf = buf;			// save for other routines
    buf->reserve(end - p);
    inCode = 0;
    while (1)
    {
	//printf("p = %p, *p = x%x\n", p, *p);
	switch (*p)
	{
#if 0 // strings are not recognized outside of tags
	    case '"':
	    case '\'':
		skipString();
		continue;
#endif
	    case '<':
		if (p[1] == '!' && isCommentStart())
		{   // Comments start with <!--
		    scanComment();
		}
  		else if(p[1] == '!' && isCDATAStart())
  		{
  		    scanCDATA();
  		}
		else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
		    skipTag();
		else if (istagstart(*skipWhite(p + 1)))
		    skipTag();
		else
		    goto Ldefault;
		continue;

	    case 0:
	    case 0x1a:
		break;		// end of file

	    case '&':
		if (inCode)
		{   // Translate character entity into ascii for D parser
		    int c;

		    c = charEntity();
		    buf->writeUTF8(c);
		}
		else
		    p++;
		continue;

	    case '\r':
		if (p[1] == '\n')
		    goto Ldefault;
	    case '\n':
		linnum++;
		// Always extract new lines, so that D lexer counts the
		// lines right.
		buf->writeByte(*p);
		p++;
		continue;

	    default:
	    Ldefault:
		if (inCode)
		    buf->writeByte(*p);
		p++;
		continue;
	}
	break;
    }
    buf->writeByte(0);				// ending sentinel
    //printf("D code is: '%s'\n", (char *)buf->data);
}

/***********************************************
 * Scan to end of <> tag.
 * Look for <code> and </code> tags to start/stop D processing.
 * Input:
 *	p is on opening '<' of tag; it's already verified that
 *	it's a tag by lookahead
 * Output:
 *	p is past closing '>' of tag
 */

void Html::skipTag()
{
    enum TagState	// what parsing state we're in
    {
	TStagstart,	// start of tag name
	TStag,		// in a tag name
	TSrest,		// following tag name
    };
    enum TagState state = TStagstart;
    int inot;
    unsigned char *tagstart = NULL;
    int taglen = 0;

    p++;
    inot = 0;
    if (*p == '/')
    {	inot = 1;
	p++;
    }
    while (1)
    {
	switch (*p)
	{
	    case '>':		// found end of tag
		p++;
		break;

	    case '"':
	    case '\'':
		state = TSrest;
		skipString();
		continue;

	    case '<':
		if (p[1] == '!' && isCommentStart())
		{   // Comments start with <!--
		    scanComment();
		}
		else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
		{   error("nested tag");
		    skipTag();
		}
		else if (istagstart(*skipWhite(p + 1)))
		{   error("nested tag");
		    skipTag();
		}
		// Treat comments as if they were whitespace
		state = TSrest;
		continue;

	    case 0:
	    case 0x1a:
		error("end of file before end of tag");
		break;		// end of file

	    case '\r':
		if (p[1] == '\n')
		    goto Ldefault;
	    case '\n':
		linnum++;
		// Always extract new lines, so that code lexer counts the
		// lines right.
		dbuf->writeByte(*p);
		state = TSrest;			// end of tag
		p++;
		continue;

	    case ' ':
	    case '\t':
	    case '\f':
	    case '\v':
		if (state == TStagstart)
		{   p++;
		    continue;
		}
	    default:
	    Ldefault:
		switch (state)
		{
		    case TStagstart:		// start of tag name
			assert(istagstart(*p));
			state = TStag;
			tagstart = p;
			taglen = 0;
			break;

		    case TStag:
			if (istag(*p))
			{   // Continuing tag name
			    taglen++;
			}
			else
			{   // End of tag name
			    state = TSrest;
			}
			break;

		    case TSrest:
			break;
		}
		p++;
		continue;
	}
	break;
    }

    // See if we parsed a <code> or </code> tag
    if (taglen && memicmp((char *) tagstart, (char *) "CODE", taglen) == 0
	&& *(p - 2) != '/') // ignore "<code />" (XHTML)
    {
	if (inot)
	{   inCode--;
	    if (inCode < 0)
		inCode = 0;		// ignore extra </code>'s
	}
	else
	    inCode++;
    }
}

/***********************************************
 * Scan to end of attribute string.
 */

void Html::skipString()
{
    int tc = *p;

    while (1)
    {
	p++;
	switch (*p)
	{
	    case '"':
	    case '\'':
		if (*p == tc)
		{   p++;
		    break;
		}
		continue;

	    case '\r':
		if (p[1] == '\n')
		    goto Ldefault;
	    case '\n':
		linnum++;
		// Always extract new lines, so that D lexer counts the
		// lines right.
		dbuf->writeByte(*p);
		continue;

	    case 0:
	    case 0x1a:
	    Leof:
		error("end of file before closing %c of string", tc);
		break;

	    default:
	    Ldefault:
		continue;
	}
	break;
    }
}

/*********************************
 * If p points to any white space, skip it
 * and return pointer just past it.
 */

unsigned char *Html::skipWhite(unsigned char *q)
{
    for (; 1; q++)
    {
	switch (*q)
	{
	    case ' ':
	    case '\t':
	    case '\f':
	    case '\v':
	    case '\r':
	    case '\n':
		continue;

	    default:
		break;
	}
	break;
    }
    return q;
}

/***************************************************
 * Scan to end of comment.
 * Comments are defined any of a number of ways.
 * IE 5.0: <!-- followed by >
 * "HTML The Definitive Guide": <!-- text with at least one space in it -->
 * Netscape: <!-- --> comments nest
 * w3c: whitespace can appear between -- and > of comment close
 */

void Html::scanComment()
{
    // Most of the complexity is dealing with the case that
    // an arbitrary amount of whitespace can appear between
    // the -- and the > of a comment close.
    int scangt = 0;

    //printf("scanComment()\n");
    if (*p == '\n')
    {	linnum++;
	// Always extract new lines, so that D lexer counts the
	// lines right.
	dbuf->writeByte(*p);
    }
    while (1)
    {
	//scangt = 1;			// IE 5.0 compatibility
	p++;
	switch (*p)
	{
	    case '-':
		if (p[1] == '-')
		{
		    if (p[2] == '>')	// optimize for most common case
		    {
			p += 3;
			break;
		    }
		    p++;
		    scangt = 1;
		}
		else
		    scangt = 0;
		continue;

	    case '>':
		if (scangt)
		{   // found -->
		    p++;
		    break;
		}
		continue;

	    case ' ':
	    case '\t':
	    case '\f':
	    case '\v':
		// skip white space
		continue;

	    case '\r':
		if (p[1] == '\n')
		    goto Ldefault;
	    case '\n':
		linnum++;		// remember to count lines
		// Always extract new lines, so that D lexer counts the
		// lines right.
		dbuf->writeByte(*p);
		continue;

	    case 0:
	    case 0x1a:
		error("end of file before closing --> of comment");
		break;

	    default:
	    Ldefault:
		scangt = 0;		// it's not -->
		continue;
	}
	break;
    }
    //printf("*p = '%c'\n", *p);
}

/********************************************
 * Determine if we are at the start of a comment.
 * Input:
 *	p is on the opening '<'
 * Returns:
 *	0 if not start of a comment
 * 	1 if start of a comment, p is adjusted to point past --
 */

int Html::isCommentStart()
#ifdef __DMC__
    __out(result)
    {
	if (result == 0)
	    ;
	else if (result == 1)
	{
	    assert(p[-2] == '-' && p[-1] == '-');
	}
	else
	    assert(0);
    }
    __body
#endif /* __DMC__ */
    {	unsigned char *s;

	if (p[0] == '<' && p[1] == '!')
	{
	    for (s = p + 2; 1; s++)
	    {
		switch (*s)
		{
		    case ' ':
		    case '\t':
		    case '\r':
		    case '\f':
		    case '\v':
			// skip white space, even though spec says no
			// white space is allowed
			continue;

		    case '-':
			if (s[1] == '-')
			{
			    p = s + 2;
			    return 1;
			}
			goto No;

		    default:
			goto No;
		}
	    }
	}
    No:
	return 0;
    }

int Html::isCDATAStart()
{
    const char * CDATA_START_MARKER = "<![CDATA[";
    size_t len = strlen(CDATA_START_MARKER);

    if (strncmp((char*)p, CDATA_START_MARKER, len) == 0)
    {
	p += len;
	return 1;
    }
    else
    {
	return 0;
    }
}

void Html::scanCDATA()
{
    while(*p && *p != 0x1A)
    {
	int lineSepLength = isLineSeparator(p);
	if (lineSepLength>0)
	{
	    /* Always extract new lines, so that D lexer counts the lines
	     * right.
	     */
	    linnum++;
	    dbuf->writeUTF8('\n');
	    p += lineSepLength;
	    continue;
        }
	else if (p[0] == ']' && p[1] == ']' && p[2] == '>')
	{
	    /* end of CDATA section */
	    p += 3;
	    return;
	}
	else if (inCode)
	{
	    /* this CDATA section contains D code */
	    dbuf->writeByte(*p);
	}

	p++;
    }
}

/********************************************
 * Convert an HTML character entity into a character.
 * Forms are:
 *	&name;		named entity
 *	&#ddd;		decimal
 *	&#xhhhh;	hex
 * Input:
 *	p is on the &
 */

int Html::charEntity()
{   int c = 0;
    int v;
    int hex;
    unsigned char *pstart = p;

    //printf("Html::charEntity('%c')\n", *p);
    if (p[1] == '#')
    {
	p++;
	if (p[1] == 'x' || p[1] == 'X')
	{   p++;
	    hex = 1;
	}
	else
	    hex = 0;
	if (p[1] == ';')
	    goto Linvalid;
	while (1)
	{
	    p++;
	    switch (*p)
	    {
		case 0:
		case 0x1a:
		    error("end of file before end of character entity");
		    goto Lignore;

		case '\n':
		case '\r':
		case '<':	// tag start
		    // Termination is assumed
		    break;

		case ';':
		    // Termination is explicit
		    p++;
		    break;

		case '0': case '1': case '2': case '3': case '4':
		case '5': case '6': case '7': case '8': case '9':
		    v = *p - '0';
		    goto Lvalue;

		case 'a': case 'b': case 'c':
		case 'd': case 'e': case 'f':
		    if (!hex)
			goto Linvalid;
		    v = (*p - 'a') + 10;
		    goto Lvalue;

		case 'A': case 'B': case 'C':
		case 'D': case 'E': case 'F':
		    if (!hex)
			goto Linvalid;
		    v = (*p - 'A') + 10;
		    goto Lvalue;

		Lvalue:
		    if (hex)
			c = (c << 4) + v;
		    else
			c = (c * 10) + v;
		    if (c > 0x10FFFF)
		    {
			error("character entity out of range");
			goto Lignore;
		    }
		    continue;

		default:
		Linvalid:
		    error("invalid numeric character reference");
		    goto Lignore;
	    }
	    break;
	}
    }
    else
    {
	// It's a named entity; gather all characters until ;
	unsigned char *idstart = p + 1;

	while (1)
	{
	    p++;
	    switch (*p)
	    {
		case 0:
		case 0x1a:
		    error("end of file before end of character entity");
		    break;

		case '\n':
		case '\r':
		case '<':	// tag start
		    // Termination is assumed
		    c = HtmlNamedEntity(idstart, p - idstart);
		    if (c == -1)
			goto Lignore;
		    break;

		case ';':
		    // Termination is explicit
		    c = HtmlNamedEntity(idstart, p - idstart);
		    if (c == -1)
			goto Lignore;
		    p++;
		    break;

		default:
		    continue;
	    }
	    break;
	}
    }

    // Kludge to convert non-breaking space to ascii space
    if (c == 160)
	c = ' ';

    return c;

Lignore:
    //printf("Lignore\n");
    p = pstart + 1;
    return '&';
}

/**
 * identify DOS, Linux, Mac, Next and Unicode line endings
 * 0 if this is no line separator
 * >0 the length of the separator
 * Note: input has to be UTF-8
 */
static int isLineSeparator(const unsigned char* p)
{
    // Linux
    if( p[0]=='\n')
	return 1;

    // Mac & Dos
    if( p[0]=='\r')
	return (p[1]=='\n') ? 2 : 1;

    // Unicode (line || paragraph sep.)
    if( p[0]==0xE2 && p[1]==0x80 && (p[2]==0xA8 || p[2]==0xA9))
	return 3;

    // Next
    if( p[0]==0xC2 && p[1]==0x85)
	return 2;

    return 0;
}