view dmd/html.c @ 1117:4c20fcc4252b

Fun with parameter attributes: For several of the "synthetic" parameters added to D functions, we can apply noalias and nocapture. They are sret parameters, 'nest' pointers passed to nested functions, and _argptr: Nocapture: - Sret and nest are nocapture because they don't represent D-level variables, and thus the callee can't (validly) obtain a pointer to them, let alone keep it around after it returns. - _argptr is nocapture because although the callee has access to it as a pointer, that pointer is invalidated when it returns. All three are noalias because they're function-local variables - Sret and _argptr are noalias because they're freshly alloca'd memory only used for a single function call that's not allowed to keep an aliasing pointer to it around (since the parameter is nocapture). - 'Nest' is noalias because the callee only ever has access to one such pointer per parent function, and every parent function has a different one. This commit also ensures attributes set on sret, _arguments and _argptr are propagated to calls to such functions. It also adds one exception to the general rule that attributes on function types should propagate to calls: the type of a delegate's function pointer has a 'nest' parameter, but this can either be a true 'nest' (for delegates to nested functions) or a 'this' (for delegates to member functions). Since 'this' is neither noalias nor nocapture, and there's generally no way to tell which one it is, we remove these attributes at the call site if the callee is a delegate.
author Frits van Bommel <fvbommel wxs.nl>
date Sat, 14 Mar 2009 22:15:31 +0100
parents c53b6e3fe49a
children
line wrap: on
line source


// Copyright (c) 1999-2006 by Digital Mars
// All Rights Reserved
// written by Walter Bright
// http://www.digitalmars.com
// License for redistribution is by either the Artistic License
// in artistic.txt, or the GNU General Public License in gnu.txt.
// See the included readme.txt for details.


/* HTML parser
 */

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdarg.h>
#include <errno.h>
#include <wchar.h>

#include "mars.h"
#include "html.h"

#include <assert.h>
#include "root.h"

extern int HtmlNamedEntity(unsigned char *p, int length);

static int isLineSeparator(const unsigned char* p);

/**********************************
 * Tell whether c may begin a tag identifier:
 * a letter or an underscore.
 * Returns 1 if so, 0 if not.
 */

inline int istagstart(int c)
{
    if (c == '_')
	return 1;
    return isalpha(c) ? 1 : 0;
}

// Tell whether c may continue a tag identifier:
// a letter, a digit or an underscore.
// Returns 1 if so, 0 if not.
inline int istag(int c)
{
    if (c == '_')
	return 1;
    return isalnum(c) ? 1 : 0;
}

/**********************************************
 * Set up a scanner over the text at base[0 .. length].
 * sourcename is kept for use in error messages.
 */

Html::Html(const char *sourcename, unsigned char *base, unsigned length)
{
    //printf("Html::Html()\n");

    // Where the text came from and where it lives
    this->sourcename = sourcename;
    this->base = base;

    // Scanning starts at the first byte, on line 1
    this->p = base;
    this->end = base + length;
    this->linnum = 1;

    // Nothing is being extracted yet
    this->dbuf = NULL;
    this->inCode = 0;
}

/**********************************************
 * Report an error in the HTML source.
 * The error is always counted in global.errors; the message
 * "sourcename(linnum) : HTML Error: ..." is printed to stdout
 * unless global.gag is set.  Control returns to the caller.
 * Input:
 *	format, ...	printf-style message
 */

void Html::error(const char *format, ...)
{
    // Count the error even when the message is suppressed
    global.errors++;

    if (global.gag)
	return;

    printf("%s(%d) : HTML Error: ", sourcename, linnum);

    va_list args;
    va_start(args, format);
    vprintf(format, args);
    va_end(args);

    printf("\n");
    fflush(stdout);
}

/**********************************************
 * Extract all the code from an HTML file,
 * concatenate it all together, and store in buf.
 *
 * Scanning runs from p until a 0 or 0x1a byte is reached.
 * Ordinary characters are copied to buf only while inCode is
 * nonzero; line endings are copied regardless, so that line
 * numbers in the extracted text match the HTML file.
 * A 0 byte is appended to buf as an ending sentinel.
 * Input:
 *	buf	receives the extracted text; it is also saved in
 *		dbuf for the other scanning routines
 */

void Html::extractCode(OutBuffer *buf)
{
    //printf("Html::extractCode()\n");
    dbuf = buf;			// save for other routines
    buf->reserve(end - p);	// output is sized from the remaining input
    inCode = 0;			// start outside of any code section
    while (1)
    {
	//printf("p = %p, *p = x%x\n", p, *p);
	switch (*p)
	{
#if 0 // strings are not recognized outside of tags
	    case '"':
	    case '\'':
		skipString();
		continue;
#endif
	    case '<':
		// Decide by lookahead what the '<' introduces; each of the
		// routines called here advances p itself.
		if (p[1] == '!' && isCommentStart())
		{   // Comments start with <!--
		    scanComment();
		}
  		else if(p[1] == '!' && isCDATAStart())
  		{
  		    scanCDATA();
  		}
		else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
		    skipTag();		// closing tag, </name
		else if (istagstart(*skipWhite(p + 1)))
		    skipTag();		// opening tag, <name
		else
		    goto Ldefault;	// just a '<' character
		continue;

	    case 0:
	    case 0x1a:
		break;		// end of file

	    case '&':
		if (inCode)
		{   // Translate character entity into ascii for D parser
		    int c;

		    c = charEntity();	// advances p
		    buf->writeUTF8(c);
		}
		else
		    p++;
		continue;

	    case '\r':
		// The CR of a CR LF pair is treated as an ordinary character;
		// the LF that follows does the line counting.
		if (p[1] == '\n')
		    goto Ldefault;
		// a CR on its own falls through and counts as a line ending
	    case '\n':
		linnum++;
		// Always extract new lines, so that D lexer counts the
		// lines right.
		buf->writeByte(*p);
		p++;
		continue;

	    default:
	    Ldefault:
		// Copy the character only when inside a code section
		if (inCode)
		    buf->writeByte(*p);
		p++;
		continue;
	}
	break;		// reached only from the end of file case
    }
    buf->writeByte(0);				// ending sentinel
    //printf("D code is: '%s'\n", (char *)buf->data);
}

/***********************************************
 * Scan to end of <> tag.
 * Look for <code> and </code> tags to start/stop D processing.
 * Only a tag whose name is exactly "code" (in any letter case)
 * counts; the XHTML empty form <code /> is ignored.
 * Line endings inside the tag are still written to dbuf so that
 * the code lexer counts the lines right.
 * Input:
 *	p is on opening '<' of tag; it's already verified that
 *	it's a tag by lookahead
 * Output:
 *	p is past closing '>' of tag, or on the end of file
 *	character if the tag was never closed
 */

void Html::skipTag()
{
    enum TagState	// what parsing state we're in
    {
	TStagstart,	// start of tag name
	TStag,		// in a tag name
	TSrest,		// following tag name
    };
    enum TagState state = TStagstart;
    int inot;				// nonzero for a closing tag, </name>
    unsigned char *tagstart = NULL;	// first character of the tag name
    int taglen = 0;			// number of characters in the tag name

    p++;				// step over the opening '<'
    inot = 0;
    if (*p == '/')
    {	inot = 1;
	p++;
    }
    while (1)
    {
	switch (*p)
	{
	    case '>':		// found end of tag
		p++;
		break;

	    case '"':
	    case '\'':
		state = TSrest;
		skipString();
		continue;

	    case '<':
		if (p[1] == '!' && isCommentStart())
		{   // Comments start with <!--
		    scanComment();
		}
		else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
		{   error("nested tag");
		    skipTag();
		}
		else if (istagstart(*skipWhite(p + 1)))
		{   error("nested tag");
		    skipTag();
		}
		else
		{   // A '<' that starts neither a comment nor a tag.
		    // Step over it; every other branch above advances p,
		    // and without this the loop would never make progress.
		    p++;
		}
		// Treat comments as if they were whitespace
		state = TSrest;
		continue;

	    case 0:
	    case 0x1a:
		error("end of file before end of tag");
		break;		// end of file

	    case '\r':
		// The CR of a CR LF pair is an ordinary character here;
		// the LF that follows does the line counting.
		if (p[1] == '\n')
		    goto Ldefault;
		// a CR on its own falls through and counts as a line ending
	    case '\n':
		linnum++;
		// Always extract new lines, so that code lexer counts the
		// lines right.
		dbuf->writeByte(*p);
		state = TSrest;			// end of tag
		p++;
		continue;

	    case ' ':
	    case '\t':
	    case '\f':
	    case '\v':
		// White space before the tag name is skipped; anywhere
		// else it falls through and ends the name.
		if (state == TStagstart)
		{   p++;
		    continue;
		}
	    default:
	    Ldefault:
		switch (state)
		{
		    case TStagstart:		// start of tag name
			if (istagstart(*p))
			{
			    state = TStag;
			    tagstart = p;
			    taglen = 1;		// the first character is part of the name
			}
			else
			{   // Not a name character, e.g. the CR of a CR LF
			    // pair between '<' and the name.  Give up on the
			    // name, as is done for a bare LF.
			    state = TSrest;
			}
			break;

		    case TStag:
			if (istag(*p))
			{   // Continuing tag name
			    taglen++;
			}
			else
			{   // End of tag name
			    state = TSrest;
			}
			break;

		    case TSrest:
			break;
		}
		p++;
		continue;
	}
	break;		// reached from the '>' and end of file cases
    }

    // See if we parsed a <code> or </code> tag.
    // The whole name has to match: comparing just a prefix would let
    // tags such as <col> switch code extraction on and off.
    if (taglen == 4 && memicmp((char *) tagstart, (char *) "CODE", 4) == 0
	&& *(p - 2) != '/') // ignore "<code />" (XHTML)
    {
	if (inot)
	{   inCode--;
	    if (inCode < 0)
		inCode = 0;		// ignore extra </code>'s
	}
	else
	    inCode++;
    }
}

/***********************************************
 * Scan to end of attribute string.
 */

void Html::skipString()
{
    int tc = *p;

    while (1)
    {
	p++;
	switch (*p)
	{
	    case '"':
	    case '\'':
		if (*p == tc)
		{   p++;
		    break;
		}
		continue;

	    case '\r':
		if (p[1] == '\n')
		    goto Ldefault;
	    case '\n':
		linnum++;
		// Always extract new lines, so that D lexer counts the
		// lines right.
		dbuf->writeByte(*p);
		continue;

	    case 0:
	    case 0x1a:
	    Leof:
		error("end of file before closing %c of string", tc);
		break;

	    default:
	    Ldefault:
		continue;
	}
	break;
    }
}

/*********************************
 * Skip over any white space starting at q (space, tab, form feed,
 * vertical tab, CR, LF) and return a pointer to the first
 * character that is not white space.
 */

unsigned char *Html::skipWhite(unsigned char *q)
{
    while (*q == ' '  || *q == '\t' || *q == '\f' ||
	   *q == '\v' || *q == '\r' || *q == '\n')
    {
	q++;
    }
    return q;
}

/***************************************************
 * Scan to end of comment.
 * Comments are defined any of a number of ways.
 * IE 5.0: <!-- followed by >
 * "HTML The Definitive Guide": <!-- text with at least one space in it -->
 * Netscape: <!-- --> comments nest
 * w3c: whitespace can appear between -- and > of comment close
 *
 * The rule implemented here is the w3c one: the comment ends at
 * "--", optional white space (including line endings), then ">".
 * Line endings inside the comment are written to dbuf so that
 * the D lexer counts the lines right.
 * Input:
 *	p is just past the "--" that opens the comment
 * Output:
 *	p is past the closing '>', or on the end of file character
 *	if the comment was never closed
 */

void Html::scanComment()
{
    // Most of the complexity is dealing with the case that
    // an arbitrary amount of whitespace can appear between
    // the -- and the > of a comment close.
    int scangt = 0;		// nonzero once "--" has been seen

    //printf("scanComment()\n");

    // The character p starts on goes through the same switch as all
    // the others, so an end of file, a line ending or a closing "-->"
    // directly after the opening "<!--" is recognized.
    for (;; p++)
    {
	//scangt = 1;			// IE 5.0 compatibility
	switch (*p)
	{
	    case '-':
		if (p[1] == '-')
		{
		    if (p[2] == '>')	// optimize for most common case
		    {
			p += 3;
			break;
		    }
		    p++;		// the loop steps over the second '-'
		    scangt = 1;
		}
		else
		    scangt = 0;
		continue;

	    case '>':
		if (scangt)
		{   // found -->
		    p++;
		    break;
		}
		continue;

	    case ' ':
	    case '\t':
	    case '\f':
	    case '\v':
		// skip white space
		continue;

	    case '\r':
		// The CR of a CR LF pair is white space; the LF that
		// follows is the one that gets counted and written.
		if (p[1] == '\n')
		    continue;
		// a CR on its own falls through and counts as a line ending
	    case '\n':
		linnum++;		// remember to count lines
		// Always extract new lines, so that D lexer counts the
		// lines right.
		dbuf->writeByte(*p);
		continue;

	    case 0:
	    case 0x1a:
		error("end of file before closing --> of comment");
		break;

	    default:
		scangt = 0;		// it's not -->
		continue;
	}
	break;		// comment closed, or end of file
    }
    //printf("*p = '%c'\n", *p);
}

/********************************************
 * Determine if we are at the start of a comment, that is "<!"
 * followed by "--".  White space other than LF is tolerated
 * between the two.
 * Input:
 *	p is on the opening '<'
 * Returns:
 *	0 if not start of a comment, p is unchanged
 * 	1 if start of a comment, p is adjusted to point past --
 */

int Html::isCommentStart()
#ifdef __DMC__
    __out(result)
    {
	if (result == 0)
	    ;
	else if (result == 1)
	{
	    assert(p[-2] == '-' && p[-1] == '-');
	}
	else
	    assert(0);
    }
    __body
#endif /* __DMC__ */
    {
	// Must be sitting on "<!"
	if (p[0] != '<' || p[1] != '!')
	    return 0;

	// skip white space, even though spec says no
	// white space is allowed
	unsigned char *q = p + 2;
	while (*q == ' ' || *q == '\t' || *q == '\r' ||
	       *q == '\f' || *q == '\v')
	{
	    q++;
	}

	// Anything but "--" at this point means it is not a comment
	if (q[0] == '-' && q[1] == '-')
	{
	    p = q + 2;
	    return 1;
	}
	return 0;
    }

/********************************************
 * Determine if we are at the start of a CDATA section.
 * Input:
 *	p is on the opening '<'
 * Returns:
 *	0 if not start of a CDATA section, p is unchanged
 *	1 if p was on "<![CDATA[", p is adjusted to point past it
 */

int Html::isCDATAStart()
{
    static const char marker[] = "<![CDATA[";
    const size_t markerLen = sizeof(marker) - 1;	// without the terminating 0

    if (strncmp((char*)p, marker, markerLen) != 0)
	return 0;

    p += markerLen;
    return 1;
}

/********************************************
 * Scan to end of a CDATA section.
 * Every line separator is written to dbuf as a single '\n'; other
 * characters are written only while inCode is nonzero.
 * Input:
 *	p is just past the opening "<![CDATA["
 * Output:
 *	p is past the closing "]]>", or on the end of file character
 *	if the section was never closed (no error is reported for that)
 */

void Html::scanCDATA()
{
    for (;;)
    {
	if (*p == 0 || *p == 0x1A)
	    return;		// end of file inside the section

	const int sepLen = isLineSeparator(p);
	if (sepLen > 0)
	{
	    /* Always extract new lines, so that D lexer counts the lines
	     * right.
	     */
	    linnum++;
	    dbuf->writeUTF8('\n');
	    p += sepLen;
	}
	else if (p[0] == ']' && p[1] == ']' && p[2] == '>')
	{
	    /* end of CDATA section */
	    p += 3;
	    return;
	}
	else
	{
	    /* inside <code>, the CDATA section contains D code */
	    if (inCode)
		dbuf->writeByte(*p);
	    p++;
	}
    }
}

/********************************************
 * Convert an HTML character entity into a character.
 * Forms are:
 *	&name;		named entity
 *	&#ddd;		decimal
 *	&#xhhhh;	hex
 * The entity ends at ';' (which is consumed) or at a CR, LF or '<'
 * (which is left for the caller).
 * Input:
 *	p is on the &
 * Returns:
 *	the character value of the entity, with 160 (non-breaking
 *	space) converted to ' '.
 *	If the entity is rejected, '&' is returned and p is set just
 *	past the '&', so the rest of the text is scanned as ordinary
 *	characters.
 */

int Html::charEntity()
{   int c = 0;			// value accumulated so far / result
    int v;			// value of the current digit
    int hex;			// nonzero for the &#x form
    unsigned char *pstart = p;	// position of the '&', for Lignore

    //printf("Html::charEntity('%c')\n", *p);
    if (p[1] == '#')
    {
	// Numeric character reference
	p++;
	if (p[1] == 'x' || p[1] == 'X')
	{   p++;
	    hex = 1;
	}
	else
	    hex = 0;
	// "&#;" and "&#x;" have no digits at all
	if (p[1] == ';')
	    goto Linvalid;
	// NOTE(review): a reference with no digits that is ended by CR,
	// LF or '<' instead of ';' is not caught here and appears to
	// yield c == 0 -- confirm whether that is intended.
	while (1)
	{
	    p++;
	    switch (*p)
	    {
		case 0:
		case 0x1a:
		    error("end of file before end of character entity");
		    goto Lignore;

		case '\n':
		case '\r':
		case '<':	// tag start
		    // Termination is assumed
		    break;

		case ';':
		    // Termination is explicit
		    p++;
		    break;

		case '0': case '1': case '2': case '3': case '4':
		case '5': case '6': case '7': case '8': case '9':
		    v = *p - '0';
		    goto Lvalue;

		case 'a': case 'b': case 'c':
		case 'd': case 'e': case 'f':
		    if (!hex)
			goto Linvalid;
		    v = (*p - 'a') + 10;
		    goto Lvalue;

		case 'A': case 'B': case 'C':
		case 'D': case 'E': case 'F':
		    if (!hex)
			goto Linvalid;
		    v = (*p - 'A') + 10;
		    goto Lvalue;

		Lvalue:
		    // Append the digit in the reference's base
		    if (hex)
			c = (c << 4) + v;
		    else
			c = (c * 10) + v;
		    // Checked after every digit, so c cannot grow much
		    // beyond the limit before the scan stops
		    if (c > 0x10FFFF)
		    {
			error("character entity out of range");
			goto Lignore;
		    }
		    continue;

		default:
		Linvalid:
		    // Also the target of the gotos above
		    error("invalid numeric character reference");
		    goto Lignore;
	    }
	    break;	// reference ended
	}
    }
    else
    {
	// It's a named entity; gather all characters until ;
	unsigned char *idstart = p + 1;

	while (1)
	{
	    p++;
	    switch (*p)
	    {
		case 0:
		case 0x1a:
		    error("end of file before end of character entity");
		    // NOTE(review): unlike the numeric case this does not
		    // go to Lignore; c is still 0 and p stays on the end of
		    // file character -- confirm whether that is intended.
		    break;

		case '\n':
		case '\r':
		case '<':	// tag start
		    // Termination is assumed
		    // presumably -1 means the name is not a known entity;
		    // verify against HtmlNamedEntity
		    c = HtmlNamedEntity(idstart, p - idstart);
		    if (c == -1)
			goto Lignore;
		    break;

		case ';':
		    // Termination is explicit
		    c = HtmlNamedEntity(idstart, p - idstart);
		    if (c == -1)
			goto Lignore;
		    p++;
		    break;

		default:
		    // any other character is taken as part of the name
		    continue;
	    }
	    break;	// entity ended
	}
    }

    // Kludge to convert non-breaking space to ascii space
    if (c == 160)
	c = ' ';

    return c;

Lignore:
    // Not an entity after all: hand back the '&' itself and resume
    // scanning at the character that follows it
    //printf("Lignore\n");
    p = pstart + 1;
    return '&';
}

/**
 * Identify DOS, Linux, Mac, Next and Unicode line endings at p.
 * Returns:
 *	0 if p is not on a line separator
 *	>0 the length in bytes of the separator
 * Note: input has to be UTF-8
 */
static int isLineSeparator(const unsigned char* p)
{
    switch (p[0])
    {
	case '\n':		// Linux
	    return 1;

	case '\r':		// Dos when an LF follows, otherwise Mac
	    return (p[1] == '\n') ? 2 : 1;

	case 0xE2:		// Unicode (line || paragraph sep.)
	    if (p[1] == 0x80 && (p[2] == 0xA8 || p[2] == 0xA9))
		return 3;
	    return 0;

	case 0xC2:		// Next
	    return (p[1] == 0x85) ? 2 : 0;

	default:
	    return 0;
    }
}