Mercurial > projects > ldc
view dmd/html.c @ 1117:4c20fcc4252b
Fun with parameter attributes: For several of the "synthetic" parameters added
to D functions, we can apply noalias and nocapture. They are sret parameters,
'nest' pointers passed to nested functions, and _argptr:
Nocapture:
- Sret and nest are nocapture because they don't represent D-level variables,
and thus the callee can't (validly) obtain a pointer to them, let alone keep
it around after it returns.
- _argptr is nocapture because although the callee has access to it as a
pointer, that pointer is invalidated when it returns.
All three are noalias because they're function-local variables:
- Sret and _argptr are noalias because they're freshly alloca'd memory only
used for a single function call that's not allowed to keep an aliasing
pointer to it around (since the parameter is nocapture).
- 'Nest' is noalias because the callee only ever has access to one such pointer
per parent function, and every parent function has a different one.
This commit also ensures attributes set on sret, _arguments and _argptr are
propagated to calls to such functions.
It also adds one exception to the general rule that attributes on function types
should propagate to calls: the type of a delegate's function pointer has a
'nest' parameter, but this can either be a true 'nest' (for delegates to nested
functions) or a 'this' (for delegates to member functions). Since 'this' is
neither noalias nor nocapture, and there's generally no way to tell which one it
is, we remove these attributes at the call site if the callee is a delegate.
author | Frits van Bommel <fvbommel wxs.nl> |
---|---|
date | Sat, 14 Mar 2009 22:15:31 +0100 |
parents | c53b6e3fe49a |
children |
line wrap: on
line source
// Copyright (c) 1999-2006 by Digital Mars // All Rights Reserved // written by Walter Bright // http://www.digitalmars.com // License for redistribution is by either the Artistic License // in artistic.txt, or the GNU General Public License in gnu.txt. // See the included readme.txt for details. /* HTML parser */ #include <stdio.h> #include <string.h> #include <ctype.h> #include <stdarg.h> #include <errno.h> #include <wchar.h> #include "mars.h" #include "html.h" #include <assert.h> #include "root.h" extern int HtmlNamedEntity(unsigned char *p, int length); static int isLineSeparator(const unsigned char* p); /********************************** * Determine if beginning of tag identifier * or a continuation of a tag identifier. */ inline int istagstart(int c) { return (isalpha(c) || c == '_'); } inline int istag(int c) { return (isalnum(c) || c == '_'); } /********************************************** */ Html::Html(const char *sourcename, unsigned char *base, unsigned length) { //printf("Html::Html()\n"); this->sourcename = sourcename; this->base = base; p = base; end = base + length; linnum = 1; dbuf = NULL; inCode = 0; } /********************************************** * Print error & quit. */ void Html::error(const char *format, ...) { if (!global.gag) { printf("%s(%d) : HTML Error: ", sourcename, linnum); va_list ap; va_start(ap, format); vprintf(format, ap); va_end(ap); printf("\n"); fflush(stdout); } global.errors++; } /********************************************** * Extract all the code from an HTML file, * concatenate it all together, and store in buf. */ void Html::extractCode(OutBuffer *buf) { //printf("Html::extractCode()\n"); dbuf = buf; // save for other routines buf->reserve(end - p); inCode = 0; while (1) { //printf("p = %p, *p = x%x\n", p, *p); switch (*p) { #if 0 // strings are not recognized outside of tags case '"': case '\'': skipString(); continue; #endif case '<': if (p[1] == '!' 
&& isCommentStart()) { // Comments start with <!-- scanComment(); } else if(p[1] == '!' && isCDATAStart()) { scanCDATA(); } else if (p[1] == '/' && istagstart(*skipWhite(p + 2))) skipTag(); else if (istagstart(*skipWhite(p + 1))) skipTag(); else goto Ldefault; continue; case 0: case 0x1a: break; // end of file case '&': if (inCode) { // Translate character entity into ascii for D parser int c; c = charEntity(); buf->writeUTF8(c); } else p++; continue; case '\r': if (p[1] == '\n') goto Ldefault; case '\n': linnum++; // Always extract new lines, so that D lexer counts the // lines right. buf->writeByte(*p); p++; continue; default: Ldefault: if (inCode) buf->writeByte(*p); p++; continue; } break; } buf->writeByte(0); // ending sentinel //printf("D code is: '%s'\n", (char *)buf->data); } /*********************************************** * Scan to end of <> tag. * Look for <code> and </code> tags to start/stop D processing. * Input: * p is on opening '<' of tag; it's already verified that * it's a tag by lookahead * Output: * p is past closing '>' of tag */ void Html::skipTag() { enum TagState // what parsing state we're in { TStagstart, // start of tag name TStag, // in a tag name TSrest, // following tag name }; enum TagState state = TStagstart; int inot; unsigned char *tagstart = NULL; int taglen = 0; p++; inot = 0; if (*p == '/') { inot = 1; p++; } while (1) { switch (*p) { case '>': // found end of tag p++; break; case '"': case '\'': state = TSrest; skipString(); continue; case '<': if (p[1] == '!' 
&& isCommentStart()) { // Comments start with <!-- scanComment(); } else if (p[1] == '/' && istagstart(*skipWhite(p + 2))) { error("nested tag"); skipTag(); } else if (istagstart(*skipWhite(p + 1))) { error("nested tag"); skipTag(); } // Treat comments as if they were whitespace state = TSrest; continue; case 0: case 0x1a: error("end of file before end of tag"); break; // end of file case '\r': if (p[1] == '\n') goto Ldefault; case '\n': linnum++; // Always extract new lines, so that code lexer counts the // lines right. dbuf->writeByte(*p); state = TSrest; // end of tag p++; continue; case ' ': case '\t': case '\f': case '\v': if (state == TStagstart) { p++; continue; } default: Ldefault: switch (state) { case TStagstart: // start of tag name assert(istagstart(*p)); state = TStag; tagstart = p; taglen = 0; break; case TStag: if (istag(*p)) { // Continuing tag name taglen++; } else { // End of tag name state = TSrest; } break; case TSrest: break; } p++; continue; } break; } // See if we parsed a <code> or </code> tag if (taglen && memicmp((char *) tagstart, (char *) "CODE", taglen) == 0 && *(p - 2) != '/') // ignore "<code />" (XHTML) { if (inot) { inCode--; if (inCode < 0) inCode = 0; // ignore extra </code>'s } else inCode++; } } /*********************************************** * Scan to end of attribute string. */ void Html::skipString() { int tc = *p; while (1) { p++; switch (*p) { case '"': case '\'': if (*p == tc) { p++; break; } continue; case '\r': if (p[1] == '\n') goto Ldefault; case '\n': linnum++; // Always extract new lines, so that D lexer counts the // lines right. dbuf->writeByte(*p); continue; case 0: case 0x1a: Leof: error("end of file before closing %c of string", tc); break; default: Ldefault: continue; } break; } } /********************************* * If p points to any white space, skip it * and return pointer just past it. 
*/ unsigned char *Html::skipWhite(unsigned char *q) { for (; 1; q++) { switch (*q) { case ' ': case '\t': case '\f': case '\v': case '\r': case '\n': continue; default: break; } break; } return q; } /*************************************************** * Scan to end of comment. * Comments are defined any of a number of ways. * IE 5.0: <!-- followed by > * "HTML The Definitive Guide": <!-- text with at least one space in it --> * Netscape: <!-- --> comments nest * w3c: whitespace can appear between -- and > of comment close */ void Html::scanComment() { // Most of the complexity is dealing with the case that // an arbitrary amount of whitespace can appear between // the -- and the > of a comment close. int scangt = 0; //printf("scanComment()\n"); if (*p == '\n') { linnum++; // Always extract new lines, so that D lexer counts the // lines right. dbuf->writeByte(*p); } while (1) { //scangt = 1; // IE 5.0 compatibility p++; switch (*p) { case '-': if (p[1] == '-') { if (p[2] == '>') // optimize for most common case { p += 3; break; } p++; scangt = 1; } else scangt = 0; continue; case '>': if (scangt) { // found --> p++; break; } continue; case ' ': case '\t': case '\f': case '\v': // skip white space continue; case '\r': if (p[1] == '\n') goto Ldefault; case '\n': linnum++; // remember to count lines // Always extract new lines, so that D lexer counts the // lines right. dbuf->writeByte(*p); continue; case 0: case 0x1a: error("end of file before closing --> of comment"); break; default: Ldefault: scangt = 0; // it's not --> continue; } break; } //printf("*p = '%c'\n", *p); } /******************************************** * Determine if we are at the start of a comment. 
* Input: * p is on the opening '<' * Returns: * 0 if not start of a comment * 1 if start of a comment, p is adjusted to point past -- */ int Html::isCommentStart() #ifdef __DMC__ __out(result) { if (result == 0) ; else if (result == 1) { assert(p[-2] == '-' && p[-1] == '-'); } else assert(0); } __body #endif /* __DMC__ */ { unsigned char *s; if (p[0] == '<' && p[1] == '!') { for (s = p + 2; 1; s++) { switch (*s) { case ' ': case '\t': case '\r': case '\f': case '\v': // skip white space, even though spec says no // white space is allowed continue; case '-': if (s[1] == '-') { p = s + 2; return 1; } goto No; default: goto No; } } } No: return 0; } int Html::isCDATAStart() { const char * CDATA_START_MARKER = "<![CDATA["; size_t len = strlen(CDATA_START_MARKER); if (strncmp((char*)p, CDATA_START_MARKER, len) == 0) { p += len; return 1; } else { return 0; } } void Html::scanCDATA() { while(*p && *p != 0x1A) { int lineSepLength = isLineSeparator(p); if (lineSepLength>0) { /* Always extract new lines, so that D lexer counts the lines * right. */ linnum++; dbuf->writeUTF8('\n'); p += lineSepLength; continue; } else if (p[0] == ']' && p[1] == ']' && p[2] == '>') { /* end of CDATA section */ p += 3; return; } else if (inCode) { /* this CDATA section contains D code */ dbuf->writeByte(*p); } p++; } } /******************************************** * Convert an HTML character entity into a character. 
* Forms are: * &name; named entity * &#ddd; decimal * &#xhhhh; hex * Input: * p is on the & */ int Html::charEntity() { int c = 0; int v; int hex; unsigned char *pstart = p; //printf("Html::charEntity('%c')\n", *p); if (p[1] == '#') { p++; if (p[1] == 'x' || p[1] == 'X') { p++; hex = 1; } else hex = 0; if (p[1] == ';') goto Linvalid; while (1) { p++; switch (*p) { case 0: case 0x1a: error("end of file before end of character entity"); goto Lignore; case '\n': case '\r': case '<': // tag start // Termination is assumed break; case ';': // Termination is explicit p++; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': v = *p - '0'; goto Lvalue; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': if (!hex) goto Linvalid; v = (*p - 'a') + 10; goto Lvalue; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': if (!hex) goto Linvalid; v = (*p - 'A') + 10; goto Lvalue; Lvalue: if (hex) c = (c << 4) + v; else c = (c * 10) + v; if (c > 0x10FFFF) { error("character entity out of range"); goto Lignore; } continue; default: Linvalid: error("invalid numeric character reference"); goto Lignore; } break; } } else { // It's a named entity; gather all characters until ; unsigned char *idstart = p + 1; while (1) { p++; switch (*p) { case 0: case 0x1a: error("end of file before end of character entity"); break; case '\n': case '\r': case '<': // tag start // Termination is assumed c = HtmlNamedEntity(idstart, p - idstart); if (c == -1) goto Lignore; break; case ';': // Termination is explicit c = HtmlNamedEntity(idstart, p - idstart); if (c == -1) goto Lignore; p++; break; default: continue; } break; } } // Kludge to convert non-breaking space to ascii space if (c == 160) c = ' '; return c; Lignore: //printf("Lignore\n"); p = pstart + 1; return '&'; } /** * identify DOS, Linux, Mac, Next and Unicode line endings * 0 if this is no line separator * >0 the length of the separator * Note: input has to be UTF-8 */ 
static int isLineSeparator(const unsigned char* p)
{
    // Returns the number of bytes of the line separator starting at p,
    // or 0 if there is none. Each test reads a following byte only after
    // the preceding one matched.

    // Linux
    if (p[0] == '\n')
        return 1;

    // Mac & Dos
    if (p[0] == '\r')
        return (p[1] == '\n') ? 2 : 1;

    // Unicode (line || paragraph sep.)
    if (p[0] == 0xE2 && p[1] == 0x80 && (p[2] == 0xA8 || p[2] == 0xA9))
        return 3;

    // Next
    if (p[0] == 0xC2 && p[1] == 0x85)
        return 2;

    return 0;
}