Mercurial > projects > ldc
diff dmd2/lexer.c @ 758:f04dde6e882c
Added initial D2 support, D2 frontend and changes to codegen to make things compile.
author | Tomas Lindquist Olsen <tomas.l.olsen@gmail.com> |
---|---|
date | Tue, 11 Nov 2008 01:38:48 +0100 |
parents | |
children | 356e65836fb5 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dmd2/lexer.c Tue Nov 11 01:38:48 2008 +0100 @@ -0,0 +1,3094 @@ + +// Compiler implementation of the D programming language +// Copyright (c) 1999-2008 by Digital Mars +// All Rights Reserved +// written by Walter Bright +// http://www.digitalmars.com +// License for redistribution is by either the Artistic License +// in artistic.txt, or the GNU General Public License in gnu.txt. +// See the included readme.txt for details. + +/* Lexical Analyzer */ + +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <stdarg.h> +#include <errno.h> +#include <wchar.h> +#include <stdlib.h> +#include <assert.h> +#include <sys/time.h> +#include <math.h> + +#ifdef IN_GCC + +#include <time.h> +#include "mem.h" + +#else + +#if __GNUC__ +#include <time.h> +#endif + +#if IN_LLVM +#include "mem.h" +#elif _WIN32 +#include "..\root\mem.h" +#else +#include "../root/mem.h" +#endif +#endif + +#include "stringtable.h" + +#include "lexer.h" +#include "utf.h" +#include "identifier.h" +#include "id.h" +#include "module.h" + +#if _WIN32 && __DMC__ +// from \dm\src\include\setlocal.h +extern "C" char * __cdecl __locale_decpoint; +#endif + +extern int HtmlNamedEntity(unsigned char *p, int length); + +#define LS 0x2028 // UTF line separator +#define PS 0x2029 // UTF paragraph separator + +/******************************************** + * Do our own char maps + */ + +static unsigned char cmtable[256]; + +const int CMoctal = 0x1; +const int CMhex = 0x2; +const int CMidchar = 0x4; + +inline unsigned char isoctal (unsigned char c) { return cmtable[c] & CMoctal; } +inline unsigned char ishex (unsigned char c) { return cmtable[c] & CMhex; } +inline unsigned char isidchar(unsigned char c) { return cmtable[c] & CMidchar; } + +static void cmtable_init() +{ + for (unsigned c = 0; c < sizeof(cmtable) / sizeof(cmtable[0]); c++) + { + if ('0' <= c && c <= '7') + cmtable[c] |= CMoctal; + if (isdigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')) + cmtable[c] |= CMhex; + if (isalnum(c) || c == '_') + cmtable[c] |= CMidchar; + } +} + + +/************************* Token **********************************************/ + +const char *Token::tochars[TOKMAX]; + +void *Token::operator new(size_t size) +{ Token *t; + + if (Lexer::freelist) + { + t = Lexer::freelist; + Lexer::freelist = t->next; + return t; + } + + return ::operator new(size); +} + +#ifdef DEBUG +void Token::print() +{ + fprintf(stdmsg, "%s\n", toChars()); +} +#endif + +const char *Token::toChars() +{ const char *p; + static char buffer[3 + 3 * sizeof(value) + 1]; + + p = buffer; + switch (value) + { + case TOKint32v: +#if IN_GCC + sprintf(buffer,"%d",(d_int32)int64value); +#else + sprintf(buffer,"%d",int32value); +#endif + break; + + case TOKuns32v: + case TOKcharv: + case TOKwcharv: + case TOKdcharv: +#if IN_GCC + sprintf(buffer,"%uU",(d_uns32)uns64value); +#else + sprintf(buffer,"%uU",uns32value); +#endif + break; + + case TOKint64v: + sprintf(buffer,"%lldL",int64value); + break; + + case TOKuns64v: + sprintf(buffer,"%lluUL",uns64value); + break; + +#if IN_GCC + case TOKfloat32v: + case TOKfloat64v: + case TOKfloat80v: + float80value.format(buffer, sizeof(buffer)); + break; + case TOKimaginary32v: + case TOKimaginary64v: + case TOKimaginary80v: + float80value.format(buffer, sizeof(buffer)); + // %% buffer + strcat(buffer, "i"); + break; +#else + case TOKfloat32v: + sprintf(buffer,"%Lgf", float80value); + break; + + case TOKfloat64v: + sprintf(buffer,"%Lg", float80value); + break; + + case TOKfloat80v: + sprintf(buffer,"%LgL", float80value); + break; + + case TOKimaginary32v: + sprintf(buffer,"%Lgfi", float80value); + break; + + case TOKimaginary64v: + sprintf(buffer,"%Lgi", float80value); + break; + + case TOKimaginary80v: + sprintf(buffer,"%LgLi", float80value); + break; +#endif + + case TOKstring: +#if CSTRINGS + p = string; +#else + { OutBuffer buf; + + buf.writeByte('"'); + for (size_t i = 0; i < len; ) + { unsigned c; + + utf_decodeChar((unsigned char *)ustring, len, &i, &c); + switch (c) + { + case 0: + break; + + case '"': + case '\\': + buf.writeByte('\\'); + default: + if (isprint(c)) + buf.writeByte(c); + else if (c <= 0x7F) + buf.printf("\\x%02x", c); + else if (c <= 0xFFFF) + buf.printf("\\u%04x", c); + else + buf.printf("\\U%08x", c); + continue; + } + break; + } + buf.writeByte('"'); + if (postfix) + buf.writeByte('"'); + buf.writeByte(0); + p = (char *)buf.extractData(); + } +#endif + break; + + case TOKidentifier: + case TOKenum: + case TOKstruct: + case TOKimport: + CASE_BASIC_TYPES: + p = ident->toChars(); + break; + + default: + p = toChars(value); + break; + } + return p; +} + +const char *Token::toChars(enum TOK value) +{ const char *p; + static char buffer[3 + 3 * sizeof(value) + 1]; + + p = tochars[value]; + if (!p) + { sprintf(buffer,"TOK%d",value); + p = buffer; + } + return p; +} + +/*************************** Lexer ********************************************/ + +Token *Lexer::freelist = NULL; +StringTable Lexer::stringtable; +OutBuffer Lexer::stringbuffer; + +Lexer::Lexer(Module *mod, + unsigned char *base, unsigned begoffset, unsigned endoffset, + int doDocComment, int commentToken) + : loc(mod, 1) +{ + //printf("Lexer::Lexer(%p,%d)\n",base,length); + //printf("lexer.mod = %p, %p\n", mod, this->loc.mod); + memset(&token,0,sizeof(token)); + this->base = base; + this->end = base + endoffset; + p = base + begoffset; + this->mod = mod; + this->doDocComment = doDocComment; + this->anyToken = 0; + this->commentToken = commentToken; + //initKeywords(); + + /* If first line starts with '#!', ignore the line + */ + + if (p[0] == '#' && p[1] =='!') + { + p += 2; + while (1) + { unsigned char c = *p; + switch (c) + { + case '\n': + p++; + break; + + case '\r': + p++; + if (*p == '\n') + p++; + break; + + case 0: + case 0x1A: + break; + + default: + if (c & 0x80) + { unsigned u = decodeUTF(); + if (u == PS || u == LS) + break; + } + p++; + continue; + } + break; + } + loc.linnum = 2; + } +} + + +void Lexer::error(const char *format, ...) +{ + if (mod && !global.gag) + { + char *p = loc.toChars(); + if (*p) + fprintf(stdmsg, "%s: ", p); + mem.free(p); + + va_list ap; + va_start(ap, format); + vfprintf(stdmsg, format, ap); + va_end(ap); + + fprintf(stdmsg, "\n"); + fflush(stdmsg); + + if (global.errors >= 20) // moderate blizzard of cascading messages + fatal(); + } + global.errors++; +} + +void Lexer::error(Loc loc, const char *format, ...) +{ + if (mod && !global.gag) + { + char *p = loc.toChars(); + if (*p) + fprintf(stdmsg, "%s: ", p); + mem.free(p); + + va_list ap; + va_start(ap, format); + vfprintf(stdmsg, format, ap); + va_end(ap); + + fprintf(stdmsg, "\n"); + fflush(stdmsg); + + if (global.errors >= 20) // moderate blizzard of cascading messages + fatal(); + } + global.errors++; +} + +TOK Lexer::nextToken() +{ Token *t; + + if (token.next) + { + t = token.next; + memcpy(&token,t,sizeof(Token)); + t->next = freelist; + freelist = t; + } + else + { + scan(&token); + } + //token.print(); + return token.value; +} + +Token *Lexer::peek(Token *ct) +{ Token *t; + + if (ct->next) + t = ct->next; + else + { + t = new Token(); + scan(t); + t->next = NULL; + ct->next = t; + } + return t; +} + +/*********************** + * Look ahead at next token's value. + */ + +TOK Lexer::peekNext() +{ + return peek(&token)->value; +} + +/********************************* + * tk is on the opening (. + * Look ahead and return token that is past the closing ). + */ + +Token *Lexer::peekPastParen(Token *tk) +{ + //printf("peekPastParen()\n"); + int parens = 1; + int curlynest = 0; + while (1) + { + tk = peek(tk); + //tk->print(); + switch (tk->value) + { + case TOKlparen: + parens++; + continue; + + case TOKrparen: + --parens; + if (parens) + continue; + tk = peek(tk); + break; + + case TOKlcurly: + curlynest++; + continue; + + case TOKrcurly: + if (--curlynest >= 0) + continue; + break; + + case TOKsemicolon: + if (curlynest) + continue; + break; + + case TOKeof: + break; + + default: + continue; + } + return tk; + } +} + +/********************************** + * Determine if string is a valid Identifier. + * Placed here because of commonality with Lexer functionality. + * Returns: + * 0 invalid + */ + +int Lexer::isValidIdentifier(char *p) +{ + size_t len; + size_t idx; + + if (!p || !*p) + goto Linvalid; + + if (*p >= '0' && *p <= '9') // beware of isdigit() on signed chars + goto Linvalid; + + len = strlen(p); + idx = 0; + while (p[idx]) + { dchar_t dc; + + const char *q = utf_decodeChar((unsigned char *)p, len, &idx, &dc); + if (q) + goto Linvalid; + + if (!((dc >= 0x80 && isUniAlpha(dc)) || isalnum(dc) || dc == '_')) + goto Linvalid; + } + return 1; + +Linvalid: + return 0; +} + +/**************************** + * Turn next token in buffer into a token. + */ + +void Lexer::scan(Token *t) +{ + unsigned lastLine = loc.linnum; + unsigned linnum; + + t->blockComment = NULL; + t->lineComment = NULL; + while (1) + { + t->ptr = p; + //printf("p = %p, *p = '%c'\n",p,*p); + switch (*p) + { + case 0: + case 0x1A: + t->value = TOKeof; // end of file + return; + + case ' ': + case '\t': + case '\v': + case '\f': + p++; + continue; // skip white space + + case '\r': + p++; + if (*p != '\n') // if CR stands by itself + loc.linnum++; + continue; // skip white space + + case '\n': + p++; + loc.linnum++; + continue; // skip white space + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + t->value = number(t); + return; + +#if CSTRINGS + case '\'': + t->value = charConstant(t, 0); + return; + + case '"': + t->value = stringConstant(t,0); + return; + + case 'l': + case 'L': + if (p[1] == '\'') + { + p++; + t->value = charConstant(t, 1); + return; + } + else if (p[1] == '"') + { + p++; + t->value = stringConstant(t, 1); + return; + } +#else + case '\'': + t->value = charConstant(t,0); + return; + + case 'r': + if (p[1] != '"') + goto case_ident; + p++; + case '`': + t->value = wysiwygStringConstant(t, *p); + return; + + case 'x': + if (p[1] != '"') + goto case_ident; + p++; + t->value = hexStringConstant(t); + return; + +#if DMDV2 + case 'q': + if (p[1] == '"') + { + p++; + t->value = delimitedStringConstant(t); + return; + } + else if (p[1] == '{') + { + p++; + t->value = tokenStringConstant(t); + return; + } + else + goto case_ident; +#endif + + case '"': + t->value = escapeStringConstant(t,0); + return; + + case '\\': // escaped string literal + { unsigned c; + + stringbuffer.reset(); + do + { + p++; + switch (*p) + { + case 'u': + case 'U': + case '&': + c = escapeSequence(); + stringbuffer.writeUTF8(c); + break; + + default: + c = escapeSequence(); + stringbuffer.writeByte(c); + break; + } + } while (*p == '\\'); + t->len = stringbuffer.offset; + stringbuffer.writeByte(0); + t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset); + memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); + t->postfix = 0; + t->value = TOKstring; + return; + } + + case 'l': + case 'L': +#endif + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'm': case 'n': case 'o': +#if DMDV2 + case 'p': /*case 'q': case 'r':*/ case 's': case 't': +#else + case 'p': case 'q': /*case 'r':*/ case 's': case 't': +#endif + case 'u': case 'v': case 'w': /*case 'x':*/ case 'y': + case 'z': + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + case '_': + case_ident: + { unsigned char c; + StringValue *sv; + Identifier *id; + + do + { + c = *++p; + } while (isidchar(c) || (c & 0x80 && isUniAlpha(decodeUTF()))); + sv = stringtable.update((char *)t->ptr, p - t->ptr); + id = (Identifier *) sv->ptrvalue; + if (!id) + { id = new Identifier(sv->lstring.string,TOKidentifier); + sv->ptrvalue = id; + } + t->ident = id; + t->value = (enum TOK) id->value; + anyToken = 1; + if (*t->ptr == '_') // if special identifier token + { + static char date[11+1]; + static char time[8+1]; + static char timestamp[24+1]; + + if (!date[0]) // lazy evaluation + { time_t t; + char *p; + + ::time(&t); + p = ctime(&t); + assert(p); + sprintf(date, "%.6s %.4s", p + 4, p + 20); + sprintf(time, "%.8s", p + 11); + sprintf(timestamp, "%.24s", p); + } + +#if DMDV1 + if (mod && id == Id::FILE) + { + t->ustring = (unsigned char *)(loc.filename ? loc.filename : mod->ident->toChars()); + goto Lstring; + } + else if (mod && id == Id::LINE) + { + t->value = TOKint64v; + t->uns64value = loc.linnum; + } + else +#endif + if (id == Id::DATE) + { + t->ustring = (unsigned char *)date; + goto Lstring; + } + else if (id == Id::TIME) + { + t->ustring = (unsigned char *)time; + goto Lstring; + } + else if (id == Id::VENDOR) + { + t->ustring = (unsigned char *)"LDC"; + goto Lstring; + } + else if (id == Id::TIMESTAMP) + { + t->ustring = (unsigned char *)timestamp; + Lstring: + t->value = TOKstring; + Llen: + t->postfix = 0; + t->len = strlen((char *)t->ustring); + } + else if (id == Id::VERSIONX) + { unsigned major = 0; + unsigned minor = 0; + + for (const char *p = global.version + 1; 1; p++) + { + char c = *p; + if (isdigit(c)) + minor = minor * 10 + c - '0'; + else if (c == '.') + { major = minor; + minor = 0; + } + else + break; + } + t->value = TOKint64v; + t->uns64value = major * 1000 + minor; + } +#if DMDV2 + else if (id == Id::EOFX) + { + t->value = TOKeof; + // Advance scanner to end of file + while (!(*p == 0 || *p == 0x1A)) + p++; + } +#endif + } + //printf("t->value = %d\n",t->value); + return; + } + + case '/': + p++; + switch (*p) + { + case '=': + p++; + t->value = TOKdivass; + return; + + case '*': + p++; + linnum = loc.linnum; + while (1) + { + while (1) + { unsigned char c = *p; + switch (c) + { + case '/': + break; + + case '\n': + loc.linnum++; + p++; + continue; + + case '\r': + p++; + if (*p != '\n') + loc.linnum++; + continue; + + case 0: + case 0x1A: + error("unterminated /* */ comment"); + p = end; + t->value = TOKeof; + return; + + default: + if (c & 0x80) + { unsigned u = decodeUTF(); + if (u == PS || u == LS) + loc.linnum++; + } + p++; + continue; + } + break; + } + p++; + if (p[-2] == '*' && p - 3 != t->ptr) + break; + } + if (commentToken) + { + t->value = TOKcomment; + return; + } + else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr) + { // if /** but not /**/ + getDocComment(t, lastLine == linnum); + } + continue; + + case '/': // do // style comments + linnum = loc.linnum; + while (1) + { unsigned char c = *++p; + switch (c) + { + case '\n': + break; + + case '\r': + if (p[1] == '\n') + p++; + break; + + case 0: + case 0x1A: + if (commentToken) + { + p = end; + t->value = TOKcomment; + return; + } + if (doDocComment && t->ptr[2] == '/') + getDocComment(t, lastLine == linnum); + p = end; + t->value = TOKeof; + return; + + default: + if (c & 0x80) + { unsigned u = decodeUTF(); + if (u == PS || u == LS) + break; + } + continue; + } + break; + } + + if (commentToken) + { + p++; + loc.linnum++; + t->value = TOKcomment; + return; + } + if (doDocComment && t->ptr[2] == '/') + getDocComment(t, lastLine == linnum); + + p++; + loc.linnum++; + continue; + + case '+': + { int nest; + + linnum = loc.linnum; + p++; + nest = 1; + while (1) + { unsigned char c = *p; + switch (c) + { + case '/': + p++; + if (*p == '+') + { + p++; + nest++; + } + continue; + + case '+': + p++; + if (*p == '/') + { + p++; + if (--nest == 0) + break; + } + continue; + + case '\r': + p++; + if (*p != '\n') + loc.linnum++; + continue; + + case '\n': + loc.linnum++; + p++; + continue; + + case 0: + case 0x1A: + error("unterminated /+ +/ comment"); + p = end; + t->value = TOKeof; + return; + + default: + if (c & 0x80) + { unsigned u = decodeUTF(); + if (u == PS || u == LS) + loc.linnum++; + } + p++; + continue; + } + break; + } + if (commentToken) + { + t->value = TOKcomment; + return; + } + if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr) + { // if /++ but not /++/ + getDocComment(t, lastLine == linnum); + } + continue; + } + } + t->value = TOKdiv; + return; + + case '.': + p++; + if (isdigit(*p)) + { /* Note that we don't allow ._1 and ._ as being + * valid floating point numbers. + */ + p--; + t->value = inreal(t); + } + else if (p[0] == '.') + { + if (p[1] == '.') + { p += 2; + t->value = TOKdotdotdot; + } + else + { p++; + t->value = TOKslice; + } + } + else + t->value = TOKdot; + return; + + case '&': + p++; + if (*p == '=') + { p++; + t->value = TOKandass; + } + else if (*p == '&') + { p++; + t->value = TOKandand; + } + else + t->value = TOKand; + return; + + case '|': + p++; + if (*p == '=') + { p++; + t->value = TOKorass; + } + else if (*p == '|') + { p++; + t->value = TOKoror; + } + else + t->value = TOKor; + return; + + case '-': + p++; + if (*p == '=') + { p++; + t->value = TOKminass; + } +#if 0 + else if (*p == '>') + { p++; + t->value = TOKarrow; + } +#endif + else if (*p == '-') + { p++; + t->value = TOKminusminus; + } + else + t->value = TOKmin; + return; + + case '+': + p++; + if (*p == '=') + { p++; + t->value = TOKaddass; + } + else if (*p == '+') + { p++; + t->value = TOKplusplus; + } + else + t->value = TOKadd; + return; + + case '<': + p++; + if (*p == '=') + { p++; + t->value = TOKle; // <= + } + else if (*p == '<') + { p++; + if (*p == '=') + { p++; + t->value = TOKshlass; // <<= + } + else + t->value = TOKshl; // << + } + else if (*p == '>') + { p++; + if (*p == '=') + { p++; + t->value = TOKleg; // <>= + } + else + t->value = TOKlg; // <> + } + else + t->value = TOKlt; // < + return; + + case '>': + p++; + if (*p == '=') + { p++; + t->value = TOKge; // >= + } + else if (*p == '>') + { p++; + if (*p == '=') + { p++; + t->value = TOKshrass; // >>= + } + else if (*p == '>') + { p++; + if (*p == '=') + { p++; + t->value = TOKushrass; // >>>= + } + else + t->value = TOKushr; // >>> + } + else + t->value = TOKshr; // >> + } + else + t->value = TOKgt; // > + return; + + case '!': + p++; + if (*p == '=') + { p++; + if (*p == '=' && global.params.Dversion == 1) + { p++; + t->value = TOKnotidentity; // !== + } + else + t->value = TOKnotequal; // != + } + else if (*p == '<') + { p++; + if (*p == '>') + { p++; + if (*p == '=') + { p++; + t->value = TOKunord; // !<>= + } + else + t->value = TOKue; // !<> + } + else if (*p == '=') + { p++; + t->value = TOKug; // !<= + } + else + t->value = TOKuge; // !< + } + else if (*p == '>') + { p++; + if (*p == '=') + { p++; + t->value = TOKul; // !>= + } + else + t->value = TOKule; // !> + } + else + t->value = TOKnot; // ! + return; + + case '=': + p++; + if (*p == '=') + { p++; + if (*p == '=' && global.params.Dversion == 1) + { p++; + t->value = TOKidentity; // === + } + else + t->value = TOKequal; // == + } + else + t->value = TOKassign; // = + return; + + case '~': + p++; + if (*p == '=') + { p++; + t->value = TOKcatass; // ~= + } + else + t->value = TOKtilde; // ~ + return; + +#define SINGLE(c,tok) case c: p++; t->value = tok; return; + + SINGLE('(', TOKlparen) + SINGLE(')', TOKrparen) + SINGLE('[', TOKlbracket) + SINGLE(']', TOKrbracket) + SINGLE('{', TOKlcurly) + SINGLE('}', TOKrcurly) + SINGLE('?', TOKquestion) + SINGLE(',', TOKcomma) + SINGLE(';', TOKsemicolon) + SINGLE(':', TOKcolon) + SINGLE('$', TOKdollar) + +#undef SINGLE + +#define DOUBLE(c1,tok1,c2,tok2) \ + case c1: \ + p++; \ + if (*p == c2) \ + { p++; \ + t->value = tok2; \ + } \ + else \ + t->value = tok1; \ + return; + + DOUBLE('*', TOKmul, '=', TOKmulass) + DOUBLE('%', TOKmod, '=', TOKmodass) + DOUBLE('^', TOKxor, '=', TOKxorass) + +#undef DOUBLE + + case '#': + p++; + pragma(); + continue; + + default: + { unsigned char c = *p; + + if (c & 0x80) + { unsigned u = decodeUTF(); + + // Check for start of unicode identifier + if (isUniAlpha(u)) + goto case_ident; + + if (u == PS || u == LS) + { + loc.linnum++; + p++; + continue; + } + } + if (isprint(c)) + error("unsupported char '%c'", c); + else + error("unsupported char 0x%02x", c); + p++; + continue; + } + } + } +} + +/******************************************* + * Parse escape sequence. + */ + +unsigned Lexer::escapeSequence() +{ unsigned c; + int n; + int ndigits; + + c = *p; + switch (c) + { + case '\'': + case '"': + case '?': + case '\\': + Lconsume: + p++; + break; + + case 'a': c = 7; goto Lconsume; + case 'b': c = 8; goto Lconsume; + case 'f': c = 12; goto Lconsume; + case 'n': c = 10; goto Lconsume; + case 'r': c = 13; goto Lconsume; + case 't': c = 9; goto Lconsume; + case 'v': c = 11; goto Lconsume; + + case 'u': + ndigits = 4; + goto Lhex; + case 'U': + ndigits = 8; + goto Lhex; + case 'x': + ndigits = 2; + Lhex: + p++; + c = *p; + if (ishex(c)) + { unsigned v; + + n = 0; + v = 0; + while (1) + { + if (isdigit(c)) + c -= '0'; + else if (islower(c)) + c -= 'a' - 10; + else + c -= 'A' - 10; + v = v * 16 + c; + c = *++p; + if (++n == ndigits) + break; + if (!ishex(c)) + { error("escape hex sequence has %d hex digits instead of %d", n, ndigits); + break; + } + } + if (ndigits != 2 && !utf_isValidDchar(v)) + error("invalid UTF character \\U%08x", v); + c = v; + } + else + error("undefined escape hex sequence \\%c\n",c); + break; + + case '&': // named character entity + for (unsigned char *idstart = ++p; 1; p++) + { + switch (*p) + { + case ';': + c = HtmlNamedEntity(idstart, p - idstart); + if (c == ~0) + { error("unnamed character entity &%.*s;", (int)(p - idstart), idstart); + c = ' '; + } + p++; + break; + + default: + if (isalpha(*p) || + (p != idstart + 1 && isdigit(*p))) + continue; + error("unterminated named entity"); + break; + } + break; + } + break; + + case 0: + case 0x1A: // end of file + c = '\\'; + break; + + default: + if (isoctal(c)) + { unsigned v; + + n = 0; + v = 0; + do + { + v = v * 8 + (c - '0'); + c = *++p; + } while (++n < 3 && isoctal(c)); + c = v; + if (c > 0xFF) + error("0%03o is larger than a byte", c); + } + else + error("undefined escape sequence \\%c\n",c); + break; + } + return c; +} + +/************************************** + */ + +TOK Lexer::wysiwygStringConstant(Token *t, int tc) +{ unsigned c; + Loc start = loc; + + p++; + stringbuffer.reset(); + while (1) + { + c = *p++; + switch (c) + { + case '\n': + loc.linnum++; + break; + + case '\r': + if (*p == '\n') + continue; // ignore + c = '\n'; // treat EndOfLine as \n character + loc.linnum++; + break; + + case 0: + case 0x1A: + error("unterminated string constant starting at %s", start.toChars()); + t->ustring = (unsigned char *)""; + t->len = 0; + t->postfix = 0; + return TOKstring; + + case '"': + case '`': + if (c == tc) + { + t->len = stringbuffer.offset; + stringbuffer.writeByte(0); + t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset); + memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); + stringPostfix(t); + return TOKstring; + } + break; + + default: + if (c & 0x80) + { p--; + unsigned u = decodeUTF(); + p++; + if (u == PS || u == LS) + loc.linnum++; + stringbuffer.writeUTF8(u); + continue; + } + break; + } + stringbuffer.writeByte(c); + } +} + +/************************************** + * Lex hex strings: + * x"0A ae 34FE BD" + */ + +TOK Lexer::hexStringConstant(Token *t) +{ unsigned c; + Loc start = loc; + unsigned n = 0; + unsigned v; + + p++; + stringbuffer.reset(); + while (1) + { + c = *p++; + switch (c) + { + case ' ': + case '\t': + case '\v': + case '\f': + continue; // skip white space + + case '\r': + if (*p == '\n') + continue; // ignore + // Treat isolated '\r' as if it were a '\n' + case '\n': + loc.linnum++; + continue; + + case 0: + case 0x1A: + error("unterminated string constant starting at %s", start.toChars()); + t->ustring = (unsigned char *)""; + t->len = 0; + t->postfix = 0; + return TOKstring; + + case '"': + if (n & 1) + { error("odd number (%d) of hex characters in hex string", n); + stringbuffer.writeByte(v); + } + t->len = stringbuffer.offset; + stringbuffer.writeByte(0); + t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset); + memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); + stringPostfix(t); + return TOKstring; + + default: + if (c >= '0' && c <= '9') + c -= '0'; + else if (c >= 'a' && c <= 'f') + c -= 'a' - 10; + else if (c >= 'A' && c <= 'F') + c -= 'A' - 10; + else if (c & 0x80) + { p--; + unsigned u = decodeUTF(); + p++; + if (u == PS || u == LS) + loc.linnum++; + else + error("non-hex character \\u%x", u); + } + else + error("non-hex character '%c'", c); + if (n & 1) + { v = (v << 4) | c; + stringbuffer.writeByte(v); + } + else + v = c; + n++; + break; + } + } +} + + +#if DMDV2 +/************************************** + * Lex delimited strings: + * q"(foo(xxx))" // "foo(xxx)" + * q"[foo(]" // "foo(" + * q"/foo]/" // "foo]" + * q"HERE + * foo + * HERE" // "foo\n" + * Input: + * p is on the " + */ + +TOK Lexer::delimitedStringConstant(Token *t) +{ unsigned c; + Loc start = loc; + unsigned delimleft = 0; + unsigned delimright = 0; + unsigned nest = 1; + unsigned nestcount; + Identifier *hereid = NULL; + unsigned blankrol = 0; + unsigned startline = 0; + + p++; + stringbuffer.reset(); + while (1) + { + c = *p++; + //printf("c = '%c'\n", c); + switch (c) + { + case '\n': + Lnextline: + loc.linnum++; + startline = 1; + if (blankrol) + { blankrol = 0; + continue; + } + if (hereid) + { + stringbuffer.writeUTF8(c); + continue; + } + break; + + case '\r': + if (*p == '\n') + continue; // ignore + c = '\n'; // treat EndOfLine as \n character + goto Lnextline; + + case 0: + case 0x1A: + goto Lerror; + + default: + if (c & 0x80) + { p--; + c = decodeUTF(); + p++; + if (c == PS || c == LS) + goto Lnextline; + } + break; + } + if (delimleft == 0) + { delimleft = c; + nest = 1; + nestcount = 1; + if (c == '(') + delimright = ')'; + else if (c == '{') + delimright = '}'; + else if (c == '[') + delimright = ']'; + else if (c == '<') + delimright = '>'; + else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) + { // Start of identifier; must be a heredoc + Token t; + p--; + scan(&t); // read in heredoc identifier + if (t.value != TOKidentifier) + { error("identifier expected for heredoc, not %s", t.toChars()); + delimright = c; + } + else + { hereid = t.ident; + //printf("hereid = '%s'\n", hereid->toChars()); + blankrol = 1; + } + nest = 0; + } + else + { delimright = c; + nest = 0; + } + } + else + { + if (blankrol) + { error("heredoc rest of line should be blank"); + blankrol = 0; + continue; + } + if (nest == 1) + { + if (c == delimleft) + nestcount++; + else if (c == delimright) + { nestcount--; + if (nestcount == 0) + goto Ldone; + } + } + else if (c == delimright) + goto Ldone; + if (startline && isalpha(c)) + { Token t; + unsigned char *psave = p; + p--; + scan(&t); // read in possible heredoc identifier + //printf("endid = '%s'\n", t.ident->toChars()); + if (t.value == TOKidentifier && t.ident->equals(hereid)) + { /* should check that rest of line is blank + */ + goto Ldone; + } + p = psave; + } + stringbuffer.writeUTF8(c); + startline = 0; + } + } + +Ldone: + if (*p == '"') + p++; + else + error("delimited string must end in %c\"", delimright); + t->len = stringbuffer.offset; + stringbuffer.writeByte(0); + t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset); + memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); + stringPostfix(t); + return TOKstring; + +Lerror: + error("unterminated string constant starting at %s", start.toChars()); + t->ustring = (unsigned char *)""; + t->len = 0; + t->postfix = 0; + return TOKstring; +} + +/************************************** + * Lex delimited strings: + * q{ foo(xxx) } // " foo(xxx) " + * q{foo(} // "foo(" + * q{{foo}"}"} // "{foo}"}"" + * Input: + * p is on the q + */ + +TOK Lexer::tokenStringConstant(Token *t) +{ + unsigned nest = 1; + Loc start = loc; + unsigned char *pstart = ++p; + + while (1) + { Token tok; + + scan(&tok); + switch (tok.value) + { + case TOKlcurly: + nest++; + continue; + + case TOKrcurly: + if (--nest == 0) + goto Ldone; + continue; + + case TOKeof: + goto Lerror; + + default: + continue; + } + } + +Ldone: + t->len = p - 1 - pstart; + t->ustring = (unsigned char *)mem.malloc(t->len + 1); + memcpy(t->ustring, pstart, t->len); + t->ustring[t->len] = 0; + stringPostfix(t); + return TOKstring; + +Lerror: + error("unterminated token string constant starting at %s", start.toChars()); + t->ustring = (unsigned char *)""; + t->len = 0; + t->postfix = 0; + return TOKstring; +} + +#endif + + +/************************************** + */ + +TOK Lexer::escapeStringConstant(Token *t, int wide) +{ unsigned c; + Loc start = loc; + + p++; + stringbuffer.reset(); + while (1) + { + c = *p++; + switch (c) + { + case '\\': + switch (*p) + { + case 'u': + case 'U': + case '&': + c = escapeSequence(); + stringbuffer.writeUTF8(c); + continue; + + default: + c = escapeSequence(); + break; + } + break; + + case '\n': + loc.linnum++; + break; + + case '\r': + if (*p == '\n') + continue; // ignore + c = '\n'; // treat EndOfLine as \n character + loc.linnum++; + break; + + case '"': + t->len = stringbuffer.offset; + stringbuffer.writeByte(0); + t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset); + memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); + stringPostfix(t); + return TOKstring; + + case 0: + case 0x1A: + p--; + error("unterminated string constant starting at %s", start.toChars()); + t->ustring = (unsigned char *)""; + t->len = 0; + t->postfix = 0; + return TOKstring; + + default: + if (c & 0x80) + { + p--; + c = decodeUTF(); + if (c == LS || c == PS) + { c = '\n'; + loc.linnum++; + } + p++; + stringbuffer.writeUTF8(c); + continue; + } + break; + } + stringbuffer.writeByte(c); + } +} + +/************************************** + */ + +TOK Lexer::charConstant(Token *t, int wide) +{ + unsigned c; + TOK tk = TOKcharv; + + //printf("Lexer::charConstant\n"); + p++; + c = *p++; + switch (c) + { + case '\\': + switch (*p) + { + case 'u': + t->uns64value = escapeSequence(); + tk = TOKwcharv; + break; + + case 'U': + case '&': + t->uns64value = escapeSequence(); + tk = TOKdcharv; + break; + + default: + t->uns64value = escapeSequence(); + break; + } + break; + + case '\n': + L1: + loc.linnum++; + case '\r': + case 0: + case 0x1A: + case '\'': + error("unterminated character constant"); + return tk; + + default: + if (c & 0x80) + { + p--; + c = decodeUTF(); + p++; + if (c == LS || c == PS) + goto L1; + if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE)) + tk = TOKwcharv; + else + tk = TOKdcharv; + } + t->uns64value = c; + break; + } + + if (*p != '\'') + { error("unterminated character constant"); + return tk; + } + p++; + return tk; +} + +/*************************************** + * Get postfix of string literal. + */ + +void Lexer::stringPostfix(Token *t) +{ + switch (*p) + { + case 'c': + case 'w': + case 'd': + t->postfix = *p; + p++; + break; + + default: + t->postfix = 0; + break; + } +} + +/*************************************** + * Read \u or \U unicode sequence + * Input: + * u 'u' or 'U' + */ + +#if 0 +unsigned Lexer::wchar(unsigned u) +{ + unsigned value; + unsigned n; + unsigned char c; + unsigned nchars; + + nchars = (u == 'U') ? 8 : 4; + value = 0; + for (n = 0; 1; n++) + { + ++p; + if (n == nchars) + break; + c = *p; + if (!ishex(c)) + { error("\\%c sequence must be followed by %d hex characters", u, nchars); + break; + } + if (isdigit(c)) + c -= '0'; + else if (islower(c)) + c -= 'a' - 10; + else + c -= 'A' - 10; + value <<= 4; + value |= c; + } + return value; +} +#endif + +/************************************** + * Read in a number. + * If it's an integer, store it in tok.TKutok.Vlong. + * integers can be decimal, octal or hex + * Handle the suffixes U, UL, LU, L, etc. + * If it's double, store it in tok.TKutok.Vdouble. + * Returns: + * TKnum + * TKdouble,... + */ + +TOK Lexer::number(Token *t) +{ + // We use a state machine to collect numbers + enum STATE { STATE_initial, STATE_0, STATE_decimal, STATE_octal, STATE_octale, + STATE_hex, STATE_binary, STATE_hex0, STATE_binary0, + STATE_hexh, STATE_error }; + enum STATE state; + + enum FLAGS + { FLAGS_decimal = 1, // decimal + FLAGS_unsigned = 2, // u or U suffix + FLAGS_long = 4, // l or L suffix + }; + enum FLAGS flags = FLAGS_decimal; + + int i; + int base; + unsigned c; + unsigned char *start; + TOK result; + + //printf("Lexer::number()\n"); + state = STATE_initial; + base = 0; + stringbuffer.reset(); + start = p; + while (1) + { + c = *p; + switch (state) + { + case STATE_initial: // opening state + if (c == '0') + state = STATE_0; + else + state = STATE_decimal; + break; + + case STATE_0: + flags = (FLAGS) (flags & ~FLAGS_decimal); + switch (c) + { +#if ZEROH + case 'H': // 0h + case 'h': + goto hexh; +#endif + case 'X': + case 'x': + state = STATE_hex0; + break; + + case '.': + if (p[1] == '.') // .. is a separate token + goto done; + case 'i': + case 'f': + case 'F': + goto real; +#if ZEROH + case 'E': + case 'e': + goto case_hex; +#endif + case 'B': + case 'b': + state = STATE_binary0; + break; + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + state = STATE_octal; + break; + +#if ZEROH + case '8': case '9': case 'A': + case 'C': case 'D': case 'F': + case 'a': case 'c': case 'd': case 'f': + case_hex: + state = STATE_hexh; + break; +#endif + case '_': + state = STATE_octal; + p++; + continue; + + case 'L': + if (p[1] == 'i') + goto real; + goto done; + + default: + goto done; + } + break; + + case STATE_decimal: // reading decimal number + if (!isdigit(c)) + { +#if ZEROH + if (ishex(c) + || c == 'H' || c == 'h' + ) + goto hexh; +#endif + if (c == '_') // ignore embedded _ + { p++; + continue; + } + if (c == '.' && p[1] != '.') + goto real; + else if (c == 'i' || c == 'f' || c == 'F' || + c == 'e' || c == 'E') + { + real: // It's a real number. Back up and rescan as a real + p = start; + return inreal(t); + } + else if (c == 'L' && p[1] == 'i') + goto real; + goto done; + } + break; + + case STATE_hex0: // reading hex number + case STATE_hex: + if (!ishex(c)) + { + if (c == '_') // ignore embedded _ + { p++; + continue; + } + if (c == '.' && p[1] != '.') + goto real; + if (c == 'P' || c == 'p' || c == 'i') + goto real; + if (state == STATE_hex0) + error("Hex digit expected, not '%c'", c); + goto done; + } + state = STATE_hex; + break; + +#if ZEROH + hexh: + state = STATE_hexh; + case STATE_hexh: // parse numbers like 0FFh + if (!ishex(c)) + { + if (c == 'H' || c == 'h') + { + p++; + base = 16; + goto done; + } + else + { + // Check for something like 1E3 or 0E24 + if (memchr((char *)stringbuffer.data, 'E', stringbuffer.offset) || + memchr((char *)stringbuffer.data, 'e', stringbuffer.offset)) + goto real; + error("Hex digit expected, not '%c'", c); + goto done; + } + } + break; +#endif + + case STATE_octal: // reading octal number + case STATE_octale: // reading octal number with non-octal digits + if (!isoctal(c)) + { +#if ZEROH + if (ishex(c) + || c == 'H' || c == 'h' + ) + goto hexh; +#endif + if (c == '_') // ignore embedded _ + { p++; + continue; + } + if (c == '.' && p[1] != '.') + goto real; + if (c == 'i') + goto real; + if (isdigit(c)) + { + state = STATE_octale; + } + else + goto done; + } + break; + + case STATE_binary0: // starting binary number + case STATE_binary: // reading binary number + if (c != '0' && c != '1') + { +#if ZEROH + if (ishex(c) + || c == 'H' || c == 'h' + ) + goto hexh; +#endif + if (c == '_') // ignore embedded _ + { p++; + continue; + } + if (state == STATE_binary0) + { error("binary digit expected"); + state = STATE_error; + break; + } + else + goto done; + } + state = STATE_binary; + break; + + case STATE_error: // for error recovery + if (!isdigit(c)) // scan until non-digit + goto done; + break; + + default: + assert(0); + } + stringbuffer.writeByte(c); + p++; + } +done: + stringbuffer.writeByte(0); // terminate string + if (state == STATE_octale) + error("Octal digit expected"); + + uinteger_t n; // unsigned >=64 bit integer type + + if (stringbuffer.offset == 2 && (state == STATE_decimal || state == STATE_0)) + n = stringbuffer.data[0] - '0'; + else + { + // Convert string to integer +#if __DMC__ + errno = 0; + n = strtoull((char *)stringbuffer.data,NULL,base); + if (errno == ERANGE) + error("integer overflow"); +#else + // Not everybody implements strtoull() + char *p = (char *)stringbuffer.data; + int r = 10, d; + + if (*p == '0') + { + if (p[1] == 'x' || p[1] == 'X') + p += 2, r = 16; + else if (p[1] == 'b' || p[1] == 'B') + p += 2, r = 2; + else if (isdigit(p[1])) + p += 1, r = 8; + } + + n = 0; + while (1) + { + if (*p >= '0' && *p <= '9') + d = *p - '0'; + else if (*p >= 'a' && *p <= 'z') + d = *p - 'a' + 10; + else if (*p >= 'A' && *p <= 'Z') + d = *p - 'A' + 10; + else + break; + if (d >= r) + break; + uinteger_t n2 = n * r; + //printf("n2 / r = %llx, n = %llx\n", n2/r, n); + if (n2 / r != n || n2 + d < n) + { + error ("integer overflow"); + break; + } + + n = n2 + d; + p++; + } +#endif + if (sizeof(n) > 8 && + n > 0xFFFFFFFFFFFFFFFFULL) // if n needs more than 64 bits + error("integer overflow"); + } + + // Parse trailing 'u', 'U', 'l' or 'L' in any combination + while (1) + { unsigned char f; + + switch (*p) + { case 'U': + case 'u': + f = FLAGS_unsigned; + goto L1; + + case 'l': + if (1 || !global.params.useDeprecated) + error("'l' suffix is deprecated, use 'L' instead"); + case 'L': + f = FLAGS_long; + L1: + p++; + if (flags & f) + error("unrecognized token"); + flags = (FLAGS) (flags | f); + continue; + default: + break; + } + break; + } + + switch (flags) + { + case 0: + /* Octal or Hexadecimal constant. + * First that fits: int, uint, long, ulong + */ + if (n & 0x8000000000000000LL) + result = TOKuns64v; + else if (n & 0xFFFFFFFF00000000LL) + result = TOKint64v; + else if (n & 0x80000000) + result = TOKuns32v; + else + result = TOKint32v; + break; + + case FLAGS_decimal: + /* First that fits: int, long, long long + */ + if (n & 0x8000000000000000LL) + { error("signed integer overflow"); + result = TOKuns64v; + } + else if (n & 0xFFFFFFFF80000000LL) + result = TOKint64v; + else + result = TOKint32v; + break; + + case FLAGS_unsigned: + case FLAGS_decimal | FLAGS_unsigned: + /* First that fits: uint, ulong + */ + if (n & 0xFFFFFFFF00000000LL) + result = TOKuns64v; + else + result = TOKuns32v; + break; + + case FLAGS_decimal | FLAGS_long: + if (n & 0x8000000000000000LL) + { error("signed integer overflow"); + result = TOKuns64v; + } + else + result = TOKint64v; + break; + + case FLAGS_long: + if (n & 0x8000000000000000LL) + result = TOKuns64v; + else + result = TOKint64v; + break; + + case FLAGS_unsigned | FLAGS_long: + case FLAGS_decimal | FLAGS_unsigned | FLAGS_long: + result = TOKuns64v; + break; + + default: + #ifdef DEBUG + printf("%x\n",flags); + #endif + assert(0); + } + t->uns64value = n; + return result; +} + +/************************************** + * Read in characters, converting them to real. + * Bugs: + * Exponent overflow not detected. + * Too much requested precision is not detected. + */ + +TOK Lexer::inreal(Token *t) +#ifdef __DMC__ +__in +{ + assert(*p == '.' || isdigit(*p)); +} +__out (result) +{ + switch (result) + { + case TOKfloat32v: + case TOKfloat64v: + case TOKfloat80v: + case TOKimaginary32v: + case TOKimaginary64v: + case TOKimaginary80v: + break; + + default: + assert(0); + } +} +__body +#endif /* __DMC__ */ +{ int dblstate; + unsigned c; + char hex; // is this a hexadecimal-floating-constant? + TOK result; + + //printf("Lexer::inreal()\n"); + stringbuffer.reset(); + dblstate = 0; + hex = 0; +Lnext: + while (1) + { + // Get next char from input + c = *p++; + //printf("dblstate = %d, c = '%c'\n", dblstate, c); + while (1) + { + switch (dblstate) + { + case 0: // opening state + if (c == '0') + dblstate = 9; + else if (c == '.') + dblstate = 3; + else + dblstate = 1; + break; + + case 9: + dblstate = 1; + if (c == 'X' || c == 'x') + { hex++; + break; + } + case 1: // digits to left of . + case 3: // digits to right of . + case 7: // continuing exponent digits + if (!isdigit(c) && !(hex && isxdigit(c))) + { + if (c == '_') + goto Lnext; // ignore embedded '_' + dblstate++; + continue; + } + break; + + case 2: // no more digits to left of . + if (c == '.') + { dblstate++; + break; + } + case 4: // no more digits to right of . + if ((c == 'E' || c == 'e') || + hex && (c == 'P' || c == 'p')) + { dblstate = 5; + hex = 0; // exponent is always decimal + break; + } + if (hex) + error("binary-exponent-part required"); + goto done; + + case 5: // looking immediately to right of E + dblstate++; + if (c == '-' || c == '+') + break; + case 6: // 1st exponent digit expected + if (!isdigit(c)) + error("exponent expected"); + dblstate++; + break; + + case 8: // past end of exponent digits + goto done; + } + break; + } + stringbuffer.writeByte(c); + } +done: + p--; + + stringbuffer.writeByte(0); + +#if _WIN32 && __DMC__ + char *save = __locale_decpoint; + __locale_decpoint = "."; +#endif +#ifdef IN_GCC + t->float80value = real_t::parse((char *)stringbuffer.data, real_t::LongDouble); +#else + t->float80value = strtold((char *)stringbuffer.data, NULL); +#endif + errno = 0; + float strtofres; + double strtodres; + switch (*p) + { + case 'F': + case 'f': +#ifdef IN_GCC + real_t::parse((char *)stringbuffer.data, real_t::Float); +#else + strtofres = strtof((char *)stringbuffer.data, NULL); + // LDC change: don't error on gradual underflow + if (errno == ERANGE && + strtofres != 0 && strtofres != HUGE_VALF && strtofres != -HUGE_VALF) + errno = 0; +#endif + result = TOKfloat32v; + p++; + break; + + default: +#ifdef IN_GCC + real_t::parse((char *)stringbuffer.data, real_t::Double); +#else + strtodres = strtod((char *)stringbuffer.data, NULL); + // LDC change: don't error on gradual underflow + if (errno == ERANGE && + strtodres != 0 && strtodres != HUGE_VAL && strtodres != -HUGE_VAL) + errno = 0; +#endif + result = TOKfloat64v; + break; + + case 'l': + if (!global.params.useDeprecated) + error("'l' suffix is deprecated, use 'L' instead"); + case 'L': + result = TOKfloat80v; + p++; + break; + } + if (*p == 'i' || *p == 'I') + { + if (!global.params.useDeprecated && *p == 'I') + error("'I' suffix is deprecated, use 'i' instead"); + p++; + switch (result) + { + case TOKfloat32v: + result = TOKimaginary32v; + break; + case TOKfloat64v: + result = TOKimaginary64v; + break; + case TOKfloat80v: + result = TOKimaginary80v; + break; + } + } +#if _WIN32 && __DMC__ + __locale_decpoint = save; +#endif + if (errno == ERANGE) + error("number is not representable"); + return result; +} + +/********************************************* + * Do pragma. + * Currently, the only pragma supported is: + * #line linnum [filespec] + */ + +void Lexer::pragma() +{ + Token tok; + int linnum; + char *filespec = NULL; + Loc loc = this->loc; + + scan(&tok); + if (tok.value != TOKidentifier || tok.ident != Id::line) + goto Lerr; + + scan(&tok); + if (tok.value == TOKint32v || tok.value == TOKint64v) + linnum = tok.uns64value - 1; + else + goto Lerr; + + while (1) + { + switch (*p) + { + case 0: + case 0x1A: + case '\n': + Lnewline: + this->loc.linnum = linnum; + if (filespec) + this->loc.filename = filespec; + return; + + case '\r': + p++; + if (*p != '\n') + { p--; + goto Lnewline; + } + continue; + + case ' ': + case '\t': + case '\v': + case '\f': + p++; + continue; // skip white space + + case '_': + if (mod && memcmp(p, "__FILE__", 8) == 0) + { + p += 8; + filespec = mem.strdup(loc.filename ? loc.filename : mod->ident->toChars()); + } + continue; + + case '"': + if (filespec) + goto Lerr; + stringbuffer.reset(); + p++; + while (1) + { unsigned c; + + c = *p; + switch (c) + { + case '\n': + case '\r': + case 0: + case 0x1A: + goto Lerr; + + case '"': + stringbuffer.writeByte(0); + filespec = mem.strdup((char *)stringbuffer.data); + p++; + break; + + default: + if (c & 0x80) + { unsigned u = decodeUTF(); + if (u == PS || u == LS) + goto Lerr; + } + stringbuffer.writeByte(c); + p++; + continue; + } + break; + } + continue; + + default: + if (*p & 0x80) + { unsigned u = decodeUTF(); + if (u == PS || u == LS) + goto Lnewline; + } + goto Lerr; + } + } + +Lerr: + error(loc, "#line integer [\"filespec\"]\\n expected"); +} + + +/******************************************** + * Decode UTF character. + * Issue error messages for invalid sequences. + * Return decoded character, advance p to last character in UTF sequence. + */ + +unsigned Lexer::decodeUTF() +{ + dchar_t u; + unsigned char c; + unsigned char *s = p; + size_t len; + size_t idx; + const char *msg; + + c = *s; + assert(c & 0x80); + + // Check length of remaining string up to 6 UTF-8 characters + for (len = 1; len < 6 && s[len]; len++) + ; + + idx = 0; + msg = utf_decodeChar(s, len, &idx, &u); + p += idx - 1; + if (msg) + { + error("%s", msg); + } + return u; +} + + +/*************************************************** + * Parse doc comment embedded between t->ptr and p. + * Remove trailing blanks and tabs from lines. + * Replace all newlines with \n. + * Remove leading comment character from each line. + * Decide if it's a lineComment or a blockComment. + * Append to previous one for this token. + */ + +void Lexer::getDocComment(Token *t, unsigned lineComment) +{ + OutBuffer buf; + unsigned char ct = t->ptr[2]; + unsigned char *q = t->ptr + 3; // start of comment text + int linestart = 0; + + unsigned char *qend = p; + if (ct == '*' || ct == '+') + qend -= 2; + + /* Scan over initial row of ****'s or ++++'s or ////'s + */ + for (; q < qend; q++) + { + if (*q != ct) + break; + } + + /* Remove trailing row of ****'s or ++++'s + */ + if (ct != '/') + { + for (; q < qend; qend--) + { + if (qend[-1] != ct) + break; + } + } + + for (; q < qend; q++) + { + unsigned char c = *q; + + switch (c) + { + case '*': + case '+': + if (linestart && c == ct) + { linestart = 0; + /* Trim preceding whitespace up to preceding \n + */ + while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) + buf.offset--; + continue; + } + break; + + case ' ': + case '\t': + break; + + case '\r': + if (q[1] == '\n') + continue; // skip the \r + goto Lnewline; + + default: + if (c == 226) + { + // If LS or PS + if (q[1] == 128 && + (q[2] == 168 || q[2] == 169)) + { + q += 2; + goto Lnewline; + } + } + linestart = 0; + break; + + Lnewline: + c = '\n'; // replace all newlines with \n + case '\n': + linestart = 1; + + /* Trim trailing whitespace + */ + while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) + buf.offset--; + + break; + } + buf.writeByte(c); + } + + // Always end with a newline + if (!buf.offset || buf.data[buf.offset - 1] != '\n') + buf.writeByte('\n'); + + buf.writeByte(0); + + // It's a line comment if the start of the doc comment comes + // after other non-whitespace on the same line. + unsigned char** dc = (lineComment && anyToken) + ? &t->lineComment + : &t->blockComment; + + // Combine with previous doc comment, if any + if (*dc) + *dc = combineComments(*dc, (unsigned char *)buf.data); + else + *dc = (unsigned char *)buf.extractData(); +} + +/******************************************** + * Combine two document comments into one. + */ + +unsigned char *Lexer::combineComments(unsigned char *c1, unsigned char *c2) +{ + unsigned char *c = c2; + + if (c1) + { c = c1; + if (c2) + { size_t len1 = strlen((char *)c1); + size_t len2 = strlen((char *)c2); + + c = (unsigned char *)mem.malloc(len1 + 1 + len2 + 1); + memcpy(c, c1, len1); + c[len1] = '\n'; + memcpy(c + len1 + 1, c2, len2); + c[len1 + 1 + len2] = 0; + } + } + return c; +} + +/******************************************** + * Create an identifier in the string table. + */ + +Identifier *Lexer::idPool(const char *s) +{ + size_t len = strlen(s); + StringValue *sv = stringtable.update(s, len); + Identifier *id = (Identifier *) sv->ptrvalue; + if (!id) + { + id = new Identifier(sv->lstring.string, TOKidentifier); + sv->ptrvalue = id; + } + return id; +} + +/********************************************* + * Create a unique identifier using the prefix s. + */ + +Identifier *Lexer::uniqueId(const char *s, int num) +{ char buffer[32]; + size_t slen = strlen(s); + + assert(slen + sizeof(num) * 3 + 1 <= sizeof(buffer)); + sprintf(buffer, "%s%d", s, num); + return idPool(buffer); +} + +Identifier *Lexer::uniqueId(const char *s) +{ + static int num; + return uniqueId(s, ++num); +} + +/**************************************** + */ + +struct Keyword +{ const char *name; + enum TOK value; +}; + +static Keyword keywords[] = +{ +// { "", TOK }, + + { "this", TOKthis }, + { "super", TOKsuper }, + { "assert", TOKassert }, + { "null", TOKnull }, + { "true", TOKtrue }, + { "false", TOKfalse }, + { "cast", TOKcast }, + { "new", TOKnew }, + { "delete", TOKdelete }, + { "throw", TOKthrow }, + { "module", TOKmodule }, + { "pragma", TOKpragma }, + { "typeof", TOKtypeof }, + { "typeid", TOKtypeid }, + + { "template", TOKtemplate }, + + { "void", TOKvoid }, + { "byte", TOKint8 }, + { "ubyte", TOKuns8 }, + { "short", TOKint16 }, + { "ushort", TOKuns16 }, + { "int", TOKint32 }, + { "uint", TOKuns32 }, + { "long", TOKint64 }, + { "ulong", TOKuns64 }, + { "cent", TOKcent, }, + { "ucent", TOKucent, }, + { "float", TOKfloat32 }, + { "double", TOKfloat64 }, + { "real", TOKfloat80 }, + + { "bool", TOKbool }, + { "char", TOKchar }, + { "wchar", TOKwchar }, + { "dchar", TOKdchar }, + + { "ifloat", TOKimaginary32 }, + { "idouble", TOKimaginary64 }, + { "ireal", TOKimaginary80 }, + + { "cfloat", TOKcomplex32 }, + { "cdouble", TOKcomplex64 }, + { "creal", TOKcomplex80 }, + + { "delegate", TOKdelegate }, + { "function", TOKfunction }, + + { "is", TOKis }, + { "if", TOKif }, + { "else", TOKelse }, + { "while", TOKwhile }, + { "for", TOKfor }, + { "do", TOKdo }, + { "switch", TOKswitch }, + { "case", TOKcase }, + { "default", TOKdefault }, + { "break", TOKbreak }, + { "continue", TOKcontinue }, + { "synchronized", TOKsynchronized }, + { "return", TOKreturn }, + { "goto", TOKgoto }, + { "try", TOKtry }, + { "catch", TOKcatch }, + { "finally", TOKfinally }, + { "with", TOKwith }, + { "asm", TOKasm }, + { "foreach", TOKforeach }, + { "foreach_reverse", TOKforeach_reverse }, + { "scope", TOKscope }, + + { "struct", TOKstruct }, + { "class", TOKclass }, + { "interface", TOKinterface }, + { "union", TOKunion }, + { "enum", TOKenum }, + { "import", TOKimport }, + { "mixin", TOKmixin }, + { "static", TOKstatic }, + { "final", TOKfinal }, + { "const", TOKconst }, + { "immutable", TOKimmutable }, + { "typedef", TOKtypedef }, + { "alias", TOKalias }, + { "override", TOKoverride }, + { "abstract", TOKabstract }, + { "volatile", TOKvolatile }, + { "debug", TOKdebug }, + { "deprecated", TOKdeprecated }, + { "in", TOKin }, + { "out", TOKout }, + { "inout", TOKinout }, + { "lazy", TOKlazy }, + { "auto", TOKauto }, + + { "align", TOKalign }, + { "extern", TOKextern }, + { "private", TOKprivate }, + { "package", TOKpackage }, + { "protected", TOKprotected }, + { "public", TOKpublic }, + { "export", TOKexport }, + + { "body", TOKbody }, + { "invariant", TOKinvariant }, + { "unittest", TOKunittest }, + { "version", TOKversion }, + //{ "manifest", TOKmanifest }, + + // Added after 1.0 + { "ref", TOKref }, + { "macro", TOKmacro }, +#if DMDV2 + { "pure", TOKpure }, + { "nothrow", TOKnothrow }, + { "__thread", TOKtls }, + { "__traits", TOKtraits }, + { "__overloadset", TOKoverloadset }, + { "__FILE__", TOKfile }, + { "__LINE__", TOKline }, + { "shared", TOKshared }, +#endif +}; + +int Token::isKeyword() +{ + for (unsigned u = 0; u < sizeof(keywords) / sizeof(keywords[0]); u++) + { + if (keywords[u].value == value) + return 1; + } + return 0; +} + +void Lexer::initKeywords() +{ StringValue *sv; + unsigned u; + enum TOK v; + unsigned nkeywords = sizeof(keywords) / sizeof(keywords[0]); + + if (global.params.Dversion == 1) + nkeywords -= 2; + + cmtable_init(); + + for (u = 0; u < nkeywords; u++) + { const char *s; + + //printf("keyword[%d] = '%s'\n",u, keywords[u].name); + s = keywords[u].name; + v = keywords[u].value; + sv = stringtable.insert(s, strlen(s)); + sv->ptrvalue = (void *) new Identifier(sv->lstring.string,v); + + //printf("tochars[%d] = '%s'\n",v, s); + Token::tochars[v] = s; + } + + Token::tochars[TOKeof] = "EOF"; + Token::tochars[TOKlcurly] = "{"; + Token::tochars[TOKrcurly] = "}"; + Token::tochars[TOKlparen] = "("; + Token::tochars[TOKrparen] = ")"; + Token::tochars[TOKlbracket] = "["; + Token::tochars[TOKrbracket] = "]"; + Token::tochars[TOKsemicolon] = ";"; + Token::tochars[TOKcolon] = ":"; + Token::tochars[TOKcomma] = ","; + Token::tochars[TOKdot] = "."; + Token::tochars[TOKxor] = "^"; + Token::tochars[TOKxorass] = "^="; + Token::tochars[TOKassign] = "="; + Token::tochars[TOKconstruct] = "="; +#if DMDV2 + Token::tochars[TOKblit] = "="; +#endif + Token::tochars[TOKlt] = "<"; + Token::tochars[TOKgt] = ">"; + Token::tochars[TOKle] = "<="; + Token::tochars[TOKge] = ">="; + Token::tochars[TOKequal] = "=="; + Token::tochars[TOKnotequal] = "!="; + Token::tochars[TOKnotidentity] = "!is"; + Token::tochars[TOKtobool] = "!!"; + + Token::tochars[TOKunord] = "!<>="; + Token::tochars[TOKue] = "!<>"; + Token::tochars[TOKlg] = "<>"; + Token::tochars[TOKleg] = "<>="; + Token::tochars[TOKule] = "!>"; + Token::tochars[TOKul] = "!>="; + Token::tochars[TOKuge] = "!<"; + Token::tochars[TOKug] = "!<="; + + Token::tochars[TOKnot] = "!"; + Token::tochars[TOKtobool] = "!!"; + Token::tochars[TOKshl] = "<<"; + Token::tochars[TOKshr] = ">>"; + Token::tochars[TOKushr] = ">>>"; + Token::tochars[TOKadd] = "+"; + Token::tochars[TOKmin] = "-"; + Token::tochars[TOKmul] = "*"; + Token::tochars[TOKdiv] = "/"; + Token::tochars[TOKmod] = "%"; + Token::tochars[TOKslice] = ".."; + Token::tochars[TOKdotdotdot] = "..."; + Token::tochars[TOKand] = "&"; + Token::tochars[TOKandand] = "&&"; + Token::tochars[TOKor] = "|"; + Token::tochars[TOKoror] = "||"; + Token::tochars[TOKarray] = "[]"; + Token::tochars[TOKindex] = "[i]"; + Token::tochars[TOKaddress] = "&"; + Token::tochars[TOKstar] = "*"; + Token::tochars[TOKtilde] = "~"; + Token::tochars[TOKdollar] = "$"; + Token::tochars[TOKcast] = "cast"; + Token::tochars[TOKplusplus] = "++"; + Token::tochars[TOKminusminus] = "--"; + Token::tochars[TOKtype] = "type"; + Token::tochars[TOKquestion] = "?"; + Token::tochars[TOKneg] = "-"; + Token::tochars[TOKuadd] = "+"; + Token::tochars[TOKvar] = "var"; + Token::tochars[TOKaddass] = "+="; + Token::tochars[TOKminass] = "-="; + Token::tochars[TOKmulass] = "*="; + Token::tochars[TOKdivass] = "/="; + Token::tochars[TOKmodass] = "%="; + Token::tochars[TOKshlass] = "<<="; + Token::tochars[TOKshrass] = ">>="; + Token::tochars[TOKushrass] = ">>>="; + Token::tochars[TOKandass] = "&="; + Token::tochars[TOKorass] = "|="; + Token::tochars[TOKcatass] = "~="; + Token::tochars[TOKcat] = "~"; + Token::tochars[TOKcall] = "call"; + Token::tochars[TOKidentity] = "is"; + Token::tochars[TOKnotidentity] = "!is"; + + Token::tochars[TOKorass] = "|="; + Token::tochars[TOKidentifier] = "identifier"; + + // For debugging + Token::tochars[TOKdotexp] = "dotexp"; + Token::tochars[TOKdotti] = "dotti"; + Token::tochars[TOKdotvar] = "dotvar"; + Token::tochars[TOKdottype] = "dottype"; + Token::tochars[TOKsymoff] = "symoff"; + Token::tochars[TOKtypedot] = "typedot"; + Token::tochars[TOKarraylength] = "arraylength"; + Token::tochars[TOKarrayliteral] = "arrayliteral"; + Token::tochars[TOKassocarrayliteral] = "assocarrayliteral"; + Token::tochars[TOKstructliteral] = "structliteral"; + Token::tochars[TOKstring] = "string"; + Token::tochars[TOKdsymbol] = "symbol"; + Token::tochars[TOKtuple] = "tuple"; + Token::tochars[TOKdeclaration] = "declaration"; + Token::tochars[TOKdottd] = "dottd"; + Token::tochars[TOKon_scope_exit] = "scope(exit)"; +}