projects/dil: trunk/src/dil/Lexer.d comparison

comparison trunk/src/dil/Lexer.d @ 485:ea8c7459f1c4

Changed a lot of things in the Lexer. Newlines are tokenized now, instead of being treated as whitespace. Newline tokens store location info as well, which makes quite a few functions unnecessary. Added a static method getLocation() which returns a Location instance for any given token. This will also be very useful for finding the location of AST nodes (through Node.begin), which is needed for reporting parser and semantic errors and emitting documentation. Removed rescanNewlines(), LocState, getState(), restoreState(), evaluateHashLine() and updateErrorLoc(). Added isUnicodeNewlineChar(), isUnicodeNewline(), isNewline(), isNewlineEnd(), isEndOfLine(), scanNewline(), getLocation() and error_(). Replaced some clunky expressions with isascii(), isNewlineEnd(), isEndOfLine(), isUnicodeNewline(), isUnicodeNewlineChar().
Fix in scanNormalStringLiteral(): scanPostfix() must be before label Lreturn. Fixed Lexer unittest. Fix in parseDeclarationDefinitionsBlock(): 'token' should be 'begin'. Added method isMultiline() to Token and added documentation comments.

author	Aziz Köksal <aziz.koeksal@gmail.com>
date	Fri, 30 Nov 2007 20:17:29 +0100
parents	325714d8aa6c
children	bccca748d745

comparison

equal deleted inserted replaced

-:265c0b655f18
+:ea8c7459f1c4
 import std.uni;
 import common;
 const char[3] LS = \u2028; /// Line separator.
 const char[3] PS = \u2029; /// Paragraph separator.
 const dchar LSd = 0x2028;
 const dchar PSd = 0x2029;
+static assert(LS[0] == PS[0] && LS[1] == PS[1]);
 /// U+FFFD = �. Used to replace invalid Unicode characters.
 const dchar REPLACEMENT_CHAR = '\uFFFD';
 const uint _Z_ = 26; /// Control+Z
 class Lexer
 {
-Token* head; /// The head of the doubly linked token list.
+Token* head;      /// The head of the doubly linked token list.
-Token* tail; /// The tail of the linked list. Set in scan().
+Token* tail;      /// The tail of the linked list. Set in scan().
-Token* token; /// Points to the current token in the token list.
+Token* token;     /// Points to the current token in the token list.
-string text; /// The source text.
+string text;      /// The source text.
-char[] filePath; /// Path to the source file.
+char[] filePath;  /// Path to the source text.
-char* p; /// Points to the current character in the source text.
+char* p;          /// Points to the current character in the source text.
-char* end; /// Points one character past the end of the source text.
+char* end;        /// Points one character past the end of the source text.
 // Members used for error messages:
 Information[] errors;
-char* lineBegin; /// Always points to the beginning of the current line.
+/// Always points to the beginning of the current line.
-uint loc = 1; /// Actual line of code.
+char* lineBegin;
-uint loc_hline; /// Line number set by #line.
+//   Token* newline;     /// Current newline token.
+uint lineNum = 1;   /// Current, actual source text line number.
+uint lineNum_hline; /// Line number set by #line.
 uint inTokenString; /// > 0 if inside q{ }
-Location errorLoc;
+char[] errorPath;   /// The path displayed in error messages.
 Identifier[string] idtable;
-version(token2LocTable)
+/++
-/// Maps every token that starts a new line to a Location.
+Construct a Lexer object.
-Location[Token*] token2LocTable;
+Params:
+text     = the UTF-8 source code.
+filePath = the path to the source code; used for error messages.
++/
 this(string text, string filePath)
 {
-this.filePath = filePath;
+this.filePath = this.errorPath = filePath;
 this.text = text;
 if (text.length == 0 || text[$-1] != 0)
 {
 this.text.length = this.text.length + 1;
 }
 this.p = this.text.ptr;
 this.end = this.p + this.text.length;
 this.lineBegin = this.p;
-this.errorLoc = new Location(filePath, 1, this.lineBegin, this.lineBegin);
 loadKeywords(this.idtable);
 this.head = new Token;
 this.head.type = TOK.HEAD;
+this.head.start = this.head.end = this.p;
 this.token = this.head;
+// Add a newline as the first token after the head.
+auto newline = new Token;
+newline.type = TOK.Newline;
+newline.start = newline.end = this.p;
+newline.filePath = this.errorPath;
+newline.lineNum = 1;
+newline.lineNum_hline = 0;
+// Link in.
+this.token.next = newline;
+newline.prev = this.token;
+this.token = newline;
+//     this.newline = newline;
 scanShebang();
-version(token2LocTable)
-{
-// Add first token to table.
-auto firstToken = this.head;
-peek(firstToken);
-token2LocTable[firstToken] = new Location(1, null);
-}
 }
 ~this()
 {
 auto token = head.next;
 token = token.next;
 }
 delete tail;
 }
+/++
+The "shebang" may optionally appear once at the beginning of a file.
+Regexp: #![^\EndOfLine]*
++/
 void scanShebang()
 {
 if (*p == '#' && p[1] == '!')
 {
-Token* t = new Token;
+auto t = new Token;
+t.type = TOK.Shebang;
 t.start = p;
-t.type = TOK.Shebang;
 ++p;
-assert(*p == '!');
+while (!isEndOfLine(++p))
-while (1)
+isascii(*p) || decodeUTF8();
-{
+t.end = p;
-t.end = ++p;
+this.token.next = t;
-switch (*p)
+t.prev = this.token;
-{
-case '\r', '\n', 0, _Z_:
-break;
-case LS[0]:
-if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
-break;
-default:
-if (*p & 128)
-decodeUTF8();
-continue;
-}
-break; // Exit loop.
-}
-// Reset p. The newline will be scanned as whitespace in scan().
-p = t.end;
-this.head.next = t;
-t.prev = this.head;
 }
 }
 void finalizeSpecialToken(ref Token t)
 {
 assert(t.srcText[0..2] == "__");
 switch (t.type)
 {
 case TOK.FILE:
-t.str = this.errorLoc.filePath;
+t.str = this.errorPath;
 break;
 case TOK.LINE:
-t.uint_ = this.errorLineNum(this.loc);
+t.uint_ = this.errorLineNumber(this.lineNum);
 break;
 case TOK.DATE,
 TOK.TIME,
 TOK.TIMESTAMP:
 time_t time_val;
 default:
 assert(0);
 }
 }
-void setLineBegin(char* p)
+private void setLineBegin(char* p)
 {
 // Check that we can look behind one character.
 assert((p-1) >= text.ptr && p < end);
 // Check that previous character is a newline.
-assert(p[-1] == '\n' ||  p[-1] == '\r' ||
+assert(isNewlineEnd(p - 1));
-p[-1] == LS[2] || p[-1] == PS[2]);
 this.lineBegin = p;
 }
-private void scanNext(bool rescan)(ref Token* t)
+private void scanNext(ref Token* t)
 {
 assert(t !is null);
 if (t.next)
 {
 t = t.next;
-static if (rescan == true)
+//       if (t.type == TOK.Newline)
-rescanNewlines(*t);
+//         this.newline = t;
 }
 else if (t != this.tail)
 {
 Token* new_t = new Token;
 scan(*new_t);
 t.next = new_t;
 t = new_t;
 }
 }
+/// Advance t one token forward.
 void peek(ref Token* t)
 {
-scanNext!(false)(t);
+scanNext(t);
 }
+/// Advance to the next token in the source text.
 TOK nextToken()
 {
-scanNext!(true)(this.token);
+scanNext(this.token);
 return this.token.type;
 }
-void rescanNewlines(ref Token t)
+/// Returns true if d is a Unicode line or paragraph separator.
-{
+static bool isUnicodeNewlineChar(dchar d)
-auto p = t.ws;
+{
-auto end = t.start;
+return d == LSd || d == PSd;
+}
-if (p !is null)
-{
+/// Returns true if p points to a line or paragraph separator.
-assert(end !is null);
+static bool isUnicodeNewline(char* p)
-// Scan preceding whitespace for newlines.
+{
-do
+return *p == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]);
-{
+}
-switch (*p)
-{
+/++
-case '\r':
+Returns true if p points to the start of a Newline.
-if (p[1] == '\n')
+Newline: \n | \r | \r\n | LS | PS
-++p;
++/
-case '\n':
+static bool isNewline(char* p)
-++loc;
+{
-setLineBegin(p + 1);
+return *p == '\n' || *p == '\r' || isUnicodeNewline(p);
-break;
+}
-case LS[0]:
-assert(p+2 < end && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]));
+/// Returns true if p points to the last character of a Newline.
-++p; ++p;
+bool isNewlineEnd(char* p)
-++loc;
+{
-setLineBegin(p + 1);
+if (*p == '\n' || *p == '\r')
-break;
+return true;
-default:
+if (*p == LS[2] || *p == PS[2])
-assert(isspace(*p));
+if ((p-2) >= text.ptr)
-}
+if (p[-1] == LS[1] && p[-2] == LS[0])
+return true;
+return false;
+}
+/++
+Returns true if p points to the first character of an EndOfLine.
+EndOfLine: Newline | 0 | _Z_
++/
+static bool isEndOfLine(char* p)
+{
+return isNewline(p) || *p == 0 || *p == _Z_;
+}
+/++
+Scans a Newline and sets p one character past it.
+Returns '\n' if scanned or 0 otherwise.
++/
+static dchar scanNewline(ref char* p)
+{
+switch (*p)
+{
+case '\r':
+if (p[1] == '\n')
 ++p;
-} while (p < end)
+case '\n':
-}
+++p;
+return '\n';
-if (t.type == TOK.String && t.start[0] != '\\' ||
+default:
-t.type == TOK.Comment && t.start[1] != '/')
+if (isUnicodeNewline(p))
 {
-// String literals and comments are the only tokens that can have
+++p; ++p; ++p;
-// newlines.
+return '\n';
-p = t.start;
+}
-end = t.end;
+}
-assert(p !is null && end !is null);
+return 0;
-do
+}
-{
-switch (*p)
+/// Returns a Location for the given token.
-{
+static Location getLocation(Token* token)
-case '\r':
+{
-if (p[1] == '\n')
+auto search_t = token.prev;
-++p;
+// Find previous newline token.
-case '\n':
+while (search_t.type != TOK.Newline)
-++loc;
+search_t = search_t.prev;
-setLineBegin(p + 1);
+auto filePath  = search_t.filePath;
-break;
+auto lineNum   = search_t.lineNum - search_t.lineNum_hline;
-case LS[0]:
+auto lineBegin = search_t.end;
-if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
+// Determine actual line begin and line number.
+while (1)
+{
+search_t = search_t.next;
+if (search_t == token)
+break;
+// Multiline tokens must be rescanned for newlines.
+if (search_t.isMultiline)
+{
+auto p = search_t.start, end = search_t.end;
+while (p != end)
+{
+if (Lexer.scanNewline(p) == '\n')
 {
-++p; ++p;
+lineBegin = p;
-++loc;
+++lineNum;
-setLineBegin(p + 1);
-break;
 }
-default:
+else
-}
+++p;
-++p;
+}
-} while (p < end)
+}
 }
-else
+return new Location(filePath, lineNum, lineBegin, token.start);
-{
+}
-if (t.type == TOK.HashLine)
-evaluateHashLine(t);
+/++
+This is the old scan method.
-assert(delegate() {
+TODO: profile old and new to see which one is faster.
-p = t.start;
++/
-end = t.end;
-while (p < end)
-{
-if (*p == '\n' || *p == '\r' ||
-(p+2) < end && *p == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
-return false;
-++p;
-}
-return true;
-}() == true, "Token '" ~ t.srcText ~ "' has unexpected newline."
-);
-}
-}
-struct LocState
-{
-char[] filePath;
-uint loc;
-uint loc_hline;
-char* lineBegin;
-}
-LocState getState()
-{
-LocState s;
-s.filePath = this.errorLoc.filePath;
-s.lineBegin = this.lineBegin;
-s.loc_hline = this.loc_hline;
-s.loc = this.loc;
-return s;
-}
-void restoreState(LocState s)
-{
-if (s.lineBegin == this.lineBegin)
-return;
-assert(s.loc != this.loc);
-this.errorLoc.setFilePath(s.filePath);
-this.lineBegin = s.lineBegin;
-this.loc = s.loc;
-this.loc_hline = s.loc_hline;
-}
 public void scan_(out Token t)
 in
 {
 assert(text.ptr <= p && p < end);
 }
 assert(text.ptr <= t.end && t.end <= end, Token.toString(t.type));
 }
 body
 {
 // Scan whitespace.
-auto pws = p;
+if (isspace(*p))
-auto old_loc = this.loc;
+{
-while (1)
+t.ws = p;
-{
+while (isspace(*++p))
+{}
+}
+// Scan a token.
+uint c = *p;
+{
+t.start = p;
+// Newline.
 switch (*p)
 {
 case '\r':
 if (p[1] == '\n')
 ++p;
 case '\n':
-assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
+assert(isNewlineEnd(p));
 ++p;
-++loc;
+++lineNum;
 setLineBegin(p);
-continue;
+//         this.newline = &t;
-case LS[0]:
+t.type = TOK.Newline;
-if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
+t.filePath = this.errorPath;
+t.lineNum = lineNum;
+t.lineNum_hline = lineNum_hline;
+t.end = p;
+return;
+default:
+if (isUnicodeNewline(p))
 {
 ++p; ++p;
 goto case '\n';
 }
-// goto default;
+}
-default:
+// Identifier or string literal.
-if (!isspace(*p))
-break;
-++p;
-continue;
-}
-break; // Exit loop.
-}
-if (p != pws)
-{
-t.ws = pws;
-if (old_loc != this.loc)
-version(token2LocTable)
-token2LocTable[&t] = new Location(loc, null);
-}
-// Scan token.
-uint c = *p;
-{
-t.start = p;
 if (isidbeg(c))
 {
 if (c == 'r' && p[1] == '"' && ++p)
 return scanRawStringLiteral(t);
 if (c == 'x' && p[1] == '"')
 if (c == 'q' && p[1] == '"')
 return scanDelimitedStringLiteral(t);
 if (c == 'q' && p[1] == '{')
 return scanTokenStringLiteral(t);
 }
+// Scan identifier.
 Lidentifier:
 do
 { c = *++p; }
-while (isident(c) || c & 128 && isUniAlpha(decodeUTF8()))
+while (isident(c) || !isascii(c) && isUniAlpha(decodeUTF8()))
 t.end = p;
 string str = t.srcText;
 Identifier* id = str in idtable;
 case '+':
 return scanNestedComment(t);
 case '*':
 return scanBlockComment(t);
 case '/':
-while (1)
+while (!isEndOfLine(++p))
-{
+isascii(*p) || decodeUTF8();
-c = *++p;
-switch (c)
-{
-case '\r', '\n', 0, _Z_:
-break;
-case LS[0]:
-if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
-break;
-default:
-if (c & 128)
-decodeUTF8();
-continue;
-}
-break; // Exit loop.
-}
 t.type = TOK.Comment;
 t.end = p;
 return;
 default:
 t.type = TOK.Div;
 case '\\':
 char[] buffer;
 do
 {
 c = scanEscapeSequence();
-if (c < 128)
+if (isascii(c))
 buffer ~= c;
 else
 encodeUTF8(buffer, c);
 } while (*p == '\\')
 buffer ~= 0;
 tail = &t;
 assert(t.start == t.end);
 return;
 }
-if (c & 128)
+if (!isascii(c))
 {
 c = decodeUTF8();
 if (isUniAlpha(c))
 goto Lidentifier;
 }
 assert(text.ptr <= t.end && t.end <= end, Token.toString(t.type));
 }
 body
 {
 // Scan whitespace.
-auto pws = p;
+if (isspace(*p))
-auto old_loc = this.loc;
+{
-while (1)
+t.ws = p;
-{
+while (isspace(*++p))
-switch (*p)
+{}
-{
+}
-case '\r':
-if (p[1] == '\n')
+// Scan a token.
-++p;
+t.start = p;
-case '\n':
+// Newline.
-assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
+switch (*p)
+{
+case '\r':
+if (p[1] == '\n')
 ++p;
-++loc;
+case '\n':
-setLineBegin(p);
+assert(isNewlineEnd(p));
-continue;
+++p;
-case LS[0]:
+++lineNum;
-if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
+setLineBegin(p);
-{
+//       this.newline = &t;
-++p; ++p;
+t.type = TOK.Newline;
-goto case '\n';
+t.filePath = this.errorPath;
-}
+t.lineNum = lineNum;
-// goto default;
+t.lineNum_hline = lineNum_hline;
-default:
+t.end = p;
-if (!isspace(*p))
+return;
-break;
+default:
-++p;
+if (isUnicodeNewline(p))
-continue;
+{
-}
+++p; ++p;
-break; // Exit loop.
+goto case '\n';
 }
+}
-if (p != pws)
-{
-t.ws = pws;
-if (old_loc != this.loc)
-version(token2LocTable)
-token2LocTable[&t] = new Location(loc, null);
-}
-// Scan token.
-t.start = p;
 uint c = *p;
 assert(end - p != 0);
 switch (end - p)
 {
 ++p; // Skip /
 return scanBlockComment(t);
 case toUint!("//"):
 ++p; // Skip /
 assert(*p == '/');
-while (1)
+while (!isEndOfLine(++p))
-{
+isascii(*p) || decodeUTF8();
-c = *++p;
-switch (c)
-{
-case '\r', '\n', 0, _Z_:
-break;
-case LS[0]:
-if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
-break;
-default:
-if (c & 128)
-decodeUTF8();
-continue;
-}
-break; // Exit loop.
-}
 t.type = TOK.Comment;
 t.end = p;
 return;
 case toUint!(">="):
 t.type = TOK.GreaterEqual;
 case '\\':
 char[] buffer;
 do
 {
 c = scanEscapeSequence();
-if (c < 128)
+if (isascii(c))
 buffer ~= c;
 else
 encodeUTF8(buffer, c);
 } while (*p == '\\')
 buffer ~= 0;
 if (c == 'q' && p[1] == '"')
 return scanDelimitedStringLiteral(t);
 if (c == 'q' && p[1] == '{')
 return scanTokenStringLiteral(t);
 }
+// Scan identifier.
 Lidentifier:
 do
 { c = *++p; }
-while (isident(c) || c & 128 && isUniAlpha(decodeUTF8()))
+while (isident(c) || !isascii(c) && isUniAlpha(decodeUTF8()))
 t.end = p;
 string str = t.srcText;
 Identifier* id = str in idtable;
 tail = &t;
 assert(t.start == t.end);
 return;
 }
-if (c & 128)
+if (!isascii(c))
 {
 c = decodeUTF8();
 if (isUniAlpha(c))
 goto Lidentifier;
 }
 }
 void scanBlockComment(ref Token t)
 {
 assert(p[-1] == '/' && *p == '*');
-auto tokenLineNum = loc;
+auto tokenLineNum = lineNum;
 auto tokenLineBegin = lineBegin;
 uint c;
 while (1)
 {
 c = *++p;
 {
 case '\r':
 if (p[1] == '\n')
 ++p;
 case '\n':
-assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
+assert(isNewlineEnd(p));
-++loc;
+++lineNum;
 setLineBegin(p+1);
 continue;
 case 0, _Z_:
 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedBlockComment);
 goto LreturnBC;
 default:
-if (c & 128)
+if (!isascii(c))
 {
 c = decodeUTF8();
-if (c == LSd || c == PSd)
+if (isUnicodeNewlineChar(c))
 goto case '\n';
 continue;
 }
 }
 }
 void scanNestedComment(ref Token t)
 {
 assert(p[-1] == '/' && *p == '+');
-auto tokenLineNum = loc;
+auto tokenLineNum = lineNum;
 auto tokenLineBegin = lineBegin;
 uint level = 1;
 uint c;
 while (1)
 {
 {
 case '\r':
 if (p[1] == '\n')
 ++p;
 case '\n':
-assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
+assert(isNewlineEnd(p));
-++loc;
+++lineNum;
 setLineBegin(p+1);
 continue;
 case 0, _Z_:
 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedNestedComment);
 goto LreturnNC;
 default:
-if (c & 128)
+if (!isascii(c))
 {
 c = decodeUTF8();
-if (c == LSd || c == PSd)
+if (isUnicodeNewlineChar(c))
 goto case '\n';
 continue;
 }
 }
 }
 }
 assert(0);
 }
+char scanPostfix()
+{
+assert(p[-1] == '"' || p[-1] == '`' ||
+{ version(D2) return p[-1] == '}';
+else return 0; }()
+);
+switch (*p)
+{
+case 'c':
+case 'w':
+case 'd':
+return *p++;
+default:
+return 0;
+}
+assert(0);
+}
 void scanNormalStringLiteral(ref Token t)
 {
 assert(*p == '"');
-auto tokenLineNum = loc;
+auto tokenLineNum = lineNum;
 auto tokenLineBegin = lineBegin;
+t.type = TOK.String;
 char[] buffer;
-t.type = TOK.String;
 uint c;
 while (1)
 {
 c = *++p;
 switch (c)
 {
 case '"':
 ++p;
+t.pf = scanPostfix();
 Lreturn:
-buffer ~= 0;
+t.str = buffer ~ '\0';
-t.str = buffer;
-t.pf = scanPostfix();
 t.end = p;
 return;
 case '\\':
 c = scanEscapeSequence();
 --p;
-if (c < 128)
+if (isascii(c))
 break;
 encodeUTF8(buffer, c);
 continue;
 case '\r':
 if (p[1] == '\n')
 ++p;
 case '\n':
-assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
+assert(isNewlineEnd(p));
-++loc;
+c = '\n'; // Convert Newline to \n.
-c = '\n'; // Convert EndOfLine to \n.
+++lineNum;
 setLineBegin(p+1);
 break;
 case 0, _Z_:
 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedString);
 goto Lreturn;
 default:
-if (c & 128)
+if (!isascii(c))
 {
 c = decodeUTF8();
-if (c == LSd || c == PSd)
+if (isUnicodeNewlineChar(c))
 goto case '\n';
 encodeUTF8(buffer, c);
 continue;
 }
 }
 assert(isascii(c));
 }
 void scanCharacterLiteral(ref Token t)
 {
 assert(*p == '\'');
-MID id = MID.UnterminatedCharacterLiteral;
 ++p;
-TOK type = TOK.CharLiteral;
+t.type = TOK.CharLiteral;
 switch (*p)
 {
 case '\\':
 switch (p[1])
 {
 case 'u':
-type = TOK.WCharLiteral; break;
+t.type = TOK.WCharLiteral; break;
 case 'U':
-type = TOK.DCharLiteral; break;
+t.type = TOK.DCharLiteral; break;
 default:
 }
 t.dchar_ = scanEscapeSequence();
 break;
 case '\'':
-++p;
+error(t.start, MID.EmptyCharacterLiteral);
-id = MID.EmptyCharacterLiteral;
+break;
-// fall through
-case '\n', '\r', 0, _Z_:
-goto Lerr;
-case LS[0]:
-if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
-goto Lerr;
-// fall through
 default:
+if (isEndOfLine(p))
+break;
 uint c = *p;
-if (c & 128)
+if (!isascii(c))
 {
 c = decodeUTF8();
-if (c <= 0xFFFF)
+t.type = c <= 0xFFFF ? TOK.WCharLiteral : TOK.DCharLiteral;
-type = TOK.WCharLiteral;
-else
-type = TOK.DCharLiteral;
 }
 t.dchar_ = c;
 ++p;
 }
 if (*p == '\'')
 ++p;
 else
-Lerr:
+error(t.start, MID.UnterminatedCharacterLiteral);
-error(t.start, id);
-t.type = type;
 t.end = p;
 }
-char scanPostfix()
-{
-switch (*p)
-{
-case 'c':
-case 'w':
-case 'd':
-return *p++;
-default:
-return 0;
-}
-assert(0);
-}
 void scanRawStringLiteral(ref Token t)
 {
-auto tokenLineNum = loc;
+assert(*p == '`' || *p == '"' && p[-1] == 'r');
+auto tokenLineNum = lineNum;
 auto tokenLineBegin = lineBegin;
+t.type = TOK.String;
 uint delim = *p;
-assert(delim == '`' || delim == '"' && p[-1] == 'r');
-t.type = TOK.String;
 char[] buffer;
 uint c;
 while (1)
 {
 c = *++p;
 {
 case '\r':
 if (p[1] == '\n')
 ++p;
 case '\n':
-assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
+assert(isNewlineEnd(p));
-c = '\n'; // Convert EndOfLine ('\r','\r\n','\n',LS,PS) to '\n'
+c = '\n'; // Convert Newline to '\n'.
-++loc;
+++lineNum;
 setLineBegin(p+1);
 break;
 case '`':
 case '"':
 if (c == delim)
 t.end = p;
 return;
 }
 break;
 case 0, _Z_:
-if (delim == 'r')
+error(tokenLineNum, tokenLineBegin, t.start,
-error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedRawString);
+delim == 'r' ? MID.UnterminatedRawString : MID.UnterminatedBackQuoteString);
-else
-error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedBackQuoteString);
 goto Lreturn;
 default:
-if (c & 128)
+if (!isascii(c))
 {
 c = decodeUTF8();
-if (c == LSd || c == PSd)
+if (isUnicodeNewlineChar(c))
 goto case '\n';
 encodeUTF8(buffer, c);
 continue;
 }
 }
 void scanHexStringLiteral(ref Token t)
 {
 assert(p[0] == 'x' && p[1] == '"');
 t.type = TOK.String;
-auto tokenLineNum = loc;
+auto tokenLineNum = lineNum;
 auto tokenLineBegin = lineBegin;
 uint c;
 ubyte[] buffer;
 ubyte h; // hex number
 {
 c = *++p;
 switch (c)
 {
 case '"':
-++p;
 if (n & 1)
 error(tokenLineNum, tokenLineBegin, t.start, MID.OddNumberOfDigitsInHexString);
+++p;
 t.pf = scanPostfix();
 Lreturn:
-buffer ~= 0;
+t.str = cast(string) (buffer ~= 0);
-t.str = cast(string) buffer;
 t.end = p;
 return;
 case '\r':
 if (p[1] == '\n')
 ++p;
 case '\n':
-assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
+assert(isNewlineEnd(p));
-++loc;
+++lineNum;
 setLineBegin(p+1);
 continue;
 default:
 if (ishexad(c))
 {
 ++n;
 continue;
 }
 else if (isspace(c))
 continue; // Skip spaces.
-else if (c == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
-{
-++p; ++p;
-goto case '\n';
-}
 else if (c == 0 || c == _Z_)
 {
 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedHexString);
 t.pf = 0;
 goto Lreturn;
 }
 else
 {
 auto errorAt = p;
-if (c & 128)
+if (!isascii(c))
+{
 c = decodeUTF8();
+if (isUnicodeNewlineChar(c))
+goto case '\n';
+}
 error(errorAt, MID.NonHexCharInHexString, cast(dchar)c);
 }
 }
 }
 assert(0);
 void scanDelimitedStringLiteral(ref Token t)
 {
 assert(p[0] == 'q' && p[1] == '"');
 t.type = TOK.String;
-auto tokenLineNum = loc;
+auto tokenLineNum = lineNum;
 auto tokenLineBegin = lineBegin;
 char[] buffer;
 dchar opening_delim = 0, // 0 if no nested delimiter or '[', '(', '<', '{'
 closing_delim; // Will be ']', ')', '>', '},
 {
 case '\r':
 if (p[1] == '\n')
 ++p;
 case '\n':
-assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
+assert(isNewlineEnd(p));
 ++p;
-++loc;
+++lineNum;
 setLineBegin(p);
 return '\n';
-case LS[0]:
+default:
-if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
+if (isUnicodeNewline(p))
 {
 ++p; ++p;
 goto case '\n';
 }
-default:
 }
 return 0;
 }
 // Skip leading newlines:
-while (scanNewline() != 0){}
+while (scanNewline() != 0)
-assert(*p != '\n' && *p != '\r' &&
+{}
-!(*p == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])));
+assert(!isNewline(p));
 char* begin = p;
 c = *p;
 closing_delim = c;
 // TODO: Check for non-printable characters?
-if (c & 128)
+if (!isascii(c))
 {
 closing_delim = decodeUTF8();
 if (!isUniAlpha(closing_delim))
 break; // Not an identifier.
 }
 break; // Not an identifier.
 // Parse Identifier + EndOfLine
 do
 { c = *++p; }
-while (isident(c) || c & 128 && isUniAlpha(decodeUTF8()))
+while (isident(c) || !isascii(c) && isUniAlpha(decodeUTF8()))
 // Store identifier
 str_delim = begin[0..p-begin];
 // Scan newline
 if (scanNewline() == '\n')
 --p; // Go back one because of "c = *++p;" in main loop.
 {
 case '\r':
 if (p[1] == '\n')
 ++p;
 case '\n':
-assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
+assert(isNewlineEnd(p));
-c = '\n'; // Convert EndOfLine ('\r','\r\n','\n',LS,PS) to '\n'
+c = '\n'; // Convert Newline to '\n'.
-++loc;
+++lineNum;
 setLineBegin(p+1);
 break;
 case 0, _Z_:
 // TODO: error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedDelimitedString);
 goto Lreturn3;
 default:
-if (c & 128)
+if (!isascii(c))
 {
 auto begin = p;
 c = decodeUTF8();
-if (c == LSd || c == PSd)
+if (isUnicodeNewlineChar(c))
 goto case '\n';
 if (c == closing_delim)
 {
 if (str_delim.length)
 {
 void scanTokenStringLiteral(ref Token t)
 {
 assert(p[0] == 'q' && p[1] == '{');
 t.type = TOK.String;
-auto tokenLineNum = loc;
+auto tokenLineNum = lineNum;
 auto tokenLineBegin = lineBegin;
 // A guard against changes to particular members:
-// this.loc_hline and this.errorLoc.filePath
+// this.lineNum_hline and this.errorPath
 ++inTokenString;
-uint loc = this.loc;
+uint lineNum = this.lineNum;
 uint level = 1;
 ++p; ++p; // Skip q{
 auto prev_t = &t;
 {
 // Assign to buffer before scanPostfix().
 t.end = p;
 buffer = t.srcText[2..$-1].dup ~ '\0';
 t.pf = scanPostfix();
-t.end = p;
+t.end = p; // Assign again because of postfix.
 }
-// Convert EndOfLines to '\n'
+// Convert newlines to '\n'.
-if (loc != this.loc)
+if (lineNum != this.lineNum)
 {
 assert(buffer[$-1] == '\0');
 uint i, j;
 for (; i < buffer.length; ++i)
 switch (buffer[i])
 {
 case '\r':
 if (buffer[i+1] == '\n')
 ++i;
 case '\n':
-buffer[j++] = '\n';
+assert(isNewlineEnd(buffer.ptr + i));
+buffer[j++] = '\n'; // Convert Newline to '\n'.
 break;
-case LS[0]:
+default:
-auto b = buffer[i..$];
+if (isUnicodeNewline(buffer.ptr + i))
-if (b[1] == LS[1] && (b[2] == LS[2] || b[2] == PS[2]))
 {
 ++i; ++i;
 goto case '\n';
 }
-// goto default;
+buffer[j++] = buffer[i]; // Copy.
-default:
+}
-buffer[j++] = buffer[i]; // Copy character
+buffer.length = j; // Adjust length.
-}
-buffer.length = j; // Adjust length
 }
 assert(buffer[$-1] == '\0');
 t.str = buffer;
 --inTokenString;
 error(sequenceStart, MID.UnterminatedHTMLEntity, sequenceStart[0 .. p - sequenceStart]);
 }
 else
 error(sequenceStart, MID.InvalidBeginHTMLEntity);
 }
-else if (*p == '\n' || *p == '\r' ||
+else if (isEndOfLine(p))
-*p == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
+error(sequenceStart, MID.UndefinedEscapeSequence,
-{
+(*p == 0 || *p == _Z_) ? `\EOF` : `\NewLine`);
-error(sequenceStart, MID.UndefinedEscapeSequence, r"\NewLine");
-}
-else if (*p == 0 || *p == _Z_)
-{
-error(sequenceStart, MID.UndefinedEscapeSequence, r"\EOF");
-}
 else
 {
 char[] str = `\`;
-if (*p & 128)
+if (isascii(c))
+str ~= *p;
+else
 encodeUTF8(str, decodeUTF8());
-else
-str ~= *p;
 ++p;
 // TODO: check for unprintable character?
 error(sequenceStart, MID.UndefinedEscapeSequence, str);
 }
 }
 enum State
 { /+Space,+/ Integer, Filespec, End }
 State state = State.Integer;
-Loop:
+while (!isEndOfLine(++p))
-while (1)
+{
-{
+if (isspace(*p))
-switch (*++p)
+continue;
-{
+if (state == State.Integer)
-case LS[0]:
+{
-if (!(p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])))
+if (!isdigit(*p))
-goto default;
+{
-case '\r', '\n', 0, _Z_:
+errorAtColumn = p;
-break Loop;
+mid = MID.ExpectedIntegerAfterSTLine;
-default:
+goto Lerr;
-if (isspace(*p))
+}
-continue;
+t.tokLineNum = new Token;
-if (state == State.Integer)
+scan(*t.tokLineNum);
-{
+if (t.tokLineNum.type != TOK.Int32 && t.tokLineNum.type != TOK.Uint32)
-if (!isdigit(*p))
+{
+errorAtColumn = t.tokLineNum.start;
+mid = MID.ExpectedIntegerAfterSTLine;
+goto Lerr;
+}
+--p; // Go one back because scan() advanced p past the integer.
+state = State.Filespec;
+}
+else if (state == State.Filespec)
+{
+if (*p != '"')
+{
+errorAtColumn = p;
+mid = MID.ExpectedFilespec;
+goto Lerr;
+}
+t.tokLineFilespec = new Token;
+t.tokLineFilespec.start = p;
+t.tokLineFilespec.type = TOK.Filespec;
+while (*++p != '"')
+{
+if (isEndOfLine(p))
 {
-errorAtColumn = p;
+errorAtColumn = t.tokLineFilespec.start;
-mid = MID.ExpectedIntegerAfterSTLine;
+mid = MID.UnterminatedFilespec;
+t.tokLineFilespec.end = p;
 goto Lerr;
 }
-t.line_num = new Token;
+isascii(*p) || decodeUTF8();
-scan(*t.line_num);
+}
-if (t.line_num.type != TOK.Int32 && t.line_num.type != TOK.Uint32)
+auto start = t.tokLineFilespec.start +1; // +1 skips '"'
-{
+t.tokLineFilespec.str = start[0 .. p - start];
-errorAtColumn = t.line_num.start;
+t.tokLineFilespec.end = p + 1;
-mid = MID.ExpectedIntegerAfterSTLine;
+state = State.End;
-goto Lerr;
+}
-}
+else/+ if (state == State.End)+/
---p; // Go one back because scan() advanced p past the integer.
+{
-state = State.Filespec;
+mid = MID.UnterminatedSpecialToken;
-}
+goto Lerr;
-else if (state == State.Filespec)
+}
-{
+}
-if (*p != '"')
+assert(isEndOfLine(p));
-{
-errorAtColumn = p;
-mid = MID.ExpectedFilespec;
-goto Lerr;
-}
-t.line_filespec = new Token;
-t.line_filespec.start = p;
-t.line_filespec.type = TOK.Filespec;
-while (1)
-{
-switch (*++p)
-{
-case '"':
-break;
-case LS[0]:
-if (!(p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])))
-goto default;
-case '\r', '\n', 0, _Z_:
-errorAtColumn = t.line_filespec.start;
-mid = MID.UnterminatedFilespec;
-t.line_filespec.end = p;
-goto Lerr;
-default:
-if (*p & 128)
-decodeUTF8();
-continue;
-}
-break; // Exit loop.
-}
-auto start = t.line_filespec.start +1; // +1 skips '"'
-t.line_filespec.str = start[0 .. p - start];
-t.line_filespec.end = p + 1;
-state = State.End;
-}
-else/+ if (state == State.End)+/
-{
-mid = MID.UnterminatedSpecialToken;
-goto Lerr;
-}
-}
-}
-assert(*p == '\r' || *p == '\n' || *p == 0 || *p == _Z_ ||
-*p == LS[0] && (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
-);
 if (state == State.Integer)
 {
 errorAtColumn = p;
 mid = MID.ExpectedIntegerAfterSTLine;
 goto Lerr;
 }
 // Evaluate #line only when not in token string.
-if (!inTokenString)
+if (!inTokenString && t.tokLineNum)
-evaluateHashLine(t);
+{
+this.lineNum_hline = this.lineNum - t.tokLineNum.uint_ + 1;
+if (t.tokLineFilespec)
+this.errorPath = t.tokLineFilespec.str;
+}
 t.end = p;
 return;
 Lerr:
 t.end = p;
 error(errorAtColumn, mid);
 }
-void evaluateHashLine(ref Token t)
+/++
-{
-assert(t.type == TOK.HashLine);
-if (t.line_num)
-{
-this.loc_hline = this.loc - t.line_num.uint_ + 1;
-if (t.line_filespec)
-this.errorLoc.setFilePath(t.line_filespec.str);
-}
-}
-/+
 Insert an empty dummy token before t.
 Useful in the parsing phase for representing a node in the AST
 that doesn't consume an actual token from the source text.
 +/
 Token* insertEmptyTokenBefore(Token* t)
 new_t.next = t;
 t.prev = new_t;
 return new_t;
 }
-void updateErrorLoc(char* columnPos)
+uint errorLineNumber(uint lineNum)
 {
-updateErrorLoc(this.loc, this.lineBegin, columnPos);
+return lineNum - this.lineNum_hline;
-}
-void updateErrorLoc(uint lineNum, char* lineBegin, char* columnPos)
-{
-errorLoc.set(this.errorLineNum(lineNum), lineBegin, columnPos);
-}
-uint errorLineNum(uint loc)
-{
-return loc - this.loc_hline;
 }
 void error(char* columnPos, MID mid, ...)
 {
-updateErrorLoc(columnPos);
+error_(this.lineNum, this.lineBegin, columnPos, mid, _arguments, _argptr);
-errors ~= new Information(InfoType.Lexer, mid, errorLoc.clone, Format(_arguments, _argptr, GetMsg(mid)));
 }
 void error(uint lineNum, char* lineBegin, char* columnPos, MID mid, ...)
 {
-updateErrorLoc(lineNum, lineBegin, columnPos);
+error_(lineNum, lineBegin, columnPos, mid, _arguments, _argptr);
-errors ~= new Information(InfoType.Lexer, mid, errorLoc.clone, Format(_arguments, _argptr, GetMsg(mid)));
+}
+void error_(uint lineNum, char* lineBegin, char* columnPos, MID mid,
+TypeInfo[] _arguments, void* _argptr)
+{
+lineNum = this.errorLineNumber(lineNum);
+auto location = new Location(errorPath, lineNum, lineBegin, columnPos);
+auto msg = Format(_arguments, _argptr, GetMsg(mid));
+errors ~= new Information(InfoType.Lexer, mid, location, msg);
 }
 Token* getTokens()
 {
 while (nextToken() != TOK.EOF)
 return isUniAlpha(std.utf.decode(ident, idx));
 }
 try
 {
-if (isidbeg(ident[0]) ||
+if (isidbeg(ident[0]) || !isascii(ident[0]) && isFirstCharUniAlpha())
-ident[0] & 128 && isFirstCharUniAlpha())
 {
 foreach (dchar c; ident[idx..$])
 if (!isident(c) && !isUniAlpha(c))
 return false;
 }
 unittest
 {
 Stdout("Testing Lexer.\n");
 struct Pair
 {
-char[] token;
+char[] tokenText;
 TOK type;
 }
 static Pair[] pairs = [
-{"//çay\n", TOK.Comment},       {"&",       TOK.AndBinary},
+{"//çay",   TOK.Comment},       {"\n",      TOK.Newline},
+{"&",       TOK.AndBinary},
 {"/*çağ*/", TOK.Comment},       {"&&",      TOK.AndLogical},
 {"/+çak+/", TOK.Comment},       {"&=",      TOK.AndAssign},
 {">",       TOK.Greater},       {"+",       TOK.Plus},
 {">=",      TOK.GreaterEqual},  {"++",      TOK.PlusPlus},
 {">>",      TOK.RShift},        {"+=",      TOK.PlusAssign},
 {"||",      TOK.OrLogical},     {":",       TOK.Colon},
 {"|=",      TOK.OrAssign},      {";",       TOK.Semicolon},
 {"?",       TOK.Question},      {",",       TOK.Comma},
 {"$",       TOK.Dollar},        {"cam",     TOK.Identifier},
 {"çay",     TOK.Identifier},    {".0",      TOK.Float64},
-{"0",       TOK.Int32},
+{"0",       TOK.Int32},         {"\n",      TOK.Newline},
+{"\r",      TOK.Newline},       {"\r\n",    TOK.Newline},
+{"\u2028",  TOK.Newline},       {"\u2029",  TOK.Newline}
 ];
 char[] src;
-foreach (pair; pairs)
+// Join all token texts into a single string.
-src ~= pair.token ~ " ";
+foreach (i, pair; pairs)
+if (pair.type == TOK.Comment && pair.tokenText[1] == '/') // Line comment.
-assert(pairs[0].token == "//çay\n");
+{
-// Remove \n after src has been constructed.
+assert(pairs[i+1].type == TOK.Newline); // Must be followed by a newline.
-// It won't be part of the scanned token string.
+src ~= pair.tokenText;
-pairs[0].token = "//çay";
+}
+else
+src ~= pair.tokenText ~ " ";
 auto lx = new Lexer(src, "");
 auto token = lx.getTokens();
 uint i;
 assert(token == lx.head);
-token = token.next;
+assert(token.next.type == TOK.Newline);
+token = token.next.next;
 do
 {
 assert(i < pairs.length);
-assert(token.srcText == pairs[i].token, Format("Scanned '{0}' but expected '{1}'", token.srcText, pairs[i].token));
+assert(token.srcText == pairs[i].tokenText, Format("Scanned '{0}' but expected '{1}'", token.srcText, pairs[i].tokenText));
 ++i;
 token = token.next;
 } while (token.type != TOK.EOF)
 }
 {
 Stdout("Testing method Lexer.peek()\n");
 string sourceText = "unittest { }";
 auto lx = new Lexer(sourceText, null);
-Token* next = lx.head;
+auto next = lx.head;
+lx.peek(next);
+assert(next.type == TOK.Newline);
 lx.peek(next);
 assert(next.type == TOK.Unittest);
 lx.peek(next);
 assert(next.type == TOK.LBrace);
 lx.peek(next);
 assert(next.type == TOK.RBrace);
+lx.peek(next);
+assert(next.type == TOK.EOF);
+lx = new Lexer("", null);
+next = lx.head;
+lx.peek(next);
+assert(next.type == TOK.Newline);
 lx.peek(next);
 assert(next.type == TOK.EOF);
 }
 unittest

Mercurial > projects > dil

comparison trunk/src/dil/Lexer.d @ 485:ea8c7459f1c4