Mercurial > projects > dang
view lexer/Lexer.d @ 89:a49bb982a7b0 new_gen
Using the new SourceLocation system to handle errors. Also, this is the first step toward making errors not throw, so that checking of the source can continue after an error.
author | Anders Johnsen <skabet@gmail.com> |
---|---|
date | Sun, 04 May 2008 20:27:01 +0200 |
parents | eb5b2c719a39 |
children | 1a24e61eb104 |
line wrap: on
line source
module lexer.Lexer;

import basic.Message,
       basic.SourceManager;

import lexer.Token,
       lexer.Keyword;

import tango.io.Stdout;

/**
  The Lexer turns the raw text of a source file into Tokens.

  It is constructed from a start SourceLocation, the SourceManager that owns
  the text and a MessageHandler that receives lexing errors. Tokens are then
  pulled with 'next' (consuming) or inspected with 'peek' (non-consuming).

  For more info about Tokens, look up the lexer.Token module.
*/
class Lexer
{
public:

    /**
      Create a new Lexer.

      params:
        start    = location the lexer starts at; the raw text is fetched from
                   src_mgr with this location and every token location is
                   computed as an offset from it.
        src_mgr  = source manager providing the raw data.
        messages = handler that lexing errors are reported to.
    */
    this(SourceLocation start, SourceManager src_mgr, MessageHandler messages)
    {
        this.messages = messages;
        sm = src_mgr;
        start_loc = start;
        position = 0;
        source = sm.getRawData(start_loc);

        // Character classification table. Entries that are never assigned
        // keep the enum's first member, CharType.INVALID.
        charTable.length = 256;
        foreach (c; "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_")
            charTable[c] = CharType.Letter;

        foreach (c; "0123456789")
            charTable[c] = CharType.Number;

        foreach (c; "(){}[];:.,=!<>+-*/%")
            charTable[c] = CharType.Symbol;

        // Tab and carriage return are whitespace too, so tab-indented files
        // and files with CRLF line endings do not produce InvalidSymbol errors.
        foreach (c; " \n\t\r")
            charTable[c] = CharType.Whitespace;

        // Dispatch table for symbols: one handler for every character that is
        // classified as CharType.Symbol above.
        symbolFunctions.length = 256;

        symbolFunctions['('] = &openParentheses;
        symbolFunctions[')'] = &closeParentheses;
        symbolFunctions['{'] = &openBrace;
        symbolFunctions['}'] = &closeBrace;
        symbolFunctions['['] = &openBracket;
        symbolFunctions[']'] = &closeBracket;
        symbolFunctions[';'] = &seperator;
        symbolFunctions[':'] = &colon;
        symbolFunctions['.'] = &dot;
        symbolFunctions[','] = &comma;
        symbolFunctions['='] = &eq;
        symbolFunctions['!'] = &ne;
        symbolFunctions['<'] = &le;
        symbolFunctions['>'] = &ge;
        symbolFunctions['+'] = &plus;
        symbolFunctions['-'] = &minus;
        symbolFunctions['*'] = &star;
        symbolFunctions['/'] = &slash;
        symbolFunctions['%'] = &percent;
    }

    /**
      Get the next token from the source. This method will move the
      internal position forward to the next Token.

      Whitespace is skipped. A character that is not in the character table is
      reported as InvalidSymbol (by getNextChar) and then skipped, so lexing
      continues after the error.

      return: A Token - Token.type is Tok.EOF if there are no more tokens in
      the file.
    */
    Token next()
    {
        // Loop instead of recursing once per skipped character.
        while (true)
        {
            switch (getNextChar())
            {
                case CharType.EOF:
                    SLoc loc;
                    return Token(Tok.EOF, loc, 0);

                case CharType.Symbol:
                    return lexSymbol();

                case CharType.Letter:
                    return lexLetter();

                case CharType.Number:
                    return lexNumber();

                case CharType.Whitespace:
                    position += 1;
                    break;

                default:
                    // CharType.INVALID: the error has already been reported,
                    // step over the character and keep going.
                    position += 1;
                    break;
            }
        }
    }

    /**
      Get a token from the source without consuming it. The internal position
      is restored before returning.

      Note that the tokens are really lexed, so any message reported while
      lexing them is reported by peek as well.

      params:
        skip = number of tokens to skip before the one that is returned;
               0 (the default) returns the token 'next' would return.

      return: A Token - Token.type is Tok.EOF if there are no more tokens in
      the file.
    */
    Token peek(int skip = 0)
    {
        int oldPosition = this.position;
        while (skip-- > 0)
            this.next();
        Token t = this.next();
        this.position = oldPosition;
        return t;
    }

private:

    /*
      Symbol handlers. They are called through symbolFunctions by lexSymbol,
      which has already consumed the first character of the symbol: inside a
      handler the symbol starts at position - 1 and source[position] (if it
      exists) is the character after it.
    */

    /// '==' (Tok.Eq) or '=' (Tok.Assign).
    Token eq()
    {
        if (followedBy('='))
        {
            position++;
            return Token(Tok.Eq, Loc(position - 2), 2);
        }
        return Token(Tok.Assign, Loc(position - 1), 1);
    }

    Token openBrace()
    {
        return Token(Tok.OpenBrace, Loc(position - 1), 1);
    }

    Token closeBrace()
    {
        return Token(Tok.CloseBrace, Loc(position - 1), 1);
    }

    Token openParentheses()
    {
        return Token(Tok.OpenParentheses, Loc(position - 1), 1);
    }

    Token closeParentheses()
    {
        return Token(Tok.CloseParentheses, Loc(position - 1), 1);
    }

    Token openBracket()
    {
        return Token(Tok.OpenBracket, Loc(position - 1), 1);
    }

    Token closeBracket()
    {
        return Token(Tok.CloseBracket, Loc(position - 1), 1);
    }

    Token seperator()
    {
        return Token(Tok.Seperator, Loc(position - 1), 1);
    }

    Token colon()
    {
        return Token(Tok.Colon, Loc(position - 1), 1);
    }

    /// A '.' directly followed by a digit starts a number, otherwise Tok.Dot.
    Token dot()
    {
        if (charTypeAt(0) == CharType.Number)
        {
            // Step back so lexNumber sees the '.' as part of the literal.
            position--;
            return lexNumber();
        }
        return Token(Tok.Dot, Loc(position - 1), 1);
    }

    Token comma()
    {
        return Token(Tok.Comma, Loc(position - 1), 1);
    }

    /// '!=' (Tok.Ne) or '!' (Tok.Not).
    Token ne()
    {
        if (followedBy('='))
        {
            position++;
            return Token(Tok.Ne, Loc(position - 2), 2);
        }
        return Token(Tok.Not, Loc(position - 1), 1);
    }

    /// '<=' (Tok.Le) or '<' (Tok.Lt).
    Token le()
    {
        if (followedBy('='))
        {
            position++;
            return Token(Tok.Le, Loc(position - 2), 2);
        }
        return Token(Tok.Lt, Loc(position - 1), 1);
    }

    /// '>=' (Tok.Ge) or '>' (Tok.Gt).
    Token ge()
    {
        if (followedBy('='))
        {
            position++;
            return Token(Tok.Ge, Loc(position - 2), 2);
        }
        return Token(Tok.Gt, Loc(position - 1), 1);
    }

    Token plus()
    {
        return Token(Tok.Plus, Loc(position - 1), 1);
    }

    Token minus()
    {
        return Token(Tok.Minus, Loc(position - 1), 1);
    }

    Token star()
    {
        return Token(Tok.Star, Loc(position - 1), 1);
    }

    /**
      Handles '/': a line comment, a block comment, a nesting comment or the
      Tok.Slash operator.

      Comments are skipped and the token following them is returned. The
      comment text is scanned without classifying its characters, so characters
      outside the character table are not reported inside a comment.

      An unterminated block or nesting comment is reported as
      UnexpectedEOFBlock and Tok.EOF is returned, like for a line comment that
      runs to the end of the source.
    */
    Token slash()
    {
        // '/' as the very last character of the source.
        if (position >= source.length)
            return Token(Tok.Slash, Loc(position - 1), 1);

        switch (source[position])
        {
            case '/':
                while (position < source.length)
                {
                    if (source[position++] == '\n')
                        return this.next();
                }
                return Token(Tok.EOF, Loc(position), 0);

            case '*':
                // Skip the '*' of the opener so that "/*/" is not taken for a
                // complete comment.
                position += 1;
                while (position < source.length)
                {
                    char c = source[position++];
                    if (c == '*' && followedBy('/'))
                    {
                        position++;
                        return this.next();
                    }
                }
                messages.report(UnexpectedEOFBlock, Loc(position));
                return Token(Tok.EOF, Loc(position), 0);

            case '+':
                // Skip the '+' of the opener so that "/+/" is not taken for a
                // complete comment.
                position += 1;
                int nesting = 1;
                while (position < source.length)
                {
                    char c = source[position++];
                    if (c == '+' && followedBy('/'))
                    {
                        // Consume the '/' as well; the '+' and '/' of a closer
                        // can not be part of another delimiter.
                        position++;
                        nesting--;
                        if (nesting == 0)
                            return this.next();
                    }
                    else if (c == '/' && followedBy('+'))
                    {
                        position++;
                        nesting++;
                    }
                }
                messages.report(UnexpectedEOFBlock, Loc(position));
                return Token(Tok.EOF, Loc(position), 0);

            default:
                return Token(Tok.Slash, Loc(position - 1), 1);
        }
    }

    Token percent()
    {
        return Token(Tok.Percent, Loc(position - 1), 1);
    }

    /**
      Lex a number starting at position.

      Digits, '_', '.' and 'e'/'E' are consumed. A second '.' is reported as
      OnlyOneDotFloating and a second 'e'/'E' as OnlyOneEFloating; in both
      cases the character is still consumed and scanning continues.

      The token type is always Tok.Integer, also when a '.' or an exponent
      was seen.
    */
    Token lexNumber ()
    {
        bool dot = false;
        bool e = false;
        int i = 0;

        scan: while (true)
        {
            switch (charTypeAt(i))
            {
                case CharType.Number:
                    break;

                case CharType.Symbol:
                    if (source[position + i] != '.')
                        break scan;
                    if (dot)
                        messages.report(OnlyOneDotFloating, Loc(position + i));
                    dot = true;
                    break;

                case CharType.Letter:
                    char c = source[position + i];
                    if (c == '_')
                        break;
                    if (c != 'e' && c != 'E')
                        break scan;
                    if (e)
                        messages.report(OnlyOneEFloating, Loc(position + i));
                    e = true;
                    break;

                default:
                    break scan;
            }
            i++;
        }

        position += i;

        return Token(Tok.Integer, Loc(position - i), i);
    }

    /// Consume one symbol character and run the handler registered for it.
    Token lexSymbol ()
    {
        Token t = symbolFunctions[source[position++]]();

        return t;
    }

    /**
      Lex an identifier or keyword starting at position.

      The first character is taken as is (next only calls this on a Letter);
      after it letters and digits are consumed. A word without digits is looked
      up in 'keywords' and gets the keyword's token type when found.
    */
    Token lexLetter ()
    {
        int i = 1;
        bool hasNumber = false;

        for (;; i++)
        {
            CharType type = charTypeAt(i);
            if (type == CharType.Number)
                hasNumber = true;
            else if (type != CharType.Letter)
                break;
        }

        Token t = Token(Tok.Identifier, Loc(position), i);

        // The lookup is skipped for words containing a digit.
        if (!hasNumber)
        {
            char[] str = source[position .. position + i];
            if(str in keywords)
                t.type = keywords[str];
        }

        position += i;

        return t;
    }

    /**
      Classify the character at position + offset and report it as
      InvalidSymbol, at its own location, when it is not in the character
      table.

      return: CharType.EOF when position + offset is past the end of the
      source, otherwise the table entry of the character.
    */
    CharType getNextChar(int offset = 0)
    {
        CharType c = charTypeAt(offset);

        if (c == CharType.INVALID)
        {
            char current = source[position + offset];
            messages.report(InvalidSymbol, Loc(position + offset)).arg(current);
        }

        return c;
    }

    /**
      Same classification as getNextChar but without reporting anything. Used
      for lookahead, so that a character is only reported when 'next' actually
      reaches it.
    */
    CharType charTypeAt(int offset = 0)
    {
        if (position + offset >= this.source.length)
            return CharType.EOF;

        return charTable[source[position + offset]];
    }

    /// True when there is a character at position and it equals c.
    bool followedBy(char c)
    {
        return position < source.length && source[position] == c;
    }

    /**
      Make a SourceLocation from an offset into the source.

      params:
        pos = offset from start_loc; a negative value (the default) means the
              current position.
    */
    private final SourceLocation Loc(int pos = -1)
    {
        if (pos < 0)
            return start_loc + position;
        return start_loc + pos;
    }

    SourceManager sm;
    SourceLocation start_loc;
    int position;               // offset of the next unread character in source
    char[] source;
    MessageHandler messages;
    CharType[] charTable;       // indexed by character value, 256 entries
    Token delegate()[] symbolFunctions; // indexed by character value, 256 entries
}

/// Classification of a single source character, as stored in Lexer.charTable.
enum CharType : ubyte
{
    INVALID,
    Letter,
    Number,
    Symbol,
    Whitespace,

    EOF
}