Mercurial > projects > dang
diff src/lexer/Lexer.d @ 206:d3c148ca429b
Major moving of files. all src now goes into src, all docs in docs.
author | Anders Johnsen <skabet@gmail.com> |
---|---|
date | Tue, 12 Aug 2008 18:14:56 +0200 |
parents | |
children | e0551773a005 |
line wrap: on
line diff
// Reconstructed from the changeset diff that added src/lexer/Lexer.d.
module lexer.Lexer;

import basic.Message,
       basic.SourceManager;

import lexer.Token,
       lexer.Keyword;

import tango.io.Stdout;

/**
  The Lexer class will supply you with methods to tokenize a D file. Supply the
  Lexer with a SourceLocation, the SourceManager that holds the raw data for
  that location and a MessageHandler, and you can 'peek' and 'next' Tokens
  from the file.

  For more info about Tokens, look up the lexer.Token module.
*/
class Lexer
{
public:

    /**
      Create a new Lexer.

      The raw text is fetched once from src_mgr for the given start location.
      Two 256-entry tables are built: one classifying every char into a
      CharType, one mapping every symbol char to the method that lexes it.

      params:
        start    = location of the first character of the text to lex
        src_mgr  = manager queried for the raw data at start
        messages = handler that receives lexer errors
     */
    this(SourceLocation start, SourceManager src_mgr, MessageHandler messages)
    {
        this.messages = messages;
        sm = src_mgr;
        start_loc = start;
        position = 0;
        source = sm.getRawData(start_loc);

        // Every entry starts out as CharType.INVALID (the first enum member).
        charTable.length = 256;
        foreach (c; "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_")
            charTable[c] = CharType.Letter;

        foreach (c; "0123456789")
            charTable[c] = CharType.Number;

        foreach (c; "(){}[];:.,=!<>+-*/%\"`")
            charTable[c] = CharType.Symbol;

        // NOTE(review): tab and carriage return are not listed here, so they
        // are reported as invalid symbols - confirm whether that is intended.
        foreach (c; " \n")
            charTable[c] = CharType.Whitespace;

        foreach (c; "'\\")
            charTable[c] = CharType.Other;

        symbolFunctions.length = 256;

        symbolFunctions['('] = &openParentheses;
        symbolFunctions[')'] = &closeParentheses;
        symbolFunctions['{'] = &openBrace;
        symbolFunctions['}'] = &closeBrace;
        symbolFunctions['['] = &openBracket;
        symbolFunctions[']'] = &closeBracket;
        symbolFunctions[';'] = &seperator;
        symbolFunctions[':'] = &colon;
        symbolFunctions['.'] = &dot;
        symbolFunctions[','] = &comma;
        symbolFunctions['='] = &eq;
        symbolFunctions['!'] = &ne;
        symbolFunctions['<'] = &le;
        symbolFunctions['>'] = &ge;
        symbolFunctions['+'] = &plus;
        symbolFunctions['-'] = &minus;
        symbolFunctions['*'] = &star;
        symbolFunctions['/'] = &slash;
        symbolFunctions['%'] = &percent;
        symbolFunctions['"'] = &string;
        symbolFunctions['`'] = &string;
    }

    /**
      Get the next token from the source. This method will move the
      internal position forward to the next Token.

      return: A Token - its type is Tok.EOF if there are no more tokens
      in the file.
     */
    Token next()
    {
        // Skip whitespace with a loop rather than one recursive call per
        // whitespace character.
        CharType type = getNextChar();
        while (type == CharType.Whitespace)
        {
            position += 1;
            type = getNextChar();
        }

        switch (type)
        {
            case CharType.EOF:
                SLoc loc;
                return Token(Tok.EOF, loc, 0);

            case CharType.Symbol:
                return lexSymbol();

            case CharType.Letter:
                return lexLetter();

            case CharType.Number:
                return lexNumber();

            case CharType.Other:
                messages.report(UnexpectedTok, Loc(position)).fatal(ExitLevel.Lexer);
                // fatal() is presumably not expected to return - TODO confirm.
                // If it does, control reaches the assert below.

            default:
                // Whitespace is consumed above and INVALID is made fatal by
                // getNextChar, so no other value can arrive here.
                assert(0);
        }
    }

    /**
      Get a token from the source without consuming it. The internal
      position is restored before returning.

      params:
        skip = number of tokens to step over before the one that is returned
               (0 returns the token that next() would return)

      return: A Token - its type is Tok.EOF if there are no more tokens
      in the file.
     */
    Token peek(int skip = 0)
    {
        int oldPosition = this.position;
        while (skip-- > 0)
            next();
        Token t = next();
        this.position = oldPosition;
        return t;
    }

private:
    // All symbol methods below are invoked through lexSymbol, which has
    // already advanced position past the symbol's first character. That is
    // why tokens start at position - 1.

    /// '=' or '=='.
    Token eq()
    {
        if (nextIs('='))
            return Token(Tok.Eq, Loc(position++ - 1), 2);
        return Token(Tok.Assign, Loc(position - 1), 1);
    }

    Token openBrace()
    {
        return Token(Tok.OpenBrace, Loc(position - 1), 1);
    }

    Token closeBrace()
    {
        return Token(Tok.CloseBrace, Loc(position - 1), 1);
    }

    Token openParentheses()
    {
        return Token(Tok.OpenParentheses, Loc(position - 1), 1);
    }

    Token closeParentheses()
    {
        return Token(Tok.CloseParentheses, Loc(position - 1), 1);
    }

    Token openBracket()
    {
        return Token(Tok.OpenBracket, Loc(position - 1), 1);
    }

    Token closeBracket()
    {
        return Token(Tok.CloseBracket, Loc(position - 1), 1);
    }

    Token seperator()
    {
        return Token(Tok.Seperator, Loc(position - 1), 1);
    }

    Token colon()
    {
        return Token(Tok.Colon, Loc(position - 1), 1);
    }

    /// '.' on its own, or the start of a number such as ".5".
    Token dot()
    {
        if (getNextChar(0) == CharType.Number)
        {
            // Step back onto the '.' so lexNumber sees the whole literal.
            position--;
            return lexNumber();
        }
        return Token(Tok.Dot, Loc(position - 1), 1);
    }

    Token comma()
    {
        return Token(Tok.Comma, Loc(position - 1), 1);
    }

    /// '!' or '!='.
    Token ne()
    {
        if (nextIs('='))
            return Token(Tok.Ne, Loc(position++ - 1), 2);
        return Token(Tok.Not, Loc(position - 1), 1);
    }

    /// '<' or '<='.
    Token le()
    {
        if (nextIs('='))
            return Token(Tok.Le, Loc(position++ - 1), 2);
        return Token(Tok.Lt, Loc(position - 1), 1);
    }

    /// '>' or '>='.
    Token ge()
    {
        if (nextIs('='))
            return Token(Tok.Ge, Loc(position++ - 1), 2);
        return Token(Tok.Gt, Loc(position - 1), 1);
    }

    Token plus()
    {
        return Token(Tok.Plus, Loc(position - 1), 1);
    }

    Token minus()
    {
        return Token(Tok.Minus, Loc(position - 1), 1);
    }

    Token star()
    {
        return Token(Tok.Star, Loc(position - 1), 1);
    }

    /**
      '/' on its own, or one of the three comment forms. Comments are
      skipped and the token following them is returned.

      Comment bodies are scanned with plain bounds checks instead of
      getNextChar, so characters the lexer does not otherwise accept are
      allowed inside comments.
     */
    Token slash()
    {
        // A '/' that is the very last character cannot start a comment.
        if (isEOF())
            return Token(Tok.Slash, Loc(position - 1), 1);

        switch (source[position])
        {
            case '/':
                // Line comment: runs up to and including the next newline.
                while (!isEOF())
                {
                    if (source[position++] == '\n')
                        return next();
                }
                return Token(Tok.EOF, Loc(position), 0);

            case '*':
                // Step over the '*' of the opener so that "/*/" is not
                // mistaken for a complete comment.
                position += 1;
                while (!isEOF(1))
                {
                    if (source[position] == '*' && source[position + 1] == '/')
                    {
                        position += 2;
                        return next();
                    }
                    position++;
                }
                return unterminatedComment();

            case '+':
                // Nesting comment: every "/+" must be matched by a "+/".
                position += 1;
                int nesting = 1;
                while (!isEOF(1))
                {
                    if (source[position] == '+' && source[position + 1] == '/')
                    {
                        position += 2;
                        nesting--;
                        if (nesting == 0)
                            return next();
                    }
                    else if (source[position] == '/' && source[position + 1] == '+')
                    {
                        position += 2;
                        nesting++;
                    }
                    else
                        position++;
                }
                return unterminatedComment();

            default:
                return Token(Tok.Slash, Loc(position - 1), 1);
        }
    }

    /// Reports a comment that runs to the end of the source and yields EOF.
    Token unterminatedComment()
    {
        position = cast(int) source.length;
        messages.report(UnexpectedEOFBlock, Loc(position));
        return Token(Tok.EOF, Loc(position), 0);
    }

    Token percent()
    {
        return Token(Tok.Percent, Loc(position - 1), 1);
    }

    /**
      Lex a string literal delimited by '"' or '`'.

      Reached either from lexSymbol (position is one past the opening quote)
      or from lexLetter (position is on the quote, one past a single prefix
      letter). In both cases stepping back one character lands on the first
      character of the literal.

      A backslash skips the following character, except in '`' strings and
      in '"' strings carrying an 'r' prefix. The returned token spans the
      prefix letter (if any) and both quotes.
     */
    Token string()
    {
        --position;
        int start = position;

        bool hasPrefix = getNextChar() == CharType.Letter;
        if (hasPrefix)
            position++;

        char quote = source[position];
        // Only a prefix that belongs to this literal makes it raw; the
        // character before a bare '"' is unrelated to the string.
        bool raw = quote == '`' || (hasPrefix && source[start] == 'r');

        ++position;
        while (!isEOF())
        {
            char c = source[position++];
            if (c == quote)
                return Token(Tok.String, Loc(start), position - start);
            if (c == '\\' && !raw)
                position++;
        }

        messages.report(UnexpectedEOFBlock, Loc(position)).fatal(ExitLevel.Lexer);
    }

    /**
      Lex a numeric literal made of digits, '_', '.', and 'e'/'E'.

      A second '.' or a second exponent marker is reported but scanning
      continues. The token type is always Tok.Integer, including for input
      that contains '.' or an exponent.
     */
    Token lexNumber ()
    {
        bool dot = false;
        bool e = false;

        int i = 0;

        bool end = false;
        while (!end)
        {
            switch (getNextChar(i))
            {
                case CharType.Number:
                    break;

                case CharType.Symbol:
                    if (this.source[position + i] == '.')
                    {
                        if (dot)
                            messages.report(OnlyOneDotFloating, Loc(position + i));
                        dot = true;
                        break;
                    }
                    end = true;
                    continue;

                case CharType.Letter:
                    if (this.source[position + i] == '_')
                        break;
                    if (this.source[position + i] == 'e' ||
                        this.source[position + i] == 'E')
                    {
                        if (e)
                            messages.report(OnlyOneEFloating, Loc(position + i));
                        e = true;
                        break;
                    }
                    end = true;
                    continue;

                default:
                    end = true;
                    continue;
            }
            i++;
        }

        position += i;

        return Token(Tok.Integer, Loc(position - i), i);
    }

    /// Consume one symbol character and dispatch to its handler.
    Token lexSymbol ()
    {
        Token t = symbolFunctions[source[position++]]();

        return t;
    }

    /**
      Lex an identifier or keyword, or hand over to string() when a single
      letter is immediately followed by a quote.

      The keyword table is consulted only when the identifier contains no
      digits.
     */
    Token lexLetter ()
    {
        if (!isEOF(1) &&
            (source[position + 1] == '"' || source[position + 1] == '`'))
        {
            ++position;
            return string();
        }

        int i = 0;
        bool hasNumber = false;
        while (true)
        {
            CharType type = getNextChar(++i);
            if (type == CharType.Number)
                hasNumber = true;
            else if (type != CharType.Letter)
                break;
        }

        Token t = Token(Tok.Identifier, Loc(), i);

        if (!hasNumber)
        {
            char[] str = source[position .. position + i];
            if (str in keywords)
                t.type = keywords[str];
        }

        position += i;

        return t;
    }

    /**
      Classify the character at position + offset without consuming it.

      Returns CharType.EOF past the end of the source. A character the
      table does not know is reported as InvalidSymbol and made fatal.
     */
    CharType getNextChar(int offset = 0)
    {
        if (position + offset >= this.source.length)
            return CharType.EOF;

        char current = source[position + offset];

        CharType c = charTable[current];

        if (c == CharType.INVALID)
            messages.report(InvalidSymbol, Loc())
                .arg(Integer.toString(cast(int)current))
                .fatal(ExitLevel.Lexer);

        return c;
    }

    /// True when position + offset is at or beyond the end of the source.
    bool isEOF(int offset = 0)
    {
        return position + offset >= this.source.length;
    }

    /// True when a character exists at the current position and equals c.
    bool nextIs(char c)
    {
        return !isEOF() && source[position] == c;
    }

    /**
      Translate an offset into the source into a SourceLocation. A negative
      pos (the default) means the current position.
     */
    private final SourceLocation Loc(int pos = -1)
    {
        if (pos < 0)
            return start_loc + position;
        return start_loc + pos;
    }

    SourceManager sm;
    SourceLocation start_loc;       // location of source[0]
    int position;                   // index of the next unread character
    char[] source;                  // raw text being lexed
    MessageHandler messages;
    CharType[] charTable;           // 256 entries, indexed by char
    Token delegate()[] symbolFunctions; // 256 entries, indexed by symbol char
}

/// Classification of a single source character.
enum CharType : ubyte
{
    INVALID,
    Letter,
    Number,
    Symbol,
    Whitespace,
    Other,

    EOF
}