diff src/lexer/Lexer.d @ 206:d3c148ca429b

Major moving of files. all src now goes into src, all docs in docs.
author Anders Johnsen <skabet@gmail.com>
date Tue, 12 Aug 2008 18:14:56 +0200
parents
children e0551773a005
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lexer/Lexer.d	Tue Aug 12 18:14:56 2008 +0200
@@ -0,0 +1,447 @@
+module lexer.Lexer;
+
+import basic.Message,
+       basic.SourceManager;
+
+import lexer.Token,
+       lexer.Keyword;
+
+import tango.io.Stdout;
+
+/**
+  The Lexer class supplies methods to tokenize a D source file. Construct it
+  with a SourceLocation, a SourceManager and a MessageHandler, then use 'next'
+  and 'peek' to read Tokens from the file.
+
+  For more info about Tokens, look up the lexer.Token module.
+*/  
+class Lexer
+{
+public:
+
+    /**
+      Create a new Lexer.
+
+      Stores what src_mgr.getRawData(start) returns as the text to lex and
+      builds the two 256-entry lookup tables used while lexing: charTable
+      (character -> CharType) and symbolFunctions (symbol character ->
+      handler method).
+
+      params:
+        start    = location handed to SourceManager.getRawData; also the base
+                   that Loc() adds offsets to
+        src_mgr  = source manager the text is fetched from
+        messages = handler that lexer errors are reported to
+    */
+    this(SourceLocation start, SourceManager src_mgr, MessageHandler messages)
+    {
+        this.messages = messages;
+        sm = src_mgr;
+        start_loc = start;
+        position = 0;
+        source = sm.getRawData(start_loc);
+
+        // Entries that are not assigned below keep the enum's default value
+        // (its first member, CharType.INVALID), which getNextChar reports as
+        // an invalid symbol.
+        charTable.length = 256;
+        foreach (c; "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_")
+            charTable[c] = CharType.Letter;
+
+        foreach (c; "0123456789")
+            charTable[c] = CharType.Number;
+
+        foreach (c; "(){}[];:.,=!<>+-*/%\"`")
+            charTable[c] = CharType.Symbol;
+
+        // Tab, vertical tab, form feed and carriage return are whitespace in
+        // D as well; without them any tab-indented or CRLF-terminated file
+        // is rejected as containing an invalid symbol.
+        foreach (c; " \t\v\f\r\n")
+            charTable[c] = CharType.Whitespace;
+
+        foreach (c; "'\\")
+            charTable[c] = CharType.Other;
+
+        // One handler per character classified as Symbol above; lexSymbol
+        // indexes this table with the symbol character.
+        symbolFunctions.length = 256;
+
+        symbolFunctions['('] = &openParentheses;
+        symbolFunctions[')'] = &closeParentheses;
+        symbolFunctions['{'] = &openBrace;
+        symbolFunctions['}'] = &closeBrace;
+        symbolFunctions['['] = &openBracket;
+        symbolFunctions[']'] = &closeBracket;
+        symbolFunctions[';'] = &seperator;
+        symbolFunctions[':'] = &colon;
+        symbolFunctions['.'] = &dot;
+        symbolFunctions[','] = &comma;
+        symbolFunctions['='] = &eq;
+        symbolFunctions['!'] = &ne;
+        symbolFunctions['<'] = &le;
+        symbolFunctions['>'] = &ge;
+        symbolFunctions['+'] = &plus;
+        symbolFunctions['-'] = &minus;
+        symbolFunctions['*'] = &star;
+        symbolFunctions['/'] = &slash;
+        symbolFunctions['%'] = &percent;
+        symbolFunctions['"'] = &string;
+        symbolFunctions['`'] = &string;
+    }
+
+    /**
+      Get the next token from the source. This method moves the internal
+      position forward past the returned Token; whitespace in front of the
+      token is skipped.
+
+      return: A Token - its type is Tok.EOF if there are no more tokens
+        in the file.
+      */
+    Token next()
+    {
+        // Skip whitespace with a loop rather than one recursive call per
+        // character, so a long run of blanks cannot exhaust the stack.
+        CharType kind = getNextChar();
+        while (kind == CharType.Whitespace)
+        {
+            position += 1;
+            kind = getNextChar();
+        }
+
+        switch (kind)
+        {
+            case CharType.EOF:
+                SLoc loc;
+                return Token(Tok.EOF, loc, 0);
+
+            case CharType.Symbol:
+                return lexSymbol;
+
+            case CharType.Letter:
+                return lexLetter;
+
+            case CharType.Number:
+                return lexNumber;
+
+            case CharType.Other:
+                // No token is produced for these characters.
+                // NOTE(review): presumably fatal() does not return - confirm.
+                messages.report(UnexpectedTok, Loc(position)).fatal(ExitLevel.Lexer);
+        }
+    }
+
+    /**
+      Look ahead at an upcoming token without consuming anything: the token
+      is lexed with next() and the internal position is restored afterwards.
+
+      params:
+        skip = number of tokens to pass over first; 0 (the default) yields
+               the token next() would return
+
+      return: A Token - its type is Tok.EOF if there are no more tokens
+        in the file.
+      */
+    Token peek(int skip = 0)
+    {
+        int saved = this.position;
+
+        for (int passed = 0; passed < skip; ++passed)
+            this.next;
+
+        Token ahead = this.next;
+        this.position = saved;
+        return ahead;
+    }
+
+private:
+    /**
+      Lex a token starting with '='. lexSymbol has already stepped over the
+      '=', so it sits at position - 1.
+
+      return: Tok.Eq (length 2) for "==", otherwise Tok.Assign (length 1).
+    */
+    Token eq()
+    {
+        // Bounds check first: a '=' that is the very last character of the
+        // source has nothing after it to inspect.
+        if(position < source.length && source[position] == '=')
+            return Token(Tok.Eq, Loc(position++ - 1), 2);
+        return Token(Tok.Assign, Loc(position - 1), 1);
+    }
+    // Single-character tokens. lexSymbol has already stepped over the
+    // symbol, so each token is located at position - 1 and has length 1.
+
+    /// '{'
+    Token openBrace()
+    {
+        SourceLocation where = Loc(position - 1);
+        return Token(Tok.OpenBrace, where, 1);
+    }
+    /// '}'
+    Token closeBrace()
+    {
+        SourceLocation where = Loc(position - 1);
+        return Token(Tok.CloseBrace, where, 1);
+    }
+    /// '('
+    Token openParentheses()
+    {
+        SourceLocation where = Loc(position - 1);
+        return Token(Tok.OpenParentheses, where, 1);
+    }
+    /// ')'
+    Token closeParentheses()
+    {
+        SourceLocation where = Loc(position - 1);
+        return Token(Tok.CloseParentheses, where, 1);
+    }
+    /// '['
+    Token openBracket()
+    {
+        SourceLocation where = Loc(position - 1);
+        return Token(Tok.OpenBracket, where, 1);
+    }
+    /// ']'
+    Token closeBracket()
+    {
+        SourceLocation where = Loc(position - 1);
+        return Token(Tok.CloseBracket, where, 1);
+    }
+    /// ';'
+    Token seperator()
+    {
+        SourceLocation where = Loc(position - 1);
+        return Token(Tok.Seperator, where, 1);
+    }
+    /// ':'
+    Token colon()
+    {
+        SourceLocation where = Loc(position - 1);
+        return Token(Tok.Colon, where, 1);
+    }
+    /**
+      Lex a token starting with '.'. lexSymbol has already stepped over the
+      '.', so it sits at position - 1.
+
+      A '.' immediately followed by a digit starts a number: the position is
+      moved back onto the '.' and lexNumber takes over. Anything else is a
+      plain Tok.Dot.
+
+      The previous underscore-skipping loop could never produce a number
+      (it kept testing the same character) and indexed source past its end
+      when the '.' was among the last two characters, so it is removed.
+    */
+    Token dot()
+    {
+        // getNextChar is bounds-safe: it yields CharType.EOF at the end.
+        if(getNextChar(0) == CharType.Number)
+        {
+            position--;
+            return lexNumber();
+        }
+        return Token(Tok.Dot, Loc(position - 1), 1);
+    }
+    /// ',' - already stepped over by lexSymbol, so it sits at position - 1.
+    Token comma()
+    {
+        SourceLocation where = Loc(position - 1);
+        return Token(Tok.Comma, where, 1);
+    }
+    /**
+      Lex a token starting with '!'. lexSymbol has already stepped over the
+      '!', so it sits at position - 1.
+
+      return: Tok.Ne (length 2) for "!=", otherwise Tok.Not (length 1).
+    */
+    Token ne()
+    {
+        // Bounds check first: a '!' at the very end of the source has no
+        // following character to inspect.
+        if(position < source.length && source[position] == '=')
+            return Token(Tok.Ne, Loc(position++ - 1), 2);
+        return Token(Tok.Not, Loc(position - 1), 1);
+    }
+    /**
+      Lex a token starting with '<'. lexSymbol has already stepped over the
+      '<', so it sits at position - 1.
+
+      return: Tok.Le (length 2) for "<=", otherwise Tok.Lt (length 1).
+    */
+    Token le()
+    {
+        // Bounds check first: a '<' at the very end of the source has no
+        // following character to inspect.
+        if(position < source.length && source[position] == '=')
+            return Token(Tok.Le, Loc(position++ - 1), 2);
+        return Token(Tok.Lt, Loc(position - 1), 1);
+    }
+    /**
+      Lex a token starting with '>'. lexSymbol has already stepped over the
+      '>', so it sits at position - 1.
+
+      return: Tok.Ge (length 2) for ">=", otherwise Tok.Gt (length 1).
+    */
+    Token ge()
+    {
+        // Bounds check first: a '>' at the very end of the source has no
+        // following character to inspect.
+        if(position < source.length && source[position] == '=')
+            return Token(Tok.Ge, Loc(position++ - 1), 2);
+        return Token(Tok.Gt, Loc(position - 1), 1);
+    }
+    // Single-character operators. lexSymbol has already stepped over the
+    // symbol, so each token is located at position - 1 and has length 1.
+
+    /// '+'
+    Token plus()
+    {
+        SourceLocation where = Loc(position - 1);
+        return Token(Tok.Plus, where, 1);
+    }
+    /// '-'
+    Token minus()
+    {
+        SourceLocation where = Loc(position - 1);
+        return Token(Tok.Minus, where, 1);
+    }
+    /// '*'
+    Token star()
+    {
+        SourceLocation where = Loc(position - 1);
+        return Token(Tok.Star, where, 1);
+    }
+    /**
+      Lex a token starting with '/'. lexSymbol has already stepped over the
+      '/', so it sits at position - 1 and position is on the character
+      after it.
+
+      Handles three comment forms, all of which are skipped (the token after
+      the comment is returned via next()):
+        - line comments, up to and including the next '\n'
+        - block comments, slash-star ... star-slash
+        - nesting comments, slash-plus ... plus-slash, which may nest
+      Anything else is a plain Tok.Slash.
+
+      Comment text is scanned with plain bounds checks instead of
+      getNextChar, so characters that are not in the lexer's character table
+      are accepted inside comments.
+
+      return: the token following a comment, Tok.Slash, or Tok.EOF when the
+        source ends inside a comment. An unterminated block or nesting
+        comment is additionally reported as UnexpectedEOFBlock.
+    */
+    Token slash()
+    {
+        // A '/' that is the very last character cannot start a comment.
+        if(position >= source.length)
+            return Token(Tok.Slash, Loc(position - 1), 1);
+
+        switch(source[position])
+        {
+            case '/':
+                while(position < source.length)
+                {
+                    if(source[position++] == '\n')
+                        return this.next;
+                }
+                return Token(Tok.EOF, Loc(position), 0);
+
+            case '*':
+                // Step over the opening '*' so that it cannot double as the
+                // '*' of the terminator (slash-star-slash is not complete).
+                ++position;
+                while(position + 1 < source.length)
+                {
+                    if(source[position] == '*' && source[position + 1] == '/')
+                    {
+                        position += 2;
+                        return this.next;
+                    }
+                    ++position;
+                }
+                // Unterminated: everything up to the end was comment text.
+                // Return here instead of falling through into the next case.
+                position = source.length;
+                messages.report(UnexpectedEOFBlock,Loc(position));
+                return Token(Tok.EOF, Loc(position), 0);
+
+            case '+':
+            {
+                // Step over the opening '+' (same reasoning as for '*').
+                ++position;
+                int nesting = 1;
+                while(position + 1 < source.length)
+                {
+                    if(source[position] == '+' && source[position + 1] == '/')
+                    {
+                        // Consume exactly the two terminator characters so
+                        // the character after the comment is not swallowed.
+                        position += 2;
+                        if(--nesting == 0)
+                            return this.next;
+                    }
+                    else if(source[position] == '/' && source[position + 1] == '+')
+                    {
+                        position += 2;
+                        ++nesting;
+                    }
+                    else
+                        ++position;
+                }
+                position = source.length;
+                messages.report(UnexpectedEOFBlock,Loc(position));
+                return Token(Tok.EOF, Loc(position), 0);
+            }
+
+            default:
+                return Token(Tok.Slash, Loc(position - 1), 1);
+        }
+    }
+
+    /// '%' - already stepped over by lexSymbol, so it sits at position - 1.
+    Token percent()
+    {
+        SourceLocation where = Loc(position - 1);
+        return Token(Tok.Percent, where, 1);
+    }
+
+    /**
+      Lex a string literal.
+
+      Reached in two ways:
+        - from lexSymbol, with position just past the opening quote
+        - from lexLetter, with position on the opening quote that follows a
+          one-letter prefix
+      In both cases the first step moves back one character; if that lands
+      on a letter (the prefix) it is stepped over again, leaving position on
+      the opening quote and start on the first character of the token.
+
+      Double-quoted strings honour backslash escapes (the character after a
+      backslash is skipped). Backquoted strings and double-quoted strings
+      with an 'r' prefix end at the first matching quote.
+
+      return: a Tok.String token covering prefix (if any) and both quotes.
+        If the source ends first, UnexpectedEOFBlock is reported as fatal.
+
+      NOTE(review): the string body is scanned through getNextChar, so a
+      character missing from the character table is reported as an invalid
+      symbol even inside a string - confirm whether that is intended.
+    */
+    Token string()
+    {
+        --position;
+        int start = position;
+        if(getNextChar() == CharType.Letter)
+            position++;
+        char end = '`';
+        switch(source[position])
+        {
+            case '"':
+                // Only an 'r' that belongs to this token (position > start
+                // means a prefix letter was stepped over) makes the string
+                // raw; an 'r' ending a preceding identifier must not.
+                if(position > start)
+                    if(source[position-1] == 'r')
+                    {
+                        end = '"';
+                        goto string_wys;
+                    }
+                ++position;
+                while(getNextChar != CharType.EOF)
+                {
+                    ++position;
+                    if (source[position-1] == '"' )
+                        return Token(Tok.String, Loc(start), position - start);
+                    else if (source[position-1] == '\\')
+                        position++;    // skip the escaped character
+                }
+                break;
+                case '`':
+string_wys:     
+                ++position;
+                while(getNextChar != CharType.EOF)
+                {
+                    ++position;
+                    if (source[position-1] == end )
+                        return Token(Tok.String, Loc(start), position - start);
+                }
+                break;
+        }
+        messages.report(UnexpectedEOFBlock, Loc(position)).fatal(ExitLevel.Lexer);
+    }
+    
+    /**
+      Lex a numeric literal beginning at the current position.
+
+      Consumes digits, '_', '.', 'e' and 'E' until any other character or
+      the end of the source. A second '.' or a second exponent letter is
+      reported to the message handler but still consumed.
+
+      return: a Tok.Integer token spanning the consumed characters; the same
+        type is used whether or not a '.' or an exponent was seen.
+    */
+    Token lexNumber ()
+    {
+        bool seenDot = false;
+        bool seenExponent = false;
+        int count = 0;
+
+        while (true)
+        {
+            CharType kind = getNextChar(count);
+
+            if (kind == CharType.Symbol)
+            {
+                // '.' is the only symbol accepted inside a number.
+                if (this.source[position + count] != '.')
+                    break;
+                if (seenDot)
+                    messages.report(OnlyOneDotFloating, Loc(position + count));
+                seenDot = true;
+            }
+            else if (kind == CharType.Letter)
+            {
+                char c = this.source[position + count];
+                if (c == 'e' || c == 'E')
+                {
+                    if (seenExponent)
+                        messages.report(OnlyOneEFloating, Loc(position + count));
+                    seenExponent = true;
+                }
+                else if (c != '_')
+                    break;
+            }
+            else if (kind != CharType.Number)
+                break;
+
+            count++;
+        }
+
+        int first = position;
+        position += count;
+
+        return Token(Tok.Integer, Loc(first), count);
+    }
+
+    /**
+      Lex a token that starts with a symbol character: step over the symbol
+      and call the handler registered for it in symbolFunctions. The handler
+      therefore finds its symbol at position - 1.
+    */
+    Token lexSymbol ()
+    {
+        char symbol = source[position];
+        ++position;
+        return symbolFunctions[symbol]();
+    }
+
+    /**
+      Lex a token that starts with a letter or '_'.
+
+      A letter directly followed by a double quote or a backquote is treated
+      as a string prefix and handed to string(). Otherwise the run of
+      letters and digits is an identifier; if it contains no digit and is
+      found in the keywords table, the token type is replaced by the
+      keyword's type.
+
+      return: a Tok.Identifier or keyword token located at the current
+        position, or whatever string() returns.
+    */
+    Token lexLetter ()
+    {
+        int i = 0;
+        bool hasNumber = false;
+        // Bounds check first: a letter that is the very last character of
+        // the source has no following character to inspect.
+        if (position + 1 < source.length &&
+            (source[position+1] == '"' ||
+             source[position+1] == '`'))
+        {
+            ++position;
+            return string;
+        }
+        // The character at offset 0 is already known to be a letter, so
+        // scanning starts at offset 1.
+        while (getNextChar(++i) == CharType.Letter || 
+                getNextChar(i) == CharType.Number)
+        {
+            if (getNextChar(i) == CharType.Number)
+            {
+                hasNumber = true;
+            }
+        }
+
+        // Loc() without argument is the current position, i.e. the first
+        // character of the identifier (position is advanced further down).
+        Token t = Token(Tok.Identifier, Loc(), i);
+
+        // The keyword lookup is skipped for identifiers containing a digit.
+        if (!hasNumber)
+        {
+            char[] str = source[position .. position + i];
+            if(str in keywords)
+                t.type = keywords[str];
+        }
+
+        position += i;
+
+        return t;
+    }
+
+    /**
+      Classify the character at position + offset without consuming it.
+
+      params:
+        offset = distance from the current position; 0 (the default) is the
+                 next character to be lexed
+
+      return: CharType.EOF if position + offset is at or past the end of the
+        source, otherwise the character's entry in charTable. A character
+        whose entry is CharType.INVALID is reported as InvalidSymbol (fatal),
+        with its numeric code as the message argument.
+    */
+    CharType getNextChar(int offset = 0)
+    {
+        if (position + offset >= this.source.length)
+            return CharType.EOF;
+
+        char current = source[position + offset];
+
+        CharType c = charTable[current];
+
+        // Report the location of the character that was actually inspected,
+        // not the current position, so look-ahead errors point at the
+        // offending character.
+        if(c == CharType.INVALID)
+            messages.report(InvalidSymbol, Loc(position + offset))
+                .arg(Integer.toString(cast(int)current))
+                .fatal(ExitLevel.Lexer);
+
+        return c;
+
+    }
+
+    /**
+      Turn an index into source into a SourceLocation by adding it to
+      start_loc.
+
+      params:
+        pos = index into source; any negative value (the default is -1)
+              means "the current position"
+    */
+    private final SourceLocation Loc(int pos = -1)
+    {
+        int index = (pos < 0) ? position : pos;
+        return start_loc + index;
+    }
+
+    /// Source manager the raw text was obtained from.
+    SourceManager sm;
+    /// Location passed to the constructor; Loc() adds indices to it.
+    SourceLocation start_loc;
+    /// Index into source of the next character to lex.
+    int position;
+    /// Text being lexed, as returned by sm.getRawData(start_loc).
+    char[] source;
+    /// Handler that lexer errors are reported to.
+    MessageHandler messages;
+    /// Maps a character code (0..255) to its CharType; filled in the constructor.
+    CharType[] charTable;
+    /// Maps a symbol character to the method that lexes tokens starting with it.
+    Token delegate()[] symbolFunctions;
+}
+
+/**
+  Classification of a single source character, as stored in Lexer.charTable
+  and returned by Lexer.getNextChar.
+
+  INVALID must stay the first member: it is the enum's default value, so
+  every charTable entry the Lexer constructor does not assign is INVALID.
+*/
+enum CharType : ubyte
+{
+    INVALID,    /// not assigned a class by the Lexer constructor
+    Letter,     /// a-z, A-Z and '_'
+    Number,     /// 0-9
+    Symbol,     /// characters that have a handler in symbolFunctions
+    Whitespace, /// skipped between tokens
+    Other,      /// single quote and backslash; Lexer.next reports them as fatal
+
+    EOF         /// returned by getNextChar at or past the end of the source
+}
+