view lexer/Lexer.d @ 88:eb5b2c719a39 new_gen

Major change to locations, tokens and expressions. A location (now SourceLocation or SLoc) is only 32 bits in size - the disadvantage is that it can't find its own text; you have to go through the new SourceManager to do that. This has caused changes to a lot of stuff and the removal of DataSource and the old Location. Additionally, Exp has gotten some location stuff, so we can give proper error messages. Not in Decl and Stmt yet, but that's coming too.
author Anders Halager <halager@gmail.com>
date Sun, 04 May 2008 18:13:46 +0200
parents 192da4976daa
children a49bb982a7b0
line wrap: on
line source

module lexer.Lexer;

import misc.Error,
       basic.SourceManager;

import lexer.Token,
       lexer.Keyword;

import tango.io.Stdout;

/**
  The Lexer class will supply you with methods to tokenize a D file. Supply the
  Lexer with a SourceLocation and a SourceManager and you can 'peek' and 'next'
  Tokens from the file.

  For more info about Tokens, look up the lexer.Token module.
*/  
class Lexer
{
public:

    /**
      Create a new Lexer.

      params:
        start = location of the first character to tokenize. Every token
                location produced by this Lexer is start plus an offset.
        src_mgr = the SourceManager whose getRawData(start) supplies the
                  text to tokenize.
    */
    this(SourceLocation start, SourceManager src_mgr)
    {
        sm = src_mgr;
        start_loc = start;
        position = 0;
        source = sm.getRawData(start_loc);

        // Entries that are not assigned below keep the default value of
        // CharType (its first member), which getNextChar rejects.
        charTable.length = 256;
        foreach (c; "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_")
            charTable[c] = CharType.Letter;

        foreach (c; "0123456789")
            charTable[c] = CharType.Number;

        foreach (c; "(){}[];:.,=!<>+-*/%")
            charTable[c] = CharType.Symbol;

        // NOTE(review): only space and newline count as whitespace here; a
        // tab or carriage return outside a comment is reported as an invalid
        // symbol. Confirm whether that is intended.
        foreach (c; " \n")
            charTable[c] = CharType.Whitespace;

        // One handler per character classified as Symbol above; lexSymbol
        // dispatches through this table.
        symbolFunctions.length = 256;

        symbolFunctions['('] = &openParentheses;
        symbolFunctions[')'] = &closeParentheses;
        symbolFunctions['{'] = &openBrace;
        symbolFunctions['}'] = &closeBrace;
        symbolFunctions['['] = &openBracket;
        symbolFunctions[']'] = &closeBracket;
        symbolFunctions[';'] = &seperator;
        symbolFunctions[':'] = &colon;
        symbolFunctions['.'] = &dot;
        symbolFunctions[','] = &comma;
        symbolFunctions['='] = &eq;
        symbolFunctions['!'] = &ne;
        symbolFunctions['<'] = &le;
        symbolFunctions['>'] = &ge;
        symbolFunctions['+'] = &plus;
        symbolFunctions['-'] = &minus;
        symbolFunctions['*'] = &star;
        symbolFunctions['/'] = &slash;
        symbolFunctions['%'] = &percent;
    }

    /**
      Get the next token from the source. This method will move the
      internal position forward to the next Token.

      return: A Token - its type is Tok.EOF if there are no more tokens
        in the file.

      throws: the Error built by error() when an unclassified character or
        an unclosed block comment is met.
      */
    Token next()
    {
        // Whitespace is skipped with a loop instead of one recursive call per
        // character, so long runs of blanks cannot exhaust the stack.
        while (true)
        {
            switch (getNextChar)
            {
                case CharType.EOF:
                    SLoc loc;
                    return Token(Tok.EOF, loc, 0);

                case CharType.Whitespace:
                    position += 1;
                    break;

                case CharType.Symbol:
                    return lexSymbol;

                case CharType.Letter:
                    return lexLetter;

                case CharType.Number:
                    return lexNumber;

                default:
                    // getNextChar throws for unclassified characters, so no
                    // other value can reach this switch.
                    assert(0);
            }
        }
    }

    /**
      Get a token from the source without consuming it: the internal
      position is restored before returning.

      params:
        skip = number of tokens to step over first; 0 returns the token that
               the next call to next() would return.

      return: A Token - its type is Tok.EOF if there are no more tokens
        in the file.
      */
    Token peek(int skip = 0)
    {
        int savedPosition = this.position;
        while (skip-- > 0)
            this.next;
        Token t = this.next;
        this.position = savedPosition;
        return t;
    }

    /**
      Return all errors that occurred while tokenizing the string.

        TODO: Error system not implemented yet - this is a stub!
      */
    public Error[] getErrors()
    {
        return this.errors;
    }

private:
    // All symbol handlers below are called from lexSymbol, which has already
    // advanced position past the symbol character. The symbol itself is
    // therefore at position - 1 and the following character at position.

    /// True when there is a character after the current symbol and it is c.
    bool followedBy(char c)
    {
        return position < source.length && source[position] == c;
    }

    /// "==" or "=".
    Token eq()
    {
        if (followedBy('='))
            return Token(Tok.Eq, Loc(position++ - 1), 2);
        return Token(Tok.Assign, Loc(position - 1), 1);
    }
    Token openBrace()
    {
        return Token(Tok.OpenBrace, Loc(position - 1), 1);
    }
    Token closeBrace()
    {
        return Token(Tok.CloseBrace, Loc(position - 1), 1);
    }
    Token openParentheses()
    {
        return Token(Tok.OpenParentheses, Loc(position - 1), 1);
    }
    Token closeParentheses()
    {
        return Token(Tok.CloseParentheses, Loc(position - 1), 1);
    }
    Token openBracket()
    {
        return Token(Tok.OpenBracket, Loc(position - 1), 1);
    }
    Token closeBracket()
    {
        return Token(Tok.CloseBracket, Loc(position - 1), 1);
    }
    Token seperator()
    {
        return Token(Tok.Seperator, Loc(position - 1), 1);
    }
    Token colon()
    {
        return Token(Tok.Colon, Loc(position - 1), 1);
    }

    /// "." on its own, or the start of a number when a digit follows.
    Token dot()
    {
        if (getNextChar(0) == CharType.Number)
        {
            // Step back onto the '.' so that lexNumber sees it as part of
            // the literal.
            position--;
            return lexNumber();
        }
        return Token(Tok.Dot, Loc(position - 1), 1);
    }
    Token comma()
    {
        return Token(Tok.Comma, Loc(position - 1), 1);
    }

    /// "!=" or "!".
    Token ne()
    {
        if (followedBy('='))
            return Token(Tok.Ne, Loc(position++ - 1), 2);
        return Token(Tok.Not, Loc(position - 1), 1);
    }

    /// "<=" or "<".
    Token le()
    {
        if (followedBy('='))
            return Token(Tok.Le, Loc(position++ - 1), 2);
        return Token(Tok.Lt, Loc(position - 1), 1);
    }

    /// ">=" or ">".
    Token ge()
    {
        if (followedBy('='))
            return Token(Tok.Ge, Loc(position++ - 1), 2);
        return Token(Tok.Gt, Loc(position - 1), 1);
    }
    Token plus()
    {
        return Token(Tok.Plus, Loc(position - 1), 1);
    }
    Token minus()
    {
        return Token(Tok.Minus, Loc(position - 1), 1);
    }
    Token star()
    {
        return Token(Tok.Star, Loc(position - 1), 1);
    }

    /**
      "/" on its own, or one of the three comment forms. Comments are
      skipped and the token that follows them is returned.

      Comment text is scanned as raw characters, so it may contain
      characters that have no entry in charTable.

      throws: the Error built by error() when a block comment or nesting
        comment is not closed before the end of the source.
      */
    Token slash()
    {
        // A '/' that is the very last character of the source.
        if (position >= source.length)
            return Token(Tok.Slash, Loc(position - 1), 1);

        switch (source[position])
        {
            case '/':
                // Line comment: runs up to and including the next newline.
                while (position < source.length)
                {
                    if (source[position++] == '\n')
                        return this.next;
                }
                return Token(Tok.EOF, Loc(position), 0);

            case '*':
                // Step over the opening '*'; the search for the closing
                // pair starts after it, so "/*/" is not a complete comment.
                position += 1;
                while (position + 1 < source.length)
                {
                    if (source[position] == '*' && source[position + 1] == '/')
                    {
                        position += 2;
                        return this.next;
                    }
                    position++;
                }
                // Everything up to the end belongs to the unclosed comment.
                position = source.length;
                throw error(__LINE__, "Unexpected end of file. Unclosed comment block");

            case '+':
                // Step over the opening '+', as for block comments.
                position += 1;
                int nesting = 1;
                while (position + 1 < source.length)
                {
                    char first = source[position];
                    char second = source[position + 1];

                    if (first == '+' && second == '/')
                    {
                        // Consume both characters of the delimiter so that
                        // neither can be reused as part of the next one.
                        position += 2;
                        nesting--;
                        if (nesting == 0)
                            return this.next;
                    }
                    else if (first == '/' && second == '+')
                    {
                        position += 2;
                        nesting++;
                    }
                    else
                        position++;
                }
                position = source.length;
                throw error(__LINE__, "Unexpected end of file. Unclosed comment block");

            default:
                return Token(Tok.Slash, Loc(position - 1), 1);
        }
    }

    Token percent()
    {
        return Token(Tok.Percent, Loc(position - 1), 1);
    }

    /**
      Lex a numeric literal starting at position. The literal is made of
      digits and underscores, with at most one '.' and at most one 'e'/'E'.

      NOTE(review): the token type is always Tok.Integer, even when a '.'
      or an exponent was seen - confirm that callers expect this.

      throws: the Error built by error() on a second '.' or a second
        exponent letter.
      */
    Token lexNumber ()
    {
        bool seenDot = false;
        bool seenExp = false;

        int i = 0;

        scan: while (true)
        {
            switch (getNextChar(i))
            {
                case CharType.Number:
                    break;

                case CharType.Symbol:
                    // '.' is the only symbol that may be part of a number.
                    if (this.source[position+i] != '.')
                        break scan;
                    if (seenDot)
                        throw error(__LINE__,"Only one '.' is allowed in an floating number")
                            .tok(Token(Tok.Float, Loc(position + i), 1));
                    seenDot = true;
                    break;

                case CharType.Letter:
                    // '_' is classified as a letter; inside a number it is
                    // accepted as a digit separator.
                    if (this.source[position+i] == '_')
                        break;
                    if (this.source[position+i] == 'e' ||
                        this.source[position+i] == 'E')
                    {
                        if (seenExp)
                            throw error(__LINE__,"Only one '"~this.source[position+i]
                                    ~"' is allowed in an floating number");
                        seenExp = true;
                        break;
                    }
                    break scan;

                default:
                    // Whitespace or end of source.
                    break scan;
            }
            i++;
        }

        position += i;

        return Token(Tok.Integer, Loc(position - i), i);
    }

    /// Consume one symbol character and hand over to its handler.
    Token lexSymbol ()
    {
        return symbolFunctions[source[position++]]();
    }

    /**
      Lex an identifier or keyword starting at position. The caller has
      already established that the first character is a letter.
      */
    Token lexLetter ()
    {
        int i = 1;
        bool hasNumber = false;
        for (;; i++)
        {
            CharType kind = getNextChar(i);
            if (kind == CharType.Number)
                hasNumber = true;
            else if (kind != CharType.Letter)
                break;
        }

        Token t = Token(Tok.Identifier, Loc(), i);

        // The keyword lookup is skipped for any word that contains a digit.
        if (!hasNumber)
        {
            char[] str = source[position .. position + i];
            if(str in keywords)
                t.type = keywords[str];
        }

        position += i;

        return t;
    }

    /**
      Classify the character at position + offset without consuming it.

      return: CharType.EOF when that index is at or past the end of the
        source, otherwise the character's entry in charTable.

      throws: the Error built by error() when the character has no entry
        in charTable.
      */
    CharType getNextChar(int offset = 0)
    {
        if (position + offset >= this.source.length)
            return CharType.EOF;

        char current = source[position + offset];

        CharType c = charTable[current];

        if(c == CharType.INVALID)
            throw error(__LINE__, "Read invalid symbol: '%0'").arg(current);

        return c;

    }

    /// Build an Error carrying msg. The line argument is currently unused.
    Error error(uint line, char[] msg)
    {
        return (new Error(msg));//.loc(Loc(position));
    }

    /// Location of offset pos in the source; a negative pos means the
    /// current position.
    private final SourceLocation Loc(int pos = -1)
    {
        if (pos < 0)
            return start_loc + position;
        return start_loc + pos;
    }

    SourceManager sm;
    SourceLocation start_loc;
    int position;               // index into source of the next unread character
    char[] source;
    Error[] errors;
    CharType[] charTable;       // indexed by character value, 256 entries
    Token delegate()[] symbolFunctions; // indexed by character value, 256 entries
}

/**
  Classification of a single source character, stored in one ubyte.

  INVALID is the first member and therefore the default value of the type,
  so any table entry that is never assigned reads as INVALID.
*/
enum CharType : ubyte
{
    INVALID    = 0, /// no classification assigned
    Letter     = 1,
    Number     = 2,
    Symbol     = 3,
    Whitespace = 4,

    EOF        = 5  /// past the end of the source, not a real character
}