Mercurial > projects > dang

module lexer.Lexer;

import misc.Error,
       basic.SourceManager;

import lexer.Token,
       lexer.Keyword;

import tango.io.Stdout;

/**
  The Lexer class will supply you with methods to tokenize a D file. Supply the
  Lexer with a SourceLocation and a SourceManager and you can 'peek' and 'next'
  Tokens from the file.

  For more info about Tokens, look up the lexer.Token module.
*/
class Lexer
{
public:

    /**
      Create a new Lexer.

      The raw text is fetched from src_mgr using start, and the character
      classification and symbol dispatch tables are built here.

      params:
        start   = location handed to src_mgr.getRawData; also the base that
                  all token locations are offset from.
        src_mgr = source manager that provides the raw character data.
    */
    this(SourceLocation start, SourceManager src_mgr)
    {
        sm = src_mgr;
        start_loc = start;
        position = 0;
        source = sm.getRawData(start_loc);

        // Entries that are not assigned below keep the enum's default
        // value (CharType.INVALID) and are rejected by getNextChar.
        charTable.length = 256;
        foreach (c; "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_")
            charTable[c] = CharType.Letter;

        foreach (c; "0123456789")
            charTable[c] = CharType.Number;

        foreach (c; "(){}[];:.,=!<>+-*/%")
            charTable[c] = CharType.Symbol;

        // Tabs and carriage returns are whitespace too; without them a
        // tab-indented or CRLF file fails with "Read invalid symbol".
        foreach (c; " \t\r\n")
            charTable[c] = CharType.Whitespace;

        // One handler per character classified as Symbol above.
        symbolFunctions.length = 256;

        symbolFunctions['('] = &openParentheses;
        symbolFunctions[')'] = &closeParentheses;
        symbolFunctions['{'] = &openBrace;
        symbolFunctions['}'] = &closeBrace;
        symbolFunctions['['] = &openBracket;
        symbolFunctions[']'] = &closeBracket;
        symbolFunctions[';'] = &seperator;
        symbolFunctions[':'] = &colon;
        symbolFunctions['.'] = &dot;
        symbolFunctions[','] = &comma;
        symbolFunctions['='] = &eq;
        symbolFunctions['!'] = &ne;
        symbolFunctions['<'] = &le;
        symbolFunctions['>'] = &ge;
        symbolFunctions['+'] = &plus;
        symbolFunctions['-'] = &minus;
        symbolFunctions['*'] = &star;
        symbolFunctions['/'] = &slash;
        symbolFunctions['%'] = &percent;
    }

    /**
      Get the next token from the source. This method will move the
      internal position forward to the next Token.

      return: A Token - Token.type is Tok.EOF if there are
        no more tokens in the file.
      */
    Token next()
    {
        // Skip whitespace with a loop instead of recursing once per
        // character, so long runs of blanks cannot exhaust the stack.
        CharType type = getNextChar();
        while (type == CharType.Whitespace)
        {
            position += 1;
            type = getNextChar();
        }

        switch (type)
        {
            case CharType.EOF:
            {
                SLoc loc;
                return Token(Tok.EOF, loc, 0);
            }

            case CharType.Symbol:
                return lexSymbol();

            case CharType.Letter:
                return lexLetter();

            case CharType.Number:
                return lexNumber();

            default:
                // Whitespace was consumed above and getNextChar throws
                // instead of returning INVALID.
                assert(0);
        }
    }

    /**
      Get a token from the source without consuming it. The internal
      position is restored before returning, also when lexing throws.

      params:
        skip = number of tokens to step over before the one returned;
               0 (the default) returns the token that next() would return.

      return: A Token - Token.type is Tok.EOF if there are
        no more tokens in the file.
      */
    Token peek(int skip = 0)
    {
        int oldPosition = this.position;
        // Restore on every exit path so a lexing error while looking
        // ahead does not leave the lexer in the middle of the input.
        scope (exit) this.position = oldPosition;

        while (skip-- > 0)
            this.next();
        return this.next();
    }

    /**
      Return all errors that occurred while tokenizing the string.

        TODO: Error system not implemented yet - this is a stub!
        Nothing in this class appends to the list; errors are thrown.
      */
    public Error[] getErrors()
    {
        return this.errors;
    }

private:
    /*
      Symbol handlers. lexSymbol has already stepped past the symbol's
      first character when one of these runs, so that character sits at
      position - 1 and source[position] is the character that follows it.
    */

    /// True when an unread character exists and equals c.
    bool nextIs(char c)
    {
        return position < source.length && source[position] == c;
    }

    /// True when the two characters starting at position are first, second.
    bool pairAt(char first, char second)
    {
        return position + 1 < source.length
            && source[position] == first
            && source[position + 1] == second;
    }

    /// '=' or '=='
    Token eq()
    {
        if (nextIs('='))
            return Token(Tok.Eq, Loc(position++ - 1), 2);
        return Token(Tok.Assign, Loc(position - 1), 1);
    }
    Token openBrace()
    {
        return Token(Tok.OpenBrace, Loc(position - 1), 1);
    }
    Token closeBrace()
    {
        return Token(Tok.CloseBrace, Loc(position - 1), 1);
    }
    Token openParentheses()
    {
        return Token(Tok.OpenParentheses, Loc(position - 1), 1);
    }
    Token closeParentheses()
    {
        return Token(Tok.CloseParentheses, Loc(position - 1), 1);
    }
    Token openBracket()
    {
        return Token(Tok.OpenBracket, Loc(position - 1), 1);
    }
    Token closeBracket()
    {
        return Token(Tok.CloseBracket, Loc(position - 1), 1);
    }
    Token seperator()
    {
        return Token(Tok.Seperator, Loc(position - 1), 1);
    }
    Token colon()
    {
        return Token(Tok.Colon, Loc(position - 1), 1);
    }

    /// '.' on its own, or the start of a number such as ".5".
    Token dot()
    {
        // getNextChar reports EOF instead of indexing past the end, so a
        // dot in the last two characters of the input is safe.
        if (getNextChar(0) == CharType.Number)
        {
            // Step back onto the '.' so lexNumber sees it as part of
            // the literal.
            position--;
            return lexNumber();
        }
        return Token(Tok.Dot, Loc(position - 1), 1);
    }
    Token comma()
    {
        return Token(Tok.Comma, Loc(position - 1), 1);
    }

    /// '!' or '!='
    Token ne()
    {
        if (nextIs('='))
            return Token(Tok.Ne, Loc(position++ - 1), 2);
        return Token(Tok.Not, Loc(position - 1), 1);
    }

    /// '<' or '<='
    Token le()
    {
        if (nextIs('='))
            return Token(Tok.Le, Loc(position++ - 1), 2);
        return Token(Tok.Lt, Loc(position - 1), 1);
    }

    /// '>' or '>='
    Token ge()
    {
        if (nextIs('='))
            return Token(Tok.Ge, Loc(position++ - 1), 2);
        return Token(Tok.Gt, Loc(position - 1), 1);
    }
    Token plus()
    {
        return Token(Tok.Plus, Loc(position - 1), 1);
    }
    Token minus()
    {
        return Token(Tok.Minus, Loc(position - 1), 1);
    }
    Token star()
    {
        return Token(Tok.Star, Loc(position - 1), 1);
    }

    /**
      '/' on its own, or one of the three comment forms. Comments are
      skipped and the token following them is returned.

      Comment bodies are scanned on the raw characters rather than through
      getNextChar, because a comment may contain characters the lexer does
      not classify.

      throws: Error when a block or nesting comment is not closed before
        the end of the input.
    */
    Token slash()
    {
        // A '/' as the very last character is a plain slash token.
        if (position >= source.length)
            return Token(Tok.Slash, Loc(position - 1), 1);

        switch (source[position])
        {
            case '/':
                // Line comment: runs up to and including the newline.
                while (position < source.length)
                {
                    if (source[position++] == '\n')
                        return this.next();
                }
                return Token(Tok.EOF, Loc(position), 0);

            case '*':
                // Step past the opening '*' so "/*/" is not read as a
                // complete comment.
                position += 1;
                while (position < source.length)
                {
                    if (pairAt('*', '/'))
                    {
                        position += 2;
                        return this.next();
                    }
                    ++position;
                }
                throw error(__LINE__, "Unexpected end of file. Unclosed comment block");

            case '+':
                // Step past the opening '+' so "/+/" is not read as a
                // complete comment.
                position += 1;
                int nesting = 1;
                while (position < source.length)
                {
                    if (pairAt('+', '/'))
                    {
                        // Consume exactly the two closer characters; the
                        // character after "+/" belongs to the next token.
                        position += 2;
                        if (--nesting == 0)
                            return this.next();
                    }
                    else if (pairAt('/', '+'))
                    {
                        position += 2;
                        nesting++;
                    }
                    else
                        ++position;
                }
                throw error(__LINE__, "Unexpected end of file. Unclosed comment block");

            default:
                return Token(Tok.Slash, Loc(position - 1), 1);
        }
    }

    Token percent()
    {
        return Token(Tok.Percent, Loc(position - 1), 1);
    }

    /**
      Lex a numeric literal starting at position: digits and '_', with at
      most one '.' and at most one 'e'/'E'.

      NOTE(review): the token type is always Tok.Integer, even when a '.'
      or an exponent was seen - confirm whether callers expect Tok.Float.

      throws: Error on a second '.' or a second 'e'/'E'.
    */
    Token lexNumber ()
    {
        bool dot = false;
        bool e = false;

        // Number of characters accepted so far.
        int i = 0;

        bool end = false;
        while(!end)
        {
            switch(getNextChar(i))
            {
                case CharType.Number:
                    break;
                case CharType.Symbol:
                    if(this.source[position+i] == '.')
                    {
                        if(dot)
                            throw error(__LINE__,"Only one '.' is allowed in an floating number")
                                .tok(Token(Tok.Float, Loc(position + i), 1));
                        dot = true;
                        break;
                    }
                    // Any other symbol terminates the literal; 'continue'
                    // skips the i++ below so it is not counted.
                    end = true;
                    continue;
                case CharType.Letter:
                    if(this.source[position+i] == '_')
                        break;
                    if (this.source[position+i] == 'e' ||
                        this.source[position+i] == 'E')
                    {
                        if (e)
                            throw error(__LINE__,"Only one '"~this.source[position+i]
                                    ~"' is allowed in an floating number");
                        e = true;
                        break;
                    }
                    end = true;
                    continue;

                default:
                    // Whitespace or EOF.
                    end = true;
                    continue;
            }
            i++;
        }

        position += i;

        return Token(Tok.Integer, Loc(position - i), i);
    }

    /**
      Consume one symbol character and dispatch to its handler from
      symbolFunctions.
    */
    Token lexSymbol ()
    {
        Token t = symbolFunctions[source[position++]]();

        return t;
    }

    /**
      Lex an identifier or keyword: a letter (or '_') followed by letters,
      '_' and digits. The keyword table is only consulted when the text
      contains no digit.
    */
    Token lexLetter ()
    {
        int i = 0;
        bool hasNumber = false;
        // The character at offset 0 is already known to be a Letter, so
        // scanning starts at offset 1.
        while (getNextChar(++i) == CharType.Letter ||
                getNextChar(i) == CharType.Number)
        {
            if (getNextChar(i) == CharType.Number)
            {
                hasNumber = true;
            }
        }

        // Loc() with no argument is the current position, which is still
        // the start of the identifier here.
        Token t = Token(Tok.Identifier, Loc(), i);

        if (!hasNumber)
        {
            char[] str = source[position .. position + i];
            if(str in keywords)
                t.type = keywords[str];
        }

        position += i;

        return t;
    }

    /**
      Classify the character at position + offset without consuming it.

      return: CharType.EOF when position + offset is at or past the end of
        the source, otherwise the character's class from charTable.

      throws: Error when the character has no class (CharType.INVALID).
    */
    CharType getNextChar(int offset = 0)
    {
        if (position + offset >= this.source.length)
            return CharType.EOF;

        char current = source[position + offset];

        CharType c = charTable[current];

        if(c == CharType.INVALID)
            throw error(__LINE__, "Read invalid symbol: '%0'").arg(current);

        return c;

    }

    /**
      Build an Error carrying msg.

      NOTE(review): the line argument is currently unused and no location
      is attached (see the commented-out .loc call).
    */
    Error error(uint line, char[] msg)
    {
        return (new Error(msg));//.loc(Loc(position));
    }

    /**
      Translate an offset into the source into a SourceLocation relative
      to start_loc. A negative pos (the default) means the current position.
    */
    private final SourceLocation Loc(int pos = -1)
    {
        if (pos < 0)
            return start_loc + position;
        return start_loc + pos;
    }

    SourceManager sm;
    SourceLocation start_loc;               // base for all token locations
    int position;                           // offset of the next unread character in source
    char[] source;                          // raw text being lexed
    Error[] errors;                         // returned by getErrors; never filled in here
    CharType[] charTable;                   // character value -> class, 256 entries
    Token delegate()[] symbolFunctions;     // character value -> symbol handler, 256 entries
}

/**
  Character classes used by the Lexer's character table.

  INVALID must stay the first member: it is the enum's default value, so
  table entries that are never assigned read back as INVALID.
*/
enum CharType : ubyte
{
    INVALID    = 0, /// no class assigned; getNextChar throws on these
    Letter     = 1, /// a-z, A-Z and '_'
    Number     = 2, /// 0-9
    Symbol     = 3, /// characters that have a symbol handler
    Whitespace = 4, /// skipped between tokens

    EOF        = 5  /// position is at or past the end of the source
}
author	Anders Halager <halager@gmail.com>
date	Sun, 04 May 2008 18:13:46 +0200
parents	192da4976daa
children	a49bb982a7b0