view lexer/Lexer.d @ 74:192da4976daa new_gen

Renamed Add, Sub, Mul, Div and Mod in lexer to what they are (Plus, Minus....)
author johnsen@johnsen-laptop
date Fri, 02 May 2008 13:19:23 +0200
parents 628cb46ab13b
children eb5b2c719a39
line wrap: on
line source

module lexer.Lexer;

import misc.Error,
       misc.DataSource;

import lexer.Token,
       lexer.Keyword;

import tango.io.Stdout;

/**
  The Lexer class turns the text held by a DataSource into Tokens.

  Construct it with a DataSource, then call 'next' to consume tokens or
  'peek' to look ahead without consuming. Whitespace and comments
  (line, block and nesting block) are skipped and never returned as tokens.

  For more info about Tokens, look up the lexer.Token module.
*/
class Lexer
{
public:

    /**
      Create a new Lexer.

      Builds the two 256-entry lookup tables used while scanning: one
      mapping each character to its CharType, and one mapping each symbol
      character to the method that lexes tokens starting with it.

      params:
        source = The source to tokenize.

    */

    this (DataSource source)
    {
        this.source = source;
        position = 0;

        // Entries that are not assigned below keep the default value,
        // CharType.INVALID, and make getNextChar throw when encountered.
        charTable.length = 256;
        foreach( char c ; "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_")
            charTable[c] = CharType.Letter;

        foreach( char c ; "0123456789")
            charTable[c] = CharType.Number;

        foreach( char c ; "(){}[];:.,=!<>+-*/%")
            charTable[c] = CharType.Symbol;

        // Tab, carriage return, vertical tab and form feed are whitespace
        // too; without them any tab-indented or CRLF file is rejected.
        foreach( char c ; " \n\t\r\v\f")
            charTable[c] = CharType.Whitespace;

        symbolFunctions.length = 256;

        symbolFunctions['('] = &openParentheses;
        symbolFunctions[')'] = &closeParentheses;
        symbolFunctions['{'] = &openBrace;
        symbolFunctions['}'] = &closeBrace;
        symbolFunctions['['] = &openBracket;
        symbolFunctions[']'] = &closeBracket;
        symbolFunctions[';'] = &seperator;
        symbolFunctions[':'] = &colon;
        symbolFunctions['.'] = &dot;
        symbolFunctions[','] = &comma;
        symbolFunctions['='] = &eq;
        symbolFunctions['!'] = &ne;
        symbolFunctions['<'] = &le;
        symbolFunctions['>'] = &ge;
        symbolFunctions['+'] = &plus;
        symbolFunctions['-'] = &minus;
        symbolFunctions['*'] = &star;
        symbolFunctions['/'] = &slash;
        symbolFunctions['%'] = &percent;
    }

    /**
      Get the next token from the source. This method will move the
      internal position forward to the next Token.

      return: A Token - its type is Tok.EOF if there are no more tokens
        in the file.

      throws: an Error if an unclassified character or an unterminated
        block comment is encountered.
      */
    Token next ()
    {
        // Loop instead of recursing so a long run of whitespace does not
        // consume one stack frame per character.
        while (true)
        {
            switch (getNextChar)
            {
                case CharType.EOF:
                {
                    Location l;
                    return Token (Tok.EOF, l, 0);
                }

                case CharType.Whitespace:
                    position += 1;
                    break;

                case CharType.Symbol:
                    return lexSymbol;

                case CharType.Letter:
                    return lexLetter;

                case CharType.Number:
                    return lexNumber;

                default:
                    // getNextChar already throws for CharType.INVALID, so
                    // this only triggers if a new CharType is added
                    // without a matching case here.
                    throw error(__LINE__, "Unhandled character class");
            }
        }
    }

    /**
      Get a token from the source without consuming it. The internal
      position is restored before returning, also when lexing throws.

      params:
        skip = Number of tokens to skip before the one that is returned;
               0 returns the token that 'next' would return.

      return: A Token - its type is Tok.EOF if there are no more tokens
        in the file.
      */
    Token peek ( int skip = 0)
    {
        int oldPosition = this.position;
        // Restore on every exit path so a lexing error cannot leave the
        // lexer at an arbitrary position.
        scope(exit) this.position = oldPosition;

        while(skip-- > 0)
            this.next;
        return this.next;
    }

    /**
      Return all errors that occurred while tokenizing the string.

        TODO: Error system not implemented yet - this is a stub! Nothing
        in this class appends to the list; lexing errors are thrown.
      */
    public Error[] getErrors()
    {
        return this.errors;
    }

private:
    // All symbol methods below are called from lexSymbol, which has
    // already advanced 'position' past the first character of the symbol.
    // The token therefore starts at position - 1.

    /// True if the character at 'position' exists and equals c.
    bool nextIs(char c)
    {
        return position < source.data.length && source.data[position] == c;
    }

    /// '==' (Tok.Eq) or '=' (Tok.Assign).
    Token eq()
    {
        if(nextIs('='))
            return Token(Tok.Eq, Location(position++ - 1, source), 2);
        return Token(Tok.Assign, Location(position - 1, source), 1);
    }
    Token openBrace() 
    {
        return Token(Tok.OpenBrace, Location(position - 1, source), 1);
    }
    Token closeBrace() 
    {
        return Token(Tok.CloseBrace, Location(position - 1, this.source), 1);
    }
    Token openParentheses() 
    {
        return Token(Tok.OpenParentheses, Location(position - 1, this.source), 1);
    }
    Token closeParentheses()
    {
        return Token(Tok.CloseParentheses, Location(position - 1, this.source), 1);
    }
    Token openBracket() 
    {
        return Token(Tok.OpenBracket, Location(position - 1, this.source), 1);
    }
    Token closeBracket()
    {
        return Token(Tok.CloseBracket, Location(position - 1, source), 1);
    }
    Token seperator()
    {
        return Token(Tok.Seperator, Location(position - 1, source), 1);
    }
    Token colon()
    {
        return Token(Tok.Colon, Location(position - 1, this.source), 1);
    }

    /// '.' followed by a digit is handed to lexNumber (e.g. ".5");
    /// anything else yields Tok.Dot.
    Token dot() 
    {
        if(getNextChar(0) == CharType.Number)
        {
            // Step back so lexNumber sees the leading '.'.
            position--;
            return lexNumber();
        }
        return Token(Tok.Dot, Location(position - 1, this.source), 1);
    }
    Token comma() 
    {
        return Token(Tok.Comma, Location(position - 1, this.source), 1);
    }

    /// '!=' (Tok.Ne) or '!' (Tok.Not).
    Token ne() 
    {
        if(nextIs('='))
            return Token(Tok.Ne, Location(position++ - 1, this.source), 2);
        return Token(Tok.Not, Location(position - 1, this.source), 1);
    }

    /// '<=' (Tok.Le) or '<' (Tok.Lt).
    Token le()
    {
        if(nextIs('='))
            return Token(Tok.Le, Location(position++ - 1, this.source), 2);
        return Token(Tok.Lt, Location(position - 1, this.source), 1);
    }

    /// '>=' (Tok.Ge) or '>' (Tok.Gt).
    Token ge() 
    {
        if(nextIs('='))
            return Token(Tok.Ge, Location(position++ - 1, this.source), 2);
        return Token(Tok.Gt, Location(position - 1, this.source), 1);
    }
    Token plus() 
    {
        return Token(Tok.Plus, Location(position - 1, this.source), 1);
    }
    Token minus() 
    {
        return Token(Tok.Minus, Location(position - 1, this.source), 1);
    }
    Token star() 
    {
        return Token(Tok.Star, Location(position - 1, this.source), 1);
    }

    /**
      Lex something starting with '/': a line comment, a block comment,
      a nesting block comment, or the Tok.Slash operator.

      Comments are skipped and the token following them is returned.
      Comment bodies are scanned on the raw characters rather than through
      getNextChar, so they may contain characters the lexer does not
      otherwise accept.

      throws: an Error if a block comment is not closed before the end of
        the source.
      */
    Token slash() 
    {
        // '/' as the very last character of the source.
        if(position >= source.data.length)
            return Token(Tok.Slash, Location(position - 1, this.source), 1);

        switch(source.data[position])
        {
            case '/':
                // Line comment: consume up to and including the newline.
                while(position < source.data.length)
                {
                    if(source.data[position++] == '\n')
                        return this.next;
                }
                return Token(Tok.EOF, Location(position, this.source), 0);

            case '*':
                // Skip the '*' of the opener so that "/*/" is not
                // mistaken for a complete comment.
                position += 1;
                while(position + 1 < source.data.length)
                {
                    if(source.data[position] == '*' &&
                       source.data[position + 1] == '/')
                    {
                        position += 2;
                        return this.next;
                    }
                    position++;
                }
                position = cast(int) source.data.length;
                throw error(__LINE__, "Unexpected end of file. Unclosed comment block");

            case '+':
            {
                // Skip the '+' of the opener, then track nesting depth.
                // Each delimiter is consumed as a whole pair so that the
                // '/' of a closer is never reused as the start of an opener.
                position += 1;
                int nesting = 1;
                while(position + 1 < source.data.length)
                {
                    char a = source.data[position];
                    char b = source.data[position + 1];

                    if(a == '+' && b == '/')
                    {
                        position += 2;
                        if(--nesting == 0)
                            return this.next;
                    }
                    else if(a == '/' && b == '+')
                    {
                        position += 2;
                        nesting++;
                    }
                    else
                        position++;
                }
                position = cast(int) source.data.length;
                throw error(__LINE__, "Unexpected end of file. Unclosed comment block");
            }

            default:
                return Token(Tok.Slash, Location(position - 1, this.source), 1);
        }
    }

    Token percent() 
    {
        return Token(Tok.Percent, Location(position - 1, this.source), 1);
    }
    
    /**
      Lex a numeric literal starting at 'position'.

      Accepts digits, '_' separators, at most one '.' and at most one
      'e'/'E'. Scanning stops at the first character that is none of these.

      NOTE(review): the token type is always Tok.Integer, even when a '.'
      or exponent was seen, and a sign after the exponent ("1e-5") ends the
      literal. Kept as is because callers may rely on it - confirm.

      throws: an Error on a second '.' or a second exponent marker.
      */
    Token lexNumber ()
    {
        bool seenDot = false;
        bool seenExp = false;

        int i = 0;

        bool end = false;
        while(!end)
        {
            switch(getNextChar(i))
            {
                case CharType.Number:
                    break;
                case CharType.Symbol:
                    if(this.source.data[position+i] == '.')
                    {
                        if(seenDot)
                            throw error(__LINE__,"Only one '.' is allowed in an floating number")
                                .tok(Token(Tok.Float, Location(position + i, this.source), 1));
                        seenDot = true;
                        break;
                    }
                    end = true;
                    continue;
                case CharType.Letter:
                    if(this.source.data[position+i] == '_')
                        break;
                    if (this.source.data[position+i] == 'e' || 
                        this.source.data[position+i] == 'E')
                    {
                        if (seenExp)
                            throw error(__LINE__,"Only one '"~this.source.data[position+i]
                                    ~"' is allowed in an floating number");
                        seenExp = true;
                        break;
                    }
                    end = true;
                    continue;

                default:
                    // Whitespace or EOF ends the literal.
                    end = true;
                    continue;
            }
            i++;
        }

        position += i;

        return Token(Tok.Integer, Location(position - i, this.source), i);
    }

    /// Consume one symbol character and dispatch to the method registered
    /// for it in symbolFunctions.
    Token lexSymbol ()
    {
        Token t = symbolFunctions[source.data[position++]]();

        return t;
    }

    /**
      Lex an identifier or keyword starting at 'position'.

      Consumes letters (including '_') and digits. If the text contains no
      digit and is found in 'keywords', the token type is replaced by the
      keyword's type; otherwise it stays Tok.Identifier.
      */
    Token lexLetter ()
    {
        int i = 0;
        bool hasNumber = false;
        // The first character is already known to be a Letter, so start
        // checking from offset 1.
        while (getNextChar(++i) == CharType.Letter || 
                getNextChar(i) == CharType.Number)
        {
            if (getNextChar(i) == CharType.Number)
            {
                hasNumber = true;
            }
        }

        Token t = Token(Tok.Identifier, Location(position, source), i);

        if (!hasNumber)
        {
            char[] str = source.data[position .. position + i];
            if(str in keywords)
                t.type = keywords[str];
        }

        position += i;

        return t;
    }

    /**
      Classify the character at position + offset without consuming it.

      return: CharType.EOF when position + offset is at or past the end of
        the source, otherwise the character's entry in charTable.

      throws: an Error if the character has no class (CharType.INVALID).
      */
    CharType getNextChar(int offset = 0)
    {
        if (position + offset >= this.source.data.length)
            return CharType.EOF;

        char current = source.data[position + offset];

        CharType c = charTable[current];

        if(c == CharType.INVALID)
            throw error(__LINE__, "Read invalid symbol: '%0'").arg(current);

        return c;

    }

    /// Build an Error located at the current position. 'line' is accepted
    /// for the caller's __LINE__ but is not used here.
    Error error(uint line, char[] msg)
    {
        return (new Error(msg)).loc(Location(position, source));
    }

    int position;                       // index of the next unread character in source.data
    DataSource source;                  // the text being tokenized
    Error[] errors;                     // returned by getErrors; never filled in this class
    CharType[] charTable;               // character -> CharType, 256 entries
    Token delegate()[] symbolFunctions; // symbol character -> lexing method, 256 entries
}

/**
  Character classes used by the Lexer to decide how to tokenize the
  character at the current position.

  INVALID is the first member and therefore the default value of a
  CharType, so any slot of a freshly sized CharType[] table that is never
  assigned reads as INVALID.
*/
enum CharType : ubyte
{
    INVALID    = 0, // no class assigned
    Letter     = 1,
    Number     = 2,
    Symbol     = 3,
    Whitespace = 4,

    EOF        = 5  // returned by the Lexer when reading at or past the end of the source
}