view lexer/Lexer.d @ 42:4e879f82dd64 new_gen

Added some docs for the lexer - now you can understand _some_ of the madness going on here :)
author Anders Johnsen <skabet@gmail.com>
date Tue, 22 Apr 2008 22:25:07 +0200
parents f977aa28eb32
children a712c530b7cc
line wrap: on
line source

module lexer.Lexer;

import misc.Error,
       misc.DataSource;

import lexer.Token,
       lexer.Keyword;

import tango.io.Stdout;

/**
  The Lexer class will supply you with methods to tokenize a D file. Supply the
  Lexer with a DataSource and you can 'peek' and 'next' Tokens from the file. 

  For more info about Tokens, look up the lexer.Token module.
*/  
class Lexer
{
public:

    /**
      Create a new Lexer.

      Builds the two 256-entry lookup tables the lexer works from: one that
      classifies every possible char value, and one that maps each symbol
      character to the method producing its Token.

      params:
        source = The source to tokenize.
    */
    this (DataSource source)
    {
        this.source = source;
        position = 0;

        // Growing the table default-initialises every entry to the first
        // member of CharType (INVALID); only known characters are classified.
        charTable.length = 256;
        foreach( char c ; "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
            charTable[c] = CharType.Letter;

        foreach( char c ; "0123456789")
            charTable[c] = CharType.Number;

        foreach( char c ; "(){};:.,=!<>+-*/")
            charTable[c] = CharType.Symbol;

        // Tabs and carriage returns are whitespace too; without them any
        // tab-indented or CRLF-terminated file is rejected as invalid input.
        foreach( char c ; " \n\t\r")
            charTable[c] = CharType.Whitespace;

        // One handler per character classified as Symbol above. Each handler
        // is called with `position` already moved past that character.
        symbolFunctions.length = 256;

        symbolFunctions['('] = &openParentheses;
        symbolFunctions[')'] = &closeParentheses;
        symbolFunctions['{'] = &openBrace;
        symbolFunctions['}'] = &closeBrace;
        symbolFunctions[';'] = &seperator;
        symbolFunctions[':'] = &colon;
        symbolFunctions['.'] = &dot;
        symbolFunctions[','] = &comma;
        symbolFunctions['='] = &eq;
        symbolFunctions['!'] = &ne;
        symbolFunctions['<'] = &le;
        symbolFunctions['>'] = &ge;
        symbolFunctions['+'] = &add;
        symbolFunctions['-'] = &sub;
        symbolFunctions['*'] = &mul;
        symbolFunctions['/'] = &div;
    }

    /**
      Get the next token from the source. This method will move the
      internal position forward to the next Token.

      return: A Token - Token.type equals Tok.EOF if there are no more
        tokens in the file.

      throws: An Error if the next character is not one the lexer knows,
        or if a block comment is never closed.
      */
    Token next ()
    {
        // Skip whitespace with a loop rather than one recursive call per
        // character, so long blank runs cannot exhaust the stack.
        while (getNextChar() == CharType.Whitespace)
            position += 1;

        switch (getNextChar())
        {
            case CharType.EOF:
                Location l;
                return Token (Tok.EOF, l, 0);

            case CharType.Symbol:
                return lexSymbol();

            case CharType.Letter:
                return lexLetter();

            case CharType.Number:
                return lexNumber();

            default:
                // CharType.INVALID: report it as a lexer error instead of
                // falling out of the switch.
                throw error(__LINE__,
                        "Read invalid symbol: '" ~ source.data[position] ~ "'");
        }
    }

    /**
      Get a token from the source without consuming it. The internal
      position is restored before returning, also when lexing fails.

      params:
        skip = Number of tokens to skip before the one returned; 0 (the
               default) returns the token `next` would return.

      return: A Token - Token.type equals Tok.EOF if there are no more
        tokens in the file.
      */
    Token peek ( int skip = 0)
    {
        int oldPosition = this.position;
        // Restore on every exit path so a lexing error cannot leave the
        // lexer in the middle of the look-ahead.
        scope(exit) this.position = oldPosition;

        while(skip-- > 0)
            this.next();
        return this.next();
    }

    /**
      Return all errors that occurred while tokenizing the string.

        TODO: Error system not implemented yet - this is a stub!
        Nothing in this class appends to the list.
      */
    public Error[] getErrors()
    {
        return this.errors;
    }

private:
    /*
      Symbol handlers. Each one is entered through lexSymbol, so `position`
      already points at the character AFTER the symbol; `position - 1` is the
      symbol itself. Two-character operators consume their second character.
    */

    /// True if there is a character at `position` and it equals `c`.
    bool nextIs(char c)
    {
        return position < source.data.length && source.data[position] == c;
    }

    /// '=' (Tok.Assign) or '==' (Tok.Eq).
    Token eq()
    {
        if(nextIs('='))
            return Token(Tok.Eq, Location(position++ - 1, source), 2);
        return Token(Tok.Assign, Location(position - 1, source), 1);
    }
    /// '{'
    Token openBrace()
    {
        return Token(Tok.OpenBrace, Location(position - 1, source), 1);
    }
    /// '('
    Token openParentheses()
    {
        return Token(Tok.OpenParentheses, Location(position - 1, this.source), 1);
    }
    /// ')'
    Token closeParentheses()
    {
        return Token(Tok.CloseParentheses, Location(position - 1, this.source), 1);
    }
    /// '}'
    Token closeBrace()
    {
        return Token(Tok.CloseBrace, Location(position - 1, this.source), 1);
    }
    /// ';'
    Token seperator()
    {
        return Token(Tok.Seperator, Location(position - 1, source), 1);
    }
    /// ':'
    Token colon()
    {
        return Token(Tok.Colon, Location(position - 1, this.source), 1);
    }
    /// '.'
    Token dot()
    {
        return Token(Tok.Dot, Location(position - 1, this.source), 1);
    }
    /// ','
    Token comma()
    {
        return Token(Tok.Comma, Location(position - 1, this.source), 1);
    }
    /// '!' (Tok.Not) or '!=' (Tok.Ne).
    Token ne()
    {
        if(nextIs('='))
            return Token(Tok.Ne, Location(position++ - 1, this.source), 2);
        return Token(Tok.Not, Location(position - 1, this.source), 1);
    }
    /// '<' (Tok.Lt) or '<=' (Tok.Le).
    Token le()
    {
        if(nextIs('='))
            return Token(Tok.Le, Location(position++ - 1, this.source), 2);
        return Token(Tok.Lt, Location(position - 1, this.source), 1);
    }
    /// '>' (Tok.Gt) or '>=' (Tok.Ge).
    Token ge()
    {
        if(nextIs('='))
            return Token(Tok.Ge, Location(position++ - 1, this.source), 2);
        return Token(Tok.Gt, Location(position - 1, this.source), 1);
    }
    /// '+'
    Token add()
    {
        return Token(Tok.Add, Location(position - 1, this.source), 1);
    }
    /// '-'
    Token sub()
    {
        return Token(Tok.Sub, Location(position - 1, this.source), 1);
    }
    /// '*'
    Token mul()
    {
        return Token(Tok.Mul, Location(position - 1, this.source), 1);
    }

    /**
      '/' starts either a division token or one of three comment forms:
      a line comment, a block comment, or a nesting comment (opened by '/'
      followed by '+'). Comments are skipped and the token that follows
      them is returned.
      */
    Token div()
    {
        // A '/' that is the very last character can only be a division.
        if(position >= source.data.length)
            return Token(Tok.Div, Location(position - 1, this.source), 1);

        switch(source.data[position])
        {
            case '/':
                // Line comment: consume up to and including the newline.
                while(getNextChar() != CharType.EOF)
                {
                    if(source.data[position++] == '\n')
                        return this.next();
                }
                return Token(Tok.EOF, Location(position, this.source), 0);

            case '*':
                return skipBlockComment();

            case '+':
                return skipNestingComment();

            default:
                return Token(Tok.Div, Location(position - 1, this.source), 1);
        }
    }

    /**
      Skip a block comment. On entry `position` is on the '*' of the opener.
      The terminator must start after that '*', so a '/', '*', '/' sequence
      with nothing in between is not a complete comment.

      throws: An Error if the input ends before the comment is closed.
      */
    Token skipBlockComment()
    {
        int i = position + 1;
        while(i + 1 < source.data.length)
        {
            if(source.data[i] == '*' && source.data[i + 1] == '/')
            {
                position = i + 2;
                return this.next();
            }
            ++i;
        }

        // Leave the lexer at the end of the input so the error location
        // is never past the end of the data.
        position = cast(int) source.data.length;
        throw error(__LINE__, "Unexpected end of file. Unclosed comment block");
    }

    /**
      Skip a nesting comment. On entry `position` is on the '+' of the
      opener. Opener and closer pairs are consumed two characters at a time
      so that a character is never counted as part of two delimiters, and
      lexing resumes exactly at the first character after the outermost
      closer.

      throws: An Error if the input ends before nesting returns to zero.
      */
    Token skipNestingComment()
    {
        int nesting = 1;
        int i = position + 1;
        while(i + 1 < source.data.length)
        {
            char first  = source.data[i];
            char second = source.data[i + 1];

            if(first == '+' && second == '/')
            {
                i += 2;
                if(--nesting == 0)
                {
                    position = i;
                    return this.next();
                }
            }
            else if(first == '/' && second == '+')
            {
                ++nesting;
                i += 2;
            }
            else
                ++i;
        }

        position = cast(int) source.data.length;
        throw error(__LINE__, "Unexpected end of file. Unclosed comment block");
    }

    /// Lex a run of digits starting at `position` into a Tok.Integer token.
    Token lexNumber ()
    {
        // The first character is already known to be a Number.
        int i = 1;
        while(getNextChar(i) == CharType.Number)
            ++i;

        position += i;

        return Token(Tok.Integer, Location(position - i, this.source), i);
    }

    /// Consume one symbol character and dispatch to its handler.
    Token lexSymbol ()
    {
        return symbolFunctions[source.data[position++]]();
    }

    /**
      Lex an identifier or keyword: a Letter followed by any mix of Letters
      and Numbers. The text is looked up in `keywords` only when it holds
      no digit.
      */
    Token lexLetter ()
    {
        // The first character is already known to be a Letter.
        int i = 1;
        bool hasNumber = false;
        for(;; ++i)
        {
            CharType type = getNextChar(i);
            if(type == CharType.Number)
                hasNumber = true;
            else if(type != CharType.Letter)
                break;
        }

        Token t = Token(Tok.Identifier, Location(position, source), i);

        if (!hasNumber)
        {
            char[] str = source.data[position .. position + i];
            if(str in keywords)
                t.type = keywords[str];
        }

        position += i;

        return t;
    }

    /**
      Classify the character `offset` places ahead of `position` without
      consuming anything.

      return: CharType.EOF at or past the end of the data, otherwise the
        charTable entry for that character. This may be CharType.INVALID;
        it is the caller's job to reject it.
      */
    CharType getNextChar(int offset = 0)
    {
        if (position + offset >= this.source.data.length)
            return CharType.EOF;

        return charTable[source.data[position + offset]];
    }

    /**
      Build an Error located at the current position.

      params:
        line = Source line of the caller (currently unused).
        msg  = The message for the Error.
      */
    Error error(uint line, char[] msg)
    {
        return (new Error(msg)).loc(Location(position, source));
    }

    int position;                       // index of the next unread char in source.data
    DataSource source;                  // the input being tokenized
    Error[] errors;                     // returned by getErrors; never filled in yet
    CharType[] charTable;               // char value -> classification (256 entries)
    Token delegate()[] symbolFunctions; // char value -> symbol handler (256 entries)
}

/**
  Classification of a single input character, as stored in the Lexer's
  256-entry lookup table and returned by its getNextChar method.

  INVALID must remain the first member (value 0): it is the enum's default
  initializer, so table entries that are never assigned read as INVALID.
*/
enum CharType : ubyte
{
    INVALID    = 0, /// not a character the lexer recognises
    Letter     = 1, /// a-z, A-Z
    Number     = 2, /// 0-9
    Symbol     = 3, /// punctuation and operator characters
    Whitespace = 4, /// skipped between tokens

    EOF        = 5  /// position is at or past the end of the data
}