view src/lexer/Lexer.d @ 207:e0551773a005

Added the correct version.
author Anders Johnsen <skabet@gmail.com>
date Tue, 12 Aug 2008 18:19:34 +0200
parents d3c148ca429b
children
line wrap: on
line source

module lexer.Lexer;

import basic.Message,
       basic.SourceManager;

import lexer.Token,
       lexer.Keyword;

import tango.io.Stdout;

/**
  The Lexer class will supply you with methods to tokenize a D file. Supply the
  Lexer with a SourceLocation, its SourceManager and a MessageHandler, and you
  can 'peek' and 'next' Tokens from the file.

  For more info about Tokens, look up the lexer.Token module.
*/  
class Lexer
{
public:

    /**
      Create a new Lexer.
    */
    this(SourceLocation start, SourceManager src_mgr, MessageHandler messages)
    {
        this.messages = messages;
        sm = src_mgr;
        start_loc = start;
        position = 0;
        // Raw text to tokenize, as handed out by the source manager for `start`.
        source = sm.getRawData(start_loc);

        // Classification table indexed by character value. Entries that are
        // not assigned below keep the enum's initial value, CharType.INVALID.
        charTable.length = 256;
        foreach (c; "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_")
            charTable[c] = CharType.Letter;

        foreach (c; "0123456789")
            charTable[c] = CharType.Number;

        foreach (c; "(){}[];:.,=!<>+-*/%&\"`")
            charTable[c] = CharType.Symbol;

        // Besides space and newline, accept tab, carriage return, vertical
        // tab and form feed. Left unclassified they would be INVALID, and a
        // tab-indented or CRLF-terminated file would be rejected outright.
        foreach (c; " \t\r\n\v\f")
            charTable[c] = CharType.Whitespace;

        foreach (c; "'\\")
            charTable[c] = CharType.Other;

        // Dispatch table used by lexSymbol: one handler per Symbol character.
        symbolFunctions.length = 256;

        symbolFunctions['('] = &openParentheses;
        symbolFunctions[')'] = &closeParentheses;
        symbolFunctions['{'] = &openBrace;
        symbolFunctions['}'] = &closeBrace;
        symbolFunctions['['] = &openBracket;
        symbolFunctions[']'] = &closeBracket;
        symbolFunctions[';'] = &seperator;
        symbolFunctions[':'] = &colon;
        symbolFunctions['.'] = &dot;
        symbolFunctions[','] = &comma;
        symbolFunctions['='] = &eq;
        symbolFunctions['!'] = &ne;
        symbolFunctions['<'] = &le;
        symbolFunctions['>'] = &ge;
        symbolFunctions['+'] = &plus;
        symbolFunctions['-'] = &minus;
        symbolFunctions['*'] = &star;
        symbolFunctions['/'] = &slash;
        symbolFunctions['%'] = &percent;
        symbolFunctions['&'] = &and;
        symbolFunctions['"'] = &string;
        symbolFunctions['`'] = &string;

        // Until a real token has been produced, `last` is an EOF placeholder.
        last = Token(Tok.EOF, SLoc() + 1, 0);
    }

    /**
      Get the next token from the source. This method will move the
      internal position forward to the next Token.

      return: A Token - Token.type is Tok.EOF if there are
        no more tokens in the file.
      */
    Token next()
    {
        // Skip whitespace with a loop instead of one recursive call per
        // character, so a long run of blanks cannot exhaust the stack.
        CharType kind = getNextChar;
        while (kind == CharType.Whitespace)
        {
            position += 1;
            kind = getNextChar;
        }

        Token res;
        switch (kind)
        {
            case CharType.EOF:
                // EOF is reported at the location of the last real token.
                return Token(Tok.EOF, last.location, 0);

            case CharType.Symbol:
                res = lexSymbol;
                break;

            case CharType.Letter:
                res = lexLetter;
                break;

            case CharType.Number:
                res = lexNumber;
                break;

            case CharType.Other:
                messages.report(UnexpectedTok, Loc(position)).fatal(ExitLevel.Lexer);
                break;
        }
        // Remember the most recent non-EOF token.
        if (res.type != Tok.EOF)
            last = res;
        return res;
    }

    /**
      Look ahead at an upcoming token without consuming it: `skip` tokens are
      passed over and the one after them is returned. The internal position
      is restored afterwards.

      return: A Token - Token.type is Tok.EOF if there are
        no more tokens in the file.
      */
    Token peek(int skip = 0)
    {
        int oldPosition = this.position;
        // next() stores every token it produces in `last`; keep the current
        // value so that looking ahead leaves it untouched.
        Token oldLast = this.last;

        while (skip-- > 0)
            this.next;
        Token t = this.next;

        // Undo everything the look-ahead changed.
        this.position = oldPosition;
        this.last = oldLast;
        return t;
    }

    /// Most recent non-EOF token produced by next(); the constructor
    /// initialises it to an EOF placeholder token.
    Token last;
private:

    /**
      Lex '=' (Assign) or '==' (Eq). Runs with `position` just past the
      first '='.
    */
    Token eq()
    {
        // Guard the look-ahead: the '=' may be the last character of the source.
        if (position < source.length && source[position] == '=')
            return Token(Tok.Eq, Loc(position++ - 1), 2);
        return Token(Tok.Assign, Loc(position - 1), 1);
    }
    /**
      Build the one-character '{' token. The brace has already been stepped
      over, so the token begins at `position - 1`.
    */
    Token openBrace()
    {
        SourceLocation where = Loc(position - 1);
        return Token(Tok.OpenBrace, where, 1);
    }
    /**
      Build the one-character '}' token, located at `position - 1`.
    */
    Token closeBrace()
    {
        SourceLocation where = Loc(position - 1);
        return Token(Tok.CloseBrace, where, 1);
    }
    /**
      Build the one-character '(' token, located at `position - 1`.
    */
    Token openParentheses()
    {
        SourceLocation where = Loc(position - 1);
        return Token(Tok.OpenParentheses, where, 1);
    }
    /**
      Build the one-character ')' token, located at `position - 1`.
    */
    Token closeParentheses()
    {
        SourceLocation where = Loc(position - 1);
        return Token(Tok.CloseParentheses, where, 1);
    }
    /**
      Build the one-character '[' token, located at `position - 1`.
    */
    Token openBracket()
    {
        SourceLocation where = Loc(position - 1);
        return Token(Tok.OpenBracket, where, 1);
    }
    /**
      Build the one-character ']' token, located at `position - 1`.
    */
    Token closeBracket()
    {
        SourceLocation where = Loc(position - 1);
        return Token(Tok.CloseBracket, where, 1);
    }
    /**
      Build the one-character ';' token (Tok.Seperator), located at
      `position - 1`.
    */
    Token seperator()
    {
        SourceLocation where = Loc(position - 1);
        return Token(Tok.Seperator, where, 1);
    }
    /**
      Build the one-character ':' token, located at `position - 1`.
    */
    Token colon()
    {
        SourceLocation where = Loc(position - 1);
        return Token(Tok.Colon, where, 1);
    }
    /**
      Lex a '.': either a lone Dot token or, when a digit follows, the start
      of a number such as ".5". Runs with `position` just past the '.'.

      The earlier scan over '_' characters never influenced the result and
      read beyond the end of the source when the '.' was one of its last two
      characters, so it has been removed.
    */
    Token dot()
    {
        if (getNextChar(0) == CharType.Number)
        {
            // Hand the '.' back so that lexNumber sees the whole literal.
            position--;
            return lexNumber();
        }
        return Token(Tok.Dot, Loc(position - 1), 1);
    }
    /**
      Build the one-character ',' token, located at `position - 1`.
    */
    Token comma()
    {
        SourceLocation where = Loc(position - 1);
        return Token(Tok.Comma, where, 1);
    }
    /**
      Lex '!' (Not) or '!=' (Ne). Runs with `position` just past the '!'.
    */
    Token ne()
    {
        // Guard the look-ahead: the '!' may be the last character of the source.
        if (position < source.length && source[position] == '=')
            return Token(Tok.Ne, Loc(position++ - 1), 2);
        return Token(Tok.Not, Loc(position - 1), 1);
    }
    /**
      Lex '<' (Lt), '<=' (Le) or '<<' (LeftShift). Runs with `position` just
      past the first '<'.
    */
    Token le()
    {
        // Only look ahead when there is a character left to look at.
        if (position < source.length)
        {
            if (source[position] == '=')
                return Token(Tok.Le, Loc(position++ - 1), 2);
            if (source[position] == '<')
                return Token(Tok.LeftShift, Loc(position++ - 1), 2);
        }
        return Token(Tok.Lt, Loc(position - 1), 1);
    }
    /**
      Lex '>' (Gt), '>=' (Ge), '>>' (RightShift) or '>>>'
      (UnsignedRightShift). Runs with `position` just past the first '>'.
    */
    Token ge()
    {
        // Offset of the first '>'. Every variant is located here; '>>>' used
        // to be reported at its last character instead.
        int start = position - 1;

        // Only look ahead when there is a character left to look at.
        if (position < source.length)
        {
            if (source[position] == '=')
            {
                position++;
                return Token(Tok.Ge, Loc(start), 2);
            }
            if (source[position] == '>')
            {
                if (position + 1 < source.length && source[position+1] == '>')
                {
                    position += 2;
                    return Token(Tok.UnsignedRightShift, Loc(start), 3);
                }
                position++;
                return Token(Tok.RightShift, Loc(start), 2);
            }
        }
        return Token(Tok.Gt, Loc(start), 1);
    }
    /**
      Lex '+' (Plus) or '+=' (PlusAssign). Runs with `position` just past
      the '+'.
    */
    Token plus()
    {
        // Guard the look-ahead: the '+' may be the last character of the source.
        if (position < source.length && source[position] == '=')
            return Token(Tok.PlusAssign, Loc(position++ - 1), 2);
        return Token(Tok.Plus, Loc(position - 1), 1);
    }
    /**
      Lex '-' (Minus) or '-=' (MinusAssign). Runs with `position` just past
      the '-'.
    */
    Token minus()
    {
        // Guard the look-ahead: the '-' may be the last character of the source.
        if (position < source.length && source[position] == '=')
            return Token(Tok.MinusAssign, Loc(position++ - 1), 2);
        return Token(Tok.Minus, Loc(position - 1), 1);
    }
    /**
      Lex '*' (Star) or '*=' (StarAssign). Runs with `position` just past
      the '*'.
    */
    Token star()
    {
        // Guard the look-ahead: the '*' may be the last character of the source.
        if (position < source.length && source[position] == '=')
            return Token(Tok.StarAssign, Loc(position++ - 1), 2);
        return Token(Tok.Star, Loc(position - 1), 1);
    }
    /**
      Lex everything that starts with '/': the Slash and SlashAssign
      operators, and the three comment forms ("//", "/* */" and the nesting
      "/+ +/"). Comments are skipped and the token following them is
      returned. Runs with `position` just past the '/'.

      Comment text is scanned with plain bounds checks rather than
      getNextChar, so a comment may contain characters that have no entry in
      the character table.
    */
    Token slash()
    {
        // Offset reported when a block comment is never closed.
        int p = position;

        // A '/' that ends the source can only be the division operator.
        if (position >= source.length)
            return Token(Tok.Slash, Loc(position - 1), 1);

        switch(source[position])
        {
            case '=':
                return Token(Tok.SlashAssign, Loc(position++ - 1), 2);

            case '/':
                // Line comment: drop everything up to and including '\n'.
                while (position < source.length)
                {
                    if (source[position++] == '\n')
                        return this.next;
                }
                return Token(Tok.EOF, Loc(position), 0);

            case '*':
                // Start one character past the opening '*' so that "/*/" is
                // not mistaken for a complete comment.
                position += 1;
                while (position + 1 < source.length)
                {
                    if (source[position] == '*' && source[position+1] == '/')
                    {
                        position += 2;
                        return this.next;
                    }
                    ++position;
                }
                position = source.length;
                messages.report(UnexpectedEOFBlock,Loc(p)).fatal(ExitLevel.Lexer);
                // Only reached if fatal() returns; avoids falling through
                // into the nesting-comment case below.
                return Token(Tok.EOF, Loc(position), 0);

            case '+':
            {
                // Nesting comment: "/+" opens a level and "+/" closes one.
                // Each delimiter is consumed as a two-character unit, so the
                // character after the final "+/" is left for the next token.
                position += 1;
                int nesting = 1;
                while (position + 1 < source.length)
                {
                    if (source[position] == '/' && source[position+1] == '+')
                    {
                        nesting++;
                        position += 2;
                    }
                    else if (source[position] == '+' && source[position+1] == '/')
                    {
                        nesting--;
                        position += 2;
                        if (nesting == 0)
                            return this.next;
                    }
                    else
                        ++position;
                }
                position = source.length;
                messages.report(
                        UnexpectedEOFBlock,
                        Loc(p)).fatal(ExitLevel.Lexer);
                // Only reached if fatal() returns.
                return Token(Tok.EOF, Loc(position), 0);
            }

            default:
                return Token(Tok.Slash, Loc(position - 1), 1);
        }
    }
    /**
      Build the one-character '&' token, located at `position - 1`.
    */
    Token and()
    {
        SourceLocation where = Loc(position - 1);
        return Token(Tok.And, where, 1);
    }
    /**
      Lex '%' (Percent) or '%=' (PercentAssign). Runs with `position` just
      past the '%'.
    */
    Token percent()
    {
        // Guard the look-ahead: the '%' may be the last character of the source.
        if (position < source.length && source[position] == '=')
            return Token(Tok.PercentAssign, Loc(position++ - 1), 2);
        return Token(Tok.Percent, Loc(position - 1), 1);
    }

    /**
      Lex a string literal and return it as one Tok.String token that spans
      the optional one-letter prefix, both quotes and, for ordinary
      double-quoted strings, an optional 'c', 'w' or 'd' suffix.

      Two forms are recognised:
        - "..."   where a backslash escapes the character after it
        - `...` and r"..."   where nothing is escaped

      Runs with `position` one past the opening quote. The body is scanned
      with plain bounds checks rather than getNextChar, so a string may
      contain characters that have no entry in the character table.
      Reports UnexpectedEOFBlock if the closing quote is missing.
    */
    Token string()
    {
        // Step back onto the prefix letter (when coming from lexLetter) or
        // onto the opening quote (when coming from lexSymbol).
        --position;
        int start = position;
        if(getNextChar() == CharType.Letter)
            position++;

        // `position` is now on the opening quote, which is also the closer.
        char quote = source[position];

        // Only a double-quoted string that is not preceded by 'r' processes
        // backslash escapes and accepts a suffix.
        bool escaped = false;
        if (quote == '"')
            escaped = !(position > 0 && source[position-1] == 'r');

        ++position;
        while (position < source.length)
        {
            char c = source[position++];
            if (c == quote)
            {
                if (escaped && position < source.length)
                    if (source[position] == 'c' ||
                        source[position] == 'w' ||
                        source[position] == 'd')
                        position++;

                return Token(Tok.String, Loc(start), position - start);
            }
            // Skip whatever follows a backslash so that \" does not close.
            if (escaped && c == '\\')
                position++;
        }
        messages.report(UnexpectedEOFBlock, Loc(position)).fatal(ExitLevel.Lexer);
    }
    
    /**
      Lex a numeric literal starting at `position`.

      The literal extends over digits, '.', '_', 'e'/'E', and a '+' or '-'
      that directly follows an 'e'/'E'; any run of 'u', 'U' and 'L' after
      that is included as a suffix. The result is always tagged Tok.Integer,
      also when the text contains a '.' or an exponent.
    */
    Token lexNumber ()
    {
        int i = 0;

        for (;;)
        {
            bool accepted = false;
            switch (getNextChar(i))
            {
                case CharType.Number:
                    accepted = true;
                    break;

                case CharType.Symbol:
                {
                    char c = this.source[position+i];
                    if (c == '.')
                        accepted = true;
                    else if (i > 0 && (c == '+' || c == '-'))
                        // A sign belongs to the literal only as an exponent sign.
                        accepted = source[position+i-1] == 'e' ||
                                   source[position+i-1] == 'E';
                    break;
                }

                case CharType.Letter:
                {
                    char c = this.source[position+i];
                    accepted = c == '_' || c == 'e' || c == 'E';
                    break;
                }

                default:
                    break;
            }
            if (!accepted)
                break;
            i++;
        }

        // Suffix characters. The bounds check matters when the number is the
        // very last thing in the source.
        while (position + i < source.length &&
               (source[position+i] == 'u' ||
                source[position+i] == 'U' ||
                source[position+i] == 'L'))
            i += 1;

        position += i;

        return Token(Tok.Integer, Loc(position - i), i);
    }

    /**
      Consume the symbol character at `position` and let its handler from
      symbolFunctions build the token. The handler runs with `position`
      already advanced past that character.
    */
    Token lexSymbol ()
    {
        char symbol = source[position];
        position++;
        return symbolFunctions[symbol]();
    }

    /**
      Lex an identifier or keyword starting at `position`.

      A letter that is immediately followed by '"' or '`' is treated as a
      string prefix and handed to string(). Otherwise the token covers the
      run of letters and digits; it is looked up in `keywords` only when it
      contains no digit.
    */
    Token lexLetter ()
    {
        int i = 0;
        bool hasNumber = false;

        // Guard the look-ahead: the letter may be the last character of the
        // source, in which case there is no quote to find.
        if (position + 1 < source.length &&
            (source[position+1] == '"' ||
             source[position+1] == '`'))
        {
            ++position;
            return string;
        }

        while (getNextChar(++i) == CharType.Letter ||
                getNextChar(i) == CharType.Number)
        {
            if (getNextChar(i) == CharType.Number)
            {
                hasNumber = true;
            }
        }

        // Loc() is taken before `position` advances, so it marks the start.
        Token t = Token(Tok.Identifier, Loc(), i);

        if (!hasNumber)
        {
            char[] str = source[position .. position + i];
            if(str in keywords)
                t.type = keywords[str];
        }

        position += i;

        return t;
    }

    /**
      Classify the character `offset` places ahead of `position` without
      consuming it.

      return: CharType.EOF when that index lies beyond the source, otherwise
        the character's entry in charTable. A character whose entry is
        CharType.INVALID is reported as InvalidSymbol.
    */
    CharType getNextChar(int offset = 0)
    {
        if (position + offset >= this.source.length)
            return CharType.EOF;

        char current = source[position + offset];

        CharType c = charTable[current];

        // Report at the offending character itself, not at `position`, so
        // the diagnostic is right when looking ahead with a non-zero offset.
        if(c == CharType.INVALID)
            messages.report(InvalidSymbol, Loc(position + offset))
                .arg(Integer.toString(cast(int)current))
                .fatal(ExitLevel.Lexer);

        return c;

    }

    /**
      Turn an offset into the source text into a SourceLocation relative to
      start_loc. A negative `pos` (the default) means the current position.
    */
    private final SourceLocation Loc(int pos = -1)
    {
        int offset = (pos < 0) ? position : pos;
        return start_loc + offset;
    }

    /// Source manager the raw text was obtained from.
    SourceManager sm;
    /// Location of the first character of `source`; Loc() adds offsets to it.
    SourceLocation start_loc;
    /// Index into `source` of the next character to be lexed.
    int position;
    /// Raw text being tokenized, as returned by sm.getRawData(start_loc).
    char[] source;
    /// Receiver for lexer diagnostics.
    MessageHandler messages;
    /// Character class per character value; 256 entries, filled in by the constructor.
    CharType[] charTable;
    /// Handler per Symbol character, indexed by character value; used by lexSymbol.
    Token delegate()[] symbolFunctions;
}

/**
  Character classes used by the Lexer's character table.
*/
enum CharType : ubyte
{
    /// First member, hence the default value: any character the Lexer
    /// constructor does not classify stays INVALID.
    INVALID,
    /// ASCII letters and '_'.
    Letter,
    /// Decimal digits.
    Number,
    /// Characters that have a handler in Lexer.symbolFunctions.
    Symbol,
    /// Characters skipped between tokens.
    Whitespace,
    /// Classified characters with no lexing rule (' and \); Lexer.next
    /// reports UnexpectedTok for them.
    Other,

    /// Not a table entry: returned by Lexer.getNextChar past the end of the source.
    EOF
}