view trunk/src/Lexer.d @ 52:f65a83c27638

- Fixed the raw string literal scanner. Newlines weren't copied to the buffer. Converting LS and PS to '\n' as well. - Removed \x0 and \x1A from ptable.
author aziz
date Wed, 27 Jun 2007 17:18:02 +0000
parents cadd2bfe686c
children 1786c2825491
line wrap: on
line source

/++
  Author: Aziz Köksal
  License: GPL2
+/
module Lexer;
import Token;
import Keywords;
import Identifier;
import std.stdio;
import std.utf;
import std.uni;
import std.conv;

/// ASCII character properties table.
static const int ptable[256] = [
 0, 0, 0, 0, 0, 0, 0, 0, 0,32, 0,32,32, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
32, 0, 0x2200, 0, 0, 0, 0, 0x2700, 0, 0, 0, 0, 0, 0, 0, 0,
 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 0, 0, 0, 0, 0, 0x3f00,
 0,12,12,12,12,12,12, 8, 8, 8, 8, 8, 8, 8, 8, 8,
 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0x5c00, 0, 0,16,
 0, 0x70c, 0x80c,12,12,12, 0xc0c, 8, 8, 8, 8, 8, 8, 8, 0xa08, 8,
 8, 8, 0xd08, 8, 0x908, 8, 0xb08, 8, 8, 8, 8, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
];

enum CProperty
{
       Octal = 1,
       Digit = 1<<1,
         Hex = 1<<2,
       Alpha = 1<<3,
  Underscore = 1<<4,
  Whitespace = 1<<5
}

const uint EVMask = 0xFF00; // Bit mask for escape value

private alias CProperty CP;
int isoctal(char c) { return ptable[c] & CP.Octal; }
int isdigit(char c) { return ptable[c] & CP.Digit; }
int ishexad(char c) { return ptable[c] & CP.Hex; }
int isalpha(char c) { return ptable[c] & CP.Alpha; }
int isalnum(char c) { return ptable[c] & (CP.Alpha | CP.Digit); }
int isidbeg(char c) { return ptable[c] & (CP.Alpha | CP.Underscore); }
int isident(char c) { return ptable[c] & (CP.Alpha | CP.Underscore | CP.Digit); }
int isspace(char c) { return ptable[c] & CP.Whitespace; }
int char2ev(char c) { return ptable[c] >> 8; /*(ptable[c] & EVMask) >> 8;*/ }

version(gen_ptable)
static this()
{
  alias ptable p;
  // Initialize character properties table.
  for (int i; i < p.length; ++i)
  {
    p[i] = 0;
    if ('0' <= i && i <= '7')
      p[i] |= CP.Octal;
    if ('0' <= i && i <= '9')
      p[i] |= CP.Digit;
    if (isdigit(i) || 'a' <= i && i <= 'f' || 'A' <= i && i <= 'F')
      p[i] |= CP.Hex;
    if ('a' <= i && i <= 'z' || 'A' <= i && i <= 'Z')
      p[i] |= CP.Alpha;
    if (i == '_')
      p[i] |= CP.Underscore;
    if (i == ' ' || i == '\t' || i == '\v' || i == '\f')
      p[i] |= CP.Whitespace;
  }
  // Store escape sequence values in second byte.
  assert(CProperty.max <= ubyte.max, "character property flags and escape value byte overlap.");
  p['\''] |= 39 << 8;
  p['"'] |= 34 << 8;
  p['?'] |= 63 << 8;
  p['\\'] |= 92 << 8;
  p['a'] |= 7 << 8;
  p['b'] |= 8 << 8;
  p['f'] |= 12 << 8;
  p['n'] |= 10 << 8;
  p['r'] |= 13 << 8;
  p['t'] |= 9 << 8;
  p['v'] |= 11 << 8;
  // Print a formatted array literal.
  char[] array = "[\n";
  for (int i; i < p.length; ++i)
  {
    int c = p[i];
    array ~= std.string.format(c>255?" 0x%x,":"%2d,", c, ((i+1) % 16) ? "":"\n");
  }
  array[$-2..$] = "\n]";
  writefln(array);
}

const char[3] LS = \u2028;
const char[3] PS = \u2029;

const dchar LSd = 0x2028;
const dchar PSd = 0x2029;

const uint _Z_ = 26; /// Control+Z

/// Index into table of error messages.
enum MID
{
  InvalidUnicodeCharacter,
  InvalidUTF8Sequence,
  // ''
  UnterminatedCharacterLiteral,
  EmptyCharacterLiteral,
  // #line
  ExpectedIdentifierSTLine,
  ExpectedNormalStringLiteral,
  ExpectedNumberAfterSTLine,
  NewlineInSpecialToken,
  UnterminatedSpecialToken,
  // ""
  UnterminatedString,
  // x""
  NonHexCharInHexString,
  OddNumberOfDigitsInHexString,
  UnterminatedHexString,
  // /* */ /+ +/
  UnterminatedBlockComment,
  UnterminatedNestedComment,
  // `` r""
  UnterminatedRawString,
  UnterminatedBackQuoteString,
  // \x \u \U
  UndefinedEscapeSequence,
  InsufficientHexDigits,
  // \&[a-zA-Z][a-zA-Z0-9]+;
  UnterminatedHTMLEntity,
  InvalidBeginHTMLEntity,
}

string[] messages = [
  "invalid Unicode character.",
  "invalid UTF-8 sequence.",
  // ''
  "unterminated character literal.",
  "empty character literal.",
  // #line
  "expected 'line' after '#'.",
  `the filespec must be defined in a double quote string literal (e.g. "filespec".)`,
  "positive integer expected after #line",
  "newline not allowed inside special token.",
  "expected a terminating newline after special token.",
  // ""
  "unterminated string literal.",
  // x""
  "non-hex character '{1}' found in hex string.",
  "odd number of hex digits in hex string.",
  "unterminated hex string.",
  // /* */ /+ +/
  "unterminated block comment (/* */).",
  "unterminated nested comment (/+ +/).",
  // `` r""
  "unterminated raw string.",
  "unterminated back quote string.",
  // \x \u \U
  "found undefined escape sequence.",
  "insufficient number of hex digits in escape sequence.",
  // \&[a-zA-Z][a-zA-Z0-9]+;
  "unterminated html entity.",
  "html entities must begin with a letter.",
];

class Problem
{
  enum Type
  {
    Lexer,
    Parser,
    Semantic
  }

  MID id;
  Type type;
  uint loc;
  this(Type type, MID id, uint loc)
  {
    this.id = id;
    this.type = type;
    this.loc = loc;
  }
}

class Lexer
{
  Token token;
  string text;
  char* p;
  char* end;

  uint loc = 1; /// line of code

  char[] fileName;

  Problem[] errors;

  Identifier[string] idtable;

  this(string text, string fileName)
  {
    this.fileName = fileName;

    this.text = text;
    if (text[$-1] != 0)
    {
      this.text.length = this.text.length + 1;
      this.text[$-1] = 0;
    }

    this.p = this.text.ptr;
    this.end = this.p + this.text.length;

    loadKeywords();
  }

  public void scan(out Token t)
  {
    assert(p < end);

    uint c = *p;

    while(1)
    {
      t.start = p;

      if (c == 0)
      {
        assert(*p == 0);
        ++p;
        assert(p == end);
        t.type = TOK.EOF;
        t.end = p;
        return;
      }

      if (c == '\n')
      {
        c = *++p;
        ++loc;
        continue;
      }
      else if (c == '\r')
      {
        c = *++p;
        if (c != '\n')
          ++loc;
        continue;
      }
      else if (c == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
      {
        p += 3;
        c = *p;
        continue;
      }

      if (isidbeg(c))
      {
        if (c == 'r' && p[1] == '"' && ++p)
          return scanRawStringLiteral(t);
        if (c == 'x' && p[1] == '"')
          return scanHexStringLiteral(t);
      Lidentifier:
        do
        { c = *++p; }
        while (isident(c) || c & 128 && isUniAlpha(decodeUTF8()))

        t.end = p;

        string str = t.span;
        Identifier* id = str in idtable;

        if (!id)
        {
          idtable[str] = Identifier.Identifier(TOK.Identifier, str);
          id = str in idtable;
        }
        assert(id);
        t.type = id.type;
        return;
      }

      if (isdigit(c))
        return scanNumber(t);

      if (c == '/')
      {
        c = *++p;
        switch(c)
        {
        case '=':
          ++p;
          t.type = TOK.DivAssign;
          t.end = p;
          return;
        case '+':
          uint level = 1;
          while (1)
          {
            c = *++p;
          LswitchNC: // only jumped to from default case of next switch(c)
            switch (c)
            {
            case '\r':
              if (p[1] == '\n')
                ++p;
            case '\n':
              ++loc;
              continue;
            case 0, _Z_:
              error(MID.UnterminatedNestedComment);
              goto LreturnNC;
            default:
            }

            c <<= 8;
            c |= *++p;
            switch (c)
            {
            case 0x2F2B: // /+
              ++level;
              continue;
            case 0x2B2F: // +/
              if (--level == 0)
              {
                ++p;
              LreturnNC:
                t.type = TOK.Comment;
                t.end = p;
                return;
              }
              continue;
            case 0xE280: // LS[0..1] || PS[0..1]
              if (p[1] == LS[2] || p[1] == PS[2])
              {
                ++loc;
                ++p;
              }
              continue;
            default:
              c &= char.max;
              goto LswitchNC;
            }
          }
        case '*':
          while (1)
          {
            c = *++p;
          LswitchBC: // only jumped to from default case of next switch(c)
            switch (c)
            {
            case '\r':
              if (p[1] == '\n')
                ++p;
            case '\n':
              ++loc;
              continue;
            case 0, _Z_:
              error(MID.UnterminatedBlockComment);
              goto LreturnBC;
            default:
            }

            c <<= 8;
            c |= *++p;
            switch (c)
            {
            case 0x2A2F: // */
              ++p;
            LreturnBC:
              t.type = TOK.Comment;
              t.end = p;
              return;
            case 0xE280: // LS[0..1] || PS[0..1]
              if (p[1] == LS[2] || p[1] == PS[2])
              {
                ++loc;
                ++p;
              }
              continue;
            default:
              c &= char.max;
              goto LswitchBC;
            }
          }
          assert(0);
        case '/':
          while (1)
          {
            c = *++p;
            switch (c)
            {
            case '\r':
              if (p[1] == '\n')
                ++p;
            case '\n':
            case 0, _Z_:
              break;
            case LS[0]:
              if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
                break;
              continue;
            default:
              continue;
            }
            t.type = TOK.Comment;
            t.end = p;
            return;
          }
        default:
          t.type = TOK.Div;
          t.end = p;
          return;
        }
      }

      switch (c)
      {
      case '\'':
        return scanCharacterLiteral(t);
      case '`':
        return scanRawStringLiteral(t);
      case '"':
        return scanNormalStringLiteral(t);
      case '\\':
        char[] buffer;
        do
        {
          ++p;
          c = scanEscapeSequence();
          if (c == 0xFFFF)
            break;
          if (c < 128)
            buffer ~= c;
          else
            encodeUTF8(buffer, c);
        } while (*p == '\\')
        buffer ~= 0;
        t.type = TOK.String;
        t.str = buffer;
        t.end = p;
        return;
      case '>': /* >  >=  >>  >>=  >>>  >>>= */
        c = *++p;
        switch (c)
        {
        case '=':
          t.type = TOK.GreaterEqual;
          goto Lcommon;
        case '>':
          if (p[1] == '>')
          {
            ++p;
            if (p[1] == '=')
            { ++p;
              t.type = TOK.URShiftAssign;
            }
            else
              t.type = TOK.URShift;
          }
          else if (p[1] == '=')
          {
            ++p;
            t.type = TOK.RShiftAssign;
          }
          else
            t.type = TOK.RShift;
          goto Lcommon;
        default:
          t.type = TOK.Greater;
          goto Lcommon2;
        }
        assert(0);
      case '<': /* <  <=  <>  <>=  <<  <<= */
        c = *++p;
        switch (c)
        {
        case '=':
          t.type = TOK.LessEqual;
          goto Lcommon;
        case '<':
          if (p[1] == '=') {
            ++p;
            t.type = TOK.LShiftAssign;
          }
          else
            t.type = TOK.LShift;
          goto Lcommon;
        case '>':
          if (p[1] == '=') {
            ++p;
            t.type = TOK.LorEorG;
          }
          else
            t.type = TOK.LorG;
          goto Lcommon;
        default:
          t.type = TOK.Less;
          goto Lcommon2;
        }
        assert(0);
      case '!': /* !  !<  !>  !<=  !>=  !<>  !<>= */
        c = *++p;
        switch (c)
        {
        case '<':
          c = *++p;
          if (c == '>')
          {
            if (p[1] == '=') {
              ++p;
              t.type = TOK.Unordered;
            }
            else
              t.type = TOK.UorE;
          }
          else if (c == '=')
          {
            t.type = TOK.UorG;
          }
          else {
            t.type = TOK.UorGorE;
            goto Lcommon2;
          }
          goto Lcommon;
        case '>':
          if (p[1] == '=')
          {
            ++p;
            t.type = TOK.UorL;
          }
          else
            t.type = TOK.UorLorE;
          goto Lcommon;
        case '=':
          t.type = TOK.NotEqual;
          goto Lcommon;
        default:
          t.type = TOK.Not;
          goto Lcommon2;
        }
        assert(0);
      case '.': /* .  ..  ... */
        if (p[1] == '.')
        {
          ++p;
          if (p[1] == '.') {
            ++p;
            t.type = TOK.Ellipses;
          }
          else
            t.type = TOK.Slice;
        }
        else
          t.type = TOK.Dot;
        goto Lcommon;
      case '|': /* |  ||  |= */
        c = *++p;
        if (c == '=')
          t.type = TOK.OrAssign;
        else if (c == '|')
          t.type = TOK.OrLogical;
        else {
          t.type = TOK.OrBinary;
          goto Lcommon2;
        }
        goto Lcommon;
      case '&': /* &  &&  &= */
        c = *++p;
        if (c == '=')
          t.type = TOK.AndAssign;
        else if (c == '&')
          t.type = TOK.AndLogical;
        else {
          t.type = TOK.AndBinary;
          goto Lcommon2;
        }
        goto Lcommon;
      case '+': /* +  ++  += */
        c = *++p;
        if (c == '=')
          t.type = TOK.PlusAssign;
        else if (c == '+')
          t.type = TOK.PlusPlus;
        else {
          t.type = TOK.Plus;
          goto Lcommon2;
        }
        goto Lcommon;
      case '-': /* -  --  -= */
        c = *++p;
        if (c == '=')
          t.type = TOK.MinusAssign;
        else if (c == '-')
          t.type = TOK.MinusMinus;
        else {
          t.type = TOK.Minus;
          goto Lcommon2;
        }
        goto Lcommon;
      case '=': /* =  == */
        if (p[1] == '=') {
          ++p;
          t.type = TOK.Equal;
        }
        else
          t.type = TOK.Assign;
        goto Lcommon;
      case '~': /* ~  ~= */
         if (p[1] == '=') {
           ++p;
           t.type = TOK.CatAssign;
         }
         else
           t.type = TOK.Tilde;
         goto Lcommon;
      case '*': /* *  *= */
         if (p[1] == '=') {
           ++p;
           t.type = TOK.MulAssign;
         }
         else
           t.type = TOK.Mul;
         goto Lcommon;
      case '^': /* ^  ^= */
         if (p[1] == '=') {
           ++p;
           t.type = TOK.XorAssign;
         }
         else
           t.type = TOK.Xor;
         goto Lcommon;
      case '%': /* %  %= */
         if (p[1] == '=') {
           ++p;
           t.type = TOK.ModAssign;
         }
         else
           t.type = TOK.Mod;
         goto Lcommon;
      // Single character tokens:
      case '(':
        t.type = TOK.LParen;
        goto Lcommon;
      case ')':
        t.type = TOK.RParen;
        goto Lcommon;
      case '[':
        t.type = TOK.LBracket;
        goto Lcommon;
      case ']':
        t.type = TOK.RBracket;
        goto Lcommon;
      case '{':
        t.type = TOK.LBrace;
        goto Lcommon;
      case '}':
        t.type = TOK.RBrace;
        goto Lcommon;
      case ':':
        t.type = TOK.Colon;
        goto Lcommon;
      case ';':
        t.type = TOK.Semicolon;
        goto Lcommon;
      case '?':
        t.type = TOK.Question;
        goto Lcommon;
      case ',':
        t.type = TOK.Comma;
        goto Lcommon;
      case '$':
        t.type = TOK.Dollar;
      Lcommon:
        ++p;
      Lcommon2:
        t.end = p;
        return;
      case '#':
        scanSpecialToken();
        c = *p;
        continue;
      default:
      }

      if (c & 128 && isUniAlpha(decodeUTF8()))
        goto Lidentifier;
      c = *++p;
    }
  }

  void peek(ref Token t)
  {
    char* tmp = p;
    uint len = errors.length;
    scan(t);
    p = tmp;
    if (errors.length != len)
      errors = errors[0..len];
  }

  void scanNormalStringLiteral(ref Token t)
  {
    assert(*p == '"');
    ++p;
    char[] buffer;
    t.type = TOK.String;
    while (1)
    {
      switch (*p)
      {
      case '"':
        ++p;
      Lreturn:
        buffer ~= 0;
        t.str = buffer;
        t.pf = scanPostfix();
        t.end = p;
        return;
      case '\\':
        ++p;
        dchar d = scanEscapeSequence();
        if (d == 0xFFFF)
          continue;
        if (d < 128)
          buffer ~= d;
        else
          encodeUTF8(buffer, d);
        continue;
      case '\r':
        if (p[1] == '\n')
          ++p;
      case '\n':
        ++p;
        ++loc;
        buffer ~= '\n';
        continue;
      case 0, _Z_:
        error(MID.UnterminatedString);
        goto Lreturn;
      default:
        if (*p & 128)
        {
          char* begin = p;
          dchar d = decodeUTF8();
          if (d == LSd || d == PSd)
            goto case '\n';

          if (d != 0xFFFF)
          {
            ++p;
            buffer ~= begin[0 .. p - begin];
          }
          continue;
        }
        buffer ~= *p++;
      }
    }
  }

  void scanCharacterLiteral(ref Token t)
  {
    assert(*p == '\'');
    MID id = MID.UnterminatedCharacterLiteral;
    ++p;
    switch (*p)
    {
    case '\\':
      ++p;
      t.dchar_ = scanEscapeSequence();
      break;
    case '\'':
      ++p;
      id = MID.EmptyCharacterLiteral;
    case '\n', '\r', 0, _Z_:
      goto Lerr;
    default:
      uint c = *p;
      if (c & 128)
      {
        c = decodeUTF8();
        if (c == LSd || c == PSd)
          goto Lerr;
      }
      t.dchar_ = c;
      ++p;
    }

    if (*p == '\'')
      ++p;
    else
  Lerr:
      error(id);
    t.type = TOK.Character;
    t.end = p;
  }

  char scanPostfix()
  {
    switch (*p)
    {
    case 'c':
    case 'w':
    case 'd':
      return *p++;
    default:
      return 0;
    }
  }

  void scanRawStringLiteral(ref Token t)
  {
    uint delim = *p;
    assert(delim == '`' || delim == '"' && p[-1] == 'r');
    t.type = TOK.String;
    char[] buffer;
    uint c;
    while (1)
    {
      c = *++p;
      switch (c)
      {
      case '\r':
        if (p[1] == '\n')
          ++p;
        c = '\n'; // Convert '\r' and '\r\n' to '\n'
      case '\n':
        ++loc;
        break;
      case '`':
      case '"':
        if (c == delim)
        {
          ++p;
          t.pf = scanPostfix();
        Lreturn:
          t.str = buffer ~ '\0';
          t.end = p;
          return;
        }
        break;
      case LS[0]:
        if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
        {
          c = '\n';
          ++p; ++p;
          ++loc;
        }
        break;
      case 0, _Z_:
        if (delim == 'r')
          error(MID.UnterminatedRawString);
        else
          error(MID.UnterminatedBackQuoteString);
        goto Lreturn;
      default:
      }
      buffer ~= c; // copy character to buffer
    }
    assert(0);
  }

  void scanHexStringLiteral(ref Token t)
  {
    assert(p[0] == 'x' && p[1] == '"');
    p+=2;
    t.type = TOK.String;

    uint c;
    ubyte[] buffer;
    ubyte h; // hex number
    uint n; // number of hex digits
    MID mid;

    while (1)
    {
      c = *p++;
      switch (c)
      {
      case '"':
        if (n & 1)
        {
          mid = MID.OddNumberOfDigitsInHexString;
          error(mid);
        }
        t.str = cast(string) buffer;
        t.pf = scanPostfix();
        t.end = p;
        return;
      case '\r':
        if (*p == '\n')
          ++p;
      case '\n':
        ++loc;
        continue;
      case LS[0]:
        if (*p == LS[1] && (p[1] == LS[2] || p[1] == PS[2])) {
          p += 2;
          ++loc;
        }
        continue;
      case 0, _Z_:
        mid = MID.UnterminatedHexString;
        goto Lerr;
      default:
        if (ishexad(c))
        {
          if (c <= '9')
            c -= '0';
          else if (c <= 'F')
            c -= 'A' - 10;
          else
            c -= 'a' - 10;

          if (n & 1)
          {
            h <<= 4;
            h |= c;
            buffer ~= h;
          }
          else
            h = c;
          ++n;
          continue;
        }
        else if (isspace(c))
          continue;
        mid = MID.NonHexCharInHexString;
        goto Lerr;
      }
    }

    return;
  Lerr:
    error(mid);
    t.pf = 0;
    t.end = p;
  }

  dchar scanEscapeSequence()
  {
    uint c = char2ev(*p);
    if (c) {
      ++p;
      return c;
    }
    c = 0xFFFF;
    uint digits = 2;

    switch (*p)
    {
    case 'x':
      c = 0;
      while (1)
      {
        ++p;
        if (ishexad(*p))
        {
          c *= 16;
          if (*p <= '9')
            c += *p - '0';
          else if (*p <= 'F')
            c += *p - 'A' + 10;
          else
            c += *p - 'a' + 10;
          if (!--digits) {
            ++p;
            break;
          }
        }
        else
        {
          error(MID.InsufficientHexDigits);
          c = 0xFFFF;
          break;
        }
      }
      break;
    case 'u':
      digits = 4;
      goto case 'x';
    case 'U':
      digits = 8;
      goto case 'x';
    default:
      if (isoctal(*p))
      {
        c = 0;
        c += *p - '0';
        ++p;
        if (!isoctal(*p))
          return c;
        c *= 8;
        c += *p - '0';
        ++p;
        if (!isoctal(*p))
          return c;
        c *= 8;
        c += *p - '0';
        ++p;
      }
      else if(*p == '&')
      {
        if (isalpha(*++p))
        {
          while (1)
          {
            if (isalnum(*++p))
              continue;
            if (*p == ';') {
              // TODO: convert entity to unicode codepoint.
              ++p;
              break;
            }
            else {
              error(MID.UnterminatedHTMLEntity);
              break;
            }
          }
        }
        else
          error(MID.InvalidBeginHTMLEntity);
      }
      else
        error(MID.UndefinedEscapeSequence);
    }

    return c;
  }

  void scanNumber(ref Token t)
  {
    while (isdigit(*++p)) {}
    t.type = TOK.Number;
    t.end = p;
    t._uint = toInt(t.span);
  }

  /// Scan special token: #line Integer [Filespec] EndOfLine
  // TODO: Handle case like: #line 0 #line 2
  void scanSpecialToken()
  {
    assert(*p == '#');

    ++p;
    MID mid;
    Token t;
    uint oldloc = this.loc, newloc;

    peek(t);
    if (!(this.loc == oldloc && p == t.start && t.type == TOK.Identifier && t.span == "line"))
    {
      this.loc = oldloc; // reset this.loc because we took a peek at the next token
      mid = MID.ExpectedIdentifierSTLine;
      goto Lerr;
    }
    p = t.end; // consume token

    peek(t);
    if (this.loc == oldloc && t.type == TOK.Number)
    {
      newloc = t._uint - 1;
      p = t.end;
    }
    else
    {
      this.loc = oldloc;
      mid = MID.ExpectedNumberAfterSTLine;
      goto Lerr;
    }

    peek(t);
    if (this.loc != oldloc)
    {
      this.loc = oldloc;
      mid = MID.NewlineInSpecialToken;
      goto Lerr;
    }
    if (t.type == TOK.String)
    {
      if (*t.start != '"')
      {
        mid = MID.ExpectedNormalStringLiteral;
        goto Lerr;
      }
      fileName = t.span[1..$-1]; // contents of "..."
      p = t.end;
    }
    else if (t.type == TOK.Identifier && t.span == "__FILE__")
    {
      p = t.end;
    }
/+
    peek(t);
    if (this.loc == oldloc && t.type != TOK.EOF)
    {
      mid = MID.UnterminatedSpecialToken;
      goto Lerr;
    }
+/
    while (1)
    {
      switch (*p)
      {
      case '\r':
        if (p[1] == '\n')
          ++p;
      case '\n':
        ++p;
        break;
      case LS[0]:
        if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
        {
          p += 2;
          break;
        }
      case 0, _Z_:
        break;
      default:
        if (isspace(*p)) {
          ++p;
          continue;
        }
        mid = MID.UnterminatedSpecialToken;
        goto Lerr;
      }
      break;
    }

    this.loc = newloc;
    return;
  Lerr:
    error(mid);
  }

  uint decodeUTF8()
  {
    assert(*p & 128, "check for ASCII char before calling decodeUTF8().");
    size_t idx;
    uint d = 0xFFFF;
    try
    {
      d = std.utf.decode(p[0 .. end-p], idx);
      p += idx -1;
    }
    catch (UtfException e)
    {
      error(MID.InvalidUTF8Sequence);
      // Skip to next valid utf-8 sequence
      while (UTF8stride[*++p] != 0xFF) {}
    }
    return d;
  }

  void loadKeywords()
  {
    foreach(k; keywords)
      idtable[k.str] = k;
  }

  void error(MID id)
  {
    errors ~= new Problem(Problem.Type.Lexer, id, loc);
  }

  public TOK nextToken()
  {
    scan(this.token);
    return this.token.type;
  }

  Token[] getTokens()
  {
    Token[] tokens;
    while (nextToken() != TOK.EOF)
      tokens ~= this.token;
    tokens ~= this.token;
    return tokens;
  }

  private void encodeUTF8(inout char[] str, dchar d)
  {
    char[6] b;
    assert(d > 0x7F, "check for ASCII char before calling encodeUTF8().");
    if (d < 0x800)
    {
      b[0] = 0xC0 | (d >> 6);
      b[1] = 0x80 | (d & 0x3F);
      str ~= b[0..2];
    }
    else if (d < 0x10000)
    {
      b[0] = 0xE0 | (d >> 12);
      b[1] = 0x80 | ((d >> 6) & 0x3F);
      b[2] = 0x80 | (d & 0x3F);
      str ~= b[0..3];
    }
    else if (d < 0x200000)
    {
      b[0] = 0xF0 | (d >> 18);
      b[1] = 0x80 | ((d >> 12) & 0x3F);
      b[2] = 0x80 | ((d >> 6) & 0x3F);
      b[3] = 0x80 | (d & 0x3F);
      str ~= b[0..4];
    }
    else if (d < 0x4000000)
    {
      b[0] = 0xF8 | (d >> 24);
      b[1] = 0x80 | ((d >> 18) & 0x3F);
      b[2] = 0x80 | ((d >> 12) & 0x3F);
      b[3] = 0x80 | ((d >> 6) & 0x3F);
      b[4] = 0x80 | (d & 0x3F);
      str ~= b[0..5];
    }
    else if (d < 0x80000000)
    {
      b[0] = 0xFC | (d >> 30);
      b[1] = 0x80 | ((d >> 24) & 0x3F);
      b[2] = 0x80 | ((d >> 18) & 0x3F);
      b[3] = 0x80 | ((d >> 12) & 0x3F);
      b[4] = 0x80 | ((d >> 6) & 0x3F);
      b[5] = 0x80 | (d & 0x3F);
      str ~= b[0..6];
    }
    else
      error(MID.InvalidUnicodeCharacter);
  }
}

unittest
{
  string[] toks = [
    ">",    ">=", ">>",  ">>=", ">>>", ">>>=", "<",   "<=",  "<>",
    "<>=",  "<<", "<<=", "!",   "!<",  "!>",   "!<=", "!>=", "!<>",
    "!<>=", ".",  "..",  "...", "&",   "&&",   "&=",  "+",   "++",
    "+=",   "-",  "--",  "-=",  "=",   "==",   "~",   "~=",  "*",
    "*=",   "/",  "/=",  "^",   "^=",  "%",    "%=",  "(",   ")",
    "[",    "]",  "{",   "}",   ":",   ";",    "?",   ",",   "$"
  ];

  char[] src;

  foreach (op; toks)
    src ~= op ~ " ";

  auto lx = new Lexer(src, "");
  auto tokens = lx.getTokens();

  tokens = tokens[0..$-1]; // exclude TOK.EOF

  assert(tokens.length == toks.length );

  foreach (i, t; tokens)
    assert(t.span == toks[i], std.string.format("Lexed '%s' but expected '%s'", t.span, toks[i]));
}