Mercurial > projects > dil

/++
  Author: Aziz Köksal
  License: GPL3
+/
module dil.Lexer;
import dil.Token;
import dil.Information;
import dil.Keywords;
import dil.Identifier;
import dil.Messages;
import dil.HtmlEntities;
import dil.Settings;
import std.stdio;
import std.utf;
import std.uni;
import std.c.stdlib : strtof, strtod, strtold, getErrno, ERANGE;
import std.c.time : time_t, time, ctime;
import std.c.string : strlen;
import std.string;

/// UTF-8 encoding (three bytes) of the Unicode line separator U+2028.
const char[3] LS = \u2028;
/// UTF-8 encoding (three bytes) of the Unicode paragraph separator U+2029.
/// The lexer compares the first two bytes against LS and only the third
/// byte against both LS[2] and PS[2] when testing for either separator.
const char[3] PS = \u2029;

const dchar LSd = 0x2028; /// Decoded code point of the line separator.
const dchar PSd = 0x2029; /// Decoded code point of the paragraph separator.

const uint _Z_ = 26; /// Control+Z

class Lexer
{
  Token* head; /// The head of the doubly linked token list.
  Token* tail; /// The tail of the linked list. Set in scan().
  Token* token; /// Points to the current token in the token list.
  string text; /// The source text. The constructor appends a '\0' when the text does not end with one.
  char* p; /// Points to the current character in the source text.
  char* end; /// Points one character past the end of the source text.

  uint loc = 1; /// Actual line of code.

  uint loc_old; /// Store actual line number when #line token is parsed.
  uint loc_hline; /// Line number set by #line.

  char[] fileName; /// File name given to the constructor; copied into TOK.FILE tokens by finalizeSpecialToken().

  Information[] errors; /// Collected diagnostics. NOTE(review): presumably appended to by error(); confirm.

//   bool reportErrors;

  Identifier[string] idtable; /// Maps identifier text to its Identifier entry; scan() adds unknown identifiers.

  /// Constructs a lexer over the given source text.
  ///
  /// The text is guaranteed to end with a '\0' sentinel afterwards; one is
  /// appended when missing. The scanning functions rely on that sentinel
  /// to detect the end of input.
  /// Params:
  ///   text     = the source text to tokenize (may be empty).
  ///   fileName = the name stored in fileName and used for TOK.FILE tokens.
  this(string text, string fileName)
  {
    this.fileName = fileName;

    this.text = text;
    // An empty text has no last element; text[$-1] would be out of bounds.
    if (this.text.length == 0 || this.text[$-1] != 0)
    {
      this.text.length = this.text.length + 1;
      this.text[$-1] = 0;
    }

    this.p = this.text.ptr;
    this.end = this.p + this.text.length;
//     this.reportErrors = true;
    loadKeywords();

    // The list always starts with a dummy HEAD token.
    this.head = new Token;
    this.head.type = TOK.HEAD;
    this.token = this.head;
    scanShebang();
  }

  /// Frees the tokens of the linked list, starting with the head.
  ///
  /// Each iteration deletes the predecessor of the current token; the
  /// last token is released through tail afterwards.
  ~this()
  {
    auto token = head.next;
    if (token is null)
    {
      // Nothing was ever linked after the head token, so there is no
      // token whose predecessor could be deleted. Free the head only.
      delete head;
      return;
    }
    do
    {
      assert(token.type == TOK.EOF ? token == tail && token.next is null : 1);
      delete token.prev;
      token = token.next;
    } while (token !is null)
    delete tail;
  }

  /// Scans a leading "#!" line, if present, into a TOK.Shebang token and
  /// links it directly after the head token.
  ///
  /// The terminating newline is consumed and counted in loc, but it is
  /// not part of the token: t.end is exclusive and points at the first
  /// byte of the line terminator (or at the end-of-file character).
  void scanShebang()
  {
    if (*p == '#' && p[1] == '!')
    {
      Token* t = new Token;
      t.start = p;
      t.type = TOK.Shebang;
      ++p;
      while (1)
      {
        // p is on the last byte consumed so far. The end pointer is
        // exclusive everywhere else in the lexer, so it has to point one
        // past that byte; otherwise the last character of the line is
        // cut off the token text.
        t.end = p + 1;
        switch (*++p)
        {
        case '\r':
          if (p[1] == '\n')
            ++p;
        case '\n':
          ++p;
          ++loc;
          break;
        case 0, _Z_:
          break;
        default:
          if (*p & 128)
          {
            // NOTE(review): assumes decodeUTF8() leaves p on the last
            // byte of the decoded sequence, as its other call sites do.
            auto c = decodeUTF8();
            if (c == LSd || c == PSd)
              goto case '\n';
          }
          continue;
        }
        break; // Exit loop.
      }
      this.head.next = t;
      t.prev = this.head;
    }
  }

  /// Fills in the value of a special token (one whose text starts with "__").
  ///
  /// Depending on the token type this stores the file name, the current
  /// line, a zero-terminated date/time string cut out of ctime()'s
  /// output, the vendor string or the version number.
  void finalizeSpecialToken(ref Token t)
  {
    assert(t.srcText[0..2] == "__");
    switch (t.type)
    {
    case TOK.VENDOR:
      t.str = VENDOR;
      break;
    case TOK.VERSION:
      t.uint_ = VERSION_MAJOR*1000 + VERSION_MINOR;
      break;
    case TOK.FILE:
      t.str = this.fileName;
      break;
    case TOK.LINE:
      t.uint_ = this.loc;
      break;
    case TOK.DATE,
         TOK.TIME,
         TOK.TIMESTAMP:
      // Query the current time once and slice the requested part out of
      // the textual representation returned by ctime().
      time_t now;
      time(&now);
      char* cstr = ctime(&now);
      char[] stamp = cstr[0 .. strlen(cstr)];
      char[] result;
      if (t.type == TOK.DATE)
        result = stamp[4..11] ~ stamp[20..24] ~ \0;
      else if (t.type == TOK.TIME)
        result = stamp[11..19] ~ \0;
      else if (t.type == TOK.TIMESTAMP)
        result = stamp[0..24] ~ \0;
      else
        assert(0);
      t.str = result;
      break;
    default:
      assert(0);
    }
  }

  /// Scans the next token of the source text into t.
  ///
  /// Newlines and characters that do not start a token are skipped.
  /// Comments are returned as TOK.Comment tokens. When the end of the
  /// text (0 or Control+Z) is reached, a TOK.EOF token is produced and
  /// tail is set to point to it.
  /// Params:
  ///   t = receives the scanned token; start/end delimit its source text
  ///       (end is exclusive).
  public void scan(out Token t)
  in
  {
    assert(text.ptr <= p && p < end);
  }
  out
  {
    assert(text.ptr <= t.start && t.start < end);
    assert(text.ptr < t.end && t.end <= end, std.string.format(t.type));
  }
  body
  {
    uint c = *p;

    while (1)
    {
      t.start = p;

      if (c == 0 || c == _Z_)
      {
        assert(*p == 0 || *p == _Z_);
        t.type = TOK.EOF;
        t.end = p;
        tail = &t;
        assert(t.start == t.end);
        return;
      }

      if (c == '\n')
      {
        c = *++p;
        ++loc;
        continue;
      }
      else if (c == '\r')
      {
        c = *++p;
        // For "\r\n" the line is counted when the '\n' is seen.
        if (c != '\n')
          ++loc;
        continue;
      }
      else if (c == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
      {
        // LS and PS are line terminators as well; count them like the
        // other newline forms so that loc stays correct.
        p += 3;
        c = *p;
        ++loc;
        continue;
      }

      if (isidbeg(c))
      {
        if (c == 'r' && p[1] == '"' && ++p)
          return scanRawStringLiteral(t);
        if (c == 'x' && p[1] == '"')
          return scanHexStringLiteral(t);
      version(D2)
      {
        if (c == 'q' && p[1] == '"')
          return scanDelimitedStringLiteral(t);
        if (c == 'q' && p[1] == '{')
          return scanTokenStringLiteral(t);
      }
      Lidentifier:
        do
        { c = *++p; }
        while (isident(c) || c & 128 && isUniAlpha(decodeUTF8()))

        t.end = p;

        // Look the identifier up; unknown ones are added to the table.
        string str = t.srcText;
        Identifier* id = str in idtable;

        if (!id)
        {
          idtable[str] = Identifier(TOK.Identifier, str);
          id = str in idtable;
        }
        assert(id);
        t.type = id.type;
        if (t.isSpecialToken)
          finalizeSpecialToken(t);
        return;
      }

      if (isdigit(c))
        return scanNumber(t);

      if (c == '/')
      {
        c = *++p;
        switch(c)
        {
        case '=':
          ++p;
          t.type = TOK.DivAssign;
          t.end = p;
          return;
        case '+':
          // Nested comment. Two characters are examined at a time by
          // packing them into the low 16 bits of c.
          uint level = 1;
          while (1)
          {
            c = *++p;
          LswitchNC: // only jumped to from default case of next switch(c)
            switch (c)
            {
            case '\r':
              if (p[1] == '\n')
                ++p;
            case '\n':
              ++loc;
              continue;
            case 0, _Z_:
              error(MID.UnterminatedNestedComment);
              goto LreturnNC;
            default:
            }

            c <<= 8;
            c |= *++p;
            switch (c)
            {
            case 0x2F2B: // /+
              ++level;
              continue;
            case 0x2B2F: // +/
              if (--level == 0)
              {
                ++p;
              LreturnNC:
                t.type = TOK.Comment;
                t.end = p;
                return;
              }
              continue;
            case 0xE280: // LS[0..1] || PS[0..1]
              if (p[1] == LS[2] || p[1] == PS[2])
              {
                ++loc;
                ++p;
              }
              continue;
            default:
              // Keep only the second character and examine it again as
              // the first character of the next pair.
              c &= char.max;
              goto LswitchNC;
            }
          }
        case '*':
          // Block comment; same two-character technique as above.
          while (1)
          {
            c = *++p;
          LswitchBC: // only jumped to from default case of next switch(c)
            switch (c)
            {
            case '\r':
              if (p[1] == '\n')
                ++p;
            case '\n':
              ++loc;
              continue;
            case 0, _Z_:
              error(MID.UnterminatedBlockComment);
              goto LreturnBC;
            default:
            }

            c <<= 8;
            c |= *++p;
            switch (c)
            {
            case 0x2A2F: // */
              ++p;
            LreturnBC:
              t.type = TOK.Comment;
              t.end = p;
              return;
            case 0xE280: // LS[0..1] || PS[0..1]
              if (p[1] == LS[2] || p[1] == PS[2])
              {
                ++loc;
                ++p;
              }
              continue;
            default:
              c &= char.max;
              goto LswitchBC;
            }
          }
          assert(0);
        case '/':
          // Line comment. The terminating newline is not consumed here;
          // it is handled (and counted) by the next call to scan().
          while (1)
          {
            c = *++p;
            switch (c)
            {
            case '\r':
              if (p[1] == '\n')
                ++p;
            case '\n':
            case 0, _Z_:
              break;
            case LS[0]:
              if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
                break;
              continue;
            default:
              continue;
            }
            t.type = TOK.Comment;
            t.end = p;
            return;
          }
        default:
          t.type = TOK.Div;
          t.end = p;
          return;
        }
      }

      switch (c)
      {
      case '\'':
        return scanCharacterLiteral(t);
      case '`':
        return scanRawStringLiteral(t);
      case '"':
        return scanNormalStringLiteral(t);
      case '\\':
        // Escape string: a run of consecutive escape sequences.
        char[] buffer;
        do
        {
          c = scanEscapeSequence();
          if (c < 128)
            buffer ~= c;
          else
            encodeUTF8(buffer, c);
        } while (*p == '\\')
        buffer ~= 0;
        t.type = TOK.String;
        t.str = buffer;
        t.end = p;
        return;
      case '>': /* >  >=  >>  >>=  >>>  >>>= */
        c = *++p;
        switch (c)
        {
        case '=':
          t.type = TOK.GreaterEqual;
          goto Lcommon;
        case '>':
          if (p[1] == '>')
          {
            ++p;
            if (p[1] == '=')
            { ++p;
              t.type = TOK.URShiftAssign;
            }
            else
              t.type = TOK.URShift;
          }
          else if (p[1] == '=')
          {
            ++p;
            t.type = TOK.RShiftAssign;
          }
          else
            t.type = TOK.RShift;
          goto Lcommon;
        default:
          t.type = TOK.Greater;
          goto Lcommon2;
        }
        assert(0);
      case '<': /* <  <=  <>  <>=  <<  <<= */
        c = *++p;
        switch (c)
        {
        case '=':
          t.type = TOK.LessEqual;
          goto Lcommon;
        case '<':
          if (p[1] == '=') {
            ++p;
            t.type = TOK.LShiftAssign;
          }
          else
            t.type = TOK.LShift;
          goto Lcommon;
        case '>':
          if (p[1] == '=') {
            ++p;
            t.type = TOK.LorEorG;
          }
          else
            t.type = TOK.LorG;
          goto Lcommon;
        default:
          t.type = TOK.Less;
          goto Lcommon2;
        }
        assert(0);
      case '!': /* !  !<  !>  !<=  !>=  !<>  !<>= */
        c = *++p;
        switch (c)
        {
        case '<':
          c = *++p;
          if (c == '>')
          {
            if (p[1] == '=') {
              ++p;
              t.type = TOK.Unordered;
            }
            else
              t.type = TOK.UorE;
          }
          else if (c == '=')
          {
            t.type = TOK.UorG;
          }
          else {
            t.type = TOK.UorGorE;
            goto Lcommon2;
          }
          goto Lcommon;
        case '>':
          if (p[1] == '=')
          {
            ++p;
            t.type = TOK.UorL;
          }
          else
            t.type = TOK.UorLorE;
          goto Lcommon;
        case '=':
          t.type = TOK.NotEqual;
          goto Lcommon;
        default:
          t.type = TOK.Not;
          goto Lcommon2;
        }
        assert(0);
      case '.': /* .  .[0-9]  ..  ... */
        if (p[1] == '.')
        {
          ++p;
          if (p[1] == '.') {
            ++p;
            t.type = TOK.Ellipses;
          }
          else
            t.type = TOK.Slice;
        }
        else if (isdigit(p[1]))
        {
          return scanReal(t);
        }
        else
          t.type = TOK.Dot;
        goto Lcommon;
      case '|': /* |  ||  |= */
        c = *++p;
        if (c == '=')
          t.type = TOK.OrAssign;
        else if (c == '|')
          t.type = TOK.OrLogical;
        else {
          t.type = TOK.OrBinary;
          goto Lcommon2;
        }
        goto Lcommon;
      case '&': /* &  &&  &= */
        c = *++p;
        if (c == '=')
          t.type = TOK.AndAssign;
        else if (c == '&')
          t.type = TOK.AndLogical;
        else {
          t.type = TOK.AndBinary;
          goto Lcommon2;
        }
        goto Lcommon;
      case '+': /* +  ++  += */
        c = *++p;
        if (c == '=')
          t.type = TOK.PlusAssign;
        else if (c == '+')
          t.type = TOK.PlusPlus;
        else {
          t.type = TOK.Plus;
          goto Lcommon2;
        }
        goto Lcommon;
      case '-': /* -  --  -= */
        c = *++p;
        if (c == '=')
          t.type = TOK.MinusAssign;
        else if (c == '-')
          t.type = TOK.MinusMinus;
        else {
          t.type = TOK.Minus;
          goto Lcommon2;
        }
        goto Lcommon;
      case '=': /* =  == */
        if (p[1] == '=') {
          ++p;
          t.type = TOK.Equal;
        }
        else
          t.type = TOK.Assign;
        goto Lcommon;
      case '~': /* ~  ~= */
         if (p[1] == '=') {
           ++p;
           t.type = TOK.CatAssign;
         }
         else
           t.type = TOK.Tilde;
         goto Lcommon;
      case '*': /* *  *= */
         if (p[1] == '=') {
           ++p;
           t.type = TOK.MulAssign;
         }
         else
           t.type = TOK.Mul;
         goto Lcommon;
      case '^': /* ^  ^= */
         if (p[1] == '=') {
           ++p;
           t.type = TOK.XorAssign;
         }
         else
           t.type = TOK.Xor;
         goto Lcommon;
      case '%': /* %  %= */
         if (p[1] == '=') {
           ++p;
           t.type = TOK.ModAssign;
         }
         else
           t.type = TOK.Mod;
         goto Lcommon;
      // Single character tokens:
      case '(':
        t.type = TOK.LParen;
        goto Lcommon;
      case ')':
        t.type = TOK.RParen;
        goto Lcommon;
      case '[':
        t.type = TOK.LBracket;
        goto Lcommon;
      case ']':
        t.type = TOK.RBracket;
        goto Lcommon;
      case '{':
        t.type = TOK.LBrace;
        goto Lcommon;
      case '}':
        t.type = TOK.RBrace;
        goto Lcommon;
      case ':':
        t.type = TOK.Colon;
        goto Lcommon;
      case ';':
        t.type = TOK.Semicolon;
        goto Lcommon;
      case '?':
        t.type = TOK.Question;
        goto Lcommon;
      case ',':
        t.type = TOK.Comma;
        goto Lcommon;
      case '$':
        t.type = TOK.Dollar;
      Lcommon:
        // p is on the last character of the token; step past it.
        ++p;
      Lcommon2:
        // p is already one past the token.
        t.end = p;
        return;
      case '#':
        return scanSpecialTokenSequence(t);
      default:
      }

      // A non-ASCII alphabetic character starts an identifier as well.
      if (c & 128 && isUniAlpha(decodeUTF8()))
        goto Lidentifier;
      // Anything else (e.g. whitespace) is skipped.
      c = *++p;
    }
  }

  /// Scans a double-quoted string literal with escape sequences.
  ///
  /// The decoded, zero-terminated contents are stored in t.str and the
  /// optional postfix character in t.pf. All end-of-line forms inside
  /// the literal are converted to '\n'. Reaching the end of the source
  /// reports MID.UnterminatedString.
  void scanNormalStringLiteral(ref Token t)
  {
    assert(*p == '"');
    char[] buffer;
    t.type = TOK.String;
    uint c;
    while (1)
    {
      c = *++p;
      switch (c)
      {
      case '"':
        ++p;
      Lreturn:
        buffer ~= 0;
        t.str = buffer;
        t.pf = scanPostfix();
        t.end = p;
        return;
      case '\\':
        c = scanEscapeSequence();
        --p; // The loop header advances p again.
        // The escape yields a full code point, not a byte. Testing bit 7
        // only would treat values such as 0x100 as ASCII and append a
        // truncated byte, so compare against the ASCII range instead.
        if (c < 128)
          break;
        encodeUTF8(buffer, c);
        continue;
      case '\r':
        if (p[1] == '\n')
          ++p;
      case '\n':
        ++loc;
        c = '\n'; // Convert EndOfLine to \n.
        break;
      case 0, _Z_:
        error(MID.UnterminatedString);
        goto Lreturn;
      default:
        if (c & 128)
        {
//           char* begin = p;
          c = decodeUTF8();
          if (c == LSd || c == PSd)
            goto case '\n';

          // We don't copy per pointer because we might include
          // invalid, skipped utf-8 sequences. See decodeUTF8().
//           ++p;
//           buffer ~= begin[0 .. p - begin];
          encodeUTF8(buffer, c);
          continue;
        }
      }
      // Copy ASCII character.
      buffer ~= c;
    }
    assert(0);
  }

  /// Scans a character literal enclosed in single quotes.
  ///
  /// The character value is stored in t.dchar_. The token type is
  /// CharLiteral by default, WCharLiteral for \u escapes and decoded
  /// characters up to 0xFFFF, and DCharLiteral for \U escapes and larger
  /// decoded characters. An empty or unterminated literal is reported
  /// through error().
  void scanCharacterLiteral(ref Token t)
  {
    assert(*p == '\'');
    TOK kind = TOK.CharLiteral;
    MID id = MID.UnterminatedCharacterLiteral;
    bool failed = false; // Set when the literal is known to be malformed.

    ++p;
    switch (*p)
    {
    case '\'':
      // Two quotes in a row: skip the second one and complain.
      ++p;
      id = MID.EmptyCharacterLiteral;
      failed = true;
      break;
    case '\n', '\r', 0, _Z_:
      failed = true;
      break;
    case '\\':
      if (p[1] == 'u')
        kind = TOK.WCharLiteral;
      else if (p[1] == 'U')
        kind = TOK.DCharLiteral;
      t.dchar_ = scanEscapeSequence();
      break;
    default:
      uint c = *p;
      if (c & 128)
      {
        c = decodeUTF8();
        if (c == LSd || c == PSd)
        {
          // Unicode newlines end the literal prematurely.
          failed = true;
          break;
        }
        kind = (c <= 0xFFFF) ? TOK.WCharLiteral : TOK.DCharLiteral;
      }
      t.dchar_ = c;
      ++p;
    }

    // A well-formed literal must be closed by a quote.
    if (!failed && *p == '\'')
      ++p;
    else
      error(id);
    t.type = kind;
    t.end = p;
  }

  /// Consumes an optional string postfix character.
  /// Returns: 'c', 'w' or 'd' if p points to one of them (p is then
  ///          advanced past it), otherwise 0 and p is left unchanged.
  char scanPostfix()
  {
    char postfix = *p;
    if (postfix == 'c' || postfix == 'w' || postfix == 'd')
    {
      ++p;
      return postfix;
    }
    return 0;
  }

  /// Scans a raw string literal: either `...` or r"...".
  ///
  /// On entry p points to the opening delimiter ('`' or '"'). No escape
  /// sequences are processed; only end-of-line forms are converted to
  /// '\n'. The zero-terminated contents are stored in t.str and the
  /// optional postfix in t.pf.
  void scanRawStringLiteral(ref Token t)
  {
    uint delim = *p;
    assert(delim == '`' || delim == '"' && p[-1] == 'r');
    t.type = TOK.String;
    char[] buffer;
    uint c;
    while (1)
    {
      c = *++p;
      switch (c)
      {
      case '\r':
        if (p[1] == '\n')
          ++p;
      case '\n':
        c = '\n'; // Convert EndOfLine ('\r','\r\n','\n',LS,PS) to '\n'
        ++loc;
        break;
      case '`':
      case '"':
        if (c == delim)
        {
          ++p;
          t.pf = scanPostfix();
        Lreturn:
          t.str = buffer ~ '\0';
          t.end = p;
          return;
        }
        // The other quote character is ordinary content.
        break;
      case 0, _Z_:
        // delim holds the quote character, never the 'r' prefix, so the
        // r"..." form has to be recognized by its double quote.
        if (delim == '"')
          error(MID.UnterminatedRawString);
        else
          error(MID.UnterminatedBackQuoteString);
        goto Lreturn;
      default:
        if (c & 128)
        {
          c = decodeUTF8();
          if (c == LSd || c == PSd)
            goto case '\n';
          encodeUTF8(buffer, c);
          continue;
        }
      }
      buffer ~= c; // copy character to buffer
    }
    assert(0);
  }

  /// Scans a hex string literal of the form x"...".
  ///
  /// Pairs of hex digits are combined into bytes; whitespace and
  /// newlines between digits are ignored. The zero-terminated byte
  /// sequence is stored in t.str and the optional postfix in t.pf.
  /// An odd number of digits, non-hex characters and a missing closing
  /// quote are reported through error().
  void scanHexStringLiteral(ref Token t)
  {
    assert(p[0] == 'x' && p[1] == '"');
    t.type = TOK.String;

    uint c;
    ubyte[] buffer;
    ubyte h; // hex number
    uint n; // number of hex digits

    ++p;
    while (1)
    {
      c = *++p;
      switch (c)
      {
      case '"':
        ++p;
        if (n & 1)
          error(MID.OddNumberOfDigitsInHexString);
        t.pf = scanPostfix();
      Lreturn:
        buffer ~= 0;
        t.str = cast(string) buffer;
        t.end = p;
        return;
      case '\r':
        if (p[1] == '\n')
          ++p;
      case '\n':
        ++loc;
        continue;
      default:
        if (ishexad(c))
        {
          if (c <= '9')
            c -= '0';
          else if (c <= 'F')
            c -= 'A' - 10;
          else
            c -= 'a' - 10;

          if (n & 1)
          {
            // Second digit of a pair: complete the byte and store it.
            h <<= 4;
            h |= c;
            buffer ~= h;
          }
          else
            h = cast(ubyte)c;
          ++n;
          continue;
        }
        else if (isspace(c))
          continue;

        if (c & 128)
        {
          c = decodeUTF8();
          if (c == LSd || c == PSd)
          {
            // NOTE(review): decodeUTF8() is assumed to leave p on the
            // last byte of the sequence, as at its other call sites.
            // Advancing p any further here would skip bytes that follow
            // the newline (possibly the closing quote).
            ++loc;
            continue;
          }
        }
        else if (c == 0 || c == _Z_)
        {
          error(MID.UnterminatedHexString);
          t.pf = 0;
          goto Lreturn;
        }
        error(MID.NonHexCharInHexString, cast(dchar)c);
      }
    }
    assert(0);
  }

version(D2)
{
  /// Scans a delimited string literal of the form q"...".
  ///
  /// Three kinds of delimiters are handled:
  /// $(UL
  ///   $(LI nesting brackets: ( ) [ ] < > { })
  ///   $(LI any other single character)
  ///   $(LI an identifier followed by a newline; the string ends where
  ///        the whole identifier occurs again)
  /// )
  /// The zero-terminated contents are stored in t.str and the optional
  /// postfix in t.pf. End-of-line forms are converted to '\n'.
  void scanDelimitedStringLiteral(ref Token t)
  {
    assert(p[0] == 'q' && p[1] == '"');
    t.type = TOK.String;

    char[] buffer;
    dchar opening_delim, // Only set for '[', '(', '<', '{'
          closing_delim; // Will be ']', ')', '>', '}', any other character
                         // or the first, decoded character of an identifier.
    char[] str_delim; // Identifier delimiter
    uint level = 1;

    ++p; ++p; // Skip q"
    uint c = *p;
    switch (c)
    {
    case '(':
      opening_delim = c;
      closing_delim = ')'; // *p + 1
      break;
    case '[', '<', '{':
      opening_delim = c;
      closing_delim = c + 2; // Get to closing counterpart. Feature of ASCII table.
      break;
    default:
      char* begin = p;
      closing_delim = c;
      // TODO: What to do about newlines? Skip or accept as delimiter?
      // TODO: Check for non-printable characters?
      if (c & 128)
      {
        closing_delim = decodeUTF8();
        // Classify the decoded character; c still holds the lead byte.
        if (!isUniAlpha(closing_delim))
          break;
      }
      else if (!isidbeg(c))
        break;
      // Parse identifier + newline
      do
      { c = *++p; }
      while (isident(c) || c & 128 && isUniAlpha(decodeUTF8()))
      // Store identifier
      str_delim = begin[0..p-begin];
      // Scan newline
      switch (*p)
      {
      case '\r':
        if (p[1] == '\n')
          ++p;
      case '\n':
        ++loc;
        break;
      case LS[0]:
        if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
        {
          ++p; ++p;
          ++loc;
          break;
        }
        // goto default;
      default:
        // TODO: error(MID.ExpectedNewlineAfterIdentDelim);
      }
    }

    // Returns true if the identifier delimiter starts at p.
    bool checkStringDelim(char* p)
    {
      assert(str_delim.length != 0);
      if (end-p >= str_delim.length && // Check remaining length.
          p[0..str_delim.length] == str_delim) // Compare.
        return true;
      return false;
    }

    while (1)
    {
      c = *++p;
      switch (c)
      {
      case '\r':
        if (p[1] == '\n')
          ++p;
      case '\n':
        c = '\n'; // Convert EndOfLine ('\r','\r\n','\n',LS,PS) to '\n'
        ++loc;
        break;
      case 0, _Z_:
//         error(MID.UnterminatedDelimitedString);
        goto Lreturn3;
      default:
        if (c & 128)
        {
          auto begin = p;
          c = decodeUTF8();
          if (c == LSd || c == PSd)
            goto case '\n';
          if (c == closing_delim)
          {
            if (str_delim.length)
            {
              if (checkStringDelim(begin))
              {
                p = begin + str_delim.length;
                goto Lreturn2;
              }
              // Only the first character of the identifier matched;
              // it is ordinary string content.
            }
            else
            {
              assert(level == 1);
              --level;
              goto Lreturn;
            }
          }
          encodeUTF8(buffer, c);
          continue;
        }
        else
        {
          if (c == opening_delim)
            ++level;
          else if (c == closing_delim)
          {
            if (str_delim.length)
            {
              if (checkStringDelim(p))
              {
                p += str_delim.length;
                goto Lreturn2;
              }
              // A partial match of the identifier delimiter does not
              // terminate the string.
            }
            else if (--level == 0)
              goto Lreturn;
          }
        }
      }
      buffer ~= c; // copy character to buffer
    }
  Lreturn:
    // For a multi-byte delimiter p is on its last byte, which cannot be
    // compared with the decoded character.
    assert(closing_delim >= 128 || *p == closing_delim);
    assert(level == 0);
    ++p; // Skip closing delimiter.
  Lreturn2:
    if (*p == '"')
      ++p;
    // else
    // TODO: error(MID.ExpectedDblQuoteAfterDelim, str_delim.length ? str_delim : p[-1]);

    t.pf = scanPostfix();
  Lreturn3:
    t.str = buffer ~ '\0';
    t.end = p;
  }

  /// Scans a token string literal of the form q{...}.
  ///
  /// The contents are tokenized with scan() until the matching closing
  /// brace (or the end of the source) is found. The scanned tokens are
  /// kept as a linked list in t.tok_str. The raw source text between
  /// the braces, with end-of-line forms converted to '\n' and a
  /// terminating '\0', is stored in t.str.
  void scanTokenStringLiteral(ref Token t)
  {
    assert(p[0] == 'q' && p[1] == '{');
    t.type = TOK.String;
    // Remember members that tokens inside the string (e.g. #line) might
    // change, so that they can be restored at the end.
    auto savedLocOld = this.loc_old;
    auto savedLocHline = this.loc_hline;
    auto savedFileName = this.fileName;

    uint startLoc = this.loc; // To detect whether any newline was scanned.
    uint depth = 1; // Brace nesting depth.

    p += 2; // Skip q{

    Token* last = &t;
    Token* token;
    for (;;)
    {
      token = new Token;
      scan(*token);
      // Append to the doubly linked list that starts at t.
      // Could be useful for various tools.
      token.prev = last;
      last.next = token;
      last = token;

      if (token.type == TOK.LBrace)
        ++depth;
      else if (token.type == TOK.RBrace)
      {
        if (--depth == 0)
        {
          // Detach the list from t; it is only reachable via tok_str.
          t.tok_str = t.next;
          t.next = null;
          break;
        }
      }
      else if (token.type == TOK.EOF)
      {
        // TODO: error(MID.UnterminatedTokenString);
        t.tok_str = t.next;
        t.next = token;
        break;
      }
    }

    assert(token.type == TOK.RBrace || token.type == TOK.EOF);
    assert(token.type == TOK.RBrace && t.next is null ||
           token.type == TOK.EOF && t.next !is null);

    char[] buffer;
    // token points to } or EOF
    if (token.type != TOK.EOF)
    {
      // The text has to be copied before scanPostfix() moves p.
      t.end = p;
      buffer = t.srcText[2..$-1].dup ~ '\0';
      t.pf = scanPostfix();
      t.end = p;
    }
    else
    {
      t.end = token.start;
      buffer = t.srcText[2..$].dup ~ '\0';
    }

    // Convert EndOfLines to '\n' in place. The write index never
    // overtakes the read index.
    if (startLoc != this.loc)
    {
      assert(buffer[$-1] == '\0');
      uint src, dst;
      while (src < buffer.length)
      {
        char ch = buffer[src];
        bool isNewline = false;
        if (ch == '\n')
          isNewline = true;
        else if (ch == '\r')
        {
          if (buffer[src+1] == '\n')
            ++src;
          isNewline = true;
        }
        else if (ch == LS[0])
        {
          auto rest = buffer[src..$];
          if (rest[1] == LS[1] && (rest[2] == LS[2] || rest[2] == PS[2]))
          {
            src += 2;
            isNewline = true;
          }
        }
        buffer[dst++] = isNewline ? '\n' : ch;
        ++src;
      }
      buffer.length = dst; // Adjust length
    }
    assert(buffer[$-1] == '\0');
    t.str = buffer;

    // Restore possibly changed members.
    this.loc_old = savedLocOld;
    this.loc_hline = savedLocHline;
    this.fileName = savedFileName;
  }
}

  /// Scans an escape sequence; p must point to the backslash.
  ///
  /// Handles the simple escapes known to char2ev(), \x, \u and \U hex
  /// escapes (2, 4 and 8 digits), octal escapes of up to three digits
  /// and named entities of the form \&name;. Malformed sequences are
  /// reported through error().
  /// Returns: the value of the escape sequence.
  dchar scanEscapeSequence()
  {
    assert(*p == '\\');
    ++p;
    uint c = char2ev(*p);
    if (c != 0)
    {
      // Simple one-character escape.
      ++p;
      return c;
    }

    // Number of hex digits required by the escape; 0 if it isn't one.
    uint digits = 0;
    switch (*p)
    {
    case 'x': digits = 2; break;
    case 'u': digits = 4; break;
    case 'U': digits = 8; break;
    default:
    }

    if (digits != 0)
    {
      c = 0;
      do
      {
        ++p;
        if (!ishexad(*p))
        {
          error(MID.InsufficientHexDigits);
          break;
        }
        c *= 16;
        if (*p <= '9')
          c += *p - '0';
        else if (*p <= 'F')
          c += *p - 'A' + 10;
        else
          c += *p - 'a' + 10;
        // Step past the last digit once all of them have been read.
        if (--digits == 0)
          ++p;
      } while (digits != 0)
      if (!isValidDchar(c))
        error(MID.InvalidUnicodeCharacter);
      return c;
    }

    if (isoctal(*p))
    {
      // Up to three octal digits.
      c = 0;
      for (uint n = 0; n < 3 && isoctal(*p); ++n)
      {
        c *= 8;
        c += *p - '0';
        ++p;
      }
      return c;
    }

    if (*p == '&')
    {
      if (!isalpha(*++p))
      {
        error(MID.InvalidBeginHTMLEntity);
        return c;
      }
      auto begin = p;
      while (isalnum(*++p))
      {}

      if (*p != ';')
      {
        error(MID.UnterminatedHTMLEntity);
        return c;
      }
      c = entity2Unicode(begin[0..p - begin]);
      ++p; // Skip ;
      if (c == 0xFFFF)
        error(MID.UndefinedHTMLEntity, (begin-1)[0..p-(begin-1)]);
      return c;
    }

    // Anything else is an undefined escape sequence.
    // TODO: add parameter to localized strings
    dchar d = *p;
    char[] str = `\`;
    if (d & 128)
    {
      d = decodeUTF8();
      encodeUTF8(str, d);
      ++p;
    }
    else
      str ~= d;
    error(MID.UndefinedEscapeSequence/+, str+/);

    return c;
  }

  /*
    IntegerLiteral:= (Dec|Hex|Bin|Oct)Suffix?
    Dec:= (0|[1-9][0-9_]*)
    Hex:= 0[xX] HexDigits
    Bin:= 0[bB][01_]+
    Oct:= 0[0-7_]+
    Suffix:= (L[uU]?|[uU]L?)
    HexDigits:= [0-9a-fA-F_]+

    Invalid: "0b_", "0x_", "._"
  */
  /// Scans an integer literal (decimal, hexadecimal, binary or octal)
  /// with an optional L/u/U suffix.
  ///
  /// If the number turns out to be a floating point literal, scanning
  /// is handed over to scanReal() or scanHexReal(). Otherwise the value
  /// is stored in t.ulong_ and t.type is set to Int32, Uint32, Int64 or
  /// Uint64 depending on the value, the radix and the suffix.
  /// Overflows and malformed numbers are reported through error().
  void scanNumber(ref Token t)
  {
    ulong ulong_;
    bool overflow;
    bool isDecimal;
    size_t digits;

    if (*p != '0')
      goto LscanInteger;
    ++p; // skip zero
    // check for xX bB ...
    switch (*p)
    {
    case 'x','X':
      goto LscanHex;
    case 'b','B':
      goto LscanBin;
    case 'L':
      // "0L" is the integer zero with a long suffix; only "0Li" is an
      // imaginary real.
      if (p[1] != 'i')
        break;
      goto LscanReal;
    case '.':
      // "0.." is the number zero followed by a slice operator.
      if (p[1] == '.')
        break;
      goto LscanReal;
    case 'i','f','F', 'e', 'E': // Imaginary and float literal suffix
      goto LscanReal;
    default:
      if (*p == '_' || isoctal(*p))
        goto LscanOct;
    }

    // Number 0
    assert(p[-1] == '0');
    assert(ulong_ == 0);
    isDecimal = true;
    goto Lfinalize;

  LscanInteger:
    assert(*p != 0 && isdigit(*p));
    isDecimal = true;
    goto Lenter_loop_int;
    while (1)
    {
      if (*++p == '_')
        continue;
      if (!isdigit(*p))
        break;
    Lenter_loop_int:
      // ulong.max ends in the digit 5, hence the check on the last digit.
      if (ulong_ < ulong.max/10 || (ulong_ == ulong.max/10 && *p <= '5'))
      {
        ulong_ *= 10;
        ulong_ += *p - '0';
        continue;
      }
      // Overflow: skip following digits.
      overflow = true;
      while (isdigit(*++p)) {}
      break;
    }

    // The number could be a float, so check overflow below.
    switch (*p)
    {
    case '.':
      if (p[1] != '.')
        goto LscanReal;
      break;
    case 'L':
      if (p[1] != 'i')
        break;
    case 'i', 'f', 'F', 'e', 'E':
      goto LscanReal;
    default:
    }

    if (overflow)
      error(MID.OverflowDecimalNumber);

    assert((isdigit(p[-1]) || p[-1] == '_') && !isdigit(*p) && *p != '_');
    goto Lfinalize;

  LscanHex:
    assert(digits == 0);
    assert(*p == 'x' || *p == 'X'); // Both prefix forms jump here.
    while (1)
    {
      if (*++p == '_')
        continue;
      if (!ishexad(*p))
        break;
      ++digits;
      ulong_ *= 16;
      if (*p <= '9')
        ulong_ += *p - '0';
      else if (*p <= 'F')
        ulong_ += *p - 'A' + 10;
      else
        ulong_ += *p - 'a' + 10;
    }

    switch (*p)
    {
    case '.':
      if (p[1] != '.')
        goto LscanHexReal;
      break;
    case 'L':
      if (p[1] != 'i')
        break;
    case 'i', 'p', 'P':
      goto LscanHexReal;
    default:
    }
    if (digits == 0)
      error(MID.NoDigitsInHexNumber);
    else if (digits > 16)
    {
      // All hex digits have already been consumed by the loop above and
      // p is on the first character after them, so nothing is skipped.
      error(MID.OverflowHexNumber);
    }
    goto Lfinalize;
  LscanHexReal:
    return scanHexReal(t);

  LscanBin:
    assert(digits == 0);
    assert(*p == 'b' || *p == 'B'); // Both prefix forms jump here.
    while (1)
    {
      if (*++p == '0')
      {
        ++digits;
        ulong_ *= 2;
      }
      else if (*p == '1')
      {
        ++digits;
        ulong_ *= 2;
        ulong_ += *p - '0';
      }
      else if (*p == '_')
        continue;
      else
        break;
    }

    if (digits == 0)
      error(MID.NoDigitsInBinNumber);

    if (digits > 64)
      error(MID.OverflowBinaryNumber);
    assert(p[-1] == '0' || p[-1] == '1' || p[-1] == '_', p[-1] ~ "");
    assert( !(*p == '0' || *p == '1' || *p == '_') );
    goto Lfinalize;

  LscanOct:
    assert(*p == '_' || isoctal(*p));
    if (*p != '_')
      goto Lenter_loop_oct;
    while (1)
    {
      if (*++p == '_')
        continue;
      if (!isoctal(*p))
        break;
    Lenter_loop_oct:
      // Multiplying by 8 is safe as long as the value does not exceed
      // ulong.max/8; any octal digit can then still be added.
      if (ulong_ <= ulong.max/8)
      {
        ulong_ *= 8;
        ulong_ += *p - '0';
        // The loop header advances p; doing it here too would skip
        // every second digit.
        continue;
      }
      // Overflow: skip following digits.
      overflow = true;
      while (isdigit(*++p)) {}
      break;
    }

    bool hasDecimalDigits;
    if (isdigit(*p))
    {
      hasDecimalDigits = true;
      while (isdigit(*++p)) {}
    }

    // The number could be a float, so check errors below.
    switch (*p)
    {
    case '.':
      if (p[1] != '.')
        goto LscanReal;
      break;
    case 'L':
      if (p[1] != 'i')
        break;
    case 'i', 'f', 'F', 'e', 'E':
      goto LscanReal;
    default:
    }

    if (hasDecimalDigits)
      error(MID.OctalNumberHasDecimals);
    if (overflow)
      error(MID.OverflowOctalNumber);
//     goto Lfinalize;

  Lfinalize:
    enum Suffix
    {
      None     = 0,
      Unsigned = 1,
      Long     = 2
    }

    // Each suffix may appear at most once, in either order.
    Suffix suffix;
    while (1)
    {
      switch (*p)
      {
      case 'L':
        if (suffix & Suffix.Long)
          break;
        suffix |= Suffix.Long;
        ++p;
        continue;
      case 'u', 'U':
        if (suffix & Suffix.Unsigned)
          break;
        suffix |= Suffix.Unsigned;
        ++p;
        continue;
      default:
        break;
      }
      break;
    }

    // Determine the smallest fitting type for the value and suffix.
    switch (suffix)
    {
    case Suffix.None:
      if (ulong_ & 0x8000000000000000)
      {
        if (isDecimal)
          error(MID.OverflowDecimalSign);
        t.type = TOK.Uint64;
      }
      else if (ulong_ & 0xFFFFFFFF00000000)
        t.type = TOK.Int64;
      else if (ulong_ & 0x80000000)
        t.type = isDecimal ? TOK.Int64 : TOK.Uint32;
      else
        t.type = TOK.Int32;
      break;
    case Suffix.Unsigned:
      if (ulong_ & 0xFFFFFFFF00000000)
        t.type = TOK.Uint64;
      else
        t.type = TOK.Uint32;
      break;
    case Suffix.Long:
      if (ulong_ & 0x8000000000000000)
      {
        if (isDecimal)
          error(MID.OverflowDecimalSign);
        t.type = TOK.Uint64;
      }
      else
        t.type = TOK.Int64;
      break;
    case Suffix.Unsigned | Suffix.Long:
      t.type = TOK.Uint64;
      break;
    default:
      assert(0);
    }
    t.ulong_ = ulong_;
    t.end = p;
    return;
  LscanReal:
    scanReal(t);
    return;
  }

  /*
    FloatLiteral:= Float[fFL]?i?
    Float:= DecFloat | HexFloat
    DecFloat:= ([0-9][0-9_]*[.][0-9_]*DecExponent?) | [.][0-9][0-9_]*DecExponent? | [0-9][0-9_]*DecExponent
    DecExponent:= [eE][+-]?[0-9][0-9_]*
    HexFloat:= 0[xX](HexDigits[.]HexDigits | [.][0-9a-fA-F]HexDigits? | HexDigits)HexExponent
    HexExponent:= [pP][+-]?[0-9][0-9_]*
  */
  /++
    Scans the remainder of a decimal floating point literal.

    On entry p points either at the '.' of the literal or at the first
    character following the integer digits (an exponent or a suffix).
    The literal's text, starting at t.start, is copied without underscores
    into a zero-terminated buffer which is handed to finalizeFloat().
  +/
  void scanReal(ref Token t)
  {
    if (*p != '.')
    {
      // This function was called by scanNumber().
      debug switch (*p)
      {
      case 'L':
        if (p[1] != 'i')
          assert(0);
      case 'i', 'f', 'F', 'e', 'E': break;
      default: assert(0);
      }
    }
    else
    {
      // This function was called by scan() or scanNumber().
      // Consume the fractional digits.
      ++p;
      while (isdigit(*p) || *p == '_')
        ++p;
    }

    // Scan exponent.
    if (*p == 'e' || *p == 'E')
    {
      ++p;
      if (*p == '-' || *p == '+')
        ++p;
      if (isdigit(*p))
      {
        ++p;
        while (isdigit(*p) || *p == '_')
          ++p;
      }
      else
        error(MID.FloatExponentDigitExpected);
    }

    // Copy the literal into a buffer, dropping the underscores.
    // At least one character has been consumed, so t.start != stop.
    char* stop = p;
    char[] digits;
    for (p = t.start; p != stop; ++p)
      if (*p != '_')
        digits ~= *p;
    digits ~= 0; // Terminate for C functions.
    finalizeFloat(t, digits);
  }

  /++
    Scans the remainder of a hexadecimal floating point literal.

    On entry p points at '.', 'p', 'P', 'i' or "Li" following the hex digits
    that scanNumber() has already consumed. A binary exponent is mandatory;
    when it is missing, or when it has no digits, an error is reported and
    the token is typed as Float32.

    The exponent may carry an optional '+' or '-' sign
    (HexExponent:= [pP][+-]?[0-9][0-9_]*).
  +/
  void scanHexReal(ref Token t)
  {
    assert(*p == '.' || *p == 'i' || *p == 'p' || *p == 'P' || (*p == 'L' && p[1] == 'i'));
    MID mid;
    if (*p == '.')
      while (ishexad(*++p) || *p == '_') {}
    if (*p != 'p' && *p != 'P')
    {
      mid = MID.HexFloatExponentRequired;
      goto Lerr;
    }
    // Copy mantissa to a buffer ignoring underscores.
    char* end = p;
    p = t.start;
    char[] buffer;
    do
    {
      if (*p == '_')
      {
        ++p;
        continue;
      }
      buffer ~= *p;
      ++p;
    } while (p != end)

    assert(p == end && (*p == 'p' || *p == 'P'));
    // Scan and copy the exponent.
    buffer ~= 'p';
    ++p; // Skip 'p' or 'P'.
    // Optional sign. It is copied so that the C conversion function sees it.
    if (*p == '-' || *p == '+')
    {
      buffer ~= *p;
      ++p;
    }
    // Remember the length after the sign so that "p-" alone is still
    // detected as an exponent without digits.
    size_t bufflen = buffer.length;
    for (; ; ++p)
    {
      if (*p == '_')
        continue;
      if (!isdigit(*p))
        break;
      buffer ~= *p;
    }
    // When the buffer length hasn't changed, no digits were copied.
    if (bufflen == buffer.length) {
      mid = MID.HexFloatMissingExpDigits;
      goto Lerr;
    }
    buffer ~= 0; // Terminate for C functions.
    finalizeFloat(t, buffer);
    return;
  Lerr:
    t.type = TOK.Float32;
    t.end = p;
    error(mid);
  }

  /++
    Converts the text of a scanned float literal and consumes its suffixes.

    Params:
      t      = the token receiving the type, the value and the end pointer.
      buffer = zero-terminated text of the literal without underscores.

    The suffix at *p selects the type: 'f'/'F' gives Float32, 'L' gives
    Float80 and anything else Float64. A following 'i' switches the token to
    the imaginary counterpart of the selected type.
  +/
  void finalizeFloat(ref Token t, string buffer)
  {
    // Float number is well-formed. Check suffixes and do conversion.
    char suffix = *p;
    if (suffix == 'f' || suffix == 'F')
    {
      t.type = TOK.Float32;
      t.float_ = strtof(buffer.ptr, null);
      ++p;
    }
    else if (suffix == 'L')
    {
      t.type = TOK.Float80;
      t.real_ = strtold(buffer.ptr, null);
      ++p;
    }
    else
    {
      t.type = TOK.Float64;
      t.double_ = strtod(buffer.ptr, null);
    }

    if (*p == 'i')
    {
      ++p;
      // Switch to imaginary counterpart.
      // NOTE(review): presumably the imaginary token types are declared
      // three entries after the real ones in TOK; confirm in dil.Token.
      t.type += 3;
    }

    // NOTE(review): errno is not cleared before the conversion above, so an
    // ERANGE left over from an earlier call would also be reported here.
    if (getErrno() == ERANGE)
      error(MID.OverflowFloatNumber);
    t.end = p;
  }

  /++
    Scan special token: #line Integer [Filespec] EndOfLine

    On entry p points at '#'. The token is typed as TOK.HashLine. The number
    is scanned into a new token stored in t.line_num, and the optional
    quoted filespec into a new token stored in t.line_filespec.

    On success loc_old and loc_hline are updated (they are used by
    errorLoc()), and fileName is replaced when a filespec was given.
    On failure t.end is set to the current position and an error is reported.
  +/
  void scanSpecialTokenSequence(ref Token t)
  {
    assert(*p == '#');

    t.type = TOK.HashLine;

    MID mid; // Message id of the error reported at Lerr.

    ++p;
    if (p[0] != 'l' || p[1] != 'i' || p[2] != 'n' || p[3] != 'e')
    {
      mid = MID.ExpectedIdentifierSTLine;
      goto Lerr;
    }
    // Leave p on the 'e' of "line"; the loop below pre-increments.
    p += 3;

    // What is expected next: the line number, then the optional filespec,
    // then nothing but the end of the line.
    enum State
    { Number, Filespec, End }

    State state;

  Loop:
    while (1)
    {
      switch (*++p)
      {
      case '\r':
        if (p[1] == '\n')
          ++p;
        // Intentional fall-through to the end-of-line cases.
      case '\n', 0, _Z_:
        break Loop;
      case LS[0]:
        // LS and PS share their first two UTF-8 bytes; only the third differs.
        if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
        {
          ++p; ++p;
          break Loop;
        }
        goto default;
      default:
        if (isspace(*p))
          continue;
        if (state == State.Number)
        {
          if (!isdigit(*p))
          {
            mid = MID.ExpectedNumberAfterSTLine;
            goto Lerr;
          }
          t.line_num = new Token;
          scan(*t.line_num);
          // Step back one character because the loop pre-increments p.
          --p;
          state = State.Filespec;
        }
        else if (state == State.Filespec)
        {
          if (*p != '"')
          {
            mid = MID.ExpectedFilespec;
            goto Lerr;
          }
          t.line_filespec = new Token;
          t.line_filespec.start = p;
          t.line_filespec.type = TOK.Filespec;
          // Scan up to the closing quote; a line break or the end of the
          // text before it is an error.
          while (1)
          {
            switch (*++p)
            {
            case '"':
              break;
            case LS[0]:
              if (!(p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])))
                goto default;
              // Intentional fall-through: LS/PS terminates the line.
            case '\r', '\n', 0, _Z_:
              mid = MID.UnterminatedFilespec;
              t.line_filespec.end = p;
              goto Lerr;
            default:
              if (*p & 128)
                decodeUTF8();
              continue;
            }
            break; // Exit loop.
          }
          auto start = t.line_filespec.start +1; // +1 skips '"'
          t.line_filespec.str = start[0 .. p - start];
          t.line_filespec.end = p + 1;
          state = State.End;
        }
        else/+ if (state == State.End)+/
        {
          // Something other than whitespace follows the filespec.
          mid = MID.UnterminatedSpecialToken;
          goto Lerr;
        }
      }
    }

    // The line ended before a number was found.
    if (state == State.Number)
    {
      mid = MID.ExpectedNumberAfterSTLine;
      goto Lerr;
    }

    // Remember where the #line was seen and which number it sets;
    // errorLoc() computes reported line numbers from these two values.
    this.loc_old = this.loc;
    this.loc_hline = t.line_num.uint_ - 1;
    if (t.line_filespec)
      this.fileName = t.line_filespec.str;
    t.end = p;

    return;
  Lerr:
    t.end = p;
    error(mid);
  }

  /// Returns the line number to report for an error at the current position:
  /// the lines scanned since the last #line plus the number set by it.
  uint errorLoc()
  {
    // ∆loc + line_num_of(#line)
    uint linesSinceHashLine = this.loc - this.loc_old;
    return linesSinceHashLine + this.loc_hline;
  }

  /++
    Decodes the UTF-8 sequence starting at p.

    p must point at a non-ASCII byte. On success p is left on the last byte
    of the decoded sequence and the decoded character is returned.

    On failure MID.InvalidUTF8Sequence is reported, the offending byte and
    all bytes after it that cannot begin a sequence are skipped, and p is
    left on the byte just before the next possible start of a sequence.
    The returned character is then the default-initialized dchar.
  +/
  dchar decodeUTF8()
  {
    assert(*p & 128, "check for ASCII char before calling decodeUTF8().");
    size_t idx;
    dchar d;
    try
    {
      d = std.utf.decode(p[0 .. end-p], idx);
      p += idx -1;
    }
    catch (UtfException e)
    {
      error(MID.InvalidUTF8Sequence);
      // Skip to next valid utf-8 sequence.
      // UTF8stride is 0xFF for bytes that cannot begin a sequence, so keep
      // advancing while such a byte is seen. The bounds check comes before
      // the dereference so that *end is never read.
      while (++p < end && UTF8stride[*p] == 0xFF) {}
      --p;
      assert(p < end);
    }
    return d;
  }

  /// Enters every entry of the keywords list into idtable, keyed by its text.
  void loadKeywords()
  {
    foreach (keyword; keywords)
    {
      this.idtable[keyword.str] = keyword;
    }
  }
/+ // Not needed anymore because tokens are stored in a linked list.
  struct State
  {
    Lexer lexer;
    Token token;
    char* scanPointer;
    int loc;
    string fileName;
    size_t errorLen;
    static State opCall(Lexer lx)
    {
      State s;
      s.lexer = lx;
      s.token = lx.token;
      s.scanPointer = lx.p;
      s.loc = lx.loc;
      s.fileName = lx.fileName;
      s.errorLen = lx.errors.length;
      return s;
    }
    void restore()
    {
      lexer.p = scanPointer;
      lexer.token = token;
      lexer.loc = loc;
      lexer.fileName = fileName;
      lexer.errors = lexer.errors[0..errorLen];
    }
  }

  State getState()
  {
    return State(this);
  }
+/

  /++
    Advances t to the token following it in the linked list.

    When t has no successor yet and t is not the tail of the list, a new
    token is scanned and appended first. At the tail t is left unchanged.
  +/
  private void scanNext(ref Token* t)
  {
    assert(t !is null);

    // The next token has already been scanned.
    if (t.next !is null)
    {
      t = t.next;
      return;
    }

    // Nothing follows the tail.
    if (t == this.tail)
      return;

    // Scan a new token and link it behind t.
    Token* scanned = new Token;
    scan(*scanned);
    scanned.prev = t;
    t.next = scanned;
    t = scanned;
  }

  /// Moves t to the token after it, scanning a new token when necessary.
  /// The lexer's own current token is not touched by this call.
  void peek(ref Token* t)
  {
    this.scanNext(t);
  }

  /// Advances the lexer's current token by one and returns its type.
  TOK nextToken()
  {
    scanNext(this.token);
    TOK type = this.token.type;
    return type;
  }

  /// Appends a lexer error with message id and the given format arguments
  /// to the errors list. The reported line is taken from errorLoc().
  void error(MID id, ...)
  {
//     if (reportErrors)
    auto info = new Information(InfoType.Lexer, id, this.errorLoc, arguments(_arguments, _argptr));
    this.errors ~= info;
  }

  // Checks that peek() walks through the tokens of a small source text
  // in order and finishes with the EOF token.
  unittest
  {
    writefln("Testing method Lexer.peek()");
    string sourceText = "unittest { }";
    auto lx = new Lexer(sourceText, null);

    TOK[] expected = [TOK.Unittest, TOK.LBrace, TOK.RBrace, TOK.EOF];

    Token* cursor = lx.head;
    foreach (type; expected)
    {
      lx.peek(cursor);
      assert(cursor.type == type);
    }
  }

  /// Scans until the EOF token is reached and returns the head of the
  /// token list.
  Token* getTokens()
  {
    for (;;)
    {
      if (nextToken() == TOK.EOF)
        break;
    }
    return this.head;
  }

  /++
    Tells whether ident is a well-formed identifier that is not a keyword.

    Returns false when ident is empty, when its first character cannot start
    an identifier, when any later character cannot be part of one, when it
    is not valid UTF-8, or when it is found in the keywords list.
  +/
  static bool isNonReservedIdentifier(char[] ident)
  {
    if (ident.length == 0)
      return false;

    // Built from the keywords list on the first call.
    static Identifier[string] reserved_ids_table;
    if (reserved_ids_table is null)
      foreach(k; keywords)
        reserved_ids_table[k.str] = k;

    size_t idx = 1; // Index to the 2nd character in ident.
    int isFirstCharUniAlpha()
    {
      idx = 0;
      // NB: decode() could throw an Exception which would be
      // caught by the next try-catch-block.
      // decode() advances idx past the first (multi-byte) character.
      return isUniAlpha(std.utf.decode(ident, idx));
    }

    try
    {
      // Reject the text right away when the first character cannot begin
      // an identifier (e.g. "1abc"), instead of falling through to the
      // keyword lookup.
      if (!(isidbeg(ident[0]) ||
            ident[0] & 128 && isFirstCharUniAlpha()))
        return false;

      foreach (dchar c; ident[idx..$])
      {
        // The ASCII table may only be consulted for ASCII characters;
        // narrowing a larger code point to char would look up the wrong entry.
        if (c < 128 ? !isident(cast(char)c) : !isUniAlpha(c))
          return false;
      }
    }
    catch (Exception)
      return false;

    return !(ident in reserved_ids_table);
  }

  /++
    Appends the UTF-8 encoding of d to str.

    d must not be an ASCII character. Sequences of two to six bytes are
    produced, depending on the magnitude of d. For values of 0x80000000 and
    above MID.InvalidUnicodeCharacter is reported and str is left unchanged.
  +/
  private void encodeUTF8(inout char[] str, dchar d)
  {
    assert(d > 0x7F, "check for ASCII char before calling encodeUTF8().");

    // Choose the sequence length and the marker bits of the lead byte.
    uint nbytes;
    uint leadBits;
    if (d < 0x800)
    {
      nbytes = 2;
      leadBits = 0xC0;
    }
    else if (d < 0x10000)
    {
      nbytes = 3;
      leadBits = 0xE0;
    }
    else if (d < 0x200000)
    {
      nbytes = 4;
      leadBits = 0xF0;
    }
    else if (d < 0x4000000)
    {
      nbytes = 5;
      leadBits = 0xF8;
    }
    else if (d < 0x80000000)
    {
      nbytes = 6;
      leadBits = 0xFC;
    }
    else
    {
      error(MID.InvalidUnicodeCharacter);
      return;
    }

    // Fill the trail bytes from last to first, six bits at a time.
    // What remains afterwards belongs into the lead byte.
    char[6] bytes;
    uint bits = d;
    for (uint i = nbytes - 1; i > 0; --i)
    {
      bytes[i] = cast(char)(0x80 | (bits & 0x3F));
      bits >>= 6;
    }
    bytes[0] = cast(char)(leadBits | bits);
    str ~= bytes[0 .. nbytes];
  }
}

// Scans a text made of all operator tokens separated by spaces and checks
// that the source text of each scanned token matches the operator.
unittest
{
  writefln("Testing Lexer.");
  string[] expected = [
    ">",    ">=", ">>",  ">>=", ">>>", ">>>=",
    "<",    "<=", "<>",  "<>=", "<<",  "<<=",
    "!",    "!<", "!>",  "!<=", "!>=", "!<>",
    "!<>=", ".",  "..",  "...", "&",   "&&",
    "&=",   "+",  "++",  "+=",  "-",   "--",
    "-=",   "=",  "==",  "~",   "~=",  "*",
    "*=",   "/",  "/=",  "^",   "^=",  "%",
    "%=",   "(",  ")",   "[",   "]",   "{",
    "}",    ":",  ";",   "?",   ",",   "$"
  ];

  // Build the source text: every operator followed by one space.
  char[] source;
  foreach (op; expected)
  {
    source ~= op;
    source ~= " ";
  }

  auto lx = new Lexer(source, "");
  Token* tok = lx.getTokens();
  assert(tok == lx.head);

  // Start with the first token after the head.
  tok = tok.next;
  uint index = 0;
  while (1)
  {
    assert(index < expected.length);
    assert(tok.srcText == expected[index], std.string.format("Scanned '%s' but expected '%s'", tok.srcText, expected[index]));
    ++index;
    tok = tok.next;
    if (tok.type == TOK.EOF)
      break;
  }
}

// Placeholder for a number-scanning test: the block contains no statements.
// The comments inside list literals that such a test could cover.
unittest
{
  // Numbers unittest
  // Mixed integer and float literals with suffixes and underscores:
  // 0L 0ULi 0_L 0_UL 0x0U 0x0p2 0_Fi 0_e2 0_F 0_i
  // Integer suffix combinations:
  // 0u 0U 0uL 0UL 0L 0LU 0Lu
  // Float and imaginary suffixes:
  // 0Li 0f 0F 0fi 0Fi 0i
  // Binary literals:
  // 0b_1_LU 0b1000u
  // Hexadecimal literal:
  // 0x232Lu
}

/// ASCII character properties table.
///
/// The table is indexed by the character value. The low byte of an entry
/// holds the CProperty flags of the character; the bits above it hold the
/// value of the escape sequence named by the character (see char2ev()),
/// e.g. 0x2700 for '\''. All entries from 128 to 255 are zero.
///
/// The literal matches the output format of the static constructor that is
/// compiled in with version(gen_ptable).
static const int ptable[256] = [
 0, 0, 0, 0, 0, 0, 0, 0, 0,32, 0,32,32, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
32, 0, 0x2200, 0, 0, 0, 0, 0x2700, 0, 0, 0, 0, 0, 0, 0, 0,
 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 0, 0, 0, 0, 0, 0x3f00,
 0,12,12,12,12,12,12, 8, 8, 8, 8, 8, 8, 8, 8, 8,
 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0x5c00, 0, 0,16,
 0, 0x70c, 0x80c,12,12,12, 0xc0c, 8, 8, 8, 8, 8, 8, 8, 0xa08, 8,
 8, 8, 0xd08, 8, 0x908, 8, 0xb08, 8, 8, 8, 8, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
];

/// Character property flags stored in the low byte of the ptable entries.
enum CProperty
{
  Octal      = 0x01, /// '0' to '7'
  Digit      = 0x02, /// '0' to '9'
  Hex        = 0x04, /// A digit, 'a' to 'f' or 'A' to 'F'
  Alpha      = 0x08, /// 'a' to 'z' or 'A' to 'Z'
  Underscore = 0x10, /// '_'
  Whitespace = 0x20  /// ' ', '\t', '\v' or '\f'
}

/// Bit mask for the escape value stored in a ptable entry.
const uint EVMask = 0xFF00;

private alias CProperty CP;

// The functions below look up c in ptable. Each returns a non-zero value
// when c has one of the tested properties and zero otherwise.

/// Tests for an octal digit.
int isoctal(char c)
{
  return ptable[c] & CP.Octal;
}

/// Tests for a decimal digit.
int isdigit(char c)
{
  return ptable[c] & CP.Digit;
}

/// Tests for a hexadecimal digit.
int ishexad(char c)
{
  return ptable[c] & CP.Hex;
}

/// Tests for an ASCII letter.
int isalpha(char c)
{
  return ptable[c] & CP.Alpha;
}

/// Tests for an ASCII letter or a decimal digit.
int isalnum(char c)
{
  return ptable[c] & (CP.Alpha | CP.Digit);
}

/// Tests for a character that may begin an identifier.
int isidbeg(char c)
{
  return ptable[c] & (CP.Alpha | CP.Underscore);
}

/// Tests for a character that may appear inside an identifier.
int isident(char c)
{
  return ptable[c] & (CP.Alpha | CP.Underscore | CP.Digit);
}

/// Tests for a whitespace character (line breaks are not included).
int isspace(char c)
{
  return ptable[c] & CP.Whitespace;
}

/// Returns the escape value stored for c, or zero when there is none.
int char2ev(char c)
{
  return ptable[c] >> 8; /*(ptable[c] & EVMask) >> 8;*/
}

// Generator for the ptable literal. It is compiled in only with
// version(gen_ptable): it fills the table and prints it as an array literal.
// NOTE(review): this code writes to ptable, which is declared const in this
// file; presumably the declaration has to be relaxed to build this version.
version(gen_ptable)
static this()
{
  alias ptable p;
  // Initialize character properties table.
  for (int i; i < p.length; ++i)
  {
    p[i] = 0;
    if ('0' <= i && i <= '7')
      p[i] |= CP.Octal;
    if ('0' <= i && i <= '9')
      p[i] |= CP.Digit;
    // isdigit() reads p[i], whose Digit flag has just been set above.
    if (isdigit(i) || 'a' <= i && i <= 'f' || 'A' <= i && i <= 'F')
      p[i] |= CP.Hex;
    if ('a' <= i && i <= 'z' || 'A' <= i && i <= 'Z')
      p[i] |= CP.Alpha;
    if (i == '_')
      p[i] |= CP.Underscore;
    if (i == ' ' || i == '\t' || i == '\v' || i == '\f')
      p[i] |= CP.Whitespace;
  }
  // Store escape sequence values in second byte.
  assert(CProperty.max <= ubyte.max, "character property flags and escape value byte overlap.");
  // Each entry is indexed by the character following the backslash and
  // stores the character code the escape sequence stands for.
  p['\''] |= 39 << 8;
  p['"'] |= 34 << 8;
  p['?'] |= 63 << 8;
  p['\\'] |= 92 << 8;
  p['a'] |= 7 << 8;
  p['b'] |= 8 << 8;
  p['f'] |= 12 << 8;
  p['n'] |= 10 << 8;
  p['r'] |= 13 << 8;
  p['t'] |= 9 << 8;
  p['v'] |= 11 << 8;
  // Print a formatted array literal.
  char[] array = "[\n";
  for (int i; i < p.length; ++i)
  {
    int c = p[i];
    // Values above 255 carry an escape value and are printed in hex.
    // The trailing argument is a line break after every 16th entry.
    array ~= std.string.format(c>255?" 0x%x,":"%2d,", c, ((i+1) % 16) ? "":"\n");
  }
  // Overwrite the last two characters (",\n") with the closing bracket.
  array[$-2..$] = "\n]";
  writefln(array);
}
author	Aziz Köksal <aziz.koeksal@gmail.com>
date	Tue, 11 Sep 2007 23:09:27 +0200
parents	f7ce725e79c3
children	ad0cbd1c8881