changeset 487:bccca748d745

Added 'tokenize' command.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Sat, 01 Dec 2007 20:20:44 +0100
parents bd176bc73e43
children cfb3805768b6
files trunk/src/dil/Lexer.d trunk/src/dil/Token.d trunk/src/main.d
diffstat 3 files changed, 87 insertions(+), 1 deletions(-)
--- a/trunk/src/dil/Lexer.d	Sat Dec 01 18:22:56 2007 +0100
+++ b/trunk/src/dil/Lexer.d	Sat Dec 01 20:20:44 2007 +0100
@@ -2535,6 +2535,19 @@
     return head;
   }
 
+  /// Scan the whole text until EOF is encountered.
+  void scanAll()
+  {
+    while (nextToken() != TOK.EOF)
+    {}
+  }
+
+  /// HEAD -> Newline -> First Token: returns the first real token, skipping the two sentinels.
+  Token* firstToken()
+  {
+    return this.head.next.next;
+  }
+
   static void loadKeywords(ref Identifier[string] table)
   {
     foreach(k; keywords)
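
A minimal sketch of how the two new helpers combine (mirroring their use in
main.d below; Lexer, TOK and Stdout as in the surrounding modules):

  auto lx = new Lexer("int x;", null);
  lx.scanAll();                  // lex the whole text up front
  // Walk the list, skipping the HEAD and Newline sentinel tokens.
  for (auto t = lx.firstToken(); t.type != TOK.EOF; t = t.next)
    Stdout(t.srcText)("\n");
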
--- a/trunk/src/dil/Token.d	Sat Dec 01 18:22:56 2007 +0100
+++ b/trunk/src/dil/Token.d	Sat Dec 01 20:20:44 2007 +0100
@@ -159,12 +159,20 @@
 
   alias srcText identifier;
 
+  /// Returns the text of the token.
   string srcText()
   {
     assert(start && end);
     return start[0 .. end - start];
   }
 
+  /// Returns the preceding whitespace of the token.
+  string wsChars()
+  {
+    assert(ws && start);
+    return ws[0 .. start - ws];
+  }
+
   /// Find next non-whitespace token. Returns 'this' token if the next token is TOK.EOF or null.
   Token* nextNWS()
   out(token)
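
For orientation, a sketch of how a token's ws/start/end pointers (used by
wsChars and srcText above) partition the source buffer; ws is null when a
token has no preceding whitespace, hence the assert:

        ws        start      end
        |         |          |
        v         v          v
    ... [ \t \t ] [ t o k e n ] ...
         wsChars()  srcText()
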
@@ -298,6 +306,18 @@
 }
 }
 
+/++
+  Not used at the moment. Could be useful if more
+  info is needed about the location of nodes/tokens.
++/
+struct NewlineInfo
+{
+  char[] oriPath;   /// Original path to the source text.
+  char[] setPath;   /// Path set by #line.
+  uint oriLineNum;  /// Actual line number in the source text.
+  uint setLineNum;  /// Delta line number set by #line.
+}
+
 /// A table mapping each TOK to a string.
 private const string[] tokToString = [
   "Invalid",
--- a/trunk/src/main.d	Sat Dec 01 18:22:56 2007 +0100
+++ b/trunk/src/main.d	Sat Dec 01 20:20:44 2007 +0100
@@ -93,6 +93,43 @@
   case "stats", "statistics":
     cmd.Statistics.execute(args[2..$]);
     break;
+  case "tok", "tokenize":
+    char[] filePath;
+    char[] sourceText;
+    char[] separator;
+    bool ignoreWSToks;
+    bool printWS;
+
+    foreach (arg; args[2..$])
+    {
+      if (strbeg(arg, "-t"))
+        sourceText = arg[2..$];
+      else if (strbeg(arg, "-s"))
+        separator = arg[2..$];
+      else if (arg == "-i")
+        ignoreWSToks = true;
+      else if (arg == "-ws")
+        printWS = true;
+      else
+        filePath = arg;
+    }
+
+    separator  || (separator = "\n");                // default to newline if unset
+    sourceText || (sourceText = loadFile(filePath)); // no -t text: load the file
+
+    auto lx = new Lexer(sourceText, null);
+    lx.scanAll();
+    auto token = lx.firstToken();
+
+    for (; token.type != TOK.EOF; token = token.next)
+    {
+      if (token.type == TOK.Newline || ignoreWSToks && token.isWhitespace)
+        continue; // newlines always skipped; other whitespace only with -i
+      if (printWS && token.ws)
+        Stdout(token.wsChars);
+      Stdout(token.srcText)(separator);
+    }
+    break;
   case "parse":
     if (args.length == 3)
       parse(args[2]);
@@ -111,7 +148,8 @@
   "  generate (gen)\n"
   "  help (?)\n"
   "  importgraph (igraph)\n"
-  "  statistics (stats)\n";
+  "  statistics (stats)\n"
+  "  tokenize (tok)\n";
 
 bool strbeg(char[] str, char[] begin)
 {
@@ -139,6 +177,21 @@
   case "importgraph", "igraph":
     msg = GetMsg(MID.HelpImportGraph);
     break;
+  case "tok", "tokenize":
+    msg = `Print the tokens of a D source file.
+Usage:
+  dil tok file.d [Options]
+
+Options:
+  -tTEXT          : tokenize TEXT instead of a file.
+  -sSEPARATOR     : print SEPARATOR instead of newline between tokens.
+  -i              : ignore whitespace tokens (comments, shebang line etc.)
+  -ws             : print a token's preceding whitespace characters.
+
+Example:
+  dil tok -t"module foo; void func(){}"
+  dil tok main.d | grep ^[0-9]`;
+    break;
   default:
     msg = helpMain();
   }
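
For illustration, two invocations combining the options added above (using
main.d as in the help text's example; -s takes its argument verbatim, and
-ws re-emits each token's preceding whitespace before the token itself):

  dil tok -s" | " main.d
  dil tok -i -ws main.d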