changeset 764:4579e8505d5e

Fixed unittests and removed dil.File. Fixed Converter.UTF16toUTF8(). Fixed an encode() function in dil.Unicode.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Sat, 16 Feb 2008 03:28:39 +0100
parents f26f13b5a3a3
children bc812843603c
files trunk/src/cmd/DDoc.d trunk/src/cmd/ImportGraph.d trunk/src/cmd/Statistics.d trunk/src/dil/Converter.d trunk/src/dil/File.d trunk/src/dil/Unicode.d trunk/src/dil/lexer/Lexer.d trunk/src/dil/semantic/Module.d trunk/src/main.d
diffstat 9 files changed, 49 insertions(+), 176 deletions(-) [+]
line wrap: on
line diff
--- a/trunk/src/cmd/DDoc.d	Sat Feb 16 01:47:39 2008 +0100
+++ b/trunk/src/cmd/DDoc.d	Sat Feb 16 03:28:39 2008 +0100
@@ -22,8 +22,8 @@
 import dil.semantic.Symbol;
 import dil.semantic.Symbols;
 import dil.Information;
-import dil.File;
 import dil.Converter;
+import dil.SourceText;
 import common;
 
 import tango.stdc.time : time_t, time, ctime;
@@ -40,7 +40,7 @@
   MacroParser mparser;
   foreach (macroPath; macroPaths)
   {
-    auto macros = mparser.parse(loadMacroFile(macroPath));
+    auto macros = mparser.parse(loadMacroFile(macroPath, infoMan));
     mtable = new MacroTable(mtable);
     mtable.insert(macros);
   }
@@ -109,9 +109,11 @@
   file.write(fileText);
 }
 
-string loadMacroFile(string filePath)
+string loadMacroFile(string filePath, InfoManager infoMan)
 {
-  return sanitizeText(loadFile(filePath));
+  auto src = new SourceText(filePath);
+  src.load(infoMan);
+  return sanitizeText(src.data);
 }
 
 /// Traverses the syntax tree and writes DDoc macros to a string buffer.
--- a/trunk/src/cmd/ImportGraph.d	Sat Feb 16 01:47:39 2008 +0100
+++ b/trunk/src/cmd/ImportGraph.d	Sat Feb 16 03:28:39 2008 +0100
@@ -8,7 +8,6 @@
 import dil.ast.Declarations;
 import dil.semantic.Module;
 import dil.parser.ImportParser;
-import dil.File;
 import dil.Settings;
 import dil.SourceText;
 import common;
--- a/trunk/src/cmd/Statistics.d	Sat Feb 16 01:47:39 2008 +0100
+++ b/trunk/src/cmd/Statistics.d	Sat Feb 16 03:28:39 2008 +0100
@@ -4,13 +4,12 @@
 +/
 module cmd.Statistics;
 
-import dil.File;
+import cmd.ASTStats;
 import dil.lexer.Lexer;
 import dil.lexer.Token;
 import dil.parser.Parser;
 import dil.ast.NodesEnum;
 import dil.SourceText;
-import cmd.ASTStats;
 import common;
 
 struct Statistics
--- a/trunk/src/dil/Converter.d	Sat Feb 16 01:47:39 2008 +0100
+++ b/trunk/src/dil/Converter.d	Sat Feb 16 03:28:39 2008 +0100
@@ -122,19 +122,19 @@
          end = text.ptr + text.length;
     char[] result;
     uint lineNum = 1;
-    dchar c = *p;
 
-    do
+    for (; p < end; p++)
     {
+      dchar c = *p;
       static if (isBigEndian)
         c = BEtoMachineWord(c);
       else
         c = LEtoMachineWord(c);
 
-      if (c < 0xD800 || 0xDFFF > c)
+      if (0xD800 > c || c > 0xDFFF)
       {}
       else if (c <= 0xDBFF && p+1 < end)
-      {
+      { // Decode surrogate pairs.
         wchar c2 = p[1];
         static if (isBigEndian)
           c2 = BEtoMachineWord(c2);
@@ -159,16 +159,14 @@
 
       if (isNewline(c))
         ++lineNum;
-      ++p;
       dil.Unicode.encode(result, c);
-    } while (p < end)
+    }
 
     if (data.length % 2)
       infoMan ~= new LexerError(
         new Location(filePath, lineNum),
         MSG.UTF16FileMustBeDivisibleBy2
       );
-
     return result;
   }
 
@@ -295,3 +293,32 @@
   //text = text.ptr[0 .. q - text.ptr]; // Another way.
   return text;
 }
+
+unittest
+{
+  Stdout("Testing function Converter.\n");
+  struct Data2Text
+  {
+    char[] text;
+    char[] expected = "source";
+    ubyte[] data()
+    { return cast(ubyte[])text; }
+  }
+  const Data2Text[] map = [
+    // Without BOM
+    {"source"},
+    {"s\0o\0u\0r\0c\0e\0"},
+    {"\0s\0o\0u\0r\0c\0e"},
+    {"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
+    {"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
+    // With BOM
+    {"\xEF\xBB\xBFsource"},
+    {"\xFE\xFF\0s\0o\0u\0r\0c\0e"},
+    {"\xFF\xFEs\0o\0u\0r\0c\0e\0"},
+    {"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
+    {"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
+  ];
+  auto converter = Converter("", new InfoManager);
+  foreach (i, pair; map)
+    assert(converter.data2UTF8(pair.data) == pair.expected, Format("failed at item {}", i));
+}
--- a/trunk/src/dil/File.d	Sat Feb 16 01:47:39 2008 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,151 +0,0 @@
-/++
-  Author: Aziz Köksal
-  License: GPL3
-+/
-module dil.File;
-
-import dil.FileBOM;
-import dil.Information;
-import dil.Converter;
-import tango.io.File;
-import util.utf;
-import common;
-
-/// Loads a file in any valid Unicode format and converts it to UTF-8.
-char[] loadFile(char[] filePath)
-{
-  return data2UTF8(cast(ubyte[]) (new File(filePath)).read());
-}
-
-char[] loadFile(char[] filePath, InfoManager infoMan)
-{
-  auto converter = Converter(filePath, infoMan);
-  return converter.data2UTF8(cast(ubyte[]) (new File(filePath)).read());
-}
-
-char[] data2UTF8(ubyte[] data)
-{
-  if (data.length == 0)
-    return null;
-
-  char[] text;
-  BOM bom = tellBOM(data);
-
-  switch (bom)
-  {
-  case BOM.None:
-    // No BOM found. According to the specs the first character
-    // must be an ASCII character.
-    if (data.length >= 4)
-    {
-      if (data[0..3] == cast(ubyte[3])x"00 00 00")
-      {
-        text = toUTF8(cast(dchar[])utf32BEtoLE(data)); // UTF-32BE: 00 00 00 XX
-        break;
-      }
-      else if (data[1..4] == cast(ubyte[3])x"00 00 00")
-      {
-        text = toUTF8(cast(dchar[])data); // UTF-32LE: XX 00 00 00
-        break;
-      }
-    }
-    if (data.length >= 2)
-    {
-      if (data[0] == 0) // UTF-16BE: 00 XX
-      {
-        text = toUTF8(cast(wchar[])utf16BEtoLE(data));
-        break;
-      }
-      else if (data[1] == 0) // UTF-16LE: XX 00
-      {
-        text = toUTF8(cast(wchar[])data);
-        break;
-      }
-    }
-    text = cast(char[])data; // UTF-8
-    break;
-  case BOM.UTF8:
-    text = cast(char[])data[3..$];
-    break;
-  case BOM.UTF16BE:
-    text = toUTF8(cast(wchar[])utf16BEtoLE(data[2..$]));
-    break;
-  case BOM.UTF16LE:
-    text = toUTF8(cast(wchar[])data[2..$]);
-    break;
-  case BOM.UTF32BE:
-    text = toUTF8(cast(dchar[])utf32BEtoLE(data[4..$]));
-    break;
-  case BOM.UTF32LE:
-    text = toUTF8(cast(dchar[])data[4..$]);
-    break;
-  default:
-    assert(0);
-  }
-  return text;
-}
-
-unittest
-{
-  Stdout("Testing function data2Utf8().\n");
-  struct Data2Text
-  {
-    union
-    {
-      ubyte[] data;
-      char[] u8;
-    }
-    char[] text;
-  }
-  const Data2Text[] map = [
-    // Without BOM
-    {u8:"source", text:"source"},
-    {u8:"s\0o\0u\0r\0c\0e\0", text:"source"},
-    {u8:"\0s\0o\0u\0r\0c\0e", text:"source"},
-    {u8:"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0", text:"source"},
-    {u8:"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e", text:"source"},
-    // With BOM
-    {u8:"\xEF\xBB\xBFsource", text:"source"},
-    {u8:"\xFE\xFF\0s\0o\0u\0r\0c\0e", text:"source"},
-    {u8:"\xFF\xFEs\0o\0u\0r\0c\0e\0", text:"source"},
-    {u8:"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e", text:"source"},
-    {u8:"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0", text:"source"},
-  ];
-  alias data2UTF8 f;
-  foreach (pair; map)
-    assert(f(pair.data) == pair.text);
-}
-
-ubyte[] utf16BEtoLE(ubyte[] data)
-{
-  if (data.length % 2)
-    throw new Exception("The byte length of a UTF-16 big endian source file must be divisible by 2.");
-  wchar[] result = cast(wchar[]) new ubyte[data.length];
-  assert(result.length*2 == data.length);
-  // BE to LE "1A 2B" -> "2B 1A"
-  foreach (i, c; cast(ushort[]) data)
-    result[i] = (c << 8) | (c >> 8);
-  return cast(ubyte[]) result;
-}
-
-ubyte[] utf32BEtoLE(ubyte[] data)
-{
-  if (data.length % 4)
-    throw new Exception("The byte length of a UTF-32 big endian source file must be divisible by 4.");
-  dchar[] result = cast(dchar[]) new ubyte[data.length];
-  assert(result.length*4 == data.length);
-  // BE to LE "1A 2B 3C 4D" -> "4D 3C 2B 1A"
-  // TODO: the 'bswap' asm instruction could be used instead of shifts and &-operations.
-  foreach (i, c; cast(uint[]) data)
-    result[i] = (c << 24) |
-               ((c >> 8) & 0xFF00) |
-               ((c << 8) & 0xFF0000) |
-                (c >> 24);
-  return cast(ubyte[]) result;
-}
-
-unittest
-{
-  ubyte[] test = cast(ubyte[])x"1A 2B 3C 4D";
-  assert(utf32BEtoLE(test) == cast(ubyte[])x"4D 3C 2B 1A");
-}
--- a/trunk/src/dil/Unicode.d	Sat Feb 16 01:47:39 2008 +0100
+++ b/trunk/src/dil/Unicode.d	Sat Feb 16 03:28:39 2008 +0100
@@ -157,7 +157,7 @@
   char[6] b = void;
   if (c < 0x80)
     str ~= c;
-  if (c < 0x800)
+  else if (c < 0x800)
   {
     b[0] = 0xC0 | (c >> 6);
     b[1] = 0x80 | (c & 0x3F);
@@ -211,8 +211,7 @@
   if (c < 0x10000)
     str ~= cast(wchar)c;
   else
-  {
-    // Encode with surrogate pair.
+  { // Encode with surrogate pair.
     wchar[2] pair = void;
     c -= 0x10000; // c'
     // higher10bits(c') | 0b1101_10xx_xxxx_xxxx
@@ -225,7 +224,7 @@
 
 /++
   Returns a decoded character from a UTF-16 sequence.
-  In case of an error in the sequence 0xD800 is returned.
+  In case of an error in the sequence ERROR_CHAR is returned.
   Params:
     str = the UTF-16 sequence.
     index = where to start from.
@@ -243,7 +242,7 @@
   {
     wchar c2 = str[index+1];
     if (0xDC00 <= c2 && c2 <= 0xDFFF)
-    {
+    { // Decode surrogate pair.
       // (c - 0xD800) << 10 + 0x10000 ->
       // (c - 0xD800 + 0x40) << 10 ->
       c = (c - 0xD7C0) << 10;
@@ -257,7 +256,7 @@
 
 /++
   Returns a decoded character from a UTF-16 sequence.
-  In case of an error in the sequence 0xD800 is returned.
+  In case of an error in the sequence ERROR_CHAR is returned.
   Params:
     p = start of the UTF-16 sequence.
     end = one past the end of the sequence.
--- a/trunk/src/dil/lexer/Lexer.d	Sat Feb 16 01:47:39 2008 +0100
+++ b/trunk/src/dil/lexer/Lexer.d	Sat Feb 16 03:28:39 2008 +0100
@@ -2740,7 +2740,7 @@
     else
       src ~= pair.tokenText ~ " ";
 
-  auto lx = new Lexer(src, "");
+  auto lx = new Lexer(new SourceText("", src));
   auto token = lx.getTokens();
 
   uint i;
@@ -2759,7 +2759,7 @@
 unittest
 {
   Stdout("Testing method Lexer.peek()\n");
-  string sourceText = "unittest { }";
+  auto sourceText = new SourceText("", "unittest { }");
   auto lx = new Lexer(sourceText, null);
 
   auto next = lx.head;
@@ -2774,7 +2774,7 @@
   lx.peek(next);
   assert(next.kind == TOK.EOF);
 
-  lx = new Lexer("", null);
+  lx = new Lexer(new SourceText("", ""));
   next = lx.head;
   lx.peek(next);
   assert(next.kind == TOK.Newline);
--- a/trunk/src/dil/semantic/Module.d	Sat Feb 16 01:47:39 2008 +0100
+++ b/trunk/src/dil/semantic/Module.d	Sat Feb 16 03:28:39 2008 +0100
@@ -8,7 +8,6 @@
 import dil.ast.Declarations;
 import dil.parser.Parser;
 import dil.lexer.Lexer;
-import dil.File;
 import dil.semantic.Symbol;
 import dil.semantic.Symbols;
 import dil.Information;
--- a/trunk/src/main.d	Sat Feb 16 01:47:39 2008 +0100
+++ b/trunk/src/main.d	Sat Feb 16 03:28:39 2008 +0100
@@ -23,7 +23,6 @@
 import dil.SettingsLoader;
 import dil.CompilerInfo;
 import dil.Information;
-import dil.File;
 import dil.SourceText;
 
 import cmd.Generate;