changeset 351:97a9a2d7d46d

- Added module File.
author aziz
date Sun, 26 Aug 2007 00:12:00 +0000
parents 4ea6759300cf
children 321df078e247
files trunk/src/dil/File.d trunk/src/main.d
diffstat 2 files changed, 142 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trunk/src/dil/File.d	Sun Aug 26 00:12:00 2007 +0000
@@ -0,0 +1,141 @@
+/++
+  Author: Aziz Köksal
+  License: GPL3
++/
+module dil.File;
+import std.stdio, std.file, std.utf;
+
+/// Reads fileName and returns its contents as UTF-8 text. A leading byte order
+/// mark selects the source encoding and is stripped from the result; data
+/// without a BOM is returned unchanged (taken to be UTF-8, not validated).
+char[] loadFile(char[] fileName)
+{
+  ubyte[] data = cast(ubyte[]) std.file.read(fileName);
+  char[] text;
+  switch (tellBOM(data))
+  {
+  case BOM.None:
+    text = cast(char[])data; // Without this, BOM-less files would load as empty text.
+    break;
+  case BOM.UTF8:
+    text = cast(char[])data[3..$]; // Skip EF BB BF.
+    break;
+  case BOM.UTF16BE:
+    text = toUTF8(cast(wchar[])utf16BEtoLE(data[2..$]));
+    break;
+  case BOM.UTF16LE:
+    text = toUTF8(cast(wchar[])data[2..$]); // NOTE(review): bytes are reinterpreted in host order.
+    break;
+  case BOM.UTF32BE:
+    text = toUTF8(cast(dchar[])utf32BEtoLE(data[4..$]));
+    break;
+  case BOM.UTF32LE:
+    text = toUTF8(cast(dchar[])data[4..$]); // NOTE(review): bytes are reinterpreted in host order.
+    break;
+  default:
+    assert(0);
+  }
+  return text;
+}
+
+/// Returns a new array holding data with the two bytes of every 16-bit unit swapped.
+ubyte[] utf16BEtoLE(ubyte[] data)
+{
+  if (data.length % 2 != 0) // A trailing odd byte cannot form a 16-bit unit.
+    throw new Exception("UTF-16 big endian source file data must be divisble by 2.");
+  auto swapped = new wchar[data.length / 2];
+  foreach (idx, unit; cast(wchar[]) data) // "1A 2B" -> "2B 1A"
+    swapped[idx] = (unit >> 8) | (unit << 8);
+  return cast(ubyte[]) swapped;
+}
+
+/// Returns a copy of data with the byte order of every 4-byte unit reversed.
+/// Throws: Exception if data.length is not a multiple of 4.
+ubyte[] utf32BEtoLE(ubyte[] data)
+{
+  if (data.length % 4)
+    throw new Exception("UTF-32 big endian source file data must be divisble by 4.");
+  dchar[] result = cast(dchar[]) new ubyte[data.length];
+  // BE to LE "1A 2B 3C 4D" -> "4D 3C 2B 1A": each byte moves to its mirrored position.
+  foreach (i, c; cast(dchar[]) data)
+    result[i] = (c << 24) | ((c & 0xFF00) << 8) |
+                ((c >> 8) & 0xFF00) | (c >> 24);
+  return cast(ubyte[]) result;
+}
+
+/// Byte order mark (BOM) kinds that tellBOM() can report for the start of a file.
+enum BOM
+{
+  None,    /// No BOM recognised
+  UTF8,    /// UTF-8: EF BB BF
+  UTF16BE, /// UTF-16 Big Endian: FE FF
+  UTF16LE, /// UTF-16 Little Endian: FF FE
+  UTF32BE, /// UTF-32 Big Endian: 00 00 FE FF
+  UTF32LE  /// UTF-32 Little Endian: FF FE 00 00
+}
+
+/// Identifies the byte order mark at the very start of data.
+/// Returns: the matching BOM member, or BOM.None if no complete mark is present.
+BOM tellBOM(ubyte[] data)
+{
+  BOM bom = BOM.None;
+  if (data.length < 2)
+    return bom;
+
+  if (data[0..2] == cast(ubyte[2])x"FE FF")
+    bom = BOM.UTF16BE; // FE FF
+  else if (data[0..2] == cast(ubyte[2])x"FF FE")
+  {
+    // The UTF-32 LE mark begins with the UTF-16 LE mark, so test the longer one first.
+    if (data.length >= 4 && data[2..4] == cast(ubyte[2])x"00 00")
+      bom = BOM.UTF32LE; // FF FE 00 00
+    else
+      bom = BOM.UTF16LE; // FF FE XX XX
+  }
+  else if (data[0..2] == cast(ubyte[2])x"00 00")
+  {
+    if (data.length >= 4 && data[2..4] == cast(ubyte[2])x"FE FF")
+      bom = BOM.UTF32BE; // 00 00 FE FF
+  }
+  else if (data[0..2] == cast(ubyte[2])x"EF BB" && data.length >= 3 && data[2] == '\xBF')
+    bom = BOM.UTF8; // EF BB BF
+  return bom;
+}
+
+unittest
+{
+  writefln("Testing function tellBOM().");
+  struct Data2BOM
+  {
+    ubyte[] data;
+    BOM bom;
+  }
+  alias ubyte[] ub;
+  const Data2BOM[] map = [
+    {cast(ub)x"12",          BOM.None},
+    {cast(ub)x"12 34",       BOM.None},
+    {cast(ub)x"00 00 FF FE", BOM.None},
+    {cast(ub)x"EF BB FF",    BOM.None},
+    // Truncated marks must not be reported.
+    {cast(ub)x"EF",          BOM.None},
+    {cast(ub)x"EF BB",       BOM.None},
+    {cast(ub)x"FE",          BOM.None},
+    {cast(ub)x"FF",          BOM.None},
+    {cast(ub)x"00",          BOM.None},
+    {cast(ub)x"00 00",       BOM.None},
+    {cast(ub)x"00 00 FE",    BOM.None},
+    // FE FF is UTF-16 BE whatever follows; FF FE is UTF-16 LE unless a full 00 00 follows.
+    {cast(ub)x"FE FF 00",    BOM.UTF16BE},
+    {cast(ub)x"FE FF 00 FF", BOM.UTF16BE},
+    {cast(ub)x"FE FF 00 00", BOM.UTF16BE},
+    {cast(ub)x"FF FE 00",    BOM.UTF16LE},
+    {cast(ub)x"EF BB BF",    BOM.UTF8},
+    {cast(ub)x"FE FF",       BOM.UTF16BE},
+    {cast(ub)x"FF FE",       BOM.UTF16LE},
+    {cast(ub)x"00 00 FE FF", BOM.UTF32BE},
+    {cast(ub)x"FF FE 00 00", BOM.UTF32LE}
+  ];
+
+  foreach (pair; map)
+    assert(tellBOM(pair.data) == pair.bom, std.string.format("Failed at %s", pair.data));
+}
--- a/trunk/src/main.d	Sat Aug 25 19:47:02 2007 +0000
+++ b/trunk/src/main.d	Sun Aug 26 00:12:00 2007 +0000
@@ -11,6 +11,7 @@
 import dil.Messages;
 import dil.Settings;
 import dil.Declarations, dil.Expressions, dil.SyntaxTree;
+import dil.File;
 
 void main(char[][] args)
 {