Mercurial > projects > dil

/++
  Author: Aziz Köksal
  License: GPL3
+/
module dil.File;

import dil.FileBOM;
import dil.Information;
import dil.Converter;
import tango.io.File;
import std.utf;
import common;

/// Reads the file at filePath and returns its contents as UTF-8 text.
///
/// The raw bytes are handed to data2UTF8(), which detects the Unicode
/// encoding of the data and performs the conversion.
char[] loadFile(char[] filePath)
{
  auto sourceFile = new File(filePath);
  auto rawBytes = cast(ubyte[]) sourceFile.read();
  return data2UTF8(rawBytes);
}

/// Reads the file at filePath and converts its contents to UTF-8 using a
/// Converter that is constructed from filePath and infoMan.
char[] loadFile(char[] filePath, InfoManager infoMan)
{
  // The converter is created before the file is read.
  auto converter = Converter(filePath, infoMan);
  auto sourceFile = new File(filePath);
  auto rawBytes = cast(ubyte[]) sourceFile.read();
  return converter.data2UTF8(rawBytes);
}

/// Reinterprets data as little-endian UTF-16 code units without copying.
///
/// Throws: Exception if the byte length is odd. Without this check the
/// array cast itself would fail at run time with an unhelpful
/// "array cast misalignment" error.
private wchar[] reinterpretAsUTF16LE(ubyte[] data)
{
  if (data.length % 2)
    throw new Exception("The byte length of a UTF-16 little endian source file must be divisible by 2.");
  // NOTE(review): the reinterpreting cast appears to assume a
  // little-endian host - confirm.
  return cast(wchar[])data;
}

/// Reinterprets data as little-endian UTF-32 code units without copying.
///
/// Throws: Exception if the byte length is not a multiple of 4. Without this
/// check the array cast itself would fail at run time with an unhelpful
/// "array cast misalignment" error.
private dchar[] reinterpretAsUTF32LE(ubyte[] data)
{
  if (data.length % 4)
    throw new Exception("The byte length of a UTF-32 little endian source file must be divisible by 4.");
  // NOTE(review): the reinterpreting cast appears to assume a
  // little-endian host - confirm.
  return cast(dchar[])data;
}

/// Converts raw file data in a Unicode encoding to UTF-8 text.
///
/// The encoding is taken from the byte order mark reported by tellBOM().
/// When there is no BOM, the encoding is guessed from the position of zero
/// bytes in the first character, which is expected to be ASCII.
///
/// Params:
///   data = the raw bytes of the file, including a BOM if there is one.
///
/// Returns: the text as UTF-8, or null if data is empty. For UTF-8 input the
/// result is a slice of data, not a copy.
///
/// Throws: Exception if the data is detected as UTF-16 or UTF-32 but its
/// byte length is not a multiple of the code unit size.
char[] data2UTF8(ubyte[] data)
{
  if (data.length == 0)
    return null;

  char[] text;
  BOM bom = tellBOM(data);

  switch (bom)
  {
  case BOM.None:
    // No BOM found. According to the specs the first character
    // must be an ASCII character.
    if (data.length >= 4)
    {
      if (data[0..3] == cast(ubyte[3])x"00 00 00")
      {
        text = toUTF8(cast(dchar[])utf32BEtoLE(data)); // UTF-32BE: 00 00 00 XX
        break;
      }
      else if (data[1..4] == cast(ubyte[3])x"00 00 00")
      {
        text = toUTF8(reinterpretAsUTF32LE(data)); // UTF-32LE: XX 00 00 00
        break;
      }
    }
    // The UTF-32 patterns are tested first, because UTF-32 data would
    // also match the UTF-16 patterns below.
    if (data.length >= 2)
    {
      if (data[0] == 0) // UTF-16BE: 00 XX
      {
        text = toUTF8(cast(wchar[])utf16BEtoLE(data));
        break;
      }
      else if (data[1] == 0) // UTF-16LE: XX 00
      {
        text = toUTF8(reinterpretAsUTF16LE(data));
        break;
      }
    }
    text = cast(char[])data; // UTF-8
    break;
  case BOM.UTF8:
    text = cast(char[])data[3..$]; // Skip the 3 BOM bytes.
    break;
  case BOM.UTF16BE:
    text = toUTF8(cast(wchar[])utf16BEtoLE(data[2..$])); // Skip the 2 BOM bytes.
    break;
  case BOM.UTF16LE:
    text = toUTF8(reinterpretAsUTF16LE(data[2..$])); // Skip the 2 BOM bytes.
    break;
  case BOM.UTF32BE:
    text = toUTF8(cast(dchar[])utf32BEtoLE(data[4..$])); // Skip the 4 BOM bytes.
    break;
  case BOM.UTF32LE:
    text = toUTF8(reinterpretAsUTF32LE(data[4..$])); // Skip the 4 BOM bytes.
    break;
  default:
    assert(0);
  }
  return text;
}

// Checks that data2UTF8() decodes the text "source" from every supported
// encoding, with and without a byte order mark.
unittest
{
  Stdout("Testing function data2UTF8().\n");
  // Pairs raw input bytes with the UTF-8 text they should decode to.
  struct Data2Text
  {
    // The union lets the input bytes be written as a string literal (u8)
    // and be passed to the function as ubyte[] (data).
    union
    {
      ubyte[] data;
      char[] u8;
    }
    char[] text;
  }
  const Data2Text[] map = [
    // Without BOM
    {u8:"source", text:"source"}, // UTF-8
    {u8:"s\0o\0u\0r\0c\0e\0", text:"source"}, // UTF-16LE
    {u8:"\0s\0o\0u\0r\0c\0e", text:"source"}, // UTF-16BE
    {u8:"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0", text:"source"}, // UTF-32LE
    {u8:"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e", text:"source"}, // UTF-32BE
    // With BOM
    {u8:"\xEF\xBB\xBFsource", text:"source"}, // UTF-8
    {u8:"\xFE\xFF\0s\0o\0u\0r\0c\0e", text:"source"}, // UTF-16BE
    {u8:"\xFF\xFEs\0o\0u\0r\0c\0e\0", text:"source"}, // UTF-16LE
    {u8:"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e", text:"source"}, // UTF-32BE
    {u8:"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0", text:"source"}, // UTF-32LE
  ];
  // The function under test is named data2UTF8; the previous alias target
  // "data2Utf8" did not exist, so the module could not be compiled with
  // unit tests enabled.
  alias data2UTF8 f;
  foreach (pair; map)
    assert(f(pair.data) == pair.text);
}

/// Returns a newly allocated copy of data in which the two bytes of every
/// UTF-16 code unit are swapped ("1A 2B" -> "2B 1A").
///
/// The input array is not modified.
///
/// Throws: Exception if the byte length of data is odd.
ubyte[] utf16BEtoLE(ubyte[] data)
{
  if ((data.length & 1) != 0)
    throw new Exception("The byte length of a UTF-16 big endian source file must be divisible by 2.");

  auto swapped = new ubyte[data.length];
  // Walk the input one code unit (2 bytes) at a time.
  for (size_t pos = 0; pos < data.length; pos += 2)
  {
    swapped[pos]     = data[pos + 1];
    swapped[pos + 1] = data[pos];
  }
  return swapped;
}

/// Returns a newly allocated copy of data in which the four bytes of every
/// UTF-32 code unit are reversed ("1A 2B 3C 4D" -> "4D 3C 2B 1A").
///
/// The input array is not modified.
///
/// Throws: Exception if the byte length of data is not a multiple of 4.
ubyte[] utf32BEtoLE(ubyte[] data)
{
  if ((data.length & 3) != 0)
    throw new Exception("The byte length of a UTF-32 big endian source file must be divisible by 4.");

  auto reversed = new ubyte[data.length];
  // Walk the input one code unit (4 bytes) at a time.
  // TODO: the 'bswap' asm instruction could be used to reverse each unit.
  for (size_t pos = 0; pos < data.length; pos += 4)
  {
    reversed[pos]     = data[pos + 3];
    reversed[pos + 1] = data[pos + 2];
    reversed[pos + 2] = data[pos + 1];
    reversed[pos + 3] = data[pos];
  }
  return reversed;
}

// Checks that utf32BEtoLE() reverses the bytes of a single 4-byte unit.
unittest
{
  ubyte[] bigEndian = cast(ubyte[])x"1A 2B 3C 4D";
  ubyte[] expected = cast(ubyte[])x"4D 3C 2B 1A";
  assert(utf32BEtoLE(bigEndian) == expected);
}
author	Aziz Köksal <aziz.koeksal@gmail.com>
date	Mon, 17 Dec 2007 16:10:08 +0100
parents	8f86bb9ef715
children	164b4ecd9793