annotate trunk/src/dil/File.d @ 518:8f86bb9ef715

Added module dil.Converter and dil.FileBOM. Moved code from dil.File to dil.FileBOM. Added opCatAssign to class InformationManager. Added encode() function to dil.Unicode.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Sat, 15 Dec 2007 18:55:06 +0100
parents 3aa00474b381
children 50e64bab9c7a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
351
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
1 /++
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
2 Author: Aziz Köksal
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
3 License: GPL3
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
4 +/
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
5 module dil.File;
518
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 408
diff changeset
6
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 408
diff changeset
7 import dil.FileBOM;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 408
diff changeset
8 import dil.Information;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 408
diff changeset
9 import dil.Converter;
391
33b566df6af4 Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 362
diff changeset
10 import tango.io.File;
33b566df6af4 Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 362
diff changeset
11 import std.utf;
33b566df6af4 Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 362
diff changeset
12 import common;
351
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
13
352
321df078e247 - Added code for detecting Unicode format of a file without a BOM.
aziz
parents: 351
diff changeset
14 /// Loads a file in any valid Unicode format and converts it to UTF-8.
391
33b566df6af4 Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 362
diff changeset
15 char[] loadFile(char[] filePath)
351
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
16 {
518
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 408
diff changeset
17 return data2UTF8(cast(ubyte[]) (new File(filePath)).read());
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
18 }
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
19
518
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 408
diff changeset
20 char[] loadFile(char[] filePath, InformationManager infoMan)
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 408
diff changeset
21 {
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 408
diff changeset
22 auto converter = Converter(filePath, infoMan);
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 408
diff changeset
23 return converter.data2UTF8(cast(ubyte[]) (new File(filePath)).read());
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 408
diff changeset
24 }
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 408
diff changeset
25
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 408
diff changeset
26 char[] data2UTF8(ubyte[] data)
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
27 {
397
c99f8aeb7b4a Empty source files are handled correctly now.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 391
diff changeset
28 if (data.length == 0)
c99f8aeb7b4a Empty source files are handled correctly now.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 391
diff changeset
29 return null;
c99f8aeb7b4a Empty source files are handled correctly now.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 391
diff changeset
30
351
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
31 char[] text;
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
32 BOM bom = tellBOM(data);
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
33
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
34 switch (bom)
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
35 {
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
36 case BOM.None:
356
426ab95c0e73 - Added an assert to loadFile().
aziz
parents: 355
diff changeset
37 // No BOM found. According to the specs the first character
352
321df078e247 - Added code for detecting Unicode format of a file without a BOM.
aziz
parents: 351
diff changeset
38 // must be an ASCII character.
321df078e247 - Added code for detecting Unicode format of a file without a BOM.
aziz
parents: 351
diff changeset
39 if (data.length >= 4)
321df078e247 - Added code for detecting Unicode format of a file without a BOM.
aziz
parents: 351
diff changeset
40 {
321df078e247 - Added code for detecting Unicode format of a file without a BOM.
aziz
parents: 351
diff changeset
41 if (data[0..3] == cast(ubyte[3])x"00 00 00")
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
42 {
354
b03aaa9c6bc5 - Added some comments.
aziz
parents: 353
diff changeset
43 text = toUTF8(cast(dchar[])utf32BEtoLE(data)); // UTF-32BE: 00 00 00 XX
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
44 break;
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
45 }
352
321df078e247 - Added code for detecting Unicode format of a file without a BOM.
aziz
parents: 351
diff changeset
46 else if (data[1..4] == cast(ubyte[3])x"00 00 00")
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
47 {
354
b03aaa9c6bc5 - Added some comments.
aziz
parents: 353
diff changeset
48 text = toUTF8(cast(dchar[])data); // UTF-32LE: XX 00 00 00
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
49 break;
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
50 }
352
321df078e247 - Added code for detecting Unicode format of a file without a BOM.
aziz
parents: 351
diff changeset
51 }
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
52 if (data.length >= 2)
352
321df078e247 - Added code for detecting Unicode format of a file without a BOM.
aziz
parents: 351
diff changeset
53 {
354
b03aaa9c6bc5 - Added some comments.
aziz
parents: 353
diff changeset
54 if (data[0] == 0) // UTF-16BE: 00 XX
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
55 {
352
321df078e247 - Added code for detecting Unicode format of a file without a BOM.
aziz
parents: 351
diff changeset
56 text = toUTF8(cast(wchar[])utf16BEtoLE(data));
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
57 break;
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
58 }
354
b03aaa9c6bc5 - Added some comments.
aziz
parents: 353
diff changeset
59 else if (data[1] == 0) // UTF-16LE: XX 00
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
60 {
352
321df078e247 - Added code for detecting Unicode format of a file without a BOM.
aziz
parents: 351
diff changeset
61 text = toUTF8(cast(wchar[])data);
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
62 break;
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
63 }
352
321df078e247 - Added code for detecting Unicode format of a file without a BOM.
aziz
parents: 351
diff changeset
64 }
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
65 text = cast(char[])data; // UTF-8
351
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
66 break;
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
67 case BOM.UTF8:
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
68 text = cast(char[])data[3..$];
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
69 break;
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
70 case BOM.UTF16BE:
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
71 text = toUTF8(cast(wchar[])utf16BEtoLE(data[2..$]));
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
72 break;
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
73 case BOM.UTF16LE:
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
74 text = toUTF8(cast(wchar[])data[2..$]);
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
75 break;
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
76 case BOM.UTF32BE:
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
77 text = toUTF8(cast(dchar[])utf32BEtoLE(data[4..$]));
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
78 break;
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
79 case BOM.UTF32LE:
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
80 text = toUTF8(cast(dchar[])data[4..$]);
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
81 break;
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
82 default:
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
83 assert(0);
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
84 }
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
85 return text;
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
86 }
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
87
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
88 unittest
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
89 {
391
33b566df6af4 Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 362
diff changeset
90 Stdout("Testing function data2UTF8().\n");
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
91 struct Data2Text
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
92 {
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
93 union
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
94 {
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
95 ubyte[] data;
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
96 char[] u8;
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
97 }
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
98 char[] text;
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
99 }
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
100 const Data2Text[] map = [
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
101 // Without BOM: UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, UTF-32BE (detected from the zero bytes around the first character).
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
102 {u8:"source", text:"source"},
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
103 {u8:"s\0o\0u\0r\0c\0e\0", text:"source"},
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
104 {u8:"\0s\0o\0u\0r\0c\0e", text:"source"},
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
105 {u8:"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0", text:"source"},
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
106 {u8:"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e", text:"source"},
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
107 // With BOM: UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE (the BOM must be stripped from the result).
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
108 {u8:"\xEF\xBB\xBFsource", text:"source"},
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
109 {u8:"\xFE\xFF\0s\0o\0u\0r\0c\0e", text:"source"},
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
110 {u8:"\xFF\xFEs\0o\0u\0r\0c\0e\0", text:"source"},
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
111 {u8:"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e", text:"source"},
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
112 {u8:"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0", text:"source"},
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
113 ];
391
33b566df6af4 Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 362
diff changeset
114 alias data2UTF8 f;
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
115 foreach (pair; map)
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
116 assert(f(pair.data) == pair.text);
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
117 }
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
118
351
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
119 ubyte[] utf16BEtoLE(ubyte[] data)
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
120 {
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
121 if (data.length % 2)
391
33b566df6af4 Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 362
diff changeset
122 throw new Exception("The byte length of a UTF-16 big endian source file must be divisible by 2.");
351
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
123 wchar[] result = cast(wchar[]) new ubyte[data.length];
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
124 assert(result.length*2 == data.length);
351
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
125 // BE to LE "1A 2B" -> "2B 1A"
362
1b6e61915858 - Fix: casting ubyte[] to ushort[] and uint[] respectively.
aziz
parents: 361
diff changeset
126 foreach (i, c; cast(ushort[]) data)
351
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
127 result[i] = (c << 8) | (c >> 8);
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
128 return cast(ubyte[]) result;
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
129 }
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
130
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
131 ubyte[] utf32BEtoLE(ubyte[] data)
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
132 {
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
133 if (data.length % 4)
391
33b566df6af4 Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 362
diff changeset
134 throw new Exception("The byte length of a UTF-32 big endian source file must be divisible by 4.");
351
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
135 dchar[] result = cast(dchar[]) new ubyte[data.length];
361
d93dd84cd5f2 - Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents: 356
diff changeset
136 assert(result.length*4 == data.length);
351
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
137 // BE to LE "1A 2B 3C 4D" -> "4D 3C 2B 1A"
408
3aa00474b381 Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 397
diff changeset
138 // TODO: the 'bswap' asm instruction could be used instead of shifts and &-operations.
362
1b6e61915858 - Fix: casting ubyte[] to ushort[] and uint[] respectively.
aziz
parents: 361
diff changeset
139 foreach (i, c; cast(uint[]) data)
408
3aa00474b381 Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 397
diff changeset
140 result[i] = (c << 24) |
3aa00474b381 Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 397
diff changeset
141 ((c >> 8) & 0xFF00) |
3aa00474b381 Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 397
diff changeset
142 ((c << 8) & 0xFF0000) |
3aa00474b381 Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 397
diff changeset
143 (c >> 24);
351
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
144 return cast(ubyte[]) result;
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
145 }
97a9a2d7d46d - Added module File.
aziz
parents:
diff changeset
146
408
3aa00474b381 Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 397
diff changeset
147 unittest
3aa00474b381 Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 397
diff changeset
148 {
3aa00474b381 Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 397
diff changeset
149 ubyte[] test = cast(ubyte[])x"1A 2B 3C 4D";
3aa00474b381 Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 397
diff changeset
150 assert(utf32BEtoLE(test) == cast(ubyte[])x"4D 3C 2B 1A");
3aa00474b381 Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 397
diff changeset
151 }