Mercurial > projects > dil
annotate trunk/src/dil/File.d @ 518:8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Moved code from dil.File to dil.FileBOM.
Added opCatAssign to class InformationManager.
Added encode() function to dil.Unicode.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Sat, 15 Dec 2007 18:55:06 +0100 |
parents | 3aa00474b381 |
children | 50e64bab9c7a |
rev | line source |
---|---|
351 | 1 /++ |
2 Author: Aziz Köksal | |
3 License: GPL3 | |
4 +/ | |
5 module dil.File; | |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
408
diff
changeset
|
6 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
408
diff
changeset
|
7 import dil.FileBOM; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
408
diff
changeset
|
8 import dil.Information; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
408
diff
changeset
|
9 import dil.Converter; |
391
33b566df6af4
Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
362
diff
changeset
|
10 import tango.io.File; |
33b566df6af4
Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
362
diff
changeset
|
11 import std.utf; |
33b566df6af4
Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
362
diff
changeset
|
12 import common; |
351 | 13 |
352
321df078e247
- Added code for detecting Unicode format of a file without a BOM.
aziz
parents:
351
diff
changeset
|
14 /// Loads a file in any valid Unicode format and converts it to UTF-8. |
391
33b566df6af4
Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
362
diff
changeset
|
15 char[] loadFile(char[] filePath) |
351 | 16 { |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
408
diff
changeset
|
17 return data2UTF8(cast(ubyte[]) (new File(filePath)).read()); |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
18 } |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
19 |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
408
diff
changeset
|
20 char[] loadFile(char[] filePath, InformationManager infoMan) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
408
diff
changeset
|
21 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
408
diff
changeset
|
22 auto converter = Converter(filePath, infoMan); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
408
diff
changeset
|
23 return converter.data2UTF8(cast(ubyte[]) (new File(filePath)).read()); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
408
diff
changeset
|
24 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
408
diff
changeset
|
25 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
408
diff
changeset
|
26 char[] data2UTF8(ubyte[] data) |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
27 { |
397
c99f8aeb7b4a
Empty source files are handled correctly now.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
391
diff
changeset
|
28 if (data.length == 0) |
c99f8aeb7b4a
Empty source files are handled correctly now.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
391
diff
changeset
|
29 return null; |
c99f8aeb7b4a
Empty source files are handled correctly now.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
391
diff
changeset
|
30 |
351 | 31 char[] text; |
32 BOM bom = tellBOM(data); | |
33 | |
34 switch (bom) | |
35 { | |
36 case BOM.None: | |
356 | 37 // No BOM found. According to the specs the first character |
352
321df078e247
- Added code for detecting Unicode format of a file without a BOM.
aziz
parents:
351
diff
changeset
|
38 // must be an ASCII character. |
321df078e247
- Added code for detecting Unicode format of a file without a BOM.
aziz
parents:
351
diff
changeset
|
39 if (data.length >= 4) |
321df078e247
- Added code for detecting Unicode format of a file without a BOM.
aziz
parents:
351
diff
changeset
|
40 { |
321df078e247
- Added code for detecting Unicode format of a file without a BOM.
aziz
parents:
351
diff
changeset
|
41 if (data[0..3] == cast(ubyte[3])x"00 00 00") |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
42 { |
354 | 43 text = toUTF8(cast(dchar[])utf32BEtoLE(data)); // UTF-32BE: 00 00 00 XX |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
44 break; |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
45 } |
352
321df078e247
- Added code for detecting Unicode format of a file without a BOM.
aziz
parents:
351
diff
changeset
|
46 else if (data[1..4] == cast(ubyte[3])x"00 00 00") |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
47 { |
354 | 48 text = toUTF8(cast(dchar[])data); // UTF-32LE: XX 00 00 00 |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
49 break; |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
50 } |
352
321df078e247
- Added code for detecting Unicode format of a file without a BOM.
aziz
parents:
351
diff
changeset
|
51 } |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
52 if (data.length >= 2) |
352
321df078e247
- Added code for detecting Unicode format of a file without a BOM.
aziz
parents:
351
diff
changeset
|
53 { |
354 | 54 if (data[0] == 0) // UTF-16BE: 00 XX |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
55 { |
352
321df078e247
- Added code for detecting Unicode format of a file without a BOM.
aziz
parents:
351
diff
changeset
|
56 text = toUTF8(cast(wchar[])utf16BEtoLE(data)); |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
57 break; |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
58 } |
354 | 59 else if (data[1] == 0) // UTF-16LE: XX 00 |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
60 { |
352
321df078e247
- Added code for detecting Unicode format of a file without a BOM.
aziz
parents:
351
diff
changeset
|
61 text = toUTF8(cast(wchar[])data); |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
62 break; |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
63 } |
352
321df078e247
- Added code for detecting Unicode format of a file without a BOM.
aziz
parents:
351
diff
changeset
|
64 } |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
65 text = cast(char[])data; // UTF-8 |
351 | 66 break; |
67 case BOM.UTF8: | |
68 text = cast(char[])data[3..$]; | |
69 break; | |
70 case BOM.UTF16BE: | |
71 text = toUTF8(cast(wchar[])utf16BEtoLE(data[2..$])); | |
72 break; | |
73 case BOM.UTF16LE: | |
74 text = toUTF8(cast(wchar[])data[2..$]); | |
75 break; | |
76 case BOM.UTF32BE: | |
77 text = toUTF8(cast(dchar[])utf32BEtoLE(data[4..$])); | |
78 break; | |
79 case BOM.UTF32LE: | |
80 text = toUTF8(cast(dchar[])data[4..$]); | |
81 break; | |
82 default: | |
83 assert(0); | |
84 } | |
85 return text; | |
86 } | |
87 | |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
88 unittest |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
89 { |
391
33b566df6af4
Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
362
diff
changeset
|
90 Stdout("Testing function data2UTF8().\n"); |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
91 struct Data2Text |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
92 { |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
93 union |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
94 { |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
95 ubyte[] data; |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
96 char[] u8; |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
97 } |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
98 char[] text; |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
99 } |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
100 const Data2Text[] map = [ |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
101 // Without BOM |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
102 {u8:"source", text:"source"}, |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
103 {u8:"s\0o\0u\0r\0c\0e\0", text:"source"}, |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
104 {u8:"\0s\0o\0u\0r\0c\0e", text:"source"}, |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
105 {u8:"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0", text:"source"}, |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
106 {u8:"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e", text:"source"}, |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
107 // With BOM |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
108 {u8:"\xEF\xBB\xBFsource", text:"source"}, |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
109 {u8:"\xFE\xFF\0s\0o\0u\0r\0c\0e", text:"source"}, |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
110 {u8:"\xFF\xFEs\0o\0u\0r\0c\0e\0", text:"source"}, |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
111 {u8:"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e", text:"source"}, |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
112 {u8:"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0", text:"source"}, |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
113 ]; |
391
33b566df6af4
Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
362
diff
changeset
|
114 alias data2UTF8 f; // Short name for the function under test. |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
115 foreach (pair; map) |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
116 assert(f(pair.data) == pair.text); |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
117 } |
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
118 |
351 | 119 ubyte[] utf16BEtoLE(ubyte[] data) |
120 { | |
121 if (data.length % 2) | |
391
33b566df6af4
Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
362
diff
changeset
|
122 throw new Exception("The byte length of a UTF-16 big endian source file must be divisible by 2."); |
351 | 123 wchar[] result = cast(wchar[]) new ubyte[data.length]; |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
124 assert(result.length*2 == data.length); |
351 | 125 // BE to LE "1A 2B" -> "2B 1A" |
362
1b6e61915858
- Fix: casting ubyte[] to ushort[] and uint[] respectively.
aziz
parents:
361
diff
changeset
|
126 foreach (i, c; cast(ushort[]) data) |
351 | 127 result[i] = (c << 8) | (c >> 8); |
128 return cast(ubyte[]) result; | |
129 } | |
130 | |
131 ubyte[] utf32BEtoLE(ubyte[] data) | |
132 { | |
133 if (data.length % 4) | |
391
33b566df6af4
Migrated project to Tango.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
362
diff
changeset
|
134 throw new Exception("The byte length of a UTF-32 big endian source file must be divisible by 4."); |
351 | 135 dchar[] result = cast(dchar[]) new ubyte[data.length]; |
361
d93dd84cd5f2
- Separated a part of loadFile() to its own function data2text(). Made some fixes to the code as well.
aziz
parents:
356
diff
changeset
|
136 assert(result.length*4 == data.length); |
351 | 137 // BE to LE "1A 2B 3C 4D" -> "4D 3C 2B 1A" |
408
3aa00474b381
Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
397
diff
changeset
|
138 // TODO: the 'bswap' asm instruction could be used instead of shifts and &-operations. |
362
1b6e61915858
- Fix: casting ubyte[] to ushort[] and uint[] respectively.
aziz
parents:
361
diff
changeset
|
139 foreach (i, c; cast(uint[]) data) |
408
3aa00474b381
Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
397
diff
changeset
|
140 result[i] = (c << 24) | |
3aa00474b381
Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
397
diff
changeset
|
141 ((c >> 8) & 0xFF00) | |
3aa00474b381
Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
397
diff
changeset
|
142 ((c << 8) & 0xFF0000) | |
3aa00474b381
Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
397
diff
changeset
|
143 (c >> 24); |
351 | 144 return cast(ubyte[]) result; |
145 } | |
146 | |
408
3aa00474b381
Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
397
diff
changeset
|
147 unittest |
3aa00474b381
Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
397
diff
changeset
|
148 { |
3aa00474b381
Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
397
diff
changeset
|
149 ubyte[] test = cast(ubyte[])x"1A 2B 3C 4D"; |
3aa00474b381
Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
397
diff
changeset
|
150 assert(utf32BEtoLE(test) == cast(ubyte[])x"4D 3C 2B 1A"); |
3aa00474b381
Fixed byte swap expressions in dil.File.utf32BEtoLE().
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
397
diff
changeset
|
151 } |