comparison trunk/src/dil/File.d @ 518:8f86bb9ef715

Added module dil.Converter and dil.FileBOM. Moved code from dil.File to dil.FileBOM. Added opCatAssign to class InformationManager. Added encode() function to dil.Unicode.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Sat, 15 Dec 2007 18:55:06 +0100
parents 3aa00474b381
children 50e64bab9c7a
comparison
equal deleted inserted replaced
517:b465c669d70c 518:8f86bb9ef715
1 /++ 1 /++
2 Author: Aziz Köksal 2 Author: Aziz Köksal
3 License: GPL3 3 License: GPL3
4 +/ 4 +/
5 module dil.File; 5 module dil.File;
6
7 import dil.FileBOM;
8 import dil.Information;
9 import dil.Converter;
6 import tango.io.File; 10 import tango.io.File;
7 import std.utf; 11 import std.utf;
8 import common; 12 import common;
9 13
10 /// Loads a file in any valid Unicode format and converts it to UTF-8. 14 /// Loads a file in any valid Unicode format and converts it to UTF-8.
11 char[] loadFile(char[] filePath) 15 char[] loadFile(char[] filePath)
12 { 16 {
13 return data2Utf8(cast(ubyte[]) (new File(filePath)).read()); 17 return data2UTF8(cast(ubyte[]) (new File(filePath)).read());
14 } 18 }
15 19
16 char[] data2Utf8(ubyte[] data) 20 char[] loadFile(char[] filePath, InformationManager infoMan)
21 {
22 auto converter = Converter(filePath, infoMan);
23 return converter.data2UTF8(cast(ubyte[]) (new File(filePath)).read());
24 }
25
26 char[] data2UTF8(ubyte[] data)
17 { 27 {
18 if (data.length == 0) 28 if (data.length == 0)
19 return null; 29 return null;
20 30
21 char[] text; 31 char[] text;
137 unittest 147 unittest
138 { 148 {
139 ubyte[] test = cast(ubyte[])x"1A 2B 3C 4D"; 149 ubyte[] test = cast(ubyte[])x"1A 2B 3C 4D";
140 assert(utf32BEtoLE(test) == cast(ubyte[])x"4D 3C 2B 1A"); 150 assert(utf32BEtoLE(test) == cast(ubyte[])x"4D 3C 2B 1A");
141 } 151 }
142
143 /// Byte Order Mark
144 enum BOM
145 {
146 None, /// No BOM
147 UTF8, /// UTF-8: EF BB BF
148 UTF16BE, /// UTF-16 Big Endian: FE FF
149 UTF16LE, /// UTF-16 Little Endian: FF FE
150 UTF32BE, /// UTF-32 Big Endian: 00 00 FE FF
151 UTF32LE /// UTF-32 Little Endian: FF FE 00 00
152 }
153
154 BOM tellBOM(ubyte[] data)
155 {
156 BOM bom = BOM.None;
157 if (data.length < 2)
158 return bom;
159
160 if (data[0..2] == cast(ubyte[2])x"FE FF")
161 {
162 bom = BOM.UTF16BE; // FE FF
163 }
164 else if (data[0..2] == cast(ubyte[2])x"FF FE")
165 {
166 if (data.length >= 4 && data[2..4] == cast(ubyte[2])x"00 00")
167 bom = BOM.UTF32LE; // FF FE 00 00
168 else
169 bom = BOM.UTF16LE; // FF FE XX XX
170 }
171 else if (data[0..2] == cast(ubyte[2])x"00 00")
172 {
173 if (data.length >= 4 && data[2..4] == cast(ubyte[2])x"FE FF")
174 bom = BOM.UTF32BE; // 00 00 FE FF
175 }
176 else if (data[0..2] == cast(ubyte[2])x"EF BB")
177 {
178 if (data.length >= 3 && data[2] == '\xBF')
179 bom = BOM.UTF8; // EF BB BF
180 }
181 return bom;
182 }
183
184 unittest
185 {
186 Stdout("Testing function tellBOM().\n");
187
188 struct Data2BOM
189 {
190 ubyte[] data;
191 BOM bom;
192 }
193 alias ubyte[] ub;
194 const Data2BOM[] map = [
195 {cast(ub)x"12", BOM.None},
196 {cast(ub)x"12 34", BOM.None},
197 {cast(ub)x"00 00 FF FE", BOM.None},
198 {cast(ub)x"EF BB FF", BOM.None},
199
200 {cast(ub)x"EF", BOM.None},
201 {cast(ub)x"EF BB", BOM.None},
202 {cast(ub)x"FE", BOM.None},
203 {cast(ub)x"FF", BOM.None},
204 {cast(ub)x"00", BOM.None},
205 {cast(ub)x"00 00", BOM.None},
206 {cast(ub)x"00 00 FE", BOM.None},
207
208 {cast(ub)x"FE FF 00", BOM.UTF16BE},
209 {cast(ub)x"FE FF 00 FF", BOM.UTF16BE},
210
211 {cast(ub)x"EF BB BF", BOM.UTF8},
212 {cast(ub)x"FE FF", BOM.UTF16BE},
213 {cast(ub)x"FF FE", BOM.UTF16LE},
214 {cast(ub)x"00 00 FE FF", BOM.UTF32BE},
215 {cast(ub)x"FF FE 00 00", BOM.UTF32LE}
216 ];
217
218 foreach (pair; map)
219 assert(tellBOM(pair.data) == pair.bom, Format("Failed at {0}", pair.data));
220 }