comparison trunk/src/dil/Converter.d @ 518:8f86bb9ef715

Added module dil.Converter and dil.FileBOM. Moved code from dil.File to dil.FileBOM. Added opCatAssign to class InformationManager. Added encode() function to dil.Unicode.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Sat, 15 Dec 2007 18:55:06 +0100
parents
children 50e64bab9c7a
comparison
equal deleted inserted replaced
517:b465c669d70c 518:8f86bb9ef715
1 /++
2 Author: Aziz Köksal
3 License: GPL3
4 +/
5 module dil.Converter;
6
7 import dil.Information;
8 import dil.Location;
9 import dil.Unicode;
10 import dil.FileBOM;
11 import common;
12
/// Converts various Unicode encoding formats to UTF-8.
///
/// Conversion problems are not thrown. They are appended to infoMan as
/// LexerError objects, and the offending input is trimmed or replaced with
/// REPLACEMENT_CHAR so that conversion can continue.
struct Converter
{
  char[] filePath; /// For error messages.
  InformationManager infoMan; /// Receives the errors found while converting.

  /// Constructs a Converter from a file path and an information manager.
  static Converter opCall(char[] filePath, InformationManager infoMan)
  {
    Converter conv;
    conv.filePath = filePath;
    conv.infoMan = infoMan;
    return conv;
  }

  /// Reverses the order of the four bytes of c.
  dchar swapBytes(dchar c)
  {
    return (c << 24) |
           ((c >> 8) & 0xFF00) |
           ((c << 8) & 0xFF0000) |
           (c >> 24);
  }

  /// Exchanges the two bytes of c.
  wchar swapBytes(wchar c)
  {
    return (c << 8) | (c >> 8);
  }

  /// Converts a big-endian 32-bit code unit to the machine's byte order.
  /// Returns a dchar so that code points above U+FFFF are not truncated.
  dchar BEtoMachineDword(dchar c)
  {
    version(LittleEndian)
      return swapBytes(c);
    else
      return c;
  }

  /// Converts a little-endian 32-bit code unit to the machine's byte order.
  /// Returns a dchar so that code points above U+FFFF are not truncated.
  dchar LEtoMachineDword(dchar c)
  {
    version(LittleEndian)
      return c;
    else
      return swapBytes(c);
  }

  /// Converts a big-endian 16-bit code unit to the machine's byte order.
  wchar BEtoMachineWord(wchar c)
  {
    version(LittleEndian)
      return swapBytes(c);
    else
      return c;
  }

  /// Converts a little-endian 16-bit code unit to the machine's byte order.
  wchar LEtoMachineWord(wchar c)
  {
    version(LittleEndian)
      return c;
    else
      return swapBytes(c);
  }

  /// Converts UTF-32 encoded bytes (without a BOM) to UTF-8.
  ///
  /// Params:
  ///   isBigEndian = true if data is UTF-32BE, false if it is UTF-32LE.
  ///   data = the raw bytes. Trailing bytes that don't form a complete
  ///          32-bit unit are reported and ignored.
  /// Returns: the UTF-8 text, or null if there is nothing to convert.
  char[] UTF32toUTF8(bool isBigEndian)(ubyte[] data)
  {
    if (data.length % 4)
    {
      infoMan ~= new LexerError(new Location(filePath, 0),
        "the byte length of a UTF-32 source file must be divisible by 4."
      );
      data = data[0 .. $ - $ % 4]; // Trim to valid size.
    }
    if (data.length == 0)
      return null;

    char[] result;
    foreach (dchar c; cast(dchar[])data)
    {
      static if (isBigEndian)
        c = BEtoMachineDword(c);
      else
        c = LEtoMachineDword(c);

      if (!isValidChar(c))
      {
        // TODO: correct location.
        auto loc = new Location(filePath, 0);
        infoMan ~= new LexerError(loc, Format("invalid UTF-32 character '{:X}'.", c));
        c = REPLACEMENT_CHAR;
      }

      dil.Unicode.encode(result, c);
    }
    return result;
  }

  alias UTF32toUTF8!(true) UTF32BEtoUTF8;
  alias UTF32toUTF8!(false) UTF32LEtoUTF8;

  /// Converts UTF-16 encoded bytes (without a BOM) to UTF-8.
  ///
  /// Surrogate pairs are combined into a single code point. An unpaired
  /// surrogate is reported and replaced with REPLACEMENT_CHAR.
  ///
  /// Params:
  ///   isBigEndian = true if data is UTF-16BE, false if it is UTF-16LE.
  ///   data = the raw bytes. A trailing odd byte is reported and ignored.
  /// Returns: the UTF-8 text, or null if there is nothing to convert.
  char[] UTF16toUTF8(bool isBigEndian)(ubyte[] data)
  {
    if (data.length % 2)
    {
      infoMan ~= new LexerError(new Location(filePath, 0),
        "the byte length of a UTF-16 source file must be divisible by 2."
      );
      data = data[0 .. $-1]; // Trim to valid size.
    }

    if (data.length == 0)
      return null;

    wchar[] text = cast(wchar[])data;
    wchar* p = text.ptr,
           end = text.ptr + text.length;
    char[] result;

    for (; p < end; ++p)
    {
      // Read the current code unit on every iteration.
      wchar unit = *p;
      static if (isBigEndian)
        unit = BEtoMachineWord(unit);
      else
        unit = LEtoMachineWord(unit);

      dchar c = unit;

      if (c < 0xD800 || c > 0xDFFF)
      {} // Not a surrogate; encode as is.
      else
      {
        bool isPair = false;
        // Only a high surrogate (D800..DBFF) may start a pair.
        if (c <= 0xDBFF && p+1 < end)
        {
          wchar c2 = p[1];
          static if (isBigEndian)
            c2 = BEtoMachineWord(c2);
          else
            c2 = LEtoMachineWord(c2);

          if (0xDC00 <= c2 && c2 <= 0xDFFF)
          {
            // 0xD7C0 == 0xD800 - (0x10000 >> 10); this folds in the
            // 0x10000 offset of the supplementary planes.
            c = (c - 0xD7C0) << 10;
            c |= (c2 & 0x3FF);
            ++p; // Skip the low surrogate.
            isPair = true;
          }
        }

        if (!isPair)
        {
          // Lone low surrogate, or high surrogate without a valid partner.
          // TODO: correct location.
          auto loc = new Location(filePath, 0);
          infoMan ~= new LexerError(loc, Format("invalid UTF-16 character '{:X}'.", c));
          c = REPLACEMENT_CHAR;
        }
      }

      dil.Unicode.encode(result, c);
    }
    return result;
  }

  alias UTF16toUTF8!(true) UTF16BEtoUTF8;
  alias UTF16toUTF8!(false) UTF16LEtoUTF8;

  /// Converts the raw bytes of a source file to UTF-8.
  ///
  /// The encoding is taken from the BOM reported by tellBOM(). Without a
  /// BOM it is guessed from the position of zero bytes in the first
  /// character, which is expected to be ASCII. UTF-8 input is returned as
  /// a slice of data, neither copied nor validated.
  ///
  /// Params:
  ///   data = the complete contents of the file.
  /// Returns: the UTF-8 text, or null if data is empty.
  char[] data2UTF8(ubyte[] data)
  {
    if (data.length == 0)
      return null;

    char[] text;
    BOM bom = tellBOM(data);

    switch (bom)
    {
    case BOM.None:
      // No BOM found. According to the specs the first character
      // must be an ASCII character.
      if (data.length >= 4)
      {
        if (data[0..3] == cast(ubyte[3])x"00 00 00")
        {
          text = UTF32BEtoUTF8(data); // UTF-32BE: 00 00 00 XX
          break;
        }
        else if (data[1..4] == cast(ubyte[3])x"00 00 00")
        {
          text = UTF32LEtoUTF8(data); // UTF-32LE: XX 00 00 00
          break;
        }
      }
      if (data.length >= 2)
      {
        if (data[0] == 0) // UTF-16BE: 00 XX
        {
          text = UTF16BEtoUTF8(data);
          break;
        }
        else if (data[1] == 0) // UTF-16LE: XX 00
        {
          text = UTF16LEtoUTF8(data);
          break;
        }
      }
      text = cast(char[])data; // UTF-8
      break;
    case BOM.UTF8:
      text = cast(char[])data[3..$]; // Skip the 3-byte BOM.
      break;
    case BOM.UTF16BE:
      text = UTF16BEtoUTF8(data[2..$]); // Skip the 2-byte BOM.
      break;
    case BOM.UTF16LE:
      text = UTF16LEtoUTF8(data[2..$]);
      break;
    case BOM.UTF32BE:
      text = UTF32BEtoUTF8(data[4..$]); // Skip the 4-byte BOM.
      break;
    case BOM.UTF32LE:
      text = UTF32LEtoUTF8(data[4..$]);
      break;
    default:
      assert(0);
    }
    return text;
  }
}