comparison src/dil/Converter.d @ 806:bcb74c9b895c

Moved out files in the trunk folder to the root.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Sun, 09 Mar 2008 00:12:19 +0100
parents trunk/src/dil/Converter.d@3b34f6a95a27
children
comparison
equal deleted inserted replaced
805:a3fab8b74a7d 806:bcb74c9b895c
1 /++
2 Author: Aziz Köksal
3 License: GPL3
4 +/
5 module dil.Converter;
6
7 import dil.Information;
8 import dil.Location;
9 import dil.Unicode;
10 import dil.FileBOM;
11 import dil.lexer.Funcs;
12 import dil.Messages;
13 import common;
14
/// Converts various Unicode encoding formats to UTF-8.
///
/// Problems found during a conversion are appended to infoMan as LexerError
/// objects located at filePath and the current line number; the offending
/// character is replaced with REPLACEMENT_CHAR in the output.
struct Converter
{
  char[] filePath; /// For error messages.
  InfoManager infoMan; /// Collects the errors reported by the conversions.

  /// Constructs a Converter that reports errors for filePath to infoMan.
  static Converter opCall(char[] filePath, InfoManager infoMan)
  {
    Converter conv;
    conv.filePath = filePath;
    conv.infoMan = infoMan;
    return conv;
  }

  /// Byte-swaps c (reverses the order of its four bytes.)
  dchar swapBytes(dchar c)
  {
    // c is passed by value; assigning the result back to it was redundant.
    return (c << 24) |
           ((c >> 8) & 0xFF00) |
           ((c << 8) & 0xFF0000) |
           (c >> 24);
  }

  /// Byte-swaps c (exchanges its two bytes.)
  wchar swapBytes(wchar c)
  {
    return (c << 8) | (c >> 8);
  }

  /// Swaps the bytes of c on a little-endian machine.
  dchar BEtoMachineDword(dchar c)
  {
    version(LittleEndian)
      return swapBytes(c);
    else
      return c;
  }

  /// Swaps the bytes of c on a big-endian machine.
  dchar LEtoMachineDword(dchar c)
  {
    version(LittleEndian)
      return c;
    else
      return swapBytes(c);
  }

  /// Swaps the bytes of c on a little-endian machine.
  wchar BEtoMachineWord(wchar c)
  {
    version(LittleEndian)
      return swapBytes(c);
    else
      return c;
  }

  /// Swaps the bytes of c on a big-endian machine.
  wchar LEtoMachineWord(wchar c)
  {
    version(LittleEndian)
      return c;
    else
      return swapBytes(c);
  }

  /// Converts a UTF-32 text to UTF-8.
  ///
  /// Invalid characters are reported and replaced with REPLACEMENT_CHAR.
  /// Trailing bytes that don't form a whole code unit are ignored, and
  /// an error is reported for them.
  /// Params:
  ///   data = the raw UTF-32 text, without a BOM.
  /// Returns: the converted text, or null if data is empty.
  char[] UTF32toUTF8(bool isBigEndian)(ubyte[] data)
  {
    if (data.length == 0)
      return null;

    char[] result;
    uint lineNum = 1;
    dchar[] text = cast(dchar[]) data[0 .. $-($%4)]; // Trim to multiple of 4.
    foreach (dchar c; text)
    {
      static if (isBigEndian)
        c = BEtoMachineDword(c);
      else
        c = LEtoMachineDword(c);

      if (!isValidChar(c))
      {
        infoMan ~= new LexerError(
          new Location(filePath, lineNum),
          Format(MSG.InvalidUTF32Character, c)
        );
        c = REPLACEMENT_CHAR;
      }

      if (isNewline(c))
        ++lineNum;
      dil.Unicode.encode(result, c);
    }

    if (data.length % 4)
      infoMan ~= new LexerError(
        new Location(filePath, lineNum),
        MSG.UTF32FileMustBeDivisibleBy4
      );

    return result;
  }

  alias UTF32toUTF8!(true) UTF32BEtoUTF8; /// Instantiation for UTF-32 BE.
  alias UTF32toUTF8!(false) UTF32LEtoUTF8; /// Instantiation for UTF-32 LE.

  /// Converts a UTF-16 text to UTF-8.
  ///
  /// Surrogate pairs are decoded to a single character. Every unpaired
  /// surrogate is reported and replaced with REPLACEMENT_CHAR.
  /// A trailing odd byte is ignored, and an error is reported for it.
  /// Params:
  ///   data = the raw UTF-16 text, without a BOM.
  /// Returns: the converted text, or null if data is empty.
  char[] UTF16toUTF8(bool isBigEndian)(ubyte[] data)
  {
    if (data.length == 0)
      return null;

    wchar[] text = cast(wchar[]) data[0 .. $-($%2)]; // Trim to multiple of two.
    wchar* p = text.ptr,
         end = text.ptr + text.length;
    char[] result;
    uint lineNum = 1;

    for (; p < end; p++)
    {
      dchar c = *p;
      static if (isBigEndian)
        c = BEtoMachineWord(c);
      else
        c = LEtoMachineWord(c);

      if (0xD800 <= c && c <= 0xDFFF)
      { // c is a surrogate; it is only valid as the lead of a pair.
        bool isPair = false;

        if (c <= 0xDBFF && p+1 < end)
        { // Lead surrogate. Check that a trail surrogate follows.
          wchar c2 = p[1];
          static if (isBigEndian)
            c2 = BEtoMachineWord(c2);
          else
            c2 = LEtoMachineWord(c2);

          if (0xDC00 <= c2 && c2 <= 0xDFFF)
          { // Decode the surrogate pair.
            // 0xD7C0 == 0xD800 - (0x10000 >> 10)
            c = (c - 0xD7C0) << 10;
            c |= (c2 & 0x3FF);
            ++p; // Consume the trail surrogate.
            isPair = true;
          }
        }

        if (!isPair)
        { // A lone trail surrogate, a lead surrogate at the end of the text,
          // or a lead surrogate that isn't followed by a trail surrogate.
          // The following code unit, if any, is processed on its own
          // in the next iteration.
          infoMan ~= new LexerError(
            new Location(filePath, lineNum),
            Format(MSG.InvalidUTF16Character, c)
          );
          c = REPLACEMENT_CHAR;
        }
      }

      if (isNewline(c))
        ++lineNum;
      dil.Unicode.encode(result, c);
    }

    if (data.length % 2)
      infoMan ~= new LexerError(
        new Location(filePath, lineNum),
        MSG.UTF16FileMustBeDivisibleBy2
      );
    return result;
  }

  alias UTF16toUTF8!(true) UTF16BEtoUTF8; /// Instantiation for UTF-16 BE.
  alias UTF16toUTF8!(false) UTF16LEtoUTF8; /// Instantiation for UTF-16 LE.

  /// Converts the text in data to UTF-8.
  /// Leaves data unchanged if it is in UTF-8 already.
  ///
  /// The encoding is taken from the BOM reported by tellBOM(). Without a BOM
  /// it is guessed from the position of the zero bytes among the first
  /// four bytes; the text is treated as UTF-8 if the guess finds nothing.
  /// Returns: the UTF-8 text without the BOM. For UTF-8 input this is
  ///   a slice of data and not a copy. An empty string if data is empty.
  char[] data2UTF8(ubyte[] data)
  {
    if (data.length == 0)
      return "";

    char[] text;
    BOM bom = tellBOM(data);

    switch (bom)
    {
    case BOM.None:
      // No BOM found. According to the specs the first character
      // must be an ASCII character.
      if (data.length >= 4)
      {
        if (data[0..3] == cast(ubyte[3])x"00 00 00")
        {
          text = UTF32BEtoUTF8(data); // UTF-32BE: 00 00 00 XX
          break;
        }
        else if (data[1..4] == cast(ubyte[3])x"00 00 00")
        {
          text = UTF32LEtoUTF8(data); // UTF-32LE: XX 00 00 00
          break;
        }
      }
      if (data.length >= 2)
      {
        if (data[0] == 0) // UTF-16BE: 00 XX
        {
          text = UTF16BEtoUTF8(data);
          break;
        }
        else if (data[1] == 0) // UTF-16LE: XX 00
        {
          text = UTF16LEtoUTF8(data);
          break;
        }
      }
      text = cast(char[])data; // UTF-8
      break;
    case BOM.UTF8:
      text = cast(char[])data[3..$]; // Skip the 3 bytes of the BOM.
      break;
    case BOM.UTF16BE:
      text = UTF16BEtoUTF8(data[2..$]);
      break;
    case BOM.UTF16LE:
      text = UTF16LEtoUTF8(data[2..$]);
      break;
    case BOM.UTF32BE:
      text = UTF32BEtoUTF8(data[4..$]);
      break;
    case BOM.UTF32LE:
      text = UTF32LEtoUTF8(data[4..$]);
      break;
    default:
      assert(0);
    }
    return text;
  }
}
248
/// Replaces invalid UTF-8 sequences with U+FFFD (if there's enough space,)
/// and Newlines with '\n'.
///
/// The text is rewritten in place and never grows. An invalid sequence
/// can only be replaced if the three bytes of U+FFFD fit between the write
/// and the read position; otherwise its first byte is kept as it is.
/// Returns: text with its length adjusted, or null if text is empty.
string sanitizeText(string text)
{
  if (!text.length)
    return null;

  char* p = text.ptr; // Reader.
  char* end = p + text.length;
  char* q = p; // Writer. Never gets ahead of the reader.

  for (; p < end; p++, q++)
  {
    assert(q <= p);
    switch (*p)
    {
    case '\r':
      if (p+1 < end && p[1] == '\n')
        p++; // Treat "\r\n" as a single newline.
      // Fall through.
    case '\n':
      *q = '\n';
      continue;
    default:
      if (isascii(*p))
        break; // Copied after the switch statement.
      if (p+2 < end && isUnicodeNewline(p))
      { // Replace the three code units of the newline with '\n'.
        p += 2;
        goto case '\n';
      }
      auto p2 = p; // Beginning of the UTF-8 sequence.
      dchar c = decode(p, end);
      if (c == ERROR_CHAR)
      { // Skip to next ASCII character or valid UTF-8 sequence.
        while (++p < end && isTrailByte(*p))
        {}
        alias REPLACEMENT_STR R;
        if (q+2 < p) // Copy replacement char if there is enough space.
          (*q = R[0]), (*++q = R[1]), (*++q = R[2]);
        else
          // Not enough space. Keep the first byte of the sequence.
          // Without this, *q would keep a stale byte whenever the writer
          // is behind the reader, as q is advanced in any case.
          *q = *p2;
        p--; // Compensate for the p++ of the loop.
      }
      else
      { // Copy the valid UTF-8 sequence.
        while (p2 <= p) // p points to the last trail byte.
          *q++ = *p2++; // Copy code units.
        q--; // Compensate for the q++ of the loop.
      }
      continue;
    }
    assert(isascii(*p));
    *q = *p;
  }
  assert(p == end);
  // Cut off the bytes that were saved by the replacements.
  text.length = text.length - (p - q);
  //text = text.ptr[0 .. q - text.ptr]; // Another way.
  return text;
}
306
unittest
{
  Stdout("Testing function Converter.\n");

  // A test case: the raw input, and the UTF-8 text it must convert to.
  struct Sample
  {
    char[] raw;
    char[] wanted = "source";

    // The raw input as the byte array that data2UTF8() takes.
    ubyte[] data()
    {
      return cast(ubyte[])raw;
    }
  }

  // The word "source" in every supported encoding.
  const Sample[] samples = [
    // Without BOM
    {"source"},
    {"s\0o\0u\0r\0c\0e\0"},
    {"\0s\0o\0u\0r\0c\0e"},
    {"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
    {"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
    // With BOM
    {"\xEF\xBB\xBFsource"},
    {"\xFE\xFF\0s\0o\0u\0r\0c\0e"},
    {"\xFF\xFEs\0o\0u\0r\0c\0e\0"},
    {"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
    {"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
  ];

  auto conv = Converter("", new InfoManager);
  foreach (index, sample; samples)
  {
    auto utf8 = conv.data2UTF8(sample.data);
    assert(utf8 == sample.wanted, Format("failed at item {}", index));
  }
}