Mercurial > projects > dil
comparison src/dil/Converter.d @ 806:bcb74c9b895c
Moved out files in the trunk folder to the root.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Sun, 09 Mar 2008 00:12:19 +0100 |
parents | trunk/src/dil/Converter.d@3b34f6a95a27 |
children |
comparison
equal
deleted
inserted
replaced
805:a3fab8b74a7d | 806:bcb74c9b895c |
---|---|
1 /++ | |
2 Author: Aziz Köksal | |
3 License: GPL3 | |
4 +/ | |
5 module dil.Converter; | |
6 | |
7 import dil.Information; | |
8 import dil.Location; | |
9 import dil.Unicode; | |
10 import dil.FileBOM; | |
11 import dil.lexer.Funcs; | |
12 import dil.Messages; | |
13 import common; | |
14 | |
/// Converts various Unicode encoding formats to UTF-8.
///
/// Problems found while converting are appended to infoMan as LexerError
/// objects, and the offending character is replaced with REPLACEMENT_CHAR.
struct Converter
{
  char[] filePath; /// For error messages.
  InfoManager infoMan; /// Receives the errors reported during conversion.

  /// Creates a Converter that reports errors for filePath to infoMan.
  static Converter opCall(char[] filePath, InfoManager infoMan)
  {
    Converter conv;
    conv.filePath = filePath;
    conv.infoMan = infoMan;
    return conv;
  }

  /// Byte-swaps c (reverses the order of its four bytes.)
  dchar swapBytes(dchar c)
  {
    return c = (c << 24) |
              ((c >> 8) & 0xFF00) |
              ((c << 8) & 0xFF0000) |
              (c >> 24);
  }

  /// Byte-swaps c (exchanges its two bytes.)
  wchar swapBytes(wchar c)
  {
    return (c << 8) | (c >> 8);
  }

  /// Swaps the bytes of c on a little-endian machine.
  dchar BEtoMachineDword(dchar c)
  {
    version(LittleEndian)
      return swapBytes(c);
    else
      return c;
  }

  /// Swaps the bytes of c on a big-endian machine.
  dchar LEtoMachineDword(dchar c)
  {
    version(LittleEndian)
      return c;
    else
      return swapBytes(c);
  }

  /// Swaps the bytes of c on a little-endian machine.
  wchar BEtoMachineWord(wchar c)
  {
    version(LittleEndian)
      return swapBytes(c);
    else
      return c;
  }

  /// Swaps the bytes of c on a big-endian machine.
  wchar LEtoMachineWord(wchar c)
  {
    version(LittleEndian)
      return c;
    else
      return swapBytes(c);
  }

  /// Converts a UTF-32 text to UTF-8.
  ///
  /// Characters rejected by isValidChar() are reported and replaced with
  /// REPLACEMENT_CHAR. Trailing bytes that don't form a complete
  /// 4-byte unit are ignored and reported once at the end.
  /// Returns: the converted text, or null if data is empty.
  char[] UTF32toUTF8(bool isBigEndian)(ubyte[] data)
  {
    if (data.length == 0)
      return null;

    char[] result;
    uint lineNum = 1; // Only used for the location of error messages.
    dchar[] text = cast(dchar[]) data[0 .. $-($%4)]; // Trim to multiple of 4.
    foreach (dchar c; text)
    {
      static if (isBigEndian)
        c = BEtoMachineDword(c);
      else
        c = LEtoMachineDword(c);

      if (!isValidChar(c))
      {
        infoMan ~= new LexerError(
          new Location(filePath, lineNum),
          Format(MSG.InvalidUTF32Character, c)
        );
        c = REPLACEMENT_CHAR;
      }

      if (isNewline(c))
        ++lineNum;
      dil.Unicode.encode(result, c);
    }

    if (data.length % 4)
      infoMan ~= new LexerError(
        new Location(filePath, lineNum),
        MSG.UTF32FileMustBeDivisibleBy4
      );

    return result;
  }

  alias UTF32toUTF8!(true) UTF32BEtoUTF8; /// Instantiation for UTF-32 BE.
  alias UTF32toUTF8!(false) UTF32LEtoUTF8; /// Instantiation for UTF-32 LE.

  /// Converts a UTF-16 text to UTF-8.
  ///
  /// Surrogate pairs are combined into a single code point. Every unpaired
  /// surrogate (a low surrogate without a preceding high surrogate, or
  /// a high surrogate that is not followed by a low surrogate) is reported
  /// and replaced with REPLACEMENT_CHAR. A trailing odd byte is ignored
  /// and reported once at the end.
  /// Returns: the converted text, or null if data is empty.
  char[] UTF16toUTF8(bool isBigEndian)(ubyte[] data)
  {
    if (data.length == 0)
      return null;

    wchar[] text = cast(wchar[]) data[0 .. $-($%2)]; // Trim to multiple of two.
    wchar* p = text.ptr,
         end = text.ptr + text.length;
    char[] result;
    uint lineNum = 1; // Only used for the location of error messages.

    for (; p < end; p++)
    {
      dchar c = *p;
      static if (isBigEndian)
        c = BEtoMachineWord(c);
      else
        c = LEtoMachineWord(c);

      if (0xD800 <= c && c <= 0xDFFF)
      { // c is a surrogate code unit; it's only valid as part of a pair.
        bool decodedPair = false;
        if (c <= 0xDBFF && p+1 < end)
        { // High surrogate with a following code unit: try to decode a pair.
          wchar c2 = p[1];
          static if (isBigEndian)
            c2 = BEtoMachineWord(c2);
          else
            c2 = LEtoMachineWord(c2);

          if (0xDC00 <= c2 && c2 <= 0xDFFF)
          {
            // Same as ((c - 0xD800) << 10) + 0x10000.
            c = (c - 0xD7C0) << 10;
            c |= (c2 & 0x3FF);
            ++p; // Consume the low surrogate as well.
            decodedPair = true;
          }
        }
        if (!decodedPair)
        { // Unpaired surrogate. The following code unit (if any) is not
          // consumed; it is examined in the next iteration.
          infoMan ~= new LexerError(
            new Location(filePath, lineNum),
            Format(MSG.InvalidUTF16Character, c)
          );
          c = REPLACEMENT_CHAR;
        }
      }

      if (isNewline(c))
        ++lineNum;
      dil.Unicode.encode(result, c);
    }

    if (data.length % 2)
      infoMan ~= new LexerError(
        new Location(filePath, lineNum),
        MSG.UTF16FileMustBeDivisibleBy2
      );
    return result;
  }

  alias UTF16toUTF8!(true) UTF16BEtoUTF8; /// Instantiation for UTF-16 BE.
  alias UTF16toUTF8!(false) UTF16LEtoUTF8; /// Instantiation for UTF-16 LE.

  /// Converts the text in data to UTF-8.
  /// Leaves data unchanged if it is in UTF-8 already.
  ///
  /// The encoding is taken from the BOM reported by tellBOM(). When there is
  /// no BOM, the encoding is guessed from the position of the zero bytes
  /// among the first four bytes.
  /// Returns: the UTF-8 text (without the BOM,) or "" if data is empty.
  char[] data2UTF8(ubyte[] data)
  {
    if (data.length == 0)
      return "";

    char[] text;
    BOM bom = tellBOM(data);

    switch (bom)
    {
    case BOM.None:
      // No BOM found. According to the specs the first character
      // must be an ASCII character.
      if (data.length >= 4)
      {
        if (data[0..3] == cast(ubyte[3])x"00 00 00")
        {
          text = UTF32BEtoUTF8(data); // UTF-32BE: 00 00 00 XX
          break;
        }
        else if (data[1..4] == cast(ubyte[3])x"00 00 00")
        {
          text = UTF32LEtoUTF8(data); // UTF-32LE: XX 00 00 00
          break;
        }
      }
      if (data.length >= 2)
      {
        if (data[0] == 0) // UTF-16BE: 00 XX
        {
          text = UTF16BEtoUTF8(data);
          break;
        }
        else if (data[1] == 0) // UTF-16LE: XX 00
        {
          text = UTF16LEtoUTF8(data);
          break;
        }
      }
      text = cast(char[])data; // UTF-8
      break;
    case BOM.UTF8:
      text = cast(char[])data[3..$]; // Skip the 3-byte BOM.
      break;
    case BOM.UTF16BE:
      text = UTF16BEtoUTF8(data[2..$]); // Skip the 2-byte BOM.
      break;
    case BOM.UTF16LE:
      text = UTF16LEtoUTF8(data[2..$]); // Skip the 2-byte BOM.
      break;
    case BOM.UTF32BE:
      text = UTF32BEtoUTF8(data[4..$]); // Skip the 4-byte BOM.
      break;
    case BOM.UTF32LE:
      text = UTF32LEtoUTF8(data[4..$]); // Skip the 4-byte BOM.
      break;
    default:
      assert(0);
    }
    return text;
  }
}
248 | |
/// Replaces invalid UTF-8 sequences with U+FFFD (if there's enough space,)
/// and Newlines with '\n'.
///
/// The text is rewritten in place: p is the read position and q is the
/// write position, which never gets ahead of p. "\r" and "\r\n" become
/// a single '\n', so the result may be shorter than the input.
/// Returns: the adjusted text, or null if text is empty.
string sanitizeText(string text)
{
  if (!text.length)
    return null;

  char* p = text.ptr; // Read position.
  char* end = p + text.length;
  char* q = p; // Write position.

  for (; p < end; p++, q++)
  {
    assert(q <= p);
    switch (*p)
    {
    case '\r':
      if (p+1 < end && p[1] == '\n')
        p++; // Treat "\r\n" as a single newline.
      // Falls through to case '\n'.
    case '\n':
      *q = '\n';
      continue;
    default:
      if (isascii(*p))
        break; // Plain ASCII is copied after the switch.
      // Presumably a 3-byte Unicode newline sequence (hence the skip of
      // two more bytes) - confirm against isUnicodeNewline().
      if (p+2 < end && isUnicodeNewline(p))
      {
        p += 2;
        goto case '\n';
      }
      auto p2 = p; // Beginning of the UTF-8 sequence.
      dchar c = decode(p, end);
      if (c == ERROR_CHAR)
      { // Skip to next ASCII character or valid UTF-8 sequence.
        while (++p < end && isTrailByte(*p))
        {}
        alias REPLACEMENT_STR R;
        if (q+2 < p) // Copy replacement char if there is enough space.
          (*q = R[0]), (*++q = R[1]), (*++q = R[2]);
        // NOTE(review): when there isn't enough space nothing is written
        // to *q, yet q is still advanced by the loop increment, so the
        // byte already stored at that position stays in the result.
        // Confirm whether this is intended.
        p--; // Compensate for the p++ of the loop.
      }
      else
      { // Copy the valid UTF-8 sequence.
        while (p2 <= p) // p points to the last trail byte.
          *q++ = *p2++; // Copy code units.
        q--; // Compensate for the q++ of the loop.
      }
      continue;
    }
    assert(isascii(*p));
    *q = *p;
  }
  assert(p == end);
  // Cut off the bytes that were saved by the replacements.
  text.length = text.length - (p - q);
  //text = text.ptr[0 .. q - text.ptr]; // Another way.
  return text;
}
306 | |
unittest
{
  Stdout("Testing function Converter.\n");

  /// A raw input text together with the UTF-8 text it must convert to.
  struct Sample
  {
    char[] text;
    char[] expected = "source";

    /// Returns the input text as raw bytes.
    ubyte[] data()
    {
      return cast(ubyte[])text;
    }
  }

  // The word "source" in every supported encoding.
  const Sample[] samples = [
    // Without BOM
    {"source"},                                        // UTF-8
    {"s\0o\0u\0r\0c\0e\0"},                            // UTF-16 LE
    {"\0s\0o\0u\0r\0c\0e"},                            // UTF-16 BE
    {"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},    // UTF-32 LE
    {"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},    // UTF-32 BE
    // With BOM
    {"\xEF\xBB\xBFsource"},                            // UTF-8
    {"\xFE\xFF\0s\0o\0u\0r\0c\0e"},                    // UTF-16 BE
    {"\xFF\xFEs\0o\0u\0r\0c\0e\0"},                    // UTF-16 LE
    {"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"}, // UTF-32 BE
    {"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"}, // UTF-32 LE
  ];

  auto conv = Converter("", new InfoManager);
  foreach (index, sample; samples)
  {
    auto actual = conv.data2UTF8(sample.data);
    assert(actual == sample.expected, Format("failed at item {}", index));
  }
}