Mercurial > projects > dil
comparison trunk/src/dil/Converter.d @ 518:8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Moved code from dil.File to dil.FileBOM.
Added opCatAssign to class InformationManager.
Added encode() function to dil.Unicode.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Sat, 15 Dec 2007 18:55:06 +0100 |
parents | |
children | 50e64bab9c7a |
comparison
equal
deleted
inserted
replaced
517:b465c669d70c | 518:8f86bb9ef715 |
---|---|
1 /++ | |
2 Author: Aziz Köksal | |
3 License: GPL3 | |
4 +/ | |
5 module dil.Converter; | |
6 | |
7 import dil.Information; | |
8 import dil.Location; | |
9 import dil.Unicode; | |
10 import dil.FileBOM; | |
11 import common; | |
12 | |
/// Converts various Unicode encoding formats to UTF-8.
///
/// Problems found while converting (odd byte lengths, invalid code units)
/// are reported through infoMan. Offending characters are replaced with
/// REPLACEMENT_CHAR, so a conversion never aborts half-way.
struct Converter
{
  char[] filePath; /// For error messages.
  InformationManager infoMan; /// Receives the errors found during conversion.

  /// Constructs a Converter from a file path and an information manager.
  static Converter opCall(char[] filePath, InformationManager infoMan)
  {
    Converter conv;
    conv.filePath = filePath;
    conv.infoMan = infoMan;
    return conv;
  }

  /// Returns c with the order of its four bytes reversed.
  dchar swapBytes(dchar c)
  {
    return cast(dchar)(
      (c << 24) |
      ((c >> 8) & 0xFF00) |
      ((c << 8) & 0xFF0000) |
      (c >> 24)
    );
  }

  /// Returns c with its two bytes exchanged.
  wchar swapBytes(wchar c)
  {
    // The operands are promoted to int; the cast keeps the low 16 bits only.
    return cast(wchar)((c << 8) | (c >> 8));
  }

  /// Converts a big-endian 32-bit code unit to the byte order of the machine.
  dchar BEtoMachineDword(dchar c)
  {
    version(LittleEndian)
      return swapBytes(c);
    else
      return c;
  }

  /// Converts a little-endian 32-bit code unit to the byte order of the machine.
  dchar LEtoMachineDword(dchar c)
  {
    version(LittleEndian)
      return c;
    else
      return swapBytes(c);
  }

  /// Converts a big-endian 16-bit code unit to the byte order of the machine.
  wchar BEtoMachineWord(wchar c)
  {
    version(LittleEndian)
      return swapBytes(c);
    else
      return c;
  }

  /// Converts a little-endian 16-bit code unit to the byte order of the machine.
  wchar LEtoMachineWord(wchar c)
  {
    version(LittleEndian)
      return c;
    else
      return swapBytes(c);
  }

  /// Converts UTF-32 encoded data to UTF-8.
  ///
  /// Params:
  ///   isBigEndian = true if the code units in data are stored big-endian,
  ///                 false if they are stored little-endian.
  ///   data = the raw bytes of the text (without a BOM).
  ///
  /// Returns: the UTF-8 text, or null if data holds no complete code unit.
  ///
  /// Trailing bytes that do not form a whole code unit are reported and
  /// dropped. Invalid characters are reported and replaced with
  /// REPLACEMENT_CHAR.
  char[] UTF32toUTF8(bool isBigEndian)(ubyte[] data)
  {
    if (data.length % 4)
    {
      infoMan ~= new LexerError(new Location(filePath, 0),
        "the byte length of a UTF-32 source file must be divisible by 4."
      );
      data = data[0 .. $ - $ % 4]; // Trim to valid size.
    }

    if (data.length == 0)
      return null;

    char[] result;
    foreach (dchar c; cast(dchar[])data)
    {
      static if (isBigEndian)
        c = BEtoMachineDword(c);
      else
        c = LEtoMachineDword(c);

      if (!isValidChar(c))
      {
        // TODO: correct location.
        auto loc = new Location(filePath, 0);
        infoMan ~= new LexerError(loc, Format("invalid UTF-32 character '{:X}'.", c));
        c = REPLACEMENT_CHAR;
      }

      dil.Unicode.encode(result, c);
    }
    return result;
  }

  alias UTF32toUTF8!(true) UTF32BEtoUTF8;
  alias UTF32toUTF8!(false) UTF32LEtoUTF8;

  /// Converts UTF-16 encoded data to UTF-8.
  ///
  /// Params:
  ///   isBigEndian = true if the code units in data are stored big-endian,
  ///                 false if they are stored little-endian.
  ///   data = the raw bytes of the text (without a BOM).
  ///
  /// Returns: the UTF-8 text, or null if data holds no complete code unit.
  ///
  /// A trailing odd byte is reported and dropped. Surrogate pairs are
  /// combined into a single code point. Unpaired surrogates are reported
  /// and replaced with REPLACEMENT_CHAR.
  char[] UTF16toUTF8(bool isBigEndian)(ubyte[] data)
  {
    if (data.length % 2)
    {
      infoMan ~= new LexerError(new Location(filePath, 0),
        "the byte length of a UTF-16 source file must be divisible by 2."
      );
      data = data[0 .. $-1]; // Trim to valid size.
    }

    if (data.length == 0)
      return null;

    wchar[] text = cast(wchar[])data;
    wchar* p = text.ptr,
           end = text.ptr + text.length;
    char[] result;

    for (; p < end; ++p)
    {
      // Read the current code unit anew on every iteration.
      dchar c;
      static if (isBigEndian)
        c = BEtoMachineWord(*p);
      else
        c = LEtoMachineWord(*p);

      if (c < 0xD800 || c > 0xDFFF)
      {} // Not a surrogate; c is already a complete code point.
      else
      {
        bool isPaired = false;

        // Only a high surrogate (D800..DBFF) followed by another code unit
        // can start a valid pair.
        if (c <= 0xDBFF && p+1 < end)
        {
          wchar c2;
          static if (isBigEndian)
            c2 = BEtoMachineWord(p[1]);
          else
            c2 = LEtoMachineWord(p[1]);

          if (0xDC00 <= c2 && c2 <= 0xDFFF)
          {
            // (c - 0xD7C0) << 10 equals ((c - 0xD800) << 10) + 0x10000.
            c = (c - 0xD7C0) << 10;
            c |= (c2 & 0x3FF);
            ++p; // Skip the low surrogate that was just consumed.
            isPaired = true;
          }
        }

        if (!isPaired)
        {
          // Lone low surrogate, or a high surrogate without a partner.
          // TODO: correct location.
          auto loc = new Location(filePath, 0);
          infoMan ~= new LexerError(loc, Format("invalid UTF-16 character '{:X}'.", c));
          c = REPLACEMENT_CHAR;
        }
      }

      dil.Unicode.encode(result, c);
    }
    return result;
  }

  alias UTF16toUTF8!(true) UTF16BEtoUTF8;
  alias UTF16toUTF8!(false) UTF16LEtoUTF8;

  /// Converts the raw bytes of a file to UTF-8.
  ///
  /// The encoding is taken from the BOM reported by tellBOM(). Without a
  /// BOM the encoding is guessed from the position of the zero bytes
  /// around the first character; if nothing matches, data is taken to be
  /// UTF-8 already.
  ///
  /// Params:
  ///   data = the raw bytes of the file, including a BOM if there is one.
  ///
  /// Returns: the UTF-8 text (with the BOM removed), or null if data is empty.
  char[] data2UTF8(ubyte[] data)
  {
    if (data.length == 0)
      return null;

    char[] text;
    BOM bom = tellBOM(data);

    switch (bom)
    {
    case BOM.None:
      // No BOM found. According to the specs the first character
      // must be an ASCII character.
      if (data.length >= 4)
      {
        if (data[0..3] == cast(ubyte[3])x"00 00 00")
        {
          text = UTF32BEtoUTF8(data); // UTF-32BE: 00 00 00 XX
          break;
        }
        else if (data[1..4] == cast(ubyte[3])x"00 00 00")
        {
          text = UTF32LEtoUTF8(data); // UTF-32LE: XX 00 00 00
          break;
        }
      }
      if (data.length >= 2)
      {
        if (data[0] == 0) // UTF-16BE: 00 XX
        {
          text = UTF16BEtoUTF8(data);
          break;
        }
        else if (data[1] == 0) // UTF-16LE: XX 00
        {
          text = UTF16LEtoUTF8(data);
          break;
        }
      }
      text = cast(char[])data; // UTF-8
      break;
    case BOM.UTF8:
      text = cast(char[])data[3..$]; // Skip the 3 BOM bytes.
      break;
    case BOM.UTF16BE:
      text = UTF16BEtoUTF8(data[2..$]); // Skip the 2 BOM bytes.
      break;
    case BOM.UTF16LE:
      text = UTF16LEtoUTF8(data[2..$]);
      break;
    case BOM.UTF32BE:
      text = UTF32BEtoUTF8(data[4..$]); // Skip the 4 BOM bytes.
      break;
    case BOM.UTF32LE:
      text = UTF32LEtoUTF8(data[4..$]);
      break;
    default:
      assert(0);
    }
    return text;
  }
}