Mercurial > projects > dang
comparison basic/LiteralParsing.d @ 106:89db676fbacb
Now able to understand strings.
author | Anders Johnsen <skabet@gmail.com> |
---|---|
date | Thu, 22 May 2008 12:09:11 +0200 |
parents | |
children | c658172ca8a0 |
comparison
equal
deleted
inserted
replaced
105:f1282c5fe8e3 | 106:89db676fbacb |
---|---|
1 module basic.LiteralParsing.d; | |
2 | |
3 import basic.SourceLocation, | |
4 basic.Message; | |
5 | |
6 import tango.io.Stdout, | |
7 tango.core.BitManip, | |
8 Integer = tango.text.convert.Integer, | |
9 tango.text.Util; | |
10 | |
/**
 * Element type a parsed string literal is stored as.
 *
 * printString reinterprets String.data as char[], wchar[] or dchar[]
 * depending on this value.
 */
enum StructType
{
    Char,  /// data is viewed as char[]
    WChar, /// data is viewed as wchar[]
    DChar  /// data is viewed as dchar[]
}
17 | |
/**
 * Result of parsing a string literal: the decoded bytes plus a tag
 * telling how those bytes are meant to be viewed.
 */
struct String
{
    /// How `data` should be interpreted (char, wchar or dchar elements).
    StructType type;

    /// Decoded contents of the literal, stored as raw bytes.
    ubyte[] data;
}
23 | |
/**
 * Result of decoding one escape sequence.
 *
 * parseDoubleQuotedString appends `data` to its output and advances its
 * read position by `length`.
 */
private struct EscapeReturn
{
    /// Bytes produced by the escape sequence.
    ubyte[] data;

    /// Number of source characters the escape sequence occupied.
    int length;
}
29 | |
/**
 * Decode a string literal into its raw bytes.
 *
 * The first character of `str` selects the literal kind:
 *   'r'  wysiwyg string with an `r` prefix (prefix is stripped first)
 *   '`'  wysiwyg string delimited by back-ticks
 *   '"'  double quoted string with escape sequences
 *   'x'  hex string (prefix is stripped first)
 * Any other first character is reported as InvalidStrPrefix.
 *
 * Params:
 *   str      = the literal text, including prefix and delimiters
 *   loc      = source location of str[0], used for diagnostics
 *   messages = receiver of diagnostics
 *
 * Returns: the decoded string; `data` is empty when the prefix is
 *          invalid or `str` is empty.
 */
String parseString(char[] str, SourceLocation loc, MessageHandler messages)
{
    String strBuf;
    // Grow, then truncate to zero length.
    // NOTE(review): presumably done to reserve capacity up front so the
    // appends below do not reallocate - confirm.
    strBuf.data.length = str.length;
    strBuf.data.length = 0;

    // An empty input has no prefix character to dispatch on; report it
    // instead of indexing past the end of the array.
    if (str.length == 0)
    {
        messages.report(InvalidStrPrefix, loc, loc + 1);
        return strBuf;
    }

    switch (str[0])
    {
        case 'r':
            strBuf = parseWysiwygString(str[1..$], strBuf);
            break;
        case '`':
            strBuf = parseWysiwygString(str, strBuf);
            break;
        case '"':
            strBuf = parseDoubleQuotedString(str, strBuf, loc, messages);
            break;
        case 'x':
            // The prefix is dropped, so the location moves with it.
            strBuf = parseHexString(str[1..$], strBuf, loc + 1, messages);
            break;
        default:
            messages.report(InvalidStrPrefix, loc, loc + 1);
    }

    // Prints the source text and the decoded result to Stdout.
    printString(str, strBuf);

    return strBuf;
}
59 | |
/**
 * Decode the body of a hex string literal (`x"..."` with the `x` already
 * removed) and append the resulting bytes to `strBuf.data`.
 *
 * Every pair of hex digits becomes one byte. Whitespace between digits
 * (space, tab, vertical tab, form feed, CR, LF) is skipped. Any other
 * character is reported as InvalidHexStrChar and skipped.
 *
 * Params:
 *   str      = literal text starting at the opening double quote
 *   strBuf   = buffer to append to
 *   loc      = source location of str[0]
 *   messages = receiver of diagnostics
 *
 * Returns: `strBuf` with the decoded bytes appended.
 */
String parseHexString(char[] str, String strBuf,
        SourceLocation loc, MessageHandler messages)
{
    int i = 1; // first char is "
    char[] hex = "0123456789abcdefABCDEF";
    char[] whitespace = " \t\v\f\r\n";
    char[] hexBuf;

    // Stop at the closing quote, or at the end of the input if the
    // literal is unterminated.
    while (i < str.length && str[i] != '"')
    {
        if (hex.contains(str[i]))
        {
            hexBuf ~= str[i];
            if (hexBuf.length == 2)
            {
                strBuf.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
                hexBuf.length = 0;
            }
        }
        else if (!whitespace.contains(str[i]))
            messages.report(InvalidHexStrChar, loc + i, loc + i + 1);

        i++;
    }

    // NOTE(review): a trailing unpaired hex digit left in hexBuf is
    // dropped without a diagnostic.
    return strBuf;
}
91 | |
92 | |
/**
 * Decode the body of a double quoted string literal and append the
 * result to `strBuf.data`.
 *
 * Scanning starts after the opening quote and ends at the next unescaped
 * double quote or at the end of `str`. A backslash hands control to
 * parseEscapeSequence; every other character is copied unchanged.
 *
 * Params:
 *   str      = literal text starting at the opening double quote
 *   strBuf   = buffer to append to
 *   loc      = source location of str[0]
 *   messages = receiver of diagnostics
 *
 * Returns: `strBuf` with the decoded bytes appended.
 */
String parseDoubleQuotedString(char[] str, String strBuf,
        SourceLocation loc, MessageHandler messages)
{
    int pos = 1; // skip the opening "

    while (str[pos] != '"')
    {
        if (str[pos] == '\\')
        {
            // The escape parser reports how many source characters it used.
            EscapeReturn esc = parseEscapeSequence(str[pos..$], loc + pos, messages);
            strBuf.data ~= esc.data;
            pos += esc.length;
        }
        else
        {
            strBuf.data ~= str[pos];
            pos++;
        }

        if (pos >= str.length)
            break;
    }

    return strBuf;
}
117 | |
/**
 * Decode one escape sequence.
 *
 * Supported forms: the single character escapes \' \" \? \\ \a \b \f \n
 * \r \t \v, \xHH (one byte), \uHHHH and \UHHHHHHHH (code point, emitted
 * as UTF-8) and octal escapes of one to three digits. Anything else is
 * reported as InvalidStrEscape.
 *
 * Params:
 *   str      = source text starting at the backslash and running to the
 *              end of the literal
 *   loc      = source location of the backslash
 *   messages = receiver of diagnostics
 *
 * Returns: the bytes produced and the number of source characters used.
 */
EscapeReturn parseEscapeSequence(char[] str,
        SourceLocation loc, MessageHandler messages)
{
    EscapeReturn res;

    // A lone backslash at the very end has no selector character.
    if (str.length < 2)
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        res.length = 1;
        return res;
    }

    switch (str[1])
    {
        case '\'': return singleCharEscape('\'');
        case '"':  return singleCharEscape('\"');
        case '?':  return singleCharEscape('\?');
        case '\\': return singleCharEscape('\\');
        case 'a':  return singleCharEscape('\a');
        case 'b':  return singleCharEscape('\b');
        case 'f':  return singleCharEscape('\f');
        case 'n':  return singleCharEscape('\n');
        case 'r':  return singleCharEscape('\r');
        case 't':  return singleCharEscape('\t');
        case 'v':  return singleCharEscape('\v');
        case 'x':
            char[] hexBuf = collectHexDigits(str, 2, loc, messages, res.length);
            res.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
            break;
        case 'u':
            char[] hexBuf = collectHexDigits(str, 4, loc, messages, res.length);
            uint codePoint = cast(uint)Integer.toLong(hexBuf, 16);
            if (!isValidUtf8(codePoint))
                messages.report(InvalidUtf8Hex, loc, loc + 6);
            else
                res.data ~= parseToUtf8(codePoint);
            break;
        case 'U':
            char[] hexBuf = collectHexDigits(str, 8, loc, messages, res.length);
            uint codePoint = cast(uint)Integer.toLong(hexBuf, 16);
            if (!isValidUtf8(codePoint))
                messages.report(InvalidUtf8Hex, loc, loc + 10);
            else
                res.data ~= parseToUtf8(codePoint);
            break;
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
            char[] oct = "01234567";
            char[] octBuf;
            octBuf ~= str[1];
            res.length = 2;
            // Up to two further octal digits may follow the first one.
            for (int i = 2; i < 4 && i < str.length; i++)
            {
                if (!oct.contains(str[i]))
                    break;
                octBuf ~= str[i];
                res.length += 1;
            }
            // NOTE(review): values above \377 do not fit in one byte and
            // are truncated here without a diagnostic.
            res.data ~= cast(ubyte)Integer.toLong(octBuf, 8);
            break;
        default:
            messages.report(InvalidStrEscape, loc, loc + 2);
            res.length = 2;
    }

    return res;
}

/// Build the result for a two character escape that yields the byte `c`.
private EscapeReturn singleCharEscape(char c)
{
    EscapeReturn res;
    res.length = 2;
    res.data ~= cast(ubyte)c;
    return res;
}

/**
 * Collect the `digits` hex digits that follow the two character escape
 * prefix (backslash plus selector) at the start of `str`.
 *
 * Non-hex characters in that span are reported as StringHexInvalid with
 * the 1-based digit position and the expected digit count, and are left
 * out of the result. If `str` is too short, StringShortEscape is
 * reported and no digits are collected.
 *
 * `consumed` receives the number of source characters to skip.
 */
private char[] collectHexDigits(char[] str, int digits,
        SourceLocation loc, MessageHandler messages, out int consumed)
{
    char[] hex = "0123456789abcdefABCDEF";
    char[] hexBuf;
    int total = digits + 2; // backslash + selector + digits

    // One extra character is required after the digits; in a well formed
    // literal that is at least the closing quote.
    if (str.length - 1 >= total)
    {
        for (int i = 2; i < total; i++)
            if (hex.contains(str[i]))
                hexBuf ~= str[i];
            else
                messages.report(StringHexInvalid, loc + i, loc + i + 1)
                    .arg(Integer.toString(i - 1))
                    .arg(Integer.toString(digits));
        consumed = total;
    }
    else
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        consumed = str.length - 1;
    }

    return hexBuf;
}
271 | |
/**
 * Copy the body of a wysiwyg string verbatim into `strBuf.data`.
 *
 * The first character of `str` is taken as the delimiter; everything
 * after it, up to the next occurrence of that same character, is
 * appended byte for byte with no escape processing.
 *
 * Params:
 *   str    = literal text starting at the opening delimiter
 *   strBuf = buffer to append to
 *
 * Returns: `strBuf` with the literal's contents appended.
 */
String parseWysiwygString(char[] str, String strBuf)
{
    char delimiter = str[0];

    for (int pos = 1; str[pos] != delimiter; pos++)
        strBuf.data ~= cast(ubyte)str[pos];

    return strBuf;
}
285 | |
/**
 * Encode the code point `i` as a UTF-8 byte sequence.
 *
 * Params:
 *   i = code point value
 *
 * Returns: one to four bytes for values up to 0x10FFFF, and an empty
 *          array for anything larger. Surrogate values are not rejected
 *          here.
 */
ubyte[] parseToUtf8(uint i)
{
    if (i <= 0x00007F)
        return [cast(ubyte)i];
    else if (i <= 0x0007FF)
    {
        // 110xxxxx 10xxxxxx
        ubyte b = cast(ubyte)(0xC0 | (i >> 6));
        ubyte a = cast(ubyte)(0x80 | (i & 0x3F));
        return [b, a];
    }
    else if (i <= 0x00FFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        ubyte c = cast(ubyte)(0xE0 | (i >> 12));
        ubyte b = cast(ubyte)(0x80 | ((i >> 6) & 0x3F));
        ubyte a = cast(ubyte)(0x80 | (i & 0x3F));
        return [c, b, a];
    }
    else if (i <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        ubyte d = cast(ubyte)(0xF0 | (i >> 18));
        ubyte c = cast(ubyte)(0x80 | ((i >> 12) & 0x3F));
        ubyte b = cast(ubyte)(0x80 | ((i >> 6) & 0x3F));
        ubyte a = cast(ubyte)(0x80 | (i & 0x3F));
        return [d, c, b, a];
    }

    // Out of range: nothing to encode.
    return null;
}
327 | |
/**
 * Tell whether `i` is a code point that can be encoded as UTF-8.
 *
 * Values above 0x10FFFF and the surrogate range 0xD800..0xDFFF are
 * rejected.
 */
bool isValidUtf8(uint i)
{
    if (i > 0x10FFFF)
        return false;
    // Surrogates are reserved for UTF-16 and have no UTF-8 encoding.
    if (i >= 0xD800 && i <= 0xDFFF)
        return false;
    return true;
}
334 | |
/**
 * Write the source text of a literal and its decoded form to Stdout.
 *
 * Output is the source text followed by " have become", a newline, the
 * decoded data viewed according to `strBuf.type`, and another newline.
 *
 * Params:
 *   str    = the literal as written in the source
 *   strBuf = the decoded literal
 */
void printString(char[] str, String strBuf)
{
    switch (strBuf.type)
    {
        case StructType.Char:
            Stdout(str)(" have become").newline()
                (cast(char[])strBuf.data).newline;
            break;
        case StructType.WChar:
            Stdout(str)(" have become").newline()
                (cast(wchar[])strBuf.data).newline;
            break;
        case StructType.DChar:
            Stdout(str)(" have become").newline()
                (cast(dchar[])strBuf.data).newline;
            break;
    }
}