comparison basic/LiteralParsing.d @ 106:89db676fbacb

Now capable of understanding strings.
author Anders Johnsen <skabet@gmail.com>
date Thu, 22 May 2008 12:09:11 +0200
parents
children c658172ca8a0
comparison
equal deleted inserted replaced
105:f1282c5fe8e3 106:89db676fbacb
1 module basic.LiteralParsing.d;
2
3 import basic.SourceLocation,
4 basic.Message;
5
6 import tango.io.Stdout,
7 tango.core.BitManip,
8 Integer = tango.text.convert.Integer,
9 tango.text.Util;
10
/**
 * Character type a parsed string literal is tagged with.
 *
 * printString uses this tag to decide how the raw bytes of a String
 * are reinterpreted before being written out.
 */
enum StructType
{
    Char,   /// bytes are viewed as char[]
    WChar,  /// bytes are viewed as wchar[]
    DChar   /// bytes are viewed as dchar[]
}
17
/**
 * A parsed string literal: the bytes produced by the parse functions in
 * this module together with the character type they are tagged with.
 */
struct String
{
    /// How `data` is to be interpreted; a default-initialised String
    /// carries the first enum member, StructType.Char.
    StructType type;

    /// Contents of the literal as appended by the parse functions.
    ubyte[] data;
}
23
/**
 * Result of parseEscapeSequence for a single escape sequence.
 */
private struct EscapeReturn
{
    /// Bytes the escape sequence expands to.
    ubyte[] data;

    /// Number of source characters the caller advances by.
    int length;
}
29
/**
 * Parses one string literal and returns its decoded contents.
 *
 * The first character of `str` selects the parser:
 *   'r'  wysiwyg string, parsed from the character after the prefix
 *   '`'  wysiwyg string
 *   '"'  double quoted string with escape sequences
 *   'x'  hex string, parsed from the character after the prefix
 * Any other first character is reported as InvalidStrPrefix.
 *
 * The original text and the parsed result are also written out through
 * printString before returning.
 *
 * Params:
 *   str      = the literal including prefix and delimiters; it must not
 *              be empty, the first character is read unconditionally
 *   loc      = source location of str[0]
 *   messages = receiver for diagnostics
 *
 * Returns: the parsed String (empty data when the prefix is invalid).
 */
String parseString(char[] str, SourceLocation loc, MessageHandler messages)
{
    String result;

    // Grow, then truncate again -- presumably to reserve room for the
    // appends done by the parsers below (TODO confirm).
    result.data.length = str.length;
    result.data.length = 0;

    char prefix = str[0];

    if (prefix == 'r')
        result = parseWysiwygString(str[1..$], result);
    else if (prefix == '`')
        result = parseWysiwygString(str, result);
    else if (prefix == '"')
        result = parseDoubleQuotedString(str, result, loc, messages);
    else if (prefix == 'x')
        result = parseHexString(str[1..$], result, loc + 1, messages);
    else
        messages.report(InvalidStrPrefix, loc, loc + 1);

    printString(str, result);

    return result;
}
59
/**
 * Parses the body of a hex string literal (the part after the `x`
 * prefix) and appends one byte to `strBuf.data` per pair of hex digits.
 *
 * Whitespace between digits is skipped; any other non-hex character is
 * reported as InvalidHexStrChar. Scanning stops at the closing `"` or,
 * for an unterminated literal, at the end of `str`.
 *
 * Params:
 *   str      = literal text starting at the opening quote
 *   strBuf   = buffer the decoded bytes are appended to
 *   loc      = source location of str[0]
 *   messages = receiver for diagnostics
 *
 * Returns: strBuf with the decoded bytes appended.
 */
String parseHexString(char[] str, String strBuf,
        SourceLocation loc, MessageHandler messages)
{
    char[] hex = "0123456789abcdefABCDEF";
    // Space, tab, vertical tab, form feed and both end-of-line characters.
    char[] whitespace = " \t\v\f\r\n";
    char[] hexBuf;

    // Index 0 is the opening quote, so scanning starts at 1. The bounds
    // test keeps an unterminated literal from running off the end.
    for (int i = 1; i < str.length && str[i] != '"'; i++)
    {
        char c = str[i];

        if (hex.contains(c))
        {
            hexBuf ~= c;
            if (hexBuf.length == 2)
            {
                strBuf.data ~= cast(ubyte) Integer.toInt(hexBuf, 16);
                hexBuf.length = 0;
            }
        }
        else if (!whitespace.contains(c))
            messages.report(InvalidHexStrChar, loc + i, loc + i + 1);
    }

    // NOTE(review): a single trailing digit left in hexBuf is dropped
    // without a diagnostic; no message for an odd digit count is visible
    // in this file.
    return strBuf;
}
91
92
/**
 * Parses a double quoted string literal and appends its contents to
 * `strBuf.data`, expanding escape sequences through parseEscapeSequence.
 *
 * Scanning stops at the closing `"` or, for an unterminated literal,
 * at the end of `str`.
 *
 * Params:
 *   str      = literal text starting at the opening quote
 *   strBuf   = buffer the decoded bytes are appended to
 *   loc      = source location of str[0]
 *   messages = receiver for diagnostics raised by escape sequences
 *
 * Returns: strBuf with the decoded bytes appended.
 */
String parseDoubleQuotedString(char[] str, String strBuf,
        SourceLocation loc, MessageHandler messages)
{
    int i = 1; // index 0 is the opening quote

    // The bounds test comes first so that even a lone `"` (length 1) is
    // never indexed out of range.
    while (i < str.length && str[i] != '"')
    {
        if (str[i] == '\\')
        {
            // Hand the rest of the literal to the escape parser; it
            // reports how many characters the sequence occupied.
            EscapeReturn res = parseEscapeSequence(str[i..$], loc + i, messages);
            strBuf.data ~= res.data;
            i += res.length;
        }
        else
        {
            strBuf.data ~= cast(ubyte) str[i];
            i++;
        }
    }

    return strBuf;
}
117
/**
 * Collects the hex digits of a \x, \u or \U escape sequence.
 *
 * Params:
 *   str      = text starting at the backslash
 *   digits   = number of hex digits the escape requires
 *   loc      = source location of the backslash
 *   messages = receiver for diagnostics
 *   hexBuf   = receives the valid hex digits that were found
 *   consumed = receives the number of characters the caller should skip
 *
 * Returns: true only when exactly `digits` valid hex digits were read.
 */
private bool readEscapeHex(char[] str, int digits, SourceLocation loc,
        MessageHandler messages, out char[] hexBuf, out int consumed)
{
    char[] hex = "0123456789abcdefABCDEF";
    int total = digits + 2; // backslash + selector + digits

    // At least one character must follow the escape itself --
    // presumably the closing quote of the literal (TODO confirm).
    if (str.length <= total)
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        consumed = str.length - 1;
        return false;
    }

    for (int i = 2; i < total; i++)
    {
        if (hex.contains(str[i]))
            hexBuf ~= str[i];
        else
            // Arguments: 1-based index of the offending digit and the
            // number of digits this escape requires.
            messages.report(StringHexInvalid, loc + i, loc + i + 1)
                .arg(Integer.toString(i - 1))
                .arg(Integer.toString(digits));
    }

    consumed = total;
    return hexBuf.length == digits;
}

/**
 * Decodes the escape sequence at the start of `str`.
 *
 * Handles the single character escapes (\' \" \? \\ \a \b \f \n \r \t
 * \v), \xHH, \uHHHH, \UHHHHHHHH and octal escapes of one to three
 * digits. Anything else is reported as InvalidStrEscape.
 *
 * No data is produced for an escape that was reported as malformed;
 * `length` is still set so the caller keeps making progress.
 *
 * Params:
 *   str      = text starting at the backslash
 *   loc      = source location of the backslash
 *   messages = receiver for diagnostics
 *
 * Returns: the decoded bytes and the number of characters consumed.
 */
EscapeReturn parseEscapeSequence(char[] str,
        SourceLocation loc, MessageHandler messages)
{
    EscapeReturn res;

    // A backslash with nothing after it has no selector to dispatch on.
    if (str.length < 2)
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        res.length = str.length;
        return res;
    }

    char[] hexBuf;

    switch (str[1])
    {
        case '\'': res.length = 2; res.data ~= cast(ubyte) '\''; break;
        case '"':  res.length = 2; res.data ~= cast(ubyte) '\"'; break;
        case '?':  res.length = 2; res.data ~= cast(ubyte) '\?'; break;
        case '\\': res.length = 2; res.data ~= cast(ubyte) '\\'; break;
        case 'a':  res.length = 2; res.data ~= cast(ubyte) '\a'; break;
        case 'b':  res.length = 2; res.data ~= cast(ubyte) '\b'; break;
        case 'f':  res.length = 2; res.data ~= cast(ubyte) '\f'; break;
        case 'n':  res.length = 2; res.data ~= cast(ubyte) '\n'; break;
        case 'r':  res.length = 2; res.data ~= cast(ubyte) '\r'; break;
        case 't':  res.length = 2; res.data ~= cast(ubyte) '\t'; break;
        case 'v':  res.length = 2; res.data ~= cast(ubyte) '\v'; break;

        case 'x':
            // \xHH yields the raw byte value.
            if (readEscapeHex(str, 2, loc, messages, hexBuf, res.length))
                res.data ~= cast(ubyte) Integer.toInt(hexBuf, 16);
            break;

        case 'u':
        case 'U':
            // \uHHHH and \UHHHHHHHH name a code point that is stored
            // in its UTF-8 encoding.
            int digits = (str[1] == 'u') ? 4 : 8;
            if (readEscapeHex(str, digits, loc, messages, hexBuf, res.length))
            {
                uint codePoint = cast(uint) Integer.toLong(hexBuf, 16);
                if (!isValidUtf8(codePoint))
                    messages.report(InvalidUtf8Hex, loc, loc + digits + 2);
                else
                    res.data ~= parseToUtf8(codePoint);
            }
            break;

        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            char[] octBuf;
            octBuf ~= str[1];
            res.length = 2;

            // Up to two further octal digits may follow; stop at the
            // first non-octal character or at the end of the input.
            for (int i = 2; i < 4 && i < str.length; i++)
            {
                if (str[i] < '0' || str[i] > '7')
                    break;
                octBuf ~= str[i];
                res.length += 1;
            }

            // NOTE(review): three digit values above \377 do not fit in
            // one byte and are truncated here without a diagnostic.
            res.data ~= cast(ubyte) Integer.toLong(octBuf, 8);
            break;

        default:
            messages.report(InvalidStrEscape, loc, loc + 2);
            res.length = 2;
    }

    return res;
}
271
/**
 * Parses a wysiwyg string: everything between the opening delimiter
 * `str[0]` and its next occurrence is copied verbatim, with no escape
 * processing.
 *
 * An empty `str` yields `strBuf` unchanged; when the closing delimiter
 * is missing, everything up to the end of `str` is taken.
 *
 * Params:
 *   str    = literal text starting at the opening delimiter
 *   strBuf = buffer the contents are appended to
 *
 * Returns: strBuf with the literal's contents appended.
 */
String parseWysiwygString(char[] str, String strBuf)
{
    if (str.length == 0)
        return strBuf;

    char delimiter = str[0];

    // Locate the closing delimiter without running past the input.
    int end = 1;
    while (end < str.length && str[end] != delimiter)
        end++;

    // Append the body in one go instead of byte by byte.
    strBuf.data ~= cast(ubyte[]) str[1 .. end];
    return strBuf;
}
285
/**
 * Encodes the code point `i` as UTF-8.
 *
 * Params:
 *   i = code point to encode
 *
 * Returns: one to four bytes holding the UTF-8 encoding, or an empty
 *          array when `i` is above 0x10FFFF and cannot be encoded.
 */
ubyte[] parseToUtf8(uint i)
{
    // Continuation bytes carry six payload bits below the 10xxxxxx
    // marker; the lead byte's marker encodes the sequence length.
    if (i <= 0x00007F)
        return [cast(ubyte) i];

    if (i <= 0x0007FF)
        return [cast(ubyte)(0xC0 | (i >> 6)),
                cast(ubyte)(0x80 | (i & 0x3F))];

    if (i <= 0x00FFFF)
        return [cast(ubyte)(0xE0 | (i >> 12)),
                cast(ubyte)(0x80 | ((i >> 6) & 0x3F)),
                cast(ubyte)(0x80 | (i & 0x3F))];

    if (i <= 0x10FFFF)
        return [cast(ubyte)(0xF0 | (i >> 18)),
                cast(ubyte)(0x80 | ((i >> 12) & 0x3F)),
                cast(ubyte)(0x80 | ((i >> 6) & 0x3F)),
                cast(ubyte)(0x80 | (i & 0x3F))];

    // Out of range: return an empty array rather than falling off the
    // end of the function.
    return null;
}
327
/**
 * Tells whether the code point `i` can be encoded as UTF-8.
 *
 * Params:
 *   i = code point to check
 *
 * Returns: true for values up to 0x10FFFF that are not in the surrogate
 *          range 0xD800..0xDFFF; false otherwise.
 */
bool isValidUtf8(uint i)
{
    // Surrogates are reserved for UTF-16 and are not legal in UTF-8.
    if (i >= 0xD800 && i <= 0xDFFF)
        return false;

    return i <= 0x10FFFF;
}
334
/**
 * Writes the original literal text and its parsed contents to Stdout,
 * each followed by a newline.
 *
 * Params:
 *   str    = the literal as it appeared in the source
 *   strBuf = the parsed result; its `type` selects whether `data` is
 *            printed as char[], wchar[] or dchar[]
 */
void printString(char[] str, String strBuf)
{
    // The first line is the same for every character type.
    Stdout(str)(" have become").newline();

    switch (strBuf.type)
    {
        case StructType.Char:
            Stdout(cast(char[]) strBuf.data).newline;
            break;
        case StructType.WChar:
            Stdout(cast(wchar[]) strBuf.data).newline;
            break;
        case StructType.DChar:
            Stdout(cast(dchar[]) strBuf.data).newline;
            break;
    }
}