Mercurial > projects > mde
comparison mde/mergetag/parse/parseTo.d @ 70:7fc0a8295c83
Moved my parseTo and parseFrom modules from tango.scrapple to mde in order to reduce dependencies.
author | Diggory Hardy <diggory.hardy@gmail.com> |
---|---|
date | Fri, 04 Jul 2008 19:04:16 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
69:ead4afc6d0b8 | 70:7fc0a8295c83 |
---|---|
1 /************************************************************************************************** | |
2 * copyright: Copyright (c) 2007-2008 Diggory Hardy. | |
3 * | |
4 * author: Diggory Hardy, diggory.hardy@gmail.com | |
5 * | |
6 * license: BSD style: $(LICENSE) | |
7 * | |
8 * This contains templates for converting a char[] to various data-types. | |
9 * | |
10 * parseTo is roughly the inverse of $(B parseFrom) and should read any data output by $(B parseFrom). | |
11 * It is also available in tango.scrapple. | |
12 * | |
13 * This module basically implements the following templated function for most basic D types: | |
14 * bool, byte, short, int, long, ubyte, ushort, uint, ulong, float, double, real, char. | |
15 * It also supports arrays and associative arrays of any supported type (including of other arrays) | |
16 * and has special handling for strings (char[]) and binary (ubyte[]) data-types. | |
17 * ----------------------------- | |
18 * T parseTo(T) (char[] source); | |
19 * ----------------------------- | |
20 * | |
21 * $(I source) is the string to parse, and data of the templated type that is read from the string | |
22 * is returned. See the examples to get a better idea of its use. | |
23 * | |
24 * Syntax: | |
25 * The syntax for parsing $(I source) is mostly the same used by D without any prefixes/suffixes | |
26 * (except 0x, 0b & 0o base specifiers). Also a special ubyte[] syntax is supported; see examples. | |
27 * The following escape sequences are supported for strings and characters: \' \" \\ | |
28 * \a \b \f \n \r \t \v . Associative array literals use the same syntax as D, described here: | |
29 * $(LINK http://www.digitalmars.com/d/2.0/expression.html#AssocArrayLiteral). All whitespace is | |
30 * ignored (except of course within strings). | |
31 * | |
32 * There are also some public utility functions with their own documentation. | |
33 * | |
34 * Throws: | |
35 * On errors, a ParseException or a UnicodeException (both extend TextException) is thrown with a | |
36 * suitable message. No other exceptions should be thrown. | |
37 * | |
38 * Remarks: | |
39 * There is currently no support for reading wchar/dchar strings. There are, however, unicode | |
40 * conversions for converting UTF-8 to UTF-16/32. Be careful if converting on a char-by-char basis; | |
41 * such conversions cannot be used for non-ascii characters. | |
42 * | |
43 * Examples: | |
44 * ------------------------------------------------------------------------------------------------ | |
45 * // Basic examples: | |
46 * ulong a = parseTo!(ulong) ("20350"); | |
47 * float d = parseTo!(float) (" 1.2e-9 "); | |
48 * int[] b = parseTo!(int[]) ("[0,1,2,3]"); | |
49 * | |
50 * // String and char[] syntax: | |
51 * char[] c = parseTo!(char[]) ("\"A string\""); | |
52 * char[] e = parseTo!(char[]) ("['a','n','o','t','h','e','r', ' ' ,'s','t','r','i','n','g']"); | |
53 * | |
54 * // These can be used interchangeably; here's a more complex example of an associative array: | |
55 * bool[char[]] f = parseTo!(bool[char[]]) ("[ \"one\":true, ['t','w','o']:false, \"three\":1, \"four\":000 ]"); | |
56 * | |
57 * // There is also a special notation for ubyte[] types: | |
58 * // The digits following 0x must be in pairs and each specify one ubyte. | |
59 * assert ( parseTo!(ubyte[]) (`0x01F2AC`) == parseTo!(ubyte[]) (`[01 ,0xF2, 0xAC]`) ); | |
60 * | |
61 * // There's no limit to the complexity! | |
62 * char[char[][][][char]][bool] z = ...; // don't expect me to write this! | |
63 * ------------------------------------------------------------------------------------------------ | |
64 *************************************************************************************************/ | |
65 | |
66 module mde.mergetag.parse.parseTo; | |
67 | |
68 // tango imports | |
69 import tango.core.Exception : TextException, UnicodeException; | |
70 import cInt = tango.text.convert.Integer; | |
71 import cFloat = tango.text.convert.Float; | |
72 import Utf = tango.text.convert.Utf; | |
73 import Util = tango.text.Util; | |
74 | |
/**
 * Base class for parseTo exceptions.
 *
 * Extends TextException; the parseTo templates and the helper functions in this module throw it
 * with a message describing what was wrong with the source text.
 */
class ParseException : TextException
{
    // Forwards the message unchanged to the TextException constructor.
    this( char[] msg )
    {
        super( msg );
    }
}
85 | |
86 | |
87 //BEGIN parseTo templates | |
88 | |
89 // Associative arrays | |
90 | |
const char[] AA_ERR = "Invalid associative array: ";

/** Parses an associative array literal of the form $(B [key:value, key:value, ...]).
 *
 * The text between the outer brackets is divided into pairs with split(). Each pair is divided at
 * the first ':' found outside a quoted string/char; the left part is parsed with parseTo!(S) and
 * the right part with parseTo!(T).
 *
 * Throws:
 *  ParseException if src is not bracketed, if a quoted key is malformed, or if a pair lacks its
 *  ':' or its data. Anything thrown while parsing a key or a value is propagated.
 */
T[S] parseTo(T : T[S], S) (char[] src) {
    src = Util.trim(src);
    if (src.length < 2 || src[0] != '[' || src[$-1] != ']')
        throw new ParseException (AA_ERR ~ "not [ ... ]");  // bad braces.

    T[S] ret;
    foreach (char[] pair; split (src[1..$-1])) {
        uint i = 0;
        while (i < pair.length) {                   // advance to the ':'
            char c = pair[i];
            if (c == ':') break;
            if (c == '\'' || c == '"') {            // string or character: skip to its closing quote
                ++i;
                while (i < pair.length && pair[i] != c) {
                    if (pair[i] == '\\') {
                        // There must be room for the escaped char AND the closing quote.
                        if (i+2 >= pair.length) throw new ParseException (AA_ERR ~ "unfinished escape sequence within string/char");
                        ++i;                        // escape seq.
                    }
                    ++i;
                }
                if (i == pair.length) {             // ran off the end inside the quotes
                    throw new ParseException (AA_ERR ~ "unterminated string/char in key");
                }
            }
            ++i;
        }
        if (i == pair.length) {                     // no ':' anywhere in this pair
            throw new ParseException (AA_ERR ~ "encountered [ ... KEY] (missing :DATA)");
        }
        if (i + 1 == pair.length) {                 // ':' is the last character: nothing to parse as data
            throw new ParseException (AA_ERR ~ "encountered [ ... KEY:] (missing DATA)");
        }
        ret[parseTo!(S) (pair[0..i])] = parseTo!(T) (pair[i+1..$]);
    }
    return ret;
}
debug (UnitTest) unittest {
    // Both value spellings ("..." and ['.', ...]) must produce the same strings.
    char[][char] expected = ['a':cast(char[])"animal", 'b':['b','u','s']];
    char[][char] parsed = parseTo!(char[][char]) (`['a':"animal", 'b':['b','u','s']]`);

    //FIXME: when the compiler's fixed: http://d.puremagic.com/issues/show_bug.cgi?id=1671
    // a plain  assert (parsed == expected)  should replace the three checks below; at present it
    // fails (compiler bug), and rehashing both arrays first makes no difference.
    assert (parsed.length == expected.length);
    assert (parsed.keys == expected.keys);
    assert (parsed.values == expected.values);
}
137 | |
138 | |
139 // Arrays | |
140 | |
/// Generic array: after trimming, src must be bracketed ([x, ..., z]); the elements are then read
/// by toArray. Throws a ParseException when the brackets are missing.
T[] parseTo(T : T[]) (char[] src) {
    src = Util.trim(src);
    bool bracketed = src.length >= 2 && src[0] == '[' && src[$-1] == ']';
    if (!bracketed) throw new ParseException ("Invalid array: not [x, ..., z]");
    return toArray!(T[]) (src);
}
146 | |
// String (array special case)
/// Reads either a double-quoted string (with escape sequences replaced) or a bracketed array of
/// character literals. Throws a ParseException for anything else, for a string whose final
/// character is a lone backslash, and (via replaceEscapedChar) for unknown escape sequences.
T parseTo(T : char[]) (char[] src) {
    const char[] NOT_A_STRING = "Invalid string: not quoted (\"*\") or char array (['a',...,'c'])";

    src = Util.trim(src);
    if (src.length < 2) throw new ParseException (NOT_A_STRING);

    // Char-array notation is handed to the generic array reader.
    if (src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);
    if (src[0] != '"' || src[$-1] != '"') throw new ParseException (NOT_A_STRING);

    char[] text = src[1..$-1];          // contents between the quotes
    T ret;
    ret.length = text.length;           // upper bound; each escape sequence shrinks the output by one
    uint written = 0;
    uint read = 0;
    while (read < text.length) {
        if (text[read] != '\\') {       // ordinary character: copy through
            ret[written++] = text[read++];
            continue;
        }
        ++read;                         // step over the backslash
        if (read == text.length) throw new ParseException ("Invalid string: ends \\\" !");  // next char is "
        ret[written++] = replaceEscapedChar (text[read++]);    // throws if it's invalid
    }
    return ret[0..written];
}
// Unicode conversions for strings: the source is read as a char[] (UTF-8) and then converted.
// Utf.toString16/32 may throw a UnicodeException, which is deliberately left to propagate.
T parseTo(T : wchar[]) (char[] src) {
    char[] utf8 = parseTo!(char[]) (src);
    return Utf.toString16 (utf8);
}
T parseTo(T : dchar[]) (char[] src) {
    char[] utf8 = parseTo!(char[]) (src);
    return Utf.toString32 (utf8);
}
184 | |
// Binary (array special case)
/// Reads a ubyte[] either from standard array notation ([01, 0xF2, ...]) or from the compact form
/// 0x followed by pairs of hex digits, each pair giving one ubyte.
/// Throws a ParseException for an odd digit count, a non-hex digit, or any other input.
T parseTo(T : ubyte[]) (char[] src) {
    src = Util.trim(src);
    if (src.length >= 2) {
        // Standard case:
        if (src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);

        // Special case: sequence of hex digits
        if (src[0..2] == "0x") {
            char[] digits = src[2..$];

            // Must be in pairs:
            if (digits.length % 2 == 1) throw new ParseException ("Invalid binary: odd number of chars");

            T ret;
            ret.length = digits.length / 2;     // exact
            uint pos = 0;                       // advanced by readHexChar
            for (uint n = 0; n < ret.length; ++n) {
                ubyte high = readHexChar (digits, pos);
                ubyte low  = readHexChar (digits, pos);
                ret[n] = cast(ubyte) ((high << 4) | low);
            }
            return ret;
        }
    }
    throw new ParseException ("Invalid ubyte[]: not an array and doesn't start 0x");
}
209 | |
// Exercises the array, string, wide-string and ubyte[] overloads of parseTo.
debug (UnitTest) unittest {
    assert (parseTo!(double[]) (`[1.0,1.0e-10]`) == [1.0, 1.0e-10]);    // generic array stuff
    assert (parseTo!(double[]) (`[ ]`) == cast(double[]) []);           // empty array

    // char[] and char conversions, with commas, escape sequences and multichar UTF8 characters:
    assert (parseTo!(char[][]) (`[ ".\"", [',','\''] ,"!\b€" ]`) == [ ".\"".dup, [',','\''] ,"!\b€" ]);

    // wchar[] and dchar[] conversions:
    // The characters were pretty much pulled at random from unicode tables.
    // The last few cause some weird (display only) effects in my editor.
    assert (parseTo!(wchar[]) ("\"Test string: ¶α؟अกሀ搀\"") == "Test string: ¶α؟अกሀ搀"w);
    assert (parseTo!(dchar[]) ("\"Test string: ¶α؟अกሀ搀\"") == "Test string: ¶α؟अกሀ搀"d);

    assert (parseTo!(ubyte[]) (`0x01F2AC`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);    // ubyte[] special notation
    assert (parseTo!(ubyte[]) (`[01 ,0xF2, 0xAC]`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);    // ubyte[] std notation
}
226 | |
227 | |
228 // Basic types | |
229 | |
// Char
/** Reads a single-quoted character literal: either 'c' or an escape sequence such as '\n'.
 *
 * Throws:
 *  ParseException if src is not quoted, holds an unfinished or unsupported escape sequence, or
 *  holds more than one character; UnicodeException if the content looks like a multibyte UTF-8
 *  sequence, which cannot be returned as a single char.
 */
T parseTo(T : char) (char[] src) {
    src = Util.trim(src);
    if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'')
        throw new ParseException ("Invalid char: not quoted (e.g. 'c')");

    if (src.length == 3) {
        if (src[1] != '\\') return src[1];                      // plain, non-escaped character
        // '\' : the backslash has nothing to escape
        throw new ParseException ("Invalid char: unfinished escape sequence");
    }
    // Only treat a two-character body as an escape when it really starts with a backslash;
    // otherwise 'ab' would be read as the escape sequence \b.
    if (src.length == 4 && src[1] == '\\') return replaceEscapedChar (src[2]);

    // Report various errors; warnings for likely and difficult to tell cases:
    // Warn in case it's a multibyte UTF-8 character. Every byte of such a sequence has its top
    // bit set, so test 0x80; testing 0xC0 would also match ASCII characters from '@' upwards.
    if (src[1] & 0x80u) throw new UnicodeException ("Invalid char: too long (non-ASCII UTF-8 characters cannot be read as a single character)", 1);
    throw new ParseException ("Invalid char: too long");
}
/* Basic unicode conversions for wide chars.
 *
 * The literal is first read as a single char. A value above 127 is part of a multibyte UTF-8
 * sequence, which would need converting for UTF-16/32; since the remaining bytes are not
 * available the conversion cannot be done, and a UnicodeException is thrown instead. */
const char[] WIDE_CHAR_ERROR = "Error: unicode non-ascii character cannot be converted from a single UTF-8 char";
T parseTo(T : wchar) (char[] src) {
    char c = parseTo!(char) (src);
    if (c > 127u) throw new UnicodeException (WIDE_CHAR_ERROR, 1);
    return cast(wchar) c;       // ASCII maps directly
}
T parseTo(T : dchar) (char[] src) {
    char c = parseTo!(char) (src);
    if (c > 127u) throw new UnicodeException (WIDE_CHAR_ERROR, 1);
    return cast(dchar) c;       // ASCII maps directly
}
debug (UnitTest) unittest {
    // An escaped quote, then a plain ASCII character widened to wchar and dchar.
    char escapedQuote = parseTo!(char) ("\'\\\'\'");
    assert (escapedQuote == '\'');
    wchar wide = parseTo!(wchar) ("'X'");
    assert (wide == 'X');
    dchar wider = parseTo!(dchar) ("'X'");
    assert (wider == 'X');
}
262 | |
// Bool
/// Accepts the words true and false, or a number that evaluates to 0 or 1: any count of leading
/// zeros, optionally followed by a single 1. Throws a ParseException for anything else.
T parseTo(T : bool) (char[] src) {
    src = Util.trim(src);
    if (src == "true") return true;
    if (src == "false") return false;

    uint zeros = 0;                                 // number of leading '0' characters
    while (zeros < src.length && src[zeros] == '0') ++zeros;

    auto remaining = src.length - zeros;            // characters left after the zeros
    if (remaining == 0 && zeros > 0) return false;  // all zeros (but not the empty string)
    if (remaining == 1 && src[zeros] == '1') return true;
    throw new ParseException ("Invalid bool: not true or false and doesn't evaluate to 0 or 1");
}
debug (UnitTest) unittest {
    // Word and numeric spellings, including leading zeros.
    bool[] parsed = parseTo!(bool[]) (`[true,false,01,00]`);
    assert (parsed == cast(bool[]) [1,0,1,0]);
}
277 | |
// Ints
// One overload per integer type; each simply forwards to toTInt instantiated with that type,
// which trims whitespace, converts the digits and range-checks the value against the type.
T parseTo(T : byte) (char[] src) {
    return toTInt!(T) (src);
}
T parseTo(T : short) (char[] src) {
    return toTInt!(T) (src);
}
T parseTo(T : int) (char[] src) {
    return toTInt!(T) (src);
}
T parseTo(T : long) (char[] src) {
    return toTInt!(T) (src);
}
T parseTo(T : ubyte) (char[] src) {
    return toTInt!(T) (src);
}
T parseTo(T : ushort) (char[] src) {
    return toTInt!(T) (src);
}
T parseTo(T : uint) (char[] src) {
    return toTInt!(T) (src);
}
T parseTo(T : ulong) (char[] src) {
    return toTInt!(T) (src);
}
debug (UnitTest) unittest {
    byte negative = parseTo!(byte) ("-5");
    assert (negative == cast(byte) -5);

    // annoyingly, octal syntax differs from D (blame tango): 0o724 here versus 0724 in D.
    uint[] mixedRadix = parseTo!(uint[]) ("[0b0100,0o724,0xFa59c,0xFFFFFFFF,0]");
    assert (mixedRadix == [0b0100u,0724,0xFa59c,0xFFFFFFFF,0]);
}
308 | |
// Floats
// One overload per floating-point type; each simply forwards to toTFloat instantiated with that
// type, which trims whitespace and converts the text with tango's Float.parse.
T parseTo(T : float) (char[] src) {
    return toTFloat!(T) (src);
}
T parseTo(T : double) (char[] src) {
    return toTFloat!(T) (src);
}
T parseTo(T : real) (char[] src) {
    return toTFloat!(T) (src);
}
// Checks one value for each floating-point overload of parseTo. The parsed result is compared
// for exact equality with the corresponding D literal.
debug (UnitTest) unittest {
    assert (parseTo!(float) ("0.0") == 0.0f);
    assert (parseTo!(double) ("-1e25") == -1e25);
    assert (parseTo!(real) ("5.24e-269") == cast(real) 5.24e-269);
}
324 //END parseTo templates | |
325 | |
326 //BEGIN Utility funcs | |
/** Trims whitespace from both ends of a string, then checks for and removes the enclosing array
 * brackets: []
 *
 * Throws:
 *  ParseException if the first and last non-whitespace characters are not '[' and ']'.
 *
 * Returns:
 *  The text between the brackets (whitespace outside the brackets is dropped, whitespace inside
 *  them is kept). Useful for passing to split.
 */
char[] stripBrackets (char[] src) {
    src = Util.trim(src);
    bool bracketed = src.length >= 2 && src[0] == '[' && src[$-1] == ']';
    if (!bracketed) throw new ParseException ("Invalid bracketed string: not [...]");
    return src[1..$-1];
}
340 | |
/** Splits a string into substrings separated by '$(B ,)' with support for characters and strings
 * containing escape sequences and for embedded arrays ($(B [...])).
 *
 * Params:
 *  src = A string to separate on commas. Where used for parsing arrays, the brackets enclosing
 *        the array should be removed before calling this function (stripBrackets can do this).
 *
 * Returns:
 *  An array of substrings within src, excluding the separating commas. src is trimmed as a whole
 *  but whitespace around individual pieces is not stripped, and empty strings may be returned.
 *  An empty (or all-whitespace) src yields an empty array.
 *
 * Throws:
 *  ParseException if a ']' is met with no matching '[' inside src.
 *
 * Remarks:
 *  This function is primarily intended as a utility for the templates parsing arrays and
 *  associative arrays, but it may be useful in other cases too; hence no brackets are stripped
 *  from src. An unterminated string/char is not reported here: the remainder of src becomes the
 *  final piece and the error is left for whatever parses that piece.
 */
char[][] split (char[] src) {
    src = Util.trim (src);
    if (src == "") return [];       // empty array: no elements when no data

    uint depth = 0;                 // surface depth (embedded arrays)
    char[][] ret;
    // Initial guess at the number of pieces. It must never be zero, otherwise the doubling below
    // could not grow the array and short inputs such as "1," would write out of bounds.
    ret.length = src.length / 3 + 1;
    uint k = 0;                     // current split piece
    uint i = 0, j = 0;              // current read location, start of current piece

    while (i < src.length) {
        char c = src[i];
        if (c == '\'' || c == '"') {        // string or character: skip to the closing quote
            ++i;
            while (i < src.length && src[i] != c) {
                if (src[i] == '\\') ++i;    // escape seq.
                ++i;
            }
        }
        else if (c == '[') ++depth;
        else if (c == ']') {
            if (depth) --depth;
            else throw new ParseException ("Invalid array literal: closes before end of data item.");
        }
        else if (c == ',' && depth == 0) {  // only if not an embedded array
            if (ret.length <= k) ret.length = ret.length * 2;
            ret[k++] = src[j..i];           // add this piece and increment k
            j = i + 1;
        }
        ++i;
    }
    // A missing closing quote or a trailing backslash steps i beyond the end of src; pull it back
    // so the final slice stays in range and the malformed piece is rejected by its parser.
    if (i > src.length) i = cast(uint) src.length;
    if (ret.length <= k) ret.length = k + 1;
    ret[k] = src[j..i];             // add final piece (i >= j)
    return ret[0..k+1];
}
392 | |
/* Templated read-int function for the signed and unsigned integer types used by parseTo
 * (byte ... ulong).
 *
 * Actually a reimplementation of tango.text.convert.Integer toLong and parse functions.
 *
 * The digits are converted to an unsigned magnitude, which is then range-checked against TInt
 * with the sign taken into account. A negative zero is accepted for every type and yields 0.
 *
 * Throws:
 *  ParseException if there are no digits, if anything follows the digits, or if the value does
 *  not fit in TInt.
 */
private TInt toTInt(TInt) (char[] src) {
    const char[] INT_OUT_OF_RANGE = "Integer out of range";
    bool sign;
    uint radix, ate, ate2;

    // Trim off whitespace.
    // NOTE: Cannot use tango.text.convert.Integer.trim to trim leading whitespace since it doesn't
    // treat new-lines, etc. as whitespace which for our purposes is whitespace.
    src = Util.trim (src);

    ate = cInt.trim (src, sign, radix);
    if (ate == src.length) throw new ParseException ("Invalid integer: no digits");
    ulong val = cInt.convert (src[ate..$], radix, &ate2);
    ate += ate2;

    if (ate < src.length)
        throw new ParseException ("Invalid integer at marked character: \"" ~ src[0..ate] ~ "'" ~ src[ate] ~ "'" ~ src[ate+1..$] ~ "\"");

    if (sign) {
        // Largest magnitude a negative value may have: |TInt.min|, i.e. TInt.max + 1 for signed
        // types and 0 for unsigned ones. Testing the magnitude avoids both the signed/unsigned
        // comparison pitfalls and the overflow of negating into a long.
        static if (TInt.min < 0) ulong limit = cast(ulong) TInt.max + 1;
        else                     ulong limit = 0;
        if (val > limit) throw new ParseException (INT_OUT_OF_RANGE);
        return cast(TInt) (0 - val);    // wraps to the two's complement negative of val
    }
    if (val > TInt.max) throw new ParseException (INT_OUT_OF_RANGE);
    return cast(TInt) val;
}
423 | |
/* Basically a reimplementation of tango.text.convert.Float.toFloat which strips whitespace from
 * both ends before checking for overlong input.
 *
 * Throws:
 *  ParseException if src is empty after trimming, or if the number does not extend to the end of
 *  the trimmed text (this includes text with no leading number at all).
 */
private TFloat toTFloat(TFloat) (char[] src) {
    // NOTE: As for toTInt(), this needs to strip leading as well as trailing whitespace.
    src = Util.trim (src);
    if (src == "") throw new ParseException ("Invalid float: no digits");
    uint ate;

    TFloat x = cFloat.parse (src, &ate);
    // parse() stops at the first character it cannot use; anything left over is an error.
    if (ate < src.length)
        throw new ParseException ("Invalid float at marked character: \"" ~ src[0..ate] ~ "'" ~ src[ate] ~ "'" ~ src[ate+1..$] ~ "\"");
    return x;
}
435 | |
/* Maps the character following a backslash to the character that escape sequence stands for.
 *
 * Supported escape sequences are the following subset of those supported by D:
 * \" \' \\ \a \b \f \n \r \t \v
 * Any other character causes a ParseException.
 */
private char replaceEscapedChar (char c)
{
    switch (c) {
        case '\"': return '\"';
        case '\'': return '\'';
        case '\\': return '\\';
        case 'a':  return '\a';
        case 'b':  return '\b';
        case 'f':  return '\f';
        case 'n':  return '\n';
        case 'r':  return '\r';
        case 't':  return '\t';
        case 'v':  return '\v';
        default:   break;
    }

    // not a supported escape sequence:
    throw new ParseException ("Invalid escape sequence: \\"~c);
}
479 | |
// Reads the hex digit ([0-9A-Fa-f]) at src[pos], returns its value (0-15) and advances pos by one.
// Throws a ParseException, leaving pos unchanged, for any other character.
// Doesn't check src.length: the caller must ensure pos is a valid index.
private ubyte readHexChar (char[] src, inout uint pos) {
    char digit = src[pos];
    ubyte value;
    if      (digit >= '0' && digit <= '9') value = cast(ubyte) (digit - '0');
    else if (digit >= 'A' && digit <= 'F') value = cast(ubyte) (digit - 'A' + 10);
    else if (digit >= 'a' && digit <= 'f') value = cast(ubyte) (digit - 'a' + 10);
    else throw new ParseException ("Invalid hex digit.");
    ++pos;
    return value;
}
490 | |
// Generic array reader
// Assumes input is of form "[xxxxx]" (i.e. first and last chars are '[', ']' and length >= 2);
// callers check this before calling. The text inside the brackets is divided with split() and
// each piece is parsed with parseTo for the element type.
private T[] toArray(T : T[]) (char[] src) {
    char[][] pieces = split (src[1..$-1]);
    T[] ret = new T[16];        // start with room for 16 to avoid unnecessary allocations
    uint count = 0;
    for (uint n = 0; n < pieces.length; ++n) {
        if (count == ret.length) ret.length = ret.length * 2;
        ret[count++] = parseTo!(T) (pieces[n]);
    }
    return ret[0..count];
}
503 | |
debug (UnitTest) {
    import tango.io.Console;

    unittest {
        Cout ("Running unittest: parseTo ...").flush;

        // Every supported escape sequence, in one quoted string.
        char[] unescaped = parseTo!(char[]) ("\"\\a\\b\\t\\n\\v\\f\\r\\\"\\\'\\\\\"");
        assert (unescaped == "\a\b\t\n\v\f\r\"\'\\");

        Cout (" complete").newline;
    }
}
515 //END Utility funcs |