comparison mde/mergetag/parse/parseTo.d @ 70:7fc0a8295c83

Moved my parseTo and parseFrom modules from tango.scrapple to mde in order to reduce dependencies.
author Diggory Hardy <diggory.hardy@gmail.com>
date Fri, 04 Jul 2008 19:04:16 +0100
parents
children
comparison
equal deleted inserted replaced
69:ead4afc6d0b8 70:7fc0a8295c83
1 /**************************************************************************************************
2 * copyright: Copyright (c) 2007-2008 Diggory Hardy.
3 *
4 * author: Diggory Hardy, diggory.hardy@gmail.com
5 *
6 * license: BSD style: $(LICENSE)
7 *
8 * This contains templates for converting a char[] to various data-types.
9 *
10 * parseTo is roughly the inverse of $(B parseFrom) and should read any data output by $(B parseFrom).
11 * It is also available in tango.scrapple.
12 *
13 * This module basically implements the following templated function for most basic D types:
14 * bool, byte, short, int, long, ubyte, ushort, uint, ulong, float, double, real, char.
15 * It also supports arrays and associative arrays of any supported type (including of other arrays)
16 * and has special handling for strings (char[]) and binary (ubyte[]) data-types.
17 * -----------------------------
18 * T parseTo(T) (char[] source);
19 * -----------------------------
20 *
21 * $(I source) is the string to parse, and data of the templated type that is read from the string
22 * is returned. See the examples to get a better idea of its use.
23 *
24 * Syntax:
25 * The syntax for parsing $(I source) is mostly the same used by D without any prefixes/suffixes
26 * (except 0x, 0b & 0o base specifiers). Also a special ubyte[] syntax is supported; see examples.
27 * The following escape sequences are supported for strings and characters: \' \" \\
28 * \a \b \f \n \r \t \v . Associative array literals use the same syntax as D, described here:
29 * $(LINK http://www.digitalmars.com/d/2.0/expression.html#AssocArrayLiteral). All whitespace is
30 * ignored (except of course within strings).
31 *
32 * There are also some public utility functions with their own documentation.
33 *
34 * Throws:
35 * On errors, a ParseException or a UnicodeException (both extend TextException) is thrown with a
36 * suitable message. No other exceptions should be thrown.
37 *
38 * Remarks:
39 * There is currently no support for reading wchar/dchar strings. There are, however, unicode
40 * conversions for converting UTF-8 to UTF-16/32. Be careful if converting on a char-by-char basis;
41 * such conversions cannot be used for non-ascii characters.
42 *
43 * Examples:
44 * ------------------------------------------------------------------------------------------------
45 * // Basic examples:
46 * ulong a = parseTo!(ulong) ("20350");
47 * float d = parseTo!(float) (" 1.2e-9 ");
48 * int[] b = parseTo!(int[]) ("[0,1,2,3]");
49 *
50 * // String and char[] syntax:
51 * char[] c = parseTo!(char[]) ("\"A string\"");
52 * char[] e = parseTo!(char[]) ("['a','n','o','t','h','e','r', ' ' ,'s','t','r','i','n','g']");
53 *
54 * // These can be used interchangeably; here's a more complex example of an associative array:
55 * bool[char[]] f = parseTo!(bool[char[]]) ("[ \"one\":true, ['t','w','o']:false, \"three\":1, \"four\":000 ]");
56 *
57 * // There is also a special notation for ubyte[] types:
58 * // The digits following 0x must be in pairs and each specify one ubyte.
59 * assert ( parseTo!(ubyte[]) (`0x01F2AC`) == parseTo!(ubyte[]) (`[01 ,0xF2, 0xAC]`) );
60 *
61 * // There's no limit to the complexity!
62 * char[char[][][][char]][bool] z = ...; // don't expect me to write this!
63 * ------------------------------------------------------------------------------------------------
64 *************************************************************************************************/
65
66 module mde.mergetag.parse.parseTo;
67
68 // tango imports
69 import tango.core.Exception : TextException, UnicodeException;
70 import cInt = tango.text.convert.Integer;
71 import cFloat = tango.text.convert.Float;
72 import Utf = tango.text.convert.Utf;
73 import Util = tango.text.Util;
74
/**
 * Base class for exceptions thrown by the parseTo templates.
 *
 * Extends tango's TextException; the message describes what was wrong with the parsed text.
 */
class ParseException : TextException {
    /// Creates the exception with the given description of the parse failure.
    this (char[] msg) {
        super (msg);
    }
}
85
86
87 //BEGIN parseTo templates
88
89 // Associative arrays
90
/// Prefix used for every error message produced while parsing an associative array.
const char[] AA_ERR = "Invalid associative array: ";

/** Parses an associative array literal of the form $(B [KEY:DATA, KEY:DATA, ...]).
 *
 * The text between the brackets is cut into pairs by split(); each pair is then cut at the first
 * ':' found outside a quoted string/character, and both halves are handed to parseTo for the key
 * type S and the value type T respectively.
 *
 * Throws:
 *  ParseException if the brackets are missing, a quoted key is unterminated, a pair has no ':'
 *  or nothing follows the ':'. Parsing the key or the data may throw as well.
 */
T[S] parseTo(T : T[S], S) (char[] src) {
    src = Util.trim(src);
    if (src.length < 2 || src[0] != '[' || src[$-1] != ']')
        throw new ParseException (AA_ERR ~ "not [ ... ]");  // bad braces.

    T[S] ret;
    foreach (char[] pair; split (src[1..$-1])) {
        uint i = 0;
        while (i < pair.length) {   // advance to the ':' separating key from data
            char c = pair[i];
            if (c == ':') break;
            if (c == '\'' || c == '"') {    // quoted string or character: skip to closing quote
                ++i;
                while (i < pair.length && pair[i] != c) {
                    if (pair[i] == '\\') {
                        // need room for the escaped char plus the closing quote
                        if (i+2 >= pair.length) throw new ParseException (AA_ERR ~ "unfinished escape sequence within string/char");
                        ++i;    // escape seq.
                    }
                    ++i;
                }
                if (i == pair.length) {
                    // ran off the end while still inside the quotes
                    throw new ParseException (AA_ERR ~ "unterminated string/char within key");
                }
            }
            ++i;
        }
        if (i == pair.length) {
            // the whole pair was scanned without meeting a ':'
            throw new ParseException (AA_ERR ~ "encountered [ ... KEY] (missing :DATA)");
        }
        if (i + 1 == pair.length) {
            // the ':' is the last character of the pair
            throw new ParseException (AA_ERR ~ "encountered [ ... KEY:] (missing DATA)");
        }
        ret[parseTo!(S) (pair[0..i])] = parseTo!(T) (pair[i+1..$]);
    }
    return ret;
}
debug (UnitTest) unittest {
    // The same associative array, once parsed from text and once written as a D literal.
    char[][char] parsed = parseTo!(char[][char]) (`['a':"animal", 'b':['b','u','s']]`);
    char[][char] expected = ['a':cast(char[])"animal", 'b':['b','u','s']];

    // FIXME: compare with a plain assert (parsed == expected) once this compiler bug is fixed:
    // http://d.puremagic.com/issues/show_bug.cgi?id=1671
    // (rehashing both arrays beforehand makes no difference).
    assert (parsed.length == expected.length);
    assert (parsed.keys == expected.keys);
    assert (parsed.values == expected.values);
}
137
138
139 // Arrays
140
/// Generic array: the trimmed source must be enclosed in [ ]; the elements are read by toArray.
T[] parseTo(T : T[]) (char[] src) {
    src = Util.trim(src);
    bool bracketed = src.length >= 2 && src[0] == '[' && src[$-1] == ']';
    if (!bracketed)
        throw new ParseException ("Invalid array: not [x, ..., z]");
    return toArray!(T[]) (src);
}
146
// String (array special case)
/// Reads either a double-quoted string with escape sequences or a char array (['a',...,'c']).
T parseTo(T : char[]) (char[] src) {
    src = Util.trim(src);
    if (src.length < 2)
        throw new ParseException ("Invalid string: not quoted (\"*\") or char array (['a',...,'c'])");

    // Array-of-chars notation is handled by the generic array reader.
    if (src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);

    if (src[0] != '"' || src[$-1] != '"')
        throw new ParseException ("Invalid string: not quoted (\"*\") or char array (['a',...,'c'])");

    char[] inner = src[1..$-1];     // contents without the enclosing quotes
    T ret;
    ret.length = inner.length;      // upper bound; each escape sequence shrinks the result by one
    uint used = 0;                  // number of chars written to ret
    uint pos = 0;                   // read position within inner
    while (pos < inner.length) {
        char c = inner[pos++];
        if (c == '\\') {
            // A trailing backslash means the closing quote itself was escaped.
            if (pos == inner.length) throw new ParseException ("Invalid string: ends \\\" !");
            c = replaceEscapedChar (inner[pos++]);  // throws if it's invalid
        }
        ret[used++] = c;
    }
    return ret[0..used];
}
// Unicode conversions for strings: parse as UTF-8 first, then convert.
// A UnicodeException from the conversion is deliberately left to propagate.
T parseTo(T : wchar[]) (char[] src) {
    char[] utf8 = parseTo!(char[]) (src);
    return Utf.toString16 (utf8);
}
T parseTo(T : dchar[]) (char[] src) {
    char[] utf8 = parseTo!(char[]) (src);
    return Utf.toString32 (utf8);
}
184
// Binary (array special case)
/// Reads either a normal array ([1, 0xF2, ...]) or 0x followed by pairs of hex digits,
/// each pair giving one ubyte.
T parseTo(T : ubyte[]) (char[] src) {
    src = Util.trim(src);
    if (src.length < 2)
        throw new ParseException ("Invalid ubyte[]: not an array and doesn't start 0x");

    // Standard case:
    if (src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);

    // Special case: sequence of hex digits
    if (src[0..2] != "0x")
        throw new ParseException ("Invalid ubyte[]: not an array and doesn't start 0x");

    char[] digits = src[2..$];      // everything after the 0x prefix
    if (digits.length % 2 == 1)     // must be in pairs
        throw new ParseException ("Invalid binary: odd number of chars");

    T ret;
    ret.length = digits.length / 2; // exact
    uint pos = 0;                   // advanced by readHexChar
    for (uint n = 0; n < ret.length; ++n) {
        ubyte high = readHexChar (digits, pos);
        ubyte low  = readHexChar (digits, pos);
        ret[n] = cast(ubyte) ((high << 4) | low);
    }
    return ret;
}
209
debug (UnitTest) unittest {
    // Generic array handling, including the empty array.
    assert (parseTo!(double[]) (`[1.0,1.0e-10]`) == [1.0, 1.0e-10]);
    assert (parseTo!(double[]) (`[ ]`) == cast(double[]) []);

    // char[] and char conversions, with commas, escape sequences and multichar UTF8 characters.
    assert (parseTo!(char[][]) (`[ ".\"", [',','\''] ,"!\b€" ]`) == [ ".\"".dup, [',','\''] ,"!\b€" ]);

    // wchar[] and dchar[] conversions; the characters were picked more or less at random from
    // unicode tables (the last few may display oddly in some editors).
    assert (parseTo!(wchar[]) ("\"Test string: ¶α؟अกሀ搀\"") == "Test string: ¶α؟अกሀ搀"w);
    assert (parseTo!(dchar[]) ("\"Test string: ¶α؟अกሀ搀\"") == "Test string: ¶α؟अกሀ搀"d);

    // ubyte[]: special 0x notation and standard array notation give the same bytes.
    assert (parseTo!(ubyte[]) (`0x01F2AC`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);
    assert (parseTo!(ubyte[]) (`[01 ,0xF2, 0xAC]`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);
}
226
227
228 // Basic types
229
230 // Char
/** Reads a single-quoted character: either one plain char ('c') or one escape sequence ('\n').
 *
 * Throws:
 *  ParseException if the text is not quoted, is an unfinished or invalid escape sequence, or is
 *  too long; UnicodeException if it looks like a multibyte (non-ASCII) UTF-8 character, which
 *  cannot be returned as a single char.
 */
T parseTo(T : char) (char[] src) {
    src = Util.trim(src);
    if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'')
        throw new ParseException ("Invalid char: not quoted (e.g. 'c')");

    if (src[1] == '\\') {
        // Escape sequence: exactly one char must follow the backslash.
        if (src.length == 4) return replaceEscapedChar (src[2]);
        if (src.length == 3) throw new ParseException ("Invalid char: unfinished escape sequence");
    }
    else if (src.length == 3) return src[1];    // plain, non-escaped char

    // Report various errors; warnings for likely and difficult to tell cases:
    // A set high bit marks a byte of a multibyte UTF-8 sequence (ASCII is always < 0x80).
    if (src[1] & 0x80u) throw new UnicodeException ("Invalid char: too long (non-ASCII UTF-8 characters cannot be read as a single character)", 1);
    throw new ParseException ("Invalid char: too long");
}
/* Basic unicode conversions for wide chars.
 * NOTE: a value above 127 signals the start of a multibyte UTF-8 sequence, which would have to be
 * converted for UTF-16/32. Since the following bytes are not known here, no conversion is
 * possible and an exception is thrown instead. */
const char[] WIDE_CHAR_ERROR = "Error: unicode non-ascii character cannot be converted from a single UTF-8 char";
T parseTo(T : wchar) (char[] src) {
    char c = parseTo!(char) (src);
    if (c > 127u) throw new UnicodeException (WIDE_CHAR_ERROR, 1);
    return cast(wchar) c;   // ASCII: same code point in UTF-16
}
T parseTo(T : dchar) (char[] src) {
    char c = parseTo!(char) (src);
    if (c > 127u) throw new UnicodeException (WIDE_CHAR_ERROR, 1);
    return cast(dchar) c;   // ASCII: same code point in UTF-32
}
debug (UnitTest) unittest {
    // An escaped single quote, then plain ASCII read as wchar and dchar.
    assert (parseTo!(char) ("\'\\\'\'") == '\'');
    assert (parseTo!(wchar) ("'X'") == 'X');
    assert (parseTo!(dchar) ("'X'") == 'X');
}
262
// Bool
/// Accepts "true", "false", a run of zeros (false) or optional zeros followed by a single 1 (true).
T parseTo(T : bool) (char[] src) {
    src = Util.trim(src);
    if (src == "true") return true;
    if (src == "false") return false;

    uint zeros = 0;     // number of leading '0' characters
    while (zeros < src.length && src[zeros] == '0') ++zeros;

    uint remaining = src.length - zeros;
    if (remaining == 0 && zeros > 0) return false;          // nothing but zeros
    if (remaining == 1 && src[zeros] == '1') return true;   // zeros then a single 1
    throw new ParseException ("Invalid bool: not true or false and doesn't evaluate to 0 or 1");
}
debug (UnitTest) unittest {
    // Keyword and numeric spellings of both values.
    bool[] expected = cast(bool[]) [1,0,1,0];
    assert (parseTo!(bool[]) (`[true,false,01,00]`) == expected);
}
277
// Ints
// One specialization per integer type; each simply forwards to the shared reader toTInt.

// Signed:
T parseTo(T : byte)   (char[] src) { return toTInt!(T) (src); }
T parseTo(T : short)  (char[] src) { return toTInt!(T) (src); }
T parseTo(T : int)    (char[] src) { return toTInt!(T) (src); }
T parseTo(T : long)   (char[] src) { return toTInt!(T) (src); }

// Unsigned:
T parseTo(T : ubyte)  (char[] src) { return toTInt!(T) (src); }
T parseTo(T : ushort) (char[] src) { return toTInt!(T) (src); }
T parseTo(T : uint)   (char[] src) { return toTInt!(T) (src); }
T parseTo(T : ulong)  (char[] src) { return toTInt!(T) (src); }
debug (UnitTest) unittest {
    // Negative value in a signed type.
    assert (parseTo!(byte) ("-5") == cast(byte) -5);

    // Binary, octal, hex and decimal in one array.
    // Annoyingly, the octal syntax (0o...) differs from D's (blame tango).
    assert (parseTo!(uint[]) ("[0b0100,0o724,0xFa59c,0xFFFFFFFF,0]") == [0b0100u,0724,0xFa59c,0xFFFFFFFF,0]);
}
308
// Floats
// One specialization per floating-point type; each forwards to the shared reader toTFloat.
T parseTo(T : float)  (char[] src) { return toTFloat!(T) (src); }
T parseTo(T : double) (char[] src) { return toTFloat!(T) (src); }
T parseTo(T : real)   (char[] src) { return toTFloat!(T) (src); }
debug (UnitTest) unittest {
    // One value per floating-point type: zero, large negative, very small.
    assert (parseTo!(float) ("0.0") == 0.0f);
    assert (parseTo!(double) ("-1e25") == -1e25);
    assert (parseTo!(real) ("5.24e-269") == cast(real) 5.24e-269);
}
324 //END parseTo templates
325
326 //BEGIN Utility funcs
/** Trims surrounding whitespace from src, then checks for and removes the enclosing array
 * brackets: []
 *
 * Throws:
 *  ParseException if the first and last non-whitespace characters are not '[' and ']'.
 *
 * Returns:
 *  The text between the brackets (whitespace outside the brackets is dropped, whitespace inside
 *  is kept). Useful for passing to split.
 */
char[] stripBrackets (char[] src) {
    src = Util.trim(src);
    bool bracketed = src.length >= 2 && src[0] == '[' && src[$-1] == ']';
    if (!bracketed)
        throw new ParseException ("Invalid bracketed string: not [...]");
    return src[1..$-1];
}
340
/** Splits a string into substrings separated by '$(B ,)' with support for characters and strings
 * containing escape sequences and for embedded arrays ($(B [...])).
 *
 * Params:
 *  src = A string to separate on commas. Where used for parsing arrays, the brackets enclosing
 *        the array should be removed before calling this function (stripBrackets can do this).
 *
 * Returns:
 *  An array of substrings (slices) of src, excluding the separating commas. Whitespace is not
 *  stripped from the pieces and empty strings may get returned. An empty or all-whitespace src
 *  yields an empty array.
 *
 * Throws:
 *  ParseException if a ']' is met that closes no '[' opened within src.
 *
 * Remarks:
 *  This function is primarily intended as a utility for the templates parsing arrays and
 *  associative arrays, but it may be useful in other cases too; hence no brackets are stripped
 *  from src. An unterminated string/char or an unclosed '[' is not reported here; the affected
 *  piece is returned as-is and is expected to be rejected when it is parsed.
 */
char[][] split (char[] src) {
    src = Util.trim (src);
    if (src == "") return [];       // empty array: no elements when no data

    uint depth = 0;                 // nesting depth of embedded arrays
    char[][] ret;
    // Initial guess at the number of pieces. Must be at least 1: the buffer is grown by
    // doubling, and doubling a zero length would never make room.
    ret.length = src.length / 3 + 1;
    uint k = 0;                     // index of the current piece
    uint i = 0, j = 0;              // current read location, start of current piece

    while (i < src.length) {
        char c = src[i];
        if (c == '\'' || c == '"') {    // string or character: skip to the matching quote
            ++i;
            while (i < src.length && src[i] != c) {
                if (src[i] == '\\') ++i;    // escape seq.: skip the escaped char too
                ++i;
            }
        }
        else if (c == '[') ++depth;
        else if (c == ']') {
            if (depth) --depth;
            else throw new ParseException ("Invalid array literal: closes before end of data item.");
        }
        else if (c == ',' && depth == 0) {  // only if not within an embedded array
            if (ret.length <= k) ret.length = ret.length * 2;
            ret[k++] = src[j..i];           // add this piece and increment k
            j = i + 1;
        }
        ++i;
    }
    // An unterminated quote (possibly ending in a backslash) leaves i beyond the end of src;
    // clamp it so the final slice stays within bounds.
    if (i > src.length) i = src.length;

    if (ret.length <= k) ret.length = k + 1;
    ret[k] = src[j..i];             // add final piece (i >= j)
    return ret[0..k+1];
}
392
/* Templated read-int function to read (un)signed 1-8 byte integers.
 *
 * Actually a reimplementation of tango.text.convert.Integer toLong and parse functions.
 *
 * The whole of src (after trimming whitespace) must form the number. The magnitude is read as a
 * ulong and then range-checked against TInt; a negative sign is accepted for every value down to
 * TInt.min inclusive, which for unsigned types means only zero.
 *
 * Throws:
 *  ParseException if there are no digits, if a character cannot be read as part of the number,
 *  or if the value does not fit in TInt.
 */
private TInt toTInt(TInt) (char[] src) {
    const char[] INT_OUT_OF_RANGE = "Integer out of range";
    bool sign;
    uint radix, ate, ate2;

    // Trim off whitespace.
    // NOTE: Cannot use tango.text.convert.Integer.trim to trim leading whitespace since it doesn't
    // treat new-lines, etc. as whitespace which for our purposes is whitespace.
    src = Util.trim (src);

    ate = cInt.trim (src, sign, radix);     // consumes sign and radix prefix
    if (ate == src.length) throw new ParseException ("Invalid integer: no digits");
    ulong val = cInt.convert (src[ate..$], radix, &ate2);
    ate += ate2;

    if (ate < src.length)
        throw new ParseException ("Invalid integer at marked character: \"" ~ src[0..ate] ~ "'" ~ src[ate] ~ "'" ~ src[ate+1..$] ~ "\"");

    if (sign) {
        // Largest magnitude a negative value may have: |TInt.min| for signed types, 0 for
        // unsigned ones. Comparing magnitudes as ulong avoids mixed signed/unsigned comparisons.
        static if (cast(long) TInt.min < 0)
            const ulong NEG_LIMIT = cast(ulong) TInt.max + 1;
        else
            const ulong NEG_LIMIT = 0;

        if (val > NEG_LIMIT) throw new ParseException (INT_OUT_OF_RANGE);
        // For long.min the negation wraps around to long.min itself, which is the wanted value.
        return cast(TInt) (-cast(long) val);
    }

    if (val > TInt.max) throw new ParseException (INT_OUT_OF_RANGE);
    return cast(TInt) val;
}
423
/* Basically a reimplementation of tango.text.convert.Float.toFloat which strips whitespace from
 * both ends before throwing an exception for overlong input.
 *
 * Throws:
 *  ParseException if src is empty after trimming or if any part of it could not be read as part
 *  of the number.
 */
private TFloat toTFloat(TFloat) (char[] src) {
    // NOTE: As for toTInt(), this needs to strip leading as well as trailing whitespace.
    src = Util.trim (src);
    if (src == "") throw new ParseException ("Invalid float: no digits");
    uint ate;   // set by the parser to the number of chars it consumed

    TFloat x = cFloat.parse (src, &ate);
    // Anything left unconsumed means the text is not (only) a number.
    if (ate < src.length)
        throw new ParseException ("Invalid float at marked character: \"" ~ src[0..ate] ~ "'" ~ src[ate] ~ "'" ~ src[ate+1..$] ~ "\"");
    return x;
}
435
/* Maps the character following a backslash to the character the escape sequence stands for.
 * Supported escape sequences are the following subset of those supported by D:
 * \" \' \\ \a \b \f \n \r \t \v
 * Any other character causes a ParseException to be thrown.
 */
private char replaceEscapedChar (char c)
{
    switch (c) {
        case '\"': return '\"';
        case '\'': return '\'';
        case '\\': return '\\';
        case 'a':  return '\a';
        case 'b':  return '\b';
        case 'f':  return '\f';
        case 'n':  return '\n';
        case 'r':  return '\r';
        case 't':  return '\t';
        case 'v':  return '\v';
        default:   break;
    }

    // not a supported escape sequence:
    throw new ParseException ("Invalid escape sequence: \\"~c);
}
479
// Reads the hex digit [0-9A-Fa-f] at src[pos], returns its value (0-15) and advances pos by one.
// Throws a ParseException for any other character. Doesn't check pos against src.length.
private ubyte readHexChar (char[] src, inout uint pos) {
    char digit = src[pos];
    ubyte value;
    if (digit >= '0' && digit <= '9')       value = cast(ubyte) (digit - '0');
    else if (digit >= 'A' && digit <= 'F')  value = cast(ubyte) (digit - 'A' + 10);
    else if (digit >= 'a' && digit <= 'f')  value = cast(ubyte) (digit - 'a' + 10);
    else throw new ParseException ("Invalid hex digit.");
    ++pos;      // only advanced when a digit was read successfully
    return value;
}
490
// Generic array reader: splits the bracketed contents on commas and parses each piece as a T.
// Assumes input is of form "[xxxxx]" (i.e. first and last chars are '[', ']' and length >= 2).
private T[] toArray(T : T[]) (char[] src) {
    T[] buffer = new T[16];     // start with some room to avoid unnecessary allocations
    uint count = 0;             // number of elements read so far
    foreach (char[] element; split(src[1..$-1])) {
        if (count == buffer.length) buffer.length = buffer.length * 2;  // full: double it
        buffer[count] = parseTo!(T) (element);
        ++count;
    }
    return buffer[0..count];
}
503
debug (UnitTest) {
    import tango.io.Console;

    // Checks every supported escape sequence in one string and reports progress on the console.
    unittest {
        Cout ("Running unittest: parseTo ...").flush;

        char[] parsed = parseTo!(char[]) ("\"\\a\\b\\t\\n\\v\\f\\r\\\"\\\'\\\\\"");
        assert (parsed == "\a\b\t\n\v\f\r\"\'\\");

        Cout (" complete").newline;
    }
}
515 //END Utility funcs