Mercurial > projects > mde
comparison mde/mergetag/parse/parseFrom.d @ 70:7fc0a8295c83
Moved my parseTo and parseFrom modules from tango.scrapple to mde in order to reduce dependencies.
author | Diggory Hardy <diggory.hardy@gmail.com> |
---|---|
date | Fri, 04 Jul 2008 19:04:16 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
69:ead4afc6d0b8 | 70:7fc0a8295c83 |
---|---|
1 /************************************************************************************************** | |
2 * copyright: Copyright (c) 2007-2008 Diggory Hardy. | |
3 * | |
4 * author: Diggory Hardy, diggory.hardy@gmail.com | |
5 * | |
6 * license: BSD style: $(LICENSE) | |
7 * | |
8 * This contains templates for converting various data-types to a char[]. | |
9 * | |
10 * parseFrom is roughly the inverse of $(B parseTo). | |
11 * It is also available in tango.scrapple. | |
12 * | |
13 * This module basically implements the following templated function for most basic D types: | |
14 * bool, byte, short, int, long, ubyte, ushort, uint, ulong, float, double, real, char, wchar, | |
15 * dchar. | |
16 * It also supports arrays of any supported type (including of other arrays) and has special | |
17 * handling for strings (char[]) and binary (ubyte[]) data-types. | |
18 * ----------------------------- | |
19 * char[] parseFrom(T) (T value); | |
20 * ----------------------------- | |
21 * | |
22 * $(I value) is the value to convert; it is converted to a string and returned. | |
23 * | |
24 * Syntax: | |
25 * The syntax is the same as parseTo; but since this module only generates formatted output | |
26 * knowing the syntax shouldn't be necessary. There is currently no way to specify options like | |
27 * output base for ints, precision of floats, or | |
28 * whether to write char[] or ubyte[] types as arrays or in their more compact forms. | |
29 * | |
30 * Throws: | |
31 * On errors, an exception is thrown (UnicodeException or IllegalArgumentException). No other | |
32 * exceptions should be thrown. | |
33 * | |
34 * Remarks: | |
35 * There is currently no support for outputting wchar/dchar strings. There are, however, unicode | |
36 * conversions for converting UTF-16/32 to UTF-8. Be warned though that many wchar/dchar characters | |
37 * (any that are non-ascii) will not fit in a single char and an exception will be thrown. | |
38 * | |
39 * The code does involve some heap activity; this is necessary anyway for returning dynamic arrays. | |
40 * (Slices of a pre-allocated array could be returned instead, but for many uses would have to be | |
41 * duplicated before storage, leading to less efficient operation.) | |
42 * Most memory allocation has been kept to a minimum. | |
43 * | |
44 * Unlike the parseTo!() module, the parseFrom templates could be re-written to use static-ifs | |
45 * instead of type specialisation, thus allowing type inference. However I likely won't bother | |
46 * implementing this myself. | |
47 * | |
48 * Examples: | |
49 * ------------------------------------------------------------------------------------------------ | |
50 * // Examples are printed via Cout. | |
51 * | |
52 * // Basic examples: | |
53 * Cout (parseFrom!(byte) (-13)).newline; // -13 | |
54 * Cout (parseFrom!(real) (2.56e11)).newline; // 2.55999999999999990000e+11 | |
55 * Cout (parseFrom!(double[]) ([0.0, 1.0, 2.0, 3.0])).newline; // [0.00000000000000000,1.00000000000000000,2.00000000000000000,3.00000000000000000] | |
56 * Cout (parseFrom!(bool[]) ([true,false,false])).newline; // [true,false,false] | |
57 * | |
58 * // String and ubyte[] special syntaxes (always used): | |
59 * Cout (parseFrom!(char[]) ("A string.")).newline; // "A string." (including quotes) | |
60 * Cout (parseFrom!(ubyte[]) (cast(ubyte[]) [5u, 0xF1u, 0x10u])).newline; // 0x05f110 | |
61 * | |
62 * // Associative arrays: | |
63 * Cout (parseFrom!(char[][byte]) ([-1:"negative one"[], 0:"zero", 1:"one"])).newline; // [0:"zero",1:"one",-1:"negative one"] | |
64 * | |
65 * // No limit on complexity... | |
66 * char[] somethingComplicated = parseFrom!(real[][][bool[int[][]]]) (...); | |
67 * ------------------------------------------------------------------------------------------------ | |
68 *************************************************************************************************/ | |
69 | |
70 module mde.mergetag.parse.parseFrom; | |
71 | |
72 // tango imports | |
73 import tango.core.Exception : UnicodeException, IllegalArgumentException; | |
74 import cInt = tango.text.convert.Integer; | |
75 import cFloat = tango.text.convert.Float; | |
76 import Utf = tango.text.convert.Utf; | |
77 import Util = tango.text.Util; | |
78 | |
79 //BEGIN parseFrom templates | |
80 /* Idea: could extend parseFrom with a second parameter, containing flags for things like base to output. | |
81 * Unnecessary for mergetag though. | |
82 */ | |
83 | |
84 // Associative arrays | |
85 | |
/**
 * Formats an associative array as `[key:value,key:value,...]`.
 *
 * Each key and each value is formatted by the parseFrom specialisation for its own type.
 * An empty associative array yields `[]`.
 *
 * Params:
 *   val = the associative array to format.
 *
 * Returns: a slice of a freshly allocated buffer holding the formatted text.
 */
char[] parseFrom(T : T[S], S) (T[S] val) {
    char[] ret;
    // Initial guess: per-entry default lengths plus ':' and ',' and the two brackets.
    // The length must be at least 2 so that "[]" always fits.
    ret.length = val.length * (defLength!(T) + defLength!(S) + 2) + 2;
    ret[0] = '[';
    size_t i = 1;
    foreach (S k, T v; val) {
        char[] s = parseFrom!(S) (k) ~ ":" ~ parseFrom!(T) (v);
        i += s.length;
        // Grow until the entry AND its trailing separator fit. A single doubling is not enough
        // when one entry is longer than the whole current buffer.
        while (i+1 >= ret.length) ret.length = ret.length * 2;
        ret[i-s.length .. i] = s;
        ret[i++] = ',';
    }
    if (i == 1) ++i;    // empty input: there is no trailing comma to overwrite
    ret[i-1] = ']';     // replaces the last comma
    return ret[0..i];
}
debug (UnitTest) unittest {
    char[] X = parseFrom!(char[][char]) (['a':cast(char[])"animal", 'b':['b','u','s']]);
    char[] Y = `['a':"animal",'b':"bus"]`;
    assert (X == Y);

    // Regression: one entry much longer than the initial guess must not overrun the buffer.
    char[] longVal = new char[100];
    longVal[] = 'x';
    assert (parseFrom!(char[][char]) (['a':longVal]) == `['a':"` ~ longVal ~ `"]`);
}
108 | |
109 | |
110 // Arrays | |
111 | |
/**
 * Formats a dynamic array as `[elem,elem,...]`.
 *
 * Each element is formatted by the parseFrom specialisation for the element type.
 * An empty array yields `[]`.
 *
 * Params:
 *   val = the array to format.
 *
 * Returns: a slice of a freshly allocated buffer holding the formatted text.
 */
char[] parseFrom(T : T[]) (T[] val) {
    char[] ret;
    // Initial guess: default element length plus a comma each, plus the two brackets.
    // The length must be at least 2 so that "[]" always fits.
    ret.length = val.length * (defLength!(T) + 1) + 2;
    ret[0] = '[';
    size_t i = 1;
    foreach (T x; val) {
        char[] s = parseFrom!(T) (x);
        i += s.length;
        // Grow until the element AND its trailing separator fit. A single doubling is not enough
        // when one element is longer than the whole current buffer.
        while (i+1 >= ret.length) ret.length = ret.length * 2;
        ret[i-s.length .. i] = s;
        ret[i++] = ',';
    }
    if (i == 1) ++i;    // empty input: there is no trailing comma to overwrite
    ret[i-1] = ']';     // replaces the last comma
    return ret[0..i];
}
debug (UnitTest) unittest {
    // Regression: one element much longer than the initial guess must not overrun the buffer.
    char[] longElem = new char[100];
    longElem[] = 'x';
    assert (parseFrom!(char[][]) ([longElem]) == `["` ~ longElem ~ `"]`);
}
129 | |
130 // Strings (array special case) | |
/**
 * Formats a string as a double-quoted literal, backslash-escaping every character for which
 * isEscapableChar returns true.
 *
 * Params:
 *   val = the string to format.
 *
 * Returns: a slice of a freshly allocated buffer holding the quoted, escaped text.
 */
char[] parseFrom(T : char[]) (T val) {
    // Worst case is every character escaped (two output chars each) plus the two quotes,
    // so this buffer never needs to grow.
    char[] buf = new char[val.length * 2 + 2];
    uint pos = 0;
    buf[pos++] = '"';
    foreach (char c; val) {
        if (isEscapableChar (c)) {
            buf[pos++] = '\\';
            buf[pos++] = replaceEscapableChar (c);
        } else {
            buf[pos++] = c;
        }
    }
    buf[pos++] = '"';
    return buf[0..pos];
}
152 // Unicode conversions for strings: | |
/// UTF-32 strings: converted with Utf.toString, then formatted like a char[] string.
/// A UnicodeException may propagate from the conversion; it is deliberately not caught here.
char[] parseFrom(T : dchar[]) (T val) {
    char[] utf8 = Utf.toString (val);
    return parseFrom!(char[]) (utf8);
}
/// UTF-16 strings: converted with Utf.toString, then formatted like a char[] string.
/// A UnicodeException may propagate from the conversion; it is deliberately not caught here.
char[] parseFrom(T : wchar[]) (T val) {
    char[] utf8 = Utf.toString (val);
    return parseFrom!(char[]) (utf8);
}
161 | |
162 // Binary (array special case) | |
/**
 * Formats binary data in the compact form `0x` followed by two lower-case hex digits per byte.
 *
 * Params:
 *   val = the bytes to format.
 *
 * Returns: a freshly allocated string of exactly `2 + 2 * val.length` characters.
 */
char[] parseFrom(T : ubyte[]) (T val) {
    static const char[16] hexDigits = "0123456789abcdef";

    char[] hex = new char[2 + val.length * 2];    // exact length: prefix + two digits per byte
    hex[0] = '0';
    hex[1] = 'x';

    foreach (n, ubyte b; val) {
        hex[2 + 2*n] = hexDigits[b >> 4];      // high nibble
        hex[3 + 2*n] = hexDigits[b & 0x0F];    // low nibble
    }
    return hex;
}
176 | |
177 debug (UnitTest) unittest { | |
178 // generic array stuff: | |
179 assert (parseFrom!(double[]) ([1.0, 1.0e-10]) == `[1.00000000000000000,0.10000000000000000e-09]`); | |
180 assert (parseFrom!(double[]) (cast(double[]) []) == `[]`); // empty array | |
181 | |
182 // char[] conversions, with commas, escape sequences and multichar UTF8 characters: | |
183 assert (parseFrom!(char[][]) ([ ".\""[], [',','\''] ,"!\b€" ]) == `[".\"",",\'","!\b€"]`); | |
184 | |
185 // wchar[] and dchar[] conversions: | |
186 // The characters were pretty-much pulled at random from unicode tables. | |
187 // The last few cause some weird (display only) effects in my editor. | |
188 assert (parseFrom!(wchar[]) ("Test string: ¶α؟अกሀ搀"w) == "\"Test string: ¶α؟अกሀ搀\""); | |
189 assert (parseFrom!(dchar[]) ("Test string: ¶α؟अกሀ搀"d) == "\"Test string: ¶α؟अกሀ搀\""); | |
190 | |
191 assert (parseFrom!(ubyte[]) (cast(ubyte[]) [0x01, 0xF2, 0xAC]) == `0x01f2ac`); // ubyte[] special notation | |
192 } | |
193 | |
194 | |
195 // Basic types | |
196 | |
197 // Char | |
/**
 * Formats a single char as a single-quoted literal, backslash-escaping it when
 * isEscapableChar says so.
 *
 * NOTE: values above 127 are not valid as a lone UTF-8 char, but they are passed through
 * unchanged because the caller may recombine the output with other chars later.
 *
 * Params:
 *   val = the character to format.
 *
 * Returns: a 3-character (plain) or 4-character (escaped) string, freshly allocated.
 */
char[] parseFrom(T : char) (T val) {
    // A static array cannot be returned by reference, so allocate the maximum (escaped) size.
    char[] quoted = new char[4];
    quoted[0] = '\'';

    if (isEscapableChar (val)) {
        quoted[1] = '\\';
        quoted[2] = replaceEscapableChar (val);
        quoted[3] = '\'';
        return quoted;
    }

    quoted[1] = val;
    quoted[2] = '\'';
    return quoted[0..3];
}
// Basic unicode conversions for wide chars.
// Only ASCII values (<= 127) fit in a single UTF-8 encoded char; anything else is rejected.
const char[] WIDE_CHAR_ERROR = "Error: unicode non-ascii character cannot be converted to a single UTF-8 char";

/// Formats an ASCII wchar like a char; throws UnicodeException for any non-ASCII value.
char[] parseFrom(T : wchar) (T val) {
    if (val > 127u)
        throw new UnicodeException (WIDE_CHAR_ERROR, 0);
    return parseFrom!(char) (cast(char) val);
}
/// Formats an ASCII dchar like a char; throws UnicodeException for any non-ASCII value.
char[] parseFrom(T : dchar) (T val) {
    if (val > 127u)
        throw new UnicodeException (WIDE_CHAR_ERROR, 0);
    return parseFrom!(char) (cast(char) val);
}
debug (UnitTest) unittest {
    assert (parseFrom!(char) ('\'') == "\'\\\'\'");
    assert (parseFrom!(wchar) ('X') == "'X'");
    assert (parseFrom!(dchar) ('X') == "'X'");
}
234 | |
235 // Bool | |
/// Formats a bool as the text "true" or "false".
char[] parseFrom(T : bool) (T val) {
    char[] text = "false";
    if (val)
        text = "true";
    return text;
}
240 // too simple to need a unittest | |
241 | |
242 // Signed ints | |
// Every integer specialisation widens its argument to long and delegates to formatLong.
char[] parseFrom(T : byte) (T val) {
    long widened = val;
    return formatLong (widened);
}
char[] parseFrom(T : short) (T val) {
    long widened = val;
    return formatLong (widened);
}
char[] parseFrom(T : int) (T val) {
    long widened = val;
    return formatLong (widened);
}
char[] parseFrom(T : long) (T val) {
    long widened = val;
    return formatLong (widened);
}
// Unsigned ints
char[] parseFrom(T : ubyte) (T val) {
    long widened = val;
    return formatLong (widened);
}
char[] parseFrom(T : ushort) (T val) {
    long widened = val;
    return formatLong (widened);
}
char[] parseFrom(T : uint) (T val) {
    long widened = val;
    return formatLong (widened);
}
/// ulong values are formatted through long as well, so anything above long.max is rejected
/// with an IllegalArgumentException.
char[] parseFrom(T : ulong) (T val) {
    if (val <= cast(ulong) long.max)
        return formatLong (val);
    throw new IllegalArgumentException ("No handling available for ulong where value > long.max");
}
debug (UnitTest) unittest {
    assert (parseFrom!(byte) (cast(byte) -5) == "-5");
    // annoyingly, octal syntax differs from D (blame tango):
    assert (parseFrom!(uint[]) ([0b0100u,0724,0xFa59c,0xFFFFFFFF,0]) == "[4,468,1025436,4294967295,0]");
}
275 | |
276 // Floats | |
277 /* Old calculation (not used): | |
278 t.dig+2+4+3 // should be sufficient length (mant + (neg, dot, e, exp neg) + exp (3,4,5 for float,double,real resp.)) */ | |
// Each floating-point specialisation formats into a new 32-char buffer (the minimum allowed
// by an assert in cFloat.format) with T.dig+2 as the third argument; from old C++ tests,
// T.dig+2 gives the best(?) accuracy.
char[] parseFrom(T : float) (T val) {
    return cFloat.format (new char[32], val, T.dig+2, 1);
}
char[] parseFrom(T : double) (T val) {
    return cFloat.format (new char[32], val, T.dig+2, 1);
}
char[] parseFrom(T : real) (T val) {
    return cFloat.format (new char[32], val, T.dig+2, 1);
}
debug (UnitTest) unittest {
    // NOTE: these numbers are not particularly meaningful.
    assert (parseFrom!(float) (0.0f) == "0.00000000");
    assert (parseFrom!(double) (-1e25) == "-1.00000000000000000e+25");
    assert (parseFrom!(real) (cast(real) 4.918e300) == "4.91800000000000000000e+300");
}
297 //END parseFrom templates | |
298 | |
299 //BEGIN Length templates | |
300 /* This template provides the initial length for strings for formatting various types. These strings | |
301 * can be expanded; this value is intended to cover 90% of cases or so. | |
302 * | |
303 * NOTE: This template was intended to provide specialisations for different types. | |
304 * This one value should do reasonably well for most types. | |
305 */ | |
private {
    // Default initial buffer length per formatted value; 20 is used for every type
    // without a specialisation below.
    template defLength(T) {
        const uint defLength = 20;
    }
    // Specialisation for char.
    template defLength(T : char) {
        const uint defLength = 4;
    }
    // Specialisation for bool.
    template defLength(T : bool) {
        const uint defLength = 5;
    }
}
311 //END Length templates | |
312 | |
313 //BEGIN Utility funcs | |
/// Formats a long by delegating to tango's Integer.toString (imported as cInt).
/// An IllegalArgumentException from the conversion is deliberately left to propagate.
private char[] formatLong (long val) {
    char[] text = cInt.toString (val);
    return text;
}
/// Returns true for characters that are written as a backslash escape: the control characters
/// from '\a' to '\r' inclusive, plus the double quote, single quote and backslash.
private bool isEscapableChar (char c) {
    if (c >= '\a' && c <= '\r')
        return true;
    return c == '\"' || c == '\'' || c == '\\';
}
/// Maps an escapable character to the character that follows the backslash in its escape
/// sequence (for example newline -> 'n'; quotes and backslash map to themselves).
///
/// Throws IllegalArgumentException for any other character; within parseFrom this should never
/// happen, because callers check isEscapableChar first.
private char replaceEscapableChar (char c) {
    switch (c) {
        case '\a': return 'a';
        case '\b': return 'b';
        case '\t': return 't';
        case '\n': return 'n';
        case '\v': return 'v';
        case '\f': return 'f';
        case '\r': return 'r';
        case '\"': return '\"';
        case '\'': return '\'';
        case '\\': return '\\';
        default: break;
    }

    // not one of the supported escapes:
    throw new IllegalArgumentException ("Character is not escapable (internal parseFrom error)");
}
361 | |
362 debug (UnitTest) { | |
363 import tango.io.Console; | |
364 | |
365 unittest { | |
366 Cout ("Running unittest: parseFrom ...").flush; | |
367 | |
368 assert (parseFrom!(char[]) ("\a\b\t\n\v\f\r\"\'\\") == "\"\\a\\b\\t\\n\\v\\f\\r\\\"\\\'\\\\\""); | |
369 | |
370 Cout (" complete").newline; | |
371 } | |
372 } | |
373 //END Utility funcs |