comparison mde/mergetag/parse/parseFrom.d @ 70:7fc0a8295c83

Moved my parseTo and parseFrom modules from tango.scrapple to mde in order to reduce dependencies.
author Diggory Hardy <diggory.hardy@gmail.com>
date Fri, 04 Jul 2008 19:04:16 +0100
parents
children
comparison
equal deleted inserted replaced
69:ead4afc6d0b8 70:7fc0a8295c83
1 /**************************************************************************************************
2 * copyright: Copyright (c) 2007-2008 Diggory Hardy.
3 *
4 * author: Diggory Hardy, diggory.hardy@gmail.com
5 *
6 * license: BSD style: $(LICENSE)
7 *
8 * This contains templates for converting various data-types to a char[].
9 *
10 * parseFrom is roughly the inverse of $(B parseTo).
11 * It is also available in tango.scrapple.
12 *
13 * This module basically implements the following templated function for most basic D types:
14 * bool, byte, short, int, long, ubyte, ushort, uint, ulong, float, double, real, char, wchar,
15 * dchar.
16 * It also supports arrays of any supported type (including of other arrays) and has special
17 * handling for strings (char[]) and binary (ubyte[]) data-types.
18 * -----------------------------
19 * char[] parseFrom(T) (T value);
20 * -----------------------------
21 *
22 * $(I value) is the value to convert; it is converted to a string and returned.
23 *
24 * Syntax:
25 * The syntax is the same as parseTo; but since this module only generates formatted output
26 * knowing the syntax shouldn't be necessary. There is currently no way to specify options like
27 * output base for ints, precision of floats, or
28 * whether to write char[] or ubyte[] types as arrays or in their more compact forms.
29 *
30 * Throws:
31 * On errors, an exception is thrown (UnicodeException or IllegalArgumentException). No other
32 * exceptions should be thrown.
33 *
34 * Remarks:
35 * There is currently no support for outputting wchar/dchar strings. There are, however, unicode
36 * conversions for converting UTF-16/32 to UTF-8. Be warned though that many wchar/dchar characters
37 * (any that are non-ascii) will not fit in a single char and an exception will be thrown.
38 *
39 * The code does involve some heap activity; this is necessary anyway for returning dynamic arrays.
40 * (Slices of a pre-allocated array could be returned instead, but for many uses would have to be
41 * duplicated before storage, leading to less efficient operation.)
42 * Most memory allocation has been kept to a minimum.
43 *
44 * Unlike the parseTo!() module, the parseFrom templates could be re-written to use static-ifs
45 * instead of type specialisation, thus allowing type inference. However I likely won't bother
46 * implementing this myself.
47 *
48 * Examples:
49 * ------------------------------------------------------------------------------------------------
50 * // Examples are printed via Cout.
51 *
52 * // Basic examples:
53 * Cout (parseFrom!(byte) (-13)).newline; // -13
54 * Cout (parseFrom!(real) (2.56e11)).newline; // 2.55999999999999990000e+11
55 * Cout (parseFrom!(double[]) ([0.0, 1.0, 2.0, 3.0])).newline; // [0.00000000000000000,1.00000000000000000,2.00000000000000000,3.00000000000000000]
56 * Cout (parseFrom!(bool[]) ([true,false,false])).newline; // [true,false,false]
57 *
58 * // String and ubyte[] special syntaxes (always used):
59 * Cout (parseFrom!(char[]) ("A string.")).newline; // "A string." (including quotes)
60 * Cout (parseFrom!(ubyte[]) (cast(ubyte[]) [5u, 0xF1u, 0x10u])).newline; // 0x05f110
61 *
62 * // Associative arrays:
63 * Cout (parseFrom!(char[][byte]) ([-1:"negative one"[], 0:"zero", 1:"one"])).newline; // [0:"zero",1:"one",-1:"negative one"]
64 *
65 * // No limit on complexity...
66 * char[] somethingComplicated = parseFrom!(real[][][bool[int[][]]]) (...);
67 * ------------------------------------------------------------------------------------------------
68 *************************************************************************************************/
69
70 module mde.mergetag.parse.parseFrom;
71
72 // tango imports
73 import tango.core.Exception : UnicodeException, IllegalArgumentException;
74 import cInt = tango.text.convert.Integer;
75 import cFloat = tango.text.convert.Float;
76 import Utf = tango.text.convert.Utf;
77 import Util = tango.text.Util;
78
79 //BEGIN parseFrom templates
80 /* Idea: could extend parseFrom with a second parameter, containing flags for things like base to output.
81 * Unnecessary for mergetag though.
82 */
83
84 // Associative arrays
85
/** Associative arrays: formats val as `[key:value,key:value,...]`.
 *
 * Each key and each value is formatted by the parseFrom specialisation for its own type.
 * An empty associative array yields `[]`. Entries appear in whatever order foreach visits them.
 *
 * Params:
 *  val = the associative array to format.
 *
 * Returns: a slice of a freshly allocated buffer holding the formatted text.
 *
 * Throws: whatever the key/value parseFrom specialisations throw.
 */
char[] parseFrom(T : T[S], S) (T[S] val) {
    char[] ret;
    // A guess, including values themselves and [,:] elements (must be at least 2).
    ret.length = val.length * (defLength!(T) + defLength!(S) + 2) + 2;
    ret[0] = '[';
    size_t i = 1;   // index of the next free slot in ret
    foreach (S k, T v; val) {
        char[] s = parseFrom!(S) (k) ~ ":" ~ parseFrom!(T) (v);
        size_t end = i + s.length;  // slot for the separator following s
        // Grow until both s and its trailing separator fit. One doubling is not enough when a
        // single entry is longer than the current buffer, so keep doubling (length is >= 2).
        while (end >= ret.length) ret.length = ret.length * 2;
        ret[i .. end] = s;
        ret[end] = ',';
        i = end + 1;
    }
    if (i == 1) ++i;    // special case (empty input) - not overwriting a comma
    ret[i-1] = ']';     // replaces last comma
    return ret[0..i];
}
debug (UnitTest) unittest {
    // Associative array with char keys and string values; one value is given as a char array literal.
    char[] expected = `['a':"animal",'b':"bus"]`;
    char[] actual = parseFrom!(char[][char]) (['a':cast(char[])"animal", 'b':['b','u','s']]);
    assert (actual == expected);
}
108
109
110 // Arrays
111
/** Arrays: formats val as `[elem,elem,...]`.
 *
 * Each element is formatted by the parseFrom specialisation for the element type.
 * An empty array yields `[]`.
 *
 * Params:
 *  val = the array to format.
 *
 * Returns: a slice of a freshly allocated buffer holding the formatted text.
 *
 * Throws: whatever the element parseFrom specialisation throws.
 */
char[] parseFrom(T : T[]) (T[] val) {
    char[] ret;
    // A guess, including commas and brackets (must be at least 2)
    ret.length = val.length * (defLength!(T) + 1) + 2;
    ret[0] = '[';
    size_t i = 1;   // index of the next free slot in ret
    foreach (T x; val) {
        char[] s = parseFrom!(T) (x);
        size_t end = i + s.length;  // slot for the separator following s
        // Grow until both s and its trailing separator fit. One doubling is not enough when a
        // single element is longer than the current buffer, so keep doubling (length is >= 2).
        while (end >= ret.length) ret.length = ret.length * 2;
        ret[i .. end] = s;
        ret[end] = ',';
        i = end + 1;
    }
    if (i == 1) ++i;    // special case (empty input) - not overwriting a comma
    ret[i-1] = ']';     // replaces last comma
    return ret[0..i];
}
129
130 // Strings (array special case)
/** Strings: wraps val in double quotes, writing every character for which isEscapableChar is
 * true as a backslash followed by the letter/symbol given by replaceEscapableChar.
 * All other characters are copied unchanged.
 *
 * Returns: a slice of a freshly allocated buffer holding the quoted text.
 */
char[] parseFrom(T : char[]) (T val) {
    // Worst case every character is escaped (two chars each), plus the two quotes,
    // so this buffer never needs to grow.
    char[] buf = new char[val.length * 2 + 2];
    uint pos = 0;
    buf[pos++] = '"';
    foreach (char c; val) {
        if (isEscapableChar (c)) {
            buf[pos++] = '\\';
            buf[pos++] = replaceEscapableChar (c);
        } else {
            buf[pos++] = c;
        }
    }
    buf[pos++] = '"';
    return buf[0..pos];
}
152 // Unicode conversions for strings:
/** dchar[] strings: converted with Utf.toString, then formatted like a char[] string. */
char[] parseFrom(T : dchar[]) (T val) {
    // The conversion may throw a UnicodeException; it is deliberately left to propagate.
    char[] utf8 = Utf.toString (val);
    return parseFrom!(char[]) (utf8);
}
/** wchar[] strings: converted with Utf.toString, then formatted like a char[] string. */
char[] parseFrom(T : wchar[]) (T val) {
    // The conversion may throw a UnicodeException; it is deliberately left to propagate.
    char[] utf8 = Utf.toString (val);
    return parseFrom!(char[]) (utf8);
}
161
162 // Binary (array special case)
/** Binary data: formats val as "0x" followed by two lower-case hex digits per byte
 * (high nibble first). An empty array gives "0x".
 */
char[] parseFrom(T : ubyte[]) (T val) {
    static const char[16] hexChars = "0123456789abcdef";

    // Exact length: the "0x" prefix plus two digits per byte.
    char[] text = new char[val.length * 2 + 2];
    text[0] = '0';
    text[1] = 'x';

    for (uint n = 0; n < val.length; ++n) {
        ubyte b = val[n];
        text[2 + 2*n] = hexChars[b >> 4];
        text[3 + 2*n] = hexChars[b & 0x0F];
    }
    return text;
}
176
debug (UnitTest) unittest {
    // Generic array formatting, including the empty array.
    char[] doubles = parseFrom!(double[]) ([1.0, 1.0e-10]);
    assert (doubles == `[1.00000000000000000,0.10000000000000000e-09]`);
    char[] empty = parseFrom!(double[]) (cast(double[]) []);
    assert (empty == `[]`);

    // char[] conversions, with commas, escape sequences and multichar UTF8 characters.
    char[] strings = parseFrom!(char[][]) ([ ".\""[], [',','\''] ,"!\b€" ]);
    assert (strings == `[".\"",",\'","!\b€"]`);

    // wchar[] and dchar[] conversions.
    // The characters were pretty much pulled at random from unicode tables.
    // The last few cause some weird (display only) effects in some editors.
    char[] fromWide = parseFrom!(wchar[]) ("Test string: ¶α؟अกሀ搀"w);
    assert (fromWide == "\"Test string: ¶α؟अกሀ搀\"");
    char[] fromDouble = parseFrom!(dchar[]) ("Test string: ¶α؟अกሀ搀"d);
    assert (fromDouble == "\"Test string: ¶α؟अกሀ搀\"");

    // ubyte[] special notation.
    char[] binary = parseFrom!(ubyte[]) (cast(ubyte[]) [0x01, 0xF2, 0xAC]);
    assert (binary == `0x01f2ac`);
}
193
194
195 // Basic types
196
197 // Char
/** Single char: formats val between single quotes, as a backslash escape if isEscapableChar
 * says so (e.g. a quote character becomes a backslash followed by that quote).
 *
 * NOTE: a val > 127 is not a valid single-char UTF-8 sequence. It is passed through anyway, since
 * we don't know what the result is for; in particular it may be recombined with other chars later.
 */
char[] parseFrom(T : char) (T val) {
    // Can't return a reference to a static array; allocating is cheaper than copying.
    char[] quoted = new char[4];    // max length for an escaped char
    quoted[0] = '\'';

    if (isEscapableChar (val)) {
        quoted[1] = '\\';
        quoted[2] = replaceEscapableChar (val);
        quoted[3] = '\'';
        return quoted;
    }

    quoted[1] = val;
    quoted[2] = '\'';
    return quoted[0..3];
}
// Basic unicode conversions for wide-chars.
// NOTE: any other wide-chars will not fit in a single UTF-8 encoded char.
const char[] WIDE_CHAR_ERROR = "Error: unicode non-ascii character cannot be converted to a single UTF-8 char";

/** wchar: values up to 127 are formatted as the equivalent char; anything larger throws a
 * UnicodeException carrying WIDE_CHAR_ERROR.
 */
char[] parseFrom(T : wchar) (T val) {
    if (val > 127u)
        throw new UnicodeException (WIDE_CHAR_ERROR, 0);
    return parseFrom!(char) (cast(char) val);   // this char can be converted
}
/** dchar: values up to 127 are formatted as the equivalent char; anything larger throws a
 * UnicodeException carrying WIDE_CHAR_ERROR.
 */
char[] parseFrom(T : dchar) (T val) {
    if (val > 127u)
        throw new UnicodeException (WIDE_CHAR_ERROR, 0);
    return parseFrom!(char) (cast(char) val);   // this char can be converted
}
debug (UnitTest) unittest {
    // An escapable char comes out as a quoted backslash sequence.
    char[] quote = parseFrom!(char) ('\'');
    assert (quote == "\'\\\'\'");

    // ASCII wide chars are formatted like plain chars.
    char[] wide = parseFrom!(wchar) ('X');
    assert (wide == "'X'");
    char[] dwide = parseFrom!(dchar) ('X');
    assert (dwide == "'X'");
}
234
235 // Bool
/** Bool: returns "true" or "false". */
char[] parseFrom(T : bool) (T val) {
    char[] text = "false";
    if (val) text = "true";
    return text;
}
240 // too simple to need a unittest
241
242 // Signed ints
/** byte: widened to long and formatted by formatLong. */
char[] parseFrom(T : byte) (T val) {
    return formatLong (cast(long) val);
}
/** short: widened to long and formatted by formatLong. */
char[] parseFrom(T : short) (T val) {
    return formatLong (cast(long) val);
}
/** int: widened to long and formatted by formatLong. */
char[] parseFrom(T : int) (T val) {
    return formatLong (cast(long) val);
}
/** long: formatted by formatLong. */
char[] parseFrom(T : long) (T val) {
    return formatLong (cast(long) val);
}
255 // Unsigned ints
/** ubyte: widened to long and formatted by formatLong. */
char[] parseFrom(T : ubyte) (T val) {
    return formatLong (cast(long) val);
}
/** ushort: widened to long and formatted by formatLong. */
char[] parseFrom(T : ushort) (T val) {
    return formatLong (cast(long) val);
}
/** uint: widened to long and formatted by formatLong. */
char[] parseFrom(T : uint) (T val) {
    return formatLong (cast(long) val);
}
/** ulong: formatted by formatLong when the value fits in a long.
 *
 * Throws: IllegalArgumentException when val > long.max, since formatLong takes a long.
 */
char[] parseFrom(T : ulong) (T val) {
    const ulong limit = cast(ulong) long.max;
    if (val > limit)
        throw new IllegalArgumentException ("No handling available for ulong where value > long.max");
    return formatLong (cast(long) val);
}
debug (UnitTest) unittest {
    char[] negative = parseFrom!(byte) (cast(byte) -5);
    assert (negative == "-5");

    // annoyingly, octal syntax differs from D (blame tango):
    char[] mixedBases = parseFrom!(uint[]) ([0b0100u,0724,0xFa59c,0xFFFFFFFF,0]);
    assert (mixedBases == "[4,468,1025436,4294967295,0]");
}
275
276 // Floats
277 /* Old calculation (not used):
278 t.dig+2+4+3 // should be sufficient length (mant + (neg, dot, e, exp neg) + exp (3,4,5 for float,double,real resp.)) */
/** float: formatted by cFloat.format into a fresh 32-char buffer, passing T.dig+2 and 1 as the
 * remaining arguments. */
char[] parseFrom(T : float) (T val) {
    int digits = T.dig + 2;         // from old C++ tests, T.dig+2 gives best(?) accuracy
    char[] buf = new char[32];      // minimum allowed by assert in format
    return cFloat.format (buf, val, digits, 1);
}
/** double: formatted by cFloat.format into a fresh 32-char buffer, passing T.dig+2 and 1 as the
 * remaining arguments. */
char[] parseFrom(T : double) (T val) {
    int digits = T.dig + 2;
    char[] buf = new char[32];
    return cFloat.format (buf, val, digits, 1);
}
/** real: formatted by cFloat.format into a fresh 32-char buffer, passing T.dig+2 and 1 as the
 * remaining arguments. */
char[] parseFrom(T : real) (T val) {
    int digits = T.dig + 2;
    char[] buf = new char[32];
    return cFloat.format (buf, val, digits, 1);
}
debug (UnitTest) unittest {
    // NOTE: these numbers are not particularly meaningful.
    char[] zero = parseFrom!(float) (0.0f);
    assert (zero == "0.00000000");
    char[] bigNegative = parseFrom!(double) (-1e25);
    assert (bigNegative == "-1.00000000000000000e+25");
    char[] huge = parseFrom!(real) (cast(real) 4.918e300);
    assert (huge == "4.91800000000000000000e+300");
}
297 //END parseFrom templates
298
299 //BEGIN Length templates
300 /* This template provides the initial length for strings for formatting various types. These strings
301 * can be expanded; this value is intended to cover 90% of cases or so.
302 *
303 * NOTE: This template was intended to provide specialisations for different types.
304 * This one value should do reasonably well for most types.
305 */
// Initial per-element buffer estimate used when formatting arrays.
// The generic value is 20; char and bool have their own smaller values.
private template defLength(T)
{
    const uint defLength = 20;
}
private template defLength(T : char)
{
    const uint defLength = 4;
}
private template defLength(T : bool)
{
    const uint defLength = 5;
}
311 //END Length templates
312
313 //BEGIN Utility funcs
/// Formats a long via cInt.toString.
private char[] formatLong (long val) {
    // May throw an IllegalArgumentException; it is deliberately left to propagate.
    char[] text = cInt.toString (val);
    return text;
}
/// True for the characters written as backslash escapes: '\a' through '\r' inclusive,
/// plus the double quote, single quote and backslash.
private bool isEscapableChar (char c) {
    if (c >= '\a' && c <= '\r')
        return true;
    return c == '\"' || c == '\'' || c == '\\';
}
/// Maps an escapable character to the character that follows the backslash in its escape
/// sequence (e.g. a newline maps to 'n'; quotes and backslash map to themselves).
///
/// Throws an IllegalArgumentException for any other character; this should never actually
/// happen within parseFrom.
private char replaceEscapableChar (char c) {
    switch (c) {
        case '\a': return 'a';
        case '\b': return 'b';
        case '\t': return 't';
        case '\n': return 'n';
        case '\v': return 'v';
        case '\f': return 'f';
        case '\r': return 'r';
        case '\"': return '\"';
        case '\'': return '\'';
        case '\\': return '\\';
        default: break;     // not escapable: fall through to the throw below
    }

    throw new IllegalArgumentException ("Character is not escapable (internal parseFrom error)");
}
361
debug (UnitTest) {
    import tango.io.Console;

    unittest {
        Cout ("Running unittest: parseFrom ...").flush;

        // Every supported escape sequence, in one string.
        char[] escaped = parseFrom!(char[]) ("\a\b\t\n\v\f\r\"\'\\");
        assert (escaped == "\"\\a\\b\\t\\n\\v\\f\\r\\\"\\\'\\\\\"");

        Cout (" complete").newline;
    }
}
373 //END Utility funcs