Mercurial > projects > mde
view mde/mergetag/parse/parseFrom.d @ 70:7fc0a8295c83
Moved my parseTo and parseFrom modules from tango.scrapple to mde in order to reduce dependencies.
author | Diggory Hardy <diggory.hardy@gmail.com> |
---|---|
date | Fri, 04 Jul 2008 19:04:16 +0100 |
parents | |
children |
line wrap: on
line source
/**************************************************************************************************
 * copyright: Copyright (c) 2007-2008 Diggory Hardy.
 *
 * author: Diggory Hardy, diggory.hardy@gmail.com
 *
 * license: BSD style: $(LICENSE)
 *
 * This contains templates for converting various data-types to a char[].
 *
 * parseFrom is roughly the inverse of $(B parseTo).
 * It is also available in tango.scrapple.
 *
 * This module basically implements the following templated function for most basic D types:
 * bool, byte, short, int, long, ubyte, ushort, uint, ulong, float, double, real, char, wchar,
 * dchar.
 * It also supports arrays of any supported type (including of other arrays) and has special
 * handling for strings (char[]) and binary (ubyte[]) data-types.
 * -----------------------------
 * char[] parseFrom(T) (T value);
 * -----------------------------
 *
 * $(I value) is the value to convert; it is converted to a string and returned.
 *
 * Syntax:
 * The syntax is the same as parseTo; but since this module only generates formatted output
 * knowing the syntax shouldn't be necessary. There is currently no way to specify options like
 * output base for ints, precision of floats, or
 * whether to write char[] or ubyte[] types as arrays or in their more compact forms.
 *
 * Throws:
 * On errors, an exception is thrown (UnicodeException or IllegalArgumentException). No other
 * exceptions should be thrown.
 *
 * Remarks:
 * There is currently no support for outputting wchar/dchar strings. There are, however, unicode
 * conversions for converting UTF-16/32 to UTF-8. Be warned though that many wchar/dchar characters
 * (any that are non-ascii) will not fit in a single char and an exception will be thrown.
 *
 * The code does involve some heap activity; this is necessary anyway for returning dynamic arrays.
 * (Slices of a pre-allocated array could be returned instead, but for many uses would have to be
 * duplicated before storage, leading to less efficient operation.)
 * Most memory allocation has been kept to a minimum.
 *
 * Unlike the parseTo!() module, the parseFrom templates could be re-written to use static-ifs
 * instead of type specialisation, thus allowing type inference. However I likely won't bother
 * implementing this myself.
 *
 * Examples:
 * ------------------------------------------------------------------------------------------------
 * // Examples are printed via Cout.
 *
 * // Basic examples:
 * Cout (parseFrom!(byte) (-13)).newline;                           // -13
 * Cout (parseFrom!(real) (2.56e11)).newline;                       // 2.55999999999999990000e+11
 * Cout (parseFrom!(double[]) ([0.0, 1.0, 2.0, 3.0])).newline;      // [0.00000000000000000,1.00000000000000000,2.00000000000000000,3.00000000000000000]
 * Cout (parseFrom!(bool[]) ([true,false,false])).newline;          // [true,false,false]
 *
 * // String and ubyte[] special syntaxes (always used):
 * Cout (parseFrom!(char[]) ("A string.")).newline;                 // "A string." (including quotes)
 * Cout (parseFrom!(ubyte[]) (cast(ubyte[]) [5u, 0xF1u, 0x10u])).newline;  // 0x05f110
 *
 * // Associative arrays:
 * Cout (parseFrom!(char[][byte]) ([-1:"negative one"[], 0:"zero", 1:"one"])).newline;  // [0:"zero",1:"one",-1:"negative one"]
 *
 * // No limit on complexity...
 * char[] somethingComplicated = parseFrom!(real[][][bool[int[][]]]) (...);
 * ------------------------------------------------------------------------------------------------
 *************************************************************************************************/
module mde.mergetag.parse.parseFrom;

// tango imports
import tango.core.Exception : UnicodeException, IllegalArgumentException;
import cInt = tango.text.convert.Integer;
import cFloat = tango.text.convert.Float;
import Utf = tango.text.convert.Utf;
import Util = tango.text.Util;

//BEGIN parseFrom templates
/* Idea: could extend parseFrom with a second parameter, containing flags for things like base to output.
 * Unnecessary for mergetag though.
 */

// Associative arrays
/** Formats an associative array as $(I [key:value,key:value,...]); an empty array gives $(I []).
 *
 * Keys and values are formatted by the matching parseFrom specialisation. Entries are written in
 * whatever order foreach yields them.
 */
char[] parseFrom(T : T[S], S) (T[S] val) {
    char[] ret;
    // A guess, including values themselves and [,:] elements (must be at least 2).
    ret.length = val.length * (defLength!(T) + defLength!(S) + 2) + 2;
    ret[0] = '[';
    size_t i = 1;
    foreach (S k, T v; val) {
        char[] s = parseFrom!(S) (k) ~ ":" ~ parseFrom!(T) (v);
        i += s.length;
        // Room is needed for s and for the separator written at ret[i]. A single doubling is not
        // always sufficient (one entry may be far longer than the guess), so grow until it fits.
        while (i+1 >= ret.length)
            ret.length = ret.length * 2;
        ret[i-s.length .. i] = s;
        ret[i++] = ',';
    }
    if (i == 1) ++i;    // special case - not overwriting a comma
    ret[i-1] = ']';     // replaces last comma
    return ret[0..i];
}
debug (UnitTest) unittest {
    char[] X = parseFrom!(char[][char]) (['a':cast(char[])"animal", 'b':['b','u','s']]);
    char[] Y = `['a':"animal",'b':"bus"]`;
    assert (X == Y);

    // Regression: a single entry much longer than the initial length guess.
    char[] big = new char[100];
    big[] = 'x';
    assert (parseFrom!(char[][int]) ([1:big]) == "[1:\"" ~ big ~ "\"]");
}

// Arrays
/** Formats an array as $(I [elem,elem,...]); an empty array gives $(I []).
 *
 * Each element is formatted by the matching parseFrom specialisation.
 */
char[] parseFrom(T : T[]) (T[] val) {
    char[] ret;
    // A guess, including commas and brackets (must be at least 2)
    ret.length = val.length * (defLength!(T) + 1) + 2;
    ret[0] = '[';
    size_t i = 1;
    foreach (T x; val) {
        char[] s = parseFrom!(T) (x);
        i += s.length;
        // Room is needed for s and for the separator written at ret[i]. A single doubling is not
        // always sufficient (one element may be far longer than the guess), so grow until it fits.
        while (i+1 >= ret.length)
            ret.length = ret.length * 2;
        ret[i-s.length .. i] = s;
        ret[i++] = ',';
    }
    if (i == 1) ++i;    // special case - not overwriting a comma
    ret[i-1] = ']';     // replaces last comma
    return ret[0..i];
}

// Strings (array special case)
/** Formats a string as a double-quoted literal, backslash-escaping every character for which
 * isEscapableChar returns true.
 */
char[] parseFrom(T : char[]) (T val) {
    // Initial storage. This should ALWAYS be enough: each input char produces at most two output
    // chars, plus the two enclosing quotes.
    char[] ret = new char[val.length * 2 + 2];
    ret[0] = '"';
    size_t i = 1;       // next write position in ret
    for (size_t t = 0; t < val.length;) {
        // process a block of non-escapable characters
        size_t s = t;
        while (t < val.length && !isEscapableChar(val[t]))
            ++t;        // skip all non-escapable chars
        size_t j = i + t - s;
        ret[i..j] = val[s..t];  // copy a block
        i = j;
        // process a block of escapable characters
        while (t < val.length && isEscapableChar(val[t])) {
            ret[i++] = '\\';                            // backslash; increment i
            ret[i++] = replaceEscapableChar(val[t++]);  // character; increment i and t
        }
    }
    ret[i++] = '"';
    return ret[0..i];
}
// Unicode conversions for strings:
/** Converts a dchar[] to UTF-8 via Utf.toString, then formats it like a char[]. */
char[] parseFrom(T : dchar[]) (T val) {
    // May throw a UnicodeException; don't bother catching and rethrowing:
    return parseFrom!(char[]) (Utf.toString (val));
}
/** Converts a wchar[] to UTF-8 via Utf.toString, then formats it like a char[]. */
char[] parseFrom(T : wchar[]) (T val) {
    // May throw a UnicodeException; don't bother catching and rethrowing:
    return parseFrom!(char[]) (Utf.toString (val));
}

// Binary (array special case)
/** Formats binary data as $(I 0x) followed by two lower-case hex digits per byte. */
char[] parseFrom(T : ubyte[]) (T val) {
    static const char[16] digits = "0123456789abcdef";

    char[] ret = new char[val.length * 2 + 2];  // exact length
    ret[0..2] = "0x";
    size_t i = 2;

    foreach (ubyte x; val) {
        ret[i++] = digits[x >> 4];      // high nibble
        ret[i++] = digits[x & 0x0F];    // low nibble
    }
    return ret;
}

debug (UnitTest) unittest {
    // generic array stuff:
    assert (parseFrom!(double[]) ([1.0, 1.0e-10]) == `[1.00000000000000000,0.10000000000000000e-09]`);
    assert (parseFrom!(double[]) (cast(double[]) []) == `[]`);  // empty array

    // Regression: a single element much longer than the initial length guess.
    char[] big = new char[100];
    big[] = 'x';
    assert (parseFrom!(char[][]) ([big]) == "[\"" ~ big ~ "\"]");

    // char[] conversions, with commas, escape sequences and multichar UTF8 characters:
    assert (parseFrom!(char[][]) ([ ".\""[], [',','\''] ,"!\b€" ]) == `[".\"",",\'","!\b€"]`);

    // wchar[] and dchar[] conversions:
    // The characters were pretty-much pulled at random from unicode tables.
    // The last few cause some weird (display only) effects in my editor.
    assert (parseFrom!(wchar[]) ("Test string: ¶α؟अกሀ搀"w) == "\"Test string: ¶α؟अกሀ搀\"");
    assert (parseFrom!(dchar[]) ("Test string: ¶α؟अกሀ搀"d) == "\"Test string: ¶α؟अกሀ搀\"");

    assert (parseFrom!(ubyte[]) (cast(ubyte[]) [0x01, 0xF2, 0xAC]) == `0x01f2ac`);  // ubyte[] special notation
}

// Basic types
// Char
/** Formats a single char as a single-quoted literal, backslash-escaping it if isEscapableChar
 * returns true for it.
 */
char[] parseFrom(T : char) (T val) {
    // NOTE: if (val > 127) "is invalid UTF-8 single char"
    // However we don't know what this is for, in particular if it will be recombined with other chars later

    // Can't return reference to static array; making dynamic is cheaper than copying.
    char[] ret = new char[4];   // max length for an escaped char
    ret[0] = '\'';

    if (isEscapableChar (val)) {
        ret[1] = '\\';
        ret[2] = replaceEscapableChar (val);
        ret[3] = '\'';
        return ret;
    }

    ret[1] = val;
    ret[2] = '\'';
    return ret[0..3];
}

// Basic unicode conversions for wide-chars.
// NOTE: any other wide-chars will not fit in a single UTF-8 encoded char.
const char[] WIDE_CHAR_ERROR = "Error: unicode non-ascii character cannot be converted to a single UTF-8 char";
/** Formats an ASCII wchar like a char; throws UnicodeException for any value above 127. */
char[] parseFrom(T : wchar) (T val) {
    if (val <= 127u) return parseFrom!(char) (cast(char) val);  // this char can be converted
    else throw new UnicodeException (WIDE_CHAR_ERROR, 0);
}
/** Formats an ASCII dchar like a char; throws UnicodeException for any value above 127. */
char[] parseFrom(T : dchar) (T val) {
    if (val <= 127u) return parseFrom!(char) (cast(char) val);  // this char can be converted
    else throw new UnicodeException (WIDE_CHAR_ERROR, 0);
}

debug (UnitTest) unittest {
    assert (parseFrom!(char) ('\'') == "\'\\\'\'");
    assert (parseFrom!(wchar) ('X') == "'X'");
    assert (parseFrom!(dchar) ('X') == "'X'");
}

// Bool
/** Returns "true" or "false". */
char[] parseFrom(T : bool) (T val) {
    if (val) return "true";
    else return "false";
}
// too simple to need a unittest

// Signed ints
// Each of these forwards to formatLong; val is widened to long by the call.
char[] parseFrom(T : byte) (T val) {
    return formatLong (val);
}
char[] parseFrom(T : short) (T val) {
    return formatLong (val);
}
char[] parseFrom(T : int) (T val) {
    return formatLong (val);
}
char[] parseFrom(T : long) (T val) {
    return formatLong (val);
}

// Unsigned ints
char[] parseFrom(T : ubyte) (T val) {
    return formatLong (val);
}
char[] parseFrom(T : ushort) (T val) {
    return formatLong (val);
}
char[] parseFrom(T : uint) (T val) {
    return formatLong (val);
}
/** Formats a ulong via formatLong.
 *
 * Throws: IllegalArgumentException if val > long.max, since formatLong takes a long.
 */
char[] parseFrom(T : ulong) (T val) {
    if (val > cast(ulong) long.max)
        throw new IllegalArgumentException ("No handling available for ulong where value > long.max");
    return formatLong (val);
}

debug (UnitTest) unittest {
    assert (parseFrom!(byte) (cast(byte) -5) == "-5");
    // annoyingly, octal syntax differs from D (blame tango):
    assert (parseFrom!(uint[]) ([0b0100u,0724,0xFa59c,0xFFFFFFFF,0]) == "[4,468,1025436,4294967295,0]");
}

// Floats
/* Old calculation (not used):
t.dig+2+4+3 // should be sufficient length (mant + (neg, dot, e, exp neg) + exp (3,4,5 for float,double,real resp.))
*/
// Each float specialisation formats into a fresh 32-char buffer with T.dig+2 decimals using
// cFloat.format and returns what format returns.
// NOTE(review): the trailing argument 1 is format's exponent parameter; see the Tango docs for
// its exact meaning.
char[] parseFrom(T : float) (T val) {
    char[] ret = new char[32];  // minimum allowed by assert in format
    return cFloat.format (ret, val, T.dig+2, 1);    // from old C++ tests, T.dig+2 gives best(?) accuracy
}
char[] parseFrom(T : double) (T val) {
    char[] ret = new char[32];
    return cFloat.format (ret, val, T.dig+2, 1);
}
char[] parseFrom(T : real) (T val) {
    char[] ret = new char[32];
    return cFloat.format (ret, val, T.dig+2, 1);
}

debug (UnitTest) unittest {
    // NOTE: these numbers are not particularly meaningful.
    assert (parseFrom!(float) (0.0f) == "0.00000000");
    assert (parseFrom!(double) (-1e25) == "-1.00000000000000000e+25");
    assert (parseFrom!(real) (cast(real) 4.918e300) == "4.91800000000000000000e+300");
}
//END parseFrom templates

//BEGIN Length templates
/* This template provides the initial length for strings for formatting various types. These strings
 * can be expanded; this value is intended to cover 90% of cases or so.
 *
 * NOTE: This template was intended to provide specialisations for different types.
 * This one value should do reasonably well for most types.
 */
private {
    // Default guess for any type without a specialisation below.
    template defLength(T)        { const uint defLength = 20; }
    // A formatted char is at most 4 chars (two quotes, backslash, character).
    template defLength(T : char) { const uint defLength = 4; }
    // "false" is 5 chars.
    template defLength(T : bool) { const uint defLength = 5; }
}
//END Length templates

//BEGIN Utility funcs
// Formats a long using tango's Integer.toString.
private char[] formatLong (long val) {
    // May throw an IllegalArgumentException; don't bother catching and rethrowing:
    return cInt.toString (val);
}
// True for the control characters \a .. \r (7 .. 13) and for the quote and backslash characters.
private bool isEscapableChar (char c) {
    return ((c >= '\a' && c <= '\r') || c == '\"' || c == '\'' || c == '\\');
}
// Maps an escapable character to the letter/symbol that follows the backslash in its escape
// sequence.
// Throws on unsupported escape sequences; however this should never actually happen within parseFrom.
private char replaceEscapableChar (char c) {
    switch (c) {
        case '\a': return 'a';
        case '\b': return 'b';
        case '\t': return 't';
        case '\n': return 'n';
        case '\v': return 'v';
        case '\f': return 'f';
        case '\r': return 'r';
        case '\"': return '\"';
        case '\'': return '\'';
        case '\\': return '\\';
        default: break;
    }

    // if we haven't returned:
    throw new IllegalArgumentException ("Character is not escapable (internal parseFrom error)");
}

debug (UnitTest) {
    import tango.io.Console;

    unittest {
        Cout ("Running unittest: parseFrom ...").flush;

        assert (parseFrom!(char[]) ("\a\b\t\n\v\f\r\"\'\\") == "\"\\a\\b\\t\\n\\v\\f\\r\\\"\\\'\\\\\"");

        Cout (" complete").newline;
    }
}
//END Utility funcs