Mercurial > projects > mde
diff mde/mergetag/deserialize.d @ 79:61ea26abe4dd
Moved mde/mergetag/parse/parse(To/From) to mde/mergetag/(de)serialize. Implemented (de)serialization of structs.
author | Diggory Hardy <diggory.hardy@gmail.com> |
---|---|
date | Tue, 05 Aug 2008 11:51:51 +0100 |
parents | mde/mergetag/parse/parseTo.d@7fc0a8295c83 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mde/mergetag/deserialize.d Tue Aug 05 11:51:51 2008 +0100
@@ -0,0 +1,605 @@
+/**************************************************************************************************
+ * Generic deserialization templated function.
+ *
+ * copyright: Copyright (c) 2007-2008 Diggory Hardy.
+ *
+ * author: Diggory Hardy, diggory.hardy@gmail.com
+ *
+ * Supports:
+ * Associative arrays, arrays (inc. strings), structs, char types, bool, int types, float types.
+ *
+ * There are also some public utility functions with their own documentation.
+ *
+ * Throws:
+ * On errors, a ParseException or a UnicodeException (both extend TextException) is thrown with a
+ * suitable message. No other exceptions should be thrown.
+ *
+ * Examples:
+ * ------------------------------------------------------------------------------------------------
+ * // Basic examples:
+ * ulong a = deserialize!(ulong) ("20350");
+ * float d = deserialize!(float) (" 1.2e-9 ");
+ * int[] b = deserialize!(int[]) ("[0,1,2,3]");
+ *
+ * // String and char[] syntax:
+ * char[] c = deserialize!(char[]) ("\"A string\"");
+ * char[] e = deserialize!(char[]) ("['a','n','o','t','h','e','r', ' ' ,'s','t','r','i','n','g']");
+ *
+ * // These can be used interchangeably; here's a more complex example of an associative array:
+ * bool[char[]] f = deserialize!(bool[char[]]) ("[ \"one\":true, ['t','w','o']:false, \"three\":1, \"four\":000 ]");
+ *
+ * // There is also a special notation for ubyte[] types:
+ * // The digits following 0x must be in pairs and each specify one ubyte.
+ * assert ( deserialize!(ubyte[]) (`0x01F2AC`) == deserialize!(ubyte[]) (`[01 ,0xF2, 0xAC]`) );
+ *
+ * // There's no limit to the complexity!
+ * char[char[][][][char]][bool] z = ...; // don't expect me to write this!
+ * ------------------------------------------------------------------------------------------------
+ *
+ * TODO: Optimize memory allocation (if possible?). Test best sizes for initial allocations
+ * instead of merely guessing?
+ *************************************************************************************************/
+//NOTE: in case of multiple formats, make this a dummy module importing both serialize modules,
+// or put all the code here.
+module mde.mergetag.deserialize;
+
+// tango imports
+import tango.core.Exception : TextException, UnicodeException;
+import cInt = tango.text.convert.Integer;
+import cFloat = tango.text.convert.Float;
+import Utf = tango.text.convert.Utf;
+import Util = tango.text.Util;
+
+/**
+ * Base class for deserialize exceptions.
+ *
+ * Thrown by the deserialize templates and their helper functions in this module whenever the
+ * input text does not match the expected syntax.
+ */
+class ParseException : TextException
+{
+    this( char[] msg )
+    {
+        super( msg );
+    }
+}
+
+alias deserialize parseTo; // support the old name
+
+//BEGIN deserialize templates
+
+// Associative arrays
+//
+// Parses text of the form [ KEY:VALUE, KEY:VALUE, ... ]. The text between the brackets is cut
+// into pieces by split(); each piece is divided at its first ':' that lies outside a quoted
+// character or string, and the two halves are passed to deserialize!(S) and deserialize!(T).
+// Throws a ParseException if the enclosing [ ] are missing or a piece contains no such ':'.
+
+T[S] deserialize(T : T[S], S) (char[] src) {
+    src = Util.trim(src);
+    if (src.length < 2 || src[0] != '[' || src[$-1] != ']')
+        throw new ParseException ("Invalid associative array: not [ ... ]"); // bad braces.
+
+    T[S] ret;
+    foreach (char[] pair; split (src[1..$-1])) {
+        uint i = 0;
+        while (i < pair.length) { // advance to the ':'
+            char c = pair[i];
+            if (c == ':') break;
+            if (c == '\'' || c == '"') { // string or character
+                // Skip to the matching closing quote so that a ':' inside the key is ignored.
+                ++i;
+                while (i < pair.length && pair[i] != c) {
+                    if (pair[i] == '\\')
+                        ++i; // escape seq.
+                    ++i;
+                }
+                // Could have an unterminated ' or " causing i >= pair.length, but:
+                // 1. Impossible: split would have thrown
+                // 2. In any case this would be caught below.
+            }
+            ++i;
+        }
+        // i now indexes the ':' separating key and value, or is past the end if there is none.
+        if (i >= pair.length)
+            throw new ParseException ("Invalid associative array: encountered [ ... KEY] (missing :DATA)");
+        ret[deserialize!(S) (pair[0..i])] = deserialize!(T) (pair[i+1..$]);
+    }
+    return ret;
+}
+
+
+// Arrays
+//
+// Parses text of the form [ ELEMENT, ELEMENT, ... ] by delegating to toArray().
+// Throws a ParseException if the trimmed text is not enclosed in [ ].
+
+T[] deserialize(T : T[]) (char[] src) {
+    src = Util.trim(src);
+    if (src.length >= 2 && src[0] == '[' && src[$-1] == ']')
+        return toArray!(T[]) (src);
+    throw new ParseException ("Invalid array: not [ ... ]");
+}
+
+// String (array special case)
+//
+// Accepts either a double-quoted string, in which each backslash escape is translated by
+// unEscapeChar(), or the generic array notation ['a','b',...], which is handled by toArray().
+// Throws a ParseException for any other form, for a backslash with nothing after it, and
+// (via unEscapeChar) for an unsupported escape sequence.
+T deserialize(T : char[]) (char[] src) {
+    src = Util.trim(src);
+    if (src.length >= 2 && src[0] == '"' && src[$-1] == '"') {
+        src = src[1..$-1];
+        T ret;
+        ret.length = src.length; // maximum length; retract to actual length later
+        uint i = 0; // write position in ret
+        for (uint t = 0; t < src.length;) { // t is the read position in src
+            // process a block of non-escaped characters
+            uint s = t;
+            while (t < src.length && src[t] != '\\') ++t; // non-escaped characters
+            uint j = i + t - s;
+            ret[i..j] = src[s..t]; // copy a block
+            i = j;
+
+            // process a block of escaped characters
+            while (t < src.length && src[t] == '\\') {
+                t++;
+                if (t == src.length)
+                    throw new ParseException ("Invalid string: ends \\\" !"); // next char is "
+                ret[i++] = unEscapeChar (src[t++]); // throws if it's invalid
+            }
+        }
+        return ret[0..i];
+    }
+    else if (src.length >= 2 && src[0] == '[' && src[$-1] == ']')
+        return toArray!(T) (src);
+    throw new ParseException ("Invalid string: not quoted (\"*\") or char array (['a',...,'c'])");
+}
+// Unicode conversions for strings:
+// The text is first parsed as a char[] and the result is then converted with Tango's Utf module.
+T deserialize(T : wchar[]) (char[] src) {
+    // May throw a UnicodeException; don't bother catching and rethrowing:
+    return Utf.toString16 (deserialize!(char[]) (src));
+}
+T deserialize(T : dchar[]) (char[] src) {
+    // May throw a UnicodeException; don't bother catching and rethrowing:
+    return Utf.toString32 (deserialize!(char[]) (src));
+}
+
+// Binary (array special case)
+//
+// Accepts either the generic array notation [1, 0xF2, ...] or the compact form 0x01F2AC, in
+// which every pair of hex digits after the "0x" prefix gives one ubyte.
+// Throws a ParseException for any other form, for an odd number of hex digits, and (via
+// readHexChar) for a character which is not a hex digit.
+T deserialize(T : ubyte[]) (char[] src) {
+    src = Util.trim(src);
+    // Standard case:
+    if (src.length >= 2 && src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);
+    // Special case: sequence of hex digits, each pair of which is a ubyte
+    if (src.length >= 2 && src[0..2] == "0x") {
+        src = src[2..$]; // strip down to actual digits
+
+        // Must be in pairs:
+        if (src.length % 2 == 1)
+            throw new ParseException ("Invalid binary: odd number of chars");
+
+        T ret;
+        ret.length = src.length / 2; // exact
+
+        // readHexChar increments pos itself, so each iteration consumes two digits:
+        // the first gives the high nibble, the second the low nibble.
+        for (uint i, pos; pos + 1 < src.length; ++i) {
+            ubyte x = readHexChar(src, pos) << 4;
+            x |= readHexChar(src, pos);
+            ret[i] = x;
+        }
+        return ret;
+    }
+    else throw new ParseException ("Invalid ubyte[]: not an array and doesn't start 0x");
+}
+
+
+// Basic types
+
+// Char
+// Assumes value is <= 127 (for valid UTF-8), since input would be invalid UTF-8 if not anyway.
+// (And we're not really interested in checking for valid unicode; char[] conversions don't either.)
+//
+// Accepted forms are 'x' (exactly one byte between the quotes) and '\x' (a backslash escape
+// translated by unEscapeChar). Anything else throws a ParseException.
+T deserialize(T : char) (char[] src) {
+    src = Util.trim(src);
+    if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'')
+        throw new ParseException ("Invalid char: not 'x' or '\\x'");
+    if (src[1] != '\\') {
+        if (src.length == 3)
+            return src[1]; // Either non escaped
+        throw new ParseException ("Invalid char: too long (or non-ASCII)");
+    } else if (src.length == 4)
+        return unEscapeChar (src[2]); // Or escaped
+
+    // Reached when the text starts '\ but is not exactly four bytes long, e.g. '\' .
+    throw new ParseException ("Invalid char: '\\'");
+}
+// Basic unicode conversions for wide-chars.
+// Assumes value is <= 127 as does deserialize!(char).
+T deserialize(T : wchar) (char[] src) {
+    return cast(T) deserialize!(char) (src);
+}
+T deserialize(T : dchar) (char[] src) {
+    return cast(T) deserialize!(char) (src);
+}
+
+// Bool
+//
+// Accepted forms:
+//   "true" / "false"
+//   one or more '0' digits                         -> false
+//   zero or more '0' digits followed by a single '1' -> true
+// Anything else (including an empty string) throws a ParseException.
+T deserialize(T : bool) (char[] src) {
+    src = Util.trim(src);
+    if (src == "true")
+        return true;
+    if (src == "false")
+        return false;
+    uint pos;
+    while (src.length > pos && src[pos] == '0') ++pos; // skip leading zeros
+    if (src.length == pos && pos > 0)
+        return false;
+    if (src.length == pos + 1 && src[pos] == '1')
+        return true;
+    throw new ParseException ("Invalid bool: not true or false and doesn't evaluate to 0 or 1");
+}
+
+// Ints
+// One specialization per integer type; all of them forward to toTInt, which does the parsing
+// and the range check against the target type.
+T deserialize(T : byte) (char[] src) {
+    return toTInt!(T) (src);
+}
+T deserialize(T : short) (char[] src) {
+    return toTInt!(T) (src);
+}
+T deserialize(T : int) (char[] src) {
+    return toTInt!(T) (src);
+}
+T deserialize(T : long) (char[] src) {
+    return toTInt!(T) (src);
+}
+T deserialize(T : ubyte) (char[] src) {
+    return toTInt!(T) (src);
+}
+T deserialize(T : ushort) (char[] src) {
+    return toTInt!(T) (src);
+}
+T deserialize(T : uint) (char[] src) {
+    return toTInt!(T) (src);
+}
+T deserialize(T : ulong) (char[] src) {
+    return toTInt!(T) (src);
+}
+debug (UnitTest) unittest {
+    assert (deserialize!(byte) ("-5") == cast(byte) -5);
+    // annoyingly, octal syntax differs from D (blame tango):
+    assert (deserialize!(uint[]) ("[0b0100,0o724,0xFa59c,0xFFFFFFFF,0]") == [0b0100u,0724,0xFa59c,0xFFFFFFFF,0]);
+}
+
+// Floats
+// One specialization per floating point type; all of them forward to toTFloat.
+T deserialize(T : float) (char[] src) {
+    return toTFloat!(T) (src);
+}
+T deserialize(T : double) (char[] src) {
+    return toTFloat!(T) (src);
+}
+T deserialize(T : real) (char[] src) {
+    return toTFloat!(T) (src);
+}
+
+
+// Structs
+//
+// Parses text of the form { INDEX:VALUE, INDEX:VALUE, ... } where INDEX is the position of a
+// member within T.tupleof. Members may be given in any order; members which are not listed
+// keep their default value. Being the unspecialized template, this is also what any
+// unsupported type ends up in, hence the static assert.
+//
+// Throws a ParseException if the enclosing { } are missing, if an entry has no ':', if INDEX
+// is not a valid integer, or if INDEX is not less than the number of members of T.
+T deserialize(T) (char[] src) {
+    // T.stringof gives the type's name; typeof() cannot be applied to a type.
+    static assert (is(T == struct), "Unsupported type: "~T.stringof);
+
+    src = Util.trim(src);
+    if (src.length < 2 || src[0] != '{' || src[$-1] != '}')
+        throw new ParseException ("Invalid struct: not { ... }");
+
+    // cannot access elements of T.tupleof with non-const key, so use a type which can be
+    // accessed with a non-const key to store slices:
+    char[][T.tupleof.length] temp;
+    foreach (char[] pair; split (src[1..$-1])) {
+        uint i = 0;
+        while (i < pair.length) { // advance to the ':'
+            char c = pair[i];
+            if (c == ':')
+                break;
+            // key must be an int so no need for string checks
+            ++i;
+        }
+        if (i >= pair.length)
+            throw new ParseException ("Invalid struct: encountered { ... KEY} (missing :DATA)");
+
+        size_t k = deserialize!(size_t) (pair[0..i]);
+        // The index comes straight from the input: reject it here rather than indexing
+        // temp out of bounds (which would not raise a ParseException).
+        if (k >= temp.length)
+            throw new ParseException ("Invalid struct: member index out of range");
+        // Note: could check no entry was already stored in temp.
+        temp[k] = pair[i+1..$];
+    }
+    T ret;
+    setStruct (ret, temp);
+    return ret;
+}
+//END deserialize templates
+
+//BEGIN Utility funcs
+/** Splits a string into substrings separated by '$(B ,)' with support for characters and strings
+ * containing escape sequences and for embedded arrays ($(B [...])) and structs ($(B {...})).
+ *
+ * Commas inside a quoted character or string, or inside an embedded array or struct, do not
+ * split the string.
+ *
+ * Params:
+ * src A string to separate on commas. It shouldn't have enclosing brackets.
+ *
+ * Returns:
+ * An array of substrings within src, excluding commas. Whitespace is not stripped and
+ * empty strings may get returned. If src contains only whitespace an empty array is returned.
+ *
+ * Throws:
+ * A ParseException if a closing bracket or brace is found without a matching opening one, or
+ * if a quote is left unterminated at the end of src.
+ *
+ * Remarks:
+ * This function is primarily intended as a utility function for use by the templates
+ * parsing arrays and associative arrays, but it may be useful in other cases too. Hence the
+ * fact no brackets are stripped from src.
+ */
+//FIXME foreach struct is more efficient
+char[][] split (char[] src) {
+    src = Util.trim (src);
+    if (src == "")
+        return []; // empty array: no elements when no data
+
+    uint depth = 0; // surface depth (embedded arrays and structs)
+    char[][] ret;
+    // Unlikely to need a longer array. The +1 keeps the length non-zero for very short input,
+    // so that doubling it below always makes room for another piece.
+    ret.length = src.length / 3 + 1;
+    uint k = 0; // current split piece
+    uint i = 0, j = 0; // current read location, start of current piece
+
+    while (i < src.length) {
+        char c = src[i];
+        if (c == '\'' || c == '"') { // string or character
+            ++i;
+            while (i < src.length && src[i] != c) {
+                if (src[i] == '\\')
+                    ++i; // escape seq.
+                ++i;
+            } // No terminal quote leaves i >= src.length here; caught after the outer loop.
+        }
+        // Braces are tracked like brackets so that the commas of an embedded struct with
+        // several members do not split the enclosing item.
+        else if (c == '[' || c == '{') ++depth;
+        else if (c == ']' || c == '}') {
+            if (depth)
+                --depth;
+            else throw new ParseException ("Invalid array literal: closes before end of data item.");
+        }
+        else if (c == ',' && depth == 0) { // only if not an embedded array or struct
+            if (ret.length <= k)
+                ret.length = ret.length * 2;
+            ret[k++] = src[j..i]; // add this piece and increment k
+            j = i + 1;
+        }
+        ++i;
+    }
+    // i only overshoots src.length when the quote-skipping loop ran off the end of src.
+    if (i > src.length)
+        throw new ParseException ("Unterminated quote (\' or \")");
+
+    if (ret.length <= k)
+        ret.length = k + 1;
+    ret[k] = src[j..i]; // add final piece (i >= j)
+    return ret[0..k+1];
+}
+
+/* Templated read-int function to read (un)signed 1-8 byte integers.
+ *
+ * Actually a reimplementation of tango.text.convert.Integer toLong and parse functions.
+ *
+ * The whole of src (after trimming whitespace) must be consumed by the number. The value must
+ * be representable in TInt: for signed types the full range TInt.min .. TInt.max is accepted;
+ * for unsigned types the only value accepted with a minus sign is zero.
+ *
+ * Throws a ParseException if there are no digits, if an unexpected character is found, or if
+ * the value is out of range for TInt.
+ */
+private TInt toTInt(TInt) (char[] src) {
+    const char[] INT_OUT_OF_RANGE = "Integer out of range";
+    bool sign;
+    uint radix, ate, ate2;
+
+    // Trim off whitespace.
+    // NOTE: Cannot use tango.text.convert.Integer.trim to trim leading whitespace since it doesn't
+    // treat new-lines, etc. as whitespace which for our purposes is whitespace.
+    src = Util.trim (src);
+
+    ate = cInt.trim (src, sign, radix);
+    if (ate == src.length)
+        throw new ParseException ("Invalid integer: no digits");
+    ulong val = cInt.convert (src[ate..$], radix, &ate2);
+    ate += ate2;
+
+    if (ate < src.length)
+        throw new ParseException ("Invalid integer at marked character: \"" ~ src[0..ate] ~ "'" ~ src[ate] ~ "'" ~ src[ate+1..$] ~ "\"");
+
+    if (sign) {
+        // Largest magnitude allowed after a minus sign: |TInt.min| == TInt.max + 1 for signed
+        // types, 0 for unsigned types. Comparing magnitudes as ulong avoids both rejecting
+        // TInt.min and the signed/unsigned comparison which let negative ulong input through.
+        const ulong NEG_LIMIT = (TInt.min != 0) ? cast(ulong) TInt.max + 1 : 0;
+        if (val > NEG_LIMIT)
+            throw new ParseException (INT_OUT_OF_RANGE);
+        // val <= 2^63 here; negating in long wraps long.min onto itself, which is correct.
+        return cast(TInt) (-cast(long) val);
+    }
+    if (val > TInt.max)
+        throw new ParseException (INT_OUT_OF_RANGE);
+    return cast(TInt) val;
+}
+
+/* Basically a reimplementation of tango.text.convert.Float.toFloat which checks for
+ * whitespace before throwing an exception for overlong input.
+ *
+ * Throws a ParseException if src is empty after trimming, or if the number does not extend to
+ * the end of the trimmed text. */
+private TFloat toTFloat(TFloat) (char[] src) {
+    // NOTE: As for toTInt(), this needs to strip leading as well as trailing whitespace.
+    src = Util.trim (src);
+    if (src == "")
+        throw new ParseException ("Invalid float: no digits");
+    uint ate;
+
+    TFloat x = cFloat.parse (src, &ate);
+    // parse reports how many characters it consumed; anything left over is not part of the
+    // number (this also covers the case where nothing at all could be parsed).
+    if (ate < src.length)
+        throw new ParseException ("Invalid float at marked character: \"" ~ src[0..ate] ~ "'" ~ src[ate] ~ "'" ~ src[ate+1..$] ~ "\"");
+    return x;
+}
+
+/* Throws an exception on invalid escape sequences. Supported escape sequences are the following
+ * subset of those supported by D: \" \' \\ \a \b \f \n \r \t \v
+ *
+ * c is the character following the backslash; the character it stands for is returned.
+ */
+private char unEscapeChar (char c)
+{
+    switch (c) {
+        case '\"': return '\"';
+        case '\'': return '\'';
+        case '\\': return '\\';
+        case 'a': return '\a';
+        case 'b': return '\b';
+        case 'f': return '\f';
+        case 'n': return '\n';
+        case 'r': return '\r';
+        case 't': return '\t';
+        case 'v': return '\v';
+        default: break;
+    }
+
+    // if we haven't returned:
+    throw new ParseException ("Bad escape sequence: \\"~c);
+}
+
+// Reads one hex char: [0-9A-Fa-f]. Otherwise throws an exception.
+// Doesn't check src.length.
+// Returns the digit's value (0-15) and increments pos, so that successive calls read
+// successive digits.
+private ubyte readHexChar (char[] src, inout uint pos) {
+    ubyte x;
+    if (src[pos] >= '0' && src[pos] <= '9') x = src[pos] - '0';
+    else if (src[pos] >= 'A' && src[pos] <= 'F') x = src[pos] - 'A' + 10;
+    else if (src[pos] >= 'a' && src[pos] <= 'f') x = src[pos] - 'a' + 10;
+    else throw new ParseException ("Invalid hex digit.");
+    ++pos;
+    return x;
+}
+
+// Generic array reader
+// Assumes input is of form "[xxxxx]" (i.e. first and last chars are '[', ']' and length >= 2).
+// The text between the brackets is cut into elements by split() and each element is passed to
+// deserialize!(T). The result is a slice holding exactly the elements read.
+private T[] toArray(T : T[]) (char[] src) {
+    T[] ret = new T[16]; // avoid unnecessary allocations
+    uint i = 0; // number of elements stored so far
+    foreach (char[] element; split(src[1..$-1])) {
+        if (i == ret.length) ret.length = ret.length * 2; // full: double the capacity
+        ret[i] = deserialize!(T) (element);
+        ++i;
+    }
+    return ret[0..i];
+}
+
+/** Set a struct's elements from an array.
+*
+* src[i] holds the text for member i of s (in S.tupleof order). The template recurses over i at
+* compile time, assigning each member for which src[i] tests true the result of deserializing
+* that text; other members are left untouched.
+*
+* For a more generic version, see http://www.dsource.org/projects/tutorials/wiki/StructTupleof
+*/
+// NOTE: Efficiency? Do recursive calls get inlined?
+private void setStruct(S, size_t N, size_t i = 0) (ref S s, char[][N] src) {
+    static assert (is(S == struct), "Only to be used with structs.");
+    static assert (N == S.tupleof.length, "src.length != S.tupleof.length");
+    static if (i < N) {
+        // NOTE(review): presumably false only for entries the caller never assigned (null
+        // slices) - confirm against the array truth-test semantics of the compiler in use.
+        if (src[i])
+            s.tupleof[i] = deserialize!(typeof(s.tupleof[i])) (src[i]);
+        setStruct!(S, N, i+1) (s, src); // next member; the recursion ends when i == N
+    }
+}
+//END Utility funcs
+
+debug (UnitTest) {
+    import tango.util.log.Log : Log, Logger;
+
+    private Logger logger;
+    static this() {
+        logger = Log.getLogger ("text.deserialize");
+    }
+unittest {
+    // Utility
+    // Runs dg and reports whether it threw; the exception message is logged.
+    bool throws (void delegate() dg) {
+        bool r = false;
+        try {
+            dg();
+        } catch (Exception e) {
+            r = true;
+            logger.info ("Exception caught: "~e.msg);
+        }
+        return r;
+    }
+    // Check the helper itself before relying on it below:
+    assert (!throws ({ int i = 5; }));
+    assert (throws ({ throw new Exception ("Test - this exception should be caught"); }));
+
+
+    // Associative arrays
+    char[][char] X = deserialize!(char[][char]) (`['a':"animal\n", 'b':['b','u','s','\n']]`);
+    char[][char] Y = ['a':cast(char[])"animal\n", 'b':['b','u','s','\n']];
+
+    //FIXME: when the compiler's fixed: http://d.puremagic.com/issues/show_bug.cgi?id=1671
+    // just assert (X == Y)
+    assert (X.length == Y.length);
+    assert (X.keys == Y.keys);
+    assert (X.values == Y.values);
+    //X.rehash; Y.rehash; // doesn't make a difference
+    //assert (X == Y); // fails (compiler bug)
+
+    assert (throws ({ deserialize!(int[int]) (`[1:1`); })); // bad brackets
+    assert (throws ({ deserialize!(int[char[]]) (`["ab\":1]`); })); // unterminated quote
+    assert (throws ({ deserialize!(int[char[]]) (`["abc,\a\b\c":1]`); })); // bad escape seq.
+    assert (throws ({ deserialize!(int[char[]]) (`["abc"]`); })); // no data
+
+
+    // Arrays
+    assert (deserialize!(double[]) (`[1.0,1.0e-10]`) == [1.0, 1.0e-10]);// generic array stuff
+    assert (deserialize!(double[]) (`[ ]`) == cast(double[]) []); // empty array
+    assert (deserialize!(int[][]) (`[[1],[2,3],[]]`) == [[1],[2,3],[]]);// sub-array
+    assert (throws ({ deserialize!(int[]) (`[1,2`); })); // bad brackets
+    assert (throws ({ deserialize!(int[][]) (`[[1]]]`); })); // bad brackets
+
+    // char[] and char conversions, with commas, escape sequences and multichar UTF8 characters:
+    assert (deserialize!(char[][]) (`[ ".\"", [',','\''] ,"!\b€" ]`) == [ ".\"".dup, [',','\''] ,"!\b€" ]);
+    assert (throws ({ deserialize!(char[]) ("\"\\\""); })); // string ends with a lone backslash
+    assert (throws ({ deserialize!(char[]) (`['a'`); })); // bad brackets
+
+    // wchar[] and dchar[] conversions:
+    // The characters were pretty-much pulled at random from unicode tables.
+    // The last few cause some weird (display only) effects in my editor.
+    assert (deserialize!(wchar[]) ("\"Test string: ¶α؟अกሀ搀\"") == "Test string: ¶α؟अกሀ搀"w);
+    assert (deserialize!(dchar[]) ("\"Test string: ¶α؟अกሀ搀\"") == "Test string: ¶α؟अกሀ搀"d);
+
+    assert (deserialize!(ubyte[]) (`0x01F2aC`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]); // ubyte[] special notation
+    assert (deserialize!(ubyte[]) (`[01 ,0xF2, 0xAC]`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]); // ubyte[] std notation
+    assert (throws ({ deserialize!(ubyte[]) (`0x123`); })); // digits not in pairs
+    assert (throws ({ deserialize!(ubyte[]) (`[2,5`); })); // not [...] or 0x..
+    assert (throws ({ deserialize!(ubyte[]) (`0x123j`); })); // 'j' is not a hex digit
+
+
+    // char types
+    assert (deserialize!(char) ("'\\\''") == '\'');
+    assert (deserialize!(wchar) ("'X'") == 'X');
+    assert (deserialize!(dchar) ("'X'") == 'X');
+    assert (throws ({ deserialize!(char) ("'\\'"); }));
+    assert (throws ({ deserialize!(char) ("'£'"); })); // non-ascii
+    assert (throws ({ deserialize!(char) ("''"); }));
+    assert (throws ({ deserialize!(char) ("'ab'"); }));
+    assert (throws ({ deserialize!(wchar) ("''"); }));
+
+
+    // bool
+    assert (deserialize!(bool[]) (`[true,false,01,00]`) == cast(bool[]) [1,0,1,0]);
+    assert (throws ({ deserialize!(bool) ("011"); }));
+
+
+    // ints
+    assert (deserialize!(byte) ("-5") == cast(byte) -5);
+    assert (deserialize!(int) ("-0x7FFFFFFF") == cast(int) -0x7FFF_FFFF);
+    // annoyingly, octal syntax differs from D (blame tango):
+    assert (deserialize!(uint[]) ("[0b0100,0o724,0xFa59c,0xFFFFFFFF,0]") == [0b0100u,0724,0xFa59c,0xFFFFFFFF,0]);
+    assert (throws ({ deserialize!(int) (""); }));
+    assert (throws ({ deserialize!(int) ("0x8FFFFFFF"); })); // too large for an int
+    assert (throws ({ deserialize!(uint) ("-1"); })); // negative value for an unsigned type
+    assert (throws ({ deserialize!(uint) ("1a"); })); // trailing non-digit
+
+
+    // floats
+    assert (deserialize!(float) ("0.0") == 0.0f);
+    assert (deserialize!(double) ("-1e25") == -1e25);
+    assert (deserialize!(real) ("5.24e-269") == cast(real) 5.24e-269);
+    assert (throws ({ deserialize!(float) (""); }));
+
+
+    // structs
+    struct A { int x = 5; char y; }
+    struct B { A a; float b; }
+    A a; a.y = 'y';
+    assert (deserialize!(A) ("{ 1 : 'y' }") == a); // member 0 (x) is not listed and keeps its default
+    B b; b.a = a; b.b = 1.0f;
+    assert (deserialize!(B) (" {1:1.0,0: { 1 : 'y' } } ") == b);
+    assert (throws ({ deserialize!(A) (" 1:'x'}"); })); // bad braces
+    assert (throws ({ deserialize!(A) ("{ 1 }"); })); // no :DATA
+
+
+    // unEscapeChar
+    assert (deserialize!(char[]) ("\"\\a\\b\\t\\n\\v\\f\\r\\\"\\\'\\\\\"") == "\a\b\t\n\v\f\r\"\'\\");
+
+    logger.info ("Unittest complete.");
+}
+}