diff mde/file/serialize.d @ 81:d8fccaa45d5f

Moved file IO code from mde/mergetag to mde/file[/mergetag] and changed how some errors are caught.
author Diggory Hardy <diggory.hardy@gmail.com>
date Fri, 29 Aug 2008 11:59:43 +0100
parents mde/mergetag/serialize.d@61ea26abe4dd
children ac1e3fd07275
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mde/file/serialize.d	Fri Aug 29 11:59:43 2008 +0100
@@ -0,0 +1,391 @@
+/* LICENSE BLOCK
+Part of mde: a Modular D game-oriented Engine
+Copyright © 2007-2008 Diggory Hardy
+
+This program is free software: you can redistribute it and/or modify it under the terms
+of the GNU General Public License as published by the Free Software Foundation, either
+version 2 of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+/**************************************************************************************************
+ * Generic serialization templated function.
+ *
+ * Supports:
+ *  Associative arrays, arrays (inc. strings), structs, char types, bool, int types, float types.
+ *
+ * Examples:
+ * ------------------------------------------------------------------------------------------------
+ * // Basic examples:
+ * Cout (serialize!(byte) (-13)).newline;                       // -13
+ * Cout (serialize!(real) (2.56e11)).newline;                   // 2.55999999999999990000e+11
+ * Cout (serialize!(double[]) ([0.0, 1.0, 2.0, 3.0])).newline;  // [0.00000000000000000,1.00000000000000000,2.00000000000000000,3.00000000000000000]
+ * Cout (serialize ([true,false,false])).newline;               // [true,false,false]
+ *
+ * // String and ubyte[] special syntaxes (always used):
+ * Cout (serialize ("A string.")).newline;                      // "A string." (including quotes)
+ * Cout (serialize (cast(ubyte[]) [5u, 0xF1u, 0x10u])).newline; // 0x05f110
+ *
+ * // Associative arrays:
+ * Cout (serialize ([-1:"negative one"[], 0:"zero", 1:"one"])).newline; // [0:"zero",1:"one",-1:"negative one"]
+ *
+ * // Structs:
+ * struct S {   int a = 5;  double[int[]] x;    }
+ * S s;
+ * Cout (serialize (s));
+ *
+ * // No limit on complexity...
+ * char[] somethingComplicated = serialize!(real[][][bool[int[][]]]) (...);
+ * ------------------------------------------------------------------------------------------------
+ *
+ * throws:
+ *      May throw a UnicodeException or an IllegalArgumentException.
+ *
+ * TODO: Optimize memory allocation (if possible?). Test best sizes for initial allocations
+ * instead of merely guessing?
+ *************************************************************************************************/
+//NOTE: in case of multiple formats, make this a dummy module importing both serialize modules,
+// or put all the code here.
+module mde.file.serialize;
+// Since serialize is never used in a module where deserialize is not used, save an import:
+public import mde.file.deserialize;
+
+// tango imports
+import tango.core.Traits;
+import tango.core.Exception : UnicodeException, IllegalArgumentException;
+import cInt = tango.text.convert.Integer;
+import cFloat = tango.text.convert.Float;
+import Utf = tango.text.convert.Utf;
+
+
+alias serialize parseFrom;      // backward-compatible alias: the old name of serialize
+
+// Formatting options, for cases where the deserializer accepts more than one notation.
+
+// Write ubyte[] in the compact hexadecimal notation (e.g. 0x01f2ac) rather than as a generic
+// array of decimal numbers (e.g. [1,242,172])?
+const bool SPECIAL_BINARY_NOTATION = true;
+
+// Write bool values as the words true / false (when set) or as the digits 1 / 0 (when not)?
+const bool BINARY_AS_WORDS = true;
+
+
+/**
+ * Serializes a value to text.
+ *
+ * The branch taken is selected at compile time from U:
+ *  - associative arrays:  [key:value,key:value,...]
+ *  - char[]:              double-quoted string; escapable characters become backslash sequences
+ *  - wchar[] / dchar[]:   converted to UTF-8 with Utf.toString, then written as a char[] string
+ *  - ubyte[]:             0x followed by two lower-case hex digits per byte
+ *                         (only when SPECIAL_BINARY_NOTATION is set; otherwise a generic array)
+ *  - other arrays:        [element,element,...]
+ *  - structs:             {index:member,index:member,...}, index being the position in tupleof
+ *  - char:                single-quoted, escaped where necessary
+ *  - wchar / dchar:       as char, but only for values <= 127
+ *  - bool:                true / false, or 1 / 0 when BINARY_AS_WORDS is not set
+ *  - integer types:       decimal, via cInt.toString
+ *  - floating point:      via cFloat.format with U.dig+2 decimal places
+ * Any other type is rejected at compile time by a static assert.
+ *
+ * Params:
+ *  val = the value to serialize
+ *
+ * Returns:
+ *  The textual representation of val.
+ *
+ * Throws:
+ *  UnicodeException for a wchar / dchar above 127 (Utf.toString may also throw it for
+ *  wchar[] / dchar[] input); IllegalArgumentException for a ulong greater than long.max.
+ */
+char[] serialize(U) (U val) {
+    // Associative arrays (NOTE: cannot use is() expression)
+    static if (isAssocArrayType!(U)) {          // generic associative array
+        alias typeof(U.keys[0])     S;
+        alias typeof(U.values[0])   T;
+        char[] ret;
+        // A guess, including values themselves and [,:] elements (must be at least 2).
+        ret.length = val.length * (defLength!(T) + defLength!(S) + 2) + 2;
+        ret[0] = '[';
+        uint i = 1;
+        foreach (S k, T v; val) {
+            char[] s = serialize!(S) (k) ~ ":" ~ serialize!(T) (v);
+            i += s.length;
+            // Grow until this entry and its trailing separator fit. A single doubling is not
+            // always enough: one entry may be longer than the whole initial guess.
+            while (i+1 >= ret.length)
+                ret.length = ret.length * 2;
+            ret[i-s.length .. i] = s;
+            ret[i++] = ',';
+        }
+        if (i == 1)
+            ++i;            // empty array: there is no trailing comma to overwrite
+        ret[i-1] = ']';     // replaces the last comma (or fills the slot after '[')
+        return ret[0..i];
+    }
+    // Arrays
+    else static if (is(U S == S[]) || isStaticArrayType!(U)) {
+        alias typeof(U[0]) T;
+
+        static if (is(T == char)) {             // string
+            // Worst case: every char is escaped (2 chars each) plus the two quotes, so this
+            // initial storage is always enough.
+            char[] ret = new char[val.length * 2 + 2];
+            ret[0] = '"';
+            uint i = 1;
+            for (uint t = 0; t < val.length;) {
+                // process a block of non-escapable characters
+                uint s = t;
+                while (t < val.length && !isEscapableChar(val[t]))
+                    ++t;                        // skip all non-escapable chars
+                uint j = i + t - s;
+                ret[i..j] = val[s..t];          // copy the block verbatim
+                i = j;
+                // process a block of escapable characters
+                while (t < val.length && isEscapableChar(val[t])) {
+                    ret[i++] = '\\';                    // backslash; increment i
+                    ret[i++] = escapeChar(val[t++]);    // character; increment i and t
+                }
+            }
+            ret[i++] = '"';
+            return ret[0..i];
+        }
+        else static if (is(T == wchar) || is(T == dchar)) {   // wstring or dstring
+            // May throw a UnicodeException; don't bother catching and rethrowing:
+            return serialize!(char[]) (Utf.toString (val));
+        }
+        else static if (SPECIAL_BINARY_NOTATION && is(T == ubyte)) {    // special binary notation
+            // Note: To disable the usage of this special type, set SPECIAL_BINARY_NOTATION = false.
+            static const char[16] digits = "0123456789abcdef";
+
+            char[] ret = new char[val.length * 2 + 2];  // exact length: "0x" + 2 digits per byte
+            ret[0..2] = "0x";
+            uint i = 2;
+
+            foreach (ubyte x; val) {
+                ret[i++] = digits[x >> 4];      // high nibble
+                ret[i++] = digits[x & 0x0F];    // low nibble
+            }
+            return ret;
+        }
+        else {                                  // generic array
+            char[] ret;
+            // A guess, including commas and brackets (must be at least 2)
+            ret.length = val.length * (defLength!(T) + 1) + 2;
+            ret[0] = '[';
+            uint i = 1;
+            foreach (T x; val) {
+                char[] s = serialize!(T) (x);
+                i += s.length;
+                // Grow until this element and its trailing separator fit. A single doubling is
+                // not always enough: one element may be longer than the whole initial guess.
+                while (i+1 >= ret.length)
+                    ret.length = ret.length * 2;
+                ret[i-s.length .. i] = s;
+                ret[i++] = ',';
+            }
+            if (i == 1)
+                ++i;            // empty array: there is no trailing comma to overwrite
+            ret[i-1] = ']';     // replaces the last comma (or fills the slot after '[')
+            return ret[0..i];
+        }
+    }
+    // Structs
+    else static if (is(U == struct)) {
+        char[] ret;
+        // A very rough guess.
+        ret.length = val.sizeof * 4;
+        ret[0] = '{';
+        uint i = 1;
+        foreach (k, v; val.tupleof) {
+            alias typeof(v) T;
+            // Members are identified by their position in tupleof, not by name.
+            char[] s = serialize!(size_t) (k) ~ ":" ~ serialize!(T) (v);
+            i += s.length;
+            // Grow until this member and its trailing separator fit. A single doubling is not
+            // always enough: one member may be longer than the whole initial guess.
+            while (i+1 >= ret.length)
+                ret.length = ret.length * 2;
+            ret[i-s.length .. i] = s;
+            ret[i++] = ',';
+        }
+        if (i == 1)
+            ++i;            // struct without members: there is no trailing comma to overwrite
+        ret[i-1] = '}';     // replaces the last comma (or fills the slot after '{')
+        return ret[0..i];
+    }
+    // Basic types
+    else static if (is(U == char)) {            // char (UTF-8 byte)
+        // Note: if (val > 127) "is invalid UTF-8 single char".  However we don't know
+        // what this is for, in particular if it will be recombined with other chars later.
+
+        // Can't return reference to static array; so making it dynamic is cheaper than copying.
+        char[] ret = new char[4];       // max length for an escaped char
+        ret[0] = '\'';
+
+        if (!isEscapableChar (val)) {
+            ret[1] = val;
+            ret[2] = '\'';
+            return ret[0..3];
+        } else {
+            ret[1] = '\\';
+            ret[2] = escapeChar (val);
+            ret[3] = '\'';
+            return ret;
+        }
+    } else static if (is(U == wchar) ||
+                      is(U == dchar)) {         // wchar or dchar (UTF-16/32 single char)
+        // Note: only ascii can be converted. NOTE: convert to UTF-8 (multibyte) char?
+        if (val <= 127u)
+            return serialize!(char) (cast(char) val);  // ASCII
+        else throw new UnicodeException (
+            "Error: unicode non-ascii character cannot be converted to a single UTF-8 char", 0);
+    } else static if (is (U == bool)) {         // boolean
+        static if (BINARY_AS_WORDS) {
+            if (val)
+                return "true";
+            else return "false";
+        } else {
+            if (val)
+                return "1";
+            else return "0";
+        }
+    } else static if (is (U : long)) {          // any integer type, except char types and bool
+        static if (is (U == ulong))             // ulong may not be supported properly
+            if (val > cast(ulong) long.max)
+                throw new IllegalArgumentException ("No handling available for ulong where value > long.max");
+        return cInt.toString (val);
+    } else static if (is (U : real)) {          // any (real) floating point type
+        char[] ret = new char[32];              // minimum allowed by assert in format
+        return cFloat.format (ret, val, U.dig+2, 1);// from old C++ tests, U.dig+2 gives best(?) accuracy
+    }
+    // Unsupported
+    else
+        static assert (false, "Unsupported type: "~U.stringof);
+}
+
+//BEGIN Utility funcs
+/* Initial buffer-size guesses used by serialize, per element type.
+ *
+ * The value is the number of characters reserved for one serialized element of type T when the
+ * output buffer is first allocated. It is only a guess, intended to cover 90% of cases or so;
+ * serialize enlarges its buffer when the guess turns out to be too small.
+ *
+ * NOTE: The unspecialised template supplies one general-purpose value which should do reasonably
+ * well for most types; specialisations exist only for char and bool.
+ */
+private {
+    // General case.
+    template defLength(T) {
+        const uint defLength = 20;
+    }
+    // A quoted, possibly escaped, char takes at most 4 characters.
+    template defLength(T : char) {
+        const uint defLength = 4;
+    }
+    // "false" takes 5 characters.
+    template defLength(T : bool) {
+        const uint defLength = 5;
+    }
+}
+// Tells whether c has to be written as a backslash escape sequence when serialized.
+private bool isEscapableChar (char c) {
+    // The contiguous run of control characters \a \b \t \n \v \f \r.
+    if (c >= '\a' && c <= '\r')
+        return true;
+    // Both quote characters and the backslash itself.
+    return c == '\"' || c == '\'' || c == '\\';
+}
+// Maps an escapable character to the character which follows the backslash in its escape
+// sequence (e.g. a newline maps to 'n'; quotes and the backslash map to themselves).
+// Throws an IllegalArgumentException for any other character; however this should never happen
+// within serialize, which only passes characters accepted by isEscapableChar.
+private char escapeChar (char c) {
+    switch (c) {
+        case '\a': return 'a';
+        case '\b': return 'b';
+        case '\t': return 't';
+        case '\n': return 'n';
+        case '\v': return 'v';
+        case '\f': return 'f';
+        case '\r': return 'r';
+        case '\"': return '\"';
+        case '\'': return '\'';
+        case '\\': return '\\';
+        default:
+            // not an escapable character:
+            throw new IllegalArgumentException ("Internal error (escapeChar)");
+    }
+}
+//END Utility funcs
+
+
+
+// Unit tests for serialize; only compiled in when the UnitTest debug identifier is set.
+debug (UnitTest) {
+    import tango.util.log.Log : Log, Logger;
+
+    // Logger used to report caught exceptions and test completion; set up in the module constructor.
+    private Logger logger;
+    static this() {
+        logger = Log.getLogger ("text.serialize");
+    }
+unittest {
+    // Utility: runs dg and returns true if it threw an Exception (whose message is logged),
+    // false if it completed normally.
+    bool throws (void delegate() dg) {
+        bool r = false;
+        try {
+            dg();
+        } catch (Exception e) {
+            r = true;
+            logger.info ("Exception caught: "~e.msg);
+        }
+        return r;
+    }
+    // Sanity-check the utility itself before relying on it below.
+    assert (!throws ({ int i = 5; }));
+    assert (throws ({ throw new Exception ("Test - this exception should be caught"); }));
+    
+    // Associative arrays
+    // NOTE(review): the expected string assumes the entries are iterated in the order 'a', 'b';
+    // confirm that the associative array iteration order can be relied upon here.
+    char[] X = serialize!(char[][char]) (['a':cast(char[])"animal", 'b':['b','u','s']]);
+    char[] Y = `['a':"animal",'b':"bus"]`;
+    assert (X == Y);
+    
+    
+    // Arrays
+    // generic array stuff:
+    assert (serialize!(double[]) ([1.0, 1.0e-10]) == `[1.00000000000000000,0.10000000000000000e-09]`);
+    assert (serialize!(double[]) (cast(double[]) []) == `[]`);		// empty array
+    
+    // char[] conversions, with commas, escape sequences and multichar UTF8 characters:
+    assert (serialize!(char[][]) ([ ".\""[], [',','\''] ,"!\b€" ]) == `[".\"",",\'","!\b€"]`);
+    
+    // wchar[] and dchar[] conversions (the output is always UTF-8):
+    // The characters were pretty-much pulled at random from unicode tables.
+    assert (serialize!(wchar[]) ("Test string: ¶α؟अกሀ搀"w) == "\"Test string: ¶α؟अกሀ搀\"");
+    assert (serialize!(dchar[]) ("Test string: ¶α؟अกሀ搀"d) == "\"Test string: ¶α؟अกሀ搀\"");
+    
+    
+    // ubyte[]: the expected notation depends on the compile-time formatting option.
+    static if (SPECIAL_BINARY_NOTATION)
+        assert (serialize!(ubyte[]) (cast(ubyte[]) [0x01, 0xF2, 0xAC]) == `0x01f2ac`);	// ubyte[] special notation
+    else
+        assert (serialize!(ubyte[]) (cast(ubyte[]) [0x01, 0xF2, 0xAC]) == `[1,242,172]`);
+    
+    
+    // Structs (members are written as index:value pairs; nested structs are supported)
+    struct Foo {    int a = 9;  char b = '\v'; float c;    }
+    struct Bar {    Foo a,b;    }
+    static Foo foo1 = { a:150, b:'8', c:17.2f}, foo2;   // foo2 keeps the member defaults
+    Bar bar;
+    bar.a = foo1;
+    bar.b = foo2;
+    assert (serialize(bar) == "{0:{0:150,1:'8',2:1.72000007e+01},1:{0:9,1:'\\v',2:nan}}");
+    
+    
+    // Basic Types
+    // Character types
+    assert (serialize!(char) ('\'') == "\'\\\'\'");
+    assert (serialize!(wchar) ('X') == "'X'");
+    assert (serialize!(dchar) ('X') == "'X'");
+    // Non-ASCII wchar / dchar values cannot be written as a single char and must throw:
+    assert (throws ({ char[] r = serialize!(wchar) ('£');   /* unicode U+00A3 */ }));
+    assert (throws ({ char[] r = serialize!(dchar) ('£'); }));
+    
+    // Bool (the expected text depends on the compile-time formatting option)
+    static if (BINARY_AS_WORDS)
+        assert (serialize(false) == "false");
+    else
+        assert (serialize(true) == "1");
+    
+    // Integers
+    assert (serialize (cast(byte) -5) == "-5");
+    assert (serialize (cast(short) -32768) == "-32768");
+    assert (serialize (-5) == "-5");
+    assert (serialize (-9223372036854775807L) == "-9223372036854775807");
+    assert (serialize (cast(ubyte) -1) == "255");
+    assert (serialize (cast(ushort) -1) == "65535");
+    assert (serialize!(uint) (-1) == "4294967295");
+    assert (serialize (cast(ulong) 0x7FFF_FFFF_FFFF_FFFFLu) == "9223372036854775807");
+    assert (serialize!(uint[]) ([0b0100u,0724,0xFa59c,0xFFFFFFFF,0]) ==
+                               "[4,468,1025436,4294967295,0]");
+    assert (throws ({
+        // ulong is not properly supported: values above long.max are rejected.
+        // NOTE: this is something that should really work.
+        char[] r = serialize!(ulong) (0x8FFF_FFFF_FFFF_FFFFLu);
+    }));
+    
+    // Floats
+    // These numbers are not particularly meaningful:
+    assert (serialize!(float) (0.0f) == "0.00000000");
+    assert (serialize!(double) (-1e25) == "-1.00000000000000000e+25");
+    assert (serialize!(real) (cast(real) 4.918e300) == "4.91800000000000000000e+300");
+    
+    // Escape sequences (test conversion functions)
+    assert (serialize ("\a\b\t\n\v\f\r\"\'\\") == `"\a\b\t\n\v\f\r\"\'\\"`);
+    
+    logger.info ("Unittest complete.");
+}
+}