view mde/file/serialize.d @ 88:01f4f5f1acc9

Changes to init and to allow compiling with gdc. Tweaked init code to allow using circular iterators (disabled until my patch makes it into tango). Changes to allow compiling with gdc. Building is successful and unittests complete, but in my experience a SIGSEGV occurs within SDL.
author Diggory Hardy <diggory.hardy@gmail.com>
date Mon, 29 Sep 2008 12:09:44 +0100
parents 79d816b3e2d2
children 97e6dce08037
line wrap: on
line source

/* LICENSE BLOCK
Part of mde: a Modular D game-oriented Engine
Copyright © 2007-2008 Diggory Hardy

This program is free software: you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation, either
version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>. */

/**************************************************************************************************
 * Generic serialization templated function.
 *
 * Supports:
 *  Associative arrays, dynamic arrays (with usual formatting of strings), structs, char types,
 *  bool, int types, float types.
 *
 * Examples:
 * ------------------------------------------------------------------------------------------------
 * // Basic examples:
 * Cout (serialize!(byte) (-13)).newline;                       // -13
 * Cout (serialize!(real) (2.56e11)).newline;                   // 2.55999999999999990000e+11
 * Cout (serialize!(double[]) ([0.0, 1.0, 2.0, 3.0])).newline;  // [0.00000000000000000,1.00000000000000000,2.00000000000000000,3.00000000000000000]
 * Cout (serialize ([true,false,false])).newline;               // [true,false,false]
 *
 * // String and ubyte[] special syntaxes (always used):
 * Cout (serialize ("A string.")).newline;                      // "A string." (including quotes)
 * Cout (serialize (cast(ubyte[]) [5u, 0xF1u, 0x10u])).newline; // 0x05f110
 *
 * // Associative arrays:
 * Cout (serialize ([-1:"negative one"[], 0:"zero", 1:"one"])).newline; // [0:"zero",1:"one",-1:"negative one"]
 *
 * // Structs:
 * struct S {   int a = 5;  double[int[]] x;    }
 * S s;
 * Cout (serialize (s));
 *
 * // No limit on complexity...
 * char[] somethingComplicated = serialize!(real[][][bool[int[][]]]) (...);
 * ------------------------------------------------------------------------------------------------
 *
 * throws:
 *      May throw a UnicodeException or an IllegalArgumentException.
 *
 * TODO: Optimize memory allocation (if possible?). Test best sizes for initial allocations
 * instead of merely guessing?
 *************************************************************************************************/
//NOTE: in case of multiple formats, make this a dummy module importing both serialize modules,
// or put all the code here.
//FIXME: Optimize by using a slicing buffer. Put everything in a struct containing this buffer to
// make it thread-safe.
module mde.file.serialize;
// Since serialize is never used in a module where deserialize is not used, save an import:
public import mde.file.deserialize;

// tango imports
import tango.core.Traits;
import tango.core.Exception : UnicodeException, IllegalArgumentException;
import cInt = tango.text.convert.Integer;
import cFloat = tango.text.convert.Float;
import Utf = tango.text.convert.Utf;


alias serialize parseFrom;      // support the old name

// Formatting options, for where multiple formats are supported by the deserializer.

// Output ubyte arrays using the special binary notation (0x01f2ac instead of [1,242,172])?
const bool SPECIAL_BINARY_NOTATION = true;

// Output booleans as the words true / false (when set) or as the digits 1 / 0 (when unset)?
const bool BINARY_AS_WORDS = true;


/**
 * Serializes val to its textual form. The representation is chosen at compile time from U:
 *
 *  - associative arrays:   [key:value,key:value,...]
 *  - char[]:               a double-quoted string with escape sequences
 *  - wchar[] / dchar[]:    converted to UTF-8, then written as a char[] string
 *  - ubyte[]:              0x followed by two lower-case hex digits per byte (only when
 *                          SPECIAL_BINARY_NOTATION is set; otherwise treated as a generic array)
 *  - other arrays:         [elem,elem,...]
 *  - structs:              {0:field,1:field,...} with fields numbered in declaration order
 *  - char / wchar / dchar: a single-quoted character, escaped where necessary
 *  - bool:                 true / false, or 1 / 0 when BINARY_AS_WORDS is unset
 *  - integers and floats:  formatted via tango's Integer and Float converters
 *
 * Any other type is rejected at compile time by a static assert.
 *
 * Params:
 *  val = the value to serialize.
 *
 * Returns:
 *  The serialized text (for bool this is a string literal, otherwise a newly built array).
 *
 * Throws:
 *  IllegalArgumentException if a char is above 127 or a ulong is above long.max.
 *  The UTF conversions used for wide characters and strings may throw a UnicodeException.
 */
char[] serialize(U) (U val) {
    // Associative arrays (NOTE: cannot use is() expression)
    static if (isAssocArrayType!(U)) {          // generic associative array
        alias typeof(U.keys[0])     S;
        alias typeof(U.values[0])   T;
        char[] ret;
        // A guess, including values themselves and [,:] elements (must be at least 2).
        ret.length = val.length * (defLength!(T) + defLength!(S) + 2) + 2;
        ret[0] = '[';
        size_t i = 1;
        foreach (S k, T v; val) {
            char[] s = serialize!(S) (k) ~ ":" ~ serialize!(T) (v);
            i += s.length;
            // Grow until s AND the following separator fit. A single doubling is not enough
            // when one entry is longer than the whole current buffer.
            while (i+1 >= ret.length)
                ret.length = ret.length * 2;
            ret[i-s.length .. i] = s;
            ret[i++] = ',';
        }
        if (i == 1)
            ++i;            // special case (empty array) - there is no comma to overwrite
        ret[i-1] = ']';     // replaces last comma
        return ret[0..i];
    }
    // Arrays
    else static if (is(U S == S[]) || isStaticArrayType!(U)) {
        alias typeof(U[0]) T;

        static if (is(T == char)) {             // string
            // Initial storage. Every input char yields at most two output chars, plus the two
            // quotes, so this is always enough.
            char[] ret = new char[val.length * 2 + 2];
            ret[0] = '"';
            size_t i = 1;
            for (size_t t = 0; t < val.length;) {
                // process a block of non-escapable characters
                size_t s = t;
                while (t < val.length && !isEscapableChar(val[t]))
                    ++t;                        // skip all non-escapable chars
                size_t j = i + t - s;
                ret[i..j] = val[s..t];          // copy the block in one go
                i = j;
                // process a block of escapable characters
                while (t < val.length && isEscapableChar(val[t])) {
                    ret[i++] = '\\';                    // backslash; increment i
                    ret[i++] = escapeChar(val[t++]);    // character; increment i and t
                }
            }
            ret[i++] = '"';
            return ret[0..i];
        }
        else static if (is(T == wchar) || is(T == dchar)) {   // wstring or dstring
            // May throw a UnicodeException; don't bother catching and rethrowing:
            return serialize!(char[]) (Utf.toString (val));
        }
        else static if (SPECIAL_BINARY_NOTATION && is(T == ubyte)) {    // special binary notation
            // Note: To disable the usage of this special type, set SPECIAL_BINARY_NOTATION = false.
            static const char[16] digits = "0123456789abcdef";

            char[] ret = new char[val.length * 2 + 2];  // exact length: "0x" + 2 digits per byte
            ret[0..2] = "0x";
            size_t i = 2;

            foreach (ubyte x; val) {
                ret[i++] = digits[x >> 4];      // high nibble
                ret[i++] = digits[x & 0x0F];    // low nibble
            }
            return ret;
        }
        else {                                  // generic array
            char[] ret;
            // A guess, including commas and brackets (must be at least 2)
            ret.length = val.length * (defLength!(T) + 1) + 2;
            ret[0] = '[';
            size_t i = 1;
            foreach (T x; val) {
                char[] s = serialize!(T) (x);
                i += s.length;
                // Grow until s AND the following separator fit (an element may be longer than
                // the whole current buffer, so doubling once is not sufficient).
                while (i+1 >= ret.length)
                    ret.length = ret.length * 2;
                ret[i-s.length .. i] = s;
                ret[i++] = ',';
            }
            if (i == 1)
                ++i;            // special case (empty array) - there is no comma to overwrite
            ret[i-1] = ']';     // replaces last comma
            return ret[0..i];
        }
    }
    // Structs
    else static if (is(U == struct)) {
        char[] ret;
        // A very rough guess.
        ret.length = val.sizeof * 4;
        ret[0] = '{';
        size_t i = 1;
        foreach (k, v; val.tupleof) {
            alias typeof(v) T;
            char[] s = serialize!(size_t) (k) ~ ":" ~ serialize!(T) (v);
            i += s.length;
            // Grow until s AND the following separator fit; a field such as a long string can
            // easily exceed the sizeof-based guess several times over.
            while (i+1 >= ret.length)
                ret.length = ret.length * 2;
            ret[i-s.length .. i] = s;
            ret[i++] = ',';
        }
        if (i == 1)
            ++i;            // special case (no fields) - there is no comma to overwrite
        ret[i-1] = '}';     // replaces last comma
        return ret[0..i];
    }
    // Basic types
    else static if (is(U == char)) {            // char (UTF-8 byte)
        if (val > 127)      // outputting invalid utf-8 could corrupt the output stream
            throw new IllegalArgumentException ("Not a valid UTF-8 character");

        // Can't return reference to static array; so making it dynamic is cheaper than copying.
        char[] ret = new char[4];   // max length for an escaped char
        ret[0] = '\'';

        if (!isEscapableChar (val)) {
            ret[1] = val;
            ret[2] = '\'';
            return ret[0..3];
        } else {
            ret[1] = '\\';
            ret[2] = escapeChar (val);
            ret[3] = '\'';
            return ret;
        }
    } else static if (is(U == wchar) ||
                      is(U == dchar)) {         // wchar or dchar (UTF-16/32 single char)
        if (val <= 127u)
            return serialize!(char) (cast(char) val);  // ASCII
        else {  // convert to a multi-byte UTF-8 char
            // NOTE: suboptimal (the concatenation allocates a second array)
            char[] t = Utf.toString([val]);
            return '\'' ~ t ~ '\'';
        }
    } else static if (is (U == bool)) {         // boolean
        static if (BINARY_AS_WORDS) {
            if (val)
                return "true";
            else return "false";
        } else {
            if (val)
                return "1";
            else return "0";
        }
    } else static if (is (U : long)) {          // any integer type, except char types and bool
        static if (is (U == ulong)) {           // ulong may not be supported properly
            if (val > cast(ulong) long.max)
                throw new IllegalArgumentException ("No handling available for ulong where value > long.max");
        }
        return cInt.toString (val);
    } else static if (is (U : real)) {          // any (real) floating point type
        char[] ret = new char[32];              // minimum allowed by assert in format
        return cFloat.format (ret, val, U.dig+2, 1);// from old C++ tests, U.dig+2 gives best(?) accuracy
    }
    // Unsupported
    else
        static assert (false, "Unsupported type: "~U.stringof);
}

//BEGIN Utility funcs
/* This template provides the initial length for strings for formatting various types. These strings
 * can be expanded; this value is intended to cover 90% of cases or so.
 *
 * NOTE: This template was intended to provide specialisations for different types.
 * This one value should do reasonably well for most types.
 */
private {
    // Fallback guess, used for every type that has no specialisation below.
    template defLength(T)        { const uint defLength = 20; }
    // A serialized char is at most 4 characters long: quote, backslash, character, quote.
    template defLength(T : char) { const uint defLength = 4;  }
    // The longest boolean word, "false", is 5 characters long.
    template defLength(T : bool) { const uint defLength = 5;  }
}
// Tells whether c has to be written as a backslash escape sequence in serialized output.
private bool isEscapableChar (char c) {
    // The backslash and both kinds of quote always need escaping.
    if (c == '\\' || c == '\'' || c == '\"')
        return true;
    // So does the contiguous range of control characters from '\a' up to and including '\r'.
    return '\a' <= c && c <= '\r';
}
// Maps a character which must be escaped to the character that follows the backslash in its
// escape sequence (e.g. a newline maps to 'n').
// Throws an IllegalArgumentException for any other character; this should never happen within
// serialize, which only passes characters accepted by isEscapableChar.
private char escapeChar (char c) {
    switch (c) {
        // control characters
        case '\a':  return 'a';
        case '\b':  return 'b';
        case '\t':  return 't';
        case '\n':  return 'n';
        case '\v':  return 'v';
        case '\f':  return 'f';
        case '\r':  return 'r';
        // quotes and the backslash escape to themselves
        case '\"':  return '\"';
        case '\'':  return '\'';
        case '\\':  return '\\';
        default:    break;
    }

    // not a character with a supported escape sequence:
    throw new IllegalArgumentException ("Internal error (escapeChar)");
}
//END Utility funcs



debug (mdeUnitTest) {
    import tango.util.log.Log : Log, Logger;

    private Logger logger;
    static this() {
        logger = Log.getLogger ("mde.file.serialize");
    }
unittest {
    // Helper: runs the delegate and reports whether it threw an Exception.
    bool raises (void delegate() action) {
        try {
            action();
        } catch (Exception e) {
            return true;
        }
        return false;
    }
    // Sanity-check the helper itself before relying on it.
    assert (!raises ({ int i = 5; }));
    assert (raises ({ throw new Exception ("Test - this exception should be caught"); }));


    // ---- Associative arrays ----
    char[] actualAA = serialize!(char[][char]) (['a':cast(char[])"animal", 'b':['b','u','s']]);
    char[] expectedAA = `['a':"animal",'b':"bus"]`;
    assert (actualAA == expectedAA);


    // ---- Arrays ----
    // Generic arrays, including the empty array:
    assert (serialize!(double[]) ([1.0, 1.0e-10]) == `[1.00000000000000000,0.10000000000000000e-09]`);
    assert (serialize!(double[]) (cast(double[]) []) == `[]`);

    // Strings containing commas, escape sequences and multi-byte UTF-8 characters:
    assert (serialize!(char[][]) ([ ".\""[], [',','\''] ,"!\b€" ]) == `[".\"",",\'","!\b€"]`);

    // Wide strings; the characters were picked more or less at random from unicode tables:
    assert (serialize!(wchar[]) ("Test string: ¶α؟अกሀ搀"w) == "\"Test string: ¶α؟अกሀ搀\"");
    assert (serialize!(dchar[]) ("Test string: ¶α؟अกሀ搀"d) == "\"Test string: ¶α؟अกሀ搀\"");

    // ubyte[]: special hexadecimal notation if enabled, otherwise a generic array.
    static if (SPECIAL_BINARY_NOTATION)
        assert (serialize!(ubyte[]) (cast(ubyte[]) [0x01, 0xF2, 0xAC]) == `0x01f2ac`);
    else
        assert (serialize!(ubyte[]) (cast(ubyte[]) [0x01, 0xF2, 0xAC]) == `[1,242,172]`);


    // ---- Structs (including nesting and default initialisers) ----
    struct Foo {    int a = 9;  char b = '\v'; float c;    }
    struct Bar {    Foo a,b;    }
    static Foo foo1 = { a:150, b:'8'}, foo2;
    Bar bar;
    bar.a = foo1;
    bar.b = foo2;
    assert (serialize(bar) == "{0:{0:150,1:'8',2:nan},1:{0:9,1:'\\v',2:nan}}");


    // ---- Basic types ----
    // Single characters:
    assert (serialize!(char) ('\'') == "\'\\\'\'");
    assert (serialize!(wchar) ('X') == "'X'");
    assert (serialize!(dchar) ('X') == "'X'");
    assert (serialize!(wchar) ('£') == "'£'");  // U+00A3, i.e. a multi-byte UTF-8 character
    assert (serialize!(dchar) ('£') == "'£'");
    // The compiler converts £ to a char, but the result is not valid UTF-8:
    assert (raises ({ serialize!(char) ('£'); }));

    // Booleans:
    static if (BINARY_AS_WORDS)
        assert (serialize(false) == "false");
    else
        assert (serialize(true) == "1");

    // Signed integers:
    assert (serialize (cast(byte) -5) == "-5");
    assert (serialize (cast(short) -32768) == "-32768");
    assert (serialize (-5) == "-5");
    assert (serialize (-9223372036854775807L) == "-9223372036854775807");
    // Unsigned integers:
    assert (serialize (cast(ubyte) -1) == "255");
    assert (serialize (cast(ushort) -1) == "65535");
    assert (serialize!(uint) (-1) == "4294967295");
    assert (serialize (cast(ulong) 0x7FFF_FFFF_FFFF_FFFFLu) == "9223372036854775807");
    assert (serialize!(uint[]) ([0b0100u,0724,0xFa59c,0xFFFFFFFF,0]) ==
                               "[4,468,1025436,4294967295,0]");
    // ulong values above long.max are rejected.
    // NOTE: this is something that should really work.
    assert (raises ({
        char[] r = serialize!(ulong) (0x8FFF_FFFF_FFFF_FFFFLu);
    }));

    // Floating point: the number of printed figures cannot be relied upon across platforms,
    // so only check via nan that each type is supported.
    assert (serialize!(float) (float.init) == "nan");
    assert (serialize!(double) (double.init) == "nan");
    assert (serialize!(real) (real.init) == "nan");

    // Every supported escape sequence (exercises the conversion functions):
    assert (serialize ("\a\b\t\n\v\f\r\"\'\\") == `"\a\b\t\n\v\f\r\"\'\\"`);

    logger.info ("Unittest complete.");
}
}