view mde/file/serialize.d @ 82:ac1e3fd07275

New ssi file format. (De)serializer now supports non-ascii wide characters (encoded to UTF-8) and no longer supports non-ascii 8-bit chars which would result in bad UTF-8. Moved/renamed a few things left over from the last commit.
author Diggory Hardy <diggory.hardy@gmail.com>
date Sat, 30 Aug 2008 09:37:35 +0100
parents d8fccaa45d5f
children 79d816b3e2d2
line wrap: on
line source

/* LICENSE BLOCK
Part of mde: a Modular D game-oriented Engine
Copyright © 2007-2008 Diggory Hardy

This program is free software: you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation, either
version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>. */

/**************************************************************************************************
 * Generic serialization templated function.
 *
 * Supports:
 *  Associative arrays, dynamic arrays (with usual formatting of strings), structs, char types,
 *  bool, int types, float types.
 *
 * Examples:
 * ------------------------------------------------------------------------------------------------
 * // Basic examples:
 * Cout (serialize!(byte) (-13)).newline;                       // -13
 * Cout (serialize!(real) (2.56e11)).newline;                   // 2.55999999999999990000e+11
 * Cout (serialize!(double[]) ([0.0, 1.0, 2.0, 3.0])).newline;  // [0.00000000000000000,1.00000000000000000,2.00000000000000000,3.00000000000000000]
 * Cout (serialize ([true,false,false])).newline;               // [true,false,false]
 *
 * // String and ubyte[] special syntaxes (always used):
 * Cout (serialize ("A string.")).newline;                      // "A string." (including quotes)
 * Cout (serialize (cast(ubyte[]) [5u, 0xF1u, 0x10u])).newline; // 0x05f110
 *
 * // Associative arrays:
 * Cout (serialize ([-1:"negative one"[], 0:"zero", 1:"one"])).newline; // [0:"zero",1:"one",-1:"negative one"]
 *
 * // Structs:
 * struct S {   int a = 5;  double[int[]] x;    }
 * S s;
 * Cout (serialize (s));
 *
 * // No limit on complexity...
 * char[] somethingComplicated = serialize!(real[][][bool[int[][]]]) (...);
 * ------------------------------------------------------------------------------------------------
 *
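 * throws:
 *      May throw a UnicodeException (from the UTF-8 conversion of wchar/dchar data) or an
 *      IllegalArgumentException (for a char value above 127 or a ulong value above long.max).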
 *
 * TODO: Optimize memory allocation (if possible?). Test best sizes for initial allocations
 * instead of merely guessing?
 *************************************************************************************************/
//NOTE: in case of multiple formats, make this a dummy module importing both serialize modules,
// or put all the code here.
//FIXME: Optimize by using a slicing buffer. Put everything in a struct containing this buffer to
// make it thread-safe.
module mde.file.serialize;
// Since serialize is never used in a module where deserialize is not used, save an import:
public import mde.file.deserialize;

// tango imports
import tango.core.Traits;
import tango.core.Exception : UnicodeException, IllegalArgumentException;
import cInt = tango.text.convert.Integer;
import cFloat = tango.text.convert.Float;
import Utf = tango.text.convert.Utf;


alias serialize parseFrom;      // backwards-compatible alias: parseFrom is the old name of serialize

// Formatting options, for where multiple formats are supported by the deserializer.

// When true, serialize writes ubyte arrays in the compact hexadecimal notation (e.g. 0x01f2ac)
// instead of the generic array notation (e.g. [1,242,172]).
const bool SPECIAL_BINARY_NOTATION = true;

// When true, serialize writes bool values as the words true / false; when false, as 1 / 0.
const bool BINARY_AS_WORDS = true;


/* Appends item, followed by a ',' separator, to buf at offset pos.
 *
 * buf must already have a non-zero length. It is doubled as many times as required so that both
 * item and the separator fit; a single doubling is not sufficient when item is longer than the
 * current buffer. On return, pos is the offset just past the separator.
 */
private void appendItem (ref char[] buf, ref uint pos, char[] item) {
    uint start = pos;
    pos += item.length;

    // Work out the final size first so that the buffer is reallocated at most once.
    size_t newLength = buf.length;
    while (pos + 1 >= newLength)
        newLength *= 2;
    if (newLength != buf.length)
        buf.length = newLength;

    buf[start .. pos] = item;
    buf[pos++] = ',';
}

/* Terminates a list whose opening bracket is at buf[0] and whose items were written with
 * appendItem; returns the used part of buf.
 *
 * closer overwrites the trailing separator, or is written directly after the opening bracket when
 * no items were appended (pos == 1). buf must have a length of at least 2.
 */
private char[] closeList (char[] buf, uint pos, char closer) {
    if (pos == 1)
        ++pos;              // empty list: there is no trailing comma to overwrite
    buf[pos-1] = closer;    // replaces the last comma
    return buf[0..pos];
}

/**************************************************************************************************
 * Serializes val to its textual representation.
 *
 * Output formats:
 *  associative arrays  [key:value,...]
 *  char[]              double-quoted string with backslash escapes
 *  wchar[], dchar[]    converted to UTF-8, then written as a char[]
 *  ubyte[]             0x followed by two lower-case hex digits per byte (only when
 *                      SPECIAL_BINARY_NOTATION is true; otherwise as a generic array)
 *  other arrays        [element,...]
 *  structs             {index:member,...} where index is the member's position in tupleof
 *  char                single-quoted, escaped where necessary
 *  wchar, dchar        single-quoted, converted to UTF-8
 *  bool                true / false (or 1 / 0 when BINARY_AS_WORDS is false)
 *  integer types       decimal
 *  float types         formatted with U.dig+2 decimal places
 *
 * Any other type is rejected at compile time by a static assert.
 *
 * Params:
 *  val = The value to serialize.
 *
 * Returns:
 *  The serialized text.
 *
 * Throws:
 *  IllegalArgumentException for a char value above 127 or a ulong value above long.max.
 *  A UnicodeException may be thrown by the UTF-8 conversion of wchar/dchar data.
 *************************************************************************************************/
char[] serialize(U) (U val) {
    // Associative arrays (NOTE: cannot use is() expression)
    static if (isAssocArrayType!(U)) {          // generic associative array
        alias typeof(U.keys[0])     S;
        alias typeof(U.values[0])   T;
        char[] ret;
        // A guess, including values themselves and [,:] elements (must be at least 2).
        ret.length = val.length * (defLength!(T) + defLength!(S) + 2) + 2;
        ret[0] = '[';
        uint i = 1;
        foreach (S k, T v; val)
            appendItem (ret, i, serialize!(S) (k) ~ ":" ~ serialize!(T) (v));
        return closeList (ret, i, ']');
    }
    // Arrays
    else static if (is(U S == S[]) || isStaticArrayType!(U)) {
        alias typeof(U[0]) T;
        
        static if (is(T == char)) {             // string
            // Each input char produces at most two output chars, plus the two quotes, so this
            // initial storage is always enough.
            char[] ret = new char[val.length * 2 + 2];
            ret[0] = '"';
            uint i = 1;
            for (uint t = 0; t < val.length;) {
                // process a block of non-escapable characters
                uint s = t;
                while (t < val.length && !isEscapableChar(val[t]))
                    ++t;                        // skip all non-escapable chars
                uint j = i + t - s;
                ret[i..j] = val[s..t];          // copy a block
                i = j;
                // process a block of escapable characters
                while (t < val.length && isEscapableChar(val[t])) {
                    ret[i++] = '\\';                    // backslash; increment i
                    ret[i++] = escapeChar(val[t++]);    // character; increment i and t
                }
            }
            ret[i++] = '"';
            return ret[0..i];
        }
        else static if (is(T == wchar) || is(T == dchar)) {   // wstring or dstring
            // May throw a UnicodeException; don't bother catching and rethrowing:
            return serialize!(char[]) (Utf.toString (val));
        }
        else static if (SPECIAL_BINARY_NOTATION && is(T == ubyte)) {    // special binary notation
            // Note: To disable the usage of this special type, set SPECIAL_BINARY_NOTATION = false.
            static const char[16] digits = "0123456789abcdef";
    
            char[] ret = new char[val.length * 2 + 2];  // exact length
            ret[0..2] = "0x";
            uint i = 2;
    
            foreach (ubyte x; val) {
                ret[i++] = digits[x >> 4];      // high nibble
                ret[i++] = digits[x & 0x0F];    // low nibble
            }
            return ret;
        }
        else {                                  // generic array
            char[] ret;
            // A guess, including commas and brackets (must be at least 2)
            ret.length = val.length * (defLength!(T) + 1) + 2;
            ret[0] = '[';
            uint i = 1;
            foreach (T x; val)
                appendItem (ret, i, serialize!(T) (x));
            return closeList (ret, i, ']');
        }
    }
    // Structs
    else static if (is(U == struct)) {
        char[] ret;
        // A very rough guess.
        ret.length = val.sizeof * 4;
        ret[0] = '{';
        uint i = 1;
        foreach (k, v; val.tupleof) {
            alias typeof(v) T;
            // Members are identified by their index within the struct.
            appendItem (ret, i, serialize!(size_t) (k) ~ ":" ~ serialize!(T) (v));
        }
        return closeList (ret, i, '}');
    }
    // Basic types
    else static if (is(U == char)) {            // char (UTF-8 byte)
        if (val > 127)      // outputting invalid UTF-8 could corrupt the output stream
            throw new IllegalArgumentException ("Not a valid UTF-8 character");
        
        // Can't return reference to static array; so making it dynamic is cheaper than copying.
        char[] ret = new char[4];   // max length for an escaped char
        ret[0] = '\'';
        
        if (!isEscapableChar (val)) {
            ret[1] = val;
            ret[2] = '\'';
            return ret[0..3];
        } else {
            ret[1] = '\\';
            ret[2] = escapeChar (val);
            ret[3] = '\'';
            return ret;
        }
    } else static if (is(U == wchar) ||
                      is(U == dchar)) {         // wchar or dchar (UTF-16/32 single char)
        if (val <= 127u)
            return serialize!(char) (cast(char) val);  // ASCII
        else {  // convert to a multi-byte UTF-8 char
            // NOTE: suboptimal
            char[] t = Utf.toString([val]);
            return '\'' ~ t ~ '\'';
        }
    } else static if (is (U == bool)) {         // boolean
        static if (BINARY_AS_WORDS) {
            if (val)
                return "true";
            else return "false";
        } else {
            if (val)
                return "1";
            else return "0";
        }
    } else static if (is (U : long)) {          // any integer type, except char types and bool
        static if (is (U == ulong))             // ulong may not be supported properly
            if (val > cast(ulong) long.max)
                throw new IllegalArgumentException ("No handling available for ulong where value > long.max");
        return cInt.toString (val);
    } else static if (is (U : real)) {          // any (real) floating point type
        char[] ret = new char[32];              // minimum allowed by assert in format
        return cFloat.format (ret, val, U.dig+2, 1);// from old C++ tests, U.dig+2 gives best(?) accuracy
    }
    // Unsupported
    else
        static assert (false, "Unsupported type: "~U.stringof);
}

//BEGIN Utility funcs
/* Initial buffer length, in chars, reserved per element when formatting a value of type T.
 *
 * This is only a starting estimate: buffers sized with it can still be expanded by the code
 * using them. The general value is used for every type without a specialisation below.
 */
private template defLength(T)
{
    const uint defLength = 20;
}

// Specialisation for char.
private template defLength(T : char)
{
    const uint defLength = 4;
}

// Specialisation for bool.
private template defLength(T : bool)
{
    const uint defLength = 5;
}
// Returns true if c must be written as a backslash escape sequence: one of the control
// characters \a \b \t \n \v \f \r, a double quote, a single quote or a backslash.
private bool isEscapableChar (char c) {
    switch (c) {
        case '\a', '\b', '\t', '\n', '\v', '\f', '\r':
        case '\"', '\'', '\\':
            return true;
        default:
            return false;
    }
}
// Maps a character that needs escaping to the character written after the backslash in its
// escape sequence (e.g. a newline maps to 'n').
//
// Throws an IllegalArgumentException for any other character; within serialize this should never
// happen, since escapeChar is only called for characters accepted by isEscapableChar.
private char escapeChar (char c) {
    switch (c) {
        case '\a': return 'a';
        case '\b': return 'b';
        case '\t': return 't';
        case '\n': return 'n';
        case '\v': return 'v';
        case '\f': return 'f';
        case '\r': return 'r';
        case '\"': return '\"';
        case '\'': return '\'';
        case '\\': return '\\';
        default:   break;
    }
    
    // No escape sequence is known for c:
    throw new IllegalArgumentException ("Internal error (escapeChar)");
}
//END Utility funcs



// Unit tests for serialize; only compiled in when the mdeUnitTest debug identifier is set.
debug (mdeUnitTest) {
    import tango.util.log.Log : Log, Logger;

    // Logger used to trace caught exceptions and to report completion of the unittest.
    private Logger logger;
    static this() {
        logger = Log.getLogger ("mde.file.serialize");
    }
unittest {
    // Utility: runs dg and returns true if it threw an Exception (whose message is traced),
    // false if it completed normally.
    bool throws (void delegate() dg) {
        bool r = false;
        try {
            dg();
        } catch (Exception e) {
            r = true;
            logger.trace ("Exception caught: "~e.msg);
        }
        return r;
    }
    // Sanity-check the utility itself before relying on it below.
    assert (!throws ({ int i = 5; }));
    assert (throws ({ throw new Exception ("Test - this exception should be caught"); }));
    
    // Associative arrays
    char[] X = serialize!(char[][char]) (['a':cast(char[])"animal", 'b':['b','u','s']]);
    char[] Y = `['a':"animal",'b':"bus"]`;
    assert (X == Y);
    
    
    // Arrays
    // generic array stuff:
    assert (serialize!(double[]) ([1.0, 1.0e-10]) == `[1.00000000000000000,0.10000000000000000e-09]`);
    assert (serialize!(double[]) (cast(double[]) []) == `[]`);		// empty array
    
    // char[] conversions, with commas, escape sequences and multichar UTF8 characters:
    assert (serialize!(char[][]) ([ ".\""[], [',','\''] ,"!\b€" ]) == `[".\"",",\'","!\b€"]`);
    
    // wchar[] and dchar[] conversions:
    // The characters were pretty-much pulled at random from unicode tables.
    assert (serialize!(wchar[]) ("Test string: ¶α؟अกሀ搀"w) == "\"Test string: ¶α؟अกሀ搀\"");
    assert (serialize!(dchar[]) ("Test string: ¶α؟अกሀ搀"d) == "\"Test string: ¶α؟अกሀ搀\"");
    
    
    // ubyte[]: the expected format depends on the SPECIAL_BINARY_NOTATION option.
    static if (SPECIAL_BINARY_NOTATION)
        assert (serialize!(ubyte[]) (cast(ubyte[]) [0x01, 0xF2, 0xAC]) == `0x01f2ac`);	// ubyte[] special notation
    else
        assert (serialize!(ubyte[]) (cast(ubyte[]) [0x01, 0xF2, 0xAC]) == `[1,242,172]`);
    
    
    // Structs (nested; members are written as index:value pairs, foo2 keeps Foo's defaults)
    struct Foo {    int a = 9;  char b = '\v'; float c;    }
    struct Bar {    Foo a,b;    }
    static Foo foo1 = { a:150, b:'8', c:17.2f}, foo2;
    Bar bar;
    bar.a = foo1;
    bar.b = foo2;
    assert (serialize(bar) == "{0:{0:150,1:'8',2:1.72000007e+01},1:{0:9,1:'\\v',2:nan}}");
    
    
    // Basic Types
    // Character types
    assert (serialize!(char) ('\'') == "\'\\\'\'");
    assert (serialize!(wchar) ('X') == "'X'");
    assert (serialize!(dchar) ('X') == "'X'");
    assert (serialize!(wchar) ('£') == "'£'");  // unicode U+00A3 i.e. a multi-byte UTF-8 char
    assert (serialize!(dchar) ('£') == "'£'");
    assert (throws ({ serialize!(char) ('£'); }));      // compiler converts £ to char, but it's not valid UTF-8
    
    // Bool: the expected format depends on the BINARY_AS_WORDS option.
    static if (BINARY_AS_WORDS)
        assert (serialize(false) == "false");
    else
        assert (serialize(true) == "1");
    
    // Integers (signed and unsigned, including boundary values)
    assert (serialize (cast(byte) -5) == "-5");
    assert (serialize (cast(short) -32768) == "-32768");
    assert (serialize (-5) == "-5");
    assert (serialize (-9223372036854775807L) == "-9223372036854775807");
    assert (serialize (cast(ubyte) -1) == "255");
    assert (serialize (cast(ushort) -1) == "65535");
    assert (serialize!(uint) (-1) == "4294967295");
    assert (serialize (cast(ulong) 0x7FFF_FFFF_FFFF_FFFFLu) == "9223372036854775807");
    assert (serialize!(uint[]) ([0b0100u,0724,0xFa59c,0xFFFFFFFF,0]) ==
                               "[4,468,1025436,4294967295,0]");
    assert (throws ({
        // ulong is not properly supported: values above long.max are rejected.
        // NOTE: this is something that should really work.
        char[] r = serialize!(ulong) (0x8FFF_FFFF_FFFF_FFFFLu);
    }));
    
    // Floats
    // These numbers are not particularly meaningful:
    assert (serialize!(float) (0.0f) == "0.00000000");
    assert (serialize!(double) (-1e25) == "-1.00000000000000000e+25");
    assert (serialize!(real) (cast(real) 4.918e300) == "4.91800000000000000000e+300");
    
    // Escape sequences (test conversion functions)
    assert (serialize ("\a\b\t\n\v\f\r\"\'\\") == `"\a\b\t\n\v\f\r\"\'\\"`);
    
    logger.info ("Unittest complete.");
}
}