view mde/file/deserialize.d @ 154:0520cc00c0cc

Better error reporting for loading translations; avoided an infinite loop.
author Diggory Hardy <diggory.hardy@gmail.com>
date Sat, 18 Apr 2009 12:02:33 +0200
parents 7f7b40fed72b
children
line wrap: on
line source

/* LICENSE BLOCK
Part of mde: a Modular D game-oriented Engine
Copyright © 2007-2008 Diggory Hardy

This program is free software: you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation, either
version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>. */

/**************************************************************************************************
 * Generic deserialization templated function.
 *
 * Supports:
 *  Associative arrays, dynamic arrays (with usual formatting of strings), structs, char types,
 *  bool, int types, float types.
 *
 * There are also some public utility functions with their own documentation.
 *
 * Examples:
 * ------------------------------------------------------------------------------------------------
 * // Basic examples:
 * ulong        a = deserialize!(ulong) ("20350");
 * float        d = deserialize!(float) ("  1.2e-9 ");
 * int[]        b = deserialize!(int[]) ("[0,1,2,3]");
 *
 * // String and char[] syntax:
 * char[]       c = deserialize!(char[]) ("\"A string\"");
 * char[]       e = deserialize!(char[]) ("['a','n','o','t','h','e','r', ' ' ,'s','t','r','i','n','g']");
 *
 * // These can be used interchangeably; here's a more complex example of an associative array:
 * bool[char[]] f = deserialize!(bool[char[]]) ("[ \"one\":true, ['t','w','o']:false, \"three\":1, \"four\":000 ]");
 *
 * // There is also a special notation for ubyte[] types:
 * // The digits following 0x must be in pairs and each specify one ubyte.
 * assert ( deserialize!(ubyte[]) (`0x01F2AC`) == deserialize!(ubyte[]) (`[01 ,0xF2, 0xAC]`) );
 *
 * // There's no limit to the complexity!
 * char[char[][][][char]][bool] z = ...; // don't expect me to write this!
 * ------------------------------------------------------------------------------------------------
 *
 * Throws:
 *      May throw a ParseException or a UnicodeException (which both extend TextException).
 *
 * TODO: Optimize memory allocation (if possible?). Test best sizes for initial allocations
 * instead of merely guessing?
 *************************************************************************************************/
//NOTE: in case of multiple formats, make this a dummy module importing both serialize modules,
// or put all the code here.
module mde.file.deserialize;

// tango imports
import tango.core.Exception : TextException, UnicodeException;
import cInt = tango.text.convert.Integer;
import cFloat = tango.text.convert.Float;
import Utf = tango.text.convert.Utf;
import Util = tango.text.Util;

/**
 * Base class for deserialize exceptions.
 *
 * Thrown by the deserialize templates and the utility functions in this module when the input
 * text does not have the expected form. Extends tango's TextException, so callers may catch
 * either type.
 */
class ParseException : TextException
{
    /// Params: msg = description of what was wrong with the input.
    this( char[] msg )
    {
        super( msg );
    }
}

// parseTo!(T) (src) is equivalent to deserialize!(T) (src).
alias deserialize parseTo;      // support the old name

//BEGIN deserialize templates

// Associative arrays

/* Deserializes an associative array of the form [ KEY:DATA, KEY:DATA, ... ].
 *
 * Each comma-separated element (as produced by Split) is divided at the first ':' which is
 * neither inside a quoted string/character nor inside a nested [...] or {...}; the part before
 * is deserialized as S (the key type) and the part after as T (the value type).
 * A later entry with the same key replaces an earlier one.
 *
 * Throws:
 *      ParseException if src is not enclosed in [ ] or an element has no ':' separator, plus
 *      whatever the key/value deserializers and Split throw.
 */
T[S] deserialize(T : T[S], S) (char[] src) {
    src = Util.trim(src);
    if (src.length < 2 || src[0] != '[' || src[$-1] != ']')
        throw new ParseException ("Invalid associative array: not [ ... ]");  // bad braces.
    
    T[S] ret;
    foreach (char[] pair; Split (src[1..$-1])) {
        size_t i = 0;
        uint depth = 0;             // nesting level of [...] / {...} within the key
        while (i < pair.length) {   // advance to the ':' separating key and data
            char c = pair[i];
            // A ':' inside a nested array/struct key belongs to that key, not to this pair.
            if (c == ':' && depth == 0) break;
            if (c == '\'' || c == '"') {    // string or character: skip to the closing quote
                ++i;
                while (i < pair.length && pair[i] != c) {
                    if (pair[i] == '\\')
                        ++i;    // escape seq.: skip the escaped character too
                    ++i;
                }
                // An unterminated quote leaves i >= pair.length, which is caught below.
            }
            else if (c == '[' || c == '{')
                ++depth;
            else if ((c == ']' || c == '}') && depth > 0)
                --depth;        // a stray closer is left for the key deserializer to reject
            ++i;
        }
        if (i >= pair.length)
            throw new ParseException ("Invalid associative array: encountered [ ... KEY] (missing :DATA)");
        ret[deserialize!(S) (pair[0..i])] = deserialize!(T) (pair[i+1..$]);
    }
    return ret;
}


// Arrays

/* Deserializes a generic dynamic array of the form [ ELEMENT, ELEMENT, ... ].
 *
 * Only the enclosing brackets are checked here; the elements are handled by toArray.
 *
 * Throws: ParseException if the trimmed input is not enclosed in [ ].
 */
T[] deserialize(T : T[]) (char[] src) {
    src = Util.trim(src);
    bool bracketed = src.length >= 2 && src[0] == '[' && src[$-1] == ']';
    if (!bracketed)
        throw new ParseException ("Invalid array: not [ ... ]");
    return toArray!(T[]) (src);
}

// String (array special case)
/* Accepts either a double-quoted string, in which escape sequences are translated by
 * unEscapeChar, or a char array literal (['a', ...]) which is passed on to toArray.
 *
 * Throws: ParseException if the input has neither form, if a quoted string ends with a lone
 * backslash, or if an escape sequence is not supported.
 */
T deserialize(T : char[]) (char[] src) {
    src = Util.trim(src);
    bool longEnough = src.length >= 2;
    
    // Char array literal: handled by the generic array reader.
    if (longEnough && src[0] == '[' && src[$-1] == ']')
        return toArray!(T) (src);
    
    if (!(longEnough && src[0] == '"' && src[$-1] == '"'))
        throw new ParseException ("Invalid string: not quoted (\"*\") or char array (['a',...,'c'])");
    
    char[] text = src[1..$-1];      // contents between the quotes
    T result;
    result.length = text.length;    // upper bound: each escape sequence shrinks the output
    size_t written = 0;
    for (size_t read = 0; read < text.length; ++read) {
        char c = text[read];
        if (c == '\\') {
            ++read;
            if (read == text.length)    // the backslash escaped the closing quote
                throw new ParseException ("Invalid string: ends \\\" !");
            c = unEscapeChar (text[read]);  // throws if it's invalid
        }
        result[written++] = c;
    }
    return result[0..written];
}
// Unicode conversions for strings:
// The input is first read as a UTF-8 string (char[]) and then transcoded.
T deserialize(T : wchar[]) (char[] src) {
    char[] utf8 = deserialize!(char[]) (src);
    // A UnicodeException from the conversion is deliberately left to propagate.
    return Utf.toString16 (utf8);
}
T deserialize(T : dchar[]) (char[] src) {
    char[] utf8 = deserialize!(char[]) (src);
    // A UnicodeException from the conversion is deliberately left to propagate.
    return Utf.toString32 (utf8);
}

// Binary (array special case)
/* Accepts either the standard array notation ([1, 0xF2, ...]) or the compact notation 0x
 * followed by pairs of hex digits, each pair giving one ubyte.
 *
 * Throws: ParseException if the input has neither form, if the compact form has an odd number
 * of digits, or if one of its characters is not a hex digit.
 */
T deserialize(T : ubyte[]) (char[] src) {
    src = Util.trim(src);
    if (src.length >= 2) {
        // Standard case:
        if (src[0] == '[' && src[$-1] == ']')
            return toArray!(T) (src);
        
        // Special case: sequence of hex digits
        if (src[0..2] == "0x") {
            char[] digits = src[2..$];
            
            // Must be in pairs:
            if ((digits.length & 1) != 0)
                throw new ParseException ("Invalid binary: odd number of chars");
            
            T bytes;
            bytes.length = digits.length / 2;   // exact
            
            uint pos = 0;   // advanced by readHexChar
            for (size_t n = 0; n < bytes.length; ++n) {
                ubyte high = readHexChar (digits, pos);
                ubyte low  = readHexChar (digits, pos);
                bytes[n] = cast(ubyte) ((high << 4) | low);
            }
            return bytes;
        }
    }
    throw new ParseException ("Invalid ubyte[]: not an array and doesn't start 0x");
}


// Basic types

// Char
// Accepts 'x' (a single byte) or '\x' (one of the escape sequences known to unEscapeChar).
// A multi-byte UTF-8 character is more than one char and is therefore rejected as too long;
// no other unicode validation is done (the char[] conversions don't validate either).
T deserialize(T : char) (char[] src) {
    src = Util.trim(src);
    bool quoted = src.length >= 3 && src[0] == '\'' && src[$-1] == '\'';
    if (!quoted)
        throw new ParseException ("Invalid char: not 'x' or '\\x'");
    
    char[] inner = src[1..$-1];     // at least one char long
    if (inner[0] == '\\') {
        if (inner.length == 2)
            return unEscapeChar (inner[1]);     // escaped
        throw new ParseException ("Invalid char: '\\'");
    }
    if (inner.length == 1)
        return inner[0];                        // non escaped
    throw new ParseException ("Invalid char: too long (or non-ASCII)");
}
// Basic unicode conversions for wide-chars.
/* Accepts 'x', where x is UTF-8 text converting to exactly one wchar, or an escaped character
 * '\x' using the same escape sequences as the char version (see unEscapeChar).
 *
 * Throws: ParseException if the input is not quoted with ', contains an unsupported escape
 * sequence or does not convert to exactly one wchar. Utf.toString16 may also throw.
 */
T deserialize(T : wchar) (char[] src) {
    src = Util.trim(src);
    if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'')
        throw new ParseException ("Invalid char: not 'x' or '\\x'");
    // Escape sequence, for consistency with deserialize!(char). Without this branch a
    // backslash followed by one character could never convert to a single wchar.
    if (src.length == 4 && src[1] == '\\')
        return unEscapeChar (src[2]);
    T[] t = Utf.toString16 (src[1..$-1]);
    if (t.length == 1)
        return t[0];
    else
        throw new ParseException ("Invalid char: not one character");
}
/* Accepts 'x', where x is UTF-8 text converting to exactly one dchar, or an escaped character
 * '\x' using the same escape sequences as the char version (see unEscapeChar).
 *
 * Throws: ParseException if the input is not quoted with ', contains an unsupported escape
 * sequence or does not convert to exactly one dchar. Utf.toString32 may also throw.
 */
T deserialize(T : dchar) (char[] src) {
    src = Util.trim(src);
    if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'')
        throw new ParseException ("Invalid char: not 'x' or '\\x'");
    // Escape sequence, for consistency with deserialize!(char). Without this branch a
    // backslash followed by one character could never convert to a single dchar.
    if (src.length == 4 && src[1] == '\\')
        return unEscapeChar (src[2]);
    T[] t = Utf.toString32 (src[1..$-1]);
    if (t.length == 1)
        return t[0];
    else
        throw new ParseException ("Invalid char: not one character");
}

// Bool
// Accepts the words true and false, or a number written as any count of zeros (at least one)
// for false, or any count of zeros (possibly none) followed by a single 1 for true.
T deserialize(T : bool) (char[] src) {
    src = Util.trim(src);
    if (src == "true")
        return true;
    if (src == "false")
        return false;
    
    // Numeric form: count the leading zeros and look at what remains.
    size_t zeros = 0;
    while (zeros < src.length && src[zeros] == '0')
        ++zeros;
    size_t remaining = src.length - zeros;
    
    if (remaining == 0 && zeros > 0)
        return false;
    if (remaining == 1 && src[zeros] == '1')
        return true;
    throw new ParseException ("Invalid bool: not true or false and doesn't evaluate to 0 or 1");
}

// Ints
// One overload per built-in integer type; each simply forwards to toTInt, which trims the
// input, parses it with tango's Integer module and throws a ParseException on malformed or
// out-of-range input.
T deserialize(T : byte) (char[] src) {
    return toTInt!(T) (src);
}
T deserialize(T : short) (char[] src) {
    return toTInt!(T) (src);
}
T deserialize(T : int) (char[] src) {
    return toTInt!(T) (src);
}
T deserialize(T : long) (char[] src) {
    return toTInt!(T) (src);
}
T deserialize(T : ubyte) (char[] src) {
    return toTInt!(T) (src);
}
T deserialize(T : ushort) (char[] src) {
    return toTInt!(T) (src);
}
T deserialize(T : uint) (char[] src) {
    return toTInt!(T) (src);
}
T deserialize(T : ulong) (char[] src) {
    return toTInt!(T) (src);
}
// Quick check of the integer overloads: a negative value and the binary/octal/hex prefixes.
// NOTE(review): this is guarded by debug identifier UnitTest whereas the main test block at the
// end of this module uses mdeUnitTest (and repeats these assertions) - confirm which is intended.
debug (UnitTest) unittest {
    assert (deserialize!(byte) ("-5") == cast(byte) -5);
    // annoyingly, octal syntax differs from D (blame tango):
    assert (deserialize!(uint[]) ("[0b0100,0o724,0xFa59c,0xFFFFFFFF,0]") == [0b0100u,0724,0xFa59c,0xFFFFFFFF,0]);
}

// Floats
// One overload per built-in floating point type; each simply forwards to toTFloat, which trims
// the input, rejects an empty string and parses the rest with tango's Float module.
T deserialize(T : float) (char[] src) {
    return toTFloat!(T) (src);
}
T deserialize(T : double) (char[] src) {
    return toTFloat!(T) (src);
}
T deserialize(T : real) (char[] src) {
    return toTFloat!(T) (src);
}


// Structs
/* Deserializes a struct of the form { INDEX:DATA, INDEX:DATA, ... }, where INDEX is the
 * zero-based position of a field within T.tupleof and DATA is that field's serialized value.
 * Fields which are not mentioned keep their default value; if an index is given more than once
 * the last occurrence wins.
 *
 * This is the catch-all template: any type not handled by a more specialized overload ends up
 * here and is rejected at compile-time unless it is a struct.
 *
 * Throws:
 *      ParseException if src is not enclosed in { }, an element has no ':' separator or an
 *      index is not less than the number of fields, plus whatever the field deserializers and
 *      Split throw.
 */
T deserialize(T) (char[] src) {
    static assert (is(T == struct), "Unsupported type: "~T.stringof);
    
    src = Util.trim(src);
    if (src.length < 2 || src[0] != '{' || src[$-1] != '}')
        throw new ParseException ("Invalid struct: not { ... }");
    
    // cannot access elements of T.tupleof with non-const key, so use a type which can be
    // accessed with a non-const key to store slices:
    char[][T.tupleof.length] temp;
    foreach (char[] pair; Split (src[1..$-1])) {
        // advance to the ':' (the key must be an integer so no need for string checks)
        size_t i = 0;
        while (i < pair.length && pair[i] != ':')
            ++i;
        if (i >= pair.length)
            throw new ParseException ("Invalid KEY:DATA pair within struct: "~pair);
        
        size_t k = deserialize!(size_t) (pair[0..i]);
        // The index comes from the input, so it must be validated before use: report bad
        // data as a ParseException rather than overrunning temp.
        if (k >= temp.length)
            throw new ParseException ("Invalid struct: field index out of range: "~pair[0..i]);
        // Note: could check no entry was already stored in temp.
        temp[k] = pair[i+1..$];
    }
    T ret;
    setStruct (ret, temp);
    return ret;
}
//END deserialize templates

//BEGIN Utility funcs
/** Splits a string into substrings separated by '$(B ,)' with support for characters and strings
 * containing escape sequences and for embedded arrays ($(B [...])) and structs ($(B {...})).
 *
 * ---
 * foreach (element; Split(src))
 *     ...
 * ---
 * Where src is a string to separate on commas. It shouldn't have enclosing brackets.
 *
 * Output elements are substrings of src separated by commas, excluding the commas.
 * Commas within quotes or within an embedded array or struct do not separate elements.
 * Only whitespace at the very start and end of src is stripped, so elements may have leading or
 * trailing whitespace, and empty strings may get returned.
 *
 * Throws:
 *     ParseException if a ']' is found which does not close an embedded array or if a quote
 *     is not terminated.
 *
 * Remarks:
 *     This struct is primarily intended as a utility for use by the templates
 *     parsing arrays and associative arrays, but it may be useful in other cases too. Hence the
 *     fact no brackets are stripped from src.
 */
struct Split {
    /// Creates a Split over source (with surrounding whitespace trimmed).
    static Split opCall (char[] source) {
        Split ret;
        ret.src = Util.trim (source);
        return ret;
    }
    
    /// Calls dg once per element, stopping early if dg returns non-zero (foreach support).
    int opApply(int delegate(ref char[]) dg)
    {
        if (src == "")
            return 0;
        
        int result = 0;
        
        uint depth = 0;         // nesting depth of embedded arrays: [...]
        uint braces = 0;        // nesting depth of embedded structs: {...}
        size_t i = 0, j = 0;    // current read location, start of current piece
        
        while (i < src.length) {
            char c = src[i];
            if (c == '\'' || c == '"') {    // string or character
                ++i;
                while (i < src.length && src[i] != c) {
                    if (src[i] == '\\')
                        ++i;    // escape seq.: skip the escaped character too
                    ++i;
                }   // If there is no terminal quote, i ends up beyond src.length (checked below).
            }
            else if (c == '[') ++depth;
            else if (c == ']') {
                if (depth)
                    --depth;
                else throw new ParseException ("Invalid array literal: closes before end of data item.");
            }
            else if (c == '{') ++braces;
            else if (c == '}') {
                // An unmatched '}' is left for the element's deserializer to reject.
                if (braces)
                    --braces;
            }
            else if (c == ',' && depth == 0 && braces == 0) {   // not within an array or struct
                char[] t = src[j..i];
                result = dg(t);     // pass on this piece
                if (result)
                    return result;
                j = i + 1;
            }
            ++i;
        }
        // Skipping over a quoted section only overshoots the end when the quote was not closed.
        if (i > src.length)
            throw new ParseException ("Unterminated quote (\' or \")");
        
        char[] t = src[j..i];
        result = dg(t);     // pass on the final piece (i >= j)
        return result;
    }
    
    char[] src;     /// The (trimmed) text being split.
}

/* Templated read-int function to read signed or unsigned integers of any built-in size
 * (byte through long/ulong).
 *
 * Actually a reimplementation of tango.text.convert.Integer toLong and parse functions.
 *
 * The whole of src (after trimming whitespace) must be consumed, and the value must be
 * representable in TInt, including TInt.min for signed types. A negative sign is only accepted
 * for an unsigned type when the value is zero.
 *
 * NOTE: the magnitude is read into a ulong by tango; input too large for a ulong is not detected
 * here.
 *
 * Throws: ParseException on empty/invalid input or if the value is out of range for TInt.
 */
private TInt toTInt(TInt) (char[] src) {
    const char[] INT_OUT_OF_RANGE = "Integer out of range";
    bool sign;
    uint radix, ate, ate2;
    
    // Trim off whitespace.
    // NOTE: Cannot use tango.text.convert.Integer.trim to trim leading whitespace since it doesn't
    // treat new-lines, etc. as whitespace which for our purposes is whitespace.
    src = Util.trim (src);
    
    // Read the optional sign and radix prefix, then the digits (as an unsigned magnitude).
    ate = cInt.trim (src, sign, radix);
    if (ate == src.length)
        throw new ParseException ("Invalid integer: no digits");
    ulong val = cInt.convert (src[ate..$], radix, &ate2);
    ate += ate2;
    
    if (ate < src.length)
        throw new ParseException ("Invalid integer at marked character: \"" ~ src[0..ate] ~ "'" ~ src[ate] ~ "'" ~ src[ate+1..$] ~ "\"");
    
    if (sign) {
        static if (TInt.min != 0) {
            // Signed type: the magnitude may be one more than TInt.max (i.e. -TInt.min).
            if (val > cast(ulong) TInt.max + 1)
                throw new ParseException (INT_OUT_OF_RANGE);
            // Negate in unsigned arithmetic (well defined even for long.min) then truncate.
            return cast(TInt) (0 - val);
        } else {
            // Unsigned type: only "-0" is representable. Testing the magnitude avoids a
            // signed/unsigned comparison, which would go wrong for ulong.
            if (val != 0)
                throw new ParseException (INT_OUT_OF_RANGE);
            return cast(TInt) 0;
        }
    }
    
    if (val > cast(ulong) TInt.max)
        throw new ParseException (INT_OUT_OF_RANGE);
    return cast(TInt) val;
}

/* Basically a reimplementation of tango.text.convert.Float.toFloat which strips whitespace
 * (including new-lines) from both ends of the input before checking that the whole input was
 * consumed by the parser.
 *
 * Throws: ParseException if src is empty or is not, in its entirety, a valid number.
 */
private TFloat toTFloat(TFloat) (char[] src) {
    // NOTE: As for toTInt(), this needs to strip leading as well as trailing whitespace.
    src = Util.trim (src);
    if (src == "")
        throw new ParseException ("Invalid float: no digits");
    uint ate;   // number of chars consumed by the parser
    
    TFloat x = cFloat.parse (src, &ate);
    // Anything left unparsed (all of src, if it doesn't start with a number) is an error;
    // without this check input such as "1.0abc" would silently be read as 1.0.
    if (ate < src.length)
        throw new ParseException ("Invalid float at marked character: \"" ~ src[0..ate] ~ "'" ~ src[ate] ~ "'" ~ src[ate+1..$] ~ "\"");
    return x;
}

/* Translates the character following a backslash into the character the escape sequence
 * stands for. Supported escape sequences are the following subset of those supported by D:
 * \" \' \\ \a \b \f \n \r \t \v
 *
 * Throws: ParseException for any other character.
 */
private char unEscapeChar (char c)
{
    switch (c) {
        case '\"':  return '\"';
        case '\'':  return '\'';
        case '\\':  return '\\';
        case 'a':   return '\a';
        case 'b':   return '\b';
        case 'f':   return '\f';
        case 'n':   return '\n';
        case 'r':   return '\r';
        case 't':   return '\t';
        case 'v':   return '\v';
        default:    break;      // not a supported escape sequence
    }
    throw new ParseException ("Bad escape sequence: \\"~c);
}

// Returns the value (0-15) of the hex digit [0-9A-Fa-f] at src[pos] and advances pos past it.
// Throws a ParseException (leaving pos unchanged) for any other character.
// The caller must ensure pos < src.length; this is not checked here.
private ubyte readHexChar (char[] src, inout uint pos) {
    char digit = src[pos];
    ubyte value;
    if (digit >= '0' && digit <= '9')
        value = cast(ubyte) (digit - '0');
    else if (digit >= 'A' && digit <= 'F')
        value = cast(ubyte) (digit - 'A' + 10);
    else if (digit >= 'a' && digit <= 'f')
        value = cast(ubyte) (digit - 'a' + 10);
    else
        throw new ParseException ("Invalid hex digit.");
    ++pos;
    return value;
}

// Generic array reader
// Deserializes each comma-separated element (see Split) between the enclosing brackets as a T.
// Assumes input is of form "[xxxxx]" (i.e. first and last chars are '[', ']' and length >= 2);
// the callers check this before calling.
private T[] toArray(T : T[]) (char[] src) {
    T[] buffer = new T[16];     // start with some room to avoid unnecessary allocations
    size_t count = 0;           // number of elements stored so far
    foreach (char[] item; Split(src[1..$-1])) {
        if (count == buffer.length)
            buffer.length = buffer.length * 2;  // full: double the capacity
        buffer[count] = deserialize!(T) (item);
        ++count;
    }
    return buffer[0..count];    // only the part actually filled
}

/** Set a struct's elements from an array.
*
* Element i of src, if set, is deserialized to the type of the i-th field of s (s.tupleof[i])
* and assigned to that field; fields whose element of src is unset are left untouched.
*
* The template recurses at compile-time with i going from 0 to N, since s.tupleof can only be
* indexed with a compile-time constant.
*
* Params:
*   s   = struct to assign to.
*   src = one serialized value per field of S, in declaration order.
*
* For a more generic version, see http://www.dsource.org/projects/tutorials/wiki/StructTupleof
*/
// NOTE: Efficiency? Do recursive calls get inlined?
private void setStruct(S, size_t N, size_t i = 0) (ref S s, char[][N] src) {
    static assert (is(S == struct), "Only to be used with structs.");
    static assert (N == S.tupleof.length, "src.length != S.tupleof.length");
    static if (i < N) {     // recursion ends when i reaches N
        if (src[i])         // skip fields for which no data was given
            s.tupleof[i] = deserialize!(typeof(s.tupleof[i])) (src[i]);
        setStruct!(S, N, i+1) (s, src);     // continue with the next field
    }
}
//END Utility funcs

// Unit tests for the whole module; only compiled with debug identifier mdeUnitTest.
debug (mdeUnitTest) {
    import tango.math.IEEE;	// feqrel
    import tango.util.log.Log : Log, Logger;
    
    private Logger logger;
    static this() {
        logger = Log.getLogger ("mde.file.deserialize");
    }
unittest {
    // Utility: returns true if calling dg throws any Exception.
    bool throws (void delegate() dg) {
        bool r = false;
        try {
            dg();
        } catch (Exception e) {
            r = true;
        }
        return r;
    }
    // Check the utility itself first:
    assert (!throws ({ int i = 5; }));
    assert (throws ({ throw new Exception ("Test - this exception should be caught"); }));
    
    
    // Associative arrays
    char[][char] X = deserialize!(char[][char]) (`['a':"animal\n", 'b':['b','u','s','\n']]`);
    char[][char] Y = ['a':cast(char[])"animal\n", 'b':['b','u','s','\n']];
    
    //FIXME: when the compiler's fixed: http://d.puremagic.com/issues/show_bug.cgi?id=1671
    // just assert (X == Y)
    assert (X.length == Y.length);
    assert (X.keys == Y.keys);
    assert (X.values == Y.values);
    //X.rehash; Y.rehash;   // doesn't make a difference
    //assert (X == Y);      // fails (compiler bug)
    
    assert (throws ({ deserialize!(int[int]) (`[1:1`); }));             // bad brackets
    assert (throws ({ deserialize!(int[char[]]) (`["ab\":1]`); }));     // unterminated quote
    assert (throws ({ deserialize!(int[char[]]) (`["abc,\a\b\c":1]`); }));    // bad escape seq.
    assert (throws ({ deserialize!(int[char[]]) (`["abc"]`); }));       // no data
    
    
    // Arrays
    assert (deserialize!(double[]) (`[1.0,1.0e-10]`) == [1.0, 1.0e-10]);// generic array stuff
    assert (deserialize!(double[]) (`[     ]`) == cast(double[]) []);   // empty array
    assert (deserialize!(int[][]) (`[[1],[2,3],[]]`) == [[1],[2,3],[]]);// sub-array
    assert (throws ({ deserialize!(int[]) (`[1,2`); }));                // bad brackets
    assert (throws ({ deserialize!(int[][]) (`[[1]]]`); }));            // bad brackets
    
    // char[] and char conversions, with commas, escape sequences and multichar UTF8 characters:
    assert (deserialize!(char[][]) (`[ ".\"", [',','\''] ,"!\b€" ]`) == [ ".\"".dup, [',','\''] ,"!\b€" ]);
    assert (throws ({ deserialize!(char[]) ("\"\\\""); }));             // string ending with a backslash
    assert (throws ({ deserialize!(char[]) (`['a'`); }));               // bad brackets
    
    // wchar[] and dchar[] conversions:
    // The characters were pretty-much pulled at random from unicode tables.
    // The last few cause some weird (display only) effects in my editor.
    assert (deserialize!(wchar[]) ("\"Test string: ¶α؟अกሀ搀\"") == "Test string: ¶α؟अกሀ搀"w);
    assert (deserialize!(dchar[]) ("\"Test string: ¶α؟अกሀ搀\"") == "Test string: ¶α؟अกሀ搀"d);
    
    assert (deserialize!(ubyte[]) (`0x01F2aC`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);    // ubyte[] special notation
    assert (deserialize!(ubyte[]) (`[01 ,0xF2, 0xAC]`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);    // ubyte[] std notation
    assert (throws ({ deserialize!(ubyte[]) (`0x123`); }));             // digits not in pairs
    assert (throws ({ deserialize!(ubyte[]) (`[2,5`); }));              // not [...] or 0x..
    assert (throws ({ deserialize!(ubyte[]) (`0x123j`); }));            // invalid hex digit
    
    
    // char types
    assert (deserialize!(char) ("'\\\''") == '\'');
    assert (deserialize!(wchar) ("'X'") == 'X');
    assert (deserialize!(dchar) ("'X'") == 'X');
    assert (deserialize!(wchar) ("'£'") == '£');
    assert (deserialize!(dchar) ("'£'") == '£');
    assert (throws ({ deserialize!(char) ("'\\'"); }));       // lone backslash
    assert (throws ({ deserialize!(char) ("'£'"); }));        // non-ascii
    assert (throws ({ deserialize!(char) ("''"); }));         // empty
    assert (throws ({ deserialize!(char) ("'ab'"); }));       // too long
    assert (throws ({ deserialize!(wchar) ("''"); }));        // empty
    
    
    // bool
    assert (deserialize!(bool[]) (`[true,false,01,00]`) == cast(bool[]) [1,0,1,0]);
    assert (throws ({ deserialize!(bool) ("011"); }));        // neither 0 nor 1
    
    
    // ints
    assert (deserialize!(byte) ("-5") == cast(byte) -5);
    assert (deserialize!(int) ("-0x7FFFFFFF") == cast(int) -0x7FFF_FFFF);
    // annoyingly, octal syntax differs from D (blame tango):
    assert (deserialize!(uint[]) ("[0b0100,0o724,0xFa59c,0xFFFFFFFF,0]") == [0b0100u,0724,0xFa59c,0xFFFFFFFF,0]);
    assert (throws ({ deserialize!(int) (""); }));            // no digits
    assert (throws ({ deserialize!(int) ("0x8FFFFFFF"); }));  // out of range
    assert (throws ({ deserialize!(uint) ("-1"); }));         // negative value for unsigned type
    assert (throws ({ deserialize!(uint) ("1a"); }));         // trailing non-digit
    
    
    // floats
    assert (feqrel (deserialize!(float) ("0.0"), 0.0f) >= float.mant_dig-1);
    assert (feqrel (deserialize!(double) ("-1e25"), -1e25) >= double.mant_dig-2);
    assert (feqrel (deserialize!(real) ("5.24e-269"), cast(real) 5.24e-269) >= real.mant_dig-3);
    assert (throws ({ deserialize!(float) (""); }));          // no digits
    
    
    // structs
    struct A {  int x = 5;  char y; }
    struct B {  A a;    float b;   }
    A a;    a.y = 'y';
    assert (deserialize!(A) ("{ 1 : 'y' }") == a);            // field 0 keeps its default
    B b;    b.a = a;    b.b = 1.0f;
    assert (deserialize!(B) (" {1:1.0,0: { 1 : 'y' } } ") == b);    // nested struct, any order
    assert (throws ({ deserialize!(A) (" 1:'x'}"); })); // bad braces
    assert (throws ({ deserialize!(A) ("{ 1 }"); }));     // no :DATA
    
    
    // unEscapeChar
    assert (deserialize!(char[]) ("\"\\a\\b\\t\\n\\v\\f\\r\\\"\\\'\\\\\"") == "\a\b\t\n\v\f\r\"\'\\");
    
    logger.info ("Unittest complete.");
}
}