Mercurial > projects > mde
view mde/file/deserialize.d @ 154:0520cc00c0cc
Better error reporting for loading translations; avoided an infinite loop.
author | Diggory Hardy <diggory.hardy@gmail.com> |
---|---|
date | Sat, 18 Apr 2009 12:02:33 +0200 |
parents | 7f7b40fed72b |
children |
line wrap: on
line source
/* LICENSE BLOCK
Part of mde: a Modular D game-oriented Engine
Copyright © 2007-2008 Diggory Hardy

This program is free software: you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation, either
version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>. */

/**************************************************************************************************
 * Generic deserialization templated function.
 *
 * Supports:
 *  Associative arrays, dynamic arrays (with usual formatting of strings), structs, char types,
 *  bool, int types, float types.
 *
 * There are also some public utility functions with their own documentation.
 *
 * Examples:
 * ------------------------------------------------------------------------------------------------
 * // Basic examples:
 * ulong a = deserialize!(ulong) ("20350");
 * float d = deserialize!(float) (" 1.2e-9 ");
 * int[] b = deserialize!(int[]) ("[0,1,2,3]");
 *
 * // String and char[] syntax:
 * char[] c = deserialize!(char[]) ("\"A string\"");
 * char[] e = deserialize!(char[]) ("['a','n','o','t','h','e','r', ' ' ,'s','t','r','i','n','g']");
 *
 * // These may be used interchangeably; here's a more complex example of an associative array:
 * bool[char[]] f = deserialize!(bool[char[]]) ("[ \"one\":true, ['t','w','o']:false, \"three\":1, \"four\":000 ]");
 *
 * // There is also a special notation for ubyte[] types:
 * // The digits following 0x must be in pairs and each specify one ubyte.
 * assert ( deserialize!(ubyte[]) (`0x01F2AC`) == deserialize!(ubyte[]) (`[01 ,0xF2, 0xAC]`) );
 *
 * // There's no limit to the complexity!
 * char[char[][][][char]][bool] z = ...; // don't expect me to write this!
 * ------------------------------------------------------------------------------------------------
 *
 * Throws:
 *  May throw a ParseException or a UnicodeException (which both extend TextException).
 *
 * TODO: Optimize memory allocation (if possible?). Test best sizes for initial allocations
 * instead of merely guessing?
 *************************************************************************************************/
//NOTE: in case of multiple formats, make this a dummy module importing both serialize modules,
// or put all the code here.
module mde.file.deserialize;

// tango imports
import tango.core.Exception : TextException, UnicodeException;
import cInt = tango.text.convert.Integer;
import cFloat = tango.text.convert.Float;
import Utf = tango.text.convert.Utf;
import Util = tango.text.Util;

/**
 * Base class for deserialize exceptions.
 */
class ParseException : TextException
{
    this( char[] msg )
    {
        super( msg );
    }
}

alias deserialize parseTo;      // support the old name

//BEGIN deserialize templates

/** Associative arrays: reads "[ KEY:DATA, KEY:DATA, ... ]" into a T[S].
 *
 * Each comma-separated pair (as produced by Split) is divided at the first ':' which lies outside
 * any quoted character/string and outside any nested [...] or {...}; the part before it is
 * deserialized as the key type S and the part after it as the value type T.
 *
 * Throws:
 *  ParseException if src is not enclosed in [ ] or a pair has no top-level ':'; anything thrown
 *  when deserializing the keys and values.
 */
T[S] deserialize(T : T[S], S) (char[] src) {
    src = Util.trim(src);
    if (src.length < 2 || src[0] != '[' || src[$-1] != ']')
        throw new ParseException ("Invalid associative array: not [ ... ]");  // bad braces.
    
    T[S] ret;
    foreach (char[] pair; Split (src[1..$-1])) {
        uint i = 0;
        uint depth = 0;         // nesting depth of [...] / {...} within the key
        while (i < pair.length) {   // advance to the ':' separating KEY from DATA
            char c = pair[i];
            // A ':' inside a bracketed/braced key (e.g. a key which is itself an associative
            // array or a struct) belongs to the key, so only stop at nesting depth zero.
            if (c == ':' && depth == 0) break;
            if (c == '\'' || c == '"') {    // string or character
                ++i;
                while (i < pair.length && pair[i] != c) {
                    if (pair[i] == '\\') ++i;   // escape seq.
                    ++i;
                }
                // Could have an unterminated ' or " causing i >= pair.length, but:
                // 1. Impossible: Split would have thrown
                // 2. In any case this would be caught below.
            }
            else if (c == '[' || c == '{') ++depth;
            else if ((c == ']' || c == '}') && depth) --depth;
            ++i;
        }
        if (i >= pair.length)
            throw new ParseException ("Invalid associative array: encountered [ ... KEY] (missing :DATA)");
        ret[deserialize!(S) (pair[0..i])] = deserialize!(T) (pair[i+1..$]);
    }
    return ret;
}

/** Arrays: reads "[ elem, elem, ... ]", deserializing each element as T.
 *
 * Throws:
 *  ParseException if src (after trimming) is not enclosed in [ ].
 */
T[] deserialize(T : T[]) (char[] src) {
    src = Util.trim(src);
    if (src.length >= 2 && src[0] == '[' && src[$-1] == ']') return toArray!(T[]) (src);
    throw new ParseException ("Invalid array: not [ ... ]");
}

/** String (array special case): accepts either a double-quoted string with escape sequences or
 * the generic array-of-char form ['a','b',...].
 *
 * Throws:
 *  ParseException if src is neither form, if a quoted string ends in a lone backslash or contains
 *  an escape sequence unEscapeChar rejects.
 */
T deserialize(T : char[]) (char[] src) {
    src = Util.trim(src);
    if (src.length >= 2 && src[0] == '"' && src[$-1] == '"') {
        src = src[1..$-1];
        T ret;
        ret.length = src.length;    // maximum length; retract to actual length later
        uint i = 0;                 // write position in ret
        for (uint t = 0; t < src.length;) {     // t: read position in src
            // process a block of non-escaped characters
            uint s = t;
            while (t < src.length && src[t] != '\\') ++t;   // non-escaped characters
            uint j = i + t - s;
            ret[i..j] = src[s..t];  // copy a block
            i = j;
            
            // process a block of escaped characters
            while (t < src.length && src[t] == '\\') {
                t++;
                if (t == src.length)
                    throw new ParseException ("Invalid string: ends \\\" !");    // next char is "
                ret[i++] = unEscapeChar (src[t++]);     // throws if it's invalid
            }
        }
        return ret[0..i];
    }
    else if (src.length >= 2 && src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);
    throw new ParseException ("Invalid string: not quoted (\"*\") or char array (['a',...,'c'])");
}

// Unicode conversions for strings:
/// Reads src as a char[] (see above) and converts the result to UTF-16.
T deserialize(T : wchar[]) (char[] src) {
    // May throw a UnicodeException; don't bother catching and rethrowing:
    return Utf.toString16 (deserialize!(char[]) (src));
}
/// Reads src as a char[] (see above) and converts the result to UTF-32.
T deserialize(T : dchar[]) (char[] src) {
    // May throw a UnicodeException; don't bother catching and rethrowing:
    return Utf.toString32 (deserialize!(char[]) (src));
}

/** Binary (array special case): accepts either the generic array form [1, 0xF2, ...] or the
 * compact form 0x followed by pairs of hex digits, each pair giving one ubyte.
 *
 * Throws:
 *  ParseException if src is neither form, if the compact form has an odd number of digits or
 *  contains a non-hex character.
 */
T deserialize(T : ubyte[]) (char[] src) {
    src = Util.trim(src);
    // Standard case:
    if (src.length >= 2 && src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);
    // Special case: sequence of hex digits, each pair of which is a ubyte
    if (src.length >= 2 && src[0..2] == "0x") {
        src = src[2..$];        // strip down to actual digits
        
        // Must be in pairs:
        if (src.length % 2 == 1) throw new ParseException ("Invalid binary: odd number of chars");
        
        T ret;
        ret.length = src.length / 2;    // exact
        
        // readHexChar advances pos by one per call, so each iteration consumes two digits.
        for (uint i, pos; pos + 1 < src.length; ++i) {
            ubyte x = readHexChar(src, pos) << 4;
            x |= readHexChar(src, pos);
            ret[i] = x;
        }
        return ret;
    }
    else throw new ParseException ("Invalid ubyte[]: not an array and doesn't start 0x");
}

// Basic types

// Char
// Assumes value is <= 127 (for valid UTF-8), since input would be invalid UTF-8 if not anyway.
// (And we're not really interested in checking for valid unicode; char[] conversions don't either.)
/** Reads a single-quoted character: 'x', or an escape sequence '\x' supported by unEscapeChar.
 *
 * Throws:
 *  ParseException if src is not quoted, holds more than one byte unescaped, or is a bad escape.
 */
T deserialize(T : char) (char[] src) {
    src = Util.trim(src);
    if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'')
        throw new ParseException ("Invalid char: not 'x' or '\\x'");
    if (src[1] != '\\') {
        if (src.length == 3) return src[1];     // Either non escaped
        throw new ParseException ("Invalid char: too long (or non-ASCII)");
    } else if (src.length == 4) return unEscapeChar (src[2]);   // Or escaped
    
    throw new ParseException ("Invalid char: '\\'");
}
// Basic unicode conversions for wide-chars.
/** Reads a single-quoted UTF-16 character: 'x' (x being UTF-8 encoded in src), or one of the
 * escape sequences '\x' supported by unEscapeChar (as the char version accepts).
 *
 * Throws:
 *  ParseException if src is not quoted, does not convert to exactly one wchar, or is a bad
 *  escape sequence. Utf.toString16 may throw a UnicodeException.
 */
T deserialize(T : wchar) (char[] src) {
    src = Util.trim(src);
    if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'')
        throw new ParseException ("Invalid char: not 'x' or '\\x'");
    // Escaped form: exactly a backslash plus one character between the quotes. Without this the
    // two bytes convert to two code units and are rejected below.
    if (src.length == 4 && src[1] == '\\')
        return unEscapeChar (src[2]);
    T[] t = Utf.toString16 (src[1..$-1]);
    if (t.length == 1) return t[0];
    else throw new ParseException ("Invalid char: not one character");
}
/** Reads a single-quoted UTF-32 character: 'x' (x being UTF-8 encoded in src), or one of the
 * escape sequences '\x' supported by unEscapeChar (as the char version accepts).
 *
 * Throws:
 *  ParseException if src is not quoted, does not convert to exactly one dchar, or is a bad
 *  escape sequence. Utf.toString32 may throw a UnicodeException.
 */
T deserialize(T : dchar) (char[] src) {
    src = Util.trim(src);
    if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'')
        throw new ParseException ("Invalid char: not 'x' or '\\x'");
    // Escaped form; see the wchar version.
    if (src.length == 4 && src[1] == '\\')
        return unEscapeChar (src[2]);
    T[] t = Utf.toString32 (src[1..$-1]);
    if (t.length == 1) return t[0];
    else throw new ParseException ("Invalid char: not one character");
}

// Bool
/** Reads "true" / "false", or a run of digits evaluating to 0 or 1 (any number of leading zeros
 * followed by nothing or by a single '1').
 *
 * Throws:
 *  ParseException for anything else (including the empty string).
 */
T deserialize(T : bool) (char[] src) {
    src = Util.trim(src);
    if (src == "true") return true;
    if (src == "false") return false;
    uint pos;
    while (src.length > pos && src[pos] == '0') ++pos;  // skip leading zeros
    if (src.length == pos && pos > 0) return false;     // only zeros (at least one)
    if (src.length == pos + 1 && src[pos] == '1') return true;
    throw new ParseException ("Invalid bool: not true or false and doesn't evaluate to 0 or 1");
}

// Ints
// All integer types forward to toTInt, which does the parsing and range checking.
T deserialize(T : byte) (char[] src) { return toTInt!(T) (src); }
T deserialize(T : short) (char[] src) { return toTInt!(T) (src); }
T deserialize(T : int) (char[] src) { return toTInt!(T) (src); }
T deserialize(T : long) (char[] src) { return toTInt!(T) (src); }
T deserialize(T : ubyte) (char[] src) { return toTInt!(T) (src); }
T deserialize(T : ushort) (char[] src) { return toTInt!(T) (src); }
T deserialize(T : uint) (char[] src) { return toTInt!(T) (src); }
T deserialize(T : ulong) (char[] src) { return toTInt!(T) (src); }
debug (UnitTest) unittest {
    assert (deserialize!(byte) ("-5") == cast(byte) -5);
    // annoyingly, octal syntax differs from D (blame tango):
    assert (deserialize!(uint[]) ("[0b0100,0o724,0xFa59c,0xFFFFFFFF,0]") == [0b0100u,0724,0xFa59c,0xFFFFFFFF,0]);
}

// Floats
// All floating-point types forward to toTFloat.
T deserialize(T : float) (char[] src) { return toTFloat!(T) (src); }
T deserialize(T : double) (char[] src) { return toTFloat!(T) (src); }
T deserialize(T : real) (char[] src) { return toTFloat!(T) (src); }

// Structs
/** Reads "{ INDEX:DATA, INDEX:DATA, ... }" into a struct of type T.
 *
 * INDEX is the position of the member within T.tupleof; DATA is deserialized as that member's
 * type. Members which are not listed keep the value they have in T.init.
 *
 * Throws:
 *  ParseException if src is not enclosed in { }, a pair has no ':', or INDEX is not a valid
 *  member index; anything thrown when deserializing the index or the member data.
 */
T deserialize(T) (char[] src) {
    static assert (is(T == struct), "Unsupported type: "~T.stringof);
    
    src = Util.trim(src);
    if (src.length < 2 || src[0] != '{' || src[$-1] != '}')
        throw new ParseException ("Invalid struct: not { ... }");
    
    // cannot access elements of T.tupleof with non-const key, so use a type which can be
    // accessed with a non-const key to store slices:
    char[][T.tupleof.length] temp;
    foreach (char[] pair; Split (src[1..$-1])) {
        uint i = 0;
        while (i < pair.length) {   // advance to the ':'
            char c = pair[i];
            if (c == ':') break;
            // key must be an int so no need for string checks
            ++i;
        }
        if (i >= pair.length)
            throw new ParseException ("Invalid KEY:DATA pair within struct: "~pair);
        
        size_t k = deserialize!(size_t) (pair[0..i]);
        // The index comes straight from the input: reject it here rather than indexing temp out
        // of bounds.
        if (k >= temp.length)
            throw new ParseException ("Invalid KEY:DATA pair within struct (index out of range): "~pair);
        // Note: could check no entry was already stored in temp.
        temp[k] = pair[i+1..$];
    }
    T ret;
    setStruct (ret, temp);
    return ret;
}
//END deserialize templates

//BEGIN Utility funcs
/** Splits a string into substrings separated by '$(B ,)' with support for characters and strings
 * containing escape sequences and for embedded arrays ($(B [...])) and structs ($(B {...})).
 *
 * ---
 * foreach (element; Split(src))
 *     ...
 * ---
 * Where src is a string to separate on commas. It shouldn't have enclosing brackets.
 *
 * Output elements are substrings of src separated by commas, excluding the commas.
 * Not all whitespace is stripped and empty strings may get returned.
 *
 * Remarks:
 * This struct is primarily intended as a utility for use by the templates
 * parsing arrays and associative arrays, but it may be useful in other cases too. Hence the
 * fact no brackets are stripped from src.
 */
struct Split {
    /// Creates a Split over source (surrounding whitespace is trimmed).
    static Split opCall (char[] source) {
        Split ret;
        ret.src = Util.trim (source);
        return ret;
    }
    
    /** Calls dg on each comma-separated piece of src in order; stops early if dg returns
     * non-zero. An empty src yields no pieces.
     *
     * Throws:
     *  ParseException on a ']' or '}' with no matching opener within src, or on an unterminated
     *  quote.
     */
    int opApply(int delegate(ref char[]) dg) {
        if (src == "")
            return 0;
        
        int result = 0;
        
        uint depth = 0;             // nesting depth (embedded arrays / structs)
        size_t i = 0, j = 0;        // current read location, start of current piece
        
        while (i < src.length) {
            char c = src[i];
            if (c == '\'' || c == '"') {    // string or character
                ++i;
                while (i < src.length && src[i] != c) {
                    if (src[i] == '\\') ++i;    // escape seq.
                    ++i;
                }
                // No terminal quote leaves i >= src.length here, hence i > src.length after the
                // increment below; this is detected after the loop.
            }
            else if (c == '[' || c == '{') ++depth;
            else if (c == ']' || c == '}') {
                if (depth)
                    --depth;
                else
                    throw new ParseException ((c == ']')
                        ? "Invalid array literal: closes before end of data item."
                        : "Invalid struct literal: closes before end of data item.");
            }
            else if (c == ',' && depth == 0) {      // only if not inside an embedded array/struct
                char[] t = src[j..i];
                result = dg(t);                     // pass on this piece
                if (result) return result;
                j = i + 1;
            }
            ++i;
        }
        if (i > src.length)
            throw new ParseException ("Unterminated quote (\' or \")");
        
        char[] t = src[j..i];
        result = dg(t);             // pass on final piece (i >= j)
        return result;
    }
    
    char[] src;
}

/* Templated read-int function to read (un)signed 1-8 byte integers.
 *
 * Actually a reimplementation of tango.text.convert.Integer toLong and parse functions.
 *
 * Throws a ParseException if src holds no digits, holds anything after the number, or the value
 * does not fit TInt (any number with a leading '-' is out of range for unsigned types).
 */
private TInt toTInt(TInt) (char[] src) {
    const char[] INT_OUT_OF_RANGE = "Integer out of range";
    bool sign;
    uint radix, ate, ate2;
    
    // Trim off whitespace.
    // NOTE: Cannot use tango.text.convert.Integer.trim to trim leading whitespace since it doesn't
    // treat new-lines, etc. as whitespace which for our purposes is whitespace.
    src = Util.trim (src);
    
    ate = cInt.trim (src, sign, radix);
    if (ate == src.length)
        throw new ParseException ("Invalid integer: no digits");
    ulong val = cInt.convert (src[ate..$], radix, &ate2);
    ate += ate2;
    
    if (ate < src.length)
        throw new ParseException ("Invalid integer at marked character: \"" ~ src[0..ate] ~ "'" ~ src[ate] ~ "'" ~ src[ate+1..$] ~ "\"");
    
    // val is the magnitude. The range check must depend on the sign: the magnitude of TInt.min
    // is one more than TInt.max.
    if (sign) {
        static if (TInt.min == 0) {
            // Unsigned type: a negative number is never representable.
            throw new ParseException (INT_OUT_OF_RANGE);
        } else {
            const ulong NEG_LIMIT = cast(ulong) TInt.max + 1;   // magnitude of TInt.min
            if (val > NEG_LIMIT)
                throw new ParseException (INT_OUT_OF_RANGE);
            // For TInt == long and val == NEG_LIMIT the negation wraps to long.min, which is the
            // intended result.
            long sval = cast(long) -val;
            return cast(TInt) sval;
        }
    } else {
        if (val > cast(ulong) TInt.max)
            throw new ParseException (INT_OUT_OF_RANGE);
        return cast(TInt) val;
    }
}

/* Basically a reimplementation of tango.text.convert.Float.toFloat which checks for
 * whitespace before throwing an exception for overlong input.
 *
 * Throws a ParseException if src is empty (after trimming) or is not entirely consumed by the
 * number parser.
 */
private TFloat toTFloat(TFloat) (char[] src) {
    // NOTE: As for toTInt(), this needs to strip leading as well as trailing whitespace.
    src = Util.trim (src);
    if (src == "")
        throw new ParseException ("Invalid float: no digits");
    uint ate;
    
    TFloat x = cFloat.parse (src, &ate);
    // src is non-empty here, so this also covers the case where nothing at all was parsed.
    if (ate < src.length)
        throw new ParseException ("Invalid float: \"" ~ src ~ "\"");
    return x;
}

/* Throws an exception on invalid escape sequences. Supported escape sequences are the following
 * subset of those supported by D: \" \' \\ \a \b \f \n \r \t \v
 *
 * c is the character following the backslash; the return value is the character it stands for.
 */
private char unEscapeChar (char c)
{
    switch (c) {
        case '\"':  return '\"';
        case '\'':  return '\'';
        case '\\':  return '\\';
        case 'a':   return '\a';
        case 'b':   return '\b';
        case 'f':   return '\f';
        case 'n':   return '\n';
        case 'r':   return '\r';
        case 't':   return '\t';
        case 'v':   return '\v';
        default:    break;
    }
    
    // if we haven't returned:
    throw new ParseException ("Bad escape sequence: \\"~c);
}

// Reads one hex char: [0-9A-Fa-f]. Otherwise throws an exception. Doesn't check src.length.
// Returns the value (0-15) of the hex digit at src[pos] and advances pos past it.
private ubyte readHexChar (char[] src, inout uint pos) {
    char digit = src[pos];
    ubyte value;
    if (digit >= '0' && digit <= '9')
        value = digit - '0';
    else if (digit >= 'A' && digit <= 'F')
        value = digit - 'A' + 10;
    else if (digit >= 'a' && digit <= 'f')
        value = digit - 'a' + 10;
    else
        throw new ParseException ("Invalid hex digit.");
    ++pos;
    return value;
}

// Generic array reader
// Assumes input is of form "[xxxxx]" (i.e. first and last chars are '[', ']' and length >= 2).
// Each comma-separated element (see Split) is deserialized as T.
private T[] toArray(T : T[]) (char[] src) {
    T[] buffer = new T[16];     // start with some capacity to avoid unnecessary allocations
    uint count = 0;             // number of elements stored so far
    foreach (char[] item; Split(src[1..$-1])) {
        if (count == buffer.length)
            buffer.length = buffer.length * 2;  // full: double the capacity
        buffer[count] = deserialize!(T) (item);
        ++count;
    }
    return buffer[0..count];
}

/** Set a struct's elements from an array.
 *
 * Member number i of s is deserialized from src[i]; members whose src entry is null are left
 * untouched. Works by compile-time recursion over the member index i.
 *
 * For a more generic version, see http://www.dsource.org/projects/tutorials/wiki/StructTupleof
 */
// NOTE: Efficiency? Do recursive calls get inlined?
private void setStruct(S, size_t N, size_t i = 0) (ref S s, char[][N] src) {
    static assert (is(S == struct), "Only to be used with structs.");
    static assert (N == S.tupleof.length, "src.length != S.tupleof.length");
    static if (i < N) {
        if (src[i])
            s.tupleof[i] = deserialize!(typeof(s.tupleof[i])) (src[i]);
        setStruct!(S, N, i+1) (s, src);
    }
}
//END Utility funcs

debug (mdeUnitTest) {
    import tango.math.IEEE;     // feqrel
    import tango.util.log.Log : Log, Logger;
    
    private Logger logger;
    static this() {
        logger = Log.getLogger ("mde.file.deserialize");
    }
    
    unittest {
        // Utility: reports whether calling dg throws an Exception.
        bool throws (void delegate() dg) {
            try {
                dg();
            } catch (Exception e) {
                return true;
            }
            return false;
        }
        assert (!throws ({ int i = 5; }));
        assert (throws ({ throw new Exception ("Test - this exception should be caught"); }));
        
        
        // Associative arrays
        char[][char] X = deserialize!(char[][char]) (`['a':"animal\n", 'b':['b','u','s','\n']]`);
        char[][char] Y = ['a':cast(char[])"animal\n", 'b':['b','u','s','\n']];
        
        //FIXME: when the compiler's fixed: http://d.puremagic.com/issues/show_bug.cgi?id=1671
        // just assert (X == Y)
        assert (X.length == Y.length);
        assert (X.keys == Y.keys);
        assert (X.values == Y.values);
        //X.rehash; Y.rehash;   // doesn't make a difference
        //assert (X == Y);      // fails (compiler bug)
        
        assert (throws ({ deserialize!(int[int]) (`[1:1`); }));                 // bad brackets
        assert (throws ({ deserialize!(int[char[]]) (`["ab\":1]`); }));         // unterminated quote
        assert (throws ({ deserialize!(int[char[]]) (`["abc,\a\b\c":1]`); }));  // bad escape seq.
        assert (throws ({ deserialize!(int[char[]]) (`["abc"]`); }));           // no data
        
        
        // Arrays
        assert (deserialize!(double[]) (`[1.0,1.0e-10]`) == [1.0, 1.0e-10]);    // generic array stuff
        assert (deserialize!(double[]) (`[ ]`) == cast(double[]) []);           // empty array
        assert (deserialize!(int[][]) (`[[1],[2,3],[]]`) == [[1],[2,3],[]]);    // sub-array
        assert (throws ({ deserialize!(int[]) (`[1,2`); }));                    // bad brackets
        assert (throws ({ deserialize!(int[][]) (`[[1]]]`); }));                // bad brackets
        
        // char[] and char conversions, with commas, escape sequences and multichar UTF8 characters:
        assert (deserialize!(char[][]) (`[ ".\"", [',','\''] ,"!\b€" ]`) == [ ".\"".dup, [',','\''] ,"!\b€" ]);
        assert (throws ({ deserialize!(char[]) ("\"\\\""); }));
        assert (throws ({ deserialize!(char[]) (`['a'`); }));                   // bad brackets
        
        // wchar[] and dchar[] conversions:
        // The characters were pretty-much pulled at random from unicode tables.
        // The last few cause some weird (display only) effects in my editor.
        assert (deserialize!(wchar[]) ("\"Test string: ¶α؟अกሀ搀\"") == "Test string: ¶α؟अกሀ搀"w);
        assert (deserialize!(dchar[]) ("\"Test string: ¶α؟अกሀ搀\"") == "Test string: ¶α؟अกሀ搀"d);
        
        assert (deserialize!(ubyte[]) (`0x01F2aC`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);            // ubyte[] special notation
        assert (deserialize!(ubyte[]) (`[01 ,0xF2, 0xAC]`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);    // ubyte[] std notation
        assert (throws ({ deserialize!(ubyte[]) (`0x123`); }));                 // digits not in pairs
        assert (throws ({ deserialize!(ubyte[]) (`[2,5`); }));                  // not [...] or 0x..
        assert (throws ({ deserialize!(ubyte[]) (`0x123j`); }));
        
        
        // char types
        assert (deserialize!(char) ("'\\\''") == '\'');
        assert (deserialize!(wchar) ("'X'") == 'X');
        assert (deserialize!(dchar) ("'X'") == 'X');
        assert (deserialize!(wchar) ("'£'") == '£');
        assert (deserialize!(dchar) ("'£'") == '£');
        assert (throws ({ deserialize!(char) ("'\\'"); }));
        assert (throws ({ deserialize!(char) ("'£'"); }));                      // non-ascii
        assert (throws ({ deserialize!(char) ("''"); }));
        assert (throws ({ deserialize!(char) ("'ab'"); }));
        assert (throws ({ deserialize!(wchar) ("''"); }));
        
        
        // bool
        assert (deserialize!(bool[]) (`[true,false,01,00]`) == cast(bool[]) [1,0,1,0]);
        assert (throws ({ deserialize!(bool) ("011"); }));
        
        
        // ints
        assert (deserialize!(byte) ("-5") == cast(byte) -5);
        assert (deserialize!(int) ("-0x7FFFFFFF") == cast(int) -0x7FFF_FFFF);
        // annoyingly, octal syntax differs from D (blame tango):
        assert (deserialize!(uint[]) ("[0b0100,0o724,0xFa59c,0xFFFFFFFF,0]") == [0b0100u,0724,0xFa59c,0xFFFFFFFF,0]);
        assert (throws ({ deserialize!(int) (""); }));
        assert (throws ({ deserialize!(int) ("0x8FFFFFFF"); }));
        assert (throws ({ deserialize!(uint) ("-1"); }));
        assert (throws ({ deserialize!(uint) ("1a"); }));
        
        
        // floats
        assert (feqrel (deserialize!(float) ("0.0"), 0.0f) >= float.mant_dig-1);
        assert (feqrel (deserialize!(double) ("-1e25"), -1e25) >= double.mant_dig-2);
        assert (feqrel (deserialize!(real) ("5.24e-269"), cast(real) 5.24e-269) >= real.mant_dig-3);
        assert (throws ({ deserialize!(float) (""); }));
        
        
        // structs
        struct A { int x = 5; char y; }
        struct B { A a; float b; }
        A a; a.y = 'y';
        assert (deserialize!(A) ("{ 1 : 'y' }") == a);
        B b; b.a = a; b.b = 1.0f;
        assert (deserialize!(B) (" {1:1.0,0: { 1 : 'y' } } ") == b);
        assert (throws ({ deserialize!(A) (" 1:'x'}"); }));                     // bad braces
        assert (throws ({ deserialize!(A) ("{ 1 }"); }));                       // no :DATA
        
        
        // unEscapeChar
        assert (deserialize!(char[]) ("\"\\a\\b\\t\\n\\v\\f\\r\\\"\\\'\\\\\"") == "\a\b\t\n\v\f\r\"\'\\");
        
        logger.info ("Unittest complete.");
    }
}