Mercurial > projects > mde
diff mde/text/parse.d @ 4:9a990644948c
Many changes: upgraded to tango 0.99.4, reorganised mde/input, large changes to mde/mergetag and mde/init, separated off test/MTTest.d and more.
committer: Diggory Hardy <diggory.hardy@gmail.com>
author | Diggory Hardy <diggory.hardy@gmail.com> |
---|---|
date | Sun, 06 Jan 2008 17:38:51 +0000 |
parents | 485c98ecbd91 |
children | dcb24afa0dce |
line wrap: on
line diff
--- a/mde/text/parse.d Sat Nov 03 16:06:06 2007 +0000 +++ b/mde/text/parse.d Sun Jan 06 17:38:51 2008 +0000 @@ -1,17 +1,23 @@ /************************************************************************************************** * This contains templates for converting a char[] to various data-types. * - * Copyright (c) 2007 Diggory Hardy. - * Licensed under the Academic Free License version 3.0 + * Authors: Diggory Hardy, diggory.hardy@gmail.com + * Copyright: Copyright © 2007 Diggory Hardy. + * License: Licensed under the Academic Free License version 3.0 * * This module basically implements the following templated function for $(B most) basic D types: * bool, byte, short, int, long, ubyte, ushort, uint, ulong, float, double, real, char. - * It also supports arrays of any supported type (including of other arrays) and has special - * handling for strings (char[]) and binary (ubyte[]) data-types. + * It also supports arrays and associative arrays of any supported type (including of other arrays) + * and has special handling for strings (char[]) and binary (ubyte[]) data-types. * ----------------------------- * T parse(T) (char[] source); * ----------------------------- * + * The syntax is mostly the same used by D without any prefixes/suffixes (except 0x, 0b & 0o base + * specifiers). The following escape sequences are supported for strings and characters: \' \" \\ + * \a \b \f \n \r \t \v . Associative array literals use the same syntax as D, described here: + * $(LINK http://www.digitalmars.com/d/expression.html#AssocArrayLiteral). + * * There are also a few utility functions defined; the public ones have their own documentation. * * On errors, a warning is logged and an TextParseException is thrown. No other exceptions should @@ -21,6 +27,7 @@ // package imports import mde.text.exception; +import mde.text.util : postTrim; // tango imports import cInt = tango.text.convert.Integer; @@ -34,11 +41,54 @@ } //BEGIN parse templates +// Associative arrays +T[S] parse(T : T[S], S) (char[] src) { + src = Util.trim(src); + if (src.length < 2 || src[0] != '[' || src[$-1] != ']') + throwException ("Invalid associative array: not [a:x, ..., c:z]"); + + T[S] ret; + foreach (char[] pair; split (src[1..$-1])) { + uint i = 0; + while (i < pair.length) { // advance to the ':' + char c = pair[i]; + if (c == ':') break; + if (c == '\'' || c == '"') { // string or character + ++i; + while (i < pair.length && pair[i] != c) { + if (pair[i] == '\\') ++i; // escape seq. + ++i; + } // Doesn't throw if no terminal quote at end of pair (in this case an error is thrown anyway) + } + ++i; + } + if (i == pair.length) { + debug logger.trace ("In pair: " ~ pair); + throwException ("Invalid key:value pair in associative array literal"); + } + debug logger.trace ("pair is: " ~ pair[0..i] ~ " : " ~ pair[i+1..$]); + ret[parse!(S) (pair[0..i])] = parse!(T) (pair[i+1..$]); + } + return ret; +} +unittest { + char[][char] X = parse!(char[][char]) (`['a':"animal", 'b':['b','u','s']]`); + char[][char] Y = ['a':cast(char[])"animal", 'b':['b','u','s']]; + + //FIXME: when the compiler's fixed... + // just assert (X == Y) + assert (X.length == Y.length); + assert (X.keys == Y.keys); + assert (X.values == Y.values); + //X.rehash; Y.rehash; // doesn't make a difference + //assert (X == Y); // fails +} + // Arrays T[] parse(T : T[]) (char[] src) { src = Util.trim(src); if (src.length >= 2 && src[0] == '[' && src[$-1] == ']') return toArray!(T[]) (src); - throwException ("Invalid array: not [., ..., .]"); + throwException ("Invalid array: not [x, ..., z]"); } T parse(T : char[]) (char[] src) { src = Util.trim(src); @@ -58,14 +108,14 @@ // process a block of escaped characters while (t < src.length && src[t] == '\\') { t++; - if (t == src.length) throwException (`Warning: \" in string! There's currently no support for this during tokenising. Thus your input's probably been garbled!`); // next char is " + if (t == src.length) throwException (`Warning: string ends \" !`); // next char is " ret[i++] = replaceEscapedChar (src[t++]); // throws if it's invalid } } return ret[0..i]; } else if (src.length >= 2 && src[0] == '[' && src[$-1] == ']') return toArray!(T) (src); - throwException ("Invalid string: not quoted (\"*\") or char array (['.',...,'.'])"); + throwException ("Invalid string: not quoted (\"*\") or char array (['a',...,'c'])"); } T parse(T : ubyte[]) (char[] src) { src = Util.trim(src); @@ -82,20 +132,32 @@ } return ret; } +unittest { + assert (parse!(double[]) (`[1.0,1.0e-10]`) == [1.0, 1.0e-10]); // generic array stuff + assert (parse!(double[]) (`[ ]`) == cast(double[]) []); // empty array + + // char[] and char conversions, with commas, escape sequences and multichar UTF8 characters: + assert (parse!(char[][]) (`[ ".\"", [',','\''] ,"!\b€" ]`) == [ ".\"".dup, [',','\''] ,"!\b€" ]); + + assert (parse!(ubyte[]) (`01F2AC`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]); // ubyte[] special notation + assert (parse!(ubyte[]) (`[01 ,0xF2, 0xAC]`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]); // ubyte[] std notation +} T parse(T : char) (char[] src) { src = Util.trim(src); if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'') - throwException ("Invalid char: not quoted (\'*\')"); + throwException ("Invalid char: not quoted ('c')"); if (src[1] != '\\' && src.length == 3) return src[1]; // Either non escaped if (src.length == 4) return replaceEscapedChar (src[2]); // Or escaped // Report various errors; warnings for likely and difficult to tell cases: - if (src[1] == '\\' && src.length == 3) throwException (`Warning: \' in char! There's currently no support for this during tokenising. Thus your input's probably been garbled!`); // next char is " + /+ This was caused by a bug. Shouldn't occur now normally. + if (src[1] == '\\' && src.length == 3) throwException (`Warning: \' in char! There's currently no support for this during tokenising. Thus your input's probably been garbled!`); // next char is ' +/ // Warn in case it's a multibyte UTF-8 character: if (src[1] & 0xC0u) throwException ("Invalid char: too long (non-ASCII UTF-8 characters cannot be read as a single character)"); throwException ("Invalid char: too long"); } +// unittest covered above T parse(T : bool) (char[] src) { src = Util.trim(src); @@ -107,6 +169,9 @@ if (src.length == pos + 1 && src[pos] == '1') return true; throwException ("Invalid bool: not true or false and doesn't evaluate to 0 or 1"); } +unittest { + assert (parse!(bool[]) (`[true,false,01,00]`) == cast(bool[]) [1,0,1,0]); +} T parse(T : byte) (char[] src) { return toTInt!(T) (src); @@ -132,6 +197,11 @@ T parse(T : ulong) (char[] src) { return toTInt!(T) (src); } +unittest { + assert (parse!(byte) ("-5") == cast(byte) -5); + // annoyingly, octal syntax differs from D (blame tango): + assert (parse!(uint[]) ("[0b0100,0o724,0xFa59c,0xFFFFFFFF,0]") == [0b0100u,0724,0xFa59c,0xFFFFFFFF,0]); +} T parse(T : float) (char[] src) { return toTFloat!(T) (src); @@ -142,6 +212,11 @@ T parse(T : real) (char[] src) { return toTFloat!(T) (src); } +unittest { + assert (parse!(float) ("0.0") == 0.0f); + assert (parse!(double) ("-1e25") == -1e25); + assert (parse!(real) ("5.24e-269") == cast(real) 5.24e-269); +} //END parse templates //BEGIN Utility funcs @@ -155,6 +230,7 @@ uint radix, ate, ate2; ate = cInt.trim (src, sign, radix); + if (ate == src.length) throwException ("Invalid integer: no digits"); ulong val = cInt.convert (src[ate..$], radix, &ate2); ate += ate2; @@ -174,17 +250,54 @@ /** Basically a reimplementation of tango.text.convert.Float.toFloat which checks for trailing * whitespace before throwing an exception for overlong input and throws my exception class - * when it does. - */ + * when it does. */ TFloat toTFloat(TFloat) (char[] src) { + src = postTrim (src); + if (src == "") throwException ("Invalid float: no digits"); uint ate; TFloat x = cFloat.parse (src, &ate); - while (ate < src.length) { - if (src[ate] == ' ' || src[ate] == '\t') ++ate; - else throwException ("Invalid number"); + return x; +} + +/** Splits a string into substrings separated by '$(B ,)' with support for characters and strings + * containing escape sequences and for embedded arrays ($(B [...])). + * + * Empty strings may get returned. */ +char[][] split (char[] src) { + src = Util.trim (src); + if (src == "") return []; // empty array: no elements when no data + + uint depth = 0; // surface depth (embedded arrays) + char[][] ret; + ret.length = src.length / 3; // unlikely to need a longer array + uint k = 0; // current split piece + uint i = 0, j = 0; // current read location, start of current piece + + while (i < src.length) { + char c = src[i]; + if (c == '\'' || c == '"') { // string or character + ++i; + while (i < src.length && src[i] != c) { + if (src[i] == '\\') ++i; // escape seq. + ++i; + } // Doesn't throw if no terminal quote at end of src, but this should be caught later. + } + else if (c == '[') ++depth; + else if (c == ']') { + if (depth) --depth; + else throwException ("Invalid array literal: closes before end of data item."); + } + else if (c == ',' && depth == 0) { // only if not an embedded array + if (ret.length <= k) ret.length = ret.length * 2; + ret[k++] = src[j..i]; // add this piece and increment k + j = i + 1; + } + ++i; } - return x; + if (ret.length <= k) ret.length = k + 1; + ret[k] = src[j..i]; // add final piece (i >= j) + return ret[0..k+1]; } /* Throws an exception on invalid escape sequences. Supported escape sequences are the following @@ -196,17 +309,12 @@ static bool escCharsFilled; // will be initialised false if (!escCharsFilled) { - // map of all supported escape sequences - escChars['"'] = '"'; - escChars['\''] = '\''; - escChars['\\'] = '\\'; - escChars['a'] = '\a'; - escChars['b'] = '\b'; - escChars['f'] = '\f'; - escChars['n'] = '\n'; - escChars['r'] = '\r'; - escChars['t'] = '\t'; - escChars['v'] = '\v'; + // map of all supported escape sequences (cannot be static?) + escChars = ['"' : '"', '\'' : '\'', + '\\' : '\\', 'a' : '\a', + 'b' : '\b', 'f' : '\f', + 'n' : '\n', 'r' : '\r', + 't' : '\t', 'v' : '\v']; escCharsFilled = true; } @@ -228,10 +336,11 @@ } // Generic array reader +// Assumes input is of form "[xxxxx]" (i.e. first and last chars are '[', ']' and length >= 2). private T[] toArray(T : T[]) (char[] src) { T[] ret = new T[16]; // avoid unnecessary allocations uint i = 0; - foreach (char[] element; Util.quotes (src[1..$-1],",")) { + foreach (char[] element; split(src[1..$-1])) { if (i == ret.length) ret.length = ret.length * 2; ret[i] = parse!(T) (element); ++i; @@ -243,4 +352,8 @@ logger.warn (msg); // only small errors are trapped here throw new TextParseException (); } + +unittest { + // all utility functions should be well-enough used not to need testing +} //END Utility funcs