Mercurial > projects > mde
view mde/text/parse.d @ 7:b544c3a7c9ca
Some changes to exceptions and a few more debug commands.
committer: Diggory Hardy <diggory.hardy@gmail.com>
author | Diggory Hardy <diggory.hardy@gmail.com> |
---|---|
date | Wed, 16 Jan 2008 12:48:07 +0000 |
parents | dcb24afa0dce |
children | f63f4f41a2dc |
line wrap: on
line source
/**************************************************************************************************
 * This contains templates for converting a char[] to various data-types.
 *
 * Authors: Diggory Hardy, diggory.hardy@gmail.com
 * Copyright: Copyright © 2007 Diggory Hardy.
 * License: Licensed under the Academic Free License version 3.0
 *
 * This module basically implements the following templated function for $(B most) basic D types:
 * bool, byte, short, int, long, ubyte, ushort, uint, ulong, float, double, real, char.
 * It also supports arrays and associative arrays of any supported type (including of other arrays)
 * and has special handling for strings (char[]) and binary (ubyte[]) data-types.
 * -----------------------------
 * T parse(T) (char[] source);
 * -----------------------------
 *
 * The syntax is mostly the same used by D without any prefixes/suffixes (except 0x, 0b & 0o base
 * specifiers). The following escape sequences are supported for strings and characters: \' \" \\
 * \a \b \f \n \r \t \v . Associative array literals use the same syntax as D, described here:
 * $(LINK http://www.digitalmars.com/d/expression.html#AssocArrayLiteral).
 *
 * There are also a few utility functions defined; the public ones have their own documentation.
 *
 * On errors, a textParseException is thrown with a suitable message. No other exceptions should
 * be thrown and none thrown from functions used outside this module.
 *************************************************************************************************/
module mde.text.parse;

// package imports
import mde.text.exception;
import mde.text.util : postTrim;

// tango imports
import cInt = tango.text.convert.Integer;
import cFloat = tango.text.convert.Float;
import Util = tango.text.Util;

debug {
    import tango.util.log.Log : Log, Logger;

    private Logger logger;
}
static this () {
    debug logger = Log.getLogger ("mde.text.parse");
}

//BEGIN parse templates

// Associative arrays

const char[] AA_ERR = "Invalid associative array: ";

/** Parses an associative array literal of the form [KEY:DATA, KEY:DATA, ...].
 *
 * Each comma-separated pair is scanned for the first ':' that is not inside a quoted string or
 * character; the text before it is parsed as the key type S and the text after it as the value
 * type T.
 *
 * Throws: textParseException if src is not bracketed, if a quoted item before the ':' is
 * unterminated or ends in an unfinished escape sequence, if a pair has no ':', or if parsing the
 * key or the value fails.
 */
T[S] parse(T : T[S], S) (char[] src) {
    src = Util.trim(src);
    if (src.length < 2 || src[0] != '[' || src[$-1] != ']')
        throw new textParseException (AA_ERR ~ "not [ ... ]");  // bad braces.

    T[S] ret;
    foreach (char[] pair; split (src[1..$-1])) {
        uint i = 0;
        while (i < pair.length) {   // advance to the ':'
            char c = pair[i];
            if (c == ':') break;
            if (c == '\'' || c == '"') {    // string or character: skip to the matching quote
                ++i;
                while (i < pair.length && pair[i] != c) {
                    if (pair[i] == '\\') {
                        // need both the escaped char and a closing quote after the backslash
                        if (i+2 >= pair.length)
                            throw new textParseException (AA_ERR ~ "unfinished escape sequence within string/char");
                        ++i;    // escape seq.
                    }
                    ++i;
                }
                if (i == pair.length) {
                    // ran off the end while still inside the quoted item
                    debug logger.warn ("Pair is: " ~ pair);
                    throw new textParseException (AA_ERR ~ "unterminated string/char before ':'");
                }
            }
            ++i;
        }
        if (i == pair.length) {
            // no ':' was found anywhere in this pair
            throw new textParseException (AA_ERR ~ "encountered [ ... KEY] (missing :DATA)");
        }
        ret[parse!(S) (pair[0..i])] = parse!(T) (pair[i+1..$]);
    }
    return ret;
}
unittest {
    char[][char] X = parse!(char[][char]) (`['a':"animal", 'b':['b','u','s']]`);
    char[][char] Y = ['a':cast(char[])"animal", 'b':['b','u','s']];

    //FIXME: when the compiler's fixed...
    // just assert (X == Y)
    assert (X.length == Y.length);
    assert (X.keys == Y.keys);
    assert (X.values == Y.values);
    //X.rehash; Y.rehash;   // doesn't make a difference
    //assert (X == Y);      // fails
}

// Arrays

/** Parses a generic array literal of the form [x, ..., z].
 *
 * Throws: textParseException if src is not enclosed in square brackets or an element is invalid.
 */
T[] parse(T : T[]) (char[] src) {
    src = Util.trim(src);
    if (src.length >= 2 && src[0] == '[' && src[$-1] == ']') return toArray!(T[]) (src);
    throw new textParseException ("Invalid array: not [x, ..., z]");
}

/** Parses a string, given either as a double-quoted literal with escape sequences ("...") or as
 * an array of character literals (['a', ..., 'c']).
 *
 * Throws: textParseException on an invalid escape sequence, a trailing backslash or input in
 * neither form.
 */
T parse(T : char[]) (char[] src) {
    src = Util.trim(src);
    if (src.length >= 2 && src[0] == '"' && src[$-1] == '"') {
        src = src[1..$-1];
        T ret;
        ret.length = src.length;    // maximum length; retract to actual length later
        uint i = 0;                 // write position in ret
        for (uint t = 0; t < src.length;) {
            // process a block of non-escaped characters
            uint s = t;
            while (t < src.length && src[t] != '\\') ++t;   // non-escaped characters
            uint j = i + t - s;
            ret[i..j] = src[s..t];  // copy a block
            i = j;

            // process a block of escaped characters
            while (t < src.length && src[t] == '\\') {
                t++;
                if (t == src.length)
                    throw new textParseException ("Invalid string: ends \\\" !");   // next char is "
                ret[i++] = replaceEscapedChar (src[t++]);   // throws if it's invalid
            }
        }
        return ret[0..i];
    }
    else if (src.length >= 2 && src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);
    throw new textParseException ("Invalid string: not quoted (\"*\") or char array (['a',...,'c'])");
}

/** Parses binary data, given either as a standard array literal ([01, 0xF2, ...]) or as a plain
 * sequence of hex digits in which each pair of digits is one ubyte (01F2AC).
 *
 * Throws: textParseException on an odd number of hex digits, an invalid hex digit or an invalid
 * array element.
 */
T parse(T : ubyte[]) (char[] src) {
    src = Util.trim(src);
    // Standard case:
    if (src.length >= 2 && src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);
    // Special case: sequence of hex digits, each pair of which is a ubyte
    if (src.length % 2 == 1) throw new textParseException ("Invalid binary: odd number of chars");
    T ret;
    ret.length = src.length / 2;    // exact
    for (uint i, pos; pos + 1 < src.length; ++i) {
        // readHexChar advances pos by one on each call
        ubyte x = readHexChar(src, pos) << 4;
        x |= readHexChar(src, pos);
        ret[i] = x;
    }
    return ret;
}
unittest {
    assert (parse!(double[]) (`[1.0,1.0e-10]`) == [1.0, 1.0e-10]);    // generic array stuff
    assert (parse!(double[]) (`[  ]`) == cast(double[]) []);          // empty array

    // char[] and char conversions, with commas, escape sequences and multichar UTF8 characters:
    assert (parse!(char[][]) (`[ ".\"", [',','\''] ,"!\b€" ]`) == [ ".\"".dup, [',','\''] ,"!\b€" ]);

    assert (parse!(ubyte[]) (`01F2AC`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);              // ubyte[] special notation
    assert (parse!(ubyte[]) (`[01 ,0xF2, 0xAC]`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);    // ubyte[] std notation
}

/** Parses a single-quoted character literal: either one plain character ('c') or one escape
 * sequence ('\n').
 *
 * Throws: textParseException if src is not quoted, holds a lone backslash, holds an invalid
 * escape sequence or holds more than one character.
 */
T parse(T : char) (char[] src) {
    src = Util.trim(src);
    if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'')
        throw new textParseException ("Invalid char: not quoted ('c')");

    if (src.length == 3) {
        if (src[1] != '\\') return src[1];  // Either non escaped
        throw new textParseException ("Invalid char: unfinished escape sequence");
    }
    // Or escaped. The backslash must be checked: otherwise 'ab' would be read as '\b'.
    if (src.length == 4 && src[1] == '\\') return replaceEscapedChar (src[2]);

    // Report various errors; warnings for likely and difficult to tell cases:
    /+ This was caused by a bug. Shouldn't occur now normally.
    if (src[1] == '\\' && src.length == 3)
        throw new textParseException (`Warning: \' in char! There's currently no support for this during tokenising.
    Thus your input's probably been garbled!`);    // next char is '
    +/
    // Warn in case it's a multibyte UTF-8 character (all bytes of one have the top bit set):
    if ((src[1] & 0x80u) != 0)
        throw new textParseException ("Invalid char: too long (non-ASCII UTF-8 characters cannot be read as a single character)");
    throw new textParseException ("Invalid char: too long");
}
unittest {
    // remaining cases are covered by the array unittest above
    bool threw = false;
    try { parse!(char) ("'ab'"); }  // two plain characters must not be read as an escape
    catch (textParseException e) { threw = true; }
    assert (threw);
}

/** Parses a boolean: "true", "false", or a number which evaluates to 0 or 1 (leading zeros are
 * allowed).
 *
 * Throws: textParseException for anything else.
 */
T parse(T : bool) (char[] src) {
    src = Util.trim(src);
    if (src == "true") return true;
    if (src == "false") return false;
    uint pos;
    while (src.length > pos && src[pos] == '0') ++pos;  // strip leading zeros
    if (src.length == pos && pos > 0) return false;     // nothing but zeros
    if (src.length == pos + 1 && src[pos] == '1') return true;
    throw new textParseException ("Invalid bool: not true or false and doesn't evaluate to 0 or 1");
}
unittest {
    assert (parse!(bool[]) (`[true,false,01,00]`) == cast(bool[]) [1,0,1,0]);
}

// Integer types: all forward to toTInt.
T parse(T : byte) (char[] src) { return toTInt!(T) (src); }
T parse(T : short) (char[] src) { return toTInt!(T) (src); }
T parse(T : int) (char[] src) { return toTInt!(T) (src); }
T parse(T : long) (char[] src) { return toTInt!(T) (src); }
T parse(T : ubyte) (char[] src) { return toTInt!(T) (src); }
T parse(T : ushort) (char[] src) { return toTInt!(T) (src); }
T parse(T : uint) (char[] src) { return toTInt!(T) (src); }
T parse(T : ulong) (char[] src) { return toTInt!(T) (src); }
unittest {
    assert (parse!(byte) ("-5") == cast(byte) -5);
    assert (parse!(byte) ("-128") == byte.min);    // most negative value is in range
    // annoyingly, octal syntax differs from D (blame tango):
    assert (parse!(uint[]) ("[0b0100,0o724,0xFa59c,0xFFFFFFFF,0]") == [0b0100u,0724,0xFa59c,0xFFFFFFFF,0]);
}

// Floating point types: all forward to toTFloat.
T parse(T : float) (char[] src) { return toTFloat!(T) (src); }
T parse(T : double) (char[] src) { return toTFloat!(T) (src); }
T parse(T : real) (char[] src) { return toTFloat!(T) (src); }
unittest {
    assert (parse!(float) ("0.0") == 0.0f);
    assert (parse!(double) ("-1e25") == -1e25);
    assert (parse!(real) ("5.24e-269") == cast(real) 5.24e-269);
}
//END parse templates

//BEGIN Utility funcs
/** Templated read-int function to read (un)signed 1-8 byte integers.
 *
 * Actually a reimplementation of tango.text.convert.Integer toLong and parse functions.
 *
 * The magnitude is read as a ulong and then range-checked against TInt. A negative sign is only
 * accepted for signed types, and the full range down to TInt.min is accepted. Trailing spaces and
 * tabs are allowed.
 *
 * Throws: textParseException if there are no digits, if anything other than spaces or tabs
 * follows the number, or if the value does not fit in TInt.
 */
TInt toTInt(TInt) (char[] src) {
    const char[] INT_OUT_OF_RANGE = "Integer out of range";

    bool sign;
    uint radix, ate, ate2;

    // cInt.trim is used here to consume any sign/radix prefix; ate counts the chars it used
    ate = cInt.trim (src, sign, radix);
    if (ate == src.length) throw new textParseException ("Invalid integer: no digits");
    ulong val = cInt.convert (src[ate..$], radix, &ate2);
    ate += ate2;

    while (ate < src.length) {
        if (src[ate] == ' ' || src[ate] == '\t') ++ate;
        else throw new textParseException ("Invalid integer");
    }

    if (sign) {
        static if (TInt.min == 0) {
            // unsigned target: a negative value can never be represented
            throw new textParseException (INT_OUT_OF_RANGE);
        } else {
            // the magnitude of TInt.min is TInt.max + 1
            if (val > (cast(ulong) TInt.max) + 1) throw new textParseException (INT_OUT_OF_RANGE);
            // for val == 2^63 the negation wraps back to long.min, which is the wanted result
            return cast(TInt) (-cast(long) val);
        }
    }
    if (val > TInt.max) throw new textParseException (INT_OUT_OF_RANGE);
    return cast(TInt) val;
}

/** Basically a reimplementation of tango.text.convert.Float.toFloat which checks for trailing
 * whitespace before throwing an exception for overlong input and throws my exception class
 * when it does.
 *
 * Throws: textParseException if src is empty after postTrim, or if cFloat.parse did not consume
 * the whole of the remaining input.
 */
TFloat toTFloat(TFloat) (char[] src) {
    src = postTrim (src);   // presumably strips trailing whitespace; see mde.text.util
    if (src == "") throw new textParseException ("Invalid float: no digits");
    uint ate;
    TFloat x = cFloat.parse (src, &ate);
    // anything left unconsumed is garbage after (or instead of) the number
    if (ate < src.length) throw new textParseException ("Invalid float: unexpected characters");
    return x;
}

/** Splits a string into substrings separated by '$(B ,)' with support for characters and strings
 * containing escape sequences and for embedded arrays ($(B [...])).
 *
 * Empty strings may get returned.
 *
 * Throws: textParseException if a ']' is found without a matching '['.
 */
char[][] split (char[] src) {
    src = Util.trim (src);
    if (src == "") return [];   // empty array: no elements when no data

    uint depth = 0;             // surface depth (embedded arrays)
    char[][] ret;
    ret.length = src.length / 3;    // unlikely to need a longer array
    uint k = 0;                 // current split piece
    uint i = 0, j = 0;          // current read location, start of current piece

    while (i < src.length) {
        char c = src[i];
        if (c == '\'' || c == '"') {    // string or character
            ++i;
            while (i < src.length && src[i] != c) {
                if (src[i] == '\\') ++i;    // escape seq.
                ++i;
            }
            // Doesn't throw if no terminal quote at end of src, but this should be caught later.
        }
        else if (c == '[') ++depth;
        else if (c == ']') {
            if (depth) --depth;
            else throw new textParseException ("Invalid array literal: closes before end of data item.");
        }
        else if (c == ',' && depth == 0) {  // only if not an embedded array
            // "* 2 + 1" so that a zero-length buffer (src shorter than 3 chars) still grows
            if (ret.length <= k) ret.length = ret.length * 2 + 1;
            ret[k++] = src[j..i];   // add this piece and increment k
            j = i + 1;
        }
        ++i;
    }
    if (ret.length <= k) ret.length = k + 1;
    ret[k] = src[j..i];     // add final piece (i >= j)
    return ret[0..k+1];
}
unittest {
    // inputs shorter than three characters must not overrun the result buffer
    assert (split ("a,").length == 2);
    assert (split (",").length == 2);
    assert (split ("").length == 0);
}

/* Throws an exception on invalid escape sequences. Supported escape sequences are the following
 * subset of those supported by D: \" \' \\ \a \b \f \n \r \t \v
 *
 * c is the character following the backslash; the character it stands for is returned.
 */
private char replaceEscapedChar (char c)
{
    static char[char] escChars;
    static bool escCharsFilled;     // will be initialised false

    if (!escCharsFilled) {  // map of all supported escape sequences (cannot be static?)
        escChars = ['"' : '"', '\'' : '\'', '\\' : '\\',
                    'a' : '\a', 'b' : '\b', 'f' : '\f',
                    'n' : '\n', 'r' : '\r', 't' : '\t', 'v' : '\v'];
        escCharsFilled = true;
    }

    char* r = c in escChars;
    if (r != null) return *r;

    throw new textParseException ("Invalid escape sequence: \\"~c);  // we didn't return, so something failed
}

// Reads one hex char: [0-9A-Fa-f]. Otherwise throws an exception. Doesn't check src.length.
// On success pos is advanced past the digit and the digit's value (0-15) is returned.
private ubyte readHexChar (char[] src, inout uint pos) {
    ubyte x;
    if (src[pos] >= '0' && src[pos] <= '9') x = src[pos] - '0';
    else if (src[pos] >= 'A' && src[pos] <= 'F') x = src[pos] - 'A' + 10;
    else if (src[pos] >= 'a' && src[pos] <= 'f') x = src[pos] - 'a' + 10;
    else throw new textParseException ("Invalid hex digit.");
    ++pos;
    return x;
}

// Generic array reader
// Assumes input is of form "[xxxxx]" (i.e. first and last chars are '[', ']' and length >= 2).
// Parses every comma-separated element between the enclosing brackets of src as a T and returns
// the elements in order. The caller must already have checked the brackets.
private T[] toArray(T : T[]) (char[] src) {
    T[] buffer = new T[16];     // start with room for 16 elements to limit reallocations
    char[][] pieces = split (src[1..$-1]);  // strip '[' and ']' before splitting
    uint count = 0;             // number of elements stored so far

    for (uint n = 0; n < pieces.length; ++n) {
        // double the buffer whenever it is full
        if (count == buffer.length) buffer.length = buffer.length * 2;
        buffer[count++] = parse!(T) (pieces[n]);
    }
    return buffer[0..count];    // cut off the unused tail
}

unittest {
    // all utility functions should be well-enough used not to need testing
}
//END Utility funcs