projects/mde: mde/mergetag/parse/parseTo.d comparison

comparison mde/mergetag/parse/parseTo.d @ 70:7fc0a8295c83

Moved my parseTo and parseFrom modules from tango.scrapple to mde in order to reduce dependencies.

author	Diggory Hardy <diggory.hardy@gmail.com>
date	Fri, 04 Jul 2008 19:04:16 +0100
parents
children

comparison

equal deleted inserted replaced

-:ead4afc6d0b8
+:7fc0a8295c83
+/**************************************************************************************************
+* copyright: Copyright (c) 2007-2008 Diggory Hardy.
+*
+* author: Diggory Hardy, diggory.hardy@gmail.com
+*
+* license: BSD style: $(LICENSE)
+*
+* This contains templates for converting a char[] to various data-types.
+*
+* parseTo is roughly the inverse of $(B parseFrom) and should read any data output by $(B parseFrom).
+* It is also available in tango.scrapple.
+*
+* This module basically implements the following templated function for most basic D types:
+* bool, byte, short, int, long, ubyte, ushort, uint, ulong, float, double, real, char.
+* It also supports arrays and associative arrays of any supported type (including of other arrays)
+* and has special handling for strings (char[]) and binary (ubyte[]) data-types.
+* -----------------------------
+* T parseTo(T) (char[] source);
+* -----------------------------
+*
+* $(I source) is the string to parse, and data of the templated type that is read from the string
+* is returned. See the examples to get a better idea of its use.
+*
+* Syntax:
+* The syntax for parsing $(I source) is mostly the same used by D without any prefixes/suffixes
+* (except 0x, 0b & 0o base specifiers). Also a special ubyte[] syntax is supported; see examples.
+* The following escape sequences are supported for strings and characters: \' \" \\
+* \a \b \f \n \r \t \v . Associative array literals use the same syntax as D, described here:
+* $(LINK http://www.digitalmars.com/d/2.0/expression.html#AssocArrayLiteral). All whitespace is
+* ignored (except of course within strings).
+*
+* There are also some public utility functions with their own documentation.
+*
+* Throws:
+* On errors, a ParseException or a UnicodeException (both extend TextException) is thrown with a
+* suitable message. No other exceptions should be thrown.
+*
+* Remarks:
+* There is currently no support for reading wchar/dchar strings. There are, however, unicode
+* conversions for converting UTF-8 to UTF-16/32. Be careful if converting on a char-by-char basis;
+* such conversions cannot be used for non-ascii characters.
+*
+* Examples:
+* ------------------------------------------------------------------------------------------------
+* // Basic examples:
+* ulong        a = parseTo!(ulong) ("20350");
+* float        d = parseTo!(float) ("  1.2e-9 ");
+* int[]        b = parseTo!(int[]) ("[0,1,2,3]");
+*
+* // String and char[] syntax:
+* char[]       c = parseTo!(char[]) ("\"A string\"");
+* char[]       e = parseTo!(char[]) ("['a','n','o','t','h','e','r', ' ' ,'s','t','r','i','n','g']");
+*
+* // These can be used interchangeably; here's a more complex example of an associative array:
+* bool[char[]] f = parseTo!(bool[char[]]) ("[ \"one\":true, ['t','w','o']:false, \"three\":1, \"four\":000 ]");
+*
+* // There is also a special notation for ubyte[] types:
+* // The digits following 0x must be in pairs and each specify one ubyte.
+* assert ( parseTo!(ubyte[]) (`0x01F2AC`) == parseTo!(ubyte[]) (`[01 ,0xF2, 0xAC]`) );
+*
+* // There's no limit to the complexity!
+* char[char[][][][char]][bool] z = ...; // don't expect me to write this!
+* ------------------------------------------------------------------------------------------------
+*************************************************************************************************/
+module mde.mergetag.parse.parseTo;
+// tango imports
+import tango.core.Exception : TextException, UnicodeException;
+import cInt = tango.text.convert.Integer;
+import cFloat = tango.text.convert.Float;
+import Utf = tango.text.convert.Utf;
+import Util = tango.text.Util;
+/**
+* Base class for parseTo exceptions.
+*/
+class ParseException : TextException
+{
+    /// Creates the exception, handing msg straight to TextException.
+    this (char[] msg) { super (msg); }
+}
+//BEGIN parseTo templates
+// Associative arrays
+const char[] AA_ERR = "Invalid associative array: ";
+/* Parses an associative array literal of the form [ KEY:DATA, KEY:DATA, ... ].
+*
+* The bracketed body is cut into KEY:DATA pairs with split(); each pair is then divided at the
+* first ':' lying outside a quoted string/character, and both halves are parsed recursively.
+*
+* Throws:
+*   ParseException if the brackets are missing or a pair has no ':' separator (plus whatever the
+*   key/value parsers throw).
+*/
+T[S] parseTo(T : T[S], S) (char[] src) {
+    src = Util.trim(src);
+    bool bracketed = src.length >= 2 && src[0] == '[' && src[$-1] == ']';
+    if (!bracketed)
+        throw new ParseException (AA_ERR ~ "not [ ... ]");	// bad braces.
+
+    T[S] result;
+    foreach (char[] entry; split (src[1..$-1])) {
+        // Locate the ':' that separates key from data.
+        uint pos = 0;
+        for (; pos < entry.length; ++pos) {
+            char ch = entry[pos];
+            if (ch == ':') break;
+            if (ch != '\'' && ch != '"') continue;
+
+            // Quoted string or character: skip to the matching closing quote.
+            for (++pos; pos < entry.length && entry[pos] != ch; ++pos) {
+                if (entry[pos] == '\\') {
+                    if (pos+2 >= entry.length)
+                        throw new ParseException (AA_ERR ~ "unfinished escape sequence within string/char");
+                    ++pos;	// step over the escaped character
+                }
+            }
+            if (pos == entry.length)
+                throw new ParseException (AA_ERR ~ "encountered [ ... KEY] (missing :DATA)");
+        }
+        if (pos == entry.length)
+            throw new ParseException (AA_ERR ~ "encountered [ ... KEY:] (missing DATA)");
+
+        result[parseTo!(S) (entry[0..pos])] = parseTo!(T) (entry[pos+1..$]);
+    }
+    return result;
+}
+debug (UnitTest) unittest {
+    // Parse an associative array whose values use both string notations.
+    char[][char] parsed = parseTo!(char[][char]) (`['a':"animal", 'b':['b','u','s']]`);
+    char[][char] expected = ['a':cast(char[])"animal", 'b':['b','u','s']];
+
+    //FIXME: when the compiler's fixed: http://d.puremagic.com/issues/show_bug.cgi?id=1671
+    // just assert (parsed == expected)
+    // Until then compare length, keys and values separately.
+    assert (parsed.length == expected.length);
+    assert (parsed.keys == expected.keys);
+    assert (parsed.values == expected.values);
+
+    //parsed.rehash; expected.rehash;	// doesn't make a difference
+    //assert (parsed == expected);		// fails (compiler bug)
+}
+// Arrays
+/* Generic array: the trimmed source must be enclosed in [ ... ]; the elements are parsed by
+* toArray. */
+T[] parseTo(T : T[]) (char[] src) {
+    src = Util.trim(src);
+    if (src.length < 2 || src[0] != '[' || src[$-1] != ']')
+        throw new ParseException ("Invalid array: not [x, ..., z]");
+    return toArray!(T[]) (src);
+}
+// String (array special case)
+/* Reads a string either as a quoted literal ("...") with escape sequences, or as an array of
+* character literals (['a', ...]).
+*
+* Throws:
+*   ParseException if src is neither form, ends in a lone backslash, or contains an unsupported
+*   escape sequence.
+*/
+T parseTo(T : char[]) (char[] src) {
+    src = Util.trim(src);
+    bool longEnough = src.length >= 2;
+
+    if (longEnough && src[0] == '"' && src[$-1] == '"') {
+        char[] inner = src[1..$-1];
+        T ret;
+        ret.length = inner.length;	// upper bound; escapes only ever shrink the result
+        uint written = 0;
+        uint rd = 0;
+        while (rd < inner.length) {
+            if (inner[rd] != '\\') {
+                // ordinary character: copy verbatim
+                ret[written++] = inner[rd++];
+                continue;
+            }
+            ++rd;	// skip the backslash
+            if (rd == inner.length) throw new ParseException ("Invalid string: ends \\\" !");	// next char is "
+            ret[written++] = replaceEscapedChar (inner[rd++]);	// throws if it's invalid
+        }
+        return ret[0..written];
+    }
+    else if (longEnough && src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);
+    throw new ParseException ("Invalid string: not quoted (\"*\") or char array (['a',...,'c'])");
+}
+// Unicode conversions for strings:
+/* UTF-16 string: parse as a UTF-8 string first, then transcode. */
+T parseTo(T : wchar[]) (char[] src) {
+    char[] utf8 = parseTo!(char[]) (src);
+    // A UnicodeException from the conversion is deliberately left to propagate.
+    return Utf.toString16 (utf8);
+}
+/* UTF-32 string: parse as a UTF-8 string first, then transcode. */
+T parseTo(T : dchar[]) (char[] src) {
+    char[] utf8 = parseTo!(char[]) (src);
+    // A UnicodeException from the conversion is deliberately left to propagate.
+    return Utf.toString32 (utf8);
+}
+// Binary (array special case)
+/* Binary data: either a normal array literal ([1, 0xF2, ...]) or the compact form 0xHHHH...,
+* in which every pair of hex digits gives one ubyte.
+*
+* Throws:
+*   ParseException if src is neither form, has an odd number of hex digits, or contains a
+*   non-hex digit.
+*/
+T parseTo(T : ubyte[]) (char[] src) {
+    src = Util.trim(src);
+    bool hasPair = src.length >= 2;
+
+    // Standard case:
+    if (hasPair && src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);
+
+    // Otherwise only the 0x... notation is acceptable.
+    if (!hasPair || src[0..2] != "0x")
+        throw new ParseException ("Invalid ubyte[]: not an array and doesn't start 0x");
+
+    char[] digits = src[2..$];	// strip down to actual digits
+    if (digits.length % 2 == 1) throw new ParseException ("Invalid binary: odd number of chars");
+
+    T ret;
+    ret.length = digits.length / 2;	// exact
+    uint pos = 0;			// advanced by readHexChar
+    for (uint n = 0; n < ret.length; ++n) {
+        ubyte x = readHexChar(digits, pos) << 4;	// high nibble
+        x |= readHexChar(digits, pos);		// low nibble
+        ret[n] = x;
+    }
+    return ret;
+}
+debug (UnitTest) unittest {
+// Exercises the generic-array, string, wide-string and ubyte[] overloads of parseTo.
+assert (parseTo!(double[]) (`[1.0,1.0e-10]`) == [1.0, 1.0e-10]);   // generic array stuff
+assert (parseTo!(double[]) (`[     ]`) == cast(double[]) []);      // empty array
+// char[] and char conversions, with commas, escape sequences and multichar UTF8 characters:
+assert (parseTo!(char[][]) (`[ ".\"", [',','\''] ,"!\b€" ]`) == [ ".\"".dup, [',','\''] ,"!\b€" ]);
+// wchar[] and dchar[] conversions:
+// The characters were pretty much pulled at random from unicode tables.
+// The last few cause some weird (display only) effects in my editor.
+assert (parseTo!(wchar[]) ("\"Test string: ¶α؟अกሀ搀\"") == "Test string: ¶α؟अกሀ搀"w);
+assert (parseTo!(dchar[]) ("\"Test string: ¶α؟अกሀ搀\"") == "Test string: ¶α؟अกሀ搀"d);
+assert (parseTo!(ubyte[]) (`0x01F2AC`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);	// ubyte[] special notation
+assert (parseTo!(ubyte[]) (`[01 ,0xF2, 0xAC]`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);	// ubyte[] std notation
+}
+// Basic types
+// Char
+/* Reads a single character literal: 'c' or an escaped character such as '\n'.
+*
+* Throws:
+*   ParseException if src is not quoted, holds an unfinished or unsupported escape sequence, or
+*   holds more than one character; UnicodeException if the content looks like a multibyte
+*   (non-ASCII) UTF-8 sequence, which cannot be returned as one char.
+*/
+T parseTo(T : char) (char[] src) {
+    src = Util.trim(src);
+    if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'')
+        throw new ParseException ("Invalid char: not quoted (e.g. 'c')");
+    if (src[1] == '\\') {
+        // Escaped character: must be exactly '\x'. Only treat src[2] as an escape code when
+        // it really follows a backslash.
+        if (src.length == 4) return replaceEscapedChar (src[2]);
+        if (src.length == 3) throw new ParseException ("Invalid char: unfinished escape sequence");
+    }
+    else if (src.length == 3) return src[1];	// plain, non-escaped character
+    // Report various errors; warnings for likely and difficult to tell cases:
+    // Warn in case it's a multibyte UTF-8 character. Only bytes with the top bit set belong to
+    // a multibyte sequence; testing 0xC0 would also match ASCII 0x40-0x7F.
+    if (src[1] & 0x80u) throw new UnicodeException ("Invalid char: too long (non-ASCII UTF-8 characters cannot be read as a single character)", 1);
+    throw new ParseException ("Invalid char: too long");
+}
+/* Basic unicode conversions for wide-chars.
+* NOTE: c > 127 signals the start of a multibyte UTF-8 sequence which must be converted for
+* UTF-16/32. But since we don't know what the next bytes are we can't do the conversion. */
+const char[] WIDE_CHAR_ERROR = "Error: unicode non-ascii character cannot be converted from a single UTF-8 char";
+/* UTF-16 character: read as a char and widen it; only ASCII values can be widened safely. */
+T parseTo(T : wchar) (char[] src) {
+    char narrow = parseTo!(char) (src);
+    if (narrow > 127u) throw new UnicodeException (WIDE_CHAR_ERROR, 1);
+    return cast(wchar) narrow;	// this char can be converted
+}
+/* UTF-32 character: read as a char and widen it; only ASCII values can be widened safely. */
+T parseTo(T : dchar) (char[] src) {
+    char narrow = parseTo!(char) (src);
+    if (narrow > 127u) throw new UnicodeException (WIDE_CHAR_ERROR, 1);
+    return cast(dchar) narrow;	// this char can be converted
+}
+debug (UnitTest) unittest {
+// An escaped single quote, then plain ASCII characters read as wchar and dchar.
+assert (parseTo!(char) ("\'\\\'\'") == '\'');
+assert (parseTo!(wchar) ("'X'") == 'X');
+assert (parseTo!(dchar) ("'X'") == 'X');
+}
+// Bool
+/* Boolean: accepts the words true/false, or a number that is all zeros (false) or any number of
+* leading zeros followed by a single 1 (true). */
+T parseTo(T : bool) (char[] src) {
+    src = Util.trim(src);
+    if (src == "true") return true;
+    if (src == "false") return false;
+
+    // Count leading zeros.
+    uint zeros = 0;
+    while (zeros < src.length && src[zeros] == '0') ++zeros;
+
+    if (zeros == src.length) {
+        if (zeros > 0) return false;	// one or more zeros and nothing else
+    }
+    else if (zeros + 1 == src.length && src[zeros] == '1') return true;
+
+    throw new ParseException ("Invalid bool: not true or false and doesn't evaluate to 0 or 1");
+}
+debug (UnitTest) unittest {
+// Both the word form and the numeric form (with leading zeros) are accepted.
+assert (parseTo!(bool[]) (`[true,false,01,00]`) == cast(bool[]) [1,0,1,0]);
+}
+// Ints
+// One specialisation per integer type; each forwards to the shared toTInt template.
+// Signed:
+T parseTo(T : byte)   (char[] src) { return toTInt!(T) (src); }
+T parseTo(T : short)  (char[] src) { return toTInt!(T) (src); }
+T parseTo(T : int)    (char[] src) { return toTInt!(T) (src); }
+T parseTo(T : long)   (char[] src) { return toTInt!(T) (src); }
+// Unsigned:
+T parseTo(T : ubyte)  (char[] src) { return toTInt!(T) (src); }
+T parseTo(T : ushort) (char[] src) { return toTInt!(T) (src); }
+T parseTo(T : uint)   (char[] src) { return toTInt!(T) (src); }
+T parseTo(T : ulong)  (char[] src) { return toTInt!(T) (src); }
+debug (UnitTest) unittest {
+// A negative value, then binary (0b), octal (0o) and hex (0x) prefixes inside an array.
+assert (parseTo!(byte) ("-5") == cast(byte) -5);
+// annoyingly, octal syntax differs from D (blame tango):
+assert (parseTo!(uint[]) ("[0b0100,0o724,0xFa59c,0xFFFFFFFF,0]") == [0b0100u,0724,0xFa59c,0xFFFFFFFF,0]);
+}
+// Floats
+// One specialisation per floating-point type; each forwards to the shared toTFloat template.
+T parseTo(T : float)  (char[] src) { return toTFloat!(T) (src); }
+T parseTo(T : double) (char[] src) { return toTFloat!(T) (src); }
+T parseTo(T : real)   (char[] src) { return toTFloat!(T) (src); }
+debug (UnitTest) unittest {
+// One value per floating-point type: zero, a large negative and a very small positive.
+assert (parseTo!(float) ("0.0") == 0.0f);
+assert (parseTo!(double) ("-1e25") == -1e25);
+assert (parseTo!(real) ("5.24e-269") == cast(real) 5.24e-269);
+}
+//END parseTo templates
+//BEGIN Utility funcs
+/** Trims whitespace at ends of string and checks for and removes array brackets: []
+*
+* Throws:
+*   ParseException if brackets aren't the first and last non-whitespace characters.
+*
+* Returns:
+*   String without brackets (and whitespace outside those brackets). Useful for passing to split.
+*/
+char[] stripBrackets (char[] src) {
+    src = Util.trim(src);
+    // Reject anything that is not at least "[]" with brackets at both ends.
+    if (src.length < 2 || src[0] != '[' || src[$-1] != ']')
+        throw new ParseException ("Invalid bracketed string: not [...]");
+    return src[1..$-1];
+}
+/** Splits a string into substrings separated by '$(B ,)' with support for characters and strings
+* containing escape sequences and for embedded arrays ($(B [...])).
+*
+* Params:
+*     src A string to separate on commas. Where used for parsing arrays, the brackets enclosing
+*     the array should be removed before calling this function (stripBrackets can do this).
+*
+* Returns:
+*     An array of substrings within src, excluding commas. Whitespace is not stripped and
+*     empty strings may get returned.
+*
+* Remarks:
+*     This function is primarily intended as a utility function for use by the templates
+*     parsing arrays and associative arrays, but it may be useful in other cases too. Hence the
+*     fact no brackets are stripped from src.
+*/
+char[][] split (char[] src) {
+    src = Util.trim (src);
+    if (src == "") return [];		// empty array: no elements when no data
+    uint depth = 0;			// surface depth (embedded arrays)
+    char[][] ret;
+    // Initial guess at the number of pieces. The "+ 1" guarantees a non-zero length, since the
+    // doubling below could never grow a zero-length array (inputs shorter than 3 characters,
+    // e.g. "1,", would then index out of bounds).
+    ret.length = src.length / 3 + 1;
+    uint k = 0;				// current split piece
+    uint i = 0, j = 0;			// current read location, start of current piece
+    while (i < src.length) {
+        char c = src[i];
+        if (c == '\'' || c == '"') {	// string or character
+            ++i;
+            while (i < src.length && src[i] != c) {
+                if (src[i] == '\\') ++i;	// escape seq.
+                ++i;
+            }	// Doesn't throw if no terminal quote at end of src, but this should be caught later.
+        }
+        else if (c == '[') ++depth;
+        else if (c == ']') {
+            if (depth) --depth;
+            else throw new ParseException ("Invalid array literal: closes before end of data item.");
+        }
+        else if (c == ',' && depth == 0) {		// only if not an embedded array
+            if (ret.length <= k) ret.length = ret.length * 2;
+            ret[k++] = src[j..i];	// add this piece and increment k
+            j = i + 1;
+        }
+        ++i;
+    }
+    // An unterminated quote or a trailing backslash leaves i beyond the end of src; clamp it so
+    // the final slice stays in bounds. The malformed piece is left for the element parser to
+    // reject.
+    if (i > src.length) i = src.length;
+    if (ret.length <= k) ret.length = k + 1;
+    ret[k] = src[j..i];		// add final piece (i >= j)
+    return ret[0..k+1];
+}
+/* Templated read-int function to read (un)signed 1-8 byte integers.
+*
+* Actually a reimplementation of tango.text.convert.Integer toLong and parse functions.
+*
+* Params:
+*   src = Text holding exactly one integer, optionally signed and with a radix prefix;
+*         surrounding whitespace is ignored.
+*
+* Returns:
+*   The value converted to TInt.
+*
+* Throws:
+*   ParseException if there are no digits, if any character is left over after the number, or
+*   if the value does not fit in TInt.
+*/
+private TInt toTInt(TInt) (char[] src) {
+    const char[] INT_OUT_OF_RANGE = "Integer out of range";
+    bool sign;
+    uint radix, ate, ate2;
+    // Trim off whitespace.
+    // NOTE: Cannot use tango.text.convert.Integer.trim to trim leading whitespace since it doesn't
+    // treat new-lines, etc. as whitespace which for our purposes is whitespace.
+    src = Util.trim (src);
+    ate = cInt.trim (src, sign, radix);	// consumes sign and radix prefix
+    if (ate == src.length) throw new ParseException ("Invalid integer: no digits");
+    ulong val = cInt.convert (src[ate..$], radix, &ate2);	// magnitude only
+    ate += ate2;
+    if (ate < src.length)
+        throw new ParseException ("Invalid integer at marked character: \"" ~ src[0..ate] ~ "'" ~ src[ate] ~ "'" ~ src[ate+1..$] ~ "\"");
+    if (sign) {
+        // Largest magnitude allowed for a negative value: |TInt.min| for signed types (one more
+        // than TInt.max), and 0 for unsigned types. Checking the magnitude before negating
+        // accepts TInt.min itself and stops a huge ulong magnitude wrapping round to a
+        // positive number.
+        ulong limit = (TInt.min < 0) ? cast(ulong) TInt.max + 1 : 0;
+        if (val > limit) throw new ParseException (INT_OUT_OF_RANGE);
+        return cast(TInt) (-cast(long) val);
+    }
+    if (val > TInt.max) throw new ParseException (INT_OUT_OF_RANGE);
+    return cast(TInt) val;
+}
+/* Basically a reimplementation of tango.text.convert.Float.toFloat which checks for
+* whitespace before throwing an exception for overlong input.
+*
+* Params:
+*   src = Text holding exactly one floating-point number; surrounding whitespace is ignored.
+*
+* Throws:
+*   ParseException if src is empty or if any character is left over after the number.
+*/
+private TFloat toTFloat(TFloat) (char[] src) {
+    // NOTE: As for toTInt(), this needs to strip leading as well as trailing whitespace.
+    src = Util.trim (src);
+    if (src == "") throw new ParseException ("Invalid float: no digits");
+    uint ate;
+    TFloat x = cFloat.parse (src, &ate);
+    // Whitespace is already trimmed, so anything the parser did not consume is garbage.
+    // NOTE(review): relies on cFloat.parse reporting the full consumed length through ate;
+    // confirm against the Tango version in use (e.g. for a zero value with an exponent).
+    if (ate < src.length)
+        throw new ParseException ("Invalid float at marked character: \"" ~ src[0..ate] ~ "'" ~ src[ate] ~ "'" ~ src[ate+1..$] ~ "\"");
+    return x;
+}
+/* Throws an exception on invalid escape sequences. Supported escape sequences are the following
+* subset of those supported by D: \" \' \\ \a \b \f \n \r \t \v
+*/
+private char replaceEscapedChar (char c)
+{
+    // Map the character following a backslash to the character it stands for.
+    switch (c) {
+        case '\"': return '\"';
+        case '\'': return '\'';
+        case '\\': return '\\';
+        case 'a':  return '\a';
+        case 'b':  return '\b';
+        case 'f':  return '\f';
+        case 'n':  return '\n';
+        case 'r':  return '\r';
+        case 't':  return '\t';
+        case 'v':  return '\v';
+        default:   break;
+    }
+    // anything else is not a supported escape sequence:
+    throw new ParseException ("Invalid escape sequence: \\"~c);
+}
+// Reads one hex char: [0-9A-Fa-f]. Otherwise throws an exception. Doesn't check src.length.
+private ubyte readHexChar (char[] src, inout uint pos) {
+    char digit = src[pos];
+    ubyte value;
+    if (digit >= '0' && digit <= '9')      value = digit - '0';
+    else if (digit >= 'A' && digit <= 'F') value = digit - 'A' + 10;
+    else if (digit >= 'a' && digit <= 'f') value = digit - 'a' + 10;
+    else throw new ParseException ("Invalid hex digit.");
+    ++pos;	// only advance past a valid digit
+    return value;
+}
+// Generic array reader
+// Assumes input is of form "[xxxxx]" (i.e. first and last chars are '[', ']' and length >= 2).
+private T[] toArray(T : T[]) (char[] src) {
+    T[] items = new T[16];	// start with some room to avoid unnecessary allocations
+    uint count = 0;		// number of elements parsed so far
+    foreach (char[] piece; split(src[1..$-1])) {
+        if (count == items.length) items.length = items.length * 2;	// full: double capacity
+        items[count] = parseTo!(T) (piece);
+        count += 1;
+    }
+    return items[0..count];	// drop the unused tail
+}
+debug (UnitTest) {
+// Console output is only needed by the unittest below.
+import tango.io.Console;
+unittest {
+Cout ("Running unittest: parseTo ...").flush;
+// Every supported escape sequence, inside one quoted string.
+assert (parseTo!(char[]) ("\"\\a\\b\\t\\n\\v\\f\\r\\\"\\\'\\\\\"") == "\a\b\t\n\v\f\r\"\'\\");
+Cout (" complete").newline;
+}
+}
+//END Utility funcs

Mercurial > projects > mde

comparison mde/mergetag/parse/parseTo.d @ 70:7fc0a8295c83