Mercurial > projects > mde

diff mde/text/parse.d @ 4:9a990644948c
Many changes: upgraded to tango 0.99.4, reorganised mde/input, large changes to mde/mergetag and mde/init, separated off test/MTTest.d and more. committer: Diggory Hardy <diggory.hardy@gmail.com>
author: Diggory Hardy <diggory.hardy@gmail.com>
date: Sun, 06 Jan 2008 17:38:51 +0000
parents: 485c98ecbd91
children: dcb24afa0dce
--- a/mde/text/parse.d	Sat Nov 03 16:06:06 2007 +0000
+++ b/mde/text/parse.d	Sun Jan 06 17:38:51 2008 +0000
@@ -1,17 +1,23 @@
 /**************************************************************************************************
  * This contains templates for converting a char[] to various data-types.
  *
- * Copyright (c) 2007 Diggory Hardy.
- * Licensed under the Academic Free License version 3.0
+ * Authors: Diggory Hardy, diggory.hardy@gmail.com
+ * Copyright: Copyright © 2007 Diggory Hardy.
+ * License: Licensed under the Academic Free License version 3.0
  *
  * This module basically implements the following templated function for $(B most) basic D types:
  * bool, byte, short, int, long, ubyte, ushort, uint, ulong, float, double, real, char.
- * It also supports arrays of any supported type (including of other arrays) and has special
- * handling for strings (char[]) and binary (ubyte[]) data-types.
+ * It also supports arrays and associative arrays of any supported type (including of other arrays)
+ * and has special handling for strings (char[]) and binary (ubyte[]) data-types.
  * -----------------------------
  * T parse(T) (char[] source);
  * -----------------------------
  *
+ * The syntax is mostly the same as that used by D, without any prefixes/suffixes (except 0x, 0b & 0o base
+ * specifiers). The following escape sequences are supported for strings and characters: \' \" \\
+ * \a \b \f \n \r \t \v . Associative array literals use the same syntax as D, described here:
+ * $(LINK http://www.digitalmars.com/d/expression.html#AssocArrayLiteral).
+ *
  * There are also a few utility functions defined; the public ones have their own documentation.
  *
  * On errors, a warning is logged and a TextParseException is thrown. No other exceptions should
@@ -21,6 +27,7 @@
 
 // package imports
 import mde.text.exception;
+import mde.text.util : postTrim;
 
 // tango imports
 import cInt = tango.text.convert.Integer;
@@ -34,11 +41,54 @@
 }
 
 //BEGIN parse templates
+// Associative arrays: parses "[k1:v1, k2:v2, ...]"; bad syntax is logged and raises TextParseException.
+T[S] parse(T : T[S], S) (char[] src) {
+    src = Util.trim(src);
+    if (src.length < 2 || src[0] != '[' || src[$-1] != ']')
+        throwException ("Invalid associative array: not [a:x, ..., c:z]");
+    // Each top-level, comma-separated piece returned by split must be one key:value pair.
+    T[S] ret;
+    foreach (char[] pair; split (src[1..$-1])) {
+        uint i = 0;
+        while (i < pair.length) {	// advance to the first ':' that is not inside a quoted key
+            char c = pair[i];
+            if (c == ':') break;
+            if (c == '\'' || c == '"') {	// string or character
+                ++i;
+                while (i < pair.length && pair[i] != c) {
+                    if (pair[i] == '\\') ++i;	// escape seq.
+                    ++i;
+                }	// No terminal quote: i ends up at or beyond pair.length and is rejected by the check below.
+            }
+            ++i;
+        }
+        if (i >= pair.length) {	// no ':' found; i may overshoot, so '==' would let pair[0..i] run out of bounds
+            debug logger.trace ("In pair: " ~ pair);
+            throwException ("Invalid key:value pair in associative array literal");
+        }
+        debug logger.trace ("pair is: " ~ pair[0..i] ~ " : " ~ pair[i+1..$]);
+        ret[parse!(S) (pair[0..i])] = parse!(T) (pair[i+1..$]);
+    }
+    return ret;
+}
+unittest {
+    char[][char] X = parse!(char[][char]) (`['a':"animal", 'b':['b','u','s']]`);
+    char[][char] Y = ['a':cast(char[])"animal", 'b':['b','u','s']];
+    // X (parsed text) must match Y (D literal); the values use both the quoted and the char-array string form.
+    //FIXME: once the compiler is fixed, replace the three asserts below with
+    // just assert (X == Y)
+    assert (X.length == Y.length);
+    assert (X.keys == Y.keys);
+    assert (X.values == Y.values);
+    //X.rehash; Y.rehash;	// doesn't make a difference
+    //assert (X == Y);		// fails
+}
+
 // Arrays: input of the form [x, ..., z] is handed to toArray; anything else is logged and throws.
 T[] parse(T : T[]) (char[] src) {
     src = Util.trim(src);
     if (src.length >= 2 && src[0] == '[' && src[$-1] == ']') return toArray!(T[]) (src);
-    throwException ("Invalid array: not [., ..., .]");
+    throwException ("Invalid array: not [x, ..., z]");
 }
 T parse(T : char[]) (char[] src) {
     src = Util.trim(src);
@@ -58,14 +108,14 @@
             // process a block of escaped characters
             while (t < src.length && src[t] == '\\') {
                 t++;
-                if (t == src.length) throwException (`Warning: \" in string! There's currently no support for this during tokenising. Thus your input's probably been garbled!`);	// next char is "
+                if (t == src.length) throwException (`Warning: string ends \" !`);	// next char is "
                 ret[i++] = replaceEscapedChar (src[t++]);	// throws if it's invalid
             }
         }
         return ret[0..i];
     }
     else if (src.length >= 2 && src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);
-    throwException ("Invalid string: not quoted (\"*\") or char array (['.',...,'.'])");
+    throwException ("Invalid string: not quoted (\"*\") or char array (['a',...,'c'])");
 }
 T parse(T : ubyte[]) (char[] src) {
     src = Util.trim(src);
@@ -82,20 +132,32 @@
     }
     return ret;
 }
+unittest {
+    assert (parse!(double[]) (`[1.0,1.0e-10]`) == [1.0, 1.0e-10]);	// generic array stuff
+    assert (parse!(double[]) (`[	]`) == cast(double[]) []);	// empty array (only whitespace between the brackets)
+    
+    // char[] in quoted and ['c',...] form, with commas, escape sequences and multichar UTF8 characters:
+    assert (parse!(char[][]) (`[ ".\"", [',','\''] ,"!\b€" ]`) == [ ".\"".dup, [',','\''] ,"!\b€" ]);
+    
+    assert (parse!(ubyte[]) (`01F2AC`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);	// ubyte[] special notation: bare pairs of hex digits
+    assert (parse!(ubyte[]) (`[01 ,0xF2, 0xAC]`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);	// ubyte[] std notation: ordinary array literal
+}
 
 T parse(T : char) (char[] src) {	// reads one quoted ASCII character: 'c', or an escape such as '\n'
     src = Util.trim(src);
     if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'')
-        throwException ("Invalid char: not quoted (\'*\')");
+        throwException ("Invalid char: not quoted ('c')");
     if (src[1] != '\\' && src.length == 3) return src[1];	// Either non escaped
     if (src[1] == '\\' && src.length == 4) return replaceEscapedChar (src[2]);	// Or escaped; the backslash test stops 'ab' being read as '\b'
     // Everything below is an error; throwException logs the message and throws TextParseException.
     // Report various errors; warnings for likely and difficult to tell cases:
-    if (src[1] == '\\' && src.length == 3) throwException (`Warning: \' in char! There's currently no support for this during tokenising. Thus your input's probably been garbled!`);	// next char is "
+    /+ This was caused by a bug. Shouldn't occur now normally.
+    if (src[1] == '\\' && src.length == 3) throwException (`Warning: \' in char! There's currently no support for this during tokenising. Thus your input's probably been garbled!`);	// next char is ' +/
     // Warn in case it's a multibyte UTF-8 character:
     if (src[1] & 0x80u) throwException ("Invalid char: too long (non-ASCII UTF-8 characters cannot be read as a single character)");	// high bit set means non-ASCII (0xC0 also matched 0x40..0x7F)
     throwException ("Invalid char: too long");
 }
+// unittest covered above
 
 T parse(T : bool) (char[] src) {
     src = Util.trim(src);
@@ -107,6 +169,9 @@
     if (src.length == pos + 1 && src[pos] == '1') return true;
     throwException ("Invalid bool: not true or false and doesn't evaluate to 0 or 1");
 }
+unittest {
+    assert (parse!(bool[]) (`[true,false,01,00]`) == cast(bool[]) [1,0,1,0]);	// keywords, and 0/1 written with a leading zero
+}
 
 T parse(T : byte) (char[] src) {
     return toTInt!(T) (src);
@@ -132,6 +197,11 @@
 T parse(T : ulong) (char[] src) {	// ulong: delegates to the shared integer reader toTInt
     return toTInt!(T) (src);
 }
+unittest {
+    assert (parse!(byte) ("-5") == cast(byte) -5);	// signed value
+    // annoyingly, octal syntax differs from D (blame tango): 0o724 in the input versus 0724 in D source
+    assert (parse!(uint[]) ("[0b0100,0o724,0xFa59c,0xFFFFFFFF,0]") == [0b0100u,0724,0xFa59c,0xFFFFFFFF,0]);
+}
 
 T parse(T : float) (char[] src) {
     return toTFloat!(T) (src);
@@ -142,6 +212,11 @@
 T parse(T : real) (char[] src) {	// real: delegates to the shared floating-point reader toTFloat
     return toTFloat!(T) (src);
 }
+unittest {
+    assert (parse!(float) ("0.0") == 0.0f);	// plain decimal
+    assert (parse!(double) ("-1e25") == -1e25);	// negative value, exponent without a fraction
+    assert (parse!(real) ("5.24e-269") == cast(real) 5.24e-269);	// negative exponent
+}
 //END parse templates
 
 //BEGIN Utility funcs
@@ -155,6 +230,7 @@
     uint radix, ate, ate2;
     
     ate = cInt.trim (src, sign, radix);
+    if (ate == src.length) throwException ("Invalid integer: no digits");
     ulong val = cInt.convert (src[ate..$], radix, &ate2);
     ate += ate2;
     
@@ -174,17 +250,54 @@
 
 /** Basically a reimplementation of tango.text.convert.Float.toFloat which checks for trailing
  * whitespace before throwing an exception for overlong input and throws my exception class
- * when it does.
- */
+ * when it does. */
 TFloat toTFloat(TFloat) (char[] src) {	// parses src as a TFloat; empty or partly-numeric input is logged and throws
+    src = postTrim (src);	// presumably strips trailing whitespace (mde.text.util) - confirm
+    if (src == "") throwException ("Invalid float: no digits");
     uint ate;
     // ate is set by cFloat.parse; the removed loop below used it as the index of the first unread character
     TFloat x = cFloat.parse (src, &ate);
-    while (ate < src.length) {
-        if (src[ate] == ' ' || src[ate] == '\t') ++ate;
-        else throwException ("Invalid number");
+    if (ate < src.length) throwException ("Invalid number"); return x;	// unread trailing text (e.g. "1.5xyz") is an error again
+}
+
+/** Splits a string into substrings separated by '$(B ,)' with support for characters and strings
+ * containing escape sequences and for embedded arrays ($(B [...])).
+ *
+ * Empty strings may get returned. Throws (via throwException) on a ']' which has no matching '['. */
+char[][] split (char[] src) {
+    src = Util.trim (src);
+    if (src == "") return [];		// empty array: no elements when no data
+    
+    uint depth = 0;			// surface depth (embedded arrays)
+    char[][] ret;
+    ret.length = src.length / 3 + 1;	// initial guess; never 0, otherwise the doubling below could not grow it
+    uint k = 0;				// current split piece
+    uint i = 0, j = 0;			// current read location, start of current piece
+    
+    while (i < src.length) {
+        char c = src[i];
+        if (c == '\'' || c == '"') {	// string or character
+            ++i;
+            while (i < src.length && src[i] != c) {
+                if (src[i] == '\\') ++i;	// escape seq.
+                ++i;
+            }	// Doesn't throw if no terminal quote at end of src; i may then pass src.length (clamped below).
+        }
+        else if (c == '[') ++depth;
+        else if (c == ']') {
+            if (depth) --depth;
+            else throwException ("Invalid array literal: closes before end of data item.");
+        }
+        else if (c == ',' && depth == 0) {		// only if not an embedded array
+            if (ret.length <= k) ret.length = ret.length * 2;
+            ret[k++] = src[j..i];	// add this piece and increment k
+            j = i + 1;
+        }
+        ++i;
     }
-    return x;
+    if (ret.length <= k) ret.length = k + 1;
+    ret[k] = src[j..(i < src.length ? i : src.length)];	// add final piece; i is clamped as it overshoots after an unterminated quote
+    return ret[0..k+1];
 }
 
 /* Throws an exception on invalid escape sequences. Supported escape sequences are the following
@@ -196,17 +309,12 @@
     static bool escCharsFilled;	// will be initialised false
     
     if (!escCharsFilled) {
-        // map of all supported escape sequences
-        escChars['"'] = '"';
-        escChars['\''] = '\'';
-        escChars['\\'] = '\\';
-        escChars['a'] = '\a';
-        escChars['b'] = '\b';
-        escChars['f'] = '\f';
-        escChars['n'] = '\n';
-        escChars['r'] = '\r';
-        escChars['t'] = '\t';
-        escChars['v'] = '\v';
+        // map of all supported escape sequences (cannot be static?)
+        escChars = ['"'  : '"', '\'' : '\'',
+                    '\\' : '\\', 'a' : '\a',
+                    'b'  : '\b', 'f' : '\f',
+                    'n'  : '\n', 'r' : '\r',
+                    't'  : '\t', 'v' : '\v'];
         escCharsFilled = true;
     }
     
@@ -228,10 +336,11 @@
 }
 
 // Generic array reader
+// Assumes input is of form "[xxxxx]" (i.e. first and last chars are '[', ']' and length >= 2).
 private T[] toArray(T : T[]) (char[] src) {
     T[] ret = new T[16];	// avoid unnecessary allocations
     uint i = 0;
-    foreach (char[] element; Util.quotes (src[1..$-1],",")) {
+    foreach (char[] element; split(src[1..$-1])) {
         if (i == ret.length) ret.length = ret.length * 2;
         ret[i] = parse!(T) (element);
         ++i;
@@ -243,4 +352,8 @@
     logger.warn (msg);			// only small errors are trapped here
     throw new TextParseException ();
 }
+
+unittest {
+    // all utility functions should be well-enough used not to need testing
+}
 //END Utility funcs
author	Diggory Hardy <diggory.hardy@gmail.com>
date	Sun, 06 Jan 2008 17:38:51 +0000
parents	485c98ecbd91
children	dcb24afa0dce