// Mercurial > projects > mde

/**************************************************************************************************
 * This contains templates for converting a char[] to various data-types.
 *
 * Authors: Diggory Hardy, diggory.hardy@gmail.com
 * Copyright: Copyright © 2007 Diggory Hardy.
 * License: Licensed under the Academic Free License version 3.0
 *
 * This module basically implements the following templated function for $(B most) basic D types:
 * bool, byte, short, int, long, ubyte, ushort, uint, ulong, float, double, real, char.
 * It also supports arrays and associative arrays of any supported type (including of other arrays)
 * and has special handling for strings (char[]) and binary (ubyte[]) data-types.
 * -----------------------------
 * T parse(T) (char[] source);
 * -----------------------------
 *
 * The syntax is mostly the same as that used by D, without any prefixes/suffixes (except 0x, 0b & 0o base
 * specifiers). The following escape sequences are supported for strings and characters: \' \" \\
 * \a \b \f \n \r \t \v . Associative array literals use the same syntax as D, described here:
 * $(LINK http://www.digitalmars.com/d/expression.html#AssocArrayLiteral).
 *
 * There are also a few utility functions defined; the public ones have their own documentation.
 *
 * On errors, a textParseException is thrown with a suitable message. No other exceptions should
 * be thrown and none thrown from functions used outside this module.
 *************************************************************************************************/
module mde.text.parse;

// package imports
import mde.text.exception;
import mde.text.util : postTrim;

// tango imports
import cInt = tango.text.convert.Integer;
import cFloat = tango.text.convert.Float;
import Util = tango.text.Util;
// Logging support is only compiled in for debug builds.
debug {
    import tango.util.log.Log : Log, Logger;

    // Module-level logger; assigned by the module constructor below.
    private Logger logger;
}
// Module constructor: in debug builds it obtains the logger named after this module;
// in other builds its body is empty.
static this () {
    debug logger = Log.getLogger ("mde.text.parse");
}

//BEGIN parse templates
// Associative arrays
const char[] AA_ERR = "Invalid associative array: ";
/** Parses an associative array literal of the form $(B [KEY:DATA, KEY:DATA, ...]).
 *
 * The text between the outer brackets is divided into pairs with split(). Each pair is scanned
 * for the first ':' lying outside a quoted string/character; the text before it is parsed as the
 * key (type S) and the text after it as the value (type T).
 *
 * Params:
 *  src = Text to parse; surrounding whitespace is trimmed first.
 *
 * Returns: the associative array built from all pairs (empty when there are no pairs).
 *
 * Throws: textParseException if src is not enclosed in [ ], if a quoted key is not terminated,
 * if a pair contains no ':' separator, or if parsing a key or a value fails.
 */
T[S] parse(T : T[S], S) (char[] src) {
    src = Util.trim(src);
    if (src.length < 2 || src[0] != '[' || src[$-1] != ']')
        throw new textParseException (AA_ERR ~ "not [ ... ]");	// bad braces.

    T[S] ret;
    foreach (char[] pair; split (src[1..$-1])) {
        uint i = 0;
        while (i < pair.length) {	// advance to the ':'
            char c = pair[i];
            if (c == ':') break;
            if (c == '\'' || c == '"') {	// string or character: skip to the matching quote
                ++i;
                while (i < pair.length && pair[i] != c) {
                    if (pair[i] == '\\') {
                        // need room for the escaped character AND the closing quote
                        if (i+2 >= pair.length) throw new textParseException (AA_ERR ~ "unfinished escape sequence within string/char");
                        ++i;	// escape seq.
                    }
                    ++i;
                }
                if (i == pair.length) {
                    // ran off the end of the pair without finding the closing quote
                    debug logger.warn ("Pair is: " ~ pair);
                    throw new textParseException (AA_ERR ~ "unterminated string/char in key");
                }
            }
            ++i;
        }
        if (i == pair.length) {
            // the whole pair was scanned without meeting a ':' separator
            throw new textParseException (AA_ERR ~ "encountered [ ... KEY] (missing :DATA)");
        }
        // pair[i] is the ':'; an empty DATA part is left for parse!(T) to accept or reject
        ret[parse!(S) (pair[0..i])] = parse!(T) (pair[i+1..$]);
    }
    return ret;
}
unittest {
    // Parse an associative array with char keys and char[] values; the second value is written
    // in char-array notation rather than as a quoted string.
    char[][char] X = parse!(char[][char]) (`['a':"animal", 'b':['b','u','s']]`);
    // The same data written as a D literal, for comparison.
    char[][char] Y = ['a':cast(char[])"animal", 'b':['b','u','s']];

    //FIXME: when the compiler's fixed...
    // just assert (X == Y)
    // Until then, compare length, keys and values separately:
    assert (X.length == Y.length);
    assert (X.keys == Y.keys);
    assert (X.values == Y.values);
    //X.rehash; Y.rehash;	// doesn't make a difference
    //assert (X == Y);		// fails
}

// Arrays
/** Parses a bracketed array literal $(B [x, ..., z]) into a T[].
 *
 * The input is trimmed, checked for enclosing brackets and then handed to toArray, which parses
 * each element as a T.
 *
 * Throws: textParseException if the trimmed input is not enclosed in [ ] (or if an element fails
 * to parse).
 */
T[] parse(T : T[]) (char[] src) {
    src = Util.trim(src);

    bool bracketed = src.length >= 2 && src[0] == '[' && src[$-1] == ']';
    if (!bracketed)
        throw new textParseException ("Invalid array: not [x, ..., z]");

    return toArray!(T[]) (src);
}
/** Parses a string, given either as a double-quoted literal ($(B "...")) or as an array of
 * characters ($(B ['a',...,'c'])).
 *
 * In the quoted form each backslash introduces an escape sequence, which is translated by
 * replaceEscapedChar; every other character is copied unchanged.
 *
 * Throws: textParseException if the input has neither form, if the quoted text ends with a lone
 * backslash, or if an escape sequence is not supported.
 */
T parse(T : char[]) (char[] src) {
    src = Util.trim(src);
    bool longEnough = src.length >= 2;

    if (longEnough && src[0] == '"' && src[$-1] == '"') {
        char[] inner = src[1..$-1];	// text between the quotes

        T result;
        result.length = inner.length;	// upper bound: escapes only ever shrink the text
        uint written = 0;		// number of characters stored in result so far

        uint pos = 0;
        while (pos < inner.length) {
            char c = inner[pos++];
            if (c == '\\') {
                // a backslash must be followed by the character it escapes
                if (pos == inner.length) throw new textParseException ("Invalid string: ends \\\" !");
                c = replaceEscapedChar (inner[pos++]);	// throws if it's invalid
            }
            result[written++] = c;
        }
        return result[0..written];
    }

    if (longEnough && src[0] == '[' && src[$-1] == ']')
        return toArray!(T) (src);

    throw new textParseException ("Invalid string: not quoted (\"*\") or char array (['a',...,'c'])");
}
/** Parses binary data, given either as a standard array literal ($(B [1, 0xF2, ...])) or as a
 * bare sequence of hex digits in which each pair of digits forms one ubyte ($(B 01F2AC)).
 *
 * Throws: textParseException if the hex form has an odd number of characters or contains a
 * character which is not a hex digit (or, for the array form, if an element fails to parse).
 */
T parse(T : ubyte[]) (char[] src) {
    src = Util.trim(src);

    // Standard case: a bracketed array of ubyte values.
    bool bracketed = src.length >= 2 && src[0] == '[' && src[$-1] == ']';
    if (bracketed) return toArray!(T) (src);

    // Special case: hex digits only, two per byte.
    if ((src.length & 1) != 0)
        throw new textParseException ("Invalid binary: odd number of chars");

    T bytes;
    bytes.length = src.length / 2;	// exact
    uint pos = 0;			// read position; advanced by readHexChar
    for (uint n = 0; n < bytes.length; ++n) {
        ubyte high = readHexChar (src, pos);
        ubyte low  = readHexChar (src, pos);
        bytes[n] = cast(ubyte) ((high << 4) | low);
    }
    return bytes;
}
unittest {
    // Arrays of a numeric element type, including an array containing only whitespace.
    assert (parse!(double[]) (`[1.0,1.0e-10]`) == [1.0, 1.0e-10]);	// generic array stuff
    assert (parse!(double[]) (`[	]`) == cast(double[]) []);	// empty array

    // char[] and char conversions, with commas, escape sequences and multichar UTF8 characters:
    assert (parse!(char[][]) (`[ ".\"", [',','\''] ,"!\b€" ]`) == [ ".\"".dup, [',','\''] ,"!\b€" ]);

    // Both ubyte[] notations must yield the same bytes.
    assert (parse!(ubyte[]) (`01F2AC`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);	// ubyte[] special notation
    assert (parse!(ubyte[]) (`[01 ,0xF2, 0xAC]`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);	// ubyte[] std notation
}

/** Parses a single-quoted character: either a plain character ($(B 'c')) or one of the escape
 * sequences supported by replaceEscapedChar ($(B '\n'), $(B '\''), ...).
 *
 * Params:
 *  src = Text to parse; surrounding whitespace is trimmed first.
 *
 * Returns: the character, with any escape sequence replaced by the character it denotes.
 *
 * Throws: textParseException if the input is not enclosed in single quotes, if the escape
 * sequence is unfinished or unsupported, or if more than one character lies between the quotes
 * (which includes multi-byte UTF-8 characters).
 */
T parse(T : char) (char[] src) {
    src = Util.trim(src);
    if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'')
        throw new textParseException ("Invalid char: not quoted ('c')");

    if (src[1] == '\\') {
        // Escaped: exactly one character must follow the backslash.
        if (src.length == 4) return replaceEscapedChar (src[2]);	// throws if it's invalid
        // A backslash directly followed by the closing quote escapes nothing.
        if (src.length == 3) throw new textParseException ("Invalid char: unfinished escape sequence");
    }
    else if (src.length == 3) return src[1];	// plain, non-escaped character

    // Everything else has too many characters between the quotes.
    // Give a more helpful message when the first byte has its high bit set, i.e. it is part of a
    // multi-byte UTF-8 sequence rather than an ASCII character:
    if (src[1] & 0x80u) throw new textParseException ("Invalid char: too long (non-ASCII UTF-8 characters cannot be read as a single character)");
    throw new textParseException ("Invalid char: too long");
}
// unittest covered above

/** Parses a boolean.
 *
 * Accepted forms are the words $(B true) and $(B false), a run of one or more zeros (false), and
 * a single $(B 1) optionally preceded by zeros (true).
 *
 * Throws: textParseException for any other input.
 */
T parse(T : bool) (char[] src) {
    src = Util.trim(src);
    if (src == "true") return true;
    if (src == "false") return false;

    // Numeric form: skip leading zeros and look at what remains.
    uint zeros = 0;
    while (zeros < src.length && src[zeros] == '0') ++zeros;
    char[] rest = src[zeros..$];

    if (rest.length == 0 && zeros > 0) return false;	// nothing but zeros
    if (rest == "1") return true;			// exactly one '1' after the zeros
    throw new textParseException ("Invalid bool: not true or false and doesn't evaluate to 0 or 1");
}
unittest {
    // Word and numeric forms (with leading zeros) of both boolean values.
    assert (parse!(bool[]) (`[true,false,01,00]`) == cast(bool[]) [1,0,1,0]);
}

// Integer types: each specialisation simply forwards to toTInt, instantiated for the matched
// type T. Any textParseException raised by toTInt propagates to the caller.

// Signed integers
T parse(T : byte) (char[] src) {
    return toTInt!(T) (src);
}
T parse(T : short) (char[] src) {
    return toTInt!(T) (src);
}
T parse(T : int) (char[] src) {
    return toTInt!(T) (src);
}
T parse(T : long) (char[] src) {
    return toTInt!(T) (src);
}
// Unsigned integers
T parse(T : ubyte) (char[] src) {
    return toTInt!(T) (src);
}
T parse(T : ushort) (char[] src) {
    return toTInt!(T) (src);
}
T parse(T : uint) (char[] src) {
    return toTInt!(T) (src);
}
T parse(T : ulong) (char[] src) {
    return toTInt!(T) (src);
}
unittest {
    // A negative value for a signed type.
    assert (parse!(byte) ("-5") == cast(byte) -5);
    // Binary, octal and hexadecimal prefixes, plus the largest uint value.
    // annoyingly, octal syntax differs from D (blame tango):
    assert (parse!(uint[]) ("[0b0100,0o724,0xFa59c,0xFFFFFFFF,0]") == [0b0100u,0724,0xFa59c,0xFFFFFFFF,0]);
}

// Floating-point types: each specialisation simply forwards to toTFloat, instantiated for the
// matched type T. Any textParseException raised by toTFloat propagates to the caller.
T parse(T : float) (char[] src) {
    return toTFloat!(T) (src);
}
T parse(T : double) (char[] src) {
    return toTFloat!(T) (src);
}
T parse(T : real) (char[] src) {
    return toTFloat!(T) (src);
}
unittest {
    // One value per floating-point type: zero, a large negative and a very small positive value.
    assert (parse!(float) ("0.0") == 0.0f);
    assert (parse!(double) ("-1e25") == -1e25);
    assert (parse!(real) ("5.24e-269") == cast(real) 5.24e-269);
}
//END parse templates

//BEGIN Utility funcs
/** Templated read-int function to read signed or unsigned integers of up to 8 bytes
 * (byte, short, int, long and their unsigned counterparts).
 *
 * Actually a reimplementation of tango.text.convert.Integer toLong and parse functions.
 *
 * Params:
 *  src = Text to parse. After the number only spaces and tabs may follow.
 *
 * Returns: the value read, as a TInt.
 *
 * Throws: textParseException if there are no digits, if anything other than spaces/tabs follows
 * the number, or if the value does not fit in TInt. Input carrying a minus sign is always
 * rejected for unsigned types.
 */
TInt toTInt(TInt) (char[] src) {
    const char[] INT_OUT_OF_RANGE = "Integer out of range";
    bool sign;
    uint radix, ate, ate2;

    // trim reports how many leading characters it consumed and fills in sign and radix.
    ate = cInt.trim (src, sign, radix);
    if (ate == src.length) throw new textParseException ("Invalid integer: no digits");
    // convert reads the magnitude and reports through ate2 how many characters it consumed.
    // NOTE(review): a magnitude which overflows ulong is presumably not detected here - confirm
    // against tango's convert.
    ulong val = cInt.convert (src[ate..$], radix, &ate2);
    ate += ate2;

    // Only spaces and tabs may follow the digits.
    // NOTE(review): if convert consumed nothing and only whitespace follows, val is returned
    // as-is - confirm whether such input should be rejected instead.
    while (ate < src.length) {
        if (src[ate] == ' ' || src[ate] == '\t') ++ate;
        else throw new textParseException ("Invalid integer");
    }

    if (sign) {
        static if (cast(long) TInt.min < 0) {
            // Signed type: the magnitude may be as large as |TInt.min|, which is TInt.max + 1.
            if (val > cast(ulong) TInt.max + 1) throw new textParseException (INT_OUT_OF_RANGE);
            // For long, negating the magnitude 2^63 wraps round to long.min, as required.
            return cast(TInt) (-cast(long) val);
        }
        else {
            // Unsigned type: negative input cannot be represented.
            throw new textParseException (INT_OUT_OF_RANGE);
        }
    }

    if (val > TInt.max) throw new textParseException (INT_OUT_OF_RANGE);
    return cast(TInt) val;
}

/** Basically a reimplementation of tango.text.convert.Float.toFloat which removes trailing
 * whitespace (via postTrim) before checking for overlong input, and throws my exception class
 * when the input is invalid.
 *
 * Params:
 *  src = Text to parse.
 *
 * Returns: the value read, as a TFloat.
 *
 * Throws: textParseException if src is empty after postTrim, or if cFloat.parse does not consume
 * the whole of the remaining text (e.g. trailing garbage, or no number at all).
 */
TFloat toTFloat(TFloat) (char[] src) {
    src = postTrim (src);
    if (src == "") throw new textParseException ("Invalid float: no digits");
    uint ate;	// number of characters consumed by the parser

    TFloat x = cFloat.parse (src, &ate);
    // Anything left unread means the text is not (just) a number. Since src is non-empty here,
    // this also catches the case where nothing at all was consumed.
    if (ate < src.length) throw new textParseException ("Invalid float");
    return x;
}

/** Splits a string into substrings separated by '$(B ,)' with support for characters and strings
 * containing escape sequences and for embedded arrays ($(B [...])).
 *
 * Commas inside quoted strings/characters or inside embedded brackets do not split. The pieces
 * are slices of the (trimmed) input and are not themselves trimmed.
 *
 * Empty strings may get returned.
 *
 * Params:
 *  src = Text to split; surrounding whitespace is trimmed first.
 *
 * Returns: the pieces in order; an empty array if src is empty after trimming.
 *
 * Throws: textParseException if a ']' is met without a matching '['. An unterminated quote or an
 * unclosed '[' is not reported here; the affected piece is returned as it stands.
 */
char[][] split (char[] src) {
    src = Util.trim (src);
    if (src == "") return [];		// empty array: no elements when no data

    uint depth = 0;			// surface depth (embedded arrays)
    char[][] ret;
    uint i = 0, j = 0;			// current read location, start of current piece

    while (i < src.length) {
        char c = src[i];
        if (c == '\'' || c == '"') {	// string or character: skip to the matching quote
            ++i;
            while (i < src.length && src[i] != c) {
                if (src[i] == '\\') ++i;	// escape seq.
                ++i;
            }	// Doesn't throw if no terminal quote at end of src, but this should be caught later.
            // Note: in that case i may now lie beyond src.length.
        }
        else if (c == '[') ++depth;
        else if (c == ']') {
            if (depth) --depth;
            else throw new textParseException ("Invalid array literal: closes before end of data item.");
        }
        else if (c == ',' && depth == 0) {		// only if not an embedded array
            ret ~= src[j..i];	// add this piece
            j = i + 1;
        }
        ++i;
    }
    // Add the final piece. Slice to the end of src rather than to i, since i can overshoot
    // src.length after an unterminated quote; j never exceeds src.length.
    ret ~= src[j..$];
    return ret;
}

/* Translates the character following a backslash into the character the escape sequence denotes.
 *
 * Supported escape sequences are the following subset of those supported by D:
 * \" \' \\ \a \b \f \n \r \t \v
 *
 * Throws a textParseException for any other character.
 */
private char replaceEscapedChar (char c)
{
    switch (c) {
        case '"':  return '"';
        case '\'': return '\'';
        case '\\': return '\\';
        case 'a':  return '\a';
        case 'b':  return '\b';
        case 'f':  return '\f';
        case 'n':  return '\n';
        case 'r':  return '\r';
        case 't':  return '\t';
        case 'v':  return '\v';
        default:   break;	// not a supported escape sequence
    }

    throw new textParseException ("Invalid escape sequence: \\"~c);
}

// Converts the hex digit ([0-9A-Fa-f]) at src[pos] to its value (0-15) and advances pos by one.
// Throws a textParseException, leaving pos unchanged, if the character is not a hex digit.
// The caller must ensure that pos < src.length; this is not checked here.
private ubyte readHexChar (char[] src, inout uint pos) {
    char c = src[pos];
    ubyte digit;

    if (c >= '0' && c <= '9')      digit = c - '0';
    else if (c >= 'A' && c <= 'F') digit = c - 'A' + 10;
    else if (c >= 'a' && c <= 'f') digit = c - 'a' + 10;
    else throw new textParseException ("Invalid hex digit.");

    ++pos;
    return digit;
}

// Generic array reader: splits the text between the outer brackets with split() and parses each
// piece as a T.
// Assumes input is of form "[xxxxx]" (i.e. first and last chars are '[', ']' and length >= 2);
// this is not checked here.
private T[] toArray(T : T[]) (char[] src) {
    char[][] pieces = split (src[1..$-1]);

    T[] buffer = new T[16];	// start with some room to avoid unnecessary allocations
    uint used = 0;		// number of elements stored so far
    for (uint n = 0; n < pieces.length; ++n) {
        if (used == buffer.length) buffer.length = buffer.length * 2;	// full: double the space
        buffer[used++] = parse!(T) (pieces[n]);
    }
    return buffer[0..used];
}

unittest {
    // Intentionally empty: the utility functions have no tests of their own.
    // all utility functions should be well-enough used not to need testing
}
//END Utility funcs
// author	Diggory Hardy <diggory.hardy@gmail.com>
// date	Wed, 16 Jan 2008 12:48:07 +0000
// parents	dcb24afa0dce
// children	f63f4f41a2dc