view mde/text/parse.d @ 3:485c98ecbd91

text.parse: fixed a small bug with char[]'s. committer: Diggory Hardy <diggory.hardy@gmail.com>
author Diggory Hardy <diggory.hardy@gmail.com>
date Sat, 03 Nov 2007 16:06:06 +0000
parents 18491334a525
children 9a990644948c
line wrap: on
line source

/**************************************************************************************************
 * This contains templates for converting a char[] to various data-types.
 *
 * Copyright (c) 2007 Diggory Hardy.
 * Licensed under the Academic Free License version 3.0
 *
 * This module basically implements the following templated function for $(B most) basic D types:
 * bool, byte, short, int, long, ubyte, ushort, uint, ulong, float, double, real, char.
 * It also supports arrays of any supported type (including of other arrays) and has special
 * handling for strings (char[]) and binary (ubyte[]) data-types.
 * -----------------------------
 * T parse(T) (char[] source);
 * -----------------------------
 *
 * There are also a few utility functions defined; the public ones have their own documentation.
 *
 * On errors, a warning is logged and a TextParseException is thrown. No other exceptions should
 * be thrown, either by this module or by the external functions it calls.
 *************************************************************************************************/
module mde.text.parse;

// package imports
import mde.text.exception;

// tango imports
import cInt = tango.text.convert.Integer;
import cFloat = tango.text.convert.Float;
import Util = tango.text.Util;
import tango.util.log.Log : Log, Logger;

/// Module-wide logger; throwException() writes a warning to it for every parse failure.
private Logger logger;

/// Module constructor: obtains the logger registered under this module's name.
static this ()
{
    logger = Log.getLogger ("mde.text.parse");
}

//BEGIN parse templates
// Arrays
/** Parses a bracketed list ("[a, b, ...]") into an array of T.
 *
 * Surrounding whitespace is trimmed first. The text must then start with '[' and end with ']';
 * otherwise a warning is logged and a TextParseException is thrown. The bracketed text is handed
 * to toArray, which parses each element with parse!(T).
 */
T[] parse(T : T[]) (char[] src) {
    char[] text = Util.trim(src);
    bool bracketed = text.length >= 2 && text[0] == '[' && text[$-1] == ']';
    if (!bracketed) throwException ("Invalid array: not [., ..., .]");
    return toArray!(T[]) (text);
}
/** Parses a string, given either as a double-quoted literal ("...") or as a char array
 * (['a','b',...]).
 *
 * In the quoted form, a backslash introduces an escape sequence which is decoded by
 * replaceEscapedChar (which throws on unsupported sequences). A backslash as the very last
 * character inside the quotes is rejected. Anything that is neither quoted nor bracketed is
 * rejected as well. All failures log a warning and throw a TextParseException.
 */
T parse(T : char[]) (char[] src) {
    src = Util.trim(src);
    bool delimited = src.length >= 2;

    // Char-array notation goes through the generic array reader.
    if (delimited && src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);

    if (!(delimited && src[0] == '"' && src[$-1] == '"'))
        throwException ("Invalid string: not quoted (\"*\") or char array (['.',...,'.'])");

    char[] inner = src[1..$-1];		// text between the quotes
    T result;
    result.length = inner.length;	// upper bound: escapes only ever shrink the text
    uint written = 0;			// number of chars stored in result so far
    uint pos = 0;			// read position in inner

    while (pos < inner.length) {
        char c = inner[pos++];
        if (c != '\\') {
            // ordinary character: copy verbatim
            result[written++] = c;
            continue;
        }
        // A backslash must be followed by the character it escapes.
        if (pos == inner.length) throwException (`Warning: \" in string! There's currently no support for this during tokenising. Thus your input's probably been garbled!`);
        result[written++] = replaceEscapedChar (inner[pos++]);	// throws if it's invalid
    }
    return result[0..written];
}
/** Parses binary data, given either as a bracketed list of ubyte values ([1,2,...]) or as a
 * bare run of hex digits in which every pair of digits forms one byte (high nibble first).
 *
 * An odd number of characters in the hex form, or a non-hex character, logs a warning and
 * throws a TextParseException.
 */
T parse(T : ubyte[]) (char[] src) {
    src = Util.trim(src);

    // Bracketed form: defer to the generic array reader.
    if (src.length >= 2 && src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);

    // Hex form: needs two digits per byte.
    if (src.length % 2 == 1) throwException ("Invalid binary: odd number of chars");

    T bytes;
    bytes.length = src.length / 2;	// exactly one byte per digit pair
    uint cursor = 0;			// advanced by readHexChar
    for (uint n = 0; n < bytes.length; ++n) {
        ubyte high = readHexChar (src, cursor);
        ubyte low  = readHexChar (src, cursor);
        bytes[n] = cast(ubyte) ((high << 4) | low);
    }
    return bytes;
}

/** Parses a single-quoted character literal: either 'x' or an escape sequence such as '\n'.
 *
 * Surrounding whitespace is trimmed first. Escape sequences are decoded by replaceEscapedChar
 * (which throws on unsupported sequences). On any error a warning is logged and a
 * TextParseException is thrown.
 */
T parse(T : char) (char[] src) {
    src = Util.trim(src);
    if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'')
        throwException ("Invalid char: not quoted (\'*\')");

    bool escaped = (src[1] == '\\');
    if (!escaped && src.length == 3) return src[1];			// plain character
    // Only treat four-character input as an escape if it really starts with a backslash;
    // otherwise something like 'ab' would silently be read as the escape \b.
    if (escaped && src.length == 4) return replaceEscapedChar (src[2]);	// escaped character

    // Report various errors; warnings for likely and difficult to tell cases:
    if (escaped && src.length == 3) throwException (`Warning: \' in char! There's currently no support for this during tokenising. Thus your input's probably been garbled!`);	// next char is '
    // Warn in case it's a multibyte UTF-8 character: every byte of a non-ASCII UTF-8 sequence
    // has its top bit set, whereas ASCII never does.
    if (src[1] & 0x80u) throwException ("Invalid char: too long (non-ASCII UTF-8 characters cannot be read as a single character)");
    throwException ("Invalid char: too long");
}

/** Parses a boolean.
 *
 * Accepted forms (after trimming whitespace): the words "true" and "false", one or more '0'
 * digits (false), and a single '1' optionally preceded by '0' digits (true). Anything else logs
 * a warning and throws a TextParseException.
 */
T parse(T : bool) (char[] src) {
    char[] text = Util.trim(src);
    if (text == "true") return true;
    if (text == "false") return false;

    // Numeric form: count the leading zeros, then look at what is left.
    uint zeros = 0;
    for (; zeros < text.length && text[zeros] == '0'; ++zeros) {}
    auto rest = text.length - zeros;

    if (rest == 0 && zeros > 0) return false;			// nothing but zeros
    if (rest == 1 && text[zeros] == '1') return true;	// zeros followed by a single 1
    throwException ("Invalid bool: not true or false and doesn't evaluate to 0 or 1");
}

// Integer types: every specialisation simply forwards to toTInt, which does the actual
// conversion and range checking for the requested type.

/// Parses a byte.
T parse(T : byte) (char[] src)
{
    return toTInt!(T) (src);
}

/// Parses a short.
T parse(T : short) (char[] src)
{
    return toTInt!(T) (src);
}

/// Parses an int.
T parse(T : int) (char[] src)
{
    return toTInt!(T) (src);
}

/// Parses a long.
T parse(T : long) (char[] src)
{
    return toTInt!(T) (src);
}

/// Parses a ubyte.
T parse(T : ubyte) (char[] src)
{
    return toTInt!(T) (src);
}

/// Parses a ushort.
T parse(T : ushort) (char[] src)
{
    return toTInt!(T) (src);
}

/// Parses a uint.
T parse(T : uint) (char[] src)
{
    return toTInt!(T) (src);
}

/// Parses a ulong.
T parse(T : ulong) (char[] src)
{
    return toTInt!(T) (src);
}

// Floating-point types: every specialisation simply forwards to toTFloat.

/// Parses a float.
T parse(T : float) (char[] src)
{
    return toTFloat!(T) (src);
}

/// Parses a double.
T parse(T : double) (char[] src)
{
    return toTFloat!(T) (src);
}

/// Parses a real.
T parse(T : real) (char[] src)
{
    return toTFloat!(T) (src);
}
//END parse templates

//BEGIN Utility funcs
/** Templated read-int function to read (un)signed 1-8 byte integers (byte .. long and
 * ubyte .. ulong).
 *
 * Actually a reimplementation of tango.text.convert.Integer toLong and parse functions.
 *
 * The sign and radix prefix are consumed by cInt.trim and the digits by cInt.convert; only
 * spaces and tabs may follow the number. The magnitude is then range-checked against TInt,
 * taking the sign into account, so that TInt.min itself is accepted and a negative value is
 * never accepted for an unsigned type (a negative zero yields 0).
 *
 * Throws: TextParseException (after logging a warning) if there is trailing garbage or the
 * value does not fit into TInt.
 */
TInt toTInt(TInt) (char[] src) {
    const char[] INT_OUT_OF_RANGE = "Integer out of range";
    bool sign;				// set by cInt.trim when the number is negative
    uint radix, ate, ate2;
    
    ate = cInt.trim (src, sign, radix);
    ulong val = cInt.convert (src[ate..$], radix, &ate2);	// magnitude only
    ate += ate2;
    
    // Only whitespace may follow the digits.
    while (ate < src.length) {
        if (src[ate] == ' ' || src[ate] == '\t') ++ate;
        else throwException ("Invalid integer");
    }
    
    if (sign) {
        static if (TInt.min < 0) {
            // Signed target: the largest allowed magnitude is |TInt.min| == TInt.max + 1.
            // Computing it in ulong avoids overflow even when TInt is long.
            const ulong MAX_MAGNITUDE = cast(ulong) TInt.max + 1;
            if (val > MAX_MAGNITUDE) throwException (INT_OUT_OF_RANGE);
            // Two's-complement negation; exact for every magnitude that passed the check.
            return cast(TInt) (cast(long) -val);
        } else {
            // Unsigned target: the only representable "negative" value is zero.
            if (val != 0) throwException (INT_OUT_OF_RANGE);
            return cast(TInt) 0;
        }
    }
    
    if (val > TInt.max) throwException (INT_OUT_OF_RANGE);
    return cast(TInt) val;
}

/** Reads a floating-point number of type TFloat.
 *
 * Works like tango.text.convert.Float.toFloat, except that trailing spaces and tabs after the
 * number are tolerated, and that any other trailing character is reported by logging a warning
 * and throwing a TextParseException rather than Tango's own exception.
 */
TFloat toTFloat(TFloat) (char[] src) {
    uint consumed;			// number of chars cFloat.parse used
    TFloat value = cFloat.parse (src, &consumed);

    // Everything after the number must be blank.
    for (uint i = consumed; i < src.length; ++i) {
        char c = src[i];
        if (c != ' ' && c != '\t') throwException ("Invalid number");
    }
    return value;
}

/* Translates the character following a backslash into the character the escape sequence stands
 * for. Supported escape sequences are the following subset of those supported by D:
 * \" \' \\ \a \b \f \n \r \t \v
 * Any other character logs a warning and throws a TextParseException.
 */
private char replaceEscapedChar (char c)
{
    switch (c) {
        // these three stand for themselves
        case '"':  return '"';
        case '\'': return '\'';
        case '\\': return '\\';
        // control characters
        case 'a':  return '\a';
        case 'b':  return '\b';
        case 'f':  return '\f';
        case 'n':  return '\n';
        case 'r':  return '\r';
        case 't':  return '\t';
        case 'v':  return '\v';
        default:   break;
    }
    
    throwException ("Invalid escape sequence: \\"~c);	// not a supported escape
}

// Converts the hex digit ([0-9A-Fa-f]) at src[pos] to its value (0-15) and advances pos by one.
// Any other character logs a warning and throws a TextParseException, leaving pos unchanged.
// The caller must ensure pos < src.length; no bounds check is made here.
private ubyte readHexChar (char[] src, inout uint pos) {
    char c = src[pos];
    ubyte digit;
    if ('0' <= c && c <= '9')      digit = cast(ubyte) (c - '0');
    else if ('A' <= c && c <= 'F') digit = cast(ubyte) (c - 'A' + 10);
    else if ('a' <= c && c <= 'f') digit = cast(ubyte) (c - 'a' + 10);
    else throwException ("Invalid hex digit.");
    pos += 1;
    return digit;
}

// Generic array reader.
//
// src must be the complete bracketed list including the enclosing '[' and ']' (callers check
// this before calling). The text between the brackets is split at commas with Util.quotes and
// every piece is converted with parse!(T). A list with nothing but whitespace between the
// brackets ("[]", "[ ]") yields an empty array rather than being passed on as a single empty
// element. Errors in individual elements propagate from parse!(T).
private T[] toArray(T : T[]) (char[] src) {
    char[] inner = src[1..$-1];
    
    // Empty list: there are no elements to parse.
    if (Util.trim (inner).length == 0) return null;
    
    T[] ret = new T[16];	// avoid unnecessary allocations
    uint i = 0;
    foreach (char[] element; Util.quotes (inner,",")) {
        if (i == ret.length) ret.length = ret.length * 2;	// grow geometrically
        ret[i] = parse!(T) (element);
        ++i;
    }
    return ret[0..i];		// cut off the unused tail
}

// Common failure path of this module: logs msg as a warning, then throws a TextParseException.
// Never returns normally. Note that msg goes to the log only; the exception is constructed
// without arguments.
private void throwException (char[] msg)
{
    logger.warn (msg);			// only small errors are trapped here
    throw new TextParseException ();
}
//END Utility funcs