view basic/LiteralParsing.d @ 106:89db676fbacb

Now capable of understanding strings.
author Anders Johnsen <skabet@gmail.com>
date Thu, 22 May 2008 12:09:11 +0200
parents
children c658172ca8a0
line wrap: on
line source

module basic.LiteralParsing.d;

import basic.SourceLocation,
       basic.Message;

import tango.io.Stdout,
       tango.core.BitManip,
       Integer = tango.text.convert.Integer,
       tango.text.Util;

/// Element width used when the bytes of a parsed String are interpreted.
/// printString selects the char[], wchar[] or dchar[] view of String.data
/// based on this tag.
enum StructType
{
    Char,   // view data as char[]
    WChar,  // view data as wchar[]
    DChar   // view data as dchar[]
}

/// A parsed string literal: the raw bytes produced by the parse* functions
/// together with a tag that says how those bytes should be viewed.
struct String
{
    StructType type; // default-initialises to StructType.Char; the parsers visible in this file never assign it
    ubyte[] data;    // bytes appended by the parse* functions
}

/// Result of decoding a single escape sequence (see parseEscapeSequence).
private struct EscapeReturn
{
    ubyte[] data; // bytes the escape sequence expands to
    int length;   // number of source characters consumed, counting the leading backslash
}

/**
 * Parses a complete string literal, including its prefix and quotes, and
 * returns the decoded bytes.
 *
 * The first character selects the literal kind:
 *   r  - wysiwyg string, the quoted part follows the prefix
 *   `  - wysiwyg string delimited by backticks
 *   "  - double quoted string with escape sequences
 *   x  - hex string, the quoted part follows the prefix
 * Anything else (including an empty input) is reported as InvalidStrPrefix.
 *
 * Params:
 *   str      = the literal exactly as it appears in the source
 *   loc      = source location of str[0], used for diagnostics
 *   messages = receiver of diagnostics
 *
 * Returns: the decoded String; its data is empty when the prefix is invalid.
 *
 * Note: the result is also dumped to Stdout through printString.
 */
String parseString(char[] str, SourceLocation loc, MessageHandler messages)
{
    String strBuf;
    // Grow and truncate again - presumably to pre-allocate room so the
    // appends done by the sub-parsers do not reallocate.
    strBuf.data.length = str.length;
    strBuf.data.length = 0;

    // An empty input has no prefix character to dispatch on.
    if(str.length == 0)
    {
        messages.report(InvalidStrPrefix, loc, loc + 1);
        return strBuf;
    }

    switch(str[0])
    {
        case 'r':
            strBuf = parseWysiwygString(str[1..$], strBuf);
            break;
        case '`':
            strBuf = parseWysiwygString(str, strBuf);
            break;
        case '"':
            strBuf = parseDoubleQuotedString(str, strBuf, loc, messages);
            break;
        case 'x':
            // Skip the 'x' so the sub-parser starts at the opening quote.
            strBuf = parseHexString(str[1..$], strBuf, loc + 1, messages);
            break;
        default:
            messages.report(InvalidStrPrefix, loc, loc + 1);
            break;
    }

    printString(str, strBuf);

    return strBuf;
}

/**
 * Decodes the body of a hex string literal (the part after the 'x' prefix).
 *
 * Every pair of hex digits becomes one byte appended to strBuf.data.
 * Whitespace and line ends between digits are skipped; any other character
 * is reported as InvalidHexStrChar. Scanning stops at the closing quote or
 * at the end of str, whichever comes first.
 *
 * Params:
 *   str      = the literal starting at its opening quote
 *   strBuf   = buffer the decoded bytes are appended to
 *   loc      = source location of str[0]
 *   messages = receiver of diagnostics
 *
 * Returns: strBuf with the decoded bytes appended.
 */
String parseHexString(char[] str, String strBuf,
        SourceLocation loc, MessageHandler messages)
{
    char[] hex = "0123456789abcdefABCDEF";
    char[] whitespace = " \t\v\f\r\n";
    char[] hexBuf;

    // Index 0 is the opening quote. The length check keeps an unterminated
    // literal from being read past its end.
    for(int i = 1; i < str.length && str[i] != '"'; i++)
    {
        char c = str[i];

        if(hex.contains(c))
        {
            hexBuf ~= c;
            if(hexBuf.length == 2)
            {
                strBuf.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
                hexBuf.length = 0;
            }
        }
        else if(!whitespace.contains(c))
            messages.report(InvalidHexStrChar, loc + i, loc + i + 1);
    }

    // NOTE(review): a trailing unpaired hex digit is dropped without a
    // diagnostic - no message id for that case is visible in this file.

    return strBuf;
}


/**
 * Decodes the body of a double quoted string literal.
 *
 * Ordinary characters are copied byte for byte; a backslash hands control
 * to parseEscapeSequence, whose bytes are appended and whose consumed
 * length is skipped. Scanning stops at the closing quote or at the end of
 * str, whichever comes first.
 *
 * Params:
 *   str      = the literal starting at its opening quote
 *   strBuf   = buffer the decoded bytes are appended to
 *   loc      = source location of str[0]
 *   messages = receiver of diagnostics
 *
 * Returns: strBuf with the decoded bytes appended.
 */
String parseDoubleQuotedString(char[] str, String strBuf,
        SourceLocation loc, MessageHandler messages)
{
    int i = 1; // first char is "

    // Check the bound before indexing so a lone or unterminated quote
    // cannot be read past the end of str.
    while(i < str.length && str[i] != '"')
    {
        if(str[i] == '\\') // EscapeSequence
        {
            EscapeReturn res = parseEscapeSequence(str[i..$], loc + i, messages);
            strBuf.data ~= res.data;
            // Always move forward, even if the escape reports no progress.
            i += (res.length > 0) ? res.length : 1;
        }
        else
        {
            strBuf.data ~= cast(ubyte)str[i];
            i++;
        }
    }

    return strBuf;
}

/**
 * Collects the hex digits of a fixed width escape (\x, \u or \U).
 *
 * Params:
 *   str      = text starting at the backslash of the escape
 *   digits   = number of hex digits the escape requires
 *   consumed = receives the number of source characters to skip
 *   loc      = source location of the backslash
 *   messages = receiver of diagnostics
 *
 * Returns: the valid hex digits that were found; invalid ones are reported
 *          as StringHexInvalid and left out. When too few characters remain,
 *          StringShortEscape is reported and the result is empty.
 */
private char[] scanHexEscape(char[] str, int digits, ref int consumed,
        SourceLocation loc, MessageHandler messages)
{
    char[] hex = "0123456789abcdefABCDEF";
    char[] hexBuf;
    int total = digits + 2; // backslash + selector + digits

    // The "- 1" requires one more character after the digits (normally the
    // closing quote of the literal), as the original code did.
    if(str.length - 1 >= total)
    {
        for(int i = 2; i < total; i++)
            if(hex.contains(str[i]))
                hexBuf ~= str[i];
            else
                // Arguments: position of the bad digit, and how many digits
                // this escape expects (2, 4 or 8).
                messages.report(StringHexInvalid, loc + i, loc + i + 1)
                    .arg(Integer.toString(i-1))
                    .arg(Integer.toString(digits));
        consumed = total;
    }
    else
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        consumed = cast(int)(str.length - 1);
    }

    return hexBuf;
}

/**
 * Decodes one escape sequence.
 *
 * Supported forms: the single character escapes \' \" \? \\ \a \b \f \n \r
 * \t \v, \xHH (one raw byte), \uHHHH and \UHHHHHHHH (code point, emitted as
 * UTF-8) and octal escapes of one to three digits (one byte; values above
 * 0xFF are truncated). Anything else is reported as InvalidStrEscape.
 *
 * Params:
 *   str      = text starting at the backslash; it normally extends to the
 *              end of the enclosing literal
 *   loc      = source location of the backslash
 *   messages = receiver of diagnostics
 *
 * Returns: the bytes the escape expands to, and the number of source
 *          characters it occupies (always at least 1).
 */
EscapeReturn parseEscapeSequence(char[] str,
        SourceLocation loc, MessageHandler messages)
{
    EscapeReturn res;

    // A backslash with nothing after it has no selector to dispatch on.
    if(str.length < 2)
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        res.length = 1;
        return res;
    }

    switch(str[1])
    {
        case '\'':
            res.length = 2;
            res.data ~= '\'';
            break;
        case '"':
            res.length = 2;
            res.data ~= '\"';
            break;
        case '?':
            res.length = 2;
            res.data ~= '\?';
            break;
        case '\\':
            res.length = 2;
            res.data ~= '\\';
            break;
        case 'a':
            res.length = 2;
            res.data ~= '\a';
            break;
        case 'b':
            res.length = 2;
            res.data ~= '\b';
            break;
        case 'f':
            res.length = 2;
            res.data ~= '\f';
            break;
        case 'n':
            res.length = 2;
            res.data ~= '\n';
            break;
        case 'r':
            res.length = 2;
            res.data ~= '\r';
            break;
        case 't':
            res.length = 2;
            res.data ~= '\t';
            break;
        case 'v':
            res.length = 2;
            res.data ~= '\v';
            break;
        case 'x':
            // Two hex digits give one raw byte. An empty digit buffer
            // (short escape) converts to 0, as before.
            char[] hexBuf = scanHexEscape(str, 2, res.length, loc, messages);
            res.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
            break;
        case 'u':
            // Four hex digits give a code point, stored as UTF-8.
            char[] hexBuf = scanHexEscape(str, 4, res.length, loc, messages);
            uint code = cast(uint)Integer.toLong(hexBuf, 16);
            if(!isValidUtf8(code))
                messages.report(InvalidUtf8Hex, loc, loc+6);
            else
                res.data ~= parseToUtf8(code);
            break;
        case 'U':
            // Eight hex digits give a code point, stored as UTF-8.
            char[] hexBuf = scanHexEscape(str, 8, res.length, loc, messages);
            uint code = cast(uint)Integer.toLong(hexBuf, 16);
            if(!isValidUtf8(code))
                messages.report(InvalidUtf8Hex, loc, loc+10);
            else
                res.data ~= parseToUtf8(code);
            break;
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
            char[] oct = "01234567";
            char[] octBuf;
            octBuf ~= str[1];
            res.length = 2;
            // Up to two more octal digits; stop at the first other
            // character or at the end of the input.
            for(int i = 2; i < 4 && i < str.length; i++)
                if(oct.contains(str[i]))
                {
                    octBuf ~= str[i];
                    res.length += 1;
                }
                else
                    break;

            uint value = cast(uint)Integer.toLong(octBuf, 8);
            res.data ~= cast(ubyte)value;
            break;
        default:
            messages.report(InvalidStrEscape, loc, loc + 2);
            res.length += 2;
            break;
    }

    return res;
}

/**
 * Decodes a wysiwyg string: everything between the delimiters is copied
 * unchanged, no escape sequences are interpreted.
 *
 * Params:
 *   str    = the literal starting at its opening delimiter; the closing
 *            delimiter is the same character as str[0]
 *   strBuf = buffer the bytes are appended to
 *
 * Returns: strBuf with the bytes appended. Copying stops at the closing
 *          delimiter or at the end of str, whichever comes first; an empty
 *          str leaves strBuf untouched.
 */
String parseWysiwygString(char[] str, String strBuf)
{
    // Without a first character there is no delimiter to look for.
    if(str.length == 0)
        return strBuf;

    char start = str[0];

    // The length check keeps an unterminated literal from being read past
    // its end.
    for(int i = 1; i < str.length && str[i] != start; i++)
        strBuf.data ~= cast(ubyte)str[i];

    return strBuf;
}

/**
 * Encodes a code point as a UTF-8 byte sequence.
 *
 * Params:
 *   i = the code point to encode
 *
 * Returns: one to four bytes, lead byte first. For values above 0x10FFFF,
 *          which cannot be encoded, an empty array is returned.
 *
 * Only the range is checked here; use isValidUtf8 to validate a value
 * before encoding it.
 */
ubyte[] parseToUtf8(uint i)
{
    // Plain shifts and masks: independent of byte order and never touching
    // memory outside the byte being built.
    if(i <= 0x00007F)
        return [cast(ubyte)i];
    else if(i <= 0x0007FF)
        // 110xxxxx 10xxxxxx
        return [cast(ubyte)(0xC0 | (i >> 6)),
                cast(ubyte)(0x80 | (i & 0x3F))];
    else if(i <= 0x00FFFF)
        // 1110xxxx 10xxxxxx 10xxxxxx
        return [cast(ubyte)(0xE0 | (i >> 12)),
                cast(ubyte)(0x80 | ((i >> 6) & 0x3F)),
                cast(ubyte)(0x80 | (i & 0x3F))];
    else if(i <= 0x10FFFF)
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        return [cast(ubyte)(0xF0 | (i >> 18)),
                cast(ubyte)(0x80 | ((i >> 12) & 0x3F)),
                cast(ubyte)(0x80 | ((i >> 6) & 0x3F)),
                cast(ubyte)(0x80 | (i & 0x3F))];

    // Out of range: nothing to encode.
    return null;
}

/**
 * Tells whether a value is a code point that may be encoded as UTF-8.
 *
 * Params:
 *   i = the candidate code point
 *
 * Returns: true for values up to 0x10FFFF that are not in the surrogate
 *          range 0xD800..0xDFFF, false otherwise.
 */
bool isValidUtf8(uint i)
{
    // Surrogates are reserved for UTF-16 and must not appear in UTF-8.
    if(i >= 0xD800 && i <= 0xDFFF)
        return false;

    return i <= 0x10FFFF;
}

/**
 * Debug aid: writes the source form of a literal and its decoded contents
 * to Stdout, on two lines.
 *
 * Params:
 *   str    = the literal as written in the source
 *   strBuf = the decoded result; its type tag selects whether data is
 *            printed as char[], wchar[] or dchar[]
 */
void printString(char[] str, String strBuf)
{
    // The first line is the same for every string type.
    Stdout(str)(" have become").newline;

    switch(strBuf.type)
    {
        case StructType.Char:
            Stdout(cast(char[])strBuf.data).newline;
            break;
        case StructType.WChar:
            Stdout(cast(wchar[])strBuf.data).newline;
            break;
        case StructType.DChar:
            Stdout(cast(dchar[])strBuf.data).newline;
            break;
    }
}