view mde/mergetag/read.d @ 0:d547009c104c

Repository creation. committer: Diggory Hardy <diggory.hardy@gmail.com>
author Diggory Hardy <diggory.hardy@gmail.com>
date Sat, 27 Oct 2007 18:05:39 +0100
parents
children 18491334a525
line wrap: on
line source

/**************************************************************************************************
 * This module contains all reading functions, for both binary and text MergeTag files.
 *
 * It publicly imports mde.mergetag.dataset.
 *************************************************************************************************/

module mde.mergetag.read;

// package imports
public import mde.mergetag.dataset;
import mde.mergetag.exception;

// tango imports
import tango.io.UnicodeFile;
import Util = tango.text.Util;
import ConvInt = tango.text.convert.Integer;
import tango.util.collection.model.View : View;
import tango.util.collection.ArrayBag : ArrayBag;
import tango.util.log.Log : Log, Logger;

// TODO: allow compressing with zlib for both binary and text? (.mtz, .mtt, .mtb extensions)

// For now, all section & data tag IDs are uints.
// TODO: allow a lookup table or function to find a uint ID from a string ID

/**
 *  Class for reading a file.
 * 
 * Use as:
 * -----------------------
 * Reader foo = new Reader("foo.mtt");
 * foo.read();
 * // get your data from foo.dataset.
 * -----------------------
 */
class Reader
{
//BEGIN DATA
    /**
    A container for all read data.
    
    This may be accessed from here; however it may be preferable to use an external reference
    (passed to the class on initialisation).
    */
    DataSet dataset;
    
    /** A table, which if created, allows items in a text file to have a string ID.
     *
     * If a string ID is given for a section or tag identifier and that string is a key in this
     * table, then the corresponding ID type is used (if the string is not found an
     * MTStringIDException is thrown internally and the section or tag is skipped).
     */
    ID[char[]] indexTable;	// NOTE(review): no setIndexLookupTable() exists in this class; fill the table directly.
    
    /** A function for creating new DataSections within the dataset.
    *
    * Allows a user-made class to be used in the DataSet instead of DefaultData.
    *
    * This works by supplying a function which returns a reference to an instance of a class
    * implementing DataSection. The function is passed the ID of the new section and may use this
    * to use different DataSection classes for different sections.
    *
    * When left null, a DefaultData is created for every new section.
    */
    DataSection function (ID) dataSecCreator = null;
    
private:
    // Static symbols:
    typedef void delegate (char[],ID,char[]) readDelg;	// Delegate for accepting tags: (type, ID, data).
    
    static Logger logger;				// Assigned once by the static constructor.
    
    // Error messages as const variables. Could be loaded from files to support other languages?
    static const char[] ERR_FILEREAD = "Error reading file: ";
    static const char[] ERR_MTHEAD = "Not a valid MergeTag text file";
    static const char[] ERR_MTVER = "Unrecognised MergeTag version: MT";
    static const char[] ERR_EOF = "Unexpected EOF";
    static const char[] ERR_STAG = "Bad section tag format: not {id}";
    static const char[] ERR_DTAG = "Bad data tag format: not <type|id=data>";
    static const char[] ERR_CHAR = "Invalid character (or sequence starting \"!\") outside of tag";
    static const char[] ERR_IDINT = "Tag has invalid integer ID: not a valid uint value";
    
    // Non-static symbols:
    final char[] ErrInFile;		// something like " in \"path/file.mtt\""; appended to error messages
    
    final char[] fbuf;			// the whole file is read into this by the constructor
    MT_VERS fileVer = MT_VERS.INVALID;	// Remains INVALID until set otherwise by CTOR.
    
    uint endOfHeader;			// index in fbuf at which the header section ends
    bool allRead = false;		// set once read() has run with an empty section set
    bool fatal = false;			// a fatal file error occurred (set by throwMTErr); don't try to recover
    /* If the file is scanned for sections, the starting position of all sections are stored
    * in secTable. If this is empty, either no sections exist (and endOfHeader == fbuf.length)
    * or a section scan has not been run (read() with no section names doesn't need to do so).
    */
    struct SecMD {	// per-section bookkeeping
        uint pos;			// index in fbuf at which parsing of the section starts
        bool read;			// set once the section has been parsed into the dataset
        
        // Constructor-like factory: SecMD(pos, read).
        static SecMD opCall (uint _pos, bool _read) {
            SecMD md;
            md.read = _read;
            md.pos = _pos;
            return md;
        }
    }
    SecMD [ID] secTable;
//END DATA
    
//BEGIN METHODS: CTOR / DTOR
    // Class static constructor: fetches the logger shared by all Reader instances.
    static this ()
    {
        logger = Log.getLogger ("mde.mergetag.read.Reader");
    }
    
    /** Tries to open file path and read it into a buffer.
     *
     * Params:
     * path = The name or FilePath of the file to open.
     *     Standard extensions are .mtt and .mtb for text and binary files respectively.
     * dataset_ = If null create a new DataSet, else use existing DataSet *dataset_ and merge read
     *     data into it.
     * rdHeader = If true, read the header like a standard section. Doesn't read the header by
     *     default since if it's not requested it's likely not wanted.
     *
     * Memory:
     * This currently works by loading the whole file into memory at once. This should be fine most
     * of the time, but could potentially be a problem. Changing this would mean significant
     * changes to the way the code works.
     */
    /* Ideas for implementing a partial-loading memory model:
     * Use a conduit directly.
     * Use a fiber to do the parsing; let it switch back when it runs out of memory.
     * Redesign the code so it never needs to look backwards in the buffer?
     *
     * Major problem: reading only some sections and keeping references to other sections
     * would no longer be possible.
     */
    public this (char[] path, DataSet* dataset_ = null, bool rdHeader = false) {
        // Wrap the string in a FilePath and delegate to the PathView constructor.
        this (new FilePath (path), dataset_, rdHeader);
    }
    /** ditto */
    public this (PathView path, DataSet* dataset_ = null, bool rdHeader = false) {
        // Merge into the caller's DataSet when one is supplied; otherwise start with a fresh one.
        dataset = (dataset_ !is null) ? *dataset_ : new DataSet();
        
        // Slurp the whole file. UnicodeFile honours a BOM and falls back to UTF8 without one.
        try {
            scope file = new UnicodeFile!(char) (path, Encoding.Unknown);
            fbuf = cast(char[]) file.read();
        } catch (Exception e) {
            throwMTErr (ERR_FILEREAD ~ e.msg, new MTFileIOException);
        }
        
        // Suffix appended to later error messages so they name the offending file:
        ErrInFile = " in \"" ~ path.path ~ path.file ~ '"';
        
        // The file must open with a six-character header marker of the form {MTxx}:
        bool badHeader = fbuf.length < 6 || fbuf[0..3] != "{MT" || fbuf[5] != '}';
        if (badHeader)
            throwMTErr (ERR_MTHEAD ~ ErrInFile, new MTFileFormatException);
        
        // ... where xx must name a format version we know about:
        fileVer = MTFormatVersion.parseString (fbuf[3..5]);
        if (fileVer == MTFormatVersion.VERS.INVALID)
            throwMTErr (ERR_MTVER ~ fbuf[3..5] ~ ErrInFile, new MTFileFormatException);
        
        // The header is always parsed (to find where it ends) but only stored on request:
        if (!rdHeader)
            endOfHeader = parseSection (6, null);
        else {
            dataset.header = new DefaultData;
            endOfHeader = parseSection (6, &dataset.header.addTag);
        }
    }
    // Was intended to close file, but file is closed within CTOR anyway
    // (the UnicodeFile there is a scope variable), so this destructor is intentionally empty.
    public ~this () {
    }
//END METHODS: CTOR / DTOR
    
//BEGIN METHODS: PUBLIC
    /** Returns the IDs of all sections recorded in the section table.
    *
    * If the table is still empty the buffer is first scanned from the end of the header, recording
    * the start position of every section whose marker yields a usable ID. Returns an empty list
    * if a fatal error has already occurred.
    */
    public uint[] getSectionNames () {
        if (fatal) return [];
        if (secTable.length == 0) {
            uint pos = endOfHeader;
            while (pos < fbuf.length) {
                try {
                    ID secID = fbufReadSecMarker (pos);
                    secTable[secID] = SecMD (pos, false);	// remember where this section starts
                } catch (MTStringIDException) {
                    // Unknown string ID: leave the section out of the table.
                }
                // Whether recorded or not, step over the section body without storing any tags.
                pos = parseSection (pos, null);
            }
        }
        return cast(uint[]) secTable.keys;
    }
    
    /** Reads (some) sections of the file into data. Note that sections will never be _read twice.
    *
    * To be more accurate, the file is copied into a buffer by this(). read() then parses the
    * contents of this buffer, and stores the contents in dataset.
    *
    * Each section read is stored in a DataSection class. By default this is an instance of
    * DefaultData; this can be customised (see dataSecCreator).
    *
    * If secSet is non-empty, reading is restricted to sections given in secSet, otherwise all
    * sections are read. Sections given in secSet but not found in the file are not reported as an
    * error. Suggested: supply a HashSet!(uint) as the View!(ID). An ArrayBag!(ID) as used is not a
    * good choice, except that in this case it's empty.
    *
    * Does nothing if everything has already been read or a fatal error has occurred.
    *
    * Params:
    * secSet = IDs of the sections to read; empty (the default) means read all sections.
    *
    * Merging:
    * Where a section already exists in the DataSet (when either the section is given more than
    * once in the file, or it was read from a different file by another reader) it is merged.
    * Entries already in the DataSet take priority.
    *
    * Performance:
    * Note that loading only desired sections like this still parses the sections not
    * read (although it does not try to understand the type or data fields), so there is only a
    * small performance advantage to this where other sections do exist in the file. There is also
    * some overhead in only partially reading the file to keep track of where other sections are so
    * that the entire file need not be re-read if further (or all remaining) sections are read
    * later.
    */
    public void read (View!(ID) secSet = new ArrayBag!(ID)) {
        if (allRead || fatal) return;			// never do anything in either case
        if (secSet.size) {
            if (secTable.length) {
                // Section positions are already known: jump straight to the requested ones.
                foreach (ID id; secSet) {
                    SecMD* psmd = id in secTable;
                    if (psmd && !psmd.read) {			// may not exist
                        DataSection ds = getOrCreateSec (id);
                        parseSection (psmd.pos, &ds.addTag);
                        psmd.read = true;
                    }
                }
            } else {
                // First pass over the file: build secTable while reading the requested sections.
                for (uint pos = endOfHeader; pos < fbuf.length;) {
                    try {
                        ID id = fbufReadSecMarker (pos);
                        secTable[id] = SecMD(pos,false);	// add to table
                        if (secSet.contains(id)) {
                            DataSection ds = getOrCreateSec (id);
                            pos = parseSection (pos, &ds.addTag);
                            secTable[id].read = true;
                        } else {
                            // Not requested: the body must still be stepped over, otherwise the
                            // next iteration would look for a section marker inside this section.
                            pos = parseSection (pos, null);
                        }
                    } catch (MTStringIDException) {	// don't do any of the stuff above
                        pos = parseSection (pos, null);	// and skip the section
                    }
                }
            }
        } else {
            if (secTable.length) {
                // Read every section in the table that has not been read yet.
                foreach (ID id, ref SecMD smd; secTable) {
                    if (!smd.read) {
                        DataSection ds = getOrCreateSec (id);
                        parseSection (smd.pos, &ds.addTag);
                        smd.read = true;
                    }
                }
            } else {					// this time we don't need to use secTable
                for (uint pos = endOfHeader; pos < fbuf.length;) {
                    try {
                        ID id = fbufReadSecMarker (pos);
                        DataSection ds = getOrCreateSec (id);
                        pos = parseSection (pos, &ds.addTag);
                    } catch (MTStringIDException) {
                        pos = parseSection (pos, null);	// just skip the section
                    }
                }
            }
            allRead = true;
        }
    }
//END METHODS: PUBLIC
    
//BEGIN METHODS: PRIVATE
    /* Parses one section body, starting at index pos (which should be just past the section
    marker), and stops at the '{' opening the next section marker or at the end of the buffer.
    Returns the index at which it stopped.
    
    For every data tag <type|id=data> that is not commented out, addTag (if non-null) is called
    with the type text, the ID (looked up or converted by fbufReadID) and the data text.
    A '!' marks the next tag or {...} block as a comment; comment blocks may be nested.
    
    NOTE: from performance tests on indexing char[]'s and dereferencing char*'s, the char*'s are
    slightly faster, but a tiny difference isn't worth the extra effort/risk of using char*'s.
    */
    private uint parseSection (uint pos, readDelg addTag) {
        bool skipNext = false;				// set by '!': ignore the next tag or block
        for (; pos < fbuf.length; ++pos) {
            char c = fbuf[pos];
            if (Util.isSpace(c)) continue;		// whitespace between tags
            
            switch (c) {
            case '<': {					// data tag: <type|id=data>
                // Type part, terminated by '|':
                fbufIncrement (pos);
                uint typeStart = pos;
                fbufLocateDataTagChar (pos, false);
                if (fbuf[pos] != '|') throwMTErr (ERR_DTAG ~ ErrInFile);
                char[] type = fbuf[typeStart..pos];
                
                // ID part, terminated by '=':
                fbufIncrement (pos);
                ID tagID;
                try {
                    tagID = fbufReadID (pos);		// leaves pos at whatever follows the ID
                } catch (MTStringIDException) {
                    skipNext = true;			// unusable ID: treat the tag as commented out
                }
                if (fbuf[pos] != '=') throwMTErr (ERR_DTAG ~ ErrInFile);
                
                // Data part (quote-aware), terminated by '>':
                fbufIncrement (pos);
                uint dataStart = pos;
                fbufLocateDataTagChar (pos, true);
                if (fbuf[pos] != '>') throwMTErr (ERR_DTAG ~ ErrInFile);
                char[] data = fbuf[dataStart..pos];
                
                if (skipNext) skipNext = false;		// the comment mark covers this tag only
                else if (addTag != null) addTag (type, tagID, data);
                break;
            }
            case '{': {
                if (!skipNext) return pos;		// start of the next section marker: done
                
                // "!{ ... }" block comment; count braces so nested blocks are skipped whole.
                uint depth = 0;
                while (true) {
                    fbufIncrement (pos);
                    if (fbuf[pos] == '{') ++depth;
                    else if (fbuf[pos] == '}') {
                        if (depth == 0) break;
                        --depth;
                    }
                }
                skipNext = false;			// end of this comment
                break;
            }
            case '!':					// comment mark; applies to what comes next
                skipNext = true;
                break;
            default:					// anything else outside a tag is an error
                throwMTErr (ERR_CHAR ~ ErrInFile);
            }
        }
        // Reached the end of the buffer. A trailing '!' is not reported since it is inconsequential.
        return pos;
    }
    
    /* Returns the DataSection stored under id in dataset.sec. If there is none, a new one is
     * made (by dataSecCreator when that is set, otherwise a DefaultData), stored and returned.
     */
    DataSection getOrCreateSec (ID id) {
        DataSection* existing = id in dataset.sec;
        if (existing !is null) return *existing;
        
        DataSection created;
        if (dataSecCreator != null) created = dataSecCreator (id);
        else created = new DefaultData;
        dataset.sec[id] = created;
        return created;
    }
    
    /* Parses fbuf for a section marker {id}. Already knows fbuf[pos] == '{'.
    
    On return pos is the index just past the closing '}', which may equal fbuf.length when the
    marker is the last thing in the file (an empty final section is not an error).
    
    If the ID is a string that cannot be looked up, the rest of the marker is still consumed
    before MTStringIDException is rethrown, so the caller can skip the section body from pos.
    Throws via throwMTErr (ERR_STAG) if the ID is not followed by '}'.
    */
    private ID fbufReadSecMarker (inout uint pos) {
        // at this point pos is whatever a parseSection run returned
        // since we haven't hit EOF, fbuf[pos] MUST be '{' so no need to check
        fbufIncrement(pos);
        ID id;
        try {
            id = fbufReadID (pos);
        } catch (MTStringIDException e) {
            // pos is somewhere between the end of the string and the '}'; finish the marker.
            if (fbuf[pos] == '"') fbufIncrement(pos);		// closing quote, if still on it
            while (Util.isSpace(fbuf[pos])) fbufIncrement(pos);	// skip any space
            if (fbuf[pos] != '}') throwMTErr (ERR_STAG ~ ErrInFile);
            ++pos;
            throw e;
        }
        if (fbuf[pos] != '}') throwMTErr (ERR_STAG ~ ErrInFile);
        // Plain increment: callers test pos < fbuf.length themselves, and hitting the end of the
        // file right after a marker only means the section is empty.
        ++pos;
        return id;
    }
    
    /* Parses fbuf from pos to read an ID.
    
    Leading whitespace is skipped. A double-quoted string is looked up in indexTable; anything
    else is parsed as an integer, which must fit in a uint.
    
    On return, and also when MTStringIDException is thrown, pos is the index of the first
    non-space character following the ID (for a string ID: following its closing quote).
    
    Throws MTStringIDException if a string ID is not in indexTable, and fails via throwMTErr on
    an out-of-range integer (ERR_IDINT) or if the buffer ends first (ERR_EOF).
    */
    private ID fbufReadID (inout uint pos) {
        while (Util.isSpace(fbuf[pos])) fbufIncrement(pos);	// skip any space
        if (fbuf[pos] == '"') {
            fbufIncrement(pos);
            uint start = pos;
            while (fbuf[pos] != '"') fbufIncrement(pos);
            ID* i_p = fbuf[start..pos] in indexTable;
            fbufIncrement(pos);					// step over the closing quote
            while (Util.isSpace(fbuf[pos])) fbufIncrement(pos);	// skip any space
            if (i_p != null) return *i_p;			// looked-up value
            // FIXME: log a warning
            throw new MTStringIDException ();			// string not in look-up table
        } else {
            uint ate;
            long x = ConvInt.parse (fbuf[pos..$], 0, &ate);
            if (x < 0L || x > 0xFFFF_FFFFL) throwMTErr (ERR_IDINT ~ ErrInFile);
            pos += ate;					// this is where ConvInt.parse stopped
            // The number may run to the very end of the buffer; don't index past it.
            if (pos >= fbuf.length) throwMTErr (ERR_EOF ~ ErrInFile);
            while (Util.isSpace(fbuf[pos])) fbufIncrement(pos);	// skip any space
            return cast(ID) x;
        }
    }
    
    /* Searches fbuf starting from pos to find one of <=>| and stops at its index.
    
    If quotable then be quote-aware for single and double quotes: delimiter characters between a
    pair of matching quotes are ignored.
    Note: there's no length restriction for the content of the quote since it could be a single
    non-ascii UTF-8 char which would look like several chars.
    
    On return fbuf[pos] is always a valid index holding one of the four characters; if the buffer
    ends before one is found this fails via throwMTErr (ERR_EOF).
    */
    private void fbufLocateDataTagChar (inout uint pos, bool quotable) {
        for (; pos < fbuf.length; ++pos) {
            char c = fbuf[pos];
            if ((c >= '<' && c <= '>') || c == '|') return;	// '<', '=', '>' are contiguous
            if (quotable && (c == '\'' || c == '"')) {
                // Run forward to the matching closing quote; the loop's ++pos then steps over it.
                fbufIncrement(pos);
                while (fbuf[pos] != c) fbufIncrement(pos);
            }
        }
        // Ran off the end without finding a delimiter. Callers index fbuf[pos] straight after this
        // call, so this must be reported here rather than returning pos == fbuf.length.
        throwMTErr (ERR_EOF ~ ErrInFile);
    }
    /* Advances pos by one; fails via throwMTErr (ERR_EOF) if that reaches the end of fbuf. */
    private void fbufIncrement(inout uint pos) {
        if (++pos < fbuf.length) return;		// still inside the buffer
        throwMTErr(ERR_EOF ~ ErrInFile);
    }
    
    /* Common failure path: marks this reader as unusable, logs msg as an error and throws exc
     * (a plain MTException unless the caller supplies something more specific).
     */
    private void throwMTErr (char[] msg, Exception exc = new MTException)
    {
        // Set first, so that anyone who catches the exception finds the reader already dead.
        this.fatal = true;
        logger.error (msg);
        throw exc;
    }
//END METHODS: PRIVATE
    
    // Class invariant. Currently empty: the only check is disabled inside the /+ +/ block below.
    invariant {
        // Check secTable is valid, but not if it's complete.
        // This is something I really wouldn't expect to fail.
        // The disabled check walks back from each stored position to the preceding '{' and
        // re-reads the ID there, asserting that it matches the table key.
        /+ um... this causes a lot of linker errors. Shouldn't be necessary anyway..
        foreach (ID id, SecMD smd; secTable) {
            uint pos = smd.pos;
            for (; true; --pos) {
                assert (pos);	// we should never reach 0
                if (fbuf[pos] == '{') break;
            }
            ++pos;
            assert (fbufReadID(pos) == id);
        }+/
    }
    /+ A unittest here is really not practical since a file must be read from. Suggestion: Involve
    + both reading and writing functions in a single unittest for the entire package mergetag.
    + This is just here to point anyone looking in the right direction...
    unittest {}
    +/
}