Mercurial > projects > mde

/**************************************************************************************************
 * This module contains all reading functions, for both binary and text MergeTag files.
 *
 * It publicly imports mde.mergetag.dataset.
 *************************************************************************************************/

module mde.mergetag.read;

// package imports
public import mde.mergetag.dataset;
import mde.mergetag.defaultdata;
import mde.mergetag.exception;

import tango.core.Exception;

// tango imports
import tango.io.UnicodeFile;
import Util = tango.text.Util;
import ConvInt = tango.text.convert.Integer;
import tango.util.collection.model.View : View;
import tango.util.collection.ArrayBag : ArrayBag;
import tango.util.collection.HashSet : HashSet;
import tango.util.log.Log : Log, Logger;

// TODO: allow compressing with zlib for both binary and text? (.mtz, .mtt, .mtb extensions)

/**
 *  Class for reading a file.
 *
 * Use as:
 * -----------------------
 * Reader foo;
 * try {
 *   foo = new Reader("foo.mtt");
 *   foo.read();
 * }
 * catch (MTException) {}
 * // get your data from foo.dataset.
 * -----------------------
 *
 * Throws:
 *  $(TABLE
 *  $(TR $(TH Exception) $(TH Thrown when))
 *  $(TR $(TD MTFileIOException) $(TD An error occurs while opening the file))
 *  $(TR $(TD MTFileFormatException) $(TD The file doesn't start with a recognised header/version))
 *  $(TR $(TD MTSyntaxException) $(TD A file syntax error occurs))
 *  $(TR $(TD MTException) $(TD An unexpected error occurs))
 *  )
 * Note that all exceptions extend MTException and when any exception is thrown the class is
 * rendered unusable: any subsequent calls to read will be ignored.
 *
 * Threading: Separate instances of Reader should be thread-safe provided access to the same
 * dataset is synchronized; i.e. no two readers referring to the same dataset should run
 * simultaneously. (The Reader class could be made thread-safe w.r.t. datasets, but
 * performance-wise I doubt it would be worth it.)
 * Do not run a single instance of Reader in multiple threads simultaneously.
 */
class Reader
{
//BEGIN DATA
    /**
    A container for all read data.

    This may be accessed from here; however it may be preferable to use an external reference
    (passed to the class on initialisation).
    */
    DataSet dataset;

    /** A delegate for creating new DataSections within the dataset.
    *
    * Allows a user-made class to be used in the DataSet instead of DefaultData. Also allows an
    * existing class instance to be used instead of a new one.
    *
    * This works by supplying a function which returns a reference to an instance of a class
    * implementing DataSection. The function is passed the ID of the new section and may use this
    * to use different DataSection classes for different sections.
    */
    DataSection delegate (ID) dataSecCreator = null;

private:
    static Logger logger;

    // Non-static symbols:
    final char[] ErrFile;		// added after ErrInFile to do the same without the "in " bit.
    final char[] ErrInFile;		// something like "in \"path/file.mtt\""

    final char[] fbuf;			// file is read into this
    MTFormatVersion.VERS fileVer = MTFormatVersion.VERS.INVALID;	// Remains INVALID until set otherwise by CTOR.

    uint endOfHeader;
    bool allRead = false;		// true if endOfHeader == fbuf.length or read([]) has run
    bool fatal = false;			// a fatal file error occured; don't try to recover
    /* If the file is scanned for sections, the starting position of all sections are stored
    * in secTable. If this is empty, either no sections exist (and endOfHeader == fbuf.length)
    * or a section scan has not been run (read() with no section names doesn't need to do so).
    */
    struct SecMD {	// per-section metadata kept in secTable
        uint pos;			// index into fbuf at which to start parsing the section
        bool read;			// set once the section has been parsed into the dataset

        // Constructor-style factory so that SecMD(pos, read) can be written inline.
        static SecMD opCall (uint _pos, bool _read) {
            SecMD md;
            md.pos  = _pos;
            md.read = _read;
            return md;
        }
    }
    SecMD [ID] secTable;
//END DATA

//BEGIN METHODS: CTOR / DTOR
    // Static constructor: obtains the logger shared by every Reader instance.
    static this () {
        logger = Log.getLogger ("mde.mergetag.read.Reader");
    }

    /** Tries to open file path and read it into a buffer.
     *
     * Params:
     * path = The name or FilePath of the file to open.
     *     Standard extensions are .mtt and .mtb for text and binary files respectively.
     * dataset_ = If null create a new DataSet, else use existing DataSet *dataset_ and merge read
     *     data into it.
     * rdHeader = If true, read the header like a standard section. Doesn't read the header by
     *     default since if it's not requested it's likely not wanted.
     *
     * Memory:
     * This currently works by loading the whole file into memory at once. This should be fine most
     * of the time, but could potentially be a problem. Changing this would mean significant
     * changes to the way the code works.
     */
    /* Ideas for implementing a partial-loading memory model:
     * Use a conduit directly.
     * Use a fiber to do the parsing; let it switch back when it runs out of memory.
     * Redesign the code so it never needs to look backwards in the buffer?
     *
     * Major problem: reading only some sections and keeping references to other sections
     * would no longer be possible.
     */
    public this (char[] path, DataSet* dataset_ = null, bool rdHeader = false) {
        // Wrap the plain string in a FilePath, then hand over to the PathView constructor,
        // which does all of the actual work.
        PathView filePath = new FilePath (path);
        this (filePath, dataset_, rdHeader);
    }
    /** ditto */
    public this (PathView path, DataSet* dataset_ = null, bool rdHeader = false) {
        // Create a dataset or use an existing one
        if (dataset_) dataset = *dataset_;
        else dataset = new DataSet();

        // Remember the file name *before* touching the file, so that every error raised below
        // (including a failure to open/read the file) can say which file it concerns:
        ErrFile = path.path ~ path.file;
        ErrInFile = " in \"" ~ ErrFile ~ '"';

        // Open & read the whole file into fbuf
        try {	// Supports unicode files with a BOM; defaults to UTF8 when there isn't a BOM:
            scope file = new UnicodeFile!(char) (path, Encoding.Unknown);
            fbuf = cast(char[]) file.read();
        } catch (Exception e) {
            // Any failure here is reported as an MTFileIOException (throwMTErr also sets fatal).
            throwMTErr ("Error reading file" ~ ErrInFile ~ ": " ~ e.msg, new MTFileIOException);
        }

        // Version checking & matching header section tag; the file must start with "{MTxx}"
        // where xx is a two-character version understood by MTFormatVersion.parseString:
        if (fbuf.length < 6 || fbuf[0] != '{' || fbuf[1] != 'M' || fbuf[2] != 'T' || fbuf[5] != '}')
            throwMTErr("Not a valid MergeTag text file" ~ ErrInFile, new MTFileFormatException);
        fileVer = MTFormatVersion.parseString (fbuf[3..5]);
        if (fileVer == MTFormatVersion.VERS.INVALID)
            throwMTErr("Unrecognised MergeTag version: MT" ~ fbuf[3..5] ~ ErrInFile, new MTFileFormatException);

        // Header reading/skipping. Either way the header is parsed, since that is how the
        // position of the first real section (endOfHeader) is found:
        if (rdHeader) {	// only bother actually storing it if it was requested
            // If a header already exists in the dataset, merge into it.
            if (!dataset.header) dataset.header = new DefaultData;
            endOfHeader = parseSection (6, cast(DataSection*) &dataset.header);
        }
        else endOfHeader = parseSection (6,null);
    }
//END METHODS: CTOR / DTOR

//BEGIN METHODS: PUBLIC
    /// Returns the IDs of all sections found in the file, scanning the file for section markers
    /// first if no section table has been built yet. Returns an empty list after a fatal error.
    public uint[] getSectionNames () {
        if (fatal) return [];

        if (secTable.length == 0) {
            // No table yet: walk the buffer from the end of the header, recording where each
            // section starts and skipping over its contents without storing them.
            uint pos = endOfHeader;
            while (pos < fbuf.length) {
                ID secID = fbufReadSecMarker (pos);
                secTable[secID] = SecMD(pos,false);	// add to table
                pos = parseSection (pos, null);
            }
        }

        // NOTE(review): the key array is reinterpreted as uint[] by this cast; confirm that this
        // matches the definition of ID in mde.mergetag.dataset.
        return cast(uint[]) secTable.keys;
    }

    /** Reads (some) sections of the file into data. Note that sections will never be _read twice.
    *
    * To be more accurate, the file is copied into a buffer by this(). read() then parses the
    * contents of this buffer, and stores the contents in dataset.
    *
    * Each section read is stored in a DataSection class. By default this is an instance of
    * DefaultData; this can be customised (see dataSecCreator).
    *
    * If secSet is non-empty, reading is restricted to sections given in secSet, otherwise all
    * sections are read. Sections given in secSet but not found in the file are not reported as an
    * error. Suggested: supply a HashSet!(uint) as the View!(ID). An ArrayBag!(ID) as used is not a
    * good choice, except that in this case it's empty.
    *
    * Merging:
    * Where a section already exists in the DataSet (when either the section is given more than
    * once in the file, or it was read from a different file by another reader) it is merged.
    * Entries already in the DataSet take priority.
    *
    * Performance:
    * Note that loading only desired sections like this still parses the sections not
    * read (although it does not try to understand the type or data fields), so there is only a
    * small performance advantage to this where other sections do exist in the file. There is also
    * some overhead in only partially reading the file to keep track of where other sections are so
    * that the entire file need not be re-read if further (or all remaining) sections are read
    * later.
    */
    public void read (ID[] secSet) {
        // Collect the requested IDs into a HashSet and defer to the View!(ID) overload.
        auto wanted = new HashSet!(ID);
        foreach (ID secID; secSet)
            wanted.add (secID);
        read (wanted);
    }
    public void read (View!(ID) secSet = new ArrayBag!(ID)) {	/** ditto */
        /* Returns the DataSection registered under id in the dataset. If there is none, one is
        *  created (via dataSecCreator when set, otherwise a DefaultData), stored and returned.
        */
        DataSection sectionFor (ID id) {
            DataSection* existing = id in dataset.sec;
            if (existing) return *existing;
            return (dataset.sec[id] = (dataSecCreator is null) ? new DefaultData : dataSecCreator(id));
        }

        if (fatal || allRead) return;			// nothing (more) can or should be done

        bool selective = secSet.size != 0;		// caller asked for specific sections only
        bool haveTable = secTable.length != 0;		// a previous scan recorded section positions

        if (selective && haveTable) {
            // Positions are known: jump straight to each requested, not-yet-read section.
            foreach (ID id; secSet) {
                SecMD* entry = id in secTable;
                if (entry is null || entry.read) continue;	// not in file, or already done
                DataSection ds = sectionFor (id);
                parseSection (entry.pos, &ds);
                entry.read = true;
            }
            return;
        }

        if (selective) {
            // First pass over the file: build the table while reading the requested sections.
            uint pos = endOfHeader;
            while (pos < fbuf.length) {
                ID id = fbufReadSecMarker (pos);
                secTable[id] = SecMD(pos,false);	// add to table
                if (!secSet.contains(id)) {
                    pos = parseSection (pos, null);	// skip section
                    continue;
                }
                DataSection ds = sectionFor (id);
                pos = parseSection (pos, &ds);
                secTable[id].read = true;
            }
            return;
        }

        // Everything was requested.
        if (haveTable) {
            // Read whatever the table says has not been read yet.
            foreach (ID id, ref SecMD smd; secTable) {
                if (smd.read) continue;
                DataSection ds = sectionFor (id);
                parseSection (smd.pos, &ds);
                smd.read = true;
            }
        } else {
            // Straight run through the file; no need to record positions this time.
            uint pos = endOfHeader;
            while (pos < fbuf.length) {
                ID id = fbufReadSecMarker (pos);
                DataSection ds = sectionFor (id);
                pos = parseSection (pos, &ds);
            }
        }
        allRead = true;
    }
//END METHODS: PUBLIC

//BEGIN METHODS: PRIVATE
    /* Reads a section, starting from index pos, finishing at the next section marker (returning
    the position of the start of the marker) or at the end of the buffer (returning fbuf.length).
    pos should start after the section marker.

    Params:
    pos  = Index into fbuf at which to start parsing.
    dsec = Where to send parsed tags (via addTag); if null the section is parsed but nothing is
           stored, which is how sections are skipped.

    Throws (via throwMTErr, which also marks the reader as fatal):
    MTSyntaxException on malformed tags, stray characters or a premature end of file;
    MTException if addTag throws anything other than a TextException.

    NOTE: from performance tests on indexing char[]'s and dereferencing char*'s, the char*'s are
    slightly faster, but a tiny difference isn't worth the extra effort/risk of using char*'s.
    */
    private uint parseSection (uint pos, DataSection* dsec) {
        /* Searches fbuf starting from i to find one of <=>| and stops at its index.

        If quotable then be quote-aware for single and double quotes: delimiter characters inside
        a quoted span are ignored and a backslash escapes the following character.
        Note: there's no length restriction for the content of the quote since it could be a single
        non-ascii UTF-8 char which would look like several chars.

        On return i is always a valid index into fbuf; running off the end of the buffer is
        reported as an MTSyntaxException instead of leaving i == fbuf.length for the caller to
        index with.
        */
        void fbufLocateDataTagChar (inout uint i, bool quotable) {
            for (; i < fbuf.length; ++i) {
                char c = fbuf[i];
                if ((c >= '<' && c <= '>') || c == '|') return;	// '<', '=', '>' are contiguous
                if (quotable && (c == '\'' || c == '"')) {
                    fbufIncrement (i);			// step inside the quote (EOF-checked)
                    while (fbuf[i] != c) {
                        if (fbuf[i] == '\\') ++i;	// escape seq.: skip the escaped char too
                        fbufIncrement (i);		// checks for EOF, so fbuf[i] above is safe
                    }
                }
            }
            throwMTErr ("Unexpected EOF" ~ ErrInFile, new MTSyntaxException);
        }

        /* Reports a malformed data tag. The message is only built when actually needed. */
        void badDataTag () {
            throwMTErr ("Bad data tag format: not <type|id=data>" ~ ErrInFile, new MTSyntaxException);
        }

        bool comment = false;				// preceding char was !
        for (; pos < fbuf.length; ++pos) {
            if (Util.isSpace(fbuf[pos])) continue;	// whitespace
            else if (fbuf[pos] == '<') {		// data tag
                fbufIncrement (pos);

                // Type section of tag:
                uint pos_s = pos;
                fbufLocateDataTagChar (pos, false);	// find end of type section
                if (fbuf[pos] != '|') badDataTag ();
                char[] type = fbuf[pos_s..pos];

                fbufIncrement (pos);

                // ID section of tag:
                pos_s = pos;
                fbufLocateDataTagChar (pos, false);	// find end of ID section
                if (fbuf[pos] != '=') badDataTag ();
                ID tagID = cast(ID) fbuf[pos_s..pos];

                fbufIncrement (pos);

                // Data section of tag:
                pos_s = pos;
                fbufLocateDataTagChar (pos, true);      // find end of data section
                if (fbuf[pos] != '>') badDataTag ();
                char[] data = fbuf[pos_s..pos];

                if (!comment && dsec !is null) {
                    type = Util.trim(type);
                    try {
                        dsec.addTag (type, tagID, data);
                    }
                    catch (TextException e) {		// bad tag contents: warn and carry on
                        logger.warn ("TextException while reading " ~ ErrFile ~ ":");	// following a parse error
                        logger.warn (e.msg);
                    }
                    catch (Exception e) {
                        logger.error ("Unknown error occured" ~ ErrInFile ~ ':');
                        logger.error (e.msg);
                        throwMTErr (e.msg);             // Fatal to Reader
                    }
                } else comment = false;			// cancel comment status now
            }
            else if (fbuf[pos] == '{') {
                if (comment) {				// simple block comment
                    uint depth = 0;			// depth of embedded comment blocks
                    while (true) {
                        fbufIncrement (pos);
                        if (fbuf[pos] == '}') {
                            if (depth == 0) break;
                            else --depth;
                        } else if (fbuf[pos] == '{')
                            ++depth;
                    }
                    comment = false;			// end of this comment
                } else {
                    return pos;				// next section coming up; we are done
                }
            }
            else if (fbuf[pos] == '!') {		// possibly a comment; check next char
                comment = true;				// starting a comment (or an error)
                					// variable is reset at end of comment
            } else					// must be an error
            throwMTErr ("Invalid character (or sequence starting \"!\") outside of tag" ~ ErrInFile, new MTSyntaxException);
        }
        // if code execution reaches here, we're at EOF
        // possible error: last character was ! (but don't bother checking since it's inconsequential)
        return pos;
    }

    /* Parses fbuf for a section marker of the form {id}. Already knows fbuf[pos] == '{'.

    On return pos is the index just past the closing '}' and the text between the braces is
    returned as the ID. pos may equal fbuf.length on return: a marker that is the very last
    thing in the file simply introduces an empty section (parseSection and the scanning loops
    all cope with pos == fbuf.length), just as an empty header at the end of file is accepted.

    Throws (via throwMTErr): MTSyntaxException if the buffer ends or another '{' is found
    before the closing '}'.
    */
    private ID fbufReadSecMarker (inout uint pos) {
        // at this point pos is whatever a parseSection run returned
        // since we haven't hit EOF, fbuf[pos] MUST be '{' so no need to check
        fbufIncrement(pos);			// step over '{'; EOF here means an unterminated marker

        uint start = pos;
        for (; pos < fbuf.length; ++pos)
            if (fbuf[pos] == '}' || fbuf[pos] == '{') break;

        if (pos >= fbuf.length || fbuf[pos] != '}')
            throwMTErr ("Bad section tag format: not {id}" ~ ErrInFile, new MTSyntaxException);

        ID id = cast(ID) fbuf[start..pos];
        ++pos;					// step over '}'; hitting EOF here is legitimate
        return id;
    }

    /* Advances pos by one and guarantees that it is still a valid index into fbuf afterwards;
    *  otherwise an MTSyntaxException ("Unexpected EOF") is raised through throwMTErr. */
    private void fbufIncrement(inout uint pos) {
        pos += 1;
        if (pos < fbuf.length) return;		// still inside the buffer: fine
        throwMTErr("Unexpected EOF" ~ ErrInFile, new MTSyntaxException);
    }

    /* Common exit path for all errors: flags the reader as unusable, logs msg and throws exc
    *  (a plain MTException unless a more specific one is supplied). */
    private void throwMTErr (char[] msg, MTException exc = new MTException) {
        // Set the flag first, so that the reader refuses further work if the caller catches exc.
        this.fatal = true;
        logger.error (msg);
        throw exc;
    }
//END METHODS: PRIVATE

    /+ A unittest here is really not practical since a file must be read from.
    + A unittest is included in defaultdata.d .
    unittest {}
    +/
}
author	Diggory Hardy <diggory.hardy@gmail.com>
date	Fri, 22 Feb 2008 11:52:20 +0000
parents	b940f267419e
children