view mde/file/mergetag/Reader.d @ 134:7ababdf97748

Moved mde.setup.paths to mde.file.paths and paths.mdeReader to mde.file.mergetag.Reader.MTMultiReader.
author Diggory Hardy <diggory.hardy@gmail.com>
date Thu, 29 Jan 2009 14:59:45 +0000
parents b16a534f5302
children 4084f07f2c7a
line wrap: on
line source

/* LICENSE BLOCK
Part of mde: a Modular D game-oriented Engine
Copyright © 2007-2008 Diggory Hardy

This program is free software: you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation, either
version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>. */

/**************************************************************************************************
 * This module contains all reading functions, for both binary and text MergeTag files.
 *************************************************************************************************/
module mde.file.mergetag.Reader;

// package imports
public import mde.file.mergetag.iface.IReader;
import mde.file.mergetag.DataSet;
import mde.file.mergetag.DefaultData;
import mde.file.mergetag.exception;
import mde.file.mergetag.internal;

import tango.core.Exception;

// tango imports
import tango.io.FilePath;
import tango.io.UnicodeFile;
import Util = tango.text.Util;
import ConvInt = tango.text.convert.Integer;
//import tango.util.container.HashSet;
import mde.workaround2371;
import tango.util.log.Log : Log, Logger;

/// Logger shared by everything in this module; assigned by the module constructor below.
private Logger logger;

/// Module constructor: obtains the logger registered under this module's name.
static this ()
{
    logger = Log.getLogger ("mde.file.mergetag.Reader");
}

// TODO: allow compressing with zlib for both binary and text? (.mtz, .mtt, .mtb extensions)

/** Make an IReader class.
*
* Picks the reader class from the extension of path: "mtt" gives an MTTReader and "mtb" gives an
* MTBReader. path, ds and rdHeader are forwarded unchanged to the chosen class's constructor.
*
* Throws:
*  $(TABLE
*  $(TR $(TH Exception) $(TH Thrown when))
*  $(TR $(TD MTFileIOException) $(TD When extension given is neither mtt nor mtb))
*  )
* Anything thrown by the selected constructor is not caught here.
*/
IReader makeReader (FilePath path, DataSet ds = null, bool rdHeader = false) {
    auto extension = path.ext;
    
    if (extension == "mtt") return new MTTReader (path, ds, rdHeader);
    if (extension == "mtb") return new MTBReader (path, ds, rdHeader);
    
    throw new MTFileIOException ("Invalid mergetag extension");
}

/** Resolve a file path.
 *
 * Appends ".mtt" and ".mtb" to path in turn and checks which of the two files exist.
 *
 * Returns: the one that exists; if both exist, the text file when its modification time is
 * strictly later than the binary file's, otherwise the binary file. Returns null when neither
 * exists or when path itself is null. */
FilePath findFile (char[] path) {
    if (path is null) return null;
    
    FilePath txtPath = new FilePath (path ~ ".mtt");
    FilePath binPath = new FilePath (path ~ ".mtb");
    
    bool haveBin = binPath.exists;
    bool haveTxt = txtPath.exists;
    
    if (haveTxt && haveBin) {
        // take the latest version (roughly speaking...)
        if (txtPath.modified > binPath.modified) return txtPath;
        return binPath;
    }
    if (haveTxt) return txtPath;
    if (haveBin) return binPath;
    return null;
}

/**
 * Class for reading a mergetag text file.
 * 
 * Use as:
 * -----------------------
 * IReader foo;
 * try {
 *   foo = new MTTReader("foo.mtt");
 *   foo.read();
 * }
 * catch (MTException) {}
 * // get your data from foo.dataset.
 * -----------------------
 *
 * Throws:
 *  $(TABLE
 *  $(TR $(TH Exception) $(TH Thrown when))
 *  $(TR $(TD MTFileIOException) $(TD An error occurs while opening the file))
 *  $(TR $(TD MTFileFormatException) $(TD The file doesn't start with a recognised header/version))
 *  $(TR $(TD MTSyntaxException) $(TD A file syntax error occurs))
 *  $(TR $(TD MTException) $(TD An unexpected error occurs))
 *  )
 * Note that all exceptions extend MTException and when any exception is thrown the class is
 * rendered unusable: any subsequent calls to read will be ignored.
 *
 * Threading: Separate instances of Reader should be thread-safe provided access to the same
 * dataset is synchronized; i.e. no two readers referring to the same dataset should run
 * simultaneously. (The Reader class could be made thread-safe w.r.t. datasets, but
 * performance-wise I doubt it would be worth it.)
 * Do not run a single instance of Reader in multiple threads simultaneously.
 */
class MTTReader : IReader
{
//BEGIN DATA
    /** Get or set the DataSet
    *
    * The DataSet is the container that all read data is stored in.
    *
    * It can be fetched through this property, although holding on to an external reference
    * (the one passed to the class on initialisation) may be preferable.
    */
    DataSet dataset ()
    {
        return _dataset;
    }
    /// ditto
    void dataset (DataSet ds)
    {
        _dataset = ds;
    }
    
    /** Set the delegate used to create new DataSections within the dataset.
    *
    * With no delegate set, a DefaultData is created for every section not already present in the
    * DataSet. Setting one lets a user-made class be used instead, or lets an existing class
    * instance be reused rather than a new one created.
    *
    * The delegate receives the ID of the new section and returns a reference to an instance of a
    * class implementing IDataSection; it may choose different IDataSection classes for different
    * sections.
    *
    * The delegate may also return null, in which case the section is skipped. In the version of
    * read taking a set of sections to read, such a section is not marked as read and may still be
    * read later (assuming dataSecCreator then returns non-null). In the version of read taking no
    * set argument, all sections are set as read regardless, so the section cannot be read later.
    */
    void dataSecCreator (IDataSection delegate (ID) dSC)
    {
        this._dataSecCreator = dSC;
    }
    
private:
    // Non-static symbols:
    // Both strings below are built by the FilePath constructor, after the file has been read.
    final char[] ErrFile;		// path.path ~ path.file; same as ErrInFile without the " in " bit.
    final char[] ErrInFile;		// ErrFile wrapped for messages: something like " in \"path/file.mtt\""
    
    final char[] fbuf;			// the whole file is read into this by the constructor
    MTFormatVersion.VERS fileVer = MTFormatVersion.VERS.INVALID;	// Remains INVALID until set otherwise by CTOR.
    
    IDataSection delegate (ID) _dataSecCreator = null;   // see property setter above; null means "create a DefaultData"
    
    size_t endOfHeader;			// index into fbuf returned by the constructor's parse of the header section
    bool allRead = false;		// set true at the end of read(); read(IContainer) returns immediately once set
    bool fatal = false;			// set by throwMTErr: a fatal file error occurred; don't try to recover
    /* If the file is scanned for sections, the starting position of all sections are stored
    * in secTable. If this is empty, either no sections exist (and endOfHeader == fbuf.length)
    * or a section scan has not been run (read() with no section names doesn't need to do so).
    */
    struct SecMD {	// sec meta data
        // Convenience "constructor": builds a SecMD from its two fields.
        static SecMD opCall (size_t _pos, bool _read) {
            SecMD ret;
            ret.pos = _pos;
            ret.read = _read;
            return ret;
        }
        size_t pos;			// position in fbuf to start reading (just after the section marker)
        bool read;			// true if already read
    }
    SecMD [ID] secTable;		// section ID -> position/read-state; filled by read(IContainer)
    
    DataSet _dataset;			// backing store for the dataset property
//END DATA
    
//BEGIN METHODS: CTOR / DTOR
    /** Opens the file at path, reads it into a buffer and processes the header section.
     *
     * Params:
     * path     = The name or FilePath of the file to open.
     *     Standard extensions are .mtt and .mtb for text and binary files respectively.
     * ds       = If null create a new DataSet, else use existing DataSet ds and merge read
     *     data into it.
     * rdHeader = If true, read the header like a standard section. Doesn't read the header by
     *     default since if it's not requested it's likely not wanted.
     *
     * Throws:
     * MTFileIOException if reading the file fails; MTFileFormatException if the buffer does not
     * begin with a "{MTxx}" tag or the two version characters are not recognised. Errors raised
     * while parsing the header section propagate as well.
     *
     * Memory:
     * The whole file is loaded into memory at once. This should be fine most of the time, but
     * could potentially be a problem. Changing it would mean significant changes to the way the
     * code works.
     */
    /* Ideas for implementing a partial-loading memory model:
     * Use a conduit directly.
     * Use a fiber to do the parsing; let it switch back when it runs out of memory.
     * Redesign the code so it never needs to look backwards in the buffer?
     *
     * Major problem: reading only some sections and keeping references to other sections
     * would no longer be possible.
     */
    public this (char[] path, DataSet ds = null, bool rdHeader = false) {
        this (new FilePath (path), ds, rdHeader);
    }
    /** ditto */
    public this (FilePath path, DataSet ds = null, bool rdHeader = false) {
        // Merge into the caller's dataset when one is supplied; otherwise start a fresh one.
        _dataset = (ds is null) ? new DataSet() : ds;
        
        // Read the entire file into fbuf.
        // Supports unicode files with a BOM; defaults to UTF8 when there isn't a BOM.
        try {
            scope file = new UnicodeFile!(char) (path.toString, Encoding.Unknown);
            fbuf = cast(char[]) file.read();
        } catch (Exception e) {
            throwMTErr ("Error reading file: " ~ e.msg, new MTFileIOException);
        }
        
        // Keep the file name around so later errors can say where they happened.
        ErrFile = path.path ~ path.file;
        ErrInFile = " in \"" ~ ErrFile ~ '"';
        
        // The buffer must open with a six-character tag of the form {MTxx}.
        bool hasVersionTag = fbuf.length >= 6
                          && fbuf[0..3] == "{MT"
                          && fbuf[5] == '}';
        if (!hasVersionTag)
            throwMTErr("Not a valid MergeTag text file" ~ ErrInFile, new MTFileFormatException);
        
        // The two characters after "MT" identify the format version.
        char[] versionChars = fbuf[3..5];
        fileVer = MTFormatVersion.parseString (versionChars);
        if (fileVer == MTFormatVersion.VERS.INVALID)
            throwMTErr("Unrecognised MergeTag version: MT" ~ versionChars ~ ErrInFile, new MTFileFormatException);
        
        // The header is always parsed (to find where it ends); its tags are only stored when
        // rdHeader is set. An already existing header is merged into, else a DefaultData is made.
        IDataSection headerSec = null;
        if (rdHeader) {
            if (!_dataset.header) _dataset.header = new DefaultData;
            headerSec = cast(IDataSection) _dataset.header;
        }
        endOfHeader = parseSection (6, headerSec);
    }
//END METHODS: CTOR / DTOR
    
//BEGIN METHODS: PUBLIC
    /** Returns the IDs of the sections in the file, scanning for them first if necessary.
    *
    * When the section table is still empty, a read of an empty section set is issued; this
    * records every section's position without reading any section's contents.
    *
    * Returns an empty array after a fatal error. Also won't work (will return an empty array) if
    * all sections have already been read without scanning for sections.
    */
    public ID[] getSectionNames () {
        if (!fatal) {
            if (secTable.length == 0) read([]);     // scan for sections
            return secTable.keys;
        }
        return [];
    }
    
    /** Reads (some) sections of the file into data. Note that sections will never be _read twice.
    *
    * To be more accurate, the file is copied into a buffer by this(). read() then parses the
    * contents of this buffer, and stores the contents in dataset.
    *
    * Each section read is stored in a DataSection class. By default this is an instance of
    * DefaultData; this can be customised (see dataSecCreator).
    *
    * If secSet is provided, reading is restricted to sections given in secSet, otherwise all
    * sections are read. Sections given in secSet but not found in the file are not reported as an
    * error. Suggested: supply a HashSet!(ID) as the container. If an ID[] is passed this is
    * converted to a HashSet!(ID) to speed up lookups.
    *
    * Calls made after everything has been read, or after a fatal error has been raised by this
    * reader, return immediately without doing anything.
    *
    * Merging:
    * Where a section already exists in the DataSet (when either the section is given more than
    * once in the file, or it was read from a different file by another reader) it is merged.
    * Entries already in the DataSet take priority.
    *
    * Performance:
    * Note that loading only desired sections like this still parses the sections not
    * read (although it does not try to understand the type or data fields), so there is only a
    * small performance advantage to this where other sections do exist in the file. There is also
    * some overhead in only partially reading the file to keep track of where other sections are so
    * that the entire file need not be re-read if further (or all remaining) sections are read
    * later.
    */
    public void read () {
        // Same guard as the set-taking overload: without it a second call would re-parse every
        // section, and a call after a fatal error would try to parse a file known to be bad.
        if (allRead || fatal) return;
        
        if (secTable.length) {
            // A previous partial read already located every section: parse the unread ones.
            foreach (ID id, ref SecMD smd; secTable) {
                if (!smd.read) {
                    IDataSection ds = getOrCreateSec (id);  // null => section is skipped
                    parseSection (smd.pos, ds);
                    // allRead is set true so there's no point setting smd.read = true
                }
            }
        } else {					// this time we don't need to use secTable
            // No scan has been done: walk the buffer from the end of the header, one section
            // marker and section body at a time.
            for (size_t pos = endOfHeader; pos < fbuf.length;) {
                ID id = fbufReadSecMarker (pos);
                IDataSection ds = getOrCreateSec (id);      // null => section is skipped
                pos = parseSection (pos, ds);
            }
        }
        
        allRead = true;
    }
    /** ditto */
    public void read (ID[] secSet) {
        // Copy the IDs into a set container and defer to the container-taking overload.
        myStringHS wanted = new myStringHS; // FIXME: workaround2371
        for (size_t i = 0; i < secSet.length; ++i)
            wanted.add (secSet[i]);
        read (wanted);
    }
    /** ditto */
    public void read (IContainer!(ID) secSet) {
        if (allRead || fatal) return;			// never do anything in either case
        
        if (secTable.length) {
            // Section positions are already known: visit only the requested sections.
            foreach (ID id; secSet) {
                SecMD* entry = id in secTable;
                if (entry is null || entry.read) continue;  // not in the file, or already read
                
                IDataSection sec = getOrCreateSec (id);
                parseSection (entry.pos, sec);
                if (sec !is null) entry.read = true;        // getOrCreateSec may return null
            }
            return;
        }
        
        // First pass over the file: record where every section starts, storing the contents of
        // requested sections and merely skipping over the others.
        size_t pos = endOfHeader;
        while (pos < fbuf.length) {
            ID id = fbufReadSecMarker (pos);
            secTable[id] = SecMD(pos,false);	// add to table
            
            IDataSection sec = null;            // null makes parseSection skip the section
            if (secSet.contains(id)) sec = getOrCreateSec (id);
            
            pos = parseSection (pos, sec);
            if (sec !is null) secTable[id].read = true;
        }
    }
//END METHODS: PUBLIC
    
//BEGIN METHODS: PRIVATE
    /* Utility function for read.
    * Returns the section stored in the dataset under id when there is one. Otherwise a section
    * is obtained from _dataSecCreator if that is set, or a new DefaultData is created if not.
    * A non-null new section is stored in the dataset under id before being returned; a null
    * result from _dataSecCreator is returned as-is and nothing is added to the dataset.
    */
    private IDataSection getOrCreateSec (ID id) {
        IDataSection* existing = id in _dataset.sec;
        if (existing) return *existing;
        
        IDataSection created;
        if (_dataSecCreator is null) created = new DefaultData;
        else created = _dataSecCreator(id);
        
        if (created is null) return null;
        
        _dataset.sec[id] = created;
        return created;
    }
    
    /* Reads a section, starting from index pos, finishing at the next section marker (returning
    the position of the start of the marker). pos should start after the section marker.
    
    After analysing tags, the function passes the type, ID and data to dsec.addTag. If dsec is
    null the section is still parsed (so syntax errors are still detected) but nothing is stored.
    
    A '!' marks the following data tag or {...} block as a comment, which is then ignored.
    
    Returns: the index of the '{' starting the next section marker, or the index at which the scan
    stopped if the end of the buffer was reached first.
    
    Throws (all via throwMTErr, which also sets the fatal flag): MTSyntaxException for a malformed
    tag, an unexpected end of file or an invalid character outside a tag; MTException if addTag
    throws anything other than a TextException.
    
    Note: from performance tests on indexing char[]'s and dereferencing char*'s, the char*'s are
    slightly faster, but a tiny difference isn't worth the extra effort/risk of using char*'s.
    */
    private size_t parseSection (size_t pos, IDataSection dsec) {
        debug scope (failure)
                logger.trace ("MTTReader.parseSection: failure");
        /* Searches fbuf starting from the character after pos to find one of <=>| and stops at
        its index (pos is updated in place). Hitting the end of the buffer is an error, raised by
        fbufIncrement.
    
        If quotable then be quote-aware for single and double quotes: delimiter characters inside
        a quoted string are not matched, and a backslash inside a quote escapes the next char.
        Note: there's no length restriction for the content of the quote since it could be a single
        non-ascii UTF-8 char which would look like several chars.
        */
        void fbufLocateDataTagChar (ref size_t pos, bool quotable) {
            while (true) {
                fbufIncrement (pos);
                
                // '<', '=' and '>' are adjacent character codes, so one range test covers all three
                if ((fbuf[pos] >= '<' && fbuf[pos] <= '>') || fbuf[pos] == '|') return;
                else if (quotable) {
                    char c = fbuf[pos];
                    if (c == '\'' || c == '"') {
                        // skip forward to the matching closing quote
                        fbufIncrement(pos);
                        while (fbuf[pos] != c) {
                            if (fbuf[pos] == '\\') ++pos;	// escape seq.: step over the escaped char too
                            fbufIncrement(pos);
                        }
                    }
                }
            }
        }
        
        // Used to ignore a tag (if it starts !< or !{ or should otherwise be ignored):
        bool comment = false;
        for (; pos < fbuf.length; ++pos) {
            if (Util.isSpace(fbuf[pos])) continue;	// whitespace
            else if (fbuf[pos] == '<') {		// data tag
                char[] ErrDTAG = "Bad data tag format: not <type|id=data>" ~ ErrInFile;
                
                // Type section of tag:
                size_t pos_s = pos + 1;
                fbufLocateDataTagChar (pos, false);	// find end of type section
                if (fbuf[pos] != '|') throwMTErr (ErrDTAG, new MTSyntaxException);
                char[] type = fbuf[pos_s..pos];
                
                // ID section of tag:
                pos_s = pos + 1;
                fbufLocateDataTagChar (pos, false);	// find end of ID section
                if (fbuf[pos] != '=') throwMTErr (ErrDTAG, new MTSyntaxException);
                ID tagID = cast(ID) fbuf[pos_s..pos];
                
                // Data section of tag:
                pos_s = pos + 1;
                fbufLocateDataTagChar (pos, true);      // find end of data section (quote-aware)
                if (fbuf[pos] != '>') throwMTErr (ErrDTAG, new MTSyntaxException);
                char[] data = fbuf[pos_s..pos];
                
                // Only hand the tag over if it isn't commented out and a section is being filled
                if (!comment && dsec !is null) {
                    type = Util.trim(type);
                    try {
                        dsec.addTag (type, tagID, data);
                    }
                    catch (TextException e) {
                        logger.error ("TextException while reading " ~ ErrFile ~ ":");	// following a parse error
                        logger.error (e.msg);
                        logger.error ("Tag ignored: <"~type~"|"~tagID~"="~data~">");
                        // No throw: tag is just ignored
                    }
                    catch (Exception e) {
                        logger.error ("Unknown error occured" ~ ErrInFile ~ ':');
                        logger.error (e.msg);
                        throwMTErr (e.msg);             // Fatal to Reader
                    }
                } else comment = false;			// cancel comment status now
            }
            else if (fbuf[pos] == '{') {
                if (comment) {				// simple block comment
                    uint depth = 0;			// depth of embedded comment blocks
                    // scan to the '}' matching this '{', allowing nested {...} pairs
                    while (true) {
                        fbufIncrement (pos);
                        if (fbuf[pos] == '}') {
                            if (depth == 0) break;
                            else --depth;
                        } else if (fbuf[pos] == '{')
                            ++depth;
                    }
                    comment = false;			// end of this comment
                } else {
                    return pos;				// next section coming up; we are done
                }
            }
            else if (fbuf[pos] == '!') {		// possibly a comment; check next char
                comment = true;				// starting a comment (or an error)
                					// variable is reset at end of comment
            } else					// must be an error
            throwMTErr ("Invalid character (or sequence starting \"!\") outside of tag" ~ ErrInFile, new MTSyntaxException);
        }
        // if code execution reaches here, we're at EOF
        // possible error: last character was ! (but don't bother checking since it's inconsequential)
        return pos;
    }
    
    /* Parses fbuf for a section marker of the form {id}. Already knows fbuf[pos] == '{'.
    *
    * On entry pos is the index of the opening '{'. On return pos is the index just past the
    * closing '}', which may equal fbuf.length when the marker is the last thing in the file
    * (an empty final section); the read loops and parseSection both accept that position.
    *
    * Returns: the text between the braces, as a slice of fbuf cast to ID.
    *
    * Throws (via throwMTErr): MTSyntaxException if the buffer ends straight after the '{', or if
    * another '{' or the end of the buffer is met before the closing '}'.
    */
    private ID fbufReadSecMarker (ref size_t pos) {
        // at this point pos is whatever a parseSection run returned
        // since we haven't hit EOF, fbuf[pos] MUST be '{' so no need to check
        fbufIncrement(pos);     // step past '{'; EOF here is a truncated marker
        
        size_t start = pos;
        for (; pos < fbuf.length; ++pos)
            if (fbuf[pos] == '}' || fbuf[pos] == '{') break;
        
        if (pos >= fbuf.length || fbuf[pos] != '}')
            throwMTErr ("Bad section tag format: not {id}" ~ ErrInFile, new MTSyntaxException);
        
        ID id = cast(ID) fbuf[start..pos];
        // Step past '}' with a plain increment: reaching the end of the buffer here is not an
        // error, it just means the final section is empty. (Using fbufIncrement here used to
        // reject files ending directly in a section marker with "Unexpected EOF".)
        ++pos;
        return id;
    }
    
    /* Advances pos by one and treats reaching (or passing) fbuf.length as a fatal syntax error:
    * throwMTErr is called with an MTSyntaxException and the message "Unexpected EOF". */
    private void fbufIncrement(ref size_t pos) {
        if (++pos >= fbuf.length)
            throwMTErr("Unexpected EOF" ~ ErrInFile, new MTSyntaxException);
    }
    
    /* Common failure path: marks the reader as fatally broken, logs msg as an error and throws
    * exc (a plain MTException unless the caller supplies a more specific one). */
    private void throwMTErr (char[] msg, MTException exc = new MTException)
    {
        // From here on the reader is dead, even if somebody catches the exception.
        fatal = true;
        
        // Report first, then signal.
        logger.error (msg);
        throw exc;
    }
//END METHODS: PRIVATE
}


/**
* Class for reading a mergetag binary file.
*
* Currently only a dummy class: a MTNotImplementedException will be thrown if created. The
* remaining members exist only to satisfy IReader and do nothing.
*/
class MTBReader : IReader
{
    /** Not implemented: construction always ends in a MTNotImplementedException.
    *
    * The char[] overload wraps path in a FilePath and forwards to the PathView overload, which
    * throws.
    */
    public this (char[] path, DataSet ds = null, bool rdHeader = false)
    {
        this (new FilePath (path), ds, rdHeader);
    }
    /** ditto */
    public this (PathView path, DataSet ds = null, bool rdHeader = false)
    {
        throw new MTNotImplementedException;
    }
    
    /// Get the DataSet (always null)
    DataSet dataset ()
    {
        return null;
    }
    /// Set the DataSet (ignored)
    void dataset (DataSet)
    {
    }
    
    /// Set the dataSecCreator (ignored)
    void dataSecCreator (IDataSection delegate (ID))
    {
    }
    
    /// Get identifiers for all sections (always an empty array)
    ID[] getSectionNames ()
    {
        return [];
    }
    
    /// Commence reading (does nothing)
    void read ()
    {
    }
    /// ditto
    void read (ID[] secSet)
    {
    }
    /// ditto
    void read (IContainer!(ID) secSet)
    {
    }
}


/** A special adapter for reading from multiple mergetag files.
 *
 * One sub-reader is created (via makeReader) for every file that can be opened; all sub-readers
 * share a single DataSet. Calls on this class are forwarded to each sub-reader in the order the
 * files were given.
 *
 * The number of readable files $(B must not) exceed MAX_READERS. */
class MTMultiReader : IReader
{
    /** Create a sub-reader for each file in files.
     *
     * Files for which makeReader throws are skipped; construction only fails if no file at all
     * could be opened.
     *
     * Params:
     * files    = Files to read. Must not be null.
     * ds       = DataSet shared by all sub-readers; a new one is created if null.
     * rdHeader = Passed on to every sub-reader.
     *
     * Throws:
     * The last exception raised by makeReader if no file could be opened; MTFileIOException if
     * files is empty or if more than MAX_READERS files could be opened.
     */
    this (FilePath[] files, DataSet ds, bool rdHeader)
    in {
        assert (files !is null, "MTMultiReader.this: files is null");
    } body {
        // Don't let sub-readers create their own, separate, datasets:
        if (ds is null) ds = new DataSet;
        
        Exception exc;
        foreach (file; files) {
            IReader r;
            try {   // try reading header of each file
                r = makeReader (file, ds, rdHeader);
            } catch (Exception e) {
                exc = e;
                continue;   // unreadable file: remember why and move on
            }
            // The readers array is fixed-size; refuse to write past its end. This check is
            // deliberately outside the try block so that it cannot be swallowed by the catch.
            if (readersLen >= MAX_READERS)
                throw new MTFileIOException ("MTMultiReader: too many files (limit is MAX_READERS)");
            readers[readersLen++] = r;
        }
        if (readersLen == 0) {      // no files have valid headers
            // With an empty file list nothing was thrown, so there is nothing to re-throw;
            // throwing a null reference must be avoided.
            if (exc is null) exc = new MTFileIOException ("MTMultiReader: no files given");
            throw exc;              // fail: re-throw last exception
        }
    }
    
    DataSet dataset () {                /// Get the DataSet
        // The constructor guarantees at least one reader exists.
        return readers[0].dataset;      // all readers share the same dataset
    }
    void dataset (DataSet ds) {         /// Set the DataSet
        for (uint i = 0; i < readersLen; ++i) readers[i].dataset (ds);
    }
    
    void dataSecCreator (IDataSection delegate (ID) dsC) {  /// Set the dataSecCreator
        for (uint i = 0; i < readersLen; ++i) readers[i].dataSecCreator = dsC;
    }
    
    /** Get identifiers for all sections.
     *
     * Note: the identifiers from all sections in all files are just strung
     * together, starting with the highest-priority file (the last one given). An identifier
     * present in several files therefore appears several times. */
    ID[] getSectionNames () {
        ID[] names;
        for (int i = readersLen-1; i >= 0; --i)
            names ~= readers[i].getSectionNames;
        return names;
    }
    void read () {                      /// Commence reading
        for (uint i = 0; i < readersLen; ++i) readers[i].read();
    }
    void read (ID[] secSet) {           /// ditto
        for (uint i = 0; i < readersLen; ++i) readers[i].read(secSet);
    }
    void read (IContainer   !(ID) secSet) {      /// ditto
        for (uint i = 0; i < readersLen; ++i) readers[i].read(secSet);
    }
    
    /// Maximum number of sub-readers (i.e. successfully opened files) one instance can hold.
    const MAX_READERS = 4;
private:
    // Use a simpler static array:
    IReader[MAX_READERS] readers;   // only the first readersLen entries are valid
    ubyte readersLen = 0;           // number of sub-readers created by the constructor
}