diff mde/file/mergetag/Reader.d @ 81:d8fccaa45d5f

Moved file IO code from mde/mergetag to mde/file[/mergetag] and changed how some errors are caught.
author Diggory Hardy <diggory.hardy@gmail.com>
date Fri, 29 Aug 2008 11:59:43 +0100
parents mde/mergetag/Reader.d@ea58f277f487
children ac1e3fd07275
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mde/file/mergetag/Reader.d	Fri Aug 29 11:59:43 2008 +0100
@@ -0,0 +1,526 @@
+/* LICENSE BLOCK
+Part of mde: a Modular D game-oriented Engine
+Copyright © 2007-2008 Diggory Hardy
+
+This program is free software: you can redistribute it and/or modify it under the terms
+of the GNU General Public License as published by the Free Software Foundation, either
+version 2 of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+/**************************************************************************************************
+ * This module contains all reading functions, for both binary and text MergeTag files.
+ *************************************************************************************************/
+module mde.file.mergetag.Reader;
+
+// package imports
+public import mde.file.mergetag.iface.IReader;
+import mde.file.mergetag.DataSet;
+import mde.file.mergetag.DefaultData;
+import mde.file.mergetag.exception;
+import mde.file.mergetag.internal;
+
+import tango.core.Exception;
+
+// tango imports
+import tango.io.FilePath;
+import tango.io.UnicodeFile;
+import Util = tango.text.Util;
+import ConvInt = tango.text.convert.Integer;
+//import tango.util.collection.model.View : View;
+import tango.util.collection.HashSet : HashSet;
+import tango.util.log.Log : Log, Logger;
+
+// Module-level logger, initialised once by the module constructor below.
+// MTTReader declares its own static `logger` member, so code inside that class uses that one
+// rather than this module-level instance.
+// NOTE(review): the logger name still reads "mde.mergetag.Reader" although this module is declared
+// as mde.file.mergetag.Reader -- confirm whether the old name is kept deliberately.
+private Logger logger;
+static this() {
+    logger = Log.getLogger ("mde.mergetag.Reader");
+}
+
+// TODO: allow compressing with zlib for both binary and text? (.mtz, .mtt, .mtb extensions)
+
+/** Make an IReader class.
+*
+* Create an appropriate reader: MTTReader or MTBReader.
+*
+* Throws:
+*  $(TABLE
+*  $(TR $(TH Exception) $(TH Thrown when))
+*  $(TR $(TD MTFileIOException) $(TD When extension given is neither mtt nor mtb))
+*  )
+*
+*/
+IReader makeReader (FilePath path, DataSet ds = null, bool rdHeader = false) {
+    // The file extension alone decides which reader implementation is constructed.
+    char[] extension = path.ext;
+    
+    if (extension == "mtt") return new MTTReader (path, ds, rdHeader);  // text format
+    if (extension == "mtb") return new MTBReader (path, ds, rdHeader);  // binary format
+    
+    // Anything else is not a mergetag file as far as this factory is concerned.
+    throw new MTFileIOException ("Invalid mergetag extension");
+}
+
+/** Resolve a file path.
+ *
+ * Tries adding both ".mtt" and ".mtb" extensions, returning whichever exists (the most recently
+ * modified if both exist), or returns null if neither exist. */
+FilePath findFile (char[] path) {
+    // No base name given: nothing to resolve.
+    if (path is null) return null;
+    
+    // Candidate files: the text and the binary variant of the same base name.
+    FilePath textFile   = new FilePath (path ~ ".mtt");
+    FilePath binaryFile = new FilePath (path ~ ".mtb");
+    
+    bool haveBinary = binaryFile.exists;
+    bool haveText   = textFile.exists;
+    
+    if (haveText && haveBinary) {
+        // Both exist: prefer the text file only when it is strictly newer than the binary one.
+        return (textFile.modified > binaryFile.modified) ? textFile : binaryFile;
+    }
+    if (haveText)   return textFile;
+    if (haveBinary) return binaryFile;
+    return null;    // neither variant exists
+}
+
+/**
+ * Class for reading a mergetag text file.
+ * 
+ * Use as:
+ * -----------------------
+ * IReader foo;
+ * try {
+ *   foo = new MTTReader("foo.mtt");
+ *   foo.read();
+ * }
+ * catch (MTException) {}
+ * // get your data from foo.dataset.
+ * -----------------------
+ *
+ * Throws:
+ *  $(TABLE
+ *  $(TR $(TH Exception) $(TH Thrown when))
+ *  $(TR $(TD MTFileIOException) $(TD An error occurs while opening the file))
+ *  $(TR $(TD MTFileFormatException) $(TD The file doesn't start with a recognised header/version))
+ *  $(TR $(TD MTSyntaxException) $(TD A file syntax error occurs))
+ *  $(TR $(TD MTException) $(TD An unexpected error occurs))
+ *  )
+ * Note that all exceptions extend MTException and when any exception is thrown the class is
+ * rendered unusable: any subsequent calls to read will be ignored.
+ *
+ * Threading: Separate instances of Reader should be thread-safe provided access to the same
+ * dataset is synchronized; i.e. no two readers referring to the same dataset should run
+ * simultaneously. (The Reader class could be made thread-safe w.r.t. datasets, but
+ * performance-wise I doubt it would be worth it.)
+ * Do not run a single instance of Reader in multiple threads simultaneously.
+ */
+class MTTReader : IReader
+{
+//BEGIN DATA
+    /** Get or set the DataSet
+    *
+    * A container for all read data.
+    *
+    * This may be accessed from here; however it may be preferable to use an external reference
+    * (passed to the class on initialisation).
+    */
+    DataSet dataset () {	return _dataset;	}	// getter: the DataSet this reader stores parsed data in
+    void dataset (DataSet ds)	/// ditto
+    {	_dataset = ds;	}	// setter: only swaps the reference; nothing is copied or re-read here
+    
+    /** A delegate for creating new DataSections within the dataset.
+    *
+    * Allows a user-made class to be used in the DataSet instead of DefaultData (used if no
+    * dataSecCreator exists). Also allows an existing class instance to be used instead of a new
+    * one.
+    *
+    * This works by supplying a function which returns a reference to an instance of a class
+    * implementing IDataSection. The function is passed the ID of the new section and may use this
+    * to use different IDataSection classes for different sections.
+    *
+    * The function may also return null, in which case the section will be skipped. In the version
+    * of read taking a set of sections to read, the section will not be marked as read and may
+    * still be read later (assuming dataSecCreator returns non-null). However, in the version of
+    * read not taking the set argument, all sections are set as read regardless, and the section
+    * cannot be read later.
+    */
+    void dataSecCreator (IDataSection delegate (ID) dSC) {
+        // Stored as-is; getOrCreateSec consults it whenever a section is not yet in the dataset.
+        // Passing null restores the default behaviour (a new DefaultData per section).
+        _dataSecCreator = dSC;
+    }
+    
+private:
+    static Logger logger;		// class-wide logger; assigned in the class's static constructor
+    
+    // Non-static symbols:
+    final char[] ErrFile;		// file name used in error messages: path.path ~ path.file (set by the constructor)
+    final char[] ErrInFile;		// ErrFile wrapped for messages: " in \"path/file.mtt\"" (note the leading space)
+    
+    final char[] fbuf;			// the whole file content; the constructor reads the file into this
+    MTFormatVersion.VERS fileVer = MTFormatVersion.VERS.INVALID;	// Remains INVALID until set otherwise by CTOR.
+    
+    IDataSection delegate (ID) _dataSecCreator = null;   // see property setter above
+    
+    size_t endOfHeader;			// index into fbuf returned by parsing/skipping the header section in the constructor
+    bool allRead = false;		// set to true at the end of read() (the no-argument overload); read(View!(ID)) is a no-op afterwards
+    bool fatal = false;			// set by throwMTErr: a fatal file error occurred; don't try to recover
+    /* If the file is scanned for sections, the starting position of all sections are stored
+    * in secTable. If this is empty, either no sections exist (and endOfHeader == fbuf.length)
+    * or a section scan has not been run (read() with no section names doesn't need to do so).
+    */
+    struct SecMD {	// sec meta data
+        // Convenience constructor: SecMD(pos, read) builds an initialised value.
+        static SecMD opCall (size_t _pos, bool _read) {
+            SecMD ret;
+            ret.pos = _pos;
+            ret.read = _read;
+            return ret;
+        }
+        size_t pos;			// position to start reading
+        bool read;			// true if already read
+    }
+    SecMD [ID] secTable;		// section ID -> where its content starts in fbuf and whether it has been read
+    
+    DataSet _dataset;			// backing field for the dataset property
+//END DATA
+    
+//BEGIN METHODS: CTOR / DTOR
+    // Class static constructor: sets up the class-wide logger used by all MTTReader methods.
+    static this () {
+        logger = Log.getLogger ("mde.mergetag.read.Reader");
+    }
+    
+    /** Tries to open file path and read it into a buffer.
+     *
+     * Params:
+     * path     = The name or FilePath of the file to open.
+     *     Standard extensions are .mtt and .mtb for text and binary files respectively.
+     * ds       = If null create a new DataSet, else use existing DataSet ds and merge read
+     *     data into it.
+     * rdHeader = If true, read the header like a standard section. Doesn't read the header by
+     *     default since if it's not requested it's likely not wanted.
+     *
+     * Memory:
+     * This currently works by loading the whole file into memory at once. This should be fine most
+     * of the time, but could potentially be a problem. Changing this would mean significant
+     * changes to the way the code works.
+     */
+    /* Ideas for implementing a partial-loading memory model:
+     * Use a conduit directly.
+     * Use a fiber to do the parsing; let it switch back when it runs out of memory.
+     * Redesign the code so it never needs to look backwards in the buffer?
+     *
+     * Major problem: reading only some sections and keeping references to other sections
+     * would no longer be possible.
+     */
+    public this (char[] path, DataSet ds = null, bool rdHeader = false) {
+        // Convenience overload: wrap the string in a FilePath and delegate to the FilePath constructor.
+        this (new FilePath (path), ds, rdHeader);
+    }
+    /** ditto */
+    public this (FilePath path, DataSet ds = null, bool rdHeader = false) {
+        // Merge into the caller's DataSet when one is supplied; otherwise start with a fresh one.
+        _dataset = (ds is null) ? new DataSet() : ds;
+        
+        // Load the complete file into fbuf.
+        // UnicodeFile honours a BOM if present and falls back to UTF-8 when there is none.
+        try {
+            scope file = new UnicodeFile!(char) (path, Encoding.Unknown);
+            fbuf = cast(char[]) file.read();
+        } catch (Exception e) {
+            throwMTErr ("Error reading file: " ~ e.msg, new MTFileIOException);
+        }
+        
+        // Keep the file name around so later error messages can say where they came from.
+        ErrFile = path.path ~ path.file;
+        ErrInFile = " in \"" ~ ErrFile ~ '"';
+        
+        // The file must open with a header section tag of the form {MTxx}, xx being the version.
+        bool hasHeaderTag = fbuf.length >= 6
+                && fbuf[0] == '{' && fbuf[1] == 'M' && fbuf[2] == 'T' && fbuf[5] == '}';
+        if (!hasHeaderTag)
+            throwMTErr("Not a valid MergeTag text file" ~ ErrInFile, new MTFileFormatException);
+        
+        char[] versionChars = fbuf[3..5];
+        fileVer = MTFormatVersion.parseString (versionChars);
+        if (fileVer == MTFormatVersion.VERS.INVALID)
+            throwMTErr("Unrecognised MergeTag version: MT" ~ versionChars ~ ErrInFile, new MTFileFormatException);
+        
+        // Header handling: parse it into dataset.header only when requested (merging into an
+        // existing header if there is one); otherwise parse with a null target, which skips it.
+        IDataSection headerTarget = null;
+        if (rdHeader) {
+            if (!_dataset.header) _dataset.header = new DefaultData;
+            headerTarget = cast(IDataSection) _dataset.header;
+        }
+        endOfHeader = parseSection (6, headerTarget);
+    }
+//END METHODS: CTOR / DTOR
+    
+//BEGIN METHODS: PUBLIC
+    /** Scans for sections if not already done and returns a list of IDs.
+    *
+    * Won't work (will return an empty array) if all sections have already been read without
+    * scanning for sections.
+    */
+    public ID[] getSectionNames () {
+        if (!fatal) {
+            // An empty table means no scan has been done yet (or there is nothing to find):
+            // reading an empty set of sections performs the scan without reading any data.
+            if (secTable.length == 0) read([]);
+            return secTable.keys;
+        }
+        // After a fatal error the reader is unusable; report no sections.
+        return [];
+    }
+    
+    /** Reads (some) sections of the file into data. Note that sections will never be _read twice.
+    *
+    * To be more accurate, the file is copied into a buffer by this(). read() then parses the
+    * contents of this buffer, and stores the contents in dataset.
+    *
+    * Each section read is stored in a DataSection class. By default this is an instance of
+    * DefaultData; this can be customised (see dataSecCreator).
+    *
+    * If secSet is provided, reading is restricted to sections given in secSet, otherwise all
+    * sections are read. Sections given in secSet but not found in the file are not reported as an
+    * error. Suggested: supply a HashSet!(uint) as the View!(ID). An ArrayBag!(ID) as used is not a
+    * good choice, except that in this case it's empty.
+    *
+    * Merging:
+    * Where a section already exists in the DataSet (when either the section is given more than
+    * once in the file, or it was read from a different file by another reader) it is merged.
+    * Entries already in the DataSet take priority.
+    *
+    * Performance:
+    * Note that loading only desired sections like this still parses the sections not
+    * read (although it does not try to understand the type or data fields), so there is only a
+    * small performance advantage to this where other sections do exist in the file. There is also
+    * some overhead in only partially reading the file to keep track of where other sections are so
+    * that the entire file need not be re-read if further (or all remaining) sections are read
+    * later.
+    */
+    public void read () {
+        // Honour the documented contract: sections are never read twice, and once a fatal error
+        // has occurred any further call to read is ignored. read(View!(ID)) already has this
+        // guard; without it here a second call would re-parse every section, and a call after a
+        // fatal error would retry on a reader that is meant to be dead.
+        if (allRead || fatal) return;
+        
+        if (secTable.length) {
+            // A previous read(secSet) call already located every section: parse only those
+            // which have not been read yet, starting from their recorded positions.
+            foreach (ID id, ref SecMD smd; secTable) {
+                if (!smd.read) {
+                    IDataSection ds = getOrCreateSec (id);  // null => parseSection just skips it
+                    parseSection (smd.pos, ds);
+                    // allRead is set true so there's no point setting smd.read = true
+                }
+            }
+        } else {					// this time we don't need to use secTable
+            // Single pass over everything after the header: section marker, then its content.
+            for (size_t pos = endOfHeader; pos < fbuf.length;) {
+                ID id = fbufReadSecMarker (pos);
+                IDataSection ds = getOrCreateSec (id);      // null => parseSection just skips it
+                pos = parseSection (pos, ds);
+            }
+        }
+        
+        // Only reached when no exception was thrown: everything readable has now been read.
+        allRead = true;
+    }
+    /** ditto */
+    public void read (ID[] secSet) {
+        // Copy the requested IDs into a set and hand over to the View!(ID) overload.
+        auto wanted = new HashSet!(ID);
+        foreach (sectionId; secSet)
+            wanted.add(sectionId);
+        read (wanted);
+    }
+    /** ditto */
+    public void read (View!(ID) secSet) {
+        // Nothing to do once everything has been read, or after a fatal error.
+        if (allRead || fatal) return;
+        
+        if (secTable.length == 0) {
+            // No section table yet: walk the file once, recording where each section starts
+            // and parsing the requested sections on the way (all others are skipped).
+            size_t pos = endOfHeader;
+            while (pos < fbuf.length) {
+                ID id = fbufReadSecMarker (pos);
+                secTable[id] = SecMD(pos,false);
+                
+                // A null target makes parseSection skip the section's content. That happens both
+                // for sections not asked for and when no IDataSection could be obtained.
+                IDataSection target = null;
+                if (secSet.contains(id)) target = getOrCreateSec (id);
+                pos = parseSection (pos, target);
+                
+                if (target !is null) secTable[id].read = true;
+            }
+            return;
+        }
+        
+        // Section positions are already known: go straight to each requested, unread section.
+        foreach (ID id; secSet) {
+            SecMD* entry = id in secTable;
+            if (entry is null || entry.read) continue;  // not in the file, or read already
+            
+            IDataSection target = getOrCreateSec (id);
+            parseSection (entry.pos, target);
+            // Only mark as read when the data actually went somewhere (target may be null).
+            if (target !is null) entry.read = true;
+        }
+    }
+//END METHODS: PUBLIC
+    
+//BEGIN METHODS: PRIVATE
+    /* Utility function for read
+    * Look for a section; return it if it exists otherwise create a new section:
+    *   use _dataSecCreator if it exists or just create a DefaultData if not.
+    * However if _dataSecCreator returns null don't add it to the dataset.
+    */
+    private IDataSection getOrCreateSec (ID id) {
+        // Reuse the section if the dataset already holds one under this ID.
+        IDataSection* existing = id in _dataset.sec;
+        if (existing !is null) return *existing;
+        
+        // Otherwise make one: via the user-supplied creator if set, else a plain DefaultData.
+        IDataSection created;
+        if (_dataSecCreator is null) created = new DefaultData;
+        else created = _dataSecCreator(id);
+        
+        // The creator may decline by returning null; in that case the dataset is left untouched.
+        if (created is null) return null;
+        
+        _dataset.sec[id] = created;
+        return created;
+    }
+    
+    /* Reads a section, starting from index pos, finishing at the next section marker (returning
+    the position of the start of the marker). pos should start after the section marker.
+    
+    After analysing tags, the function passes the type, ID and data to addTag.
+    
+    NOTE: from performance tests on indexing char[]'s and dereferencing char*'s, the char*'s are
+    slightly faster, but a tiny difference isn't worth the extra effort/risk of using char*'s.
+    */
+    // Parses the content of one section of fbuf, beginning at index pos.
+    //
+    // Params:
+    //  pos  = index in fbuf at which to start (just after a section marker).
+    //  dsec = receiver for the tags found, via dsec.addTag (type, id, data). If null the section is
+    //         still scanned (and syntax-checked) but no tag is passed on, i.e. it is skipped.
+    //
+    // Returns: the index of the '{' starting the next section marker, or the position at which
+    // the scan ran off the end of fbuf when no further marker exists.
+    //
+    // Errors: syntax errors and unexpected EOF inside a tag or comment go through throwMTErr
+    // (MTSyntaxException). A TextException from addTag is logged and the tag ignored; any other
+    // Exception from addTag is logged and rethrown via throwMTErr as a plain MTException.
+    private size_t parseSection (size_t pos, IDataSection dsec) {
+        debug scope (failure)
+                logger.trace ("MTTReader.parseSection: failure");
+        /* Searches fbuf, starting from the character after pos, to find one of <=>| and stops at
+        its index (pos is updated in place).
+    
+        If quotable then be quote-aware for single and double quotes.
+        Note: there's no length restriction for the content of the quote since it could be a single
+        non-ascii UTF-8 char which would look like several chars.
+        
+        Running off the end of fbuf is reported by fbufIncrement (unexpected EOF).
+        */
+        void fbufLocateDataTagChar (ref size_t pos, bool quotable) {
+            while (true) {
+                fbufIncrement (pos);
+                
+                // '<', '=' and '>' are adjacent character codes, hence the range test.
+                if ((fbuf[pos] >= '<' && fbuf[pos] <= '>') || fbuf[pos] == '|') return;
+                else if (quotable) {
+                    char c = fbuf[pos];
+                    if (c == '\'' || c == '"') {
+                        // Inside a quote: skip ahead to the matching closing quote character.
+                        fbufIncrement(pos);
+                        while (fbuf[pos] != c) {
+                            if (fbuf[pos] == '\\') ++pos;	// escape seq.: also skip the escaped char
+                            fbufIncrement(pos);
+                        }
+                    }
+                }
+            }
+        }
+        
+        // Used to ignore a tag (if it starts !< or !{ or should otherwise be ignored):
+        bool comment = false;
+        for (; pos < fbuf.length; ++pos) {
+            if (Util.isSpace(fbuf[pos])) continue;	// whitespace (does not reset the comment flag)
+            else if (fbuf[pos] == '<') {		// data tag
+                char[] ErrDTAG = "Bad data tag format: not <type|id=data>" ~ ErrInFile;
+                
+                // Type section of tag:
+                size_t pos_s = pos + 1;
+                fbufLocateDataTagChar (pos, false);	// find end of type section
+                if (fbuf[pos] != '|') throwMTErr (ErrDTAG, new MTSyntaxException);
+                char[] type = fbuf[pos_s..pos];
+                
+                // ID section of tag:
+                pos_s = pos + 1;
+                fbufLocateDataTagChar (pos, false);	// find end of ID section
+                if (fbuf[pos] != '=') throwMTErr (ErrDTAG, new MTSyntaxException);
+                ID tagID = cast(ID) fbuf[pos_s..pos];
+                
+                // Data section of tag:
+                pos_s = pos + 1;
+                fbufLocateDataTagChar (pos, true);      // find end of data section (quote-aware)
+                if (fbuf[pos] != '>') throwMTErr (ErrDTAG, new MTSyntaxException);
+                char[] data = fbuf[pos_s..pos];
+                
+                // The tag is always scanned in full (so syntax errors are caught even in skipped
+                // or commented-out tags) but only handed on when wanted.
+                if (!comment && dsec !is null) {
+                    type = Util.trim(type);
+                    try {
+                        dsec.addTag (type, tagID, data);
+                    }
+                    catch (TextException e) {
+                        logger.error ("TextException while reading " ~ ErrFile ~ ":");	// following a parse error
+                        logger.error (e.msg);
+                        logger.error ("Tag ignored: <"~type~"|"~tagID~"="~data~">");
+                        // No throw: tag is just ignored
+                    }
+                    catch (Exception e) {
+                        logger.error ("Unknown error occured" ~ ErrInFile ~ ':');
+                        logger.error (e.msg);
+                        throwMTErr (e.msg);             // Fatal to Reader
+                    }
+                } else comment = false;			// cancel comment status now
+            }
+            else if (fbuf[pos] == '{') {
+                if (comment) {				// simple block comment
+                    uint depth = 0;			// depth of embedded comment blocks
+                    while (true) {
+                        fbufIncrement (pos);
+                        if (fbuf[pos] == '}') {
+                            if (depth == 0) break;	// this '}' closes the comment block itself
+                            else --depth;
+                        } else if (fbuf[pos] == '{')
+                            ++depth;
+                    }
+                    comment = false;			// end of this comment
+                } else {
+                    return pos;				// next section coming up; we are done
+                }
+            }
+            else if (fbuf[pos] == '!') {		// possibly a comment; check next char
+                comment = true;				// starting a comment (or an error)
+                					// variable is reset at end of comment
+            } else					// must be an error
+            throwMTErr ("Invalid character (or sequence starting \"!\") outside of tag" ~ ErrInFile, new MTSyntaxException);
+        }
+        // if code execution reaches here, we're at EOF
+        // possible error: last character was ! (but don't bother checking since it's inconsequential)
+        return pos;
+    }
+    
+    /* Parses fbuf for a section marker. Already knows fbuf[pos] == '{'.
+    */
+    // Reads the section marker "{id}" whose opening '{' is at fbuf[pos].
+    //
+    // Params:
+    //  pos = on entry the index of the '{'; on return the index just after the closing '}'.
+    //        This may equal fbuf.length when the marker is the very last thing in the file.
+    //
+    // Returns: the ID between the braces (a slice of fbuf cast to ID).
+    //
+    // Errors: throwMTErr (MTSyntaxException) if the buffer ends before an ID starts, or if a '{'
+    // or the end of the buffer is met before the closing '}'.
+    private ID fbufReadSecMarker (ref size_t pos) {
+        // at this point pos is whatever a parseSection run returned
+        // since we haven't hit EOF, fbuf[pos] MUST be '{' so no need to check
+        fbufIncrement(pos);     // step over '{'; a lone '{' at EOF is an error
+        
+        size_t start = pos;
+        for (; pos < fbuf.length; ++pos)
+            if (fbuf[pos] == '}' || fbuf[pos] == '{') break;
+        
+        if (pos >= fbuf.length || fbuf[pos] != '}')
+            throwMTErr ("Bad section tag format: not {id}" ~ ErrInFile, new MTSyntaxException);
+        
+        ID id = cast(ID) fbuf[start..pos];
+        // Step past '}' WITHOUT the EOF check: a file may legitimately end directly after a section
+        // marker (an empty last section with no trailing newline). Using fbufIncrement here made
+        // "{MT01}{a}" a syntax error while "{MT01}{a}\n" and a bare "{MT01}" were accepted. Callers
+        // loop on pos < fbuf.length and parseSection returns immediately for pos == fbuf.length.
+        ++pos;
+        return id;
+    }
+    
+    /* Increments pos and checks it hasn't hit fbuf.length . */
+    private void fbufIncrement(ref size_t pos) {
+        pos += 1;
+        // Still inside the buffer: fbuf[pos] is safe to index, nothing more to do.
+        if (pos < fbuf.length) return;
+        // Ran off the end while the caller still expected more input.
+        throwMTErr("Unexpected EOF" ~ ErrInFile, new MTSyntaxException);
+    }
+    
+    // Central error exit: marks the reader as dead, logs msg and throws exc.
+    //
+    // Params:
+    //  msg = text written to the log. Note it is only logged here; it is not attached to exc.
+    //  exc = the exception object to throw; a plain MTException when the caller gives none.
+    //
+    // Never returns normally.
+    private void throwMTErr (char[] msg, MTException exc = new MTException) {
+        fatal = true;	// if anyone catches the error and tries to do anything --- we're dead now
+        logger.error (msg);	// report the error
+        throw exc;		// and signal our error
+    }
+//END METHODS: PRIVATE
+}
+
+
+/**
+* Class for reading a mergetag binary file.
+*
+* Currently only a dummy class: a MTNotImplementedException will be thrown if created.
+*/
+class MTBReader : IReader
+{
+    // Convenience overload: wraps the string in a FilePath and delegates to the constructor below
+    // (which, for now, always throws).
+    public this (char[] path, DataSet ds = null, bool rdHeader = false) {
+        this (new FilePath (path), ds, rdHeader);
+    }
+    // Binary reading is not implemented yet: construction always fails with
+    // MTNotImplementedException, so none of the methods below can be reached on a live instance.
+    public this (PathView path, DataSet ds = null, bool rdHeader = false) {
+        throw new MTNotImplementedException;
+    }
+        
+    // The remaining members are do-nothing stubs: getters return null / an empty array and
+    // setters / read calls ignore their arguments.
+    DataSet dataset () {                /// Get the DataSet
+        return null;
+    }
+    void dataset (DataSet) {}           /// Set the DataSet
+    
+    void dataSecCreator (IDataSection delegate (ID)) {} /// Set the dataSecCreator
+    
+    ID[] getSectionNames () {           /// Get identifiers for all sections
+        return [];
+    }
+    void read () {}                     /// Commence reading
+    void read (ID[] secSet) {}          /// ditto
+    void read (View!(ID) secSet) {}     /// ditto
+}