Mercurial > projects > ldc

diff tango/tango/io/UnicodeFile.d @ 132:1700239cab2e trunk
[svn r136] MAJOR UNSTABLE UPDATE!!! Initial commit after moving to Tango instead of Phobos. Lots of bugfixes... This build is not suitable for most things.
author: lindquist
date: Fri, 11 Jan 2008 17:57:40 +0100
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tango/tango/io/UnicodeFile.d	Fri Jan 11 17:57:40 2008 +0100
@@ -0,0 +1,251 @@
+/*******************************************************************************
+
+        copyright:      Copyright (c) 2005 Kris Bell. All rights reserved
+
+        license:        BSD style: $(LICENSE)
+
+        version:        Initial release: December 2005      
+        
+        author:         Kris
+
+*******************************************************************************/
+
+module tango.io.UnicodeFile;
+
+public  import  tango.io.FilePath;
+
+private import  tango.io.FileConduit;
+
+private import  tango.core.Exception;
+
+public  import  tango.text.convert.UnicodeBom;
+
+/*******************************************************************************
+
+        Read and write unicode files
+
+        For our purposes, unicode files are an encoding of textual material.
+        The goal of this module is to interface that external-encoding with
+        a programmer-defined internal-encoding. This internal encoding is
+        declared via the template argument T, whilst the external encoding
+        is either specified or derived.
+
+        Three internal encodings are supported: char, wchar, and dchar. The
+        methods herein operate upon arrays of this type. For example, read()
+        returns an array of the type, whilst write() and append() expect an
+        array of said type.
+
+        Supported external encodings are as follows:
+
+                $(UL Encoding.Unknown)
+                $(UL Encoding.UTF_8)
+                $(UL Encoding.UTF_8N)
+                $(UL Encoding.UTF_16)
+                $(UL Encoding.UTF_16BE)
+                $(UL Encoding.UTF_16LE) 
+                $(UL Encoding.UTF_32)
+                $(UL Encoding.UTF_32BE)
+                $(UL Encoding.UTF_32LE) 
+
+        These can be divided into implicit and explicit encodings. Here are
+        the implicit subset:
+
+                $(UL Encoding.Unknown)
+                $(UL Encoding.UTF_8)
+                $(UL Encoding.UTF_16)
+                $(UL Encoding.UTF_32) 
+
+        Implicit encodings may be used to 'discover'
+        an unknown encoding, by examining the first few bytes of the file
+        content for a signature. This signature is optional for all files, 
+        but is often written such that the content is self-describing. When
+        the encoding is unknown, using one of the non-explicit encodings will
+        cause the read() method to look for a signature and adjust itself 
+        accordingly. It is possible that a ZWNBSP character might be confused 
+        with the signature; today's files are supposed to use the WORD-JOINER 
+        character instead.
+
+        Explicit encodings are as follows:
+       
+                $(UL Encoding.UTF_8N)
+                $(UL Encoding.UTF_16BE)
+                $(UL Encoding.UTF_16LE) 
+                $(UL Encoding.UTF_32BE)
+                $(UL Encoding.UTF_32LE) 
+        
+        This group of encodings is for use when the file encoding is
+        known. These *must* be used when writing or appending, since written
+        content must be in a known format. It should be noted that, during a
+        read operation, the presence of a signature is in conflict with these 
+        explicit varieties.
+
+        Method read() returns the current content of the file, whilst write()
+        sets the file content, and file length, to the provided array. Method
+        append() adds content to the tail of the file. When appending, it is
+        your responsibility to ensure the existing and current encodings are
+        correctly matched.
+
+        Methods to inspect the file system, check the status of a file or
+        directory, and other facilities are made available via the PathView
+        instance returned by the path() method.
+
+        See these links for more info:
+        $(UL $(LINK http://www.utf-8.com/))
+        $(UL $(LINK http://www.hackcraft.net/xmlUnicode/))
+        $(UL $(LINK http://www.unicode.org/faq/utf_bom.html))
+        $(UL $(LINK http://www.azillionmonkeys.com/qed/unicode.html))
+        $(UL $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/))
+
+*******************************************************************************/
+
+class UnicodeFile(T)
+{
+        private UnicodeBom!(T)  bom;    // converts between T[] and the external encoding
+        private PathView        path_;  // path of the file to read and write
+
+        /***********************************************************************
+        
+                Construct a UnicodeFile for the file named by a PathView. The
+                given encoding represents the external file encoding, and
+                should be one of the Encoding.xx types 
+
+        ***********************************************************************/
+                                  
+        this (PathView path, Encoding encoding)
+        {
+                bom = new UnicodeBom!(T)(encoding);
+                path_ = path;
+        }
+
+        /***********************************************************************
+        
+                Construct a UnicodeFile from a text string, which is wrapped
+                in a new FilePath. The provided encoding represents the
+                external file encoding; use one of the Encoding.xx types 
+
+        ***********************************************************************/
+
+        this (char[] path, Encoding encoding)
+        {
+                this (new FilePath(path), encoding);
+        }
+
+        /***********************************************************************
+
+                Call-site shortcut which allocates and returns a UnicodeFile
+                instance. This enables the same syntax as struct usage, so
+                may expose a migration path
+
+        ***********************************************************************/
+
+        static UnicodeFile opCall (char[] name, Encoding encoding)
+        {
+                return new UnicodeFile (name, encoding);
+        }
+
+        /***********************************************************************
+
+                Return the associated path, as a PathView
+
+        ***********************************************************************/
+
+        PathView path ()
+        {
+                return path_;
+        }
+        
+        /***********************************************************************
+
+                Return the current encoding, as held by the UnicodeBom. This
+                is either the originally specified encoding, or a derived one
+                obtained by inspecting the file content for a BOM. The latter
+                is performed as part of the read() method.
+
+        ***********************************************************************/
+
+        Encoding encoding ()
+        {
+                return bom.encoding();
+        }
+        
+        /***********************************************************************
+
+                Return the entire content of the file, decoded to T[]. The
+                content is inspected for a BOM signature, which is stripped.
+                An exception is thrown if a signature is present when,
+                according to the encoding type, it should not be. Conversely,
+                an exception is thrown if there is no known signature where
+                the current encoding expects one to be present.
+
+        ***********************************************************************/
+
+        T[] read ()
+        {
+                scope conduit = new FileConduit (path_);  
+                scope (exit)
+                       conduit.close;
+
+                // allocate enough space for the entire file (length is narrowed to uint)
+                auto content = new ubyte [cast(uint) conduit.length];
+
+                // read the content in one call; a short count is reported as an error
+                if (conduit.read (content) != content.length)
+                    conduit.error ("unexpected eof");
+
+                return bom.decode (content);
+        }
+
+        /***********************************************************************
+
+                Set the file content and length to reflect the given array.
+                The content will be encoded accordingly, and preceded by the
+                BOM signature when writeBom is true. Returns this instance.
+        ***********************************************************************/
+
+        UnicodeFile write (T[] content, bool writeBom = false)
+        {
+                return write (content, FileConduit.ReadWriteCreate, writeBom);  
+        }
+
+        /***********************************************************************
+
+                Append content to the file; the content will be encoded 
+                accordingly, and no BOM signature is written.
+
+                Note that it is your responsibility to ensure the 
+                existing and current encodings are correctly matched.
+
+        ***********************************************************************/
+
+        UnicodeFile append (T[] content)
+        {
+                return write (content, FileConduit.WriteAppending, false);  
+        }
+
+        /***********************************************************************
+
+                Internal method to perform writing of content, opening the
+                file with the given style. Note that the encoding must be of
+                the explicit variety by the time we get here.
+
+        ***********************************************************************/
+
+        private final UnicodeFile write (T[] content, FileConduit.Style style, bool writeBom)
+        {       
+                // convert to external representation (may throw an exception)
+                void[] converted = bom.encode (content);
+
+                // open file after conversion ~ in case of exceptions
+                scope conduit = new FileConduit (path_, style);  
+                scope (exit)
+                       conduit.close;
+
+                if (writeBom)
+                    conduit.write (bom.signature);
+
+                // and write the encoded content
+                conduit.write (converted);
+                return this;
+        }
+}
+
author	lindquist
date	Fri, 11 Jan 2008 17:57:40 +0100
parents
children