comparison tango/tango/io/UnicodeFile.d @ 132:1700239cab2e trunk

[svn r136] MAJOR UNSTABLE UPDATE!!! Initial commit after moving to Tango instead of Phobos. Lots of bugfixes... This build is not suitable for most things.
author lindquist
date Fri, 11 Jan 2008 17:57:40 +0100
parents
children
comparison
equal deleted inserted replaced
131:5825d48b27d1 132:1700239cab2e
1 /*******************************************************************************
2
3 copyright: Copyright (c) 2005 Kris Bell. All rights reserved
4
5 license: BSD style: $(LICENSE)
6
7 version: Initial release: December 2005
8
9 author: Kris
10
11 *******************************************************************************/
12
13 module tango.io.UnicodeFile;
14
15 public import tango.io.FilePath;
16
17 private import tango.io.FileConduit;
18
19 private import tango.core.Exception;
20
21 public import tango.text.convert.UnicodeBom;
22
23 /*******************************************************************************
24
25 Read and write unicode files
26
27 For our purposes, unicode files are an encoding of textual material.
28 The goal of this module is to interface that external-encoding with
29 a programmer-defined internal-encoding. This internal encoding is
30 declared via the template argument T, whilst the external encoding
31 is either specified or derived.
32
33 Three internal encodings are supported: char, wchar, and dchar. The
34 methods herein operate upon arrays of this type. For example, read()
35 returns an array of the type, whilst write() and append() expect an
36 array of said type.
37
38 Supported external encodings are as follows:
39
40 $(UL Encoding.Unknown)
41 $(UL Encoding.UTF_8)
42 $(UL Encoding.UTF_8N)
43 $(UL Encoding.UTF_16)
44 $(UL Encoding.UTF_16BE)
45 $(UL Encoding.UTF_16LE)
46 $(UL Encoding.UTF_32)
47 $(UL Encoding.UTF_32BE)
48 $(UL Encoding.UTF_32LE)
49
50 These can be divided into implicit and explicit encodings. Here are
51 the implicit subset:
52
53 $(UL Encoding.Unknown)
54 $(UL Encoding.UTF_8)
55 $(UL Encoding.UTF_16)
56 $(UL Encoding.UTF_32)
57
58 Implicit encodings may be used to 'discover'
59 an unknown encoding, by examining the first few bytes of the file
60 content for a signature. This signature is optional for all files,
61 but is often written such that the content is self-describing. When
62 the encoding is unknown, using one of the non-explicit encodings will
63 cause the read() method to look for a signature and adjust itself
64 accordingly. It is possible that a ZWNBSP character might be confused
65 with the signature; today's files are supposed to use the WORD-JOINER
66 character instead.
67
68 Explicit encodings are as follows:
69
70 $(UL Encoding.UTF_8N)
71 $(UL Encoding.UTF_16BE)
72 $(UL Encoding.UTF_16LE)
73 $(UL Encoding.UTF_32BE)
74 $(UL Encoding.UTF_32LE)
75
76 This group of encodings is for use when the file encoding is
77 known. These *must* be used when writing or appending, since written
78 content must be in a known format. It should be noted that, during a
79 read operation, the presence of a signature is in conflict with these
80 explicit varieties.
81
82 Method read() returns the current content of the file, whilst write()
83 sets the file content, and file length, to the provided array. Method
84 append() adds content to the tail of the file. When appending, it is
85 your responsibility to ensure the existing and current encodings are
86 correctly matched.
87
88 Methods to inspect the file system, check the status of a file or
89 directory, and other facilities are made available via the path
90 instance returned by the path() method.
91
92 See these links for more info:
93 $(UL $(LINK http://www.utf-8.com/))
94 $(UL $(LINK http://www.hackcraft.net/xmlUnicode/))
95 $(UL $(LINK http://www.unicode.org/faq/utf_bom.html/))
96 $(UL $(LINK http://www.azillionmonkeys.com/qed/unicode.html/))
97 $(UL $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/))
98
99 *******************************************************************************/
100
class UnicodeFile(T)
{
        private UnicodeBom!(T)  bom;
        private PathView        path_;

        /***********************************************************************

                Construct a UnicodeFile from the provided path. The given
                encoding represents the external file encoding, and should
                be one of the Encoding.xx types

        ***********************************************************************/

        this (PathView path, Encoding encoding)
        {
                bom = new UnicodeBom!(T)(encoding);
                path_ = path;
        }

        /***********************************************************************

                Construct a UnicodeFile from a text string, which is wrapped
                in a new FilePath. The provided encoding represents the
                external file encoding, and should be one of the Encoding.xx
                types

        ***********************************************************************/

        this (char[] path, Encoding encoding)
        {
                this (new FilePath(path), encoding);
        }

        /***********************************************************************

                Call-site shortcut to create a UnicodeFile instance. This
                enables the same syntax as struct usage, so may expose
                a migration path

        ***********************************************************************/

        static UnicodeFile opCall (char[] name, Encoding encoding)
        {
                return new UnicodeFile (name, encoding);
        }

        /***********************************************************************

                Return the path (as a PathView) this instance was
                constructed with

        ***********************************************************************/

        PathView path ()
        {
                return path_;
        }

        /***********************************************************************

                Return the current encoding, as reported by the underlying
                UnicodeBom. This is either the originally specified
                encoding, or a derived one obtained by inspecting the file
                content for a BOM. The latter is performed as part of the
                read() method.

        ***********************************************************************/

        Encoding encoding ()
        {
                return bom.encoding();
        }

        /***********************************************************************

                Return the content of the file. The content is inspected
                for a BOM signature, which is stripped. An exception is
                thrown if a signature is present when, according to the
                encoding type, it should not be. Conversely, an exception
                is thrown if there is no known signature where the current
                encoding expects one to be present.

                An error is also raised, via the conduit, when the file is
                too large to be held in a single array or when the file
                ends before the reported length has been read.

        ***********************************************************************/

        T[] read ()
        {
                scope conduit = new FileConduit (path_);
                scope (exit)
                       conduit.close;

                // refuse a length which cannot be represented as an array
                // length, rather than silently truncating it (the prior
                // cast(uint) returned partial content for huge files)
                auto size = conduit.length;
                if (size < 0 || cast(ulong) size > size_t.max)
                    conduit.error ("file too large");

                // allocate enough space for the entire file
                auto content = new ubyte [cast(size_t) size];

                // read the content; a single read may legitimately return
                // fewer bytes than requested, so keep going until the
                // buffer is full
                size_t filled = 0;
                while (filled < content.length)
                      {
                      auto got = conduit.read (content [filled .. $]);

                      // zero progress, or a count larger than what was
                      // asked for (which is how an Eof sentinel appears),
                      // means the file ended early
                      if (got == 0 || got > content.length - filled)
                          conduit.error ("unexpected eof");

                      filled += got;
                      }

                return bom.decode (content);
        }

        /***********************************************************************

                Set the file content and length to reflect the given array.
                The content will be encoded accordingly. When writeBom is
                true, the BOM signature is emitted ahead of the content.

                Returns this instance, so that calls may be chained.

        ***********************************************************************/

        UnicodeFile write (T[] content, bool writeBom = false)
        {
                return write (content, FileConduit.ReadWriteCreate, writeBom);
        }

        /***********************************************************************

                Append content to the file; the content will be encoded
                accordingly. No BOM signature is written.

                Note that it is your responsibility to ensure the
                existing and current encodings are correctly matched.

                Returns this instance, so that calls may be chained.

        ***********************************************************************/

        UnicodeFile append (T[] content)
        {
                return write (content, FileConduit.WriteAppending, false);
        }

        /***********************************************************************

                Internal method to perform writing of content. Note that
                the encoding must be of the explicit variety by the time
                we get here.

        ***********************************************************************/

        private final UnicodeFile write (T[] content, FileConduit.Style style, bool writeBom)
        {
                // convert to external representation (may throw an exception)
                void[] converted = bom.encode (content);

                // open file after conversion ~ in case of exceptions
                scope conduit = new FileConduit (path_, style);
                scope (exit)
                       conduit.close;

                if (writeBom)
                    writeAll (conduit, bom.signature);

                // and write
                writeAll (conduit, converted);
                return this;
        }

        /***********************************************************************

                Write every byte of data to the conduit, retrying after
                partial writes. An error is raised via the conduit when
                no further progress can be made.

        ***********************************************************************/

        private static void writeAll (FileConduit conduit, void[] data)
        {
                while (data.length)
                      {
                      auto done = conduit.write (data);

                      // zero progress, or a count beyond what was offered
                      // (an Eof sentinel), means the write has failed
                      if (done == 0 || done > data.length)
                          conduit.error ("unexpected eof on write");

                      data = data [done .. $];
                      }
        }
}
251