132
|
1 /*******************************************************************************
|
|
2
|
|
3 copyright: Copyright (c) 2005 Kris Bell. All rights reserved
|
|
4
|
|
5 license: BSD style: $(LICENSE)
|
|
6
|
|
7 version: Initial release: December 2005
|
|
8
|
|
9 author: Kris
|
|
10
|
|
11 *******************************************************************************/
|
|
12
|
|
13 module tango.io.UnicodeFile;
|
|
14
|
|
15 public import tango.io.FilePath;
|
|
16
|
|
17 private import tango.io.FileConduit;
|
|
18
|
|
19 private import tango.core.Exception;
|
|
20
|
|
21 public import tango.text.convert.UnicodeBom;
|
|
22
|
|
23 /*******************************************************************************
|
|
24
|
|
25 Read and write unicode files
|
|
26
|
|
27 For our purposes, unicode files are an encoding of textual material.
|
|
28 The goal of this module is to interface that external-encoding with
|
|
29 a programmer-defined internal-encoding. This internal encoding is
|
|
30 declared via the template argument T, whilst the external encoding
|
|
31 is either specified or derived.
|
|
32
|
|
33 Three internal encodings are supported: char, wchar, and dchar. The
|
|
34 methods herein operate upon arrays of this type. For example, read()
|
|
35 returns an array of the type, whilst write() and append() expect an
|
|
36 array of said type.
|
|
37
|
|
38 Supported external encodings are as follows:
|
|
39
|
|
40 $(UL Encoding.Unknown)
|
|
41 $(UL Encoding.UTF_8)
|
|
42 $(UL Encoding.UTF_8N)
|
|
43 $(UL Encoding.UTF_16)
|
|
44 $(UL Encoding.UTF_16BE)
|
|
45 $(UL Encoding.UTF_16LE)
|
|
46 $(UL Encoding.UTF_32)
|
|
47 $(UL Encoding.UTF_32BE)
|
|
48 $(UL Encoding.UTF_32LE)
|
|
49
|
|
50 These can be divided into implicit and explicit encodings. Here are
|
|
51 the implicit subset:
|
|
52
|
|
53 $(UL Encoding.Unknown)
|
|
54 $(UL Encoding.UTF_8)
|
|
55 $(UL Encoding.UTF_16)
|
|
56 $(UL Encoding.UTF_32)
|
|
57
|
|
58 Implicit encodings may be used to 'discover'
|
|
59 an unknown encoding, by examining the first few bytes of the file
|
|
60 content for a signature. This signature is optional for all files,
|
|
61 but is often written such that the content is self-describing. When
|
|
62 the encoding is unknown, using one of the non-explicit encodings will
|
|
63 cause the read() method to look for a signature and adjust itself
|
|
64 accordingly. It is possible that a ZWNBSP character might be confused
|
|
65 with the signature; today's files are supposed to use the WORD-JOINER
|
|
66 character instead.
|
|
67
|
|
68 Explicit encodings are as follows:
|
|
69
|
|
70 $(UL Encoding.UTF_8N)
|
|
71 $(UL Encoding.UTF_16BE)
|
|
72 $(UL Encoding.UTF_16LE)
|
|
73 $(UL Encoding.UTF_32BE)
|
|
74 $(UL Encoding.UTF_32LE)
|
|
75
|
|
76 This group of encodings is for use when the file encoding is
|
|
77 known. These *must* be used when writing or appending, since written
|
|
78 content must be in a known format. It should be noted that, during a
|
|
79 read operation, the presence of a signature is in conflict with these
|
|
80 explicit varieties.
|
|
81
|
|
82 Method read() returns the current content of the file, whilst write()
|
|
83 sets the file content, and file length, to the provided array. Method
|
|
84 append() adds content to the tail of the file. When appending, it is
|
|
85 your responsibility to ensure the existing and current encodings are
|
|
86 correctly matched.
|
|
87
|
|
88 Methods to inspect the file system, check the status of a file or
|
|
89 directory, and other facilities are made available via the path
|
|
90 instance returned by the path() method.
|
|
91
|
|
92 See these links for more info:
|
|
93 $(UL $(LINK http://www.utf-8.com/))
|
|
94 $(UL $(LINK http://www.hackcraft.net/xmlUnicode/))
|
|
95 $(UL $(LINK http://www.unicode.org/faq/utf_bom.html/))
|
|
96 $(UL $(LINK http://www.azillionmonkeys.com/qed/unicode.html/))
|
|
97 $(UL $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/))
|
|
98
|
|
99 *******************************************************************************/
|
|
100
|
|
class UnicodeFile(T)
{
        // converter between the external file encoding and internal type T
        private UnicodeBom!(T)  bom;

        // location of the file this instance reads and writes
        private PathView        path_;

        /***********************************************************************

                Construct a UnicodeFile from the provided path. The given
                encoding represents the external file encoding, and should
                be one of the Encoding.xx types

                Params:
                path     = location of the file to operate upon
                encoding = external encoding of the file content

        ***********************************************************************/

        this (PathView path, Encoding encoding)
        {
                bom = new UnicodeBom!(T)(encoding);
                path_ = path;
        }

        /***********************************************************************

                Construct a UnicodeFile from a text string. The provided
                encoding represents the external file encoding, and should
                be one of the Encoding.xx types

                Params:
                path     = textual path, wrapped in a new FilePath
                encoding = external encoding of the file content

        ***********************************************************************/

        this (char[] path, Encoding encoding)
        {
                this (new FilePath(path), encoding);
        }

        /***********************************************************************

                Call-site shortcut to create a UnicodeFile instance. This
                enables the same syntax as struct usage, so may expose
                a migration path

                Params:
                name     = textual path of the file
                encoding = external encoding of the file content

                Returns:
                a newly constructed UnicodeFile

        ***********************************************************************/

        static UnicodeFile opCall (char[] name, Encoding encoding)
        {
                return new UnicodeFile (name, encoding);
        }

        /***********************************************************************

                Return the path this instance was constructed with

        ***********************************************************************/

        PathView path ()
        {
                return path_;
        }

        /***********************************************************************

                Return the current encoding. This is either the originally
                specified encoding, or a derived one obtained by inspecting
                the file content for a BOM. The latter is performed as part
                of the read() method.

        ***********************************************************************/

        Encoding encoding ()
        {
                return bom.encoding();
        }

        /***********************************************************************

                Return the content of the file. The content is inspected
                for a BOM signature, which is stripped. An exception is
                thrown if a signature is present when, according to the
                encoding type, it should not be. Conversely, an exception
                is thrown if there is no known signature where the current
                encoding expects one to be present.

                The whole file is loaded into memory before decoding. An
                "unexpected eof" error is raised via the conduit when the
                file yields fewer bytes than its reported length.

                Returns:
                the decoded file content, as an array of T

        ***********************************************************************/

        T[] read ()
        {
                scope conduit = new FileConduit (path_);
                scope (exit)
                       conduit.close;

                // allocate enough space for the entire file. The length is
                // narrowed to size_t (not uint) so that it is not truncated
                // to 32 bits on targets with a wider address space
                auto content = new ubyte [cast(size_t) conduit.length];

                // read the content. A single read may legitimately return
                // fewer bytes than requested, so keep going until the buffer
                // is full. An empty file never enters the loop
                size_t filled = 0;
                while (filled < content.length)
                      {
                      auto count = conduit.read (content [filled .. $]);

                      // no progress, or a count larger than what was asked
                      // for (such as an end-of-input sentinel), means the
                      // file ended before its reported length was reached
                      if (count == 0 || count > content.length - filled)
                          conduit.error ("unexpected eof");

                      filled += count;
                      }

                return bom.decode (content);
        }

        /***********************************************************************

                Set the file content and length to reflect the given array.
                The content will be encoded accordingly.

                Params:
                content  = the text to store in the file
                writeBom = when true, the BOM signature is written ahead
                           of the encoded content

                Returns:
                this instance, to support call chaining

        ***********************************************************************/

        UnicodeFile write (T[] content, bool writeBom = false)
        {
                return write (content, FileConduit.ReadWriteCreate, writeBom);
        }

        /***********************************************************************

                Append content to the file; the content will be encoded
                accordingly. A BOM signature is never written here.

                Note that it is your responsibility to ensure the
                existing and current encodings are correctly matched.

                Params:
                content = the text to add to the tail of the file

                Returns:
                this instance, to support call chaining

        ***********************************************************************/

        UnicodeFile append (T[] content)
        {
                return write (content, FileConduit.WriteAppending, false);
        }

        /***********************************************************************

                Internal method to perform writing of content. Note that
                the encoding must be of the explicit variety by the time
                we get here.

                Params:
                content  = the text to encode and write
                style    = how the file is opened (create versus append)
                writeBom = when true, the BOM signature is written ahead
                           of the encoded content

                Returns:
                this instance, to support call chaining

        ***********************************************************************/

        private final UnicodeFile write (T[] content, FileConduit.Style style, bool writeBom)
        {
                // convert to external representation (may throw an exception)
                void[] converted = bom.encode (content);

                // open file after conversion ~ in case of exceptions, so a
                // failed conversion leaves the existing file untouched
                scope conduit = new FileConduit (path_, style);
                scope (exit)
                       conduit.close;

                // the signature, when requested, must precede the content
                if (writeBom)
                    conduit.write (bom.signature);

                // and write
                conduit.write (converted);
                return this;
        }
}
|
|
251
|