Mercurial > projects > ldc
comparison tango/tango/io/UnicodeFile.d @ 132:1700239cab2e trunk
[svn r136] MAJOR UNSTABLE UPDATE!!!
Initial commit after moving to Tango instead of Phobos.
Lots of bugfixes...
This build is not suitable for most things.
author | lindquist |
---|---|
date | Fri, 11 Jan 2008 17:57:40 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
131:5825d48b27d1 | 132:1700239cab2e |
---|---|
1 /******************************************************************************* | |
2 | |
3 copyright: Copyright (c) 2005 Kris Bell. All rights reserved | |
4 | |
5 license: BSD style: $(LICENSE) | |
6 | |
7 version: Initial release: December 2005 | |
8 | |
9 author: Kris | |
10 | |
11 *******************************************************************************/ | |
12 | |
13 module tango.io.UnicodeFile; | |
14 | |
15 public import tango.io.FilePath; | |
16 | |
17 private import tango.io.FileConduit; | |
18 | |
19 private import tango.core.Exception; | |
20 | |
21 public import tango.text.convert.UnicodeBom; | |
22 | |
23 /******************************************************************************* | |
24 | |
25 Read and write unicode files | |
26 | |
27 For our purposes, unicode files are an encoding of textual material. | |
28 The goal of this module is to interface that external-encoding with | |
29 a programmer-defined internal-encoding. This internal encoding is | |
30 declared via the template argument T, whilst the external encoding | |
31 is either specified or derived. | |
32 | |
33 Three internal encodings are supported: char, wchar, and dchar. The | |
34 methods herein operate upon arrays of this type. For example, read() | |
35 returns an array of the type, whilst write() and append() expect an | |
36 array of said type. | |
37 | |
38 Supported external encodings are as follows: | |
39 | |
40 $(UL Encoding.Unknown) | |
41 $(UL Encoding.UTF_8) | |
42 $(UL Encoding.UTF_8N) | |
43 $(UL Encoding.UTF_16) | |
44 $(UL Encoding.UTF_16BE) | |
45 $(UL Encoding.UTF_16LE) | |
46 $(UL Encoding.UTF_32) | |
47 $(UL Encoding.UTF_32BE) | |
48 $(UL Encoding.UTF_32LE) | |
49 | |
50 These can be divided into implicit and explicit encodings. Here are | |
51 the implicit subset: | |
52 | |
53 $(UL Encoding.Unknown) | |
54 $(UL Encoding.UTF_8) | |
55 $(UL Encoding.UTF_16) | |
56 $(UL Encoding.UTF_32) | |
57 | |
58 Implicit encodings may be used to 'discover' | |
59 an unknown encoding, by examining the first few bytes of the file | |
60 content for a signature. This signature is optional for all files, | |
61 but is often written such that the content is self-describing. When | |
62 the encoding is unknown, using one of the non-explicit encodings will | |
63 cause the read() method to look for a signature and adjust itself | |
64 accordingly. It is possible that a ZWNBSP character might be confused | |
65 with the signature; today's files are supposed to use the WORD-JOINER | |
66 character instead. | |
67 | |
68 Explicit encodings are as follows: | |
69 | |
70 $(UL Encoding.UTF_8N) | |
71 $(UL Encoding.UTF_16BE) | |
72 $(UL Encoding.UTF_16LE) | |
73 $(UL Encoding.UTF_32BE) | |
74 $(UL Encoding.UTF_32LE) | |
75 | |
76 This group of encodings is for use when the file encoding is | |
77 known. These *must* be used when writing or appending, since written | |
78 content must be in a known format. It should be noted that, during a | |
79 read operation, the presence of a signature is in conflict with these | |
80 explicit varieties. | |
81 | |
82 Method read() returns the current content of the file, whilst write() | |
83 sets the file content, and file length, to the provided array. Method | |
84 append() adds content to the tail of the file. When appending, it is | |
85 your responsibility to ensure the existing and current encodings are | |
86 correctly matched. | |
87 | |
88 Methods to inspect the file system, check the status of a file or | |
89 directory, and other facilities are made available via the path | |
90 instance returned by the path() method. | |
91 | |
92 See these links for more info: | |
93 $(UL $(LINK http://www.utf-8.com/)) | |
94 $(UL $(LINK http://www.hackcraft.net/xmlUnicode/)) | |
95 $(UL $(LINK http://www.unicode.org/faq/utf_bom.html)) | |
96 $(UL $(LINK http://www.azillionmonkeys.com/qed/unicode.html)) | |
97 $(UL $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)) | |
98 | |
99 *******************************************************************************/ | |
100 | |
101 class UnicodeFile(T) | |
102 { | |
103 private UnicodeBom!(T) bom; | |
104 private PathView path_; | |
105 | |
106 /*********************************************************************** | |
107 | |
108 Construct a UnicodeFile from the provided FilePath. The given | |
109 encoding represents the external file encoding, and should | |
110 be one of the Encoding.xx types | |
111 | |
112 ***********************************************************************/ | |
113 | |
114 this (PathView path, Encoding encoding) | |
115 { | |
116 bom = new UnicodeBom!(T)(encoding); | |
117 path_ = path; | |
118 } | |
119 | |
120 /*********************************************************************** | |
121 | |
122 Construct a UnicodeFile from a text string. The provided | |
123 encoding represents the external file encoding, and should | |
124 be one of the Encoding.xx types | |
125 | |
126 ***********************************************************************/ | |
127 | |
128 this (char[] path, Encoding encoding) | |
129 { | |
130 this (new FilePath(path), encoding); | |
131 } | |
132 | |
133 /*********************************************************************** | |
134 | |
135 Call-site shortcut to create a UnicodeFile instance. This | |
136 enables the same syntax as struct usage, so may expose | |
137 a migration path | |
138 | |
139 ***********************************************************************/ | |
140 | |
141 static UnicodeFile opCall (char[] name, Encoding encoding) | |
142 { | |
143 return new UnicodeFile (name, encoding); | |
144 } | |
145 | |
146 /*********************************************************************** | |
147 | |
148 Return the associated FilePath instance | |
149 | |
150 ***********************************************************************/ | |
151 | |
152 PathView path () | |
153 { | |
154 return path_; | |
155 } | |
156 | |
157 /*********************************************************************** | |
158 | |
159 Return the current encoding. This is either the originally | |
160 specified encoding, or a derived one obtained by inspecting | |
161 the file content for a BOM. The latter is performed as part | |
162 of the read() method. | |
163 | |
164 ***********************************************************************/ | |
165 | |
166 Encoding encoding () | |
167 { | |
168 return bom.encoding(); | |
169 } | |
170 | |
171 /*********************************************************************** | |
172 | |
173 Return the content of the file. The content is inspected | |
174 for a BOM signature, which is stripped. An exception is | |
175 thrown if a signature is present when, according to the | |
176 encoding type, it should not be. Conversely, an exception | |
177 is thrown if there is no known signature where the current | |
178 encoding expects one to be present. | |
179 | |
180 ***********************************************************************/ | |
181 | |
182 T[] read () | |
183 { | |
184 scope conduit = new FileConduit (path_); | |
185 scope (exit) | |
186 conduit.close; | |
187 | |
188 // allocate enough space for the entire file | |
189 auto content = new ubyte [cast(uint) conduit.length]; | |
190 | |
191 //read the content | |
192 if (conduit.read (content) != content.length) | |
193 conduit.error ("unexpected eof"); | |
194 | |
195 return bom.decode (content); | |
196 } | |
197 | |
198 /*********************************************************************** | |
199 | |
200 Set the file content and length to reflect the given array. | |
201 The content will be encoded accordingly. When writeBom is true, the encoding signature is written ahead of the content. | |
202 | |
203 ***********************************************************************/ | |
204 | |
205 UnicodeFile write (T[] content, bool writeBom = false) | |
206 { | |
207 return write (content, FileConduit.ReadWriteCreate, writeBom); | |
208 } | |
209 | |
210 /*********************************************************************** | |
211 | |
212 Append content to the file; the content will be encoded | |
213 accordingly. | |
214 | |
215 Note that it is your responsibility to ensure the | |
216 existing and current encodings are correctly matched. | |
217 | |
218 ***********************************************************************/ | |
219 | |
220 UnicodeFile append (T[] content) | |
221 { | |
222 return write (content, FileConduit.WriteAppending, false); | |
223 } | |
224 | |
225 /*********************************************************************** | |
226 | |
227 Internal method to perform writing of content. Note that | |
228 the encoding must be of the explicit variety by the time | |
229 we get here. | |
230 | |
231 ***********************************************************************/ | |
232 | |
233 private final UnicodeFile write (T[] content, FileConduit.Style style, bool writeBom) | |
234 { | |
235 // convert to external representation (may throw an exception) | |
236 void[] converted = bom.encode (content); | |
237 | |
238 // open file after conversion ~ in case of exceptions | |
239 scope conduit = new FileConduit (path_, style); | |
240 scope (exit) | |
241 conduit.close; | |
242 | |
243 if (writeBom) | |
244 conduit.write (bom.signature); | |
245 | |
246 // and write | |
247 conduit.write (converted); | |
248 return this; | |
249 } | |
250 } | |
251 |