132
|
1 /*******************************************************************************
|
|
2
|
|
3 copyright: Copyright (c) 2004 Kris Bell. All rights reserved
|
|
4
|
|
5 license: BSD style: $(LICENSE)
|
|
6
|
|
7 version: Initial release: December 2005
|
|
8
|
|
9 author: Kris
|
|
10
|
|
11 *******************************************************************************/
|
|
12
|
|
13 module tango.text.convert.UnicodeBom;
|
|
14
|
|
15 private import tango.core.ByteSwap;
|
|
16
|
|
17 private import Utf = tango.text.convert.Utf;
|
|
18
|
|
19
|
|
20 private extern (C) void onUnicodeError (char[] msg, size_t idx = 0);
|
|
21
|
|
22 /*******************************************************************************
|
|
23
|
|
24 see http://icu.sourceforge.net/docs/papers/forms_of_unicode/#t2
|
|
25
|
|
26 *******************************************************************************/
|
|
27
|
|
enum Encoding
{
        // NOTE: member order is significant. BomSniffer.setup() uses the
        // integer value of an Encoding as an index into its lookup table,
        // so this list and that table must stay in step.

        Unknown,        // non-explicit: sniff the BOM; falls back to UTF_8N
        UTF_8,          // non-explicit: sniff the BOM; falls back to UTF_8N
        UTF_8N,         // explicit UTF-8
        UTF_16,         // non-explicit: sniff the BOM; falls back to UTF_16BE
        UTF_16BE,       // explicit UTF-16, big-endian
        UTF_16LE,       // explicit UTF-16, little-endian
        UTF_32,         // non-explicit: sniff the BOM; falls back to UTF_32BE
        UTF_32BE,       // explicit UTF-32, big-endian
        UTF_32LE,       // explicit UTF-32, little-endian
}
|
|
39
|
|
40 /*******************************************************************************
|
|
41
|
|
42 Convert unicode content
|
|
43
|
|
44 Unicode is an encoding of textual material. The purpose of this module
|
|
45 is to interface external-encoding with a programmer-defined internal-
|
|
46 encoding. This internal encoding is declared via the template argument
|
|
47 T, whilst the external encoding is either specified or derived.
|
|
48
|
|
49 Three internal encodings are supported: char, wchar, and dchar. The
|
|
50 methods herein operate upon arrays of this type. That is, decode()
|
|
51 returns an array of the type, while encode() expects an array of said
|
|
52 type.
|
|
53
|
|
54 Supported external encodings are as follows:
|
|
55
|
|
56 Encoding.Unknown
|
|
57 Encoding.UTF_8
|
|
58 Encoding.UTF_8N
|
|
59 Encoding.UTF_16
|
|
60 Encoding.UTF_16BE
|
|
61 Encoding.UTF_16LE
|
|
62 Encoding.UTF_32
|
|
63 Encoding.UTF_32BE
|
|
64 Encoding.UTF_32LE
|
|
65
|
|
66 These can be divided into non-explicit and explicit encodings:
|
|
67
|
|
68 Encoding.Unknown
|
|
69 Encoding.UTF_8
|
|
70 Encoding.UTF_16
|
|
71 Encoding.UTF_32
|
|
72
|
|
73
|
|
74 Encoding.UTF_8N
|
|
75 Encoding.UTF_16BE
|
|
76 Encoding.UTF_16LE
|
|
77 Encoding.UTF_32BE
|
|
78 Encoding.UTF_32LE
|
|
79
|
|
80 The former group of non-explicit encodings may be used to 'discover'
|
|
81 an unknown encoding, by examining the first few bytes of the content
|
|
82 for a signature. This signature is optional, but is often written such
|
|
83 that the content is self-describing. When an encoding is unknown, using
|
|
84 one of the non-explicit encodings will cause the decode() method to look
|
|
85 for a signature and adjust itself accordingly. It is possible that a
|
|
86 ZWNBSP character might be confused with the signature; today's unicode
|
|
87 content is supposed to use the WORD-JOINER character instead.
|
|
88
|
|
89 The group of explicit encodings is for use when the content encoding
|
|
90 is known. These *must* be used when converting back to external encoding,
|
|
91 since written content must be in a known format. It should be noted that,
|
|
92 during a decode() operation, the existence of a signature is in conflict
|
|
93 with these explicit varieties.
|
|
94
|
|
95
|
|
96 See
|
|
97 $(LINK http://www.utf-8.com/)
|
|
98 $(LINK http://www.hackcraft.net/xmlUnicode/)
|
|
99 $(LINK http://www.unicode.org/faq/utf_bom.html)
|
|
100 $(LINK http://www.azillionmonkeys.com/qed/unicode.html)
|
|
101 $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)
|
|
102
|
|
103 *******************************************************************************/
|
|
104
|
|
class UnicodeBom(T) : BomSniffer
{
        // Reject unsupported element types outright. A pragma(msg) on its
        // own only prints a note and lets the instantiation continue, in
        // which case into() and from() would quietly return null.
        static if (!is (T == char) && !is (T == wchar) && !is (T == dchar))
                   static assert (false, "Template type must be char, wchar, or dchar");

        /***********************************************************************

                Construct an instance using the given external encoding ~ one
                of the Encoding.xx types

        ***********************************************************************/

        this (Encoding encoding)
        {
                setup (encoding);
        }

        /***********************************************************************

                Convert the provided content. The content is inspected
                for a BOM signature, which is stripped. An exception is
                thrown if a signature is present when, according to the
                encoding type, it should not be. Conversely, an exception
                is thrown if there is no known signature where the current
                encoding expects one to be present and no fallback encoding
                is configured for it.

                Params:
                content = the external (raw) content to decode
                dst     = optional workspace forwarded to the converter
                ate     = optional out-value forwarded to the converter

                Returns: the content as an array of T

                Note that the BOM is removed before conversion, so whatever
                is reported through 'ate' is relative to the content that
                follows the BOM. Also note that swapBytes() is applied to
                the provided content directly, rather than to a copy.

        ***********************************************************************/

        final T[] decode (void[] content, T[] dst=null, uint* ate=null)
        {
                // look for a BOM
                auto info = test (content);

                // are we expecting a BOM? (settings always mirrors the
                // current encoding, since setup() assigns both together)
                if (settings.test)
                   {
                   if (info)
                      {
                      // yep ~ and we got one: adopt the explicit encoding
                      setup (info.encoding);

                      // strip BOM from content
                      content = content [info.bom.length .. content.length];
                      }
                   else
                      {
                      // can this encoding be defaulted?
                      if (settings.fallback)
                          setup (settings.fallback);
                      else
                         onUnicodeError ("UnicodeBom.decode :: unknown or missing BOM");
                      }
                   }
                else
                   {
                   // found a BOM when using an explicit encoding
                   if (info)
                       onUnicodeError ("UnicodeBom.decode :: explicit encoding does not permit BOM");
                   }

                // convert it to internal representation
                return into (swapBytes(content), settings.type, dst, ate);
        }

        /***********************************************************************

                Perform encoding of content. Note that the encoding must be
                of the explicit variety by the time we get here

                Params:
                content = the internal content to encode
                dst     = optional workspace forwarded to the converter
                ate     = optional out-value forwarded to the converter

                Returns: the content in the external encoding

                NOTE(review): when T already matches the external element
                type, from() hands back 'content' itself and swapBytes() is
                then applied to that same array ~ confirm callers do not
                rely on their input remaining untouched.

        ***********************************************************************/

        final void[] encode (T[] content, void[] dst=null, uint* ate=null)
        {
                if (settings.test)
                    onUnicodeError ("UnicodeBom.encode :: cannot write to a non-specific encoding");

                // convert it to external representation, and write
                return swapBytes (from (content, settings.type, dst, ate));
        }

        /***********************************************************************

                Swap bytes around, as required by the encoding

                A swap is requested only for encodings flagged as having
                endian concerns, and only when the byte order of the
                encoding differs from that of the host. The buffer is
                handed to ByteSwap by pointer & length and the very same
                slice is returned; no copy is made here.

        ***********************************************************************/

        private final void[] swapBytes (void[] content)
        {
                // on a little-endian host, big-endian encodings are foreign
                bool foreign = settings.bigEndian;

                // and on a big-endian host it is the other way around
                version (BigEndian)
                         foreign = !foreign;

                if (settings.endian && foreign)
                   {
                   if (settings.type == Utf16)
                       ByteSwap.swap16 (content.ptr, content.length);
                   else
                      ByteSwap.swap32 (content.ptr, content.length);
                   }
                return content;
        }

        /***********************************************************************

                Shared exit for the case where the external and internal
                element types already agree: nothing is converted, so the
                whole input counts as consumed. The count is reported in
                elements of T.

        ***********************************************************************/

        private static T[] unconverted (T[] x, uint* ate)
        {
                if (ate)
                    *ate = cast(uint) x.length;
                return x;
        }

        /***********************************************************************

                Convert external content into an array of T.

                Params:
                x    = the external content
                type = element type of x: one of Utf8, Utf16 or Utf32
                dst  = optional workspace handed to the Utf converter
                ate  = optional out-value handed to the Utf converter
                       (presumably the number of input elements consumed
                       ~ confirm against tango.text.convert.Utf)

                Returns: the converted content. When 'type' already matches
                T the input is merely reinterpreted (no copy) and, where
                'ate' is provided, it is set to the number of T elements.
                An unrecognized 'type' results in a null array.

        ***********************************************************************/

        static T[] into (void[] x, uint type, T[] dst=null, uint* ate=null)
        {
                T[] ret;

                static if (is (T == char))
                          {
                          if (type == Utf8)
                              return unconverted (cast(T[]) x, ate);

                          if (type == Utf16)
                              ret = Utf.toString (cast(wchar[]) x, dst, ate);
                          else
                             if (type == Utf32)
                                 ret = Utf.toString (cast(dchar[]) x, dst, ate);
                          }
                else static if (is (T == wchar))
                          {
                          if (type == Utf16)
                              return unconverted (cast(T[]) x, ate);

                          if (type == Utf8)
                              ret = Utf.toString16 (cast(char[]) x, dst, ate);
                          else
                             if (type == Utf32)
                                 ret = Utf.toString16 (cast(dchar[]) x, dst, ate);
                          }
                else static if (is (T == dchar))
                          {
                          if (type == Utf32)
                              return unconverted (cast(T[]) x, ate);

                          if (type == Utf8)
                              ret = Utf.toString32 (cast(char[]) x, dst, ate);
                          else
                             if (type == Utf16)
                                 ret = Utf.toString32 (cast(wchar[]) x, dst, ate);
                          }

                return ret;
        }


        /***********************************************************************

                Convert an array of T into external content.

                Params:
                x    = the internal content
                type = element type wanted: one of Utf8, Utf16 or Utf32
                dst  = optional workspace handed to the Utf converter,
                       reinterpreted as an array of the target element type
                ate  = optional out-value handed to the Utf converter
                       (presumably the number of input elements consumed
                       ~ confirm against tango.text.convert.Utf)

                Returns: the converted content. When 'type' already matches
                T the input itself is returned (no copy) and, where 'ate'
                is provided, it is set to x.length. An unrecognized 'type'
                results in a null array.

        ***********************************************************************/

        static void[] from (T[] x, uint type, void[] dst=null, uint* ate=null)
        {
                void[] ret;

                static if (is (T == char))
                          {
                          if (type == Utf8)
                              return unconverted (x, ate);

                          if (type == Utf16)
                              ret = Utf.toString16 (x, cast(wchar[]) dst, ate);
                          else
                             if (type == Utf32)
                                 ret = Utf.toString32 (x, cast(dchar[]) dst, ate);
                          }
                else static if (is (T == wchar))
                          {
                          if (type == Utf16)
                              return unconverted (x, ate);

                          if (type == Utf8)
                              ret = Utf.toString (x, cast(char[]) dst, ate);
                          else
                             if (type == Utf32)
                                 ret = Utf.toString32 (x, cast(dchar[]) dst, ate);
                          }
                else static if (is (T == dchar))
                          {
                          if (type == Utf32)
                              return unconverted (x, ate);

                          if (type == Utf8)
                              ret = Utf.toString (x, cast(char[]) dst, ate);
                          else
                             if (type == Utf16)
                                 ret = Utf.toString16 (x, cast(wchar[]) dst, ate);
                          }

                return ret;
        }
}
|
|
299
|
|
300
|
|
301
|
|
/*******************************************************************************

        Keeps track of the current external encoding, and recognizes BOM
        signatures at the head of a piece of content. The per-encoding
        configuration lives in a table indexed by Encoding value.

*******************************************************************************/

class BomSniffer
{
        private Encoding encoder;       // the current encoding
        private Info*    settings;      // pointer to encoding configuration

        private struct Info
        {
                int      type;          // type of element (char/wchar/dchar)
                Encoding encoding;      // Encoding.xx encoding
                char[]   bom;           // pattern to match for signature
                bool     test;          // should we test for this encoding?
                bool     endian;        // this encoding have endian concerns?
                bool     bigEndian;     // is this a big-endian encoding?
                Encoding fallback;      // can this encoding be defaulted?
        };

        private enum {Utf8, Utf16, Utf32};

        // One row per Encoding member, in declaration order: setup() uses
        // the Encoding value as the index. A fallback of Encoding.Unknown
        // (the zero value) means "no fallback".
        private const Info[] lookup =
        [
        {Utf8,  Encoding.Unknown,  null,        true,  false, false, Encoding.UTF_8N},
        {Utf8,  Encoding.UTF_8,    null,        true,  false, false, Encoding.UTF_8N},
        {Utf8,  Encoding.UTF_8N,   x"efbbbf",   false, false, false, Encoding.Unknown},
        {Utf16, Encoding.UTF_16,   null,        true,  false, false, Encoding.UTF_16BE},
        {Utf16, Encoding.UTF_16BE, x"feff",     false, true,  true,  Encoding.Unknown},
        {Utf16, Encoding.UTF_16LE, x"fffe",     false, true,  false, Encoding.Unknown},
        {Utf32, Encoding.UTF_32,   null,        true,  false, false, Encoding.UTF_32BE},
        {Utf32, Encoding.UTF_32BE, x"0000feff", false, true,  true,  Encoding.Unknown},
        {Utf32, Encoding.UTF_32LE, x"fffe0000", false, true,  false, Encoding.Unknown},
        ];

        /***********************************************************************

                Return the current encoding. This is either the originally
                specified encoding, or a derived one obtained by inspecting
                the content for a BOM. The latter is performed as part of
                the decode() method

        ***********************************************************************/

        final Encoding encoding ()
        {
                return encoder;
        }

        /***********************************************************************

                Return the signature (BOM) of the current encoding. This
                is null for those encodings which have no signature listed
                in the lookup table

        ***********************************************************************/

        final void[] signature ()
        {
                return settings.bom;
        }

        /***********************************************************************

                Select the given encoding: records it as the current one,
                and points the settings at the matching lookup entry

        ***********************************************************************/

        final void setup (Encoding encoding)
        {
                this.encoder = encoding;
                this.settings = &lookup[encoding];
        }

        /***********************************************************************

                Scan the BOM signatures looking for a match at the head of
                the content. The table is walked from last entry to first,
                such that the longer UTF-32 signatures are tried before the
                UTF-16 signatures they begin with.

                Returns: the matching lookup entry, or null where the
                content does not start with any known signature

        ***********************************************************************/

        static final Info* test (void[] content)
        {
                for (size_t i = lookup.length; i-- > 0;)
                    {
                    Info* candidate = &lookup[i];

                    // entries without a signature cannot be matched
                    if (candidate.bom)
                       {
                       auto len = candidate.bom.length;
                       if (len <= content.length && content[0..len] == candidate.bom[0..len])
                           return candidate;
                       }
                    }
                return null;
        }
}
|
|
394
|