Mercurial > projects > dil
annotate src/dil/Converter.d @ 806:bcb74c9b895c
Moved out files in the trunk folder to the root.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Sun, 09 Mar 2008 00:12:19 +0100 |
parents | trunk/src/dil/Converter.d@3b34f6a95a27 |
children |
rev | line source |
---|---|
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
1 /++ |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
2 Author: Aziz Köksal |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
3 License: GPL3 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
4 +/ |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
5 module dil.Converter; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
6 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
7 import dil.Information; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
8 import dil.Location; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
9 import dil.Unicode; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
10 import dil.FileBOM; |
577
9e811db780a6
Moved LexerFuncs.d to package 'lexer'.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
533
diff
changeset
|
11 import dil.lexer.Funcs; |
533
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
12 import dil.Messages; |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
13 import common; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
14 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
15 /// Converts various Unicode encoding formats to UTF-8. |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
16 struct Converter |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
17 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
18 char[] filePath; /// For error messages. |
532
50e64bab9c7a
Renamed InformationManager to InfoManager.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
518
diff
changeset
|
19 InfoManager infoMan; |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
20 |
532
50e64bab9c7a
Renamed InformationManager to InfoManager.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
518
diff
changeset
|
21 static Converter opCall(char[] filePath, InfoManager infoMan) |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
22 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
23 Converter conv; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
24 conv.filePath = filePath; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
25 conv.infoMan = infoMan; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
26 return conv; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
27 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
28 |
786
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
29 /// Byte-swaps c. |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
30 dchar swapBytes(dchar c) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
31 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
32 return c = (c << 24) | |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
33 ((c >> 8) & 0xFF00) | |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
34 ((c << 8) & 0xFF0000) | |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
35 (c >> 24); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
36 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
37 |
786
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
38 /// Byte-swaps c. |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
39 wchar swapBytes(wchar c) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
40 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
41 return (c << 8) | (c >> 8); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
42 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
43 |
786
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
44 /// Converts c from big-endian to the machine's byte order; the bytes are swapped only on a little-endian machine. |
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
45 dchar BEtoMachineDword(dchar c) |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
46 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
47 version(LittleEndian) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
48 return swapBytes(c); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
49 else |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
50 return c; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
51 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
52 |
786
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
53 /// Converts c from little-endian to the machine's byte order; the bytes are swapped only on a big-endian machine. |
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
54 dchar LEtoMachineDword(dchar c) |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
55 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
56 version(LittleEndian) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
57 return c; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
58 else |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
59 return swapBytes(c); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
60 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
61 |
786
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
62 /// Converts c from big-endian to the machine's byte order; the bytes are swapped only on a little-endian machine. |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
63 wchar BEtoMachineWord(wchar c) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
64 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
65 version(LittleEndian) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
66 return swapBytes(c); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
67 else |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
68 return c; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
69 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
70 |
786
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
71 /// Converts c from little-endian to the machine's byte order; the bytes are swapped only on a big-endian machine. |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
72 wchar LEtoMachineWord(wchar c) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
73 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
74 version(LittleEndian) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
75 return c; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
76 else |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
77 return swapBytes(c); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
78 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
79 |
786
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
80 /// Converts a UTF-32 text to UTF-8. |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
81 char[] UTF32toUTF8(bool isBigEndian)(ubyte[] data) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
82 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
83 if (data.length == 0) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
84 return null; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
85 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
86 char[] result; |
533
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
87 uint lineNum = 1; |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
88 dchar[] text = cast(dchar[]) data[0 .. $-($%4)]; // Trim to multiple of 4. |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
89 foreach (dchar c; text) |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
90 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
91 static if (isBigEndian) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
92 c = BEtoMachineDword(c); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
93 else |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
94 c = LEtoMachineDword(c); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
95 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
96 if (!isValidChar(c)) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
97 { |
533
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
98 infoMan ~= new LexerError( |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
99 new Location(filePath, lineNum), |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
100 Format(MSG.InvalidUTF32Character, c) |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
101 ); |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
102 c = REPLACEMENT_CHAR; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
103 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
104 |
533
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
105 if (isNewline(c)) |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
106 ++lineNum; |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
107 dil.Unicode.encode(result, c); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
108 } |
533
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
109 |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
110 if (data.length % 4) |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
111 infoMan ~= new LexerError( |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
112 new Location(filePath, lineNum), |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
113 MSG.UTF32FileMustBeDivisibleBy4 |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
114 ); |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
115 |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
116 return result; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
117 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
118 |
786
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
119 alias UTF32toUTF8!(true) UTF32BEtoUTF8; /// Instantiation for UTF-32 BE. |
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
120 alias UTF32toUTF8!(false) UTF32LEtoUTF8; /// Instantiation for UTF-32 LE. |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
121 |
786
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
122 /// Converts a UTF-16 text to UTF-8. |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
123 char[] UTF16toUTF8(bool isBigEndian)(ubyte[] data) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
124 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
125 if (data.length == 0) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
126 return null; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
127 |
533
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
128 wchar[] text = cast(wchar[]) data[0 .. $-($%2)]; // Trim to multiple of two. |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
129 wchar* p = text.ptr, |
533
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
130 end = text.ptr + text.length; |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
131 char[] result; |
533
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
132 uint lineNum = 1; |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
133 |
764
4579e8505d5e
Fixed unittests and removed dil.File.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
755
diff
changeset
|
134 for (; p < end; p++) |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
135 { |
764
4579e8505d5e
Fixed unittests and removed dil.File.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
755
diff
changeset
|
136 dchar c = *p; |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
137 static if (isBigEndian) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
138 c = BEtoMachineWord(c); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
139 else |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
140 c = LEtoMachineWord(c); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
141 |
764
4579e8505d5e
Fixed unittests and removed dil.File.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
755
diff
changeset
|
142 if (0xD800 > c || c > 0xDFFF) |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
143 {} |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
144 else if (c <= 0xDBFF && p+1 < end) |
764
4579e8505d5e
Fixed unittests and removed dil.File.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
755
diff
changeset
|
145 { // Decode surrogate pairs. |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
146 wchar c2 = p[1]; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
147 static if (isBigEndian) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
148 c2 = BEtoMachineWord(c2); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
149 else |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
150 c2 = LEtoMachineWord(c2); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
151 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
152 if (0xDC00 <= c2 && c2 <= 0xDFFF) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
153 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
154 c = (c - 0xD7C0) << 10; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
155 c |= (c2 & 0x3FF); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
156 ++p; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
157 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
158 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
159 else |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
160 { |
533
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
161 infoMan ~= new LexerError( |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
162 new Location(filePath, lineNum), |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
163 Format(MSG.InvalidUTF16Character, c) |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
164 ); |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
165 c = REPLACEMENT_CHAR; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
166 } |
533
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
167 |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
168 if (isNewline(c)) |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
169 ++lineNum; |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
170 dil.Unicode.encode(result, c); |
764
4579e8505d5e
Fixed unittests and removed dil.File.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
755
diff
changeset
|
171 } |
533
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
172 |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
173 if (data.length % 2) |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
174 infoMan ~= new LexerError( |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
175 new Location(filePath, lineNum), |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
176 MSG.UTF16FileMustBeDivisibleBy2 |
2a8d0ed0d71e
Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
532
diff
changeset
|
177 ); |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
178 return result; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
179 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
180 |
786
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
181 alias UTF16toUTF8!(true) UTF16BEtoUTF8; /// Instantiation for UTF-16 BE. |
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
182 alias UTF16toUTF8!(false) UTF16LEtoUTF8; /// Instantiation for UTF-16 LE. |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
183 |
786
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
184 /// Converts the text in data to UTF-8. |
3b34f6a95a27
Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
185 /// Leaves data unchanged if it is in UTF-8 already. |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
186 char[] data2UTF8(ubyte[] data) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
187 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
188 if (data.length == 0) |
755
90668b83ae5e
Introduced new module dil.SourceText and class SourceText.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
739
diff
changeset
|
189 return ""; |
518
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
190 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
191 char[] text; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
192 BOM bom = tellBOM(data); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
193 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
194 switch (bom) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
195 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
196 case BOM.None: |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
197 // No BOM found. According to the specs the first character |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
198 // must be an ASCII character. |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
199 if (data.length >= 4) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
200 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
201 if (data[0..3] == cast(ubyte[3])x"00 00 00") |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
202 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
203 text = UTF32BEtoUTF8(data); // UTF-32BE: 00 00 00 XX |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
204 break; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
205 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
206 else if (data[1..4] == cast(ubyte[3])x"00 00 00") |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
207 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
208 text = UTF32LEtoUTF8(data); // UTF-32LE: XX 00 00 00 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
209 break; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
210 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
211 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
212 if (data.length >= 2) |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
213 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
214 if (data[0] == 0) // UTF-16BE: 00 XX |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
215 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
216 text = UTF16BEtoUTF8(data); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
217 break; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
218 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
219 else if (data[1] == 0) // UTF-16LE: XX 00 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
220 { |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
221 text = UTF16LEtoUTF8(data); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
222 break; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
223 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
224 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
225 text = cast(char[])data; // UTF-8 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
226 break; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
227 case BOM.UTF8: |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
228 text = cast(char[])data[3..$]; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
229 break; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
230 case BOM.UTF16BE: |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
231 text = UTF16BEtoUTF8(data[2..$]); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
232 break; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
233 case BOM.UTF16LE: |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
234 text = UTF16LEtoUTF8(data[2..$]); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
235 break; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
236 case BOM.UTF32BE: |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
237 text = UTF32BEtoUTF8(data[4..$]); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
238 break; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
239 case BOM.UTF32LE: |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
240 text = UTF32LEtoUTF8(data[4..$]); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
241 break; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
242 default: |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
243 assert(0); |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
244 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
245 return text; |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
246 } |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff
changeset
|
247 } |
739
49fe21aa387c
Added sanitizeText() to dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
577
diff
changeset
|
248 |
49fe21aa387c
Added sanitizeText() to dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
577
diff
changeset
|
/// Sanitizes text in place: invalid UTF-8 sequences are replaced with U+FFFD
/// and every newline ("\r", "\r\n", "\n" and the Unicode newlines recognized
/// by isUnicodeNewline()) is replaced with a single '\n'.
///
/// The buffer of text is overwritten; the result is never longer than the
/// input. When an invalid sequence is shorter than the three code units that
/// REPLACEMENT_STR needs, a single '?' is written instead, so that the
/// output slot is always defined.
///
/// Params:
///   text = the string to sanitize; its contents are modified.
/// Returns: the sanitized slice of text, or null if text is empty.
string sanitizeText(string text)
{
  if (!text.length)
    return null;

  char* p = text.ptr; // Reader.
  char* end = p + text.length;
  char* q = p; // Writer; never runs ahead of the reader.

  for (; p < end; p++, q++)
  {
    assert(q <= p);
    switch (*p)
    {
    case '\r':
      if (p+1 < end && p[1] == '\n')
        p++; // Treat "\r\n" as one newline.
      goto case '\n';
    case '\n':
      *q = '\n';
      continue;
    default:
      if (isascii(*p))
        break; // Plain ASCII is copied below the switch.
      if (p+2 < end && isUnicodeNewline(p))
      { // The newline occupies p[0..3]; skip to its last code unit.
        p += 2;
        goto case '\n';
      }
      auto p2 = p; // Beginning of the UTF-8 sequence.
      dchar c = decode(p, end);
      if (c == ERROR_CHAR)
      { // Skip to next ASCII character or valid UTF-8 sequence.
        while (++p < end && isTrailByte(*p))
        {}
        alias REPLACEMENT_STR R;
        if (q+2 < p) // Copy replacement char if there is enough space.
          (*q = R[0]), (*++q = R[1]), (*++q = R[2]);
        else
          // Not enough room for U+FFFD. Previously nothing was written here,
          // which left the invalid byte or a stale, already consumed byte
          // in the output. q < p holds at this point, so this write cannot
          // clobber unread input.
          *q = '?';
        p--; // The loop increment moves p to the next unread code unit.
      }
      else
      { // Copy the valid UTF-8 sequence.
        while (p2 <= p) // p points to the last trail byte.
          *q++ = *p2++; // Copy code units.
        q--; // Compensate for the loop increment.
      }
      continue;
    }
    assert(isascii(*p));
    *q = *p;
  }
  assert(p == end);
  // p - q is the number of code units saved by the replacements.
  text.length = text.length - (p - q);
  //text = text.ptr[0 .. q - text.ptr]; // Another way.
  return text;
}
764
4579e8505d5e
Fixed unittests and removed dil.File.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
755
diff
changeset
|
306 |
4579e8505d5e
Fixed unittests and removed dil.File.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
755
diff
changeset
|
unittest
{
  Stdout("Testing function Converter.\n");

  // One test case: raw input text and the UTF-8 text expected after
  // conversion. Every case is expected to convert to "source".
  struct Data2Text
  {
    char[] text;
    char[] expected = "source";
    // The input text reinterpreted as raw bytes.
    ubyte[] data()
    {
      return cast(ubyte[])text;
    }
  }

  const Data2Text[] cases = [
    // Inputs without a byte order mark.
    {"source"},
    {"s\0o\0u\0r\0c\0e\0"},
    {"\0s\0o\0u\0r\0c\0e"},
    {"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
    {"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
    // Inputs that start with a byte order mark.
    {"\xEF\xBB\xBFsource"},
    {"\xFE\xFF\0s\0o\0u\0r\0c\0e"},
    {"\xFF\xFEs\0o\0u\0r\0c\0e\0"},
    {"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
    {"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
  ];

  auto converter = Converter("", new InfoManager);
  foreach (index, item; cases)
  {
    auto actual = converter.data2UTF8(item.data);
    assert(actual == item.expected, Format("failed at item {}", index));
  }
}