annotate src/dil/Converter.d @ 806:bcb74c9b895c

Moved out files in the trunk folder to the root.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Sun, 09 Mar 2008 00:12:19 +0100
parents trunk/src/dil/Converter.d@3b34f6a95a27
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
518
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
1 /++
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
2 Author: Aziz Köksal
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
3 License: GPL3
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
4 +/
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
5 module dil.Converter;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
6
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
7 import dil.Information;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
8 import dil.Location;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
9 import dil.Unicode;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
10 import dil.FileBOM;
577
9e811db780a6 Moved LexerFuncs.d to package 'lexer'.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 533
diff changeset
11 import dil.lexer.Funcs;
533
2a8d0ed0d71e Improved error reporting in dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 532
diff changeset
12 import dil.Messages;
518
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
13 import common;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
14
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
15 /// Converts various Unicode encoding formats to UTF-8.
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
16 struct Converter
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
17 {
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
18 char[] filePath; /// For error messages.
532
50e64bab9c7a Renamed InformationManager to InfoManager.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 518
diff changeset
19 InfoManager infoMan;
518
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
20
532
50e64bab9c7a Renamed InformationManager to InfoManager.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 518
diff changeset
/// Creates a Converter and initializes its two members.
/// Params:
///   filePath = stored in the member filePath (used in error messages).
///   infoMan = stored in the member infoMan.
/// Returns: the initialized Converter.
static Converter opCall(char[] filePath, InfoManager infoMan)
{
  Converter instance;
  instance.infoMan = infoMan;
  instance.filePath = filePath;
  return instance;
}
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
28
786
3b34f6a95a27 Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 764
diff changeset
/// Reverses the order of the four bytes of c.
/// Params:
///   c = the 32-bit value to byte-swap.
/// Returns: c with bytes 0 and 3 exchanged and bytes 1 and 2 exchanged.
dchar swapBytes(dchar c)
{
  // Work on a plain unsigned integer; the result is converted back explicitly.
  uint bits = c;
  return cast(dchar)(
    (bits << 24) |                 // Byte 0 -> byte 3.
    ((bits << 8) & 0x00FF0000) |   // Byte 1 -> byte 2.
    ((bits >> 8) & 0x0000FF00) |   // Byte 2 -> byte 1.
    (bits >> 24)                   // Byte 3 -> byte 0.
  );
}
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
37
786
3b34f6a95a27 Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 764
diff changeset
/// Exchanges the two bytes of c.
/// Params:
///   c = the 16-bit value to byte-swap.
/// Returns: c with its low and high byte exchanged.
wchar swapBytes(wchar c)
{
  // The shifts are evaluated on the promoted integer value, so (c << 8)
  // carries bits above bit 15. The cast discards them explicitly.
  return cast(wchar)((c << 8) | (c >> 8));
}
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
43
786
3b34f6a95a27 Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 764
diff changeset
/// Converts the big-endian 32-bit value c to the machine's byte order.
/// Returns: c byte-swapped when compiled with version LittleEndian,
///   otherwise c unchanged.
dchar BEtoMachineDword(dchar c)
{
  dchar converted = c;
  version(LittleEndian)
    converted = swapBytes(c);
  return converted;
}
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
52
786
3b34f6a95a27 Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 764
diff changeset
/// Converts the little-endian 32-bit value c to the machine's byte order.
/// Returns: c unchanged when compiled with version LittleEndian,
///   otherwise c byte-swapped.
dchar LEtoMachineDword(dchar c)
{
  version(LittleEndian)
  {} // Already in machine byte order.
  else
    c = swapBytes(c);
  return c;
}
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
61
786
3b34f6a95a27 Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 764
diff changeset
/// Converts the big-endian 16-bit value c to the machine's byte order.
/// Returns: c byte-swapped when compiled with version LittleEndian,
///   otherwise c unchanged.
wchar BEtoMachineWord(wchar c)
{
  wchar converted = c;
  version(LittleEndian)
    converted = swapBytes(c);
  return converted;
}
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
70
786
3b34f6a95a27 Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 764
diff changeset
/// Converts the little-endian 16-bit value c to the machine's byte order.
/// Returns: c unchanged when compiled with version LittleEndian,
///   otherwise c byte-swapped.
wchar LEtoMachineWord(wchar c)
{
  version(LittleEndian)
  {} // Already in machine byte order.
  else
    c = swapBytes(c);
  return c;
}
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
79
786
3b34f6a95a27 Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 764
diff changeset
/// Converts a UTF-32 text to UTF-8.
///
/// Every character rejected by isValidChar() is reported to infoMan and
/// replaced with REPLACEMENT_CHAR. Trailing bytes that do not form a complete
/// 4-byte unit are not converted; their presence is reported as an error.
/// Params:
///   isBigEndian = true if data is UTF-32 BE, false if it is UTF-32 LE.
///   data = the raw bytes of the text.
/// Returns: the text encoded as UTF-8, or null if data is empty.
char[] UTF32toUTF8(bool isBigEndian)(ubyte[] data)
{
  if (!data.length)
    return null;

  size_t excess = data.length % 4; // Bytes after the last complete unit.
  dchar[] units = cast(dchar[]) data[0 .. $ - excess];
  char[] utf8;
  uint line = 1; // Current line; only used for the location of errors.

  for (size_t i = 0; i < units.length; i++)
  {
    dchar ch = units[i];
    static if (isBigEndian)
      ch = BEtoMachineDword(ch);
    else
      ch = LEtoMachineDword(ch);

    if (!isValidChar(ch))
    {
      auto where = new Location(filePath, line);
      infoMan ~= new LexerError(where, Format(MSG.InvalidUTF32Character, ch));
      ch = REPLACEMENT_CHAR;
    }

    if (isNewline(ch))
      ++line;
    dil.Unicode.encode(utf8, ch);
  }

  if (excess)
  {
    auto where = new Location(filePath, line);
    infoMan ~= new LexerError(where, MSG.UTF32FileMustBeDivisibleBy4);
  }

  return utf8;
}

alias UTF32toUTF8!(true) UTF32BEtoUTF8; /// Instantiation for UTF-32 BE.
alias UTF32toUTF8!(false) UTF32LEtoUTF8; /// Instantiation for UTF-32 LE.
518
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
121
786
3b34f6a95a27 Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 764
diff changeset
/// Converts a UTF-16 text to UTF-8.
///
/// Surrogate pairs are combined into a single character. Every unpaired
/// surrogate is reported to infoMan and replaced with REPLACEMENT_CHAR:
/// a low surrogate without a preceding high surrogate, a high surrogate at
/// the end of the text, and a high surrogate that is not followed by a low
/// surrogate. A trailing odd byte is not converted; its presence is reported
/// as an error.
/// Params:
///   isBigEndian = true if data is UTF-16 BE, false if it is UTF-16 LE.
///   data = the raw bytes of the text.
/// Returns: the text encoded as UTF-8, or null if data is empty.
char[] UTF16toUTF8(bool isBigEndian)(ubyte[] data)
{
  if (data.length == 0)
    return null;

  wchar[] text = cast(wchar[]) data[0 .. $-($%2)]; // Trim to multiple of two.
  wchar* p = text.ptr,
       end = text.ptr + text.length;
  char[] result;
  uint lineNum = 1; // Current line; only used for the location of errors.

  for (; p < end; p++)
  {
    wchar unit = *p;
    static if (isBigEndian)
      unit = BEtoMachineWord(unit);
    else
      unit = LEtoMachineWord(unit);
    dchar c = unit;

    if (0xD800 <= c && c <= 0xDFFF)
    { // c is a surrogate. It is only valid as the first half of a pair.
      bool isPair = false;

      if (c <= 0xDBFF && p+1 < end)
      { // High surrogate with a following unit: try to decode the pair.
        wchar c2 = p[1];
        static if (isBigEndian)
          c2 = BEtoMachineWord(c2);
        else
          c2 = LEtoMachineWord(c2);

        if (0xDC00 <= c2 && c2 <= 0xDFFF)
        {
          // (c - 0xD800 + 0x40) << 10 yields the upper bits plus 0x10000.
          c = (c - 0xD7C0) << 10;
          c |= (c2 & 0x3FF);
          ++p; // Skip the low surrogate.
          isPair = true;
        }
      }

      if (!isPair)
      { // Unpaired surrogate. The following unit, if any, is not consumed.
        infoMan ~= new LexerError(
          new Location(filePath, lineNum),
          Format(MSG.InvalidUTF16Character, c)
        );
        c = REPLACEMENT_CHAR;
      }
    }

    if (isNewline(c))
      ++lineNum;
    dil.Unicode.encode(result, c);
  }

  if (data.length % 2)
    infoMan ~= new LexerError(
      new Location(filePath, lineNum),
      MSG.UTF16FileMustBeDivisibleBy2
    );
  return result;
}

alias UTF16toUTF8!(true) UTF16BEtoUTF8; /// Instantiation for UTF-16 BE.
alias UTF16toUTF8!(false) UTF16LEtoUTF8; /// Instantiation for UTF-16 LE.
518
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
183
786
3b34f6a95a27 Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 764
diff changeset
184 /// Converts the text in data to UTF-8.
3b34f6a95a27 Added and revised documenation comments.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 764
diff changeset
185 /// Leaves data unchanged if it is in UTF-8 already.
518
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
186 char[] data2UTF8(ubyte[] data)
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
187 {
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
188 if (data.length == 0)
755
90668b83ae5e Introduced new module dil.SourceText and class SourceText.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 739
diff changeset
189 return "";
518
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
190
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
191 char[] text;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
192 BOM bom = tellBOM(data);
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
193
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
194 switch (bom)
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
195 {
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
196 case BOM.None:
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
197 // No BOM found. According to the specs the first character
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
198 // must be an ASCII character.
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
199 if (data.length >= 4)
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
200 {
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
201 if (data[0..3] == cast(ubyte[3])x"00 00 00")
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
202 {
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
203 text = UTF32BEtoUTF8(data); // UTF-32BE: 00 00 00 XX
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
204 break;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
205 }
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
206 else if (data[1..4] == cast(ubyte[3])x"00 00 00")
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
207 {
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
208 text = UTF32LEtoUTF8(data); // UTF-32LE: XX 00 00 00
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
209 break;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
210 }
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
211 }
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
212 if (data.length >= 2)
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
213 {
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
214 if (data[0] == 0) // UTF-16BE: 00 XX
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
215 {
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
216 text = UTF16BEtoUTF8(data);
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
217 break;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
218 }
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
219 else if (data[1] == 0) // UTF-16LE: XX 00
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
220 {
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
221 text = UTF16LEtoUTF8(data);
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
222 break;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
223 }
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
224 }
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
225 text = cast(char[])data; // UTF-8
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
226 break;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
227 case BOM.UTF8:
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
228 text = cast(char[])data[3..$];
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
229 break;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
230 case BOM.UTF16BE:
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
231 text = UTF16BEtoUTF8(data[2..$]);
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
232 break;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
233 case BOM.UTF16LE:
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
234 text = UTF16LEtoUTF8(data[2..$]);
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
235 break;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
236 case BOM.UTF32BE:
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
237 text = UTF32BEtoUTF8(data[4..$]);
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
238 break;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
239 case BOM.UTF32LE:
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
240 text = UTF32LEtoUTF8(data[4..$]);
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
241 break;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
242 default:
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
243 assert(0);
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
244 }
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
245 return text;
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
246 }
8f86bb9ef715 Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
247 }
739
49fe21aa387c Added sanitizeText() to dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 577
diff changeset
248
49fe21aa387c Added sanitizeText() to dil.Converter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 577
diff changeset
/// Sanitizes text in place: newline sequences are rewritten to a single '\n'
/// and invalid UTF-8 sequences are replaced with U+FFFD where possible.
///
/// The result is written into the buffer of the passed text, which is only
/// ever compacted and never grown. An invalid sequence is therefore replaced
/// with the three code units of REPLACEMENT_STR only if at least three bytes
/// are free between the write and the read position; otherwise the skipped
/// bytes are kept unchanged.
///
/// Recognized newlines: '\r', "\r\n", '\n' and the three-byte sequences
/// accepted by isUnicodeNewline() (presumably U+2028 and U+2029 - see
/// dil.lexer.Funcs).
///
/// Params:
///   text = the text to be sanitized; its contents are overwritten.
/// Returns: the (possibly shortened) text, or null if text is empty.
string sanitizeText(string text)
{
  if (!text.length)
    return null;

  char* p = text.ptr; // Reading position.
  char* end = p + text.length;
  char* q = p; // Writing position. Never overtakes p.

  for (; p < end; p++, q++)
  {
    assert(q <= p);
    switch (*p)
    {
    case '\r':
      if (p+1 < end && p[1] == '\n')
        p++; // Treat "\r\n" as a single newline.
      // Intentional fall-through.
    case '\n':
      *q = '\n';
      continue;
    default:
      if (isascii(*p))
        break; // Plain ASCII is copied after the switch.
      if (p+2 < end && isUnicodeNewline(p))
      { // A three-byte newline. Skip to its last byte and emit '\n'.
        p += 2;
        goto case '\n';
      }
      auto p2 = p; // Beginning of the UTF-8 sequence.
      dchar c = decode(p, end);
      if (c == ERROR_CHAR)
      { // Skip to next ASCII character or valid UTF-8 sequence.
        while (++p < end && isTrailByte(*p))
        {}
        // p is now one past the skipped bytes, i.e. they occupy p2 .. p.
        alias REPLACEMENT_STR R;
        if (q+2 < p) // Copy replacement char if there is enough space.
          (*q = R[0]), (*++q = R[1]), (*++q = R[2]);
        else
        { // Not enough space for U+FFFD. Keep the skipped bytes as they are.
          // Previously nothing was written here although q still advanced,
          // which left a stale byte from earlier in the buffer in the output
          // (e.g. "\r\n\xFF" became "\n\n").
          while (p2 < p)
            *q++ = *p2++; // Forward copy is safe, because q <= p2.
          q--; // Compensate for q++ in the loop header.
        }
        p--; // Compensate for p++ in the loop header.
      }
      else
      { // Copy the valid UTF-8 sequence.
        while (p2 <= p) // p points to the last trail byte.
          *q++ = *p2++; // Copy code units.
        q--; // Compensate for q++ in the loop header.
      }
      continue;
    }
    assert(isascii(*p));
    *q = *p;
  }
  assert(p == end);
  // p - q is the number of bytes saved by the compaction.
  text.length = text.length - (p - q);
  //text = text.ptr[0 .. q - text.ptr]; // Another way.
  return text;
}
764
4579e8505d5e Fixed unittests and removed dil.File.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 755
diff changeset
306
4579e8505d5e Fixed unittests and removed dil.File.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents: 755
diff changeset
unittest
{
  Stdout("Testing function Converter.\n");

  // A single sample: the raw input (held as text) and the UTF-8 text that
  // the conversion is expected to produce for it.
  struct Data2Text
  {
    char[] text;
    char[] expected = "source";

    // The input reinterpreted as raw bytes.
    ubyte[] data()
    {
      return cast(ubyte[])text;
    }
  }

  // The word "source" in every supported encoding, first without and then
  // with a byte order mark.
  const Data2Text[] samples = [
    // Without BOM
    {"source"},
    {"s\0o\0u\0r\0c\0e\0"},
    {"\0s\0o\0u\0r\0c\0e"},
    {"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
    {"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
    // With BOM
    {"\xEF\xBB\xBFsource"},
    {"\xFE\xFF\0s\0o\0u\0r\0c\0e"},
    {"\xFF\xFEs\0o\0u\0r\0c\0e\0"},
    {"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
    {"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
  ];

  auto converter = Converter("", new InfoManager);
  foreach (index, sample; samples)
  {
    // Every sample must convert to its expected UTF-8 text.
    auto converted = converter.data2UTF8(sample.data);
    assert(converted == sample.expected, Format("failed at item {}", index));
  }
}