Mercurial > projects > dil
annotate trunk/src/dil/Unicode.d @ 769:5e3ef1b2011c
Added and improved documentation.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Sun, 17 Feb 2008 02:21:55 +0100 |
parents | 4579e8505d5e |
children | c1d5cfd7aa44 |
rev | line source |
---|---|
510 | 1 /++ |
2 Author: Aziz Köksal | |
3 License: GPL3 | |
4 +/ | |
5 module dil.Unicode; | |
629
d050e211402b
Moved files in src/std/ to src/util/.
Aziz Köksal <aziz.koeksal@gmail.com>
parents:
518
diff
changeset
|
6 public import util.uni : isUniAlpha; |
510 | 7 |
8 /// U+FFFD = �. Used to replace invalid Unicode characters. | |
9 const dchar REPLACEMENT_CHAR = '\uFFFD'; | |
739
49fe21aa387c
Added sanitizeText() to dil.Converter.
Aziz Köksal <aziz.koeksal@gmail.com>
parents:
737
diff
changeset
|
10 const char[3] REPLACEMENT_STR = \uFFFD; /// Ditto |
510 | 11 /// Invalid character, returned on errors. |
12 const dchar ERROR_CHAR = 0xD800; | |
13 | |
769
5e3ef1b2011c
Added and improved documentation.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
/// Checks whether d may be encoded as a Unicode scalar value.
/// Returns: true if d is not in the surrogate range (U+D800..U+DFFF)
/// and is not greater than 0x10FFFF.
bool isValidChar(dchar d)
{
  bool isSurrogate = 0xD800 <= d && d <= 0xDFFF;
  return d <= 0x10FFFF && !isSurrogate;
}
20 | |
769
5e3ef1b2011c
Added and improved documentation.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
/// Tests whether d is one of the 66 Unicode noncharacters.
/// Returns: true for U+FDD0..U+FDEF (32 code points) and for every
/// code point up to 0x10FFFF whose low 16 bits are 0xFFFE or 0xFFFF
/// (2 per plane, 34 in total).
/// See_also: Chapter 16.7 Noncharacters in Unicode 5.0
bool isNoncharacter(dchar d)
{
  // The contiguous block of 32 noncharacters in the BMP.
  if (0xFDD0 <= d && d <= 0xFDEF)
    return true;
  // Anything above the Unicode range is not classified as a noncharacter.
  if (d > 0x10FFFF)
    return false;
  // The last two code points of each of the 17 planes.
  return (d & 0xFFFF) >= 0xFFFE;
}
29 | |
747
00f872d949ea
Added method scanCommentText() to DDocEmitter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
741
diff
changeset
|
/// Returns: true if b has the bit pattern 10xx_xxxx, i.e. it is a
/// trail (continuation) byte of a UTF-8 sequence.
bool isTrailByte(ubyte b)
{
  // Only the two most significant bits matter.
  return (b >> 6) == 0b10;
}
35 | |
747
00f872d949ea
Added method scanCommentText() to DDocEmitter.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
741
diff
changeset
|
/// Returns: true if b has the bit pattern 11xx_xxxx, i.e. it looks like
/// a lead byte of a multi-byte UTF-8 sequence.
///
/// Only the two top bits are inspected, so bytes that can never start a
/// valid sequence (e.g. 0xC0, 0xC1, 0xF8..0xFF) are accepted as well.
bool isLeadByte(ubyte b)
{
  // For an 8-bit value, "both top bits set" is the same as b >= 0xC0.
  return b >= 0xC0;
}
41 | |
722
ceaac6a24258
Added isUnicodeAlpha() for DDocParser and MacroParser.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
629
diff
changeset
|
/// Tests whether a non-ASCII Unicode alpha character starts at ref_p.
///
/// ASCII bytes (< 0x80) are rejected without decoding.
/// Params:
///   ref_p = start of the character; modified only when true is returned,
///           in which case it is left where decode() put it
///           (on the last trail byte of the sequence).
///   end   = one past the end of the text.
/// Returns: true if the decoded character satisfies isUniAlpha().
bool isUnicodeAlpha(ref char* ref_p, char* end)
in { assert(ref_p && ref_p < end); }
body
{
  if (*ref_p >= 0x80)
  {
    // Decode through a copy so that ref_p stays untouched on failure.
    auto cursor = ref_p;
    if (isUniAlpha(decode(cursor, end)))
    {
      ref_p = cursor;
      return true;
    }
  }
  return false;
}
ceaac6a24258
Added isUnicodeAlpha() for DDocParser and MacroParser.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
629
diff
changeset
|
56 |
ceaac6a24258
Added isUnicodeAlpha() for DDocParser and MacroParser.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
629
diff
changeset
|
/// Decodes the UTF-8 encoded character that starts at str[index].
///
/// Params:
///   str   = the UTF-8 text; must not be empty.
///   index = position of the first byte of the character. On success it is
///           set one past the last byte of the decoded sequence (one past
///           the ASCII character or one past the last trail byte).
///           It is left unchanged if ERROR_CHAR is returned.
/// Returns: the decoded character or ERROR_CHAR if the sequence is invalid.
dchar decode(char[] str, ref size_t index)
in { assert(str.length && index < str.length); }
out(c) { assert(isValidChar(c) || c == ERROR_CHAR); }
body
{
  char* p = str.ptr + index;
  char* end = str.ptr + str.length;
  // The pointer overload of decode() is asymmetric: for an ASCII character
  // it already advances p one past the character, whereas for a multi-byte
  // sequence it leaves p ON the last trail byte. Remember which case we are
  // in so that index ends up exactly one past the character in both cases
  // (unconditionally adding 1 would skip a byte after every ASCII char).
  bool isAscii = *p < 0x80;
  dchar c = decode(p, end);
  if (c != ERROR_CHAR)
    index = p - str.ptr + (isAscii ? 0 : 1);
  return c;
}
510 | 70 |
722
ceaac6a24258
Added isUnicodeAlpha() for DDocParser and MacroParser.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
629
diff
changeset
|
/// Decodes the UTF-8 encoded character that starts at ref_p.
///
/// Params:
///   ref_p = first byte of the character. For an ASCII character it is
///           advanced one past that character. For a valid multi-byte
///           sequence it is set to the last trail byte of the sequence.
///           It is left unchanged when ERROR_CHAR is returned.
///   end   = one past the end of the text.
/// Returns: the decoded character, or ERROR_CHAR if the sequence is
/// truncated, malformed, overlong, longer than 4 bytes, or does not decode
/// to a valid character (see isValidChar()).
dchar decode(ref char* ref_p, char* end)
in { assert(ref_p && ref_p < end); }
out(c) { assert(isValidChar(c) || c == ERROR_CHAR); }
body
{
  char* cursor = ref_p;
  dchar c = *cursor;

  // Single-byte (ASCII) character.
  if (c < 0x80)
  {
    ref_p++;
    return c;
  }

  // A second byte must exist and it must be a trail byte.
  ++cursor;
  if (!(cursor < end) || !isTrailByte(*cursor))
    return ERROR_CHAR;

  // Reject overlong encodings.
  // 11100000 100xxxxx, 11110000 1000xxxx,
  // 11111000 10000xxx, 11111100 100000xx
  if ((c == 0xE0 || c == 0xF0 || c == 0xF8 || c == 0xFC) &&
      (*cursor & c) == 0x80)
    return ERROR_CHAR;
  // 1100000x
  if ((c & 0xFE) == 0xC0)
    return ERROR_CHAR;

  // Determine the payload bits of the lead byte and how many trail bytes
  // follow the second byte.
  uint moreTrailBytes;
  if ((c & 0b1110_0000) == 0b1100_0000)
  { // 110xxxxx 10xxxxxx
    c &= 0b0001_1111;
    moreTrailBytes = 0;
  }
  else if ((c & 0b1111_0000) == 0b1110_0000)
  { // 1110xxxx 10xxxxxx 10xxxxxx
    c &= 0b0000_1111;
    moreTrailBytes = 1;
  }
  else if ((c & 0b1111_1000) == 0b1111_0000)
  { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    c &= 0b0000_0111;
    moreTrailBytes = 2;
  }
  else
    // 5 and 6 byte UTF-8 sequences are not allowed yet.
    // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    return ERROR_CHAR;

  // Append the six payload bits of the (already validated) second byte.
  c = (c << 6) | *cursor & 0b0011_1111;
  // Validate and append each remaining trail byte.
  for (; moreTrailBytes > 0; --moreTrailBytes)
  {
    if (!(++cursor < end && isTrailByte(*cursor)))
      return ERROR_CHAR;
    c = (c << 6) | *cursor & 0b0011_1111;
  }

  assert(isTrailByte(*cursor));

  // Surrogates and values above 0x10FFFF are not valid characters.
  if (!isValidChar(c))
    return ERROR_CHAR;
  ref_p = cursor;
  return c;
}
147 | |
/// Encodes a character as UTF-8 and appends it to str.
///
/// Params:
///   str = the string the encoded bytes are appended to.
///   c   = the character to encode; must satisfy isValidChar().
void encode(ref char[] str, dchar c)
{
  assert(isValidChar(c), "check if character is valid before calling encode().");

  // One byte: 0xxxxxxx
  if (c < 0x80)
  {
    str ~= cast(char)c;
    return;
  }

  char[4] buf = void;
  size_t len;
  // Write the lead byte and determine the sequence length.
  if (c < 0x800)
  { // 110xxxxx 10xxxxxx
    buf[0] = cast(char)(0xC0 | (c >> 6));
    len = 2;
  }
  else if (c < 0x10000)
  { // 1110xxxx 10xxxxxx 10xxxxxx
    buf[0] = cast(char)(0xE0 | (c >> 12));
    len = 3;
  }
  else if (c < 0x200000)
  { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    buf[0] = cast(char)(0xF0 | (c >> 18));
    len = 4;
  }
  else
    // There are no 5 and 6 byte UTF-8 sequences yet.
    assert(0);

  // Fill in the trail bytes from last to first, six bits at a time.
  for (size_t i = len - 1; i > 0; --i)
  {
    buf[i] = cast(char)(0x80 | (c & 0x3F));
    c >>= 6;
  }
  str ~= buf[0 .. len];
}
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
510
diff
changeset
|
201 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
510
diff
changeset
|
/// Encodes a character as UTF-16 and appends it to str.
///
/// Params:
///   str = the string the encoded code units are appended to.
///   c   = the character to encode; must satisfy isValidChar().
void encode(ref wchar[] str, dchar c)
in { assert(isValidChar(c)); }
body
{
  if (c >= 0x10000)
  { // Characters beyond the BMP are encoded with a surrogate pair.
    uint offset = c - 0x10000; // c'
    wchar[2] pair = void;
    // higher10bits(c') | 0b1101_10xx_xxxx_xxxx
    pair[0] = cast(wchar)(0xD800 | (offset >> 10));
    // lower10bits(c') | 0b1101_11yy_yyyy_yyyy
    pair[1] = cast(wchar)(0xDC00 | (offset & 0x3FF));
    str ~= pair;
    return;
  }
  // BMP characters fit into a single code unit.
  str ~= cast(wchar)c;
}
220 | |
769
5e3ef1b2011c
Added and improved documentation.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
/// Decodes a character from a UTF-16 sequence.
/// Returns: the decoded character, or ERROR_CHAR in case of an error in
/// the sequence (lone low surrogate, or high surrogate not followed by a
/// low surrogate).
/// Params:
///   str = the UTF-16 sequence.
///   index = where to start from. Advanced past the decoded code unit(s)
///           on success; unchanged when ERROR_CHAR is returned.
dchar decode(wchar[] str, ref size_t index)
{
  assert(str.length && index < str.length);
  dchar lead = str[index];

  // Anything outside the surrogate range is a complete character.
  bool isSurrogate = 0xD800 <= lead && lead <= 0xDFFF;
  if (!isSurrogate)
  {
    ++index;
    return lead;
  }

  // Must be a high surrogate with at least one code unit after it.
  if (lead > 0xDBFF || index+1 == str.length)
    return ERROR_CHAR;

  // The following code unit must be a low surrogate.
  wchar trail = str[index+1];
  if (trail < 0xDC00 || 0xDFFF < trail)
    return ERROR_CHAR;

  // Decode surrogate pair.
  // (lead - 0xD800) << 10 + 0x10000 ->
  // (lead - 0xD800 + 0x40) << 10 ->
  index += 2;
  return cast(dchar)(((lead - 0xD7C0) << 10) | (trail & 0x3FF));
}
250 | |
769
5e3ef1b2011c
Added and improved documentation.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
764
diff
changeset
|
/// Decodes a character from a UTF-16 sequence.
/// Returns: the decoded character, or ERROR_CHAR in case of an error in
/// the sequence (lone low surrogate, or high surrogate not followed by a
/// low surrogate).
/// Params:
///   p = start of the UTF-16 sequence. Advanced past the decoded code
///       unit(s) on success; unchanged when ERROR_CHAR is returned.
///   end = one past the end of the sequence.
dchar decode(ref wchar* p, wchar* end)
{
  assert(p && p < end);
  dchar lead = *p;

  // Anything outside the surrogate range is a complete character.
  bool isSurrogate = 0xD800 <= lead && lead <= 0xDFFF;
  if (!isSurrogate)
  {
    ++p;
    return lead;
  }

  // Must be a high surrogate with at least one code unit after it.
  if (lead > 0xDBFF || p+1 == end)
    return ERROR_CHAR;

  // The following code unit must be a low surrogate.
  wchar trail = p[1];
  if (trail < 0xDC00 || 0xDFFF < trail)
    return ERROR_CHAR;

  // Combine the pair: ((lead - 0xD800) << 10) + 0x10000 | low 10 bits.
  p += 2;
  return cast(dchar)(((lead - 0xD7C0) << 10) | (trail & 0x3FF));
}
278 | |
/// Decodes a character from a zero-terminated UTF-16 string.
///
/// No end pointer is checked: when *p is a high surrogate, p[1] is read
/// unconditionally (at worst this is the terminating zero).
/// Note that the terminating zero itself is decoded like any other
/// character, i.e. 0 is returned and p is advanced past it.
/// Returns: the decoded character, or ERROR_CHAR in case of an error in
/// the sequence.
/// Params:
///   p = start of the UTF-16 sequence. Advanced past the decoded code
///       unit(s) on success; unchanged when ERROR_CHAR is returned.
dchar decode(ref wchar* p)
{
  assert(p);
  dchar lead = *p;

  // Anything outside the surrogate range is a complete character.
  bool isSurrogate = 0xD800 <= lead && lead <= 0xDFFF;
  if (!isSurrogate)
  {
    ++p;
    return lead;
  }

  // A low surrogate cannot start a character.
  if (lead > 0xDBFF)
    return ERROR_CHAR;

  // The following code unit must be a low surrogate.
  wchar trail = p[1];
  if (trail < 0xDC00 || 0xDFFF < trail)
    return ERROR_CHAR;

  // Combine the pair: ((lead - 0xD800) << 10) + 0x10000 | low 10 bits.
  p += 2;
  return cast(dchar)(((lead - 0xD7C0) << 10) | (trail & 0x3FF));
}