510
|
1 /++
|
|
2 Author: Aziz Köksal
|
|
3 License: GPL3
|
|
4 +/
|
|
5 module dil.Unicode;
|
|
6 public import std.uni : isUniAlpha;
|
|
7
|
|
/// The Unicode replacement character U+FFFD (�).
/// Substituted for invalid Unicode characters.
const dchar REPLACEMENT_CHAR = 0xFFFD;
/// Sentinel returned by the decode functions on errors.
/// 0xD800 is a surrogate code point, so isValidChar() rejects it.
const dchar ERROR_CHAR = 0xD800;
|
|
12
|
|
/++
  Tells whether d is a valid Unicode character.

  A character is valid when it is not a surrogate code point
  (0xD800 up to and including 0xDFFF) and is not higher than 0x10FFFF.
+/
bool isValidChar(dchar d)
{
  // Anything beyond the last code point is invalid.
  if (d > 0x10FFFF)
    return false;
  // Within range: only the surrogate block is excluded.
  return !(0xD800 <= d && d <= 0xDFFF);
}
|
|
21
|
|
/++
  Returns true if this is one of the Unicode noncharacters.

  There are a total of 66 noncharacters:
  the 32 code points 0xFDD0 up to and including 0xFDEF, and
  the 34 code points not higher than 0x10FFFF whose lower
  16 bits are 0xFFFE or 0xFFFF.
  See_also: Chapter 16.7 Noncharacters in Unicode 5.0
+/
bool isNoncharacter(dchar d)
{
  // The contiguous block of 32 noncharacters.
  if (0xFDD0 <= d && d <= 0xFDEF)
    return true;
  // Values above the last code point never qualify.
  if (d > 0x10FFFF)
    return false;
  // The remaining 34: lower 16 bits equal to 0xFFFE or 0xFFFF.
  auto lower16 = d & 0xFFFF;
  return lower16 == 0xFFFE || lower16 == 0xFFFF;
}
|
|
32
|
|
/// Returns true if this is a trail byte of a UTF-8 sequence,
/// i.e. its two most significant bits are 10.
bool isTrailByte(ubyte b)
{
  return (b >> 6) == 0b10; // 10xx_xxxx
}
|
|
38
|
|
/// Returns true if this is a lead byte of a UTF-8 sequence,
/// i.e. its two most significant bits are 11.
bool isLeadByte(ubyte b)
{
  return (b >> 6) == 0b11; // 11xx_xxxx
}
|
|
44
|
|
/++
  Returns a decoded character from a UTF-8 sequence.
  In case of an error in the sequence ERROR_CHAR (0xD800) is returned
  and index is left unchanged.
  On success index is advanced past the decoded sequence.

  Errors are: index pointing at or past the end of str, a truncated
  sequence, a missing trail byte, an overlong encoding, a lead byte that
  does not start a 2, 3 or 4 byte sequence, and a decoded value for
  which isValidChar() is false.
  Params:
    str = the UTF-8 sequence.
    index = where to start from.
+/
dchar decode(char[] str, ref size_t index)
in { assert(str.length); }
// ERROR_CHAR is a surrogate, so it must be allowed explicitly here.
out(c) { assert(isValidChar(c) || c == ERROR_CHAR); }
body
{
  char* p = str.ptr + index;
  char* end = str.ptr + str.length;

  // Check the bounds before reading the first byte.
  if (!(p < end))
    return ERROR_CHAR;

  dchar c = *p;

  if (c < 0x80)
  { // Plain ASCII character.
    ++index;
    return c;
  }

  ++p; // Move to second byte.
  if (!(p < end))
    return ERROR_CHAR;

  // Error if second byte is not a trail byte.
  if (!isTrailByte(*p))
    return ERROR_CHAR;

  // Check for overlong sequences.
  switch (c)
  {
  case 0xE0, // 11100000 100xxxxx
       0xF0, // 11110000 1000xxxx
       0xF8, // 11111000 10000xxx
       0xFC: // 11111100 100000xx
    if ((*p & c) == 0x80)
      return ERROR_CHAR;
    break;
  default:
    if ((c & 0xFE) == 0xC0) // 1100000x
      return ERROR_CHAR;
  }

  // Determine the number of trail bytes and keep
  // only the payload bits of the lead byte.
  size_t trailCount;
  if ((c & 0b1110_0000) == 0b1100_0000)
  { // 110xxxxx 10xxxxxx
    c &= 0b0001_1111;
    trailCount = 1;
  }
  else if ((c & 0b1111_0000) == 0b1110_0000)
  { // 1110xxxx 10xxxxxx 10xxxxxx
    c &= 0b0000_1111;
    trailCount = 2;
  }
  else if ((c & 0b1111_1000) == 0b1111_0000)
  { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    c &= 0b0000_0111;
    trailCount = 3;
  }
  else
    // A lone trail byte, or a 5 or 6 byte UTF-8 sequence
    // which is not allowed yet.
    // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    return ERROR_CHAR;

  // Append the six bits of the second byte, which was validated above.
  c = (c << 6) | (*p & 0b0011_1111);
  // Validate and append the remaining trail bytes.
  for (size_t i = 1; i < trailCount; ++i)
  {
    // Error if the sequence is truncated or the byte is no trail byte.
    if (!(++p < end && isTrailByte(*p)))
      return ERROR_CHAR;
    c = (c << 6) | (*p & 0b0011_1111);
  }

  assert(isTrailByte(*p));

  // Reject surrogates and values above 0x10FFFF.
  if (!isValidChar(c))
    return ERROR_CHAR;
  index += 1 + trailCount;
  return c;
}
|
|
128
|
|
/// Encodes a character as UTF-16 and appends it to str.
/// Characters below 0x10000 take one code unit,
/// all others are written as a surrogate pair.
void encode(ref wchar[] str, dchar c)
in { assert(isValidChar(c)); }
body
{
  if (c >= 0x10000)
  {
    // Encode with surrogate pair.
    uint offset = c - 0x10000; // c'
    wchar[2] units = void;
    // 0b1101_10xx_xxxx_xxxx | higher10bits(c')
    units[0] = cast(wchar)(0xD800 | (offset >> 10));
    // 0b1101_11yy_yyyy_yyyy | lower10bits(c')
    units[1] = cast(wchar)(0xDC00 | (offset & 0x3FF));
    str ~= units;
  }
  else
    str ~= cast(wchar)c;
}
|
|
148
|
|
/++
  Decodes one character from a UTF-16 sequence.
  On success index is advanced by the number of code units consumed
  (one, or two for a surrogate pair).
  In case of an error in the sequence 0xD800 (ERROR_CHAR) is returned
  and index is left unchanged.
  Params:
    str = the UTF-16 sequence.
    index = where to start from.
+/
dchar decode(wchar[] str, ref size_t index)
{
  assert(str.length && index < str.length);
  dchar lead = str[index];
  bool isSurrogate = 0xD800 <= lead && lead <= 0xDFFF;
  if (!isSurrogate)
  { // A single code unit encodes the character.
    ++index;
    return lead;
  }
  // A low surrogate can't start a pair and
  // a high surrogate needs a following code unit.
  if (lead > 0xDBFF || index+1 == str.length)
    return ERROR_CHAR;
  wchar trail = str[index+1];
  // The second code unit must be a low surrogate.
  if (trail < 0xDC00 || 0xDFFF < trail)
    return ERROR_CHAR;
  // (lead - 0xD800) << 10 + 0x10000 ->
  // (lead - 0xD800 + 0x40) << 10 ->
  // (lead - 0xD7C0) << 10
  dchar result = cast(dchar)(((lead - 0xD7C0) << 10) | (trail & 0x3FF));
  index += 2;
  return result;
}
|
|
180
|
|
/++
  Decodes one character from a UTF-16 sequence.
  On success p is advanced by the number of code units consumed
  (one, or two for a surrogate pair).
  In case of an error in the sequence 0xD800 (ERROR_CHAR) is returned
  and p is left unchanged.
  Params:
    p = start of the UTF-16 sequence.
    end = one past the end of the sequence.
+/
dchar decode(ref wchar* p, wchar* end)
{
  assert(p && p < end);
  dchar lead = *p;
  bool isSurrogate = 0xD800 <= lead && lead <= 0xDFFF;
  if (!isSurrogate)
  { // A single code unit encodes the character.
    ++p;
    return lead;
  }
  // A low surrogate can't start a pair and
  // a high surrogate needs a following code unit.
  if (lead > 0xDBFF || p+1 == end)
    return ERROR_CHAR;
  wchar trail = p[1];
  // The second code unit must be a low surrogate.
  if (trail < 0xDC00 || 0xDFFF < trail)
    return ERROR_CHAR;
  // (lead - 0xD800 + 0x40) << 10 -> (lead - 0xD7C0) << 10
  dchar result = cast(dchar)(((lead - 0xD7C0) << 10) | (trail & 0x3FF));
  p += 2;
  return result;
}
|
|
210
|
|
/++
  Decodes one character from a zero-terminated UTF-16 string.
  On success p is advanced by the number of code units consumed
  (one, or two for a surrogate pair).
  In case of an error in the sequence 0xD800 (ERROR_CHAR) is returned
  and p is left unchanged.

  No end pointer is checked: after a high surrogate the following
  code unit p[1] is read unconditionally, which presumably relies
  on the terminating zero being present.
  Params:
    p = start of the zero-terminated UTF-16 sequence.
+/
dchar decode(ref wchar* p)
{
  assert(p);
  dchar lead = *p;
  bool isSurrogate = 0xD800 <= lead && lead <= 0xDFFF;
  if (!isSurrogate)
  { // A single code unit encodes the character.
    ++p;
    return lead;
  }
  // A low surrogate can't start a pair.
  if (lead > 0xDBFF)
    return ERROR_CHAR;
  wchar trail = p[1];
  // The second code unit must be a low surrogate.
  if (trail < 0xDC00 || 0xDFFF < trail)
    return ERROR_CHAR;
  // (lead - 0xD800 + 0x40) << 10 -> (lead - 0xD7C0) << 10
  dchar result = cast(dchar)(((lead - 0xD7C0) << 10) | (trail & 0x3FF));
  p += 2;
  return result;
}
|