annotate trunk/src/dil/Unicode.d @ 510:dd3ce87b3569

Added module dil.Unicode. Moved some functions from dil.Lexer to dil.Unicode. Added isIdentifierString() to dil.Lexer. Renamed isNonReservedIdentifier() to isReservedIdentifier().
author Aziz K?ksal <aziz.koeksal@gmail.com>
date Thu, 13 Dec 2007 18:45:29 +0100
parents
children 8f86bb9ef715
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
510
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
1 /++
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
2 Author: Aziz Köksal
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
3 License: GPL3
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
4 +/
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
5 module dil.Unicode;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
6 public import std.uni : isUniAlpha;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
7
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
8 /// U+FFFD = �. Used to replace invalid Unicode characters.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
9 const dchar REPLACEMENT_CHAR = '\uFFFD';
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
10 /// Invalid character, returned on errors.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
11 const dchar ERROR_CHAR = 0xD800;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
12
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
13 /++
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
14 Returns true if this character is not a surrogate
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
15 code point and not higher than 0x10FFFF.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
16 +/
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
17 bool isValidChar(dchar d)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
18 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
19 return d < 0xD800 || d > 0xDFFF && d <= 0x10FFFF;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
20 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
21
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
22 /++
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
23 Returns true if this is one of the
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
24 There are a total of 66 noncharacters.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
25 See_also: Chapter 16.7 Noncharacters in Unicode 5.0
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
26 +/
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
27 bool isNoncharacter(dchar d)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
28 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
29 return 0xFDD0 <= d && d <= 0xFDEF || // 32
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
30 d <= 0x10FFFF && (d & 0xFFFF) >= 0xFFFE; // 34
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
31 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
32
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
33 /// Returns true if this is a trail byte of a UTF-8 sequence?
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
34 bool isTrailByte(ubyte b)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
35 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
36 return (b & 0xC0) == 0x80; // 10xx_xxxx
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
37 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
38
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
39 /// Returns true if this is a lead byte of a UTF-8 sequence.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
40 bool isLeadByte(ubyte b)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
41 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
42 return (b & 0xC0) == 0xC0; // 11xx_xxxx
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
43 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
44
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
45 dchar decode(char[] str, ref size_t index)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
46 in { assert(str.length); }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
47 out(c) { assert(isValidChar(c)); }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
48 body
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
49 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
50 char* p = str.ptr + index;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
51 char* end = str.ptr + str.length;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
52 dchar c = *p;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
53
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
54 if (!(p < end))
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
55 return ERROR_CHAR;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
56
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
57 if (c < 0x80)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
58 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
59 ++index;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
60 return c;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
61 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
62
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
63 ++p; // Move to second byte.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
64 if (!(p < end))
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
65 return ERROR_CHAR;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
66
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
67 // Error if second byte is not a trail byte.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
68 if (!isTrailByte(*p))
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
69 return ERROR_CHAR;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
70
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
71 // Check for overlong sequences.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
72 switch (c)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
73 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
74 case 0xE0, // 11100000 100xxxxx
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
75 0xF0, // 11110000 1000xxxx
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
76 0xF8, // 11111000 10000xxx
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
77 0xFC: // 11111100 100000xx
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
78 if ((*p & c) == 0x80)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
79 return ERROR_CHAR;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
80 default:
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
81 if ((c & 0xFE) == 0xC0) // 1100000x
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
82 return ERROR_CHAR;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
83 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
84
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
85 const char[] checkNextByte = "if (++p < end && !isTrailByte(*p))"
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
86 " return ERROR_CHAR;";
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
87 const char[] appendSixBits = "c = (c << 6) | *p & 0b0011_1111;";
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
88
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
89 auto next_index = index;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
90 // Decode
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
91 if ((c & 0b1110_0000) == 0b1100_0000)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
92 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
93 // 110xxxxx 10xxxxxx
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
94 c &= 0b0001_1111;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
95 mixin(appendSixBits);
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
96 next_index += 2;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
97 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
98 else if ((c & 0b1111_0000) == 0b1110_0000)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
99 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
100 // 1110xxxx 10xxxxxx 10xxxxxx
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
101 c &= 0b0000_1111;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
102 mixin(appendSixBits ~
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
103 checkNextByte ~ appendSixBits);
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
104 next_index += 3;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
105 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
106 else if ((c & 0b1111_1000) == 0b1111_0000)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
107 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
108 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
109 c &= 0b0000_0111;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
110 mixin(appendSixBits ~
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
111 checkNextByte ~ appendSixBits ~
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
112 checkNextByte ~ appendSixBits);
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
113 next_index += 4;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
114 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
115 else
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
116 // 5 and 6 byte UTF-8 sequences are not allowed yet.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
117 // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
118 // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
119 return ERROR_CHAR;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
120
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
121 assert(isTrailByte(*p));
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
122
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
123 if (!isValidChar(c))
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
124 return ERROR_CHAR;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
125 index = next_index;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
126 return c;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
127 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
128
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
129 /// Encodes a character and appends it to str.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
130 void encode(ref wchar[] str, dchar c)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
131 in { assert(isValidChar(c)); }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
132 body
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
133 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
134 if (c < 0x10000)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
135 str ~= cast(wchar)c;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
136 else
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
137 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
138 // Encode with surrogate pair.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
139 wchar[2] pair = void;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
140 c -= 0x10000; // c'
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
141 // higher10bits(c') | 0b1101_10xx_xxxx_xxxx
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
142 pair[0] = (c >> 10) | 0xD800;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
143 // lower10bits(c') | 0b1101_11yy_yyyy_yyyy
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
144 pair[1] = (c & 0x3FF) | 0xDC00;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
145 str ~= pair;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
146 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
147 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
148
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
149 /++
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
150 Returns a decoded character from a UTF-16 sequence.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
151 In case of an error in the sequence 0xD800 is returned.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
152 Params:
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
153 str = the UTF-16 sequence.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
154 index = where to start from.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
155 +/
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
156 dchar decode(wchar[] str, ref size_t index)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
157 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
158 assert(str.length && index < str.length);
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
159 dchar c = str[index];
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
160 if (0xD800 > c || c > 0xDFFF)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
161 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
162 ++index;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
163 return c;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
164 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
165 if (c <= 0xDBFF && index+1 != str.length)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
166 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
167 wchar c2 = str[index+1];
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
168 if (0xDC00 <= c2 && c2 <= 0xDFFF)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
169 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
170 // (c - 0xD800) << 10 + 0x10000 ->
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
171 // (c - 0xD800 + 0x40) << 10 ->
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
172 c = (c - 0xD7C0) << 10;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
173 c |= (c2 & 0x3FF);
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
174 index += 2;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
175 return c;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
176 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
177 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
178 return ERROR_CHAR;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
179 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
180
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
181 /++
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
182 Returns a decoded character from a UTF-16 sequence.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
183 In case of an error in the sequence 0xD800 is returned.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
184 Params:
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
185 p = start of the UTF-16 sequence.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
186 end = one past the end of the sequence.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
187 +/
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
188 dchar decode(ref wchar* p, wchar* end)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
189 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
190 assert(p && p < end);
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
191 dchar c = *p;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
192 if (0xD800 > c || c > 0xDFFF)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
193 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
194 ++p;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
195 return c;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
196 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
197 if (c <= 0xDBFF && p+1 != end)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
198 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
199 wchar c2 = p[1];
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
200 if (0xDC00 <= c2 && c2 <= 0xDFFF)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
201 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
202 c = (c - 0xD7C0) << 10;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
203 c |= (c2 & 0x3FF);
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
204 p += 2;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
205 return c;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
206 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
207 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
208 return ERROR_CHAR;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
209 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
210
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
211 /// Decode a character from a zero-terminated string.
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
212 dchar decode(ref wchar* p)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
213 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
214 assert(p);
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
215 dchar c = *p;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
216 if (0xD800 > c || c > 0xDFFF)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
217 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
218 ++p;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
219 return c;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
220 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
221 if (c <= 0xDBFF)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
222 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
223 wchar c2 = p[1];
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
224 if (0xDC00 <= c2 && c2 <= 0xDFFF)
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
225 {
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
226 c = (c - 0xD7C0) << 10;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
227 c |= (c2 & 0x3FF);
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
228 p += 2;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
229 return c;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
230 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
231 }
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
232 return ERROR_CHAR;
dd3ce87b3569 Added module dil.Unicode.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
diff changeset
233 }