Mercurial > projects > dil
comparison trunk/src/dil/Unicode.d @ 722:ceaac6a24258
Added isUnicodeAlpha() for DDocParser and MacroParser.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Fri, 01 Feb 2008 19:44:00 +0100 |
parents | d050e211402b |
children | f88b5285b86b |
comparison
equal
deleted
inserted
replaced
721:8955296dd807 | 722:ceaac6a24258 |
---|---|
40 bool isLeadByte(ubyte b) | 40 bool isLeadByte(ubyte b) |
41 { | 41 { |
42 return (b & 0xC0) == 0xC0; // 11xx_xxxx | 42 return (b & 0xC0) == 0xC0; // 11xx_xxxx |
43 } | 43 } |
44 | 44 |
45 /// Advances ref_p only if this is a valid Unicode alpha character. | |
46 bool isUnicodeAlpha(ref char* ref_p, char* end) | |
47 in { assert(ref_p && ref_p < end); } | |
48 body | |
49 { | |
50 if (*ref_p < 0x80) | |
51 return false; | |
52 auto p = ref_p; | |
53 auto c = decode(p, end); | |
54 if (!isUniAlpha(c)) | |
55 return false; | |
56 ref_p = p; | |
57 return true; | |
58 } | |
59 | |
60 /// index is set one past the last trail byte of the valid UTF-8 sequence. | |
45 dchar decode(char[] str, ref size_t index) | 61 dchar decode(char[] str, ref size_t index) |
46 in { assert(str.length); } | 62 in { assert(str.length && index < str.length); } |
47 out(c) { assert(isValidChar(c)); } | 63 out(c) { assert(isValidChar(c)); } |
48 body | 64 body |
49 { | 65 { |
50 char* p = str.ptr + index; | 66 char* p = str.ptr + index; |
51 char* end = str.ptr + str.length; | 67 char* end = str.ptr + str.length; |
68 dchar c = decode(p, end); | |
69 if (c != ERROR_CHAR) | |
70 index = p - str.ptr + 1; | |
71 return c; | |
72 } | |
73 | |
74 /// ref_p is set to the last trail byte of the valid UTF-8 sequence. | |
75 dchar decode(ref char* ref_p, char* end) | |
76 in { assert(ref_p && ref_p < end); } | |
77 out(c) { assert(isValidChar(c)); } | |
78 body | |
79 { | |
80 char* p = ref_p; | |
52 dchar c = *p; | 81 dchar c = *p; |
53 | 82 |
54 if (!(p < end)) | |
55 return ERROR_CHAR; | |
56 | |
57 if (c < 0x80) | 83 if (c < 0x80) |
58 { | 84 { |
59 ++index; | 85 ref_p++; |
60 return c; | 86 return c; |
61 } | 87 } |
62 | 88 |
63 ++p; // Move to second byte. | 89 p++; // Move to second byte. |
64 if (!(p < end)) | 90 if (!(p < end)) |
65 return ERROR_CHAR; | 91 return ERROR_CHAR; |
66 | 92 |
67 // Error if second byte is not a trail byte. | 93 // Error if second byte is not a trail byte. |
68 if (!isTrailByte(*p)) | 94 if (!isTrailByte(*p)) |
80 default: | 106 default: |
81 if ((c & 0xFE) == 0xC0) // 1100000x | 107 if ((c & 0xFE) == 0xC0) // 1100000x |
82 return ERROR_CHAR; | 108 return ERROR_CHAR; |
83 } | 109 } |
84 | 110 |
85 const char[] checkNextByte = "if (++p < end && !isTrailByte(*p))" | 111 const char[] checkNextByte = "if (!(++p < end && isTrailByte(*p)))" |
86 " return ERROR_CHAR;"; | 112 " return ERROR_CHAR;"; |
87 const char[] appendSixBits = "c = (c << 6) | *p & 0b0011_1111;"; | 113 const char[] appendSixBits = "c = (c << 6) | *p & 0b0011_1111;"; |
88 | 114 |
89 auto next_index = index; | |
90 // Decode | 115 // Decode |
91 if ((c & 0b1110_0000) == 0b1100_0000) | 116 if ((c & 0b1110_0000) == 0b1100_0000) |
92 { | 117 { |
93 // 110xxxxx 10xxxxxx | 118 // 110xxxxx 10xxxxxx |
94 c &= 0b0001_1111; | 119 c &= 0b0001_1111; |
95 mixin(appendSixBits); | 120 mixin(appendSixBits); |
96 next_index += 2; | |
97 } | 121 } |
98 else if ((c & 0b1111_0000) == 0b1110_0000) | 122 else if ((c & 0b1111_0000) == 0b1110_0000) |
99 { | 123 { |
100 // 1110xxxx 10xxxxxx 10xxxxxx | 124 // 1110xxxx 10xxxxxx 10xxxxxx |
101 c &= 0b0000_1111; | 125 c &= 0b0000_1111; |
102 mixin(appendSixBits ~ | 126 mixin(appendSixBits ~ |
103 checkNextByte ~ appendSixBits); | 127 checkNextByte ~ appendSixBits); |
104 next_index += 3; | |
105 } | 128 } |
106 else if ((c & 0b1111_1000) == 0b1111_0000) | 129 else if ((c & 0b1111_1000) == 0b1111_0000) |
107 { | 130 { |
108 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | 131 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
109 c &= 0b0000_0111; | 132 c &= 0b0000_0111; |
110 mixin(appendSixBits ~ | 133 mixin(appendSixBits ~ |
111 checkNextByte ~ appendSixBits ~ | 134 checkNextByte ~ appendSixBits ~ |
112 checkNextByte ~ appendSixBits); | 135 checkNextByte ~ appendSixBits); |
113 next_index += 4; | |
114 } | 136 } |
115 else | 137 else |
116 // 5 and 6 byte UTF-8 sequences are not allowed yet. | 138 // 5 and 6 byte UTF-8 sequences are not allowed yet. |
117 // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | 139 // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
118 // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | 140 // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
120 | 142 |
121 assert(isTrailByte(*p)); | 143 assert(isTrailByte(*p)); |
122 | 144 |
123 if (!isValidChar(c)) | 145 if (!isValidChar(c)) |
124 return ERROR_CHAR; | 146 return ERROR_CHAR; |
125 index = next_index; | 147 ref_p = p; |
126 return c; | 148 return c; |
127 } | 149 } |
128 | 150 |
129 /// Encodes a character and appends it to str. | 151 /// Encodes a character and appends it to str. |
130 void encode(ref char[] str, dchar c) | 152 void encode(ref char[] str, dchar c) |