Mercurial > projects > dil
diff trunk/src/dil/Unicode.d @ 789:c1d5cfd7aa44
Implemented string literal conversion.
Removed two MID messages.
Added MSG.InvalidUTF8SequenceInString.
Added toUTF16() and toUTF32().
Fixed escape sequences.
Added formatBytes() and findInvalidUTF8Sequence().
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Mon, 25 Feb 2008 02:56:22 +0100 |
parents | 5e3ef1b2011c |
children |
line wrap: on
line diff
--- a/trunk/src/dil/Unicode.d Sun Feb 24 03:19:02 2008 +0100 +++ b/trunk/src/dil/Unicode.d Mon Feb 25 02:56:22 2008 +0100 @@ -54,34 +54,37 @@ return true; } -/// index is set one past the last trail byte of the valid UTF-8 sequence. +/// Decodes a character from str at index. +/// Params: +/// index = set to one past the ASCII char or one past the last trail byte +/// of the valid UTF-8 sequence. dchar decode(char[] str, ref size_t index) in { assert(str.length && index < str.length); } -out(c) { assert(isValidChar(c) || c == ERROR_CHAR); } +out { assert(index <= str.length); } body { char* p = str.ptr + index; char* end = str.ptr + str.length; dchar c = decode(p, end); if (c != ERROR_CHAR) - index = p - str.ptr + 1; + index = p - str.ptr; return c; } -/// ref_p is set to the last trail byte of the valid UTF-8 sequence. +/// Decodes a character starting at ref_p. +/// Params: +/// ref_p = set to one past the ASCII char or one past the last trail byte +/// of the valid UTF-8 sequence. dchar decode(ref char* ref_p, char* end) in { assert(ref_p && ref_p < end); } -out(c) { assert(isValidChar(c) || c == ERROR_CHAR); } +out(c) { assert(ref_p <= end && (isValidChar(c) || c == ERROR_CHAR)); } body { char* p = ref_p; dchar c = *p; if (c < 0x80) - { - ref_p++; - return c; - } + return ref_p++, c; p++; // Move to second byte. if (!(p < end)) @@ -141,11 +144,11 @@ if (!isValidChar(c)) return ERROR_CHAR; - ref_p = p; + ref_p = p+1; return c; } -/// Encodes a character and appends it to str. +/// Encodes c and appends it to str. void encode(ref char[] str, dchar c) { assert(isValidChar(c), "check if character is valid before calling encode()."); @@ -199,7 +202,7 @@ assert(0); } -/// Encodes a character and appends it to str. +/// Encodes c and appends it to str. void encode(ref wchar[] str, dchar c) in { assert(isValidChar(c)); } body @@ -218,11 +221,11 @@ } } -/// Returns a decoded character from a UTF-16 sequence. -/// Returns: ERROR_CHAR in case of an error in the sequence. 
+/// Decodes a character from a UTF-16 sequence. /// Params: /// str = the UTF-16 sequence. /// index = where to start from. +/// Returns: ERROR_CHAR in case of an error in the sequence. dchar decode(wchar[] str, ref size_t index) { assert(str.length && index < str.length); @@ -248,11 +251,11 @@ return ERROR_CHAR; } -/// Returns a decoded character from a UTF-16 sequence. -/// Returns: ERROR_CHAR in case of an error in the sequence. +/// Decodes a character from a UTF-16 sequence. /// Params: /// p = start of the UTF-16 sequence. /// end = one past the end of the sequence. +/// Returns: ERROR_CHAR in case of an error in the sequence. dchar decode(ref wchar* p, wchar* end) { assert(p && p < end); @@ -276,7 +279,10 @@ return ERROR_CHAR; } -/// Decode a character from a zero-terminated string. +/// Decodes a character from a zero-terminated UTF-16 string. +/// Params: +/// p = start of the UTF-16 sequence. +/// Returns: ERROR_CHAR in case of an error in the sequence. dchar decode(ref wchar* p) { assert(p); @@ -299,3 +305,41 @@ } return ERROR_CHAR; } + +/// Converts a UTF-8 string to a UTF-16 string. +wchar[] toUTF16(char[] str) +{ + wchar[] result; + size_t idx; + while (idx < str.length) + { + auto c = decode(str, idx); + if (c == ERROR_CHAR) + { // Skip trail bytes. + while (++idx < str.length && isTrailByte(str[idx])) + {} + c = REPLACEMENT_CHAR; + } + encode(result, c); + } + return result; +} + +/// Converts a UTF-8 string to a UTF-32 string. +dchar[] toUTF32(char[] str) +{ + dchar[] result; + size_t idx; + while (idx < str.length) + { + auto c = decode(str, idx); + if (c == ERROR_CHAR) + { // Skip trail bytes. + while (++idx < str.length && isTrailByte(str[idx])) + {} + c = REPLACEMENT_CHAR; + } + result ~= c; + } + return result; +}