projects/dil: trunk/src/dil/Unicode.d comparison

comparison trunk/src/dil/Unicode.d @ 789:c1d5cfd7aa44

Implemented string literal conversion. Removed two MID messages. Added MSG.InvalidUTF8SequenceInString. Added toUTF16() and toUTF32(). Fixed escape sequences. Added formatBytes() and findInvalidUTF8Sequence().

author	Aziz Köksal <aziz.koeksal@gmail.com>
date	Mon, 25 Feb 2008 02:56:22 +0100
parents	5e3ef1b2011c
children

comparison

equal deleted inserted replaced

-:139c9a6a39a8
+:c1d5cfd7aa44
 return false;
 ref_p = p;
 return true;
 }
-/// index is set one past the last trail byte of the valid UTF-8 sequence.
+/// Decodes a character from str at index; returns ERROR_CHAR on failure.
+/// Params:
+///   index = set to one past the ASCII char or one past the last trail byte
+///           of the valid UTF-8 sequence; left unchanged on ERROR_CHAR.
 dchar decode(char[] str, ref size_t index)
 in { assert(str.length && index < str.length); }
-out(c) { assert(isValidChar(c) || c == ERROR_CHAR); }
+out { assert(index <= str.length); }
 body
 {
 char* p = str.ptr + index;
 char* end = str.ptr + str.length;
 dchar c = decode(p, end);
 if (c != ERROR_CHAR)
-index = p - str.ptr + 1;
+index = p - str.ptr;
 return c;
 }
-/// ref_p is set to the last trail byte of the valid UTF-8 sequence.
+/// Decodes a character starting at ref_p.
+/// Params:
+///   ref_p = set to one past the ASCII char or one past the last trail byte
+///           of the valid UTF-8 sequence.
 dchar decode(ref char* ref_p, char* end)
 in { assert(ref_p && ref_p < end); }
-out(c) { assert(isValidChar(c) || c == ERROR_CHAR); }
+out(c) { assert(ref_p <= end && (isValidChar(c) || c == ERROR_CHAR)); }
 body
 {
 char* p = ref_p;
 dchar c = *p;
 if (c < 0x80)
-{
+return ref_p++, c;
-ref_p++;
-return c;
-}
 p++; // Move to second byte.
 if (!(p < end))
 return ERROR_CHAR;
 assert(isTrailByte(*p));
 if (!isValidChar(c))
 return ERROR_CHAR;
-ref_p = p;
+ref_p = p+1;
 return c;
 }
-/// Encodes a character and appends it to str.
+/// Encodes c and appends it to str.
 void encode(ref char[] str, dchar c)
 {
 assert(isValidChar(c), "check if character is valid before calling encode().");
 char[6] b = void;
 +/
 else
 assert(0);
 }
-/// Encodes a character and appends it to str.
+/// Encodes c and appends it to str.
 void encode(ref wchar[] str, dchar c)
 in { assert(isValidChar(c)); }
 body
 {
 if (c < 0x10000)
 pair[1] = (c & 0x3FF) | 0xDC00;
 str ~= pair;
 }
 }
-/// Returns a decoded character from a UTF-16 sequence.
+/// Decodes a character from a UTF-16 sequence.
-/// Returns: ERROR_CHAR in case of an error in the sequence.
 /// Params:
 ///   str = the UTF-16 sequence.
 ///   index = where to start from.
+/// Returns: ERROR_CHAR in case of an error in the sequence.
 dchar decode(wchar[] str, ref size_t index)
 {
 assert(str.length && index < str.length);
 dchar c = str[index];
 if (0xD800 > c || c > 0xDFFF)
 }
 }
 return ERROR_CHAR;
 }
-/// Returns a decoded character from a UTF-16 sequence.
+/// Decodes a character from a UTF-16 sequence.
-/// Returns: ERROR_CHAR in case of an error in the sequence.
 /// Params:
 ///   p = start of the UTF-16 sequence.
 ///   end = one past the end of the sequence.
+/// Returns: ERROR_CHAR in case of an error in the sequence.
 dchar decode(ref wchar* p, wchar* end)
 {
 assert(p && p < end);
 dchar c = *p;
 if (0xD800 > c || c > 0xDFFF)
 }
 }
 return ERROR_CHAR;
 }
-/// Decode a character from a zero-terminated string.
+/// Decodes a character from a zero-terminated UTF-16 string.
+/// Params:
+///   p = start of the UTF-16 sequence.
+/// Returns: ERROR_CHAR in case of an error in the sequence.
 dchar decode(ref wchar* p)
 {
 assert(p);
 dchar c = *p;
 if (0xD800 > c || c > 0xDFFF)
 return c;
 }
 }
 return ERROR_CHAR;
 }
+/// Converts a UTF-8 string to a UTF-16 string; each invalid UTF-8 sequence becomes one REPLACEMENT_CHAR.
+wchar[] toUTF16(char[] str)
+{
+wchar[] result;
+size_t idx;
+while (idx < str.length)
+{
+auto c = decode(str, idx);
+if (c == ERROR_CHAR)
+{ // decode() left idx unchanged: step past the offending byte and any trail bytes after it.
+while (++idx < str.length && isTrailByte(str[idx]))
+{}
+c = REPLACEMENT_CHAR;
+}
+encode(result, c);
+}
+return result;
+}
+/// Converts a UTF-8 string to a UTF-32 string; each invalid UTF-8 sequence becomes one REPLACEMENT_CHAR.
+dchar[] toUTF32(char[] str)
+{
+dchar[] result;
+size_t idx;
+while (idx < str.length)
+{
+auto c = decode(str, idx);
+if (c == ERROR_CHAR)
+{ // decode() left idx unchanged: step past the offending byte and any trail bytes after it.
+while (++idx < str.length && isTrailByte(str[idx]))
+{}
+c = REPLACEMENT_CHAR;
+}
+result ~= c;
+}
+return result;
+}

Mercurial > projects > dil

comparison trunk/src/dil/Unicode.d @ 789:c1d5cfd7aa44