Mercurial > projects > dil
changeset 629:d050e211402b
Moved files in src/std/ to src/util/.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Fri, 11 Jan 2008 20:03:46 +0100 |
parents | 08681b93c3b3 |
children | 5197bd351e5f |
files | trunk/src/dil/CompilerInfo.d trunk/src/dil/File.d trunk/src/dil/Unicode.d trunk/src/std/metastrings.d trunk/src/std/uni.d trunk/src/std/utf.d trunk/src/util/metastrings.d trunk/src/util/uni.d trunk/src/util/utf.d |
diffstat | 9 files changed, 1833 insertions(+), 1833 deletions(-) [+] |
line wrap: on
line diff
--- a/trunk/src/dil/CompilerInfo.d Fri Jan 11 15:23:38 2008 +0100 +++ b/trunk/src/dil/CompilerInfo.d Fri Jan 11 20:03:46 2008 +0100 @@ -3,7 +3,7 @@ License: GPL3 +/ module dil.CompilerInfo; -import std.metastrings : FormatT = Format, ToString; +import util.metastrings : FormatT = Format, ToString; template Pad(char[] str, uint amount) {
--- a/trunk/src/dil/File.d Fri Jan 11 15:23:38 2008 +0100 +++ b/trunk/src/dil/File.d Fri Jan 11 20:03:46 2008 +0100 @@ -8,7 +8,7 @@ import dil.Information; import dil.Converter; import tango.io.File; -import std.utf; +import util.utf; import common; /// Loads a file in any valid Unicode format and converts it to UTF-8.
--- a/trunk/src/dil/Unicode.d Fri Jan 11 15:23:38 2008 +0100 +++ b/trunk/src/dil/Unicode.d Fri Jan 11 20:03:46 2008 +0100 @@ -3,7 +3,7 @@ License: GPL3 +/ module dil.Unicode; -public import std.uni : isUniAlpha; +public import util.uni : isUniAlpha; /// U+FFFD = �. Used to replace invalid Unicode characters. const dchar REPLACEMENT_CHAR = '\uFFFD';
--- a/trunk/src/std/metastrings.d Fri Jan 11 15:23:38 2008 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,225 +0,0 @@ - -// Written in the D programming language. - -/** - * Templates with which to do compile time manipulation of strings. - * - * Macros: - * WIKI = Phobos/StdMetastrings - * Copyright: - * Public Domain - */ - -/* - * Authors: - * Walter Bright, Digital Mars, www.digitalmars.com - * Don Clugston - */ - -/* - Note: this is not the original file! - Modified by Aziz Köksal: - Only changed some types from string to char[] -*/ - -module std.metastrings; - -/** - * Formats constants into a string at compile time. - * Analogous to std.string.format(). - * Parameters: - * A = tuple of constants, which can be strings, - * characters, or integral values. - * Formats: - * The formats supported are %s for strings, and %% - * for the % character. - * Example: - * --- -import std.metastrings; -import std.stdio; - -void main() -{ - string s = Format!("Arg %s = %s", "foo", 27); - writefln(s); // "Arg foo = 27" -} - * --- - */ - -template Format(A...) -{ - static if (A.length == 0) - const char[] Format = ""; - else static if (is(typeof(A[0]) : char[])) - const char[] Format = FormatString!(A[0], A[1..$]); - //const char[] Format = FormatString!(A[0]); - else - const char[] Format = ToString!(A[0]) ~ Format!(A[1..$]); -} - -template FormatString(char[] F, A...) -{ - static if (F.length == 0) - const char[] FormatString = Format!(A); - else static if (F.length == 1) - const char[] FormatString = F[0] ~ Format!(A); - else static if (F[0..2] == "%s") - const char[] FormatString = ToString!(A[0]) ~ FormatString!(F[2..$],A[1..$]); - else static if (F[0..2] == "%%") - const char[] FormatString = "%" ~ FormatString!(F[2..$],A); - else static if (F[0] == '%') - static assert(0, "unrecognized format %" ~ F[1]); - else - const char[] FormatString = F[0] ~ FormatString!(F[1..$],A); -} - -/** - * Convert constant argument to a string. 
- */ - -template ToString(ulong U) -{ - static if (U < 10) - const char[] ToString = "" ~ cast(char)(U + '0'); - else - const char[] ToString = ToString!(U / 10) ~ ToString!(U % 10); -} - -/// ditto -template ToString(long I) -{ - static if (I < 0) - const char[] ToString = "-" ~ ToString!(cast(ulong)(-I)); - else - const char[] ToString = ToString!(cast(ulong)I); -} - -static assert(ToString!(0x100000000) == "4294967296"); - -/// ditto -template ToString(uint U) -{ - const char[] ToString = ToString!(cast(ulong)U); -} - -/// ditto -template ToString(int I) -{ - const char[] ToString = ToString!(cast(long)I); -} - -/// ditto -template ToString(ushort U) -{ - const char[] ToString = ToString!(cast(ulong)U); -} - -/// ditto -template ToString(short I) -{ - const char[] ToString = ToString!(cast(long)I); -} - -/// ditto -template ToString(ubyte U) -{ - const char[] ToString = ToString!(cast(ulong)U); -} - -/// ditto -template ToString(byte I) -{ - const char[] ToString = ToString!(cast(long)I); -} - -/// ditto -template ToString(bool B) -{ - const char[] ToString = B ? "true" : "false"; -} - -/// ditto -template ToString(char[] S) -{ - const char[] ToString = S; -} - -/// ditto -template ToString(char C) -{ - const char[] ToString = "" ~ C; -} - -unittest -{ - char[] s = Format!("hel%slo", "world", -138, 'c', true); - assert(s == "helworldlo-138ctrue"); -} - - -/******** - * Parse unsigned integer literal from the start of string s. 
- * returns: - * .value = the integer literal as a string, - * .rest = the string following the integer literal - * Otherwise: - * .value = null, - * .rest = s - */ - -template ParseUinteger(char[] s) -{ - static if (s.length == 0) - { const char[] value = ""; - const char[] rest = ""; - } - else static if (s[0] >= '0' && s[0] <= '9') - { const char[] value = s[0] ~ ParseUinteger!(s[1..$]).value; - const char[] rest = ParseUinteger!(s[1..$]).rest; - } - else - { const char[] value = ""; - const char[] rest = s; - } -} - -/******** - * Parse integer literal optionally preceded by '-' - * from the start of string s. - * returns: - * .value = the integer literal as a string, - * .rest = the string following the integer literal - * Otherwise: - * .value = null, - * .rest = s - */ - -template ParseInteger(char[] s) -{ - static if (s.length == 0) - { const char[] value = ""; - const char[] rest = ""; - } - else static if (s[0] >= '0' && s[0] <= '9') - { const char[] value = s[0] ~ ParseUinteger!(s[1..$]).value; - const char[] rest = ParseUinteger!(s[1..$]).rest; - } - else static if (s.length >= 2 && - s[0] == '-' && s[1] >= '0' && s[1] <= '9') - { const char[] value = s[0..2] ~ ParseUinteger!(s[2..$]).value; - const char[] rest = ParseUinteger!(s[2..$]).rest; - } - else - { const char[] value = ""; - const char[] rest = s; - } -} - -unittest -{ - assert(ParseUinteger!("1234abc").value == "1234"); - assert(ParseUinteger!("1234abc").rest == "abc"); - assert(ParseInteger!("-1234abc").value == "-1234"); - assert(ParseInteger!("-1234abc").rest == "abc"); -} -
--- a/trunk/src/std/uni.d Fri Jan 11 15:23:38 2008 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,630 +0,0 @@ - -// Written in the D programming language. - -/* - * Placed into the Public Domain. - * Digital Mars, www.digitalmars.com - * Written by Walter Bright - */ - -/** - * Simple Unicode character classification functions. - * For ASCII classification, see $(LINK2 std_ctype.html, std.ctype). - * Macros: - * WIKI=Phobos/StdUni - * References: - * $(LINK2 http://www.digitalmars.com/d/ascii-table.html, ASCII Table), - * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia), - * $(LINK2 http://www.unicode.org, The Unicode Consortium) - * Trademarks: - * Unicode(tm) is a trademark of Unicode, Inc. - */ - - -module std.uni; - -/** - * Returns !=0 if c is a Unicode lower case character. - */ -int isUniLower(dchar c) -{ - if (c <= 0x7F) - return (c >= 'a' && c <= 'z'); - - return isUniAlpha(c) && c == toUniLower(c); -} - -/** - * Returns !=0 if c is a Unicode upper case character. - */ -int isUniUpper(dchar c) -{ - if (c <= 0x7F) - return (c >= 'A' && c <= 'Z'); - - return isUniAlpha(c) && c == toUniUpper(c); -} - -/** - * If c is a Unicode upper case character, return the lower case - * equivalent, otherwise return c. 
- */ -dchar toUniLower(dchar c) -{ - if (c >= 'A' && c <= 'Z') - { - c += 32; - } - else if (c >= 0x00C0) - { - if ((c >= 0x00C0 && c <= 0x00D6) || (c >= 0x00D8 && c<=0x00DE)) - { - c += 32; - } - else if ((c >= 0x0100 && c < 0x0138) || (c > 0x0149 && c < 0x0178)) - { - if (c == 0x0130) - c = 0x0069; - else if ((c & 1) == 0) - c += 1; - } - else if (c == 0x0178) - { - c = 0x00FF; - } - else if ((c >= 0x0139 && c < 0x0149) || (c > 0x0178 && c < 0x017F)) - { - if (c & 1) - c += 1; - } - else if (c >= 0x0200 && c <= 0x0217) - { - if ((c & 1) == 0) - c += 1; - } - else if ((c >= 0x0401 && c <= 0x040C) || (c>= 0x040E && c <= 0x040F)) - { - c += 80; - } - else if (c >= 0x0410 && c <= 0x042F) - { - c += 32; - } - else if (c >= 0x0460 && c <= 0x047F) - { - if ((c & 1) == 0) - c += 1; - } - else if (c >= 0x0531 && c <= 0x0556) - { - c += 48; - } - else if (c >= 0x10A0 && c <= 0x10C5) - { - c += 48; - } - else if (c >= 0xFF21 && c <= 0xFF3A) - { - c += 32; - } - } - return c; -} - -/** - * If c is a Unicode lower case character, return the upper case - * equivalent, otherwise return c. 
- */ -dchar toUniUpper(dchar c) -{ - if (c >= 'a' && c <= 'z') - { - c -= 32; - } - else if (c >= 0x00E0) - { - if ((c >= 0x00E0 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FE)) - { - c -= 32; - } - else if (c == 0x00FF) - { - c = 0x0178; - } - else if ((c >= 0x0100 && c < 0x0138) || (c > 0x0149 && c < 0x0178)) - { - if (c == 0x0131) - c = 0x0049; - else if (c & 1) - c -= 1; - } - else if ((c >= 0x0139 && c < 0x0149) || (c > 0x0178 && c < 0x017F)) - { - if ((c & 1) == 0) - c = c-1; - } - else if (c == 0x017F) - { - c = 0x0053; - } - else if (c >= 0x0200 && c <= 0x0217) - { - if (c & 1) - c = c-1; - } - else if (c >= 0x0430 && c<= 0x044F) - { - c -= 32; - } - else if ((c >= 0x0451 && c <= 0x045C) || (c >=0x045E && c<= 0x045F)) - { - c -= 80; - } - else if (c >= 0x0460 && c <= 0x047F) - { - if (c & 1) - c -= 1; - } - else if (c >= 0x0561 && c < 0x0587) - { - c -= 48; - } - else if (c >= 0xFF41 && c <= 0xFF5A) - { - c -= 32; - } - } - return c; -} - - -/******************************* - * Return !=0 if u is a Unicode alpha character. 
- * (general Unicode category: Lu, Ll, Lt, Lm and Lo) - * - * Standards: Unicode 5.0.0 - */ - -int isUniAlpha(dchar u) -{ - static dchar table[][2] = - [ - [ 'A', 'Z' ], - [ 'a', 'z' ], - [ 0x00AA, 0x00AA ], - [ 0x00B5, 0x00B5 ], - [ 0x00BA, 0x00BA ], - [ 0x00C0, 0x00D6 ], - [ 0x00D8, 0x00F6 ], - [ 0x00F8, 0x02C1 ], - [ 0x02C6, 0x02D1 ], - [ 0x02E0, 0x02E4 ], - [ 0x02EE, 0x02EE ], - [ 0x037A, 0x037D ], - [ 0x0386, 0x0386 ], - [ 0x0388, 0x038A ], - [ 0x038C, 0x038C ], - [ 0x038E, 0x03A1 ], - [ 0x03A3, 0x03CE ], - [ 0x03D0, 0x03F5 ], - [ 0x03F7, 0x0481 ], - [ 0x048A, 0x0513 ], - [ 0x0531, 0x0556 ], - [ 0x0559, 0x0559 ], - [ 0x0561, 0x0587 ], - [ 0x05D0, 0x05EA ], - [ 0x05F0, 0x05F2 ], - [ 0x0621, 0x063A ], - [ 0x0640, 0x064A ], - [ 0x066E, 0x066F ], - [ 0x0671, 0x06D3 ], - [ 0x06D5, 0x06D5 ], - [ 0x06E5, 0x06E6 ], - [ 0x06EE, 0x06EF ], - [ 0x06FA, 0x06FC ], - [ 0x06FF, 0x06FF ], - [ 0x0710, 0x0710 ], - [ 0x0712, 0x072F ], - [ 0x074D, 0x076D ], - [ 0x0780, 0x07A5 ], - [ 0x07B1, 0x07B1 ], - [ 0x07CA, 0x07EA ], - [ 0x07F4, 0x07F5 ], - [ 0x07FA, 0x07FA ], - [ 0x0904, 0x0939 ], - [ 0x093D, 0x093D ], - [ 0x0950, 0x0950 ], - [ 0x0958, 0x0961 ], - [ 0x097B, 0x097F ], - [ 0x0985, 0x098C ], - [ 0x098F, 0x0990 ], - [ 0x0993, 0x09A8 ], - [ 0x09AA, 0x09B0 ], - [ 0x09B2, 0x09B2 ], - [ 0x09B6, 0x09B9 ], - [ 0x09BD, 0x09BD ], - [ 0x09CE, 0x09CE ], - [ 0x09DC, 0x09DD ], - [ 0x09DF, 0x09E1 ], - [ 0x09F0, 0x09F1 ], - [ 0x0A05, 0x0A0A ], - [ 0x0A0F, 0x0A10 ], - [ 0x0A13, 0x0A28 ], - [ 0x0A2A, 0x0A30 ], - [ 0x0A32, 0x0A33 ], - [ 0x0A35, 0x0A36 ], - [ 0x0A38, 0x0A39 ], - [ 0x0A59, 0x0A5C ], - [ 0x0A5E, 0x0A5E ], - [ 0x0A72, 0x0A74 ], - [ 0x0A85, 0x0A8D ], - [ 0x0A8F, 0x0A91 ], - [ 0x0A93, 0x0AA8 ], - [ 0x0AAA, 0x0AB0 ], - [ 0x0AB2, 0x0AB3 ], - [ 0x0AB5, 0x0AB9 ], - [ 0x0ABD, 0x0ABD ], - [ 0x0AD0, 0x0AD0 ], - [ 0x0AE0, 0x0AE1 ], - [ 0x0B05, 0x0B0C ], - [ 0x0B0F, 0x0B10 ], - [ 0x0B13, 0x0B28 ], - [ 0x0B2A, 0x0B30 ], - [ 0x0B32, 0x0B33 ], - [ 0x0B35, 0x0B39 ], - [ 0x0B3D, 0x0B3D ], - [ 
0x0B5C, 0x0B5D ], - [ 0x0B5F, 0x0B61 ], - [ 0x0B71, 0x0B71 ], - [ 0x0B83, 0x0B83 ], - [ 0x0B85, 0x0B8A ], - [ 0x0B8E, 0x0B90 ], - [ 0x0B92, 0x0B95 ], - [ 0x0B99, 0x0B9A ], - [ 0x0B9C, 0x0B9C ], - [ 0x0B9E, 0x0B9F ], - [ 0x0BA3, 0x0BA4 ], - [ 0x0BA8, 0x0BAA ], - [ 0x0BAE, 0x0BB9 ], - [ 0x0C05, 0x0C0C ], - [ 0x0C0E, 0x0C10 ], - [ 0x0C12, 0x0C28 ], - [ 0x0C2A, 0x0C33 ], - [ 0x0C35, 0x0C39 ], - [ 0x0C60, 0x0C61 ], - [ 0x0C85, 0x0C8C ], - [ 0x0C8E, 0x0C90 ], - [ 0x0C92, 0x0CA8 ], - [ 0x0CAA, 0x0CB3 ], - [ 0x0CB5, 0x0CB9 ], - [ 0x0CBD, 0x0CBD ], - [ 0x0CDE, 0x0CDE ], - [ 0x0CE0, 0x0CE1 ], - [ 0x0D05, 0x0D0C ], - [ 0x0D0E, 0x0D10 ], - [ 0x0D12, 0x0D28 ], - [ 0x0D2A, 0x0D39 ], - [ 0x0D60, 0x0D61 ], - [ 0x0D85, 0x0D96 ], - [ 0x0D9A, 0x0DB1 ], - [ 0x0DB3, 0x0DBB ], - [ 0x0DBD, 0x0DBD ], - [ 0x0DC0, 0x0DC6 ], - [ 0x0E01, 0x0E30 ], - [ 0x0E32, 0x0E33 ], - [ 0x0E40, 0x0E46 ], - [ 0x0E81, 0x0E82 ], - [ 0x0E84, 0x0E84 ], - [ 0x0E87, 0x0E88 ], - [ 0x0E8A, 0x0E8A ], - [ 0x0E8D, 0x0E8D ], - [ 0x0E94, 0x0E97 ], - [ 0x0E99, 0x0E9F ], - [ 0x0EA1, 0x0EA3 ], - [ 0x0EA5, 0x0EA5 ], - [ 0x0EA7, 0x0EA7 ], - [ 0x0EAA, 0x0EAB ], - [ 0x0EAD, 0x0EB0 ], - [ 0x0EB2, 0x0EB3 ], - [ 0x0EBD, 0x0EBD ], - [ 0x0EC0, 0x0EC4 ], - [ 0x0EC6, 0x0EC6 ], - [ 0x0EDC, 0x0EDD ], - [ 0x0F00, 0x0F00 ], - [ 0x0F40, 0x0F47 ], - [ 0x0F49, 0x0F6A ], - [ 0x0F88, 0x0F8B ], - [ 0x1000, 0x1021 ], - [ 0x1023, 0x1027 ], - [ 0x1029, 0x102A ], - [ 0x1050, 0x1055 ], - [ 0x10A0, 0x10C5 ], - [ 0x10D0, 0x10FA ], - [ 0x10FC, 0x10FC ], - [ 0x1100, 0x1159 ], - [ 0x115F, 0x11A2 ], - [ 0x11A8, 0x11F9 ], - [ 0x1200, 0x1248 ], - [ 0x124A, 0x124D ], - [ 0x1250, 0x1256 ], - [ 0x1258, 0x1258 ], - [ 0x125A, 0x125D ], - [ 0x1260, 0x1288 ], - [ 0x128A, 0x128D ], - [ 0x1290, 0x12B0 ], - [ 0x12B2, 0x12B5 ], - [ 0x12B8, 0x12BE ], - [ 0x12C0, 0x12C0 ], - [ 0x12C2, 0x12C5 ], - [ 0x12C8, 0x12D6 ], - [ 0x12D8, 0x1310 ], - [ 0x1312, 0x1315 ], - [ 0x1318, 0x135A ], - [ 0x1380, 0x138F ], - [ 0x13A0, 0x13F4 ], - [ 0x1401, 0x166C ], - [ 0x166F, 0x1676 ], - 
[ 0x1681, 0x169A ], - [ 0x16A0, 0x16EA ], - [ 0x1700, 0x170C ], - [ 0x170E, 0x1711 ], - [ 0x1720, 0x1731 ], - [ 0x1740, 0x1751 ], - [ 0x1760, 0x176C ], - [ 0x176E, 0x1770 ], - [ 0x1780, 0x17B3 ], - [ 0x17D7, 0x17D7 ], - [ 0x17DC, 0x17DC ], - [ 0x1820, 0x1877 ], - [ 0x1880, 0x18A8 ], - [ 0x1900, 0x191C ], - [ 0x1950, 0x196D ], - [ 0x1970, 0x1974 ], - [ 0x1980, 0x19A9 ], - [ 0x19C1, 0x19C7 ], - [ 0x1A00, 0x1A16 ], - [ 0x1B05, 0x1B33 ], - [ 0x1B45, 0x1B4B ], - [ 0x1D00, 0x1DBF ], - [ 0x1E00, 0x1E9B ], - [ 0x1EA0, 0x1EF9 ], - [ 0x1F00, 0x1F15 ], - [ 0x1F18, 0x1F1D ], - [ 0x1F20, 0x1F45 ], - [ 0x1F48, 0x1F4D ], - [ 0x1F50, 0x1F57 ], - [ 0x1F59, 0x1F59 ], - [ 0x1F5B, 0x1F5B ], - [ 0x1F5D, 0x1F5D ], - [ 0x1F5F, 0x1F7D ], - [ 0x1F80, 0x1FB4 ], - [ 0x1FB6, 0x1FBC ], - [ 0x1FBE, 0x1FBE ], - [ 0x1FC2, 0x1FC4 ], - [ 0x1FC6, 0x1FCC ], - [ 0x1FD0, 0x1FD3 ], - [ 0x1FD6, 0x1FDB ], - [ 0x1FE0, 0x1FEC ], - [ 0x1FF2, 0x1FF4 ], - [ 0x1FF6, 0x1FFC ], - [ 0x2071, 0x2071 ], - [ 0x207F, 0x207F ], - [ 0x2090, 0x2094 ], - [ 0x2102, 0x2102 ], - [ 0x2107, 0x2107 ], - [ 0x210A, 0x2113 ], - [ 0x2115, 0x2115 ], - [ 0x2119, 0x211D ], - [ 0x2124, 0x2124 ], - [ 0x2126, 0x2126 ], - [ 0x2128, 0x2128 ], - [ 0x212A, 0x212D ], - [ 0x212F, 0x2139 ], - [ 0x213C, 0x213F ], - [ 0x2145, 0x2149 ], - [ 0x214E, 0x214E ], - [ 0x2183, 0x2184 ], - [ 0x2C00, 0x2C2E ], - [ 0x2C30, 0x2C5E ], - [ 0x2C60, 0x2C6C ], - [ 0x2C74, 0x2C77 ], - [ 0x2C80, 0x2CE4 ], - [ 0x2D00, 0x2D25 ], - [ 0x2D30, 0x2D65 ], - [ 0x2D6F, 0x2D6F ], - [ 0x2D80, 0x2D96 ], - [ 0x2DA0, 0x2DA6 ], - [ 0x2DA8, 0x2DAE ], - [ 0x2DB0, 0x2DB6 ], - [ 0x2DB8, 0x2DBE ], - [ 0x2DC0, 0x2DC6 ], - [ 0x2DC8, 0x2DCE ], - [ 0x2DD0, 0x2DD6 ], - [ 0x2DD8, 0x2DDE ], - [ 0x3005, 0x3006 ], - [ 0x3031, 0x3035 ], - [ 0x303B, 0x303C ], - [ 0x3041, 0x3096 ], - [ 0x309D, 0x309F ], - [ 0x30A1, 0x30FA ], - [ 0x30FC, 0x30FF ], - [ 0x3105, 0x312C ], - [ 0x3131, 0x318E ], - [ 0x31A0, 0x31B7 ], - [ 0x31F0, 0x31FF ], - [ 0x3400, 0x4DB5 ], - [ 0x4E00, 0x9FBB ], - [ 0xA000, 0xA48C ], 
- [ 0xA717, 0xA71A ], - [ 0xA800, 0xA801 ], - [ 0xA803, 0xA805 ], - [ 0xA807, 0xA80A ], - [ 0xA80C, 0xA822 ], - [ 0xA840, 0xA873 ], - [ 0xAC00, 0xD7A3 ], - [ 0xF900, 0xFA2D ], - [ 0xFA30, 0xFA6A ], - [ 0xFA70, 0xFAD9 ], - [ 0xFB00, 0xFB06 ], - [ 0xFB13, 0xFB17 ], - [ 0xFB1D, 0xFB1D ], - [ 0xFB1F, 0xFB28 ], - [ 0xFB2A, 0xFB36 ], - [ 0xFB38, 0xFB3C ], - [ 0xFB3E, 0xFB3E ], - [ 0xFB40, 0xFB41 ], - [ 0xFB43, 0xFB44 ], - [ 0xFB46, 0xFBB1 ], - [ 0xFBD3, 0xFD3D ], - [ 0xFD50, 0xFD8F ], - [ 0xFD92, 0xFDC7 ], - [ 0xFDF0, 0xFDFB ], - [ 0xFE70, 0xFE74 ], - [ 0xFE76, 0xFEFC ], - [ 0xFF21, 0xFF3A ], - [ 0xFF41, 0xFF5A ], - [ 0xFF66, 0xFFBE ], - [ 0xFFC2, 0xFFC7 ], - [ 0xFFCA, 0xFFCF ], - [ 0xFFD2, 0xFFD7 ], - [ 0xFFDA, 0xFFDC ], - [ 0x10000, 0x1000B ], - [ 0x1000D, 0x10026 ], - [ 0x10028, 0x1003A ], - [ 0x1003C, 0x1003D ], - [ 0x1003F, 0x1004D ], - [ 0x10050, 0x1005D ], - [ 0x10080, 0x100FA ], - [ 0x10300, 0x1031E ], - [ 0x10330, 0x10340 ], - [ 0x10342, 0x10349 ], - [ 0x10380, 0x1039D ], - [ 0x103A0, 0x103C3 ], - [ 0x103C8, 0x103CF ], - [ 0x10400, 0x1049D ], - [ 0x10800, 0x10805 ], - [ 0x10808, 0x10808 ], - [ 0x1080A, 0x10835 ], - [ 0x10837, 0x10838 ], - [ 0x1083C, 0x1083C ], - [ 0x1083F, 0x1083F ], - [ 0x10900, 0x10915 ], - [ 0x10A00, 0x10A00 ], - [ 0x10A10, 0x10A13 ], - [ 0x10A15, 0x10A17 ], - [ 0x10A19, 0x10A33 ], - [ 0x12000, 0x1236E ], - [ 0x1D400, 0x1D454 ], - [ 0x1D456, 0x1D49C ], - [ 0x1D49E, 0x1D49F ], - [ 0x1D4A2, 0x1D4A2 ], - [ 0x1D4A5, 0x1D4A6 ], - [ 0x1D4A9, 0x1D4AC ], - [ 0x1D4AE, 0x1D4B9 ], - [ 0x1D4BB, 0x1D4BB ], - [ 0x1D4BD, 0x1D4C3 ], - [ 0x1D4C5, 0x1D505 ], - [ 0x1D507, 0x1D50A ], - [ 0x1D50D, 0x1D514 ], - [ 0x1D516, 0x1D51C ], - [ 0x1D51E, 0x1D539 ], - [ 0x1D53B, 0x1D53E ], - [ 0x1D540, 0x1D544 ], - [ 0x1D546, 0x1D546 ], - [ 0x1D54A, 0x1D550 ], - [ 0x1D552, 0x1D6A5 ], - [ 0x1D6A8, 0x1D6C0 ], - [ 0x1D6C2, 0x1D6DA ], - [ 0x1D6DC, 0x1D6FA ], - [ 0x1D6FC, 0x1D714 ], - [ 0x1D716, 0x1D734 ], - [ 0x1D736, 0x1D74E ], - [ 0x1D750, 0x1D76E ], - [ 0x1D770, 0x1D788 ], - 
[ 0x1D78A, 0x1D7A8 ], - [ 0x1D7AA, 0x1D7C2 ], - [ 0x1D7C4, 0x1D7CB ], - [ 0x20000, 0x2A6D6 ], - [ 0x2F800, 0x2FA1D ], - ]; - - debug - { - for (int i = 0; i < table.length; i++) - { - assert(table[i][0] <= table[i][1]); - if (i < table.length - 1) - { -// if (table[i][1] >= table[i + 1][0]) -// printf("table[%d][1] = x%x, table[%d][0] = x%x\n", i, table[i][1], i + 1, table[i + 1][0]); - assert(table[i][1] < table[i + 1][0]); - } - } - } - - if (u < 0xAA) - { - if (u < 'A') - goto Lisnot; - if (u <= 'Z') - goto Lis; - if (u < 'a') - goto Lisnot; - if (u <= 'z') - goto Lis; - goto Lisnot; - } - - // Binary search - uint mid; - uint low; - uint high; - - low = 0; - high = table.length - 1; - while (cast(int)low <= cast(int)high) - { - mid = (low + high) >> 1; - if (u < table[mid][0]) - high = mid - 1; - else if (u > table[mid][1]) - low = mid + 1; - else - goto Lis; - } - -Lisnot: - debug - { - for (int i = 0; i < table.length; i++) - { - assert(u < table[i][0] || u > table[i][1]); - } - } - return 0; - -Lis: - debug - { - for (int i = 0; i < table.length; i++) - { - if (u >= table[i][0] && u <= table[i][1]) - return 1; - } - assert(0); // should have been in table - } - return 1; -} - -unittest -{ - for (uint i = 0; i < 0x80; i++) - { - if (i >= 'A' && i <= 'Z') - assert(isUniAlpha(i)); - else if (i >= 'a' && i <= 'z') - assert(isUniAlpha(i)); - else - assert(!isUniAlpha(i)); - } -}
--- a/trunk/src/std/utf.d Fri Jan 11 15:23:38 2008 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,975 +0,0 @@ -// utf.d - -/* - * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com - * Written by Walter Bright - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * o The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * o Altered source versions must be plainly marked as such, and must not - * be misrepresented as being the original software. - * o This notice may not be removed or altered from any source - * distribution. - */ - -/******************************************** - * Encode and decode UTF-8, UTF-16 and UTF-32 strings. - * - * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D - * wchar type. - * For linux systems, the C wchar_t type is UTF-32 and corresponds to - * the D utf.dchar type. - * - * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF). - * - * See_Also: - * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br> - * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br> - * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335) - * Macros: - * WIKI = Phobos/StdUtf - */ - -/* - Note: this is not the original file! - Modified by Aziz Köksal: - Only commented out deprecated class UtfError. 
-*/ - -module std.utf; - -// private import std.stdio; - -//debug=utf; // uncomment to turn on debugging printf's -/+ -deprecated class UtfError : Error -{ - size_t idx; // index in string of where error occurred - - this(char[] s, size_t i) - { - idx = i; - super(s); - } -} -+/ -/********************************** - * Exception class that is thrown upon any errors. - */ - -class UtfException : Exception -{ - size_t idx; /// index in string of where error occurred - - this(char[] s, size_t i) - { - idx = i; - super(s); - } -} - -/******************************* - * Test if c is a valid UTF-32 character. - * - * \uFFFE and \uFFFF are considered valid by this function, - * as they are permitted for internal use by an application, - * but they are not allowed for interchange by the Unicode standard. - * - * Returns: true if it is, false if not. - */ - -bool isValidDchar(dchar c) -{ - /* Note: FFFE and FFFF are specifically permitted by the - * Unicode standard for application internal use, but are not - * allowed for interchange. 
- * (thanks to Arcane Jill) - */ - - return c < 0xD800 || - (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/); -} - -unittest -{ - debug(utf) printf("utf.isValidDchar.unittest\n"); - assert(isValidDchar(cast(dchar)'a') == true); - assert(isValidDchar(cast(dchar)0x1FFFFF) == false); -} - - -ubyte[256] UTF8stride = -[ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF, -]; - -/** - * stride() returns the length of a UTF-8 sequence starting at index i - * in string s. - * Returns: - * The number of bytes in the UTF-8 sequence or - * 0xFF meaning s[i] is not the start of of UTF-8 sequence. - */ - -uint stride(char[] s, size_t i) -{ - return UTF8stride[s[i]]; -} - -/** - * stride() returns the length of a UTF-16 sequence starting at index i - * in string s. - */ - -uint stride(wchar[] s, size_t i) -{ uint u = s[i]; - return 1 + (u >= 0xD800 && u <= 0xDBFF); -} - -/** - * stride() returns the length of a UTF-32 sequence starting at index i - * in string s. - * Returns: The return value will always be 1. 
- */ - -uint stride(dchar[] s, size_t i) -{ - return 1; -} - -/******************************************* - * Given an index i into an array of characters s[], - * and assuming that index i is at the start of a UTF character, - * determine the number of UCS characters up to that index i. - */ - -size_t toUCSindex(char[] s, size_t i) -{ - size_t n; - size_t j; - size_t stride; - - for (j = 0; j < i; j += stride) - { - stride = UTF8stride[s[j]]; - if (stride == 0xFF) - goto Lerr; - n++; - } - if (j > i) - { - Lerr: - throw new UtfException("1invalid UTF-8 sequence", j); - } - return n; -} - -/** ditto */ - -size_t toUCSindex(wchar[] s, size_t i) -{ - size_t n; - size_t j; - - for (j = 0; j < i; ) - { uint u = s[j]; - - j += 1 + (u >= 0xD800 && u <= 0xDBFF); - n++; - } - if (j > i) - { - Lerr: - throw new UtfException("2invalid UTF-16 sequence", j); - } - return n; -} - -/** ditto */ - -size_t toUCSindex(dchar[] s, size_t i) -{ - return i; -} - -/****************************************** - * Given a UCS index n into an array of characters s[], return the UTF index. - */ - -size_t toUTFindex(char[] s, size_t n) -{ - size_t i; - - while (n--) - { - uint j = UTF8stride[s[i]]; - if (j == 0xFF) - throw new UtfException("3invalid UTF-8 sequence", i); - i += j; - } - return i; -} - -/** ditto */ - -size_t toUTFindex(wchar[] s, size_t n) -{ - size_t i; - - while (n--) - { wchar u = s[i]; - - i += 1 + (u >= 0xD800 && u <= 0xDBFF); - } - return i; -} - -/** ditto */ - -size_t toUTFindex(dchar[] s, size_t n) -{ - return n; -} - -/* =================== Decode ======================= */ - -/*************** - * Decodes and returns character starting at s[idx]. idx is advanced past the - * decoded character. If the character is not well formed, a UtfException is - * thrown and idx remains unchanged. 
- */ - -dchar decode(char[] s, inout size_t idx) - in - { - assert(idx >= 0 && idx < s.length); - } - out (result) - { - assert(isValidDchar(result)); - } - body - { - size_t len = s.length; - dchar V; - size_t i = idx; - char u = s[i]; - - if (u & 0x80) - { uint n; - char u2; - - /* The following encodings are valid, except for the 5 and 6 byte - * combinations: - * 0xxxxxxx - * 110xxxxx 10xxxxxx - * 1110xxxx 10xxxxxx 10xxxxxx - * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - */ - for (n = 1; ; n++) - { - if (n > 4) - goto Lerr; // only do the first 4 of 6 encodings - if (((u << n) & 0x80) == 0) - { - if (n == 1) - goto Lerr; - break; - } - } - - // Pick off (7 - n) significant bits of B from first byte of octet - V = cast(dchar)(u & ((1 << (7 - n)) - 1)); - - if (i + (n - 1) >= len) - goto Lerr; // off end of string - - /* The following combinations are overlong, and illegal: - * 1100000x (10xxxxxx) - * 11100000 100xxxxx (10xxxxxx) - * 11110000 1000xxxx (10xxxxxx 10xxxxxx) - * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) - * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) - */ - u2 = s[i + 1]; - if ((u & 0xFE) == 0xC0 || - (u == 0xE0 && (u2 & 0xE0) == 0x80) || - (u == 0xF0 && (u2 & 0xF0) == 0x80) || - (u == 0xF8 && (u2 & 0xF8) == 0x80) || - (u == 0xFC && (u2 & 0xFC) == 0x80)) - goto Lerr; // overlong combination - - for (uint j = 1; j != n; j++) - { - u = s[i + j]; - if ((u & 0xC0) != 0x80) - goto Lerr; // trailing bytes are 10xxxxxx - V = (V << 6) | (u & 0x3F); - } - if (!isValidDchar(V)) - goto Lerr; - i += n; - } - else - { - V = cast(dchar) u; - i++; - } - - idx = i; - return V; - - Lerr: - //printf("\ndecode: idx = %d, i = %d, length = %d s = \n'%.*s'\n%x\n'%.*s'\n", idx, i, s.length, s, s[i], s[i .. 
length]); - throw new UtfException("4invalid UTF-8 sequence", i); - } - -unittest -{ size_t i; - dchar c; - - debug(utf) printf("utf.decode.unittest\n"); - - static char[] s1 = "abcd"; - i = 0; - c = decode(s1, i); - assert(c == cast(dchar)'a'); - assert(i == 1); - c = decode(s1, i); - assert(c == cast(dchar)'b'); - assert(i == 2); - - static char[] s2 = "\xC2\xA9"; - i = 0; - c = decode(s2, i); - assert(c == cast(dchar)'\u00A9'); - assert(i == 2); - - static char[] s3 = "\xE2\x89\xA0"; - i = 0; - c = decode(s3, i); - assert(c == cast(dchar)'\u2260'); - assert(i == 3); - - static char[][] s4 = - [ "\xE2\x89", // too short - "\xC0\x8A", - "\xE0\x80\x8A", - "\xF0\x80\x80\x8A", - "\xF8\x80\x80\x80\x8A", - "\xFC\x80\x80\x80\x80\x8A", - ]; - - for (int j = 0; j < s4.length; j++) - { - try - { - i = 0; - c = decode(s4[j], i); - assert(0); - } - catch (UtfException u) - { - i = 23; - delete u; - } - assert(i == 23); - } -} - -/** ditto */ - -dchar decode(wchar[] s, inout size_t idx) - in - { - assert(idx >= 0 && idx < s.length); - } - out (result) - { - assert(isValidDchar(result)); - } - body - { - char[] msg; - dchar V; - size_t i = idx; - uint u = s[i]; - - if (u & ~0x7F) - { if (u >= 0xD800 && u <= 0xDBFF) - { uint u2; - - if (i + 1 == s.length) - { msg = "surrogate UTF-16 high value past end of string"; - goto Lerr; - } - u2 = s[i + 1]; - if (u2 < 0xDC00 || u2 > 0xDFFF) - { msg = "surrogate UTF-16 low value out of range"; - goto Lerr; - } - u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00); - i += 2; - } - else if (u >= 0xDC00 && u <= 0xDFFF) - { msg = "unpaired surrogate UTF-16 value"; - goto Lerr; - } - else if (u == 0xFFFE || u == 0xFFFF) - { msg = "illegal UTF-16 value"; - goto Lerr; - } - else - i++; - } - else - { - i++; - } - - idx = i; - return cast(dchar)u; - - Lerr: - throw new UtfException(msg, i); - } - -/** ditto */ - -dchar decode(dchar[] s, inout size_t idx) - in - { - assert(idx >= 0 && idx < s.length); - } - body - { - size_t i = idx; - dchar c = s[i]; - - if 
(!isValidDchar(c)) - goto Lerr; - idx = i + 1; - return c; - - Lerr: - throw new UtfException("5invalid UTF-32 value", i); - } - - -/* =================== Encode ======================= */ - -/******************************* - * Encodes character c and appends it to array s[]. - */ - -void encode(inout char[] s, dchar c) - in - { - assert(isValidDchar(c)); - } - body - { - char[] r = s; - - if (c <= 0x7F) - { - r ~= cast(char) c; - } - else - { - char[4] buf; - uint L; - - if (c <= 0x7FF) - { - buf[0] = cast(char)(0xC0 | (c >> 6)); - buf[1] = cast(char)(0x80 | (c & 0x3F)); - L = 2; - } - else if (c <= 0xFFFF) - { - buf[0] = cast(char)(0xE0 | (c >> 12)); - buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); - buf[2] = cast(char)(0x80 | (c & 0x3F)); - L = 3; - } - else if (c <= 0x10FFFF) - { - buf[0] = cast(char)(0xF0 | (c >> 18)); - buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); - buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); - buf[3] = cast(char)(0x80 | (c & 0x3F)); - L = 4; - } - else - { - assert(0); - } - r ~= buf[0 .. 
L]; - } - s = r; - } - -unittest -{ - debug(utf) printf("utf.encode.unittest\n"); - - char[] s = "abcd"; - encode(s, cast(dchar)'a'); - assert(s.length == 5); - assert(s == "abcda"); - - encode(s, cast(dchar)'\u00A9'); - assert(s.length == 7); - assert(s == "abcda\xC2\xA9"); - //assert(s == "abcda\u00A9"); // BUG: fix compiler - - encode(s, cast(dchar)'\u2260'); - assert(s.length == 10); - assert(s == "abcda\xC2\xA9\xE2\x89\xA0"); -} - -/** ditto */ - -void encode(inout wchar[] s, dchar c) - in - { - assert(isValidDchar(c)); - } - body - { - wchar[] r = s; - - if (c <= 0xFFFF) - { - r ~= cast(wchar) c; - } - else - { - wchar[2] buf; - - buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); - buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00); - r ~= buf; - } - s = r; - } - -/** ditto */ - -void encode(inout dchar[] s, dchar c) - in - { - assert(isValidDchar(c)); - } - body - { - s ~= c; - } - -/* =================== Validation ======================= */ - -/*********************************** - * Checks to see if string is well formed or not. Throws a UtfException if it is - * not. Use to check all untrusted input for correctness. - */ - -void validate(char[] s) -{ - size_t len = s.length; - size_t i; - - for (i = 0; i < len; ) - { - decode(s, i); - } -} - -/** ditto */ - -void validate(wchar[] s) -{ - size_t len = s.length; - size_t i; - - for (i = 0; i < len; ) - { - decode(s, i); - } -} - -/** ditto */ - -void validate(dchar[] s) -{ - size_t len = s.length; - size_t i; - - for (i = 0; i < len; ) - { - decode(s, i); - } -} - -/* =================== Conversion to UTF8 ======================= */ - -char[] toUTF8(char[4] buf, dchar c) - in - { - assert(isValidDchar(c)); - } - body - { - if (c <= 0x7F) - { - buf[0] = cast(char) c; - return buf[0 .. 1]; - } - else if (c <= 0x7FF) - { - buf[0] = cast(char)(0xC0 | (c >> 6)); - buf[1] = cast(char)(0x80 | (c & 0x3F)); - return buf[0 .. 
2]; - } - else if (c <= 0xFFFF) - { - buf[0] = cast(char)(0xE0 | (c >> 12)); - buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); - buf[2] = cast(char)(0x80 | (c & 0x3F)); - return buf[0 .. 3]; - } - else if (c <= 0x10FFFF) - { - buf[0] = cast(char)(0xF0 | (c >> 18)); - buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); - buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); - buf[3] = cast(char)(0x80 | (c & 0x3F)); - return buf[0 .. 4]; - } - assert(0); - } - -/******************* - * Encodes string s into UTF-8 and returns the encoded string. - */ - -char[] toUTF8(char[] s) - in - { - validate(s); - } - body - { - return s; - } - -/** ditto */ - -char[] toUTF8(wchar[] s) -{ - char[] r; - size_t i; - size_t slen = s.length; - - r.length = slen; - - for (i = 0; i < slen; i++) - { wchar c = s[i]; - - if (c <= 0x7F) - r[i] = cast(char)c; // fast path for ascii - else - { - r.length = i; - foreach (dchar c; s[i .. slen]) - { - encode(r, c); - } - break; - } - } - return r; -} - -/** ditto */ - -char[] toUTF8(dchar[] s) -{ - char[] r; - size_t i; - size_t slen = s.length; - - r.length = slen; - - for (i = 0; i < slen; i++) - { dchar c = s[i]; - - if (c <= 0x7F) - r[i] = cast(char)c; // fast path for ascii - else - { - r.length = i; - foreach (dchar d; s[i .. slen]) - { - encode(r, d); - } - break; - } - } - return r; -} - -/* =================== Conversion to UTF16 ======================= */ - -wchar[] toUTF16(wchar[2] buf, dchar c) - in - { - assert(isValidDchar(c)); - } - body - { - if (c <= 0xFFFF) - { - buf[0] = cast(wchar) c; - return buf[0 .. 1]; - } - else - { - buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); - buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00); - return buf[0 .. 2]; - } - } - -/**************** - * Encodes string s into UTF-16 and returns the encoded string. - * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take - * an LPWSTR or LPCWSTR argument. 
- */ - -wchar[] toUTF16(char[] s) -{ - wchar[] r; - size_t slen = s.length; - - r.length = slen; - r.length = 0; - for (size_t i = 0; i < slen; ) - { - dchar c = s[i]; - if (c <= 0x7F) - { - i++; - r ~= cast(wchar)c; - } - else - { - c = decode(s, i); - encode(r, c); - } - } - return r; -} - -/** ditto */ - -wchar* toUTF16z(char[] s) -{ - wchar[] r; - size_t slen = s.length; - - r.length = slen + 1; - r.length = 0; - for (size_t i = 0; i < slen; ) - { - dchar c = s[i]; - if (c <= 0x7F) - { - i++; - r ~= cast(wchar)c; - } - else - { - c = decode(s, i); - encode(r, c); - } - } - r ~= "\000"; - return r.ptr; -} - -/** ditto */ - -wchar[] toUTF16(wchar[] s) - in - { - validate(s); - } - body - { - return s; - } - -/** ditto */ - -wchar[] toUTF16(dchar[] s) -{ - wchar[] r; - size_t slen = s.length; - - r.length = slen; - r.length = 0; - for (size_t i = 0; i < slen; i++) - { - encode(r, s[i]); - } - return r; -} - -/* =================== Conversion to UTF32 ======================= */ - -/***** - * Encodes string s into UTF-32 and returns the encoded string. - */ - -dchar[] toUTF32(char[] s) -{ - dchar[] r; - size_t slen = s.length; - size_t j = 0; - - r.length = slen; // r[] will never be longer than s[] - for (size_t i = 0; i < slen; ) - { - dchar c = s[i]; - if (c >= 0x80) - c = decode(s, i); - else - i++; // c is ascii, no need for decode - r[j++] = c; - } - return r[0 .. j]; -} - -/** ditto */ - -dchar[] toUTF32(wchar[] s) -{ - dchar[] r; - size_t slen = s.length; - size_t j = 0; - - r.length = slen; // r[] will never be longer than s[] - for (size_t i = 0; i < slen; ) - { - dchar c = s[i]; - if (c >= 0x80) - c = decode(s, i); - else - i++; // c is ascii, no need for decode - r[j++] = c; - } - return r[0 .. 
j]; -} - -/** ditto */ - -dchar[] toUTF32(dchar[] s) - in - { - validate(s); - } - body - { - return s; - } - -/* ================================ tests ================================== */ - -unittest -{ - debug(utf) printf("utf.toUTF.unittest\n"); - - char[] c; - wchar[] w; - dchar[] d; - - c = "hello"; - w = toUTF16(c); - assert(w == "hello"); - d = toUTF32(c); - assert(d == "hello"); - - c = toUTF8(w); - assert(c == "hello"); - d = toUTF32(w); - assert(d == "hello"); - - c = toUTF8(d); - assert(c == "hello"); - w = toUTF16(d); - assert(w == "hello"); - - - c = "hel\u1234o"; - w = toUTF16(c); - assert(w == "hel\u1234o"); - d = toUTF32(c); - assert(d == "hel\u1234o"); - - c = toUTF8(w); - assert(c == "hel\u1234o"); - d = toUTF32(w); - assert(d == "hel\u1234o"); - - c = toUTF8(d); - assert(c == "hel\u1234o"); - w = toUTF16(d); - assert(w == "hel\u1234o"); - - - c = "he\U0010AAAAllo"; - w = toUTF16(c); - //foreach (wchar c; w) printf("c = x%x\n", c); - //foreach (wchar c; cast(wchar[])"he\U0010AAAAllo") printf("c = x%x\n", c); - assert(w == "he\U0010AAAAllo"); - d = toUTF32(c); - assert(d == "he\U0010AAAAllo"); - - c = toUTF8(w); - assert(c == "he\U0010AAAAllo"); - d = toUTF32(w); - assert(d == "he\U0010AAAAllo"); - - c = toUTF8(d); - assert(c == "he\U0010AAAAllo"); - w = toUTF16(d); - assert(w == "he\U0010AAAAllo"); -}
// trunk/src/util/metastrings.d -- added by changeset 629:d050e211402b
// (moved here from trunk/src/std/metastrings.d).

// Written in the D programming language.

/**
 * Templates with which to do compile time manipulation of strings.
 *
 * Macros:
 *      WIKI = Phobos/StdMetastrings
 * Copyright:
 *      Public Domain
 */

/*
 * Authors:
 *      Walter Bright, Digital Mars, www.digitalmars.com
 *      Don Clugston
 */

/*
  Note: this is not the original file!
  Modified by Aziz Köksal:
    Only changed some types from string to char[]
*/

module util.metastrings;

/**
 * Formats constants into a string at compile time.
 * Analogous to std.string.format().
 * Parameters:
 *      A =     tuple of constants, which can be strings,
 *              characters, or integral values.
 * Formats:
 *      %s is replaced by the next argument, converted with ToString;
 *      %% produces a single % character.
 *      Arguments that remain once a format string is used up are
 *      appended in order. A remaining char[] argument is itself
 *      interpreted as a format string.
 * Example:
 * ---
import util.metastrings;
import std.stdio;

void main()
{
  char[] s = Format!("Arg %s = %s", "foo", 27);
  writefln(s); // "Arg foo = 27"
}
 * ---
 */

template Format(A...)
{
    static if (A.length == 0)
        const char[] Format = "";
    else static if (is(typeof(A[0]) : char[]))
        // A string argument is treated as a format string for the rest.
        const char[] Format = FormatString!(A[0], A[1..$]);
    else
        const char[] Format = ToString!(A[0]) ~ Format!(A[1..$]);
}

/**
 * Expands the format string F against the argument tuple A.
 * Used by Format; see there for the supported format specifiers.
 * Compilation fails with a static assert when F contains an unknown
 * specifier or a %s for which no argument is left.
 */
template FormatString(char[] F, A...)
{
    static if (F.length == 0)
        // Format string used up: append whatever arguments remain.
        const char[] FormatString = Format!(A);
    else static if (F.length == 1)
        // A single trailing character is always copied literally.
        const char[] FormatString = F[0] ~ Format!(A);
    else static if (F[0..2] == "%s")
    {
        // Report a missing argument explicitly instead of failing on A[0].
        static if (A.length == 0)
            static assert(0, "too few arguments for format string (at \"" ~ F ~ "\")");
        else
            const char[] FormatString = ToString!(A[0]) ~ FormatString!(F[2..$], A[1..$]);
    }
    else static if (F[0..2] == "%%")
        const char[] FormatString = "%" ~ FormatString!(F[2..$], A);
    else static if (F[0] == '%')
        static assert(0, "unrecognized format %" ~ F[1]);
    else
        const char[] FormatString = F[0] ~ FormatString!(F[1..$], A);
}

/**
 * Convert constant argument to a string.
 */

template ToString(ulong U)
{
    static if (U < 10)
        const char[] ToString = "" ~ cast(char)(U + '0');
    else
        // Emit the leading digits first, then the last digit.
        const char[] ToString = ToString!(U / 10) ~ ToString!(U % 10);
}

/// ditto
template ToString(long I)
{
    static if (I < 0)
        const char[] ToString = "-" ~ ToString!(cast(ulong)(-I));
    else
        const char[] ToString = ToString!(cast(ulong)I);
}

static assert(ToString!(0x100000000) == "4294967296");

/// ditto
template ToString(uint U)
{
    const char[] ToString = ToString!(cast(ulong)U);
}

/// ditto
template ToString(int I)
{
    const char[] ToString = ToString!(cast(long)I);
}

/// ditto
template ToString(ushort U)
{
    const char[] ToString = ToString!(cast(ulong)U);
}

/// ditto
template ToString(short I)
{
    const char[] ToString = ToString!(cast(long)I);
}

/// ditto
template ToString(ubyte U)
{
    const char[] ToString = ToString!(cast(ulong)U);
}

/// ditto
template ToString(byte I)
{
    const char[] ToString = ToString!(cast(long)I);
}

/// ditto
template ToString(bool B)
{
    const char[] ToString = B ? "true" : "false";
}

/// ditto
template ToString(char[] S)
{
    const char[] ToString = S;
}

/// ditto
template ToString(char C)
{
    const char[] ToString = "" ~ C;
}

unittest
{
    char[] s = Format!("hel%slo", "world", -138, 'c', true);
    assert(s == "helworldlo-138ctrue");
}


/********
 * Parse unsigned integer literal from the start of string s.
 * returns:
 *      .value = the integer literal as a string,
 *      .rest = the string following the integer literal
 * Otherwise (s does not start with a digit):
 *      .value = "" (the empty string),
 *      .rest = s
 */

template ParseUinteger(char[] s)
{
    static if (s.length == 0)
    {   const char[] value = "";
        const char[] rest = "";
    }
    else static if (s[0] >= '0' && s[0] <= '9')
    {   const char[] value = s[0] ~ ParseUinteger!(s[1..$]).value;
        const char[] rest = ParseUinteger!(s[1..$]).rest;
    }
    else
    {   const char[] value = "";
        const char[] rest = s;
    }
}

/********
 * Parse integer literal optionally preceded by '-'
 * from the start of string s.
 * returns:
 *      .value = the integer literal as a string,
 *      .rest = the string following the integer literal
 * Otherwise (s starts with neither a digit nor '-' followed by a digit):
 *      .value = "" (the empty string),
 *      .rest = s
 */

template ParseInteger(char[] s)
{
    static if (s.length >= 2 &&
               s[0] == '-' && s[1] >= '0' && s[1] <= '9')
    {   // Keep the sign and first digit, then take the remaining digits.
        const char[] value = s[0..2] ~ ParseUinteger!(s[2..$]).value;
        const char[] rest = ParseUinteger!(s[2..$]).rest;
    }
    else
    {   // No sign: identical to the unsigned case (including empty and
        // non-numeric input), so delegate instead of duplicating it.
        const char[] value = ParseUinteger!(s).value;
        const char[] rest = ParseUinteger!(s).rest;
    }
}

unittest
{
    assert(ParseUinteger!("1234abc").value == "1234");
    assert(ParseUinteger!("1234abc").rest == "abc");
    assert(ParseInteger!("-1234abc").value == "-1234");
    assert(ParseInteger!("-1234abc").rest == "abc");
}
// trunk/src/util/uni.d -- added by changeset 629:d050e211402b
// (moved here from trunk/src/std/uni.d).

// Written in the D programming language.

/*
 * Placed into the Public Domain.
 * Digital Mars, www.digitalmars.com
 * Written by Walter Bright
 */

/**
 * Simple Unicode character classification functions.
 * For ASCII classification, see $(LINK2 std_ctype.html, std.ctype).
 * Macros:
 *      WIKI=Phobos/StdUni
 * References:
 *      $(LINK2 http://www.digitalmars.com/d/ascii-table.html, ASCII Table),
 *      $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia),
 *      $(LINK2 http://www.unicode.org, The Unicode Consortium)
 * Trademarks:
 *      Unicode(tm) is a trademark of Unicode, Inc.
 */


module util.uni;

/**
 * Returns !=0 if c is a Unicode lower case character.
 *
 * ASCII is answered directly. Any other code point counts as lower case
 * when it is alphabetic and toUniLower() leaves it unchanged.
 */
int isUniLower(dchar c)
{
    if (c > 0x7F)
        return isUniAlpha(c) && toUniLower(c) == c;

    return 'a' <= c && c <= 'z';
}

/**
 * Returns !=0 if c is a Unicode upper case character.
 *
 * ASCII is answered directly. Any other code point counts as upper case
 * when it is alphabetic and toUniUpper() leaves it unchanged.
 */
int isUniUpper(dchar c)
{
    if (c > 0x7F)
        return isUniAlpha(c) && toUniUpper(c) == c;

    return 'A' <= c && c <= 'Z';
}

/**
 * If c is a Unicode upper case character, return the lower case
 * equivalent, otherwise return c.
 *
 * Only the ranges listed in the body are mapped; every other code point
 * is returned unchanged.
 */
dchar toUniLower(dchar c)
{
    // Below U+00C0 only ASCII 'A'..'Z' has a mapping.
    if (c < 0x00C0)
        return (c >= 'A' && c <= 'Z') ? cast(dchar)(c + 32) : c;

    bool odd = (c & 1) != 0;
    uint shift = 0;     // distance from c up to its lower case partner

    if ((c >= 0x00C0 && c <= 0x00D6) || (c >= 0x00D8 && c <= 0x00DE))
    {
        shift = 32;
    }
    else if ((c >= 0x0100 && c <= 0x0137) || (c >= 0x014A && c <= 0x0177))
    {
        // Pairs with the capital on the even code point.
        if (c == 0x0130)
            return 0x0069;
        if (!odd)
            shift = 1;
    }
    else if (c == 0x0178)
    {
        return 0x00FF;
    }
    else if ((c >= 0x0139 && c <= 0x0148) || (c >= 0x0179 && c <= 0x017E))
    {
        // Pairs with the capital on the odd code point.
        if (odd)
            shift = 1;
    }
    else if (c >= 0x0200 && c <= 0x0217)
    {
        if (!odd)
            shift = 1;
    }
    else if ((c >= 0x0401 && c <= 0x040C) || (c >= 0x040E && c <= 0x040F))
    {
        shift = 80;
    }
    else if (c >= 0x0410 && c <= 0x042F)
    {
        shift = 32;
    }
    else if (c >= 0x0460 && c <= 0x047F)
    {
        if (!odd)
            shift = 1;
    }
    else if (c >= 0x0531 && c <= 0x0556)
    {
        shift = 48;
    }
    else if (c >= 0x10A0 && c <= 0x10C5)
    {
        shift = 48;
    }
    else if (c >= 0xFF21 && c <= 0xFF3A)
    {
        shift = 32;
    }

    return cast(dchar)(c + shift);
}

/**
 * If c is a Unicode lower case character, return the upper case
 * equivalent, otherwise return c.
 *
 * Only the ranges listed in the body are mapped; every other code point
 * is returned unchanged.
 */
dchar toUniUpper(dchar c)
{
    // Below U+00E0 only ASCII 'a'..'z' has a mapping.
    if (c < 0x00E0)
        return (c >= 'a' && c <= 'z') ? cast(dchar)(c - 32) : c;

    bool odd = (c & 1) != 0;
    uint shift = 0;     // distance from c down to its upper case partner

    if ((c >= 0x00E0 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FE))
    {
        shift = 32;
    }
    else if (c == 0x00FF)
    {
        return 0x0178;
    }
    else if ((c >= 0x0100 && c <= 0x0137) || (c >= 0x014A && c <= 0x0177))
    {
        // Pairs with the small letter on the odd code point.
        if (c == 0x0131)
            return 0x0049;
        if (odd)
            shift = 1;
    }
    else if ((c >= 0x0139 && c <= 0x0148) || (c >= 0x0179 && c <= 0x017E))
    {
        // Pairs with the small letter on the even code point.
        if (!odd)
            shift = 1;
    }
    else if (c == 0x017F)
    {
        return 0x0053;
    }
    else if (c >= 0x0200 && c <= 0x0217)
    {
        if (odd)
            shift = 1;
    }
    else if (c >= 0x0430 && c <= 0x044F)
    {
        shift = 32;
    }
    else if ((c >= 0x0451 && c <= 0x045C) || (c >= 0x045E && c <= 0x045F))
    {
        shift = 80;
    }
    else if (c >= 0x0460 && c <= 0x047F)
    {
        if (odd)
            shift = 1;
    }
    else if (c >= 0x0561 && c <= 0x0586)
    {
        shift = 48;
    }
    else if (c >= 0xFF41 && c <= 0xFF5A)
    {
        shift = 32;
    }

    return cast(dchar)(c - shift);
}


/*******************************
 * Return !=0 if u is a Unicode alpha character.
 * (general Unicode category: Lu, Ll, Lt, Lm and Lo)
 *
 * Standards: Unicode 5.0.0
 */

int isUniAlpha(dchar u)
{
    // Inclusive [first, last] code point ranges, sorted and non-overlapping.
    static dchar[2][] table =
    [
        [ 'A', 'Z' ], [ 'a', 'z' ], [ 0x00AA, 0x00AA ], [ 0x00B5, 0x00B5 ],
        [ 0x00BA, 0x00BA ], [ 0x00C0, 0x00D6 ], [ 0x00D8, 0x00F6 ], [ 0x00F8, 0x02C1 ],
        [ 0x02C6, 0x02D1 ], [ 0x02E0, 0x02E4 ], [ 0x02EE, 0x02EE ], [ 0x037A, 0x037D ],
        [ 0x0386, 0x0386 ], [ 0x0388, 0x038A ], [ 0x038C, 0x038C ], [ 0x038E, 0x03A1 ],
        [ 0x03A3, 0x03CE ], [ 0x03D0, 0x03F5 ], [ 0x03F7, 0x0481 ], [ 0x048A, 0x0513 ],
        [ 0x0531, 0x0556 ], [ 0x0559, 0x0559 ], [ 0x0561, 0x0587 ], [ 0x05D0, 0x05EA ],
        [ 0x05F0, 0x05F2 ], [ 0x0621, 0x063A ], [ 0x0640, 0x064A ], [ 0x066E, 0x066F ],
        [ 0x0671, 0x06D3 ], [ 0x06D5, 0x06D5 ], [ 0x06E5, 0x06E6 ], [ 0x06EE, 0x06EF ],
        [ 0x06FA, 0x06FC ], [ 0x06FF, 0x06FF ], [ 0x0710, 0x0710 ], [ 0x0712, 0x072F ],
        [ 0x074D, 0x076D ], [ 0x0780, 0x07A5 ], [ 0x07B1, 0x07B1 ], [ 0x07CA, 0x07EA ],
        [ 0x07F4, 0x07F5 ], [ 0x07FA, 0x07FA ], [ 0x0904, 0x0939 ], [ 0x093D, 0x093D ],
        [ 0x0950, 0x0950 ], [ 0x0958, 0x0961 ], [ 0x097B, 0x097F ], [ 0x0985, 0x098C ],
        [ 0x098F, 0x0990 ], [ 0x0993, 0x09A8 ], [ 0x09AA, 0x09B0 ], [ 0x09B2, 0x09B2 ],
        [ 0x09B6, 0x09B9 ], [ 0x09BD, 0x09BD ], [ 0x09CE, 0x09CE ], [ 0x09DC, 0x09DD ],
        [ 0x09DF, 0x09E1 ], [ 0x09F0, 0x09F1 ], [ 0x0A05, 0x0A0A ], [ 0x0A0F, 0x0A10 ],
        [ 0x0A13, 0x0A28 ], [ 0x0A2A, 0x0A30 ], [ 0x0A32, 0x0A33 ], [ 0x0A35, 0x0A36 ],
        [ 0x0A38, 0x0A39 ], [ 0x0A59, 0x0A5C ], [ 0x0A5E, 0x0A5E ], [ 0x0A72, 0x0A74 ],
        [ 0x0A85, 0x0A8D ], [ 0x0A8F, 0x0A91 ], [ 0x0A93, 0x0AA8 ], [ 0x0AAA, 0x0AB0 ],
        [ 0x0AB2, 0x0AB3 ], [ 0x0AB5, 0x0AB9 ], [ 0x0ABD, 0x0ABD ], [ 0x0AD0, 0x0AD0 ],
        [ 0x0AE0, 0x0AE1 ], [ 0x0B05, 0x0B0C ], [ 0x0B0F, 0x0B10 ], [ 0x0B13, 0x0B28 ],
        [ 0x0B2A, 0x0B30 ], [ 0x0B32, 0x0B33 ], [ 0x0B35, 0x0B39 ], [ 0x0B3D, 0x0B3D ],
        [ 0x0B5C, 0x0B5D ], [ 0x0B5F, 0x0B61 ], [ 0x0B71, 0x0B71 ], [ 0x0B83, 0x0B83 ],
        [ 0x0B85, 0x0B8A ], [ 0x0B8E, 0x0B90 ], [ 0x0B92, 0x0B95 ], [ 0x0B99, 0x0B9A ],
        [ 0x0B9C, 0x0B9C ], [ 0x0B9E, 0x0B9F ], [ 0x0BA3, 0x0BA4 ], [ 0x0BA8, 0x0BAA ],
        [ 0x0BAE, 0x0BB9 ], [ 0x0C05, 0x0C0C ], [ 0x0C0E, 0x0C10 ], [ 0x0C12, 0x0C28 ],
        [ 0x0C2A, 0x0C33 ], [ 0x0C35, 0x0C39 ], [ 0x0C60, 0x0C61 ], [ 0x0C85, 0x0C8C ],
        [ 0x0C8E, 0x0C90 ], [ 0x0C92, 0x0CA8 ], [ 0x0CAA, 0x0CB3 ], [ 0x0CB5, 0x0CB9 ],
        [ 0x0CBD, 0x0CBD ], [ 0x0CDE, 0x0CDE ], [ 0x0CE0, 0x0CE1 ], [ 0x0D05, 0x0D0C ],
        [ 0x0D0E, 0x0D10 ], [ 0x0D12, 0x0D28 ], [ 0x0D2A, 0x0D39 ], [ 0x0D60, 0x0D61 ],
        [ 0x0D85, 0x0D96 ], [ 0x0D9A, 0x0DB1 ], [ 0x0DB3, 0x0DBB ], [ 0x0DBD, 0x0DBD ],
        [ 0x0DC0, 0x0DC6 ], [ 0x0E01, 0x0E30 ], [ 0x0E32, 0x0E33 ], [ 0x0E40, 0x0E46 ],
        [ 0x0E81, 0x0E82 ], [ 0x0E84, 0x0E84 ], [ 0x0E87, 0x0E88 ], [ 0x0E8A, 0x0E8A ],
        [ 0x0E8D, 0x0E8D ], [ 0x0E94, 0x0E97 ], [ 0x0E99, 0x0E9F ], [ 0x0EA1, 0x0EA3 ],
        [ 0x0EA5, 0x0EA5 ], [ 0x0EA7, 0x0EA7 ], [ 0x0EAA, 0x0EAB ], [ 0x0EAD, 0x0EB0 ],
        [ 0x0EB2, 0x0EB3 ], [ 0x0EBD, 0x0EBD ], [ 0x0EC0, 0x0EC4 ], [ 0x0EC6, 0x0EC6 ],
        [ 0x0EDC, 0x0EDD ], [ 0x0F00, 0x0F00 ], [ 0x0F40, 0x0F47 ], [ 0x0F49, 0x0F6A ],
        [ 0x0F88, 0x0F8B ], [ 0x1000, 0x1021 ], [ 0x1023, 0x1027 ], [ 0x1029, 0x102A ],
        [ 0x1050, 0x1055 ], [ 0x10A0, 0x10C5 ], [ 0x10D0, 0x10FA ], [ 0x10FC, 0x10FC ],
        [ 0x1100, 0x1159 ], [ 0x115F, 0x11A2 ], [ 0x11A8, 0x11F9 ], [ 0x1200, 0x1248 ],
        [ 0x124A, 0x124D ], [ 0x1250, 0x1256 ], [ 0x1258, 0x1258 ], [ 0x125A, 0x125D ],
        [ 0x1260, 0x1288 ], [ 0x128A, 0x128D ], [ 0x1290, 0x12B0 ], [ 0x12B2, 0x12B5 ],
        [ 0x12B8, 0x12BE ], [ 0x12C0, 0x12C0 ], [ 0x12C2, 0x12C5 ], [ 0x12C8, 0x12D6 ],
        [ 0x12D8, 0x1310 ], [ 0x1312, 0x1315 ], [ 0x1318, 0x135A ], [ 0x1380, 0x138F ],
        [ 0x13A0, 0x13F4 ], [ 0x1401, 0x166C ], [ 0x166F, 0x1676 ], [ 0x1681, 0x169A ],
        [ 0x16A0, 0x16EA ], [ 0x1700, 0x170C ], [ 0x170E, 0x1711 ], [ 0x1720, 0x1731 ],
        [ 0x1740, 0x1751 ], [ 0x1760, 0x176C ], [ 0x176E, 0x1770 ], [ 0x1780, 0x17B3 ],
        [ 0x17D7, 0x17D7 ], [ 0x17DC, 0x17DC ], [ 0x1820, 0x1877 ], [ 0x1880, 0x18A8 ],
        [ 0x1900, 0x191C ], [ 0x1950, 0x196D ], [ 0x1970, 0x1974 ], [ 0x1980, 0x19A9 ],
        [ 0x19C1, 0x19C7 ], [ 0x1A00, 0x1A16 ], [ 0x1B05, 0x1B33 ], [ 0x1B45, 0x1B4B ],
        [ 0x1D00, 0x1DBF ], [ 0x1E00, 0x1E9B ], [ 0x1EA0, 0x1EF9 ], [ 0x1F00, 0x1F15 ],
        [ 0x1F18, 0x1F1D ], [ 0x1F20, 0x1F45 ], [ 0x1F48, 0x1F4D ], [ 0x1F50, 0x1F57 ],
        [ 0x1F59, 0x1F59 ], [ 0x1F5B, 0x1F5B ], [ 0x1F5D, 0x1F5D ], [ 0x1F5F, 0x1F7D ],
        [ 0x1F80, 0x1FB4 ], [ 0x1FB6, 0x1FBC ], [ 0x1FBE, 0x1FBE ], [ 0x1FC2, 0x1FC4 ],
        [ 0x1FC6, 0x1FCC ], [ 0x1FD0, 0x1FD3 ], [ 0x1FD6, 0x1FDB ], [ 0x1FE0, 0x1FEC ],
        [ 0x1FF2, 0x1FF4 ], [ 0x1FF6, 0x1FFC ], [ 0x2071, 0x2071 ], [ 0x207F, 0x207F ],
        [ 0x2090, 0x2094 ], [ 0x2102, 0x2102 ], [ 0x2107, 0x2107 ], [ 0x210A, 0x2113 ],
        [ 0x2115, 0x2115 ], [ 0x2119, 0x211D ], [ 0x2124, 0x2124 ], [ 0x2126, 0x2126 ],
        [ 0x2128, 0x2128 ], [ 0x212A, 0x212D ], [ 0x212F, 0x2139 ], [ 0x213C, 0x213F ],
        [ 0x2145, 0x2149 ], [ 0x214E, 0x214E ], [ 0x2183, 0x2184 ], [ 0x2C00, 0x2C2E ],
        [ 0x2C30, 0x2C5E ], [ 0x2C60, 0x2C6C ], [ 0x2C74, 0x2C77 ], [ 0x2C80, 0x2CE4 ],
        [ 0x2D00, 0x2D25 ], [ 0x2D30, 0x2D65 ], [ 0x2D6F, 0x2D6F ], [ 0x2D80, 0x2D96 ],
        [ 0x2DA0, 0x2DA6 ], [ 0x2DA8, 0x2DAE ], [ 0x2DB0, 0x2DB6 ], [ 0x2DB8, 0x2DBE ],
        [ 0x2DC0, 0x2DC6 ], [ 0x2DC8, 0x2DCE ], [ 0x2DD0, 0x2DD6 ], [ 0x2DD8, 0x2DDE ],
        [ 0x3005, 0x3006 ], [ 0x3031, 0x3035 ], [ 0x303B, 0x303C ], [ 0x3041, 0x3096 ],
        [ 0x309D, 0x309F ], [ 0x30A1, 0x30FA ], [ 0x30FC, 0x30FF ], [ 0x3105, 0x312C ],
        [ 0x3131, 0x318E ], [ 0x31A0, 0x31B7 ], [ 0x31F0, 0x31FF ], [ 0x3400, 0x4DB5 ],
        [ 0x4E00, 0x9FBB ], [ 0xA000, 0xA48C ], [ 0xA717, 0xA71A ], [ 0xA800, 0xA801 ],
        [ 0xA803, 0xA805 ], [ 0xA807, 0xA80A ], [ 0xA80C, 0xA822 ], [ 0xA840, 0xA873 ],
        [ 0xAC00, 0xD7A3 ], [ 0xF900, 0xFA2D ], [ 0xFA30, 0xFA6A ], [ 0xFA70, 0xFAD9 ],
        [ 0xFB00, 0xFB06 ], [ 0xFB13, 0xFB17 ], [ 0xFB1D, 0xFB1D ], [ 0xFB1F, 0xFB28 ],
        [ 0xFB2A, 0xFB36 ], [ 0xFB38, 0xFB3C ], [ 0xFB3E, 0xFB3E ], [ 0xFB40, 0xFB41 ],
        [ 0xFB43, 0xFB44 ], [ 0xFB46, 0xFBB1 ], [ 0xFBD3, 0xFD3D ], [ 0xFD50, 0xFD8F ],
        [ 0xFD92, 0xFDC7 ], [ 0xFDF0, 0xFDFB ], [ 0xFE70, 0xFE74 ], [ 0xFE76, 0xFEFC ],
        [ 0xFF21, 0xFF3A ], [ 0xFF41, 0xFF5A ], [ 0xFF66, 0xFFBE ], [ 0xFFC2, 0xFFC7 ],
        [ 0xFFCA, 0xFFCF ], [ 0xFFD2, 0xFFD7 ], [ 0xFFDA, 0xFFDC ], [ 0x10000, 0x1000B ],
        [ 0x1000D, 0x10026 ], [ 0x10028, 0x1003A ], [ 0x1003C, 0x1003D ], [ 0x1003F, 0x1004D ],
        [ 0x10050, 0x1005D ], [ 0x10080, 0x100FA ], [ 0x10300, 0x1031E ], [ 0x10330, 0x10340 ],
        [ 0x10342, 0x10349 ], [ 0x10380, 0x1039D ], [ 0x103A0, 0x103C3 ], [ 0x103C8, 0x103CF ],
        [ 0x10400, 0x1049D ], [ 0x10800, 0x10805 ], [ 0x10808, 0x10808 ], [ 0x1080A, 0x10835 ],
        [ 0x10837, 0x10838 ], [ 0x1083C, 0x1083C ], [ 0x1083F, 0x1083F ], [ 0x10900, 0x10915 ],
        [ 0x10A00, 0x10A00 ], [ 0x10A10, 0x10A13 ], [ 0x10A15, 0x10A17 ], [ 0x10A19, 0x10A33 ],
        [ 0x12000, 0x1236E ], [ 0x1D400, 0x1D454 ], [ 0x1D456, 0x1D49C ], [ 0x1D49E, 0x1D49F ],
        [ 0x1D4A2, 0x1D4A2 ], [ 0x1D4A5, 0x1D4A6 ], [ 0x1D4A9, 0x1D4AC ], [ 0x1D4AE, 0x1D4B9 ],
        [ 0x1D4BB, 0x1D4BB ], [ 0x1D4BD, 0x1D4C3 ], [ 0x1D4C5, 0x1D505 ], [ 0x1D507, 0x1D50A ],
        [ 0x1D50D, 0x1D514 ], [ 0x1D516, 0x1D51C ], [ 0x1D51E, 0x1D539 ], [ 0x1D53B, 0x1D53E ],
        [ 0x1D540, 0x1D544 ], [ 0x1D546, 0x1D546 ], [ 0x1D54A, 0x1D550 ], [ 0x1D552, 0x1D6A5 ],
        [ 0x1D6A8, 0x1D6C0 ], [ 0x1D6C2, 0x1D6DA ], [ 0x1D6DC, 0x1D6FA ], [ 0x1D6FC, 0x1D714 ],
        [ 0x1D716, 0x1D734 ], [ 0x1D736, 0x1D74E ], [ 0x1D750, 0x1D76E ], [ 0x1D770, 0x1D788 ],
        [ 0x1D78A, 0x1D7A8 ], [ 0x1D7AA, 0x1D7C2 ], [ 0x1D7C4, 0x1D7CB ], [ 0x20000, 0x2A6D6 ],
        [ 0x2F800, 0x2FA1D ],
    ];

    debug
    {
        // Debug builds verify that every range is well formed and that the
        // ranges are strictly ascending, which the binary search relies on.
        for (int k = 0; k < table.length; k++)
        {
            assert(table[k][0] <= table[k][1]);
            if (k < table.length - 1)
                assert(table[k][1] < table[k + 1][0]);
        }
    }

    int found = 0;

    if (u < 0xAA)
    {
        // Below the first non-ASCII table entry only A-Z and a-z qualify.
        found = (u >= 'A' && u <= 'Z') || (u >= 'a' && u <= 'z');
    }
    else
    {
        // Binary search for the range containing u.
        int lo = 0;
        int hi = cast(int)table.length - 1;

        while (lo <= hi)
        {
            int mid = (lo + hi) >> 1;

            if (u < table[mid][0])
                hi = mid - 1;
            else if (u > table[mid][1])
                lo = mid + 1;
            else
            {
                found = 1;
                break;
            }
        }
    }

    debug
    {
        // Debug builds cross-check the answer against a linear scan.
        int listed = 0;
        for (int k = 0; k < table.length; k++)
        {
            if (u >= table[k][0] && u <= table[k][1])
            {
                listed = 1;
                break;
            }
        }
        assert(listed == found);
    }

    return found;
}

unittest
{
    // Within ASCII exactly the letters are alphabetic.
    for (uint ch = 0; ch < 0x80; ch++)
    {
        bool letter = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');

        if (letter)
            assert(isUniAlpha(ch));
        else
            assert(!isUniAlpha(ch));
    }
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trunk/src/util/utf.d Fri Jan 11 20:03:46 2008 +0100 @@ -0,0 +1,975 @@ +// utf.d + +/* + * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com + * Written by Walter Bright + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * o The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * o Altered source versions must be plainly marked as such, and must not + * be misrepresented as being the original software. + * o This notice may not be removed or altered from any source + * distribution. + */ + +/******************************************** + * Encode and decode UTF-8, UTF-16 and UTF-32 strings. + * + * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D + * wchar type. + * For linux systems, the C wchar_t type is UTF-32 and corresponds to + * the D utf.dchar type. + * + * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF). + * + * See_Also: + * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br> + * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br> + * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335) + * Macros: + * WIKI = Phobos/StdUtf + */ + +/* + Note: this is not the original file! + Modified by Aziz Köksal: + Only commented out deprecated class UtfError. 
+*/ + +module util.utf; + +// private import std.stdio; + +//debug=utf; // uncomment to turn on debugging printf's +/+ +deprecated class UtfError : Error +{ + size_t idx; // index in string of where error occurred + + this(char[] s, size_t i) + { + idx = i; + super(s); + } +} ++/ +/********************************** + * Exception class that is thrown upon any errors. + */ + +class UtfException : Exception +{ + size_t idx; /// index in string of where error occurred + + this(char[] s, size_t i) + { + idx = i; + super(s); + } +} + +/******************************* + * Test if c is a valid UTF-32 character. + * + * \uFFFE and \uFFFF are considered valid by this function, + * as they are permitted for internal use by an application, + * but they are not allowed for interchange by the Unicode standard. + * + * Returns: true if it is, false if not. + */ + +bool isValidDchar(dchar c) +{ + /* Note: FFFE and FFFF are specifically permitted by the + * Unicode standard for application internal use, but are not + * allowed for interchange. 
+ * (thanks to Arcane Jill) + */ + + return c < 0xD800 || + (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/); +} + +unittest +{ + debug(utf) printf("utf.isValidDchar.unittest\n"); + assert(isValidDchar(cast(dchar)'a') == true); + assert(isValidDchar(cast(dchar)0x1FFFFF) == false); +} + + +ubyte[256] UTF8stride = +[ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF, +]; + +/** + * stride() returns the length of a UTF-8 sequence starting at index i + * in string s. + * Returns: + * The number of bytes in the UTF-8 sequence or + * 0xFF meaning s[i] is not the start of of UTF-8 sequence. + */ + +uint stride(char[] s, size_t i) +{ + return UTF8stride[s[i]]; +} + +/** + * stride() returns the length of a UTF-16 sequence starting at index i + * in string s. + */ + +uint stride(wchar[] s, size_t i) +{ uint u = s[i]; + return 1 + (u >= 0xD800 && u <= 0xDBFF); +} + +/** + * stride() returns the length of a UTF-32 sequence starting at index i + * in string s. + * Returns: The return value will always be 1. 
+ */ + +uint stride(dchar[] s, size_t i) +{ + return 1; +} + +/******************************************* + * Given an index i into an array of characters s[], + * and assuming that index i is at the start of a UTF character, + * determine the number of UCS characters up to that index i. + */ + +size_t toUCSindex(char[] s, size_t i) +{ + size_t n; + size_t j; + size_t stride; + + for (j = 0; j < i; j += stride) + { + stride = UTF8stride[s[j]]; + if (stride == 0xFF) + goto Lerr; + n++; + } + if (j > i) + { + Lerr: + throw new UtfException("1invalid UTF-8 sequence", j); + } + return n; +} + +/** ditto */ + +size_t toUCSindex(wchar[] s, size_t i) +{ + size_t n; + size_t j; + + for (j = 0; j < i; ) + { uint u = s[j]; + + j += 1 + (u >= 0xD800 && u <= 0xDBFF); + n++; + } + if (j > i) + { + Lerr: + throw new UtfException("2invalid UTF-16 sequence", j); + } + return n; +} + +/** ditto */ + +size_t toUCSindex(dchar[] s, size_t i) +{ + return i; +} + +/****************************************** + * Given a UCS index n into an array of characters s[], return the UTF index. + */ + +size_t toUTFindex(char[] s, size_t n) +{ + size_t i; + + while (n--) + { + uint j = UTF8stride[s[i]]; + if (j == 0xFF) + throw new UtfException("3invalid UTF-8 sequence", i); + i += j; + } + return i; +} + +/** ditto */ + +size_t toUTFindex(wchar[] s, size_t n) +{ + size_t i; + + while (n--) + { wchar u = s[i]; + + i += 1 + (u >= 0xD800 && u <= 0xDBFF); + } + return i; +} + +/** ditto */ + +size_t toUTFindex(dchar[] s, size_t n) +{ + return n; +} + +/* =================== Decode ======================= */ + +/*************** + * Decodes and returns character starting at s[idx]. idx is advanced past the + * decoded character. If the character is not well formed, a UtfException is + * thrown and idx remains unchanged. 
+ */ + +dchar decode(char[] s, inout size_t idx) + in + { + assert(idx >= 0 && idx < s.length); + } + out (result) + { + assert(isValidDchar(result)); + } + body + { + size_t len = s.length; + dchar V; + size_t i = idx; + char u = s[i]; + + if (u & 0x80) + { uint n; + char u2; + + /* The following encodings are valid, except for the 5 and 6 byte + * combinations: + * 0xxxxxxx + * 110xxxxx 10xxxxxx + * 1110xxxx 10xxxxxx 10xxxxxx + * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + */ + for (n = 1; ; n++) + { + if (n > 4) + goto Lerr; // only do the first 4 of 6 encodings + if (((u << n) & 0x80) == 0) + { + if (n == 1) + goto Lerr; + break; + } + } + + // Pick off (7 - n) significant bits of B from first byte of octet + V = cast(dchar)(u & ((1 << (7 - n)) - 1)); + + if (i + (n - 1) >= len) + goto Lerr; // off end of string + + /* The following combinations are overlong, and illegal: + * 1100000x (10xxxxxx) + * 11100000 100xxxxx (10xxxxxx) + * 11110000 1000xxxx (10xxxxxx 10xxxxxx) + * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) + * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) + */ + u2 = s[i + 1]; + if ((u & 0xFE) == 0xC0 || + (u == 0xE0 && (u2 & 0xE0) == 0x80) || + (u == 0xF0 && (u2 & 0xF0) == 0x80) || + (u == 0xF8 && (u2 & 0xF8) == 0x80) || + (u == 0xFC && (u2 & 0xFC) == 0x80)) + goto Lerr; // overlong combination + + for (uint j = 1; j != n; j++) + { + u = s[i + j]; + if ((u & 0xC0) != 0x80) + goto Lerr; // trailing bytes are 10xxxxxx + V = (V << 6) | (u & 0x3F); + } + if (!isValidDchar(V)) + goto Lerr; + i += n; + } + else + { + V = cast(dchar) u; + i++; + } + + idx = i; + return V; + + Lerr: + //printf("\ndecode: idx = %d, i = %d, length = %d s = \n'%.*s'\n%x\n'%.*s'\n", idx, i, s.length, s, s[i], s[i .. 
length]);
+        throw new UtfException("4invalid UTF-8 sequence", i);
+    }
+
+// Unit test for decode(char[], inout size_t).
+// Checks the decoded value and the advanced index for 1-, 2- and 3-byte
+// sequences, then checks that every entry of s4 makes decode() throw
+// UtfException.
+unittest
+{
+    size_t i;
+    dchar c;
+
+    debug(utf) printf("utf.decode.unittest\n");
+
+    // Two consecutive one-byte characters: the index advances by 1 each time.
+    static char[] s1 = "abcd";
+    i = 0;
+    c = decode(s1, i);
+    assert(c == cast(dchar)'a');
+    assert(i == 1);
+    c = decode(s1, i);
+    assert(c == cast(dchar)'b');
+    assert(i == 2);
+
+    // Two-byte sequence.
+    static char[] s2 = "\xC2\xA9";
+    i = 0;
+    c = decode(s2, i);
+    assert(c == cast(dchar)'\u00A9');
+    assert(i == 2);
+
+    // Three-byte sequence.
+    static char[] s3 = "\xE2\x89\xA0";
+    i = 0;
+    c = decode(s3, i);
+    assert(c == cast(dchar)'\u2260');
+    assert(i == 3);
+
+    // Inputs that decode() is expected to reject.
+    static char[][] s4 =
+    [   "\xE2\x89",             // too short
+        "\xC0\x8A",
+        "\xE0\x80\x8A",
+        "\xF0\x80\x80\x8A",
+        "\xF8\x80\x80\x80\x8A",
+        "\xFC\x80\x80\x80\x80\x8A",
+    ];
+
+    for (int j = 0; j < s4.length; j++)
+    {
+        try
+        {
+            i = 0;
+            c = decode(s4[j], i);
+            assert(0);          // reached only if decode() failed to throw
+        }
+        catch (UtfException u)
+        {
+            i = 23;             // marker value: proves the catch block ran
+            delete u;
+        }
+        assert(i == 23);
+    }
+}
+
+/** ditto */
+
+// UTF-16 overload. Reads one code point from s starting at idx.
+// On success idx is advanced by one code unit, or by two when a surrogate
+// pair was consumed. On failure a UtfException built from the error message
+// and the offending index is thrown and idx is left untouched.
+dchar decode(wchar[] s, inout size_t idx)
+    in
+    {
+        assert(idx >= 0 && idx < s.length);
+    }
+    out (result)
+    {
+        assert(isValidDchar(result));
+    }
+    body
+    {
+        char[] msg;
+        size_t i = idx;         // work on a copy so idx changes only on success
+        uint u = s[i];
+
+        if (u & ~0x7F)
+        {
+            if (u >= 0xD800 && u <= 0xDBFF)
+            {
+                // High surrogate: a low surrogate must follow.
+                uint u2;
+
+                if (i + 1 == s.length)
+                {
+                    msg = "surrogate UTF-16 high value past end of string";
+                    goto Lerr;
+                }
+                u2 = s[i + 1];
+                if (u2 < 0xDC00 || u2 > 0xDFFF)
+                {
+                    msg = "surrogate UTF-16 low value out of range";
+                    goto Lerr;
+                }
+                // 0xD7C0 == 0xD800 - 0x40, so the subtraction and shift also
+                // add the 0x10000 offset of the supplementary planes.
+                u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
+                i += 2;
+            }
+            else if (u >= 0xDC00 && u <= 0xDFFF)
+            {
+                msg = "unpaired surrogate UTF-16 value";
+                goto Lerr;
+            }
+            else if (u == 0xFFFE || u == 0xFFFF)
+            {
+                msg = "illegal UTF-16 value";
+                goto Lerr;
+            }
+            else
+                i++;
+        }
+        else
+        {
+            i++;                // plain 7-bit value
+        }
+
+        idx = i;
+        return cast(dchar)u;
+
+      Lerr:
+        throw new UtfException(msg, i);
+    }
+
+/** ditto */
+
+// UTF-32 overload. Returns s[idx] and advances idx by one when
+// isValidDchar() accepts the value; otherwise throws UtfException and
+// leaves idx untouched.
+dchar decode(dchar[] s, inout size_t idx)
+    in
+    {
+        assert(idx >= 0 && idx < s.length);
+    }
+    body
+    {
+        size_t i = idx;
+        dchar c = s[i];
+
+        if (!isValidDchar(c))
+            goto Lerr;
+        idx = i + 1;
+        return c;
+
+      Lerr:
+        throw new UtfException("5invalid UTF-32 value", i);
+    }
+
+
+/* =================== Encode ======================= */
+
+/*******************************
+ * Encodes character c and appends it to array s[].
+ */
+
+// UTF-8 overload: appends 1 to 4 bytes depending on the magnitude of c.
+// The validity of c is checked only by the in-contract.
+void encode(inout char[] s, dchar c)
+    in
+    {
+        assert(isValidDchar(c));
+    }
+    body
+    {
+        char[] r = s;
+
+        if (c <= 0x7F)
+        {
+            r ~= cast(char) c;
+        }
+        else
+        {
+            char[4] buf;
+            uint L;             // number of bytes written to buf
+
+            if (c <= 0x7FF)
+            {
+                buf[0] = cast(char)(0xC0 | (c >> 6));
+                buf[1] = cast(char)(0x80 | (c & 0x3F));
+                L = 2;
+            }
+            else if (c <= 0xFFFF)
+            {
+                buf[0] = cast(char)(0xE0 | (c >> 12));
+                buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
+                buf[2] = cast(char)(0x80 | (c & 0x3F));
+                L = 3;
+            }
+            else if (c <= 0x10FFFF)
+            {
+                buf[0] = cast(char)(0xF0 | (c >> 18));
+                buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
+                buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
+                buf[3] = cast(char)(0x80 | (c & 0x3F));
+                L = 4;
+            }
+            else
+            {
+                // Values above 0x10FFFF; presumably already rejected by the
+                // isValidDchar() contract -- confirm against its definition.
+                assert(0);
+            }
+            r ~= buf[0 .. L];
+        }
+        s = r;
+    }
+
+// Unit test for encode(inout char[], dchar): appends a 1-, 2- and 3-byte
+// character in turn and checks the resulting length and bytes.
+unittest
+{
+    debug(utf) printf("utf.encode.unittest\n");
+
+    char[] s = "abcd";
+    encode(s, cast(dchar)'a');
+    assert(s.length == 5);
+    assert(s == "abcda");
+
+    encode(s, cast(dchar)'\u00A9');
+    assert(s.length == 7);
+    assert(s == "abcda\xC2\xA9");
+    //assert(s == "abcda\u00A9");       // BUG: fix compiler
+
+    encode(s, cast(dchar)'\u2260');
+    assert(s.length == 10);
+    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
+}
+
+/** ditto */
+
+// UTF-16 overload: appends one code unit for c <= 0xFFFF, otherwise a
+// high/low surrogate pair.
+void encode(inout wchar[] s, dchar c)
+    in
+    {
+        assert(isValidDchar(c));
+    }
+    body
+    {
+        wchar[] r = s;
+
+        if (c <= 0xFFFF)
+        {
+            r ~= cast(wchar) c;
+        }
+        else
+        {
+            wchar[2] buf;
+
+            buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
+            buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
+            r ~= buf;
+        }
+        s = r;
+    }
+
+/** ditto */
+
+// UTF-32 overload: appends c unchanged.
+void encode(inout dchar[] s, dchar c)
+    in
+    {
+        assert(isValidDchar(c));
+    }
+    body
+    {
+        s ~= c;
+    }
+
+/* =================== Validation ======================= */
+
+/***********************************
+ * Checks to see if string is well formed or not. Throws a UtfException if it is
+ * not. Use to check all untrusted input for correctness.
+ */
+
+// Works by decoding s from the first element to the last; decode() advances
+// i and is the function that throws on malformed input.
+void validate(char[] s)
+{
+    size_t len = s.length;
+    size_t i;
+
+    for (i = 0; i < len; )
+    {
+        decode(s, i);
+    }
+}
+
+/** ditto */
+
+void validate(wchar[] s)
+{
+    size_t len = s.length;
+    size_t i;
+
+    for (i = 0; i < len; )
+    {
+        decode(s, i);
+    }
+}
+
+/** ditto */
+
+void validate(dchar[] s)
+{
+    size_t len = s.length;
+    size_t i;
+
+    for (i = 0; i < len; )
+    {
+        decode(s, i);
+    }
+}
+
+/* =================== Conversion to UTF8 ======================= */
+
+// Encodes the single character c as UTF-8 into buf and returns the slice of
+// buf (1 to 4 bytes long) that holds the encoding.
+char[] toUTF8(char[4] buf, dchar c)
+    in
+    {
+        assert(isValidDchar(c));
+    }
+    body
+    {
+        if (c <= 0x7F)
+        {
+            buf[0] = cast(char) c;
+            return buf[0 .. 1];
+        }
+        else if (c <= 0x7FF)
+        {
+            buf[0] = cast(char)(0xC0 | (c >> 6));
+            buf[1] = cast(char)(0x80 | (c & 0x3F));
+            return buf[0 .. 2];
+        }
+        else if (c <= 0xFFFF)
+        {
+            buf[0] = cast(char)(0xE0 | (c >> 12));
+            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
+            buf[2] = cast(char)(0x80 | (c & 0x3F));
+            return buf[0 .. 3];
+        }
+        else if (c <= 0x10FFFF)
+        {
+            buf[0] = cast(char)(0xF0 | (c >> 18));
+            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
+            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
+            buf[3] = cast(char)(0x80 | (c & 0x3F));
+            return buf[0 .. 4];
+        }
+        assert(0);              // c is above 0x10FFFF
+    }
+
+/*******************
+ * Encodes string s into UTF-8 and returns the encoded string.
+ */
+
+// UTF-8 input is returned as is (no copy); it is validated only by the
+// in-contract.
+char[] toUTF8(char[] s)
+    in
+    {
+        validate(s);
+    }
+    body
+    {
+        return s;
+    }
+
+/** ditto */
+
+// UTF-16 input. 7-bit values are copied straight into the result; at the
+// first value above 0x7F the result is cut back to what has been copied so
+// far and the rest of s is appended through encode().
+char[] toUTF8(wchar[] s)
+{
+    char[] r;
+    size_t i;
+    size_t slen = s.length;
+
+    r.length = slen;
+
+    for (i = 0; i < slen; i++)
+    {
+        wchar c = s[i];
+
+        if (c <= 0x7F)
+            r[i] = cast(char)c;         // fast path for ascii
+        else
+        {
+            r.length = i;
+            // Named d (as in the dchar[] overload) so that it no longer
+            // shadows the outer local c. Iterating a wchar[] with a dchar
+            // variable yields decoded code points.
+            foreach (dchar d; s[i .. slen])
+            {
+                encode(r, d);
+            }
+            break;
+        }
+    }
+    return r;
+}
+
+/** ditto */
+
+// UTF-32 input; same strategy as the wchar[] overload.
+char[] toUTF8(dchar[] s)
+{
+    char[] r;
+    size_t i;
+    size_t slen = s.length;
+
+    r.length = slen;
+
+    for (i = 0; i < slen; i++)
+    {
+        dchar c = s[i];
+
+        if (c <= 0x7F)
+            r[i] = cast(char)c;         // fast path for ascii
+        else
+        {
+            r.length = i;
+            foreach (dchar d; s[i .. slen])
+            {
+                encode(r, d);
+            }
+            break;
+        }
+    }
+    return r;
+}
+
+/* =================== Conversion to UTF16 ======================= */
+
+// Encodes the single character c as UTF-16 into buf and returns the slice of
+// buf that holds it: one code unit for c <= 0xFFFF, otherwise a surrogate
+// pair.
+wchar[] toUTF16(wchar[2] buf, dchar c)
+    in
+    {
+        assert(isValidDchar(c));
+    }
+    body
+    {
+        if (c <= 0xFFFF)
+        {
+            buf[0] = cast(wchar) c;
+            return buf[0 .. 1];
+        }
+        else
+        {
+            buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
+            buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
+            return buf[0 .. 2];
+        }
+    }
+
+/****************
+ * Encodes string s into UTF-16 and returns the encoded string.
+ * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
+ * an LPWSTR or LPCWSTR argument.
+ */
+
+wchar[] toUTF16(char[] s)
+{
+    size_t total = s.length;
+    wchar[] result;
+
+    // Length is set to the input size and then back to zero before
+    // appending; presumably this pre-allocates storage -- confirm against
+    // the runtime's array semantics.
+    result.length = total;
+    result.length = 0;
+
+    size_t pos = 0;
+    while (pos < total)
+    {
+        dchar ch = s[pos];
+
+        if (ch > 0x7F)
+        {
+            // Multi-byte sequence: decode() advances pos past it.
+            ch = decode(s, pos);
+            encode(result, ch);
+        }
+        else
+        {
+            // 7-bit value: one code unit, copied directly.
+            result ~= cast(wchar)ch;
+            pos++;
+        }
+    }
+    return result;
+}
+
+/** ditto */
+
+// Same conversion as toUTF16(char[]), followed by a terminating zero code
+// unit; returns a pointer to the first code unit instead of the array.
+wchar* toUTF16z(char[] s)
+{
+    size_t total = s.length;
+    wchar[] result;
+
+    // One extra element is requested for the terminator appended below.
+    result.length = total + 1;
+    result.length = 0;
+
+    size_t pos = 0;
+    while (pos < total)
+    {
+        dchar ch = s[pos];
+
+        if (ch > 0x7F)
+        {
+            ch = decode(s, pos);
+            encode(result, ch);
+        }
+        else
+        {
+            result ~= cast(wchar)ch;
+            pos++;
+        }
+    }
+    result ~= "\000";
+    return result.ptr;
+}
+
+/** ditto */
+
+// UTF-16 input is returned as is (no copy); it is validated only by the
+// in-contract.
+wchar[] toUTF16(wchar[] s)
+    in
+    {
+        validate(s);
+    }
+    body
+    {
+        return s;
+    }
+
+/** ditto */
+
+// UTF-32 input: every element is appended through encode().
+wchar[] toUTF16(dchar[] s)
+{
+    wchar[] result;
+
+    result.length = s.length;
+    result.length = 0;
+    foreach (dchar ch; s)
+    {
+        encode(result, ch);
+    }
+    return result;
+}
+
+/* =================== Conversion to UTF32 ======================= */
+
+/*****
+ * Encodes string s into UTF-32 and returns the encoded string.
+ */
+
+dchar[] toUTF32(char[] s)
+{
+    size_t total = s.length;
+    dchar[] result;
+    size_t count = 0;           // number of code points written so far
+
+    result.length = total;      // the result never has more elements than s
+
+    size_t pos = 0;
+    while (pos < total)
+    {
+        dchar ch = s[pos];
+
+        if (ch < 0x80)
+            pos++;              // ascii: no need to call decode()
+        else
+            ch = decode(s, pos);
+        result[count++] = ch;
+    }
+    return result[0 .. count];
+}
+
+/** ditto */
+
+dchar[] toUTF32(wchar[] s)
+{
+    size_t total = s.length;
+    dchar[] result;
+    size_t count = 0;           // number of code points written so far
+
+    result.length = total;      // the result never has more elements than s
+
+    size_t pos = 0;
+    while (pos < total)
+    {
+        dchar ch = s[pos];
+
+        if (ch < 0x80)
+            pos++;              // ascii: no need to call decode()
+        else
+            ch = decode(s, pos);
+        result[count++] = ch;
+    }
+    return result[0 .. count];
+}
+
+/** ditto */
+
+// UTF-32 input is returned as is (no copy); it is validated only by the
+// in-contract.
+dchar[] toUTF32(dchar[] s)
+    in
+    {
+        validate(s);
+    }
+    body
+    {
+        return s;
+    }
+
+/* ================================ tests ================================== */
+
+// Round-trip test: each sample text is converted between UTF-8, UTF-16 and
+// UTF-32 in every direction and compared with the expected literal.
+unittest
+{
+    debug(utf) printf("utf.toUTF.unittest\n");
+
+    // text, text16 and text32 are the same text in the three encodings.
+    void checkConversions(char[] text, wchar[] text16, dchar[] text32)
+    {
+        char[] u8 = text;
+        wchar[] u16;
+        dchar[] u32;
+
+        u16 = toUTF16(u8);
+        //foreach (wchar c; u16) printf("c = x%x\n", c);
+        //foreach (wchar c; text16) printf("c = x%x\n", c);
+        assert(u16 == text16);
+        u32 = toUTF32(u8);
+        assert(u32 == text32);
+
+        u8 = toUTF8(u16);
+        assert(u8 == text);
+        u32 = toUTF32(u16);
+        assert(u32 == text32);
+
+        u8 = toUTF8(u32);
+        assert(u8 == text);
+        u16 = toUTF16(u32);
+        assert(u16 == text16);
+    }
+
+    // Pure ASCII.
+    checkConversions("hello", "hello", "hello");
+    // One character from the Basic Multilingual Plane.
+    checkConversions("hel\u1234o", "hel\u1234o", "hel\u1234o");
+    // One character above U+FFFF.
+    checkConversions("he\U0010AAAAllo", "he\U0010AAAAllo", "he\U0010AAAAllo");
+}