Mercurial > projects > ldc
diff tango/tango/net/Uri.d @ 132:1700239cab2e trunk
[svn r136] MAJOR UNSTABLE UPDATE!!!
Initial commit after moving to Tango instead of Phobos.
Lots of bugfixes...
This build is not suitable for most things.
author | lindquist |
---|---|
date | Fri, 11 Jan 2008 17:57:40 +0100 |
parents | |
children |
line wrap: on
line diff
/*******************************************************************************

        copyright:      Copyright (c) 2004 Kris Bell. All rights reserved

        license:        BSD style: $(LICENSE)

        version:        Initial release: April 2004

        author:         Kris

*******************************************************************************/

module tango.net.Uri;

public  import  tango.net.model.UriView;

private import  tango.core.Exception;

private import  Integer = tango.text.convert.Integer;

/*******************************************************************************

        external links

*******************************************************************************/

extern (C) char* memchr (char *, char, uint);


/*******************************************************************************

        Implements an RFC 2396 compliant URI specification. See
        <A HREF="http://ftp.ics.uci.edu/pub/ietf/uri/rfc2396.txt">this page</A>
        for more information.

        The implementation fails the spec on two counts: it doesn't insist
        on a scheme being present in the Uri, and it doesn't implement the
        "Relative References" support noted in section 5.2 (parse() offers
        a simple path merge only; dot-segments are not removed). The latter
        can be found in tango.util.PathUtil instead.

        Note that IRI support can be implied by assuming each of userinfo,
        path, query, and fragment are UTF-8 encoded
        (see <A HREF="http://www.w3.org/2001/Talks/0912-IUC-IRI/paper.html">
        this page</A> for further details).

        Decoded components are slices of a buffer owned by the Uri; they
        remain valid until the next reset() or non-relative parse().

        Use the UriView subset where you need a readonly perspective.

*******************************************************************************/

class Uri : UriView
{
        // simplistic string appender
        private alias void delegate (void[]) Consumer;

        private int             port;
        private char[]          host,
                                path,
                                query,
                                scheme,
                                userinfo,
                                fragment;

        // backing store for %-decoded components
        private HeapSlice       decoded;

        // per-character classification flags (see the enum below)
        private static ubyte    map[256];

        // scheme name -> default port
        private static short[char[]] genericSchemes;

        private static const char[] hexDigits = "0123456789abcdef";

        private static const SchemePort[] schemePorts =
                [
                {"coffee",      80},
                {"file",        InvalidPort},
                {"ftp",         21},
                {"gopher",      70},
                {"hnews",       80},
                {"http",        80},
                {"http-ng",     80},
                {"https",       443},
                {"imap",        143},
                {"irc",         194},
                {"ldap",        389},
                {"news",        119},
                {"nfs",         2049},
                {"nntp",        119},
                {"pop",         110},
                {"prospero",    1525},
                {"rwhois",      4321},
                {"sip",         InvalidPort},
                {"sips",        InvalidPort},
                {"sipt",        InvalidPort},
                {"sipu",        InvalidPort},
                {"shttp",       80},
                {"smtp",        25},
                {"snews",       563},
                {"telnet",      23},
                {"vemmi",       575},
                {"videotex",    516},
                {"wais",        210},
                {"whois",       43},
                {"whois++",     43},
                ];

        // Character-class bits held in map[]. Exc* bits terminate the
        // named component while parsing; Inc* bits mark characters that
        // may be emitted unescaped within the named component. Each bit
        // must be distinct (IncUser formerly shared 0x80 with IncScheme).
        public enum
        {
                ExcScheme       = 0x01,
                ExcAuthority    = 0x02,
                ExcPath         = 0x04,
                IncUser         = 0x08,         // encode spec for User
                IncPath         = 0x10,         // encode spec for Path
                IncQuery        = 0x20,         // encode spec for Query
                IncQueryAll     = 0x40,
                IncScheme       = 0x80,         // encode spec for Scheme
                IncGeneric      = IncScheme |
                                  IncUser   |
                                  IncPath   |
                                  IncQuery  |
                                  IncQueryAll
        }

        // scheme and port pairs
        private struct SchemePort
        {
                char[]  name;
                short   port;
        }

        /***********************************************************************

                Initialize the Uri character maps and so on

        ***********************************************************************/

        static this ()
        {
                // Map known generic schemes to their default port. Specify
                // InvalidPort for those schemes that don't use ports. Note
                // that a port value of zero is not supported ...
                foreach (SchemePort sp; schemePorts)
                         genericSchemes[sp.name] = sp.port;
                genericSchemes.rehash;

                // load the character map with valid symbols
                for (int i='a'; i <= 'z'; ++i)
                     map[i] = IncGeneric;

                for (int i='A'; i <= 'Z'; ++i)
                     map[i] = IncGeneric;

                for (int i='0'; i <= '9'; ++i)
                     map[i] = IncGeneric;

                // exclude these from parsing elements
                map[':'] |= ExcScheme;
                map['/'] |= ExcScheme | ExcAuthority;
                map['?'] |= ExcScheme | ExcAuthority | ExcPath;
                map['#'] |= ExcScheme | ExcAuthority | ExcPath;

                // include these as common symbols
                foreach (char c; "-_.!~*'()")
                         map[c] |= IncUser | IncQuery | IncQueryAll;

                // include these as scheme symbols
                foreach (char c; "+-.")
                         map[c] |= IncScheme;

                // include these as userinfo symbols
                foreach (char c; ";:&=+$,")
                         map[c] |= IncUser;

                // include these as path symbols
                foreach (char c; "/;:@&=+$,")
                         map[c] |= IncPath;

                // include these as query symbols
                foreach (char c; ";/:@=+$,")
                         map[c] |= IncQuery | IncQueryAll;

                // '?' and '&' are left alone only when emitting a whole
                // query; '%' is permitted inside queries when constructing
                // output
                foreach (char c; "?&%")
                         map[c] |= IncQueryAll;
        }

        /***********************************************************************

                Create an empty Uri

        ***********************************************************************/

        this ()
        {
                port = InvalidPort;
                decoded = new HeapSlice (256);
        }

        /***********************************************************************

                Construct a Uri from the provided character string

        ***********************************************************************/

        this (char[] uri)
        {
                this ();
                parse (uri);
        }

        /***********************************************************************

                Construct a Uri from the given components. The query is
                optional. The components are stored as provided (they
                are neither decoded nor copied).

        ***********************************************************************/

        this (char[] scheme, char[] host, char[] path, char[] query = null)
        {
                this ();

                this.scheme = scheme;
                this.query = query;
                this.host = host;
                this.path = path;
        }

        /***********************************************************************

                Clone another Uri. This can be used to make a mutable Uri
                from an immutable UriView.

        ***********************************************************************/

        this (UriView other)
        {
                with (other)
                     {
                     this (getScheme, getHost, getPath, getQuery);
                     this.userinfo = getUserInfo;
                     this.fragment = getFragment;
                     this.port = getPort;
                     }
        }

        /***********************************************************************

                Return the default port for the given scheme. InvalidPort
                is returned if the scheme is unknown, or does not accept
                a port.

        ***********************************************************************/

        final int getDefaultPort (char[] scheme)
        {
                short* port = scheme in genericSchemes;
                if (port is null)
                    return InvalidPort;
                return *port;
        }

        /***********************************************************************

                Return the parsed scheme, or null if the scheme was not
                specified

        ***********************************************************************/

        final char[] getScheme()
        {
                return scheme;
        }

        /***********************************************************************

                Return the parsed host, or null if the host was not
                specified

        ***********************************************************************/

        final char[] getHost()
        {
                return host;
        }

        /***********************************************************************

                Return the parsed port number, or InvalidPort if the port
                was not provided.

        ***********************************************************************/

        final int getPort()
        {
                return port;
        }

        /***********************************************************************

                Return a valid port number by performing a lookup on the
                known schemes if the port was not explicitly specified.

        ***********************************************************************/

        final int getValidPort()
        {
                if (port is InvalidPort)
                    return getDefaultPort (scheme);
                return port;
        }

        /***********************************************************************

                Return the parsed userinfo, or null if userinfo was not
                provided.

        ***********************************************************************/

        final char[] getUserInfo()
        {
                return userinfo;
        }

        /***********************************************************************

                Return the parsed path, or null if the path was not
                provided.

        ***********************************************************************/

        final char[] getPath()
        {
                return path;
        }

        /***********************************************************************

                Return the parsed query, or null if a query was not
                provided.

        ***********************************************************************/

        final char[] getQuery()
        {
                return query;
        }

        /***********************************************************************

                Return the parsed fragment, or null if a fragment was not
                provided.

        ***********************************************************************/

        final char[] getFragment()
        {
                return fragment;
        }

        /***********************************************************************

                Return whether or not the Uri scheme is considered generic.

        ***********************************************************************/

        final bool isGeneric ()
        {
                return (scheme in genericSchemes) !is null;
        }

        /***********************************************************************

                Emit the content of this Uri via the provided Consumer. The
                output is constructed per RFC 2396. The port is emitted
                only when it differs from the default for the scheme.

                Returns the consumer, to support call chaining.

        ***********************************************************************/

        final Consumer produce (Consumer consume)
        {
                if (scheme.length)
                    consume (scheme), consume (":");


                if (userinfo.length || host.length || port != InvalidPort)
                   {
                   consume ("//");

                   if (userinfo.length)
                       encode (consume, userinfo, IncUser) ("@");

                   if (host.length)
                       consume (host);

                   if (port != InvalidPort && port != getDefaultPort(scheme))
                      {
                      char[8] tmp;
                      consume (":"), consume (Integer.itoa (tmp, cast(uint) port));
                      }
                   }

                if (path.length)
                    encode (consume, path, IncPath);

                if (query.length)
                   {
                   consume ("?");
                   encode (consume, query, IncQueryAll);
                   }

                if (fragment.length)
                   {
                   consume ("#");
                   encode (consume, fragment, IncQuery);
                   }

                return consume;
        }

        /***********************************************************************

                Return the content of this Uri as a freshly allocated
                string, formatted as described for produce().

        ***********************************************************************/

        final char[] toString ()
        {
                void[] s;

                // reserve some space up front, then append from the start
                s.length = 256, s.length = 0;
                produce ((void[] v) {s ~= v;});
                return cast(char[]) s;
        }

        /***********************************************************************

                Encode uri characters into a Consumer, such that
                reserved chars are converted into their %hex version.

                Any character whose map entry has none of the given
                flags set is emitted as '%' plus two lowercase hex
                digits; everything else is passed through in runs.

                Returns the consumer, to support call chaining.

        ***********************************************************************/

        static Consumer encode (Consumer consume, char[] s, int flags)
        {
                char[3] hex;
                int     mark;

                hex[0] = '%';
                foreach (int i, char c; s)
                        {
                        if (! (map[c] & flags))
                           {
                           // flush the pending run of unreserved chars
                           consume (s[mark..i]);
                           mark = i+1;

                           hex[1] = hexDigits [(c >> 4) & 0x0f];
                           hex[2] = hexDigits [c & 0x0f];
                           consume (hex);
                           }
                        }

                // add trailing section
                if (mark < s.length)
                    consume (s[mark..s.length]);

                return consume;
        }

        /***********************************************************************

                Encode uri characters into a string, such that reserved
                chars are converted into their %hex version.

                Returns a newly allocated string

        ***********************************************************************/

        final char[] encode (char[] text, int flags)
        {
                void[] s;
                encode ((void[] v) {s ~= v;}, text, flags);
                return cast(char[]) s;
        }

        /***********************************************************************

                Decode a character string with potential %hex values in it.
                Where the input contains no '%' it is returned untouched;
                otherwise the decoded text is placed into the expanding
                buffer owned by this Uri, and a slice of that buffer is
                returned to the caller.

                An escape that decodes to the (non-zero) 'ignore' char is
                left in its escaped form. Hex digits are not validated.

        ***********************************************************************/

        private char[] decoder (char[] s, char ignore=0)
        {
                static int toInt (char c)
                {
                        if (c >= '0' && c <= '9')
                            c -= '0';
                        else
                        if (c >= 'a' && c <= 'f')
                            c -= ('a' - 10);
                        else
                        if (c >= 'A' && c <= 'F')
                            c -= ('A' - 10);
                        return c;
                }

                int length = s.length;

                // take a peek first, to see if there's work to do
                if (length && memchr (s.ptr, '%', length))
                   {
                   char* p;
                   int   j;

                   // ensure we have enough decoding space available
                   // (the output is never longer than the input)
                   p = cast(char*) decoded.expand (length);

                   // scan string, stripping % encodings as we go
                   for (int i; i < length; ++i, ++j, ++p)
                       {
                       int c = s[i];

                       if (c is '%' && (i+2) < length)
                          {
                          c = toInt(s[i+1]) * 16 + toInt(s[i+2]);

                          // leave ignored escapes in the stream,
                          // permitting escaped '&' to remain in
                          // the query string
                          if (c && (c is ignore))
                              c = '%';
                          else
                             i += 2;
                          }

                       *p = c;
                       }

                   // return a slice from the decoded input
                   return cast(char[]) decoded.slice (j);
                   }

                // return original content
                return s;
        }

        /***********************************************************************

                Decode a duplicated string with potential %hex values in it

        ***********************************************************************/

        final char[] decode (char[] s)
        {
                return decoder(s).dup;
        }

        /***********************************************************************

                Parsing is performed according to RFC 2396

                ---
                  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
                   12            3  4          5       6  7        8 9

                2 isolates scheme
                4 isolates authority
                5 isolates path
                7 isolates query
                9 isolates fragment
                ---

                This was originally a state-machine; it turned out to be a
                lot faster (~40%) when unwound like this instead.

                When 'relative' is set, the uri is taken as a reference
                against the current content: the existing scheme and
                authority are retained unless the reference supplies its
                own, and a path which does not begin with '/' is appended
                to the current path up to (and including) its last '/'.
                Dot-segments are not removed. A reference carrying its
                own scheme is treated as absolute.

                The provided uri is not modified; undecoded components
                remain slices of it.

        ***********************************************************************/

        final Uri parse (char[] uri, bool relative = false)
        {
                char    c;
                int     i,
                        mark,
                        len = uri.length;

                // locate the end of a potential scheme (note that it's OK
                // to not specify a scheme)
                for (i=0; i < len && !(map[c = uri[i]] & ExcScheme); ++i) {}
                bool hasScheme = (c is ':');

                // a reference with its own scheme is an absolute one
                if (hasScheme)
                    relative = false;

                // hold on to the base path before it gets cleared. For a
                // relative reference the prior content is retained, along
                // with the decode buffer it may be sliced from
                char[] base = path;
                if (! relative)
                      reset;

                // isolate scheme
                if (hasScheme)
                   {
                   scheme = uri [0 .. i];

                   // lowercase a private copy where required, rather
                   // than writing through to the caller's content
                   // (which may reside in read-only memory)
                   foreach (char ch; scheme)
                            if (ch >= 'A' && ch <= 'Z')
                               {
                               scheme = scheme.dup;
                               toLower (scheme);
                               break;
                               }
                   mark = i + 1;
                   }

                // isolate authority
                bool merge = false;
                if (mark < len-1 && uri[mark] is '/' && uri[mark+1] is '/')
                   {
                   // a new authority replaces the prior one entirely
                   userinfo = host = null;
                   port = InvalidPort;

                   for (mark+=2, i=mark; i < len && !(map[uri[i]] & ExcAuthority); ++i) {}
                   parseAuthority (uri[mark .. i]);
                   mark = i;
                   }
                else
                   if (relative && (mark >= len || uri[mark] != '/'))
                       merge = true;

                // query & fragment never carry over from the base
                query = fragment = null;

                // isolate path
                for (i=mark; i < len && !(map[uri[i]] & ExcPath); ++i) {}
                path = decoder (uri[mark .. i]);
                mark = i;

                if (merge)
                   {
                   // the base is decoded already, so join after decoding
                   char[] dir = toLastSlash (base);

                   // a base with an authority but no path: root the
                   // reference so the output remains well-formed
                   if (dir.length is 0 && host.length)
                       dir = "/";
                   path = dir ~ path;
                   }

                // isolate query
                if (mark < len && uri[mark] is '?')
                   {
                   for (++mark, i=mark; i < len && uri[i] != '#'; ++i) {}
                   query = decoder (uri[mark .. i], '&');
                   mark = i;
                   }

                // isolate fragment
                if (mark < len && uri[mark] is '#')
                    fragment = decoder (uri[mark+1 .. len]);

                return this;
        }

        /***********************************************************************

                Clear everything to null, and recycle the decode buffer.
                Slices previously handed out from the decode buffer may
                be overwritten by subsequent parsing.

        ***********************************************************************/

        final void reset()
        {
                decoded.reset;
                port = InvalidPort;
                host = path = query = scheme = userinfo = fragment = null;
        }

        /***********************************************************************

                Parse the given uri, with support for relative URLs.
                Equivalent to parse (uri, true).

        ***********************************************************************/

        final Uri relParse (char[] uri)
        {
                return parse (uri, true);
        }

        /***********************************************************************

                Set the Uri scheme

        ***********************************************************************/

        final Uri setScheme (char[] scheme)
        {
                this.scheme = scheme;
                return this;
        }

        /***********************************************************************

                Set the Uri host

        ***********************************************************************/

        final Uri setHost (char[] host)
        {
                this.host = host;
                return this;
        }

        /***********************************************************************

                Set the Uri port

        ***********************************************************************/

        final Uri setPort (int port)
        {
                this.port = port;
                return this;
        }

        /***********************************************************************

                Set the Uri userinfo

        ***********************************************************************/

        final Uri setUserInfo (char[] userinfo)
        {
                this.userinfo = userinfo;
                return this;
        }

        /***********************************************************************

                Set the Uri query

        ***********************************************************************/

        final Uri setQuery (char[] query)
        {
                this.query = query;
                return this;
        }

        /***********************************************************************

                Extend the Uri query. A non-empty tail is appended to an
                existing query with a '&' separator, or becomes the query
                where there is none. Returns the resultant query.

        ***********************************************************************/

        final char[] extendQuery (char[] tail)
        {
                if (tail.length)
                   {
                   if (query.length)
                       query = query ~ "&" ~ tail;
                   else
                      query = tail;
                   }
                return query;
        }

        /***********************************************************************

                Set the Uri path

        ***********************************************************************/

        final Uri setPath (char[] path)
        {
                this.path = path;
                return this;
        }

        /***********************************************************************

                Set the Uri fragment

        ***********************************************************************/

        final Uri setFragment (char[] fragment)
        {
                this.fragment = fragment;
                return this;
        }

        /***********************************************************************

                Authority is the section after the scheme, but before the
                path, query or fragment; it typically represents a host.

                ---
                    ^(([^@]*)@?)([^:]*)?(:(.*))?
                     12         3       4 5

                2 isolates userinfo
                3 isolates host
                5 isolates port
                ---

                Only those components present within the authority are
                assigned. An empty port (as in "host:") is treated as
                though no port were specified.

        ***********************************************************************/

        private void parseAuthority (char[] auth)
        {
                int     mark,
                        len = auth.length;

                // get userinfo: (([^@]*)@?)
                foreach (int i, char c; auth)
                         if (c is '@')
                            {
                            userinfo = decoder (auth[0 .. i]);
                            mark = i + 1;
                            break;
                            }

                // get port: (:(.*))?
                for (int i=mark; i < len; ++i)
                     if (auth [i] is ':')
                        {
                        // port zero is not supported, so don't let an
                        // empty port spec convert into one
                        if (i+1 < len)
                            port = Integer.atoi (auth [i+1 .. len]);
                        len = i;
                        break;
                        }

                // get host: ([^:]*)?
                host = auth [mark..len];
        }

        /**********************************************************************

                Return the leading portion of the path up to, and
                including, the last '/'. An empty result is returned
                where the path has no '/' (or is null).

        **********************************************************************/

        private final char[] toLastSlash (char[] path)
        {
                // index rather than pointer based, such that a null
                // path is handled without wandering off into memory
                for (int i = path.length; i-- > 0;)
                     if (path[i] is '/')
                         return path [0 .. i+1];
                return null;
        }

        /**********************************************************************

                in-place conversion to lowercase (ASCII only)

        **********************************************************************/

        private final static char[] toLower (inout char[] src)
        {
                foreach (inout char c; src)
                         if (c >= 'A' && c <= 'Z')
                             c = c + ('a' - 'A');
                return src;
        }
}


/*******************************************************************************

        A simple expanding buffer, from which consecutive slices are
        handed out. Used by Uri to hold decoded components.

*******************************************************************************/

private class HeapSlice
{
        private uint    used;
        private void[]  buffer;

        /***********************************************************************

                Create with the specified starting size

        ***********************************************************************/

        this (uint size)
        {
                buffer = new void[size];
        }

        /***********************************************************************

                Reset content length to zero

        ***********************************************************************/

        final void reset ()
        {
                used = 0;
        }

        /***********************************************************************

                Potentially expand the content space, and return a pointer
                to the start of the empty section. The content length is
                not adjusted; follow up with slice() to claim the space.

        ***********************************************************************/

        final void* expand (uint size)
        {
                if ((used + size) > buffer.length)
                     buffer.length = (used + size) * 2;
                return &buffer [used];
        }

        /***********************************************************************

                Return a slice of the content from the current position
                with the specified size. Adjusts the current position to
                point at an empty zone.

        ***********************************************************************/

        final void[] slice (int size)
        {
                uint i = used;
                used += size;
                return buffer [i..used];
        }
}



/*******************************************************************************

        Simple demonstration, enabled via -debug=Uri

*******************************************************************************/

debug (Uri)
{
        import tango.io.Console;

        void main()
        {
                auto uri = new Uri ("http://foo.bar?a=1&b=2&c=3&d=%26%26&e=5");

                Cout (uri.getQuery).newline;
                Cout (uri).newline;

                Cout (uri.encode ("&#$%", uri.IncQuery)).newline;
        }
}