comparison tango/tango/net/Uri.d @ 132:1700239cab2e trunk

[svn r136] MAJOR UNSTABLE UPDATE!!! Initial commit after moving to Tango instead of Phobos. Lots of bugfixes... This build is not suitable for most things.
author lindquist
date Fri, 11 Jan 2008 17:57:40 +0100
parents
children
comparison
equal deleted inserted replaced
131:5825d48b27d1 132:1700239cab2e
1 /*******************************************************************************
2
3 copyright: Copyright (c) 2004 Kris Bell. All rights reserved
4
5 license: BSD style: $(LICENSE)
6
7 version: Initial release: April 2004
8
9 author: Kris
10
11 *******************************************************************************/
12
13 module tango.net.Uri;
14
15 public import tango.net.model.UriView;
16
17 private import tango.core.Exception;
18
19 private import Integer = tango.text.convert.Integer;
20
21 /*******************************************************************************
22
23 external links
24
25 *******************************************************************************/
26
// C runtime memchr, declared with its standard prototype
// (void* memchr (const void*, int, size_t)) so that the length argument
// has the platform's size_t width rather than a fixed 32 bits.
extern (C) void* memchr (void*, int, size_t);
28
29
30 /*******************************************************************************
31
32 Implements an RFC 2396 compliant URI specification. See
33 <A HREF="http://ftp.ics.uci.edu/pub/ietf/uri/rfc2396.txt">this page</A>
34 for more information.
35
36 The implementation fails the spec on two counts: it doesn't insist
37 on a scheme being present in the Uri, and it doesn't implement the
38 "Relative References" support noted in section 5.2. The latter can
39 be found in tango.util.PathUtil instead.
40
41 Note that IRI support can be implied by assuming each of userinfo,
42 path, query, and fragment are UTF-8 encoded
43 (see <A HREF="http://www.w3.org/2001/Talks/0912-IUC-IRI/paper.html">
44 this page</A> for further details).
45
46 Use the UriView subset where you need a readonly perspective.
47
48 *******************************************************************************/
49
50 class Uri : UriView
51 {
// sink used while emitting text: receives successive output fragments
private alias void delegate (void[]) Consumer;

// individual Uri components, as parsed or as explicitly assigned
private char[]                  scheme;
private char[]                  userinfo;
private char[]                  host;
private int                     port;
private char[]                  path;
private char[]                  query;
private char[]                  fragment;

// backing store for components that needed %-decoding
private HeapSlice               decoded;

// per-character classification flags, indexed by character code
private static ubyte[256]       map;

// default port of each known scheme, keyed by scheme name
private static short[char[]]    genericSchemes;

// digits used when emitting %xx escapes
private static const char[]     hexDigits = "0123456789abcdef";

// source table from which genericSchemes is populated
private static const SchemePort[] schemePorts =
        [
        {"coffee",      80},
        {"file",        InvalidPort},
        {"ftp",         21},
        {"gopher",      70},
        {"hnews",       80},
        {"http",        80},
        {"http-ng",     80},
        {"https",       443},
        {"imap",        143},
        {"irc",         194},
        {"ldap",        389},
        {"news",        119},
        {"nfs",         2049},
        {"nntp",        119},
        {"pop",         110},
        {"prospero",    1525},
        {"rwhois",      4321},
        {"sip",         InvalidPort},
        {"sips",        InvalidPort},
        {"sipt",        InvalidPort},
        {"sipu",        InvalidPort},
        {"shttp",       80},
        {"smtp",        25},
        {"snews",       563},
        {"telnet",      23},
        {"vemmi",       575},
        {"videotex",    516},
        {"wais",        210},
        {"whois",       43},
        {"whois++",     43},
        ];
103
// Character-class flags stored in the static character map. The Exc*
// bits stop the scheme/authority/path scans in parse(); the Inc* bits
// select which characters encode() passes through unescaped for a
// given component.
//
// Every flag occupies its own bit. IncUser used to share 0x80 with
// IncScheme, which made the userinfo and scheme classes identical;
// it now takes the otherwise unused 0x08.
public enum
{
        ExcScheme       = 0x01,
        ExcAuthority    = 0x02,
        ExcPath         = 0x04,
        IncUser         = 0x08,         // encode spec for User
        IncPath         = 0x10,         // encode spec for Path
        IncQuery        = 0x20,         // encode spec for Query
        IncQueryAll     = 0x40,
        IncScheme       = 0x80,         // encode spec for Scheme
        IncGeneric      = IncScheme |
                          IncUser   |
                          IncPath   |
                          IncQuery  |
                          IncQueryAll
}
120
// one row of the scheme table: a scheme name and its default port
private struct SchemePort
{
        char[]  name;           // scheme name, e.g. "http"
        short   port;           // default port for that scheme
}
127
/***********************************************************************

        Populate the scheme lookup table and the character map used
        by the parser and the encoder

***********************************************************************/

static this ()
{
        // OR the given flags into the map entry of each listed symbol
        static void flag (char[] symbols, int flags)
        {
                foreach (char c; symbols)
                         map[c] |= flags;
        }

        // Map known generic schemes to their default port. Specify
        // InvalidPort for those schemes that don't use ports. Note
        // that a port value of zero is not supported ...
        foreach (SchemePort entry; schemePorts)
                 genericSchemes[entry.name] = entry.port;
        genericSchemes.rehash;

        // letters and digits are valid in every component
        for (int c = 0; c < map.length; ++c)
             if ((c >= 'a' && c <= 'z') ||
                 (c >= 'A' && c <= 'Z') ||
                 (c >= '0' && c <= '9'))
                  map[c] = IncGeneric;

        // exclude these from parsing elements
        flag (":",              ExcScheme);
        flag ("/",              ExcScheme | ExcAuthority);
        flag ("?#",             ExcScheme | ExcAuthority | ExcPath);

        // include these as common symbols
        flag ("-_.!~*'()",      IncUser | IncQuery | IncQueryAll);

        // include these as scheme symbols
        flag ("+-.",            IncScheme);

        // include these as userinfo symbols
        flag (";:&=+$,",        IncUser);

        // include these as path symbols
        flag ("/;:@&=+$,",      IncPath);

        // include these as query symbols; '?' and '&' are accepted
        // only by the IncQueryAll variant
        flag (";/:@=+$,",       IncQuery | IncQueryAll);
        flag ("?&",             IncQueryAll);

        // '%' are permitted inside queries when constructing output
        flag ("%",              IncQueryAll);
}
210
/***********************************************************************

        Create an empty Uri: no port, and a fresh decode buffer

***********************************************************************/

this ()
{
        decoded = new HeapSlice (256);
        port    = InvalidPort;
}
222
/***********************************************************************

        Create a Uri by parsing the provided character string

***********************************************************************/

this (char[] uri)
{
        // establish the empty state first, then parse into it
        this ();
        parse (uri);
}
234
/***********************************************************************

        Create a Uri from individual components. The components are
        stored as given (no parsing takes place); the query may be
        omitted.

***********************************************************************/

this (char[] scheme, char[] host, char[] path, char[] query = null)
{
        this ();

        this.scheme = scheme;
        this.host   = host;
        this.path   = path;
        this.query  = query;
}
251
/***********************************************************************

        Copy the components of another Uri. This can be used to
        make a mutable Uri from an immutable UriView.

***********************************************************************/

this (UriView other)
{
        this (other.getScheme, other.getHost, other.getPath, other.getQuery);

        userinfo = other.getUserInfo;
        fragment = other.getFragment;
        port     = other.getPort;
}
269
/***********************************************************************

        Look up the default port of the given scheme. InvalidPort
        is returned when the scheme is not in the table of known
        schemes, or is listed there without a port.

***********************************************************************/

final int getDefaultPort (char[] scheme)
{
        auto entry = scheme in genericSchemes;
        return (entry is null) ? InvalidPort : *entry;
}
285
/***********************************************************************

        Return the scheme, or null when none has been parsed or set

***********************************************************************/

final char[] getScheme()
{
        return scheme;
}
297
/***********************************************************************

        Return the host, or null when none has been parsed or set

***********************************************************************/

final char[] getHost()
{
        return host;
}
309
/***********************************************************************

        Return the port number, or InvalidPort when no port has
        been parsed or set. No scheme-based default is applied;
        see getValidPort for that.

***********************************************************************/

final int getPort()
{
        return port;
}
321
/***********************************************************************

        Return the explicit port when there is one; otherwise fall
        back to the default port registered for the current scheme
        (which may itself be InvalidPort).

***********************************************************************/

final int getValidPort()
{
        return (port is InvalidPort) ? getDefaultPort (scheme) : port;
}
335
/***********************************************************************

        Return the userinfo, or null when none has been parsed or
        set

***********************************************************************/

final char[] getUserInfo()
{
        return userinfo;
}
347
/***********************************************************************

        Return the path, or null when none has been parsed or set

***********************************************************************/

final char[] getPath()
{
        return path;
}
359
/***********************************************************************

        Return the query, or null when none has been parsed or set

***********************************************************************/

final char[] getQuery()
{
        return query;
}
371
/***********************************************************************

        Return the fragment, or null when none has been parsed or
        set

***********************************************************************/

final char[] getFragment()
{
        return fragment;
}
383
/***********************************************************************

        Return true when the current scheme appears in the table
        of known generic schemes

***********************************************************************/

final bool isGeneric ()
{
        auto known = scheme in genericSchemes;
        return known !is null;
}
394
/***********************************************************************

        Write this Uri to the provided Consumer, one fragment at a
        time, in scheme/authority/path/query/fragment order. Empty
        components are skipped, and the port is written only when
        it differs from the default port of the scheme.

        Returns the Consumer, so that calls can be chained.

***********************************************************************/

final Consumer produce (Consumer consume)
{
        if (scheme.length)
           {
           consume (scheme);
           consume (":");
           }

        bool hasPort = (port != InvalidPort);

        // the authority section appears when any of its parts exist
        if (userinfo.length || host.length || hasPort)
           {
           consume ("//");

           if (userinfo.length)
              {
              encode (consume, userinfo, IncUser);
              consume ("@");
              }

           if (host.length)
               consume (host);

           if (hasPort && port != getDefaultPort(scheme))
              {
              char[8] digits;
              consume (":");
              consume (Integer.itoa (digits, cast(uint) port));
              }
           }

        if (path.length)
            encode (consume, path, IncPath);

        if (query.length)
           {
           consume ("?");
           encode (consume, query, IncQueryAll);
           }

        if (fragment.length)
           {
           consume ("#");
           encode (consume, fragment, IncQuery);
           }

        return consume;
}
442
/***********************************************************************

        Return the textual form of this Uri, assembled by running
        produce() into a freshly allocated array

***********************************************************************/

final char[] toString ()
{
        void[] text;

        // size the array up front, then rewind it to empty
        text.length = 256;
        text.length = 0;

        void append (void[] chunk)
        {
                text ~= chunk;
        }

        produce (&append);
        return cast(char[]) text;
}
458
/***********************************************************************

        Write s to the Consumer, replacing every character whose
        map entry has none of the given flags with a lowercase
        %xx escape. Characters carrying one of the flags are passed
        through in runs, untouched.

        Returns the Consumer, so that calls can be chained.

***********************************************************************/

static Consumer encode (Consumer consume, char[] s, int flags)
{
        char[3] escape;
        int     start;

        escape[0] = '%';
        for (int pos = 0; pos < s.length; ++pos)
            {
            char c = s[pos];
            if (map[c] & flags)
                continue;

            // flush the run of acceptable characters before this one
            consume (s[start .. pos]);
            start = pos + 1;

            escape[1] = hexDigits [(c >> 4) & 0x0f];
            escape[2] = hexDigits [c & 0x0f];
            consume (escape);
            }

        // flush whatever follows the last escape
        if (start < s.length)
            consume (s[start .. s.length]);

        return consume;
}
491
/***********************************************************************

        Encode text into a newly built string, converting each
        character not permitted by flags into its %xx form. The
        result is assembled by appending, so it does not alias
        the input.

***********************************************************************/

final char[] encode (char[] text, int flags)
{
        void[] encoded;

        void collect (void[] chunk)
        {
                encoded ~= chunk;
        }

        encode (&collect, text, flags);
        return cast(char[]) encoded;
}
507
/***********************************************************************

        Decode a character string with potential %hex values in it.

        When s contains no '%' it is returned as is. Otherwise the
        decoded text is written into this instance's expanding
        buffer and a slice of that buffer is returned; reset()
        rewinds the buffer, so such a slice is only meaningful
        until the next reset/parse. Use decode() for a private
        copy.

        An escape whose value equals a non-zero 'ignore' is left
        in the output in its escaped form. A '%' that is not
        followed by two hex digits is copied through verbatim
        rather than being converted into an arbitrary byte.

***********************************************************************/

private char[] decoder (char[] s, char ignore=0)
{
        // value of a hex digit, or -1 when c is not a hex digit
        static int toInt (char c)
        {
                if (c >= '0' && c <= '9')
                    return c - '0';
                if (c >= 'a' && c <= 'f')
                    return c - 'a' + 10;
                if (c >= 'A' && c <= 'F')
                    return c - 'A' + 10;
                return -1;
        }

        int length = s.length;

        // take a peek first, to see if there's work to do
        if (length && memchr (s.ptr, '%', length))
           {
           char* p;
           int   j;

           // decoding never lengthens the text, so the input
           // length is sufficient space
           p = cast(char*) decoded.expand (length);

           // scan string, stripping % encodings as we go
           for (int i; i < length; ++i, ++j, ++p)
               {
               int c = s[i];

               if (c is '%' && (i+2) < length)
                  {
                  int hi = toInt (s[i+1]),
                      lo = toInt (s[i+2]);

                  if (hi >= 0 && lo >= 0)
                     {
                     int value = hi * 16 + lo;

                     // leave ignored escapes in the stream,
                     // permitting escaped '&' to remain in
                     // the query string
                     if (! (value && value is ignore))
                        {
                        c = value;
                        i += 2;
                        }
                     }
                  }

               *p = cast(char) c;
               }

           // return a slice from the decoded input
           return cast(char[]) decoded.slice (j);
           }

        // return original content
        return s;
}
570
/***********************************************************************

        Decode a string with potential %hex values in it, and
        return the result as a duplicate that is independent of
        the internal decode buffer

***********************************************************************/

final char[] decode (char[] s)
{
        char[] raw = decoder (s);
        return raw.dup;
}
581
/***********************************************************************

        Parsing is performed according to RFC 2396

        ---
        ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
         12            3  4          5       6  7        8 9

        2 isolates scheme
        4 isolates authority
        5 isolates path
        7 isolates query
        9 isolates fragment
        ---

        This was originally a state-machine; it turned out to be a
        lot faster (~40%) when unwound like this instead.

        All previous content is discarded first. When 'relative'
        is set and uri has neither scheme nor authority and does
        not start with '/', the text up to the last '/' of the
        previous path is prefixed to uri before the path is
        isolated.

        The scheme is stored in lowercase. Components that need no
        decoding are slices of uri, so uri should remain unchanged
        while this Uri is in use. Returns this instance.

***********************************************************************/

final Uri parse (char[] uri, bool relative = false)
{
        char    c;
        int     i,
                mark,
                len = uri.length;

        // reset clears the path, so capture it beforehand: it is
        // the base against which a relative uri is resolved
        char[]  base = path;

        reset;

        // isolate scheme (note that it's OK to not specify a scheme)
        for (i=0; i < len && !(map[c = uri[i]] & ExcScheme); ++i) {}
        if (c is ':')
           {
           scheme = uri [mark .. i];

           // lowercase a private copy, and only when required: the
           // slice aliases the caller's text, which should not be
           // rewritten (and may be a string literal)
           bool upper = false;
           foreach (char ch; scheme)
                    if (ch >= 'A' && ch <= 'Z')
                       {
                       upper = true;
                       break;
                       }
           if (upper)
              {
              scheme = scheme.dup;
              toLower (scheme);
              }
           mark = i + 1;
           }

        // isolate authority
        if (mark < len-1 && uri[mark] is '/' && uri[mark+1] is '/')
           {
           for (mark+=2, i=mark; i < len && !(map[uri[i]] & ExcAuthority); ++i) {}
           parseAuthority (uri[mark .. i]);
           mark = i;
           }
        else
           // 'mark is 0' means no scheme was consumed; prefixing
           // would otherwise shift the text that mark refers to
           if (relative && mark is 0 && len > 0 && uri[0] != '/')
              {
              // the concatenation copies the base before any
              // decoding can overwrite the buffer it may live in
              if (base.length)
                  uri = toLastSlash(base) ~ uri;
              query = fragment = null;
              len = uri.length;
              }

        // isolate path
        for (i=mark; i < len && !(map[uri[i]] & ExcPath); ++i) {}
        path = decoder (uri[mark .. i]);
        mark = i;

        // isolate query
        if (mark < len && uri[mark] is '?')
           {
           for (++mark, i=mark; i < len && uri[i] != '#'; ++i) {}
           query = decoder (uri[mark .. i], '&');
           mark = i;
           }

        // isolate fragment
        if (mark < len && uri[mark] is '#')
            fragment = decoder (uri[mark+1 .. len]);

        return this;
}
654
/***********************************************************************

        Return to the empty state: all components null, no port,
        and the decode buffer rewound to its start

***********************************************************************/

final void reset()
{
        scheme = userinfo = host = path = query = fragment = null;
        port = InvalidPort;
        decoded.reset;
}
667
/***********************************************************************

        Parse the given uri with the 'relative' option of parse()
        enabled

***********************************************************************/

final Uri relParse (char[] uri)
{
        bool relative = true;
        return parse (uri, relative);
}
678
/***********************************************************************

        Replace the scheme. The value is stored as given; returns
        this instance for chaining.

***********************************************************************/

final Uri setScheme (char[] scheme)
{
        this.scheme = scheme;
        return this;
}
690
/***********************************************************************

        Replace the host. The value is stored as given; returns
        this instance for chaining.

***********************************************************************/

final Uri setHost (char[] host)
{
        this.host = host;
        return this;
}
702
/***********************************************************************

        Replace the port. The value is stored as given; returns
        this instance for chaining.

***********************************************************************/

final Uri setPort (int port)
{
        this.port = port;
        return this;
}
714
/***********************************************************************

        Replace the userinfo. The value is stored as given;
        returns this instance for chaining.

***********************************************************************/

final Uri setUserInfo (char[] userinfo)
{
        this.userinfo = userinfo;
        return this;
}
726
/***********************************************************************

        Replace the query. The value is stored as given; returns
        this instance for chaining.

***********************************************************************/

final Uri setQuery (char[] query)
{
        this.query = query;
        return this;
}
738
/***********************************************************************

        Append tail to the query, separated by '&' when a query is
        already present. An empty tail leaves the query untouched.
        Returns the resulting query.

***********************************************************************/

final char[] extendQuery (char[] tail)
{
        if (tail.length is 0)
            return query;

        query = query.length ? (query ~ "&" ~ tail) : tail;
        return query;
}
754
/***********************************************************************

        Replace the path. The value is stored as given; returns
        this instance for chaining.

***********************************************************************/

final Uri setPath (char[] path)
{
        this.path = path;
        return this;
}
766
/***********************************************************************

        Replace the fragment. The value is stored as given;
        returns this instance for chaining.

***********************************************************************/

final Uri setFragment (char[] fragment)
{
        this.fragment = fragment;
        return this;
}
778
/***********************************************************************

        Authority is the section after the scheme, but before the
        path, query or fragment; it typically represents a host.

        ---
        ^(([^@]*)@?)([^:]*)?(:(.*))?
         12         3       4 5

        2 isolates userinfo
        3 isolates host
        5 isolates port
        ---

        The userinfo ends at the first '@'; the port begins after
        the first ':' that follows it. Only the components that
        are present get assigned. A ':' with nothing after it
        leaves the port unset instead of recording a numeric port
        for the empty text.

***********************************************************************/

private void parseAuthority (char[] auth)
{
        int     mark,
                len = auth.length;

        // get userinfo: (([^@]*)@?)
        foreach (int i, char c; auth)
                 if (c is '@')
                    {
                    userinfo = decoder (auth[0 .. i]);
                    mark = i + 1;
                    break;
                    }

        // get port: (:(.*))?
        for (int i=mark; i < len; ++i)
             if (auth [i] is ':')
                {
                // convert only when there is text after the ':'
                if (i + 1 < len)
                    port = Integer.atoi (auth [i+1 .. len]);

                // the host stops at the ':' either way
                len = i;
                break;
                }

        // get host: ([^:]*)?
        host = auth [mark..len];
}
821
/**********************************************************************

        Return the leading part of path up to and including its
        last '/'. A path without any '/' (including a null or
        empty one) is returned unchanged.

        The scan is index based so that a null path is handled
        without forming a pointer below the start of the array.

**********************************************************************/

private final char[] toLastSlash (char[] path)
{
        for (int i = path.length; i > 0; --i)
             if (path[i-1] is '/')
                 return path [0 .. i];
        return path;
}
833
/**********************************************************************

        Convert the ASCII letters 'A'..'Z' of src to lowercase,
        in place. All other characters are left alone. Returns
        src.

**********************************************************************/

private final static char[] toLower (inout char[] src)
{
        for (int i = 0; i < src.length; ++i)
            {
            char c = src[i];
            if (c >= 'A' && c <= 'Z')
                src[i] = cast(char) (c + ('a' - 'A'));
            }
        return src;
}
847 }
848
849
850 /*******************************************************************************
851
852 *******************************************************************************/
853
// A growable byte buffer handed out in consecutive slices: 'used'
// marks where the next slice starts, and reset() rewinds it to zero
// without releasing or clearing the storage.
private class HeapSlice
{
        private uint    used;
        private void[]  buffer;

        /***********************************************************************

                Allocate the initial storage, of the given size

        ***********************************************************************/

        this (uint size)
        {
                buffer = new void[size];
        }

        /***********************************************************************

                Rewind to the start of the buffer. Storage handed out
                earlier will be reused by subsequent slices.

        ***********************************************************************/

        final void reset ()
        {
                used = 0;
        }

        /***********************************************************************

                Make sure at least 'size' bytes are available beyond
                the current position, growing the buffer to twice the
                required total when they are not. Returns a pointer to
                the current position; the position itself is unchanged.

        ***********************************************************************/

        final void* expand (uint size)
        {
                uint needed = used + size;

                if (needed > buffer.length)
                    buffer.length = needed * 2;
                return &buffer [used];
        }

        /***********************************************************************

                Return 'size' bytes starting at the current position,
                and advance the position past them

        ***********************************************************************/

        final void[] slice (int size)
        {
                uint start = used;

                used = start + size;
                return buffer [start .. used];
        }
}
910
911
912
913 /*******************************************************************************
914
915 *******************************************************************************/
916
// Manual smoke test, compiled only with -debug=Uri: prints the parsed
// query, the whole Uri, and one encoded sample string.
debug (Uri)
{
        import tango.io.Console;

        void main()
        {
                char[] text = "http://foo.bar?a=1&b=2&c=3&d=%26%26&e=5";
                auto   uri  = new Uri (text);

                Cout (uri.getQuery).newline;
                Cout (uri).newline;

                Cout (uri.encode ("&#$%", uri.IncQuery)).newline;
        }
}