1# uri.tcl --
2#
3#	URI parsing and fetch
4#
5# Copyright (c) 2000 Zveno Pty Ltd
6# Copyright (c) 2006 Pierre DAVID <Pierre.David@crc.u-strasbg.fr>
7# Copyright (c) 2006 Andreas Kupries <andreas_kupries@users.sourceforge.net>
8# Steve Ball, http://www.zveno.com/
9# Derived from urls.tcl by Andreas Kupries
10#
11# TODO:
12#	Handle www-url-encoding details
13#
14# CVS: $Id: uri.tcl 6 2009-02-03 00:37:42Z jcw@equi4.com $
15
16package require Tcl 8.2
17
18namespace eval ::uri {
19
20    namespace export split join
21    namespace export resolve isrelative
22    namespace export geturl
23    namespace export canonicalize
24    namespace export register
25
26    variable file:counter 0
27
28    # extend these variable in the coming namespaces
29    variable schemes       {}
30    variable schemePattern ""
31    variable url           ""
32    variable url2part
33    array set url2part     {}
34
35    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
36    # basic regular expressions used in URL syntax.
37
38    namespace eval basic {
39	variable	loAlpha		{[a-z]}
40	variable	hiAlpha		{[A-Z]}
41	variable	digit		{[0-9]}
42	variable	alpha		{[a-zA-Z]}
43	variable	safe		{[$_.+-]}
44	variable	extra		{[!*'(,)]}
45	# danger in next pattern, order important for []
46	variable	national	{[][|\}\{\^~`]}
47	variable	punctuation	{[<>#%"]}	;#" fake emacs hilit
48	variable	reserved	{[;/?:@&=]}
49	variable	hex		{[0-9A-Fa-f]}
50	variable	alphaDigit	{[A-Za-z0-9]}
51	variable	alphaDigitMinus	{[A-Za-z0-9-]}
52
53	# next is <national | punctuation>
54	variable	unsafe		{[][<>"#%\{\}|\\^~`]} ;#" emacs hilit
55	variable	escape		"%${hex}${hex}"
56
57	#	unreserved	= alpha | digit | safe | extra
58	#	xchar		= unreserved | reserved | escape
59
60	variable	unreserved	{[a-zA-Z0-9$_.+!*'(,)-]}
61	variable	uChar		"(${unreserved}|${escape})"
62	variable	xCharN		{[a-zA-Z0-9$_.+!*'(,);/?:@&=-]}
63	variable	xChar		"(${xCharN}|${escape})"
64	variable	digits		"${digit}+"
65
66	variable	toplabel	\
67		"(${alpha}${alphaDigitMinus}*${alphaDigit}|${alpha})"
68	variable	domainlabel	\
69		"(${alphaDigit}${alphaDigitMinus}*${alphaDigit}|${alphaDigit})"
70
71	variable	hostname	\
72		"((${domainlabel}\\.)*${toplabel})"
73	variable	hostnumber	\
74		"(${digits}\\.${digits}\\.${digits}\\.${digits})"
75
76	variable	host		"(${hostname}|${hostnumber})"
77
78	variable	port		$digits
79	variable	hostOrPort	"${host}(:${port})?"
80
81	variable	usrCharN	{[a-zA-Z0-9$_.+!*'(,);?&=-]}
82	variable	usrChar		"(${usrCharN}|${escape})"
83	variable	user		"${usrChar}*"
84	variable	password	$user
85	variable	login		"(${user}(:${password})?@)?${hostOrPort}"
86    } ;# basic {}
87}
88
89
90# ::uri::register --
91#
92#	Register a scheme (and aliases) in the package. The command
93#	creates a namespace below "::uri" with the same name as the
94#	scheme and executes the script declaring the pattern variables
95#	for this scheme in the new namespace. At last it updates the
96#	uri variables keeping track of overall scheme information.
97#
98#	The script has to declare at least the variable "schemepart",
99#	the pattern for an url of the registered scheme after the
100#	scheme declaration. Not declaring this variable is an error.
101#
102# Arguments:
103#	schemeList	Name of the scheme to register, plus aliases
104#       script		Script declaring the scheme patterns
105#
106# Results:
107#	None.
108
109proc ::uri::register {schemeList script} {
110    variable schemes
111    variable schemePattern
112    variable url
113    variable url2part
114
115    # Check scheme and its aliases for existence.
116    foreach scheme $schemeList {
117	if {[lsearch -exact $schemes $scheme] >= 0} {
118	    return -code error \
119		    "trying to register scheme (\"$scheme\") which is already known"
120	}
121    }
122
123    # Get the main scheme
124    set scheme  [lindex $schemeList 0]
125
126    if {[catch {namespace eval $scheme $script} msg]} {
127	catch {namespace delete $scheme}
128	return -code error \
129	    "error while evaluating scheme script: $msg"
130    }
131
132    if {![info exists ${scheme}::schemepart]} {
133	namespace delete $scheme
134	return -code error \
135	    "Variable \"schemepart\" is missing."
136    }
137
138    # Now we can extend the variables which keep track of the registered schemes.
139
140    eval [linsert $schemeList 0 lappend schemes]
141    set schemePattern	"([::join $schemes |]):"
142
143    foreach s $schemeList {
144	# FRINK: nocheck
145	set url2part($s) "${s}:[set ${scheme}::schemepart]"
146	# FRINK: nocheck
147	append url "(${s}:[set ${scheme}::schemepart])|"
148    }
149    set url [string trimright $url |]
150    return
151}
152
153# ::uri::split --
154#
155#	Splits the given <a url> into its constituents.
156#
157# Arguments:
158#	url	the URL to split
159#
160# Results:
161#	Tcl list containing constituents, suitable for 'array set'.
162
163proc ::uri::split {url {defaultscheme http}} {
164
165    set url [string trim $url]
166    set scheme {}
167
168    # RFC 1738:	scheme = 1*[ lowalpha | digit | "+" | "-" | "." ]
169    regexp -- {^([a-z0-9+.-][a-z0-9+.-]*):} $url dummy scheme
170
171    if {$scheme == {}} {
172	set scheme $defaultscheme
173    }
174
175    # ease maintenance: dynamic dispatch, able to handle all schemes
176    # added in future!
177
178    if {[::info procs Split[string totitle $scheme]] == {}} {
179	error "unknown scheme '$scheme' in '$url'"
180    }
181
182    regsub -- "^${scheme}:" $url {} url
183
184    set       parts(scheme) $scheme
185    array set parts [Split[string totitle $scheme] $url]
186
187    # should decode all encoded characters!
188
189    return [array get parts]
190}
191
192proc ::uri::SplitFtp {url} {
193    # @c Splits the given ftp-<a url> into its constituents.
194    # @a url: The url to split, without! scheme specification.
195    # @r List containing the constituents, suitable for 'array set'.
196
197    # general syntax:
198    # //<user>:<password>@<host>:<port>/<cwd1>/.../<cwdN>/<name>;type=<typecode>
199    #
200    # additional rules:
201    #
202    # <user>:<password> are optional, detectable by presence of @.
203    # <password> is optional too.
204    #
205    # "//" [ <user> [":" <password> ] "@"] <host> [":" <port>] "/"
206    #	<cwd1> "/" ..."/" <cwdN> "/" <name> [";type=" <typecode>]
207
208    upvar \#0 [namespace current]::ftp::typepart ftptype
209
210    array set parts {user {} pwd {} host {} port {} path {} type {}}
211
212    # slash off possible type specification
213
214    if {[regexp -indices -- "${ftptype}$" $url dummy ftype]} {
215
216	set from	[lindex $ftype 0]
217	set to		[lindex $ftype 1]
218
219	set parts(type)	[string range   $url $from $to]
220
221	set from	[lindex $dummy 0]
222	set url		[string replace $url $from end]
223    }
224
225    # Handle user, password, host and port
226
227    if {[string match "//*" $url]} {
228	set url [string range $url 2 end]
229
230	array set parts [GetUPHP url]
231    }
232
233    set parts(path) [string trimleft $url /]
234
235    return [array get parts]
236}
237
238proc ::uri::JoinFtp args {
239    array set components {
240	user {} pwd {} host {} port {}
241	path {} type {}
242    }
243    array set components $args
244
245    set userPwd {}
246    if {[string length $components(user)] || [string length $components(pwd)]} {
247	set userPwd $components(user)[expr {[string length $components(pwd)] ? ":$components(pwd)" : {}}]@
248    }
249
250    set port {}
251    if {[string length $components(port)]} {
252	set port :$components(port)
253    }
254
255    set type {}
256    if {[string length $components(type)]} {
257	set type \;type=$components(type)
258    }
259
260    return ftp://${userPwd}$components(host)${port}/[string trimleft $components(path) /]$type
261}
262
263proc ::uri::SplitHttps {url} {
264    return [SplitHttp $url]
265}
266
267proc ::uri::SplitHttp {url} {
268    # @c Splits the given http-<a url> into its constituents.
269    # @a url: The url to split, without! scheme specification.
270    # @r List containing the constituents, suitable for 'array set'.
271
272    # general syntax:
273    # //<host>:<port>/<path>?<searchpart>
274    #
275    #   where <host> and <port> are as described in Section 3.1. If :<port>
276    #   is omitted, the port defaults to 80.  No user name or password is
277    #   allowed.  <path> is an HTTP selector, and <searchpart> is a query
278    #   string. The <path> is optional, as is the <searchpart> and its
279    #   preceding "?". If neither <path> nor <searchpart> is present, the "/"
280    #   may also be omitted.
281    #
282    #   Within the <path> and <searchpart> components, "/", ";", "?" are
283    #   reserved.  The "/" character may be used within HTTP to designate a
284    #   hierarchical structure.
285    #
286    # path == <cwd1> "/" ..."/" <cwdN> "/" <name> ["#" <fragment>]
287
288    upvar #0 [namespace current]::http::search  search
289    upvar #0 [namespace current]::http::segment segment
290
291    array set parts {host {} port {} path {} query {}}
292
293    set searchPattern   "\\?(${search})\$"
294    set fragmentPattern "#(${segment})\$"
295
296    # slash off possible query. the 'search' regexp, while official,
297    # is not good enough. We have apparently lots of urls in the wild
298    # which contain unquoted urls with queries in a query. The RE
299    # finds the embedded query, not the actual one. Using string first
300    # now instead of a RE
301
302    if {[set pos [string first ? $url]] >= 0} {
303	incr pos
304	set parts(query) [string range   $url $pos end]
305	incr pos -1
306	set url          [string replace $url $pos end]
307    }
308
309    # slash off possible fragment
310
311    if {[regexp -indices -- $fragmentPattern $url match fragment]} {
312	set from [lindex $fragment 0]
313	set to   [lindex $fragment 1]
314
315	set parts(fragment) [string range $url $from $to]
316
317	set url [string replace $url [lindex $match 0] end]
318    }
319
320    if {[string match "//*" $url]} {
321	set url [string range $url 2 end]
322
323	array set parts [GetUPHP url]
324    }
325
326    set parts(path) [string trimleft $url /]
327
328    return [array get parts]
329}
330
331proc ::uri::JoinHttp {args} {
332    return [eval [linsert $args 0 ::uri::JoinHttpInner http 80]]
333}
334
335proc ::uri::JoinHttps {args} {
336    return [eval [linsert $args 0 ::uri::JoinHttpInner https 443]]
337}
338
339proc ::uri::JoinHttpInner {scheme defport args} {
340    array set components {host {} path {} query {}}
341    set       components(port) $defport
342    array set components $args
343
344    set port {}
345    if {[string length $components(port)] && $components(port) != $defport} {
346	set port :$components(port)
347    }
348
349    set query {}
350    if {[string length $components(query)]} {
351	set query ?$components(query)
352    }
353
354    regsub -- {^/} $components(path) {} components(path)
355
356    if { [info exists components(fragment)] && $components(fragment) != "" } {
357	set components(fragment) "#$components(fragment)"
358    } else {
359	set components(fragment) ""
360    }
361
362    return $scheme://$components(host)$port/$components(path)$components(fragment)$query
363}
364
365proc ::uri::SplitFile {url} {
366    # @c Splits the given file-<a url> into its constituents.
367    # @a url: The url to split, without! scheme specification.
368    # @r List containing the constituents, suitable for 'array set'.
369
370    upvar #0 [namespace current]::basic::hostname	hostname
371    upvar #0 [namespace current]::basic::hostnumber	hostnumber
372
373    if {[string match "//*" $url]} {
374	set url [string range $url 2 end]
375
376	set hostPattern "^($hostname|$hostnumber)"
377	switch -exact -- $::tcl_platform(platform) {
378	    windows {
379		# Catch drive letter
380		append hostPattern :?
381	    }
382	    default {
383		# Proceed as usual
384	    }
385	}
386
387	if {[regexp -indices -- $hostPattern $url match host]} {
388	    set fh	[lindex $host 0]
389	    set th	[lindex $host 1]
390
391	    set parts(host)	[string range $url $fh $th]
392
393	    set  matchEnd   [lindex $match 1]
394	    incr matchEnd
395
396	    set url	[string range $url $matchEnd end]
397	}
398    }
399
400    set parts(path) $url
401
402    return [array get parts]
403}
404
405proc ::uri::JoinFile args {
406    array set components {
407	host {} port {} path {}
408    }
409    array set components $args
410
411    switch -exact -- $::tcl_platform(platform) {
412	windows {
413	    if {[string length $components(host)]} {
414		return file://$components(host):$components(path)
415	    } else {
416		return file://$components(path)
417	    }
418	}
419	default {
420	    return file://$components(host)$components(path)
421	}
422    }
423}
424
425proc ::uri::SplitMailto {url} {
426    # @c Splits the given mailto-<a url> into its constituents.
427    # @a url: The url to split, without! scheme specification.
428    # @r List containing the constituents, suitable for 'array set'.
429
430    if {[string match "*@*" $url]} {
431	set url [::split $url @]
432	return [list user [lindex $url 0] host [lindex $url 1]]
433    } else {
434	return [list user $url]
435    }
436}
437
438proc ::uri::JoinMailto args {
439    array set components {
440	user {} host {}
441    }
442    array set components $args
443
444    return mailto:$components(user)@$components(host)
445}
446
447proc ::uri::SplitNews {url} {
448    if { [string first @ $url] >= 0 } {
449	return [list message-id $url]
450    } else {
451	return [list newsgroup-name $url]
452    }
453}
454
455proc ::uri::JoinNews args {
456    array set components {
457	message-id {} newsgroup-name {}
458    }
459    array set components $args
460    return news:$components(message-id)$components(newsgroup-name)
461}
462
463proc ::uri::SplitLdaps {url} {
464    ::uri::SplitLdap $url
465}
466
467proc ::uri::SplitLdap {url} {
468    # @c Splits the given Ldap-<a url> into its constituents.
469    # @a url: The url to split, without! scheme specification.
470    # @r List containing the constituents, suitable for 'array set'.
471
472    # general syntax:
473    # //<host>:<port>/<dn>?<attrs>?<scope>?<filter>?<extensions>
474    #
475    #   where <host> and <port> are as described in Section 5 of RFC 1738.
476    #   No user name or password is allowed.
477    #   If omitted, the port defaults to 389 for ldap, 636 for ldaps
478    #   <dn> is the base DN for the search
479    #   <attrs> is a comma separated list of attributes description
480    #   <scope> is either "base", "one" or "sub".
481    #   <filter> is a RFC 2254 filter specification
482    #   <extensions> are documented in RFC 2255
483    #
484
485    array set parts {host {} port {} dn {} attrs {} scope {} filter {} extensions {}}
486
487    #          host        port           dn          attrs       scope               filter     extns
488    set re {//([^:?/]+)(?::([0-9]+))?(?:/([^?]+)(?:\?([^?]*)(?:\?(base|one|sub)?(?:\?([^?]*)(?:\?(.*))?)?)?)?)?}
489
490    if {! [regexp $re $url match parts(host) parts(port) \
491		parts(dn) parts(attrs) parts(scope) parts(filter) \
492		parts(extensions)]} then {
493	return -code error "unable to match URL \"$url\""
494    }
495
496    set parts(attrs) [::split $parts(attrs) ","]
497
498    return [array get parts]
499}
500
501proc ::uri::JoinLdap {args} {
502    return [eval [linsert $args 0 ::uri::JoinLdapInner ldap 389]]
503}
504
505proc ::uri::JoinLdaps {args} {
506    return [eval [linsert $args 0 ::uri::JoinLdapInner ldaps 636]]
507}
508
509proc ::uri::JoinLdapInner {scheme defport args} {
510    array set components {host {} port {} dn {} attrs {} scope {} filter {} extensions {}}
511    set       components(port) $defport
512    array set components $args
513
514    set port {}
515    if {[string length $components(port)] && $components(port) != $defport} {
516	set port :$components(port)
517    }
518
519    set url "$scheme://$components(host)$port"
520
521    set components(attrs) [::join $components(attrs) ","]
522
523    set s ""
524    foreach c {dn attrs scope filter extensions} {
525	if {[string equal $c "dn"]} then {
526	    append s "/"
527	} else {
528	    append s "?"
529	}
530	if {! [string equal $components($c) ""]} then {
531	    append url "${s}$components($c)"
532	    set s ""
533	}
534    }
535
536    return $url
537}
538
539proc ::uri::GetUPHP {urlvar} {
540    # @c Parse user, password host and port out of the url stored in
541    # @c variable <a urlvar>.
542    # @d Side effect: The extracted information is removed from the given url.
543    # @r List containing the extracted information in a format suitable for
544    # @r 'array set'.
545    # @a urlvar: Name of the variable containing the url to parse.
546
547    upvar \#0 [namespace current]::basic::user		user
548    upvar \#0 [namespace current]::basic::password	password
549    upvar \#0 [namespace current]::basic::hostname	hostname
550    upvar \#0 [namespace current]::basic::hostnumber	hostnumber
551    upvar \#0 [namespace current]::basic::port		port
552
553    upvar $urlvar url
554
555    array set parts {user {} pwd {} host {} port {}}
556
557    # syntax
558    # "//" [ <user> [":" <password> ] "@"] <host> [":" <port>] "/"
559    # "//" already cut off by caller
560
561    set upPattern "^(${user})(:(${password}))?@"
562
563    if {[regexp -indices -- $upPattern $url match theUser c d thePassword]} {
564	set fu	[lindex $theUser 0]
565	set tu	[lindex $theUser 1]
566
567	set fp	[lindex $thePassword 0]
568	set tp	[lindex $thePassword 1]
569
570	set parts(user)	[string range $url $fu $tu]
571	set parts(pwd)	[string range $url $fp $tp]
572
573	set  matchEnd   [lindex $match 1]
574	incr matchEnd
575
576	set url	[string range $url $matchEnd end]
577    }
578
579    set hpPattern "^($hostname|$hostnumber)(:($port))?"
580
581    if {[regexp -indices -- $hpPattern $url match theHost c d e f g h thePort]} {
582	set fh	[lindex $theHost 0]
583	set th	[lindex $theHost 1]
584
585	set fp	[lindex $thePort 0]
586	set tp	[lindex $thePort 1]
587
588	set parts(host)	[string range $url $fh $th]
589	set parts(port)	[string range $url $fp $tp]
590
591	set  matchEnd   [lindex $match 1]
592	incr matchEnd
593
594	set url	[string range $url $matchEnd end]
595    }
596
597    return [array get parts]
598}
599
600proc ::uri::GetHostPort {urlvar} {
601    # @c Parse host and port out of the url stored in variable <a urlvar>.
602    # @d Side effect: The extracted information is removed from the given url.
603    # @r List containing the extracted information in a format suitable for
604    # @r 'array set'.
605    # @a urlvar: Name of the variable containing the url to parse.
606
607    upvar #0 [namespace current]::basic::hostname	hostname
608    upvar #0 [namespace current]::basic::hostnumber	hostnumber
609    upvar #0 [namespace current]::basic::port		port
610
611    upvar $urlvar url
612
613    set pattern "^(${hostname}|${hostnumber})(:(${port}))?"
614
615    if {[regexp -indices -- $pattern $url match host c d e f g h thePort]} {
616	set fromHost	[lindex $host 0]
617	set toHost	[lindex $host 1]
618
619	set fromPort	[lindex $thePort 0]
620	set toPort	[lindex $thePort 1]
621
622	set parts(host)	[string range $url $fromHost $toHost]
623	set parts(port)	[string range $url $fromPort $toPort]
624
625	set  matchEnd   [lindex $match 1]
626	incr matchEnd
627
628	set url [string range $url $matchEnd end]
629    }
630
631    return [array get parts]
632}
633
634# ::uri::resolve --
635#
636#	Resolve an arbitrary URL, given a base URL
637#
638# Arguments:
639#	base	base URL (absolute)
640#	url	arbitrary URL
641#
642# Results:
643#	Returns a URL
644
645proc ::uri::resolve {base url} {
646    if {[string length $url]} {
647	if {[isrelative $url]} {
648
649	    array set baseparts [split $base]
650
651	    switch -- $baseparts(scheme) {
652		http -
653		https -
654		ftp -
655		file {
656		    array set relparts [split $url]
657		    if { [string match /* $url] } {
658			catch { set baseparts(path) $relparts(path) }
659		    } elseif { [string match */ $baseparts(path)] } {
660			set baseparts(path) "$baseparts(path)$relparts(path)"
661		    } else {
662			if { [string length $relparts(path)] > 0 } {
663			    set path [lreplace [::split $baseparts(path) /] end end]
664			    set baseparts(path) "[::join $path /]/$relparts(path)"
665			}
666		    }
667		    catch { set baseparts(query) $relparts(query) }
668		    catch { set baseparts(fragment) $relparts(fragment) }
669            return [eval [linsert [array get baseparts] 0 join]]
670		}
671		default {
672		    return -code error "unable to resolve relative URL \"$url\""
673		}
674	    }
675
676	} else {
677	    return $url
678	}
679    } else {
680	return $base
681    }
682}
683
684# ::uri::isrelative --
685#
686#	Determines whether a URL is absolute or relative
687#
688# Arguments:
689#	url	URL to check
690#
691# Results:
692#	Returns 1 if the URL is relative, 0 otherwise
693
694proc ::uri::isrelative url {
695    return [expr {![regexp -- {^[a-z0-9+-.][a-z0-9+-.]*:} $url]}]
696}
697
698# ::uri::geturl --
699#
700#	Fetch the data from an arbitrary URL.
701#
702#	This package provides a handler for the file:
703#	scheme, since this conflicts with the file command.
704#
705# Arguments:
706#	url	address of data resource
707#	args	configuration options
708#
709# Results:
710#	Depends on scheme
711
712proc ::uri::geturl {url args} {
713    array set urlparts [split $url]
714
715    switch -- $urlparts(scheme) {
716	file {
717        return [eval [linsert $args 0 file_geturl $url]]
718	}
719	default {
720	    # Load a geturl package for the scheme first and only if
721	    # that fails the scheme package itself. This prevents
722	    # cyclic dependencies between packages.
723	    if {[catch {package require $urlparts(scheme)::geturl}]} {
724		package require $urlparts(scheme)
725	    }
726        return [eval [linsert $args 0 $urlparts(scheme)::geturl $url]]
727	}
728    }
729}
730
731# ::uri::file_geturl --
732#
733#	geturl implementation for file: scheme
734#
735# TODO:
736#	This is an initial, basic implementation.
737#	Eventually want to support all options for geturl.
738#
739# Arguments:
740#	url	URL to fetch
741#	args	configuration options
742#
743# Results:
744#	Returns data from file
745
746proc ::uri::file_geturl {url args} {
747    variable file:counter
748
749    set var [namespace current]::file[incr file:counter]
750    upvar #0 $var state
751    array set state {data {}}
752
753    array set parts [split $url]
754
755    set ch [open $parts(path)]
756    # Could determine text/binary from file extension,
757    # except on Macintosh
758    # fconfigure $ch -translation binary
759    set state(data) [read $ch]
760    close $ch
761
762    return $var
763}
764
765# ::uri::join --
766#
767#	Format a URL
768#
769# Arguments:
770#	args	components, key-value format
771#
772# Results:
773#	A URL
774
775proc ::uri::join args {
776    array set components $args
777
778    return [eval [linsert $args 0 Join[string totitle $components(scheme)]]]
779}
780
781# ::uri::canonicalize --
782#
783#	Canonicalize a URL
784#
785# Acknowledgements:
786#	Andreas Kupries <andreas_kupries@users.sourceforge.net>
787#
788# Arguments:
789#	uri	URI (which contains a path component)
790#
791# Results:
792#	The canonical form of the URI
793
794proc ::uri::canonicalize uri {
795
796    # Make uri canonical with respect to dots (path changing commands)
797    #
798    # Remove single dots (.)  => pwd not changing
799    # Remove double dots (..) => gobble previous segment of path
800    #
801    # Fixes for this command:
802    #
803    # * Ignore any url which cannot be split into components by this
804    #   module. Just assume that such urls do not have a path to
805    #   canonicalize.
806    #
807    # * Ignore any url which could be split into components, but does
808    #   not have a path component.
809    #
810    # In the text above 'ignore' means
811    # 'return the url unchanged to the caller'.
812
813    if {[catch {array set u [::uri::split $uri]}]} {
814	return $uri
815    }
816    if {![info exists u(path)]} {
817	return $uri
818    }
819
820    set uri $u(path)
821
822    # Remove leading "./" "../" "/.." (and "/../")
823    regsub -all -- {^(\./)+}    $uri {}  uri
824    regsub -all -- {^/(\.\./)+} $uri {/} uri
825    regsub -all -- {^(\.\./)+}  $uri {}  uri
826
827    # Remove inner /./ and /../
828    while {[regsub -all -- {/\./}         $uri {/} uri]} {}
829    while {[regsub -all -- {/[^/]+/\.\./} $uri {/} uri]} {}
830    while {[regsub -all -- {^[^/]+/\.\./} $uri {}  uri]} {}
831    # Munge trailing /..
832    while {[regsub -all -- {/[^/]+/\.\.} $uri {/} uri]} {}
833    if { $uri == ".." } { set uri "/" }
834
835    set u(path) $uri
836    set uri [eval [linsert [array get u] 0 ::uri::join]]
837
838    return $uri
839}
840
841# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
842# regular expressions covering various url schemes
843
844# Currently known URL schemes:
845#
846# (RFC 1738)
847# ------------------------------------------------
848# scheme	basic syntax of scheme specific part
849# ------------------------------------------------
850# ftp		//<user>:<password>@<host>:<port>/<cwd1>/.../<cwdN>/<name>;type=<typecode>
851#
852# http		//<host>:<port>/<path>?<searchpart>
853#
854# gopher	//<host>:<port>/<gophertype><selector>
855#				<gophertype><selector>%09<search>
856#		<gophertype><selector>%09<search>%09<gopher+_string>
857#
858# mailto	<rfc822-addr-spec>
859# news		<newsgroup-name>
860#		<message-id>
861# nntp		//<host>:<port>/<newsgroup-name>/<article-number>
862# telnet	//<user>:<password>@<host>:<port>/
863# wais		//<host>:<port>/<database>
864#		//<host>:<port>/<database>?<search>
865#		//<host>:<port>/<database>/<wtype>/<wpath>
866# file		//<host>/<path>
867# prospero	//<host>:<port>/<hsoname>;<field>=<value>
868# ------------------------------------------------
869#
870# (RFC 2111)
871# ------------------------------------------------
872# scheme	basic syntax of scheme specific part
873# ------------------------------------------------
874# mid	message-id
875#		message-id/content-id
876# cid	content-id
877# ------------------------------------------------
878#
879# (RFC 2255)
880# ------------------------------------------------
881# scheme	basic syntax of scheme specific part
882# ------------------------------------------------
883# ldap		//<host>:<port>/<dn>?<attrs>?<scope>?<filter>?<extensions>
884# ------------------------------------------------
885
886# FTP
887uri::register ftp {
888    variable escape [set [namespace parent [namespace current]]::basic::escape]
889    variable login  [set [namespace parent [namespace current]]::basic::login]
890
891    variable	charN	{[a-zA-Z0-9$_.+!*'(,)?:@&=-]}
892    variable	char	"(${charN}|${escape})"
893    variable	segment	"${char}*"
894    variable	path	"${segment}(/${segment})*"
895
896    variable	type		{[AaDdIi]}
897    variable	typepart	";type=(${type})"
898    variable	schemepart	\
899		    "//${login}(/${path}(${typepart})?)?"
900
901    variable	url		"ftp:${schemepart}"
902}
903
904# FILE
905uri::register file {
906    variable	host [set [namespace parent [namespace current]]::basic::host]
907    variable	path [set [namespace parent [namespace current]]::ftp::path]
908
909    variable	schemepart	"//(${host}|localhost)?/${path}"
910    variable	url		"file:${schemepart}"
911}
912
913# HTTP
914uri::register http {
915    variable	escape \
916        [set [namespace parent [namespace current]]::basic::escape]
917    variable	hostOrPort	\
918        [set [namespace parent [namespace current]]::basic::hostOrPort]
919
920    variable	charN		{[a-zA-Z0-9$_.+!*'(,);:@&=-]}
921    variable	char		"($charN|${escape})"
922    variable	segment		"${char}*"
923
924    variable	path		"${segment}(/${segment})*"
925    variable	search		$segment
926    variable	schemepart	\
927	    "//${hostOrPort}(/${path}(\\?${search})?)?"
928
929    variable	url		"http:${schemepart}"
930}
931
932# GOPHER
933uri::register gopher {
934    variable	xChar \
935        [set [namespace parent [namespace current]]::basic::xChar]
936    variable	hostOrPort \
937        [set [namespace parent [namespace current]]::basic::hostOrPort]
938    variable	search \
939        [set [namespace parent [namespace current]]::http::search]
940
941    variable	type		$xChar
942    variable	selector	"$xChar*"
943    variable	string		$selector
944    variable	schemepart	\
945	    "//${hostOrPort}(/(${type}(${selector}(%09${search}(%09${string})?)?)?)?)?"
946    variable	url		"gopher:${schemepart}"
947}
948
949# MAILTO
950uri::register mailto {
951    variable xChar [set [namespace parent [namespace current]]::basic::xChar]
952    variable host  [set [namespace parent [namespace current]]::basic::host]
953
954    variable schemepart	"$xChar+(@${host})?"
955    variable url	"mailto:${schemepart}"
956}
957
958# NEWS
959uri::register news {
960    variable escape [set [namespace parent [namespace current]]::basic::escape]
961    variable alpha  [set [namespace parent [namespace current]]::basic::alpha]
962    variable host   [set [namespace parent [namespace current]]::basic::host]
963
964    variable	aCharN		{[a-zA-Z0-9$_.+!*'(,);/?:&=-]}
965    variable	aChar		"($aCharN|${escape})"
966    variable	gChar		{[a-zA-Z0-9$_.+-]}
967    variable	newsgroup-name	"${alpha}${gChar}*"
968    variable	message-id	"${aChar}+@${host}"
969    variable	schemepart	"\\*|${newsgroup-name}|${message-id}"
970    variable	url		"news:${schemepart}"
971}
972
973# WAIS
974uri::register wais {
975    variable	uChar \
976        [set [namespace parent [namespace current]]::basic::xChar]
977    variable	hostOrPort \
978        [set [namespace parent [namespace current]]::basic::hostOrPort]
979    variable	search \
980        [set [namespace parent [namespace current]]::http::search]
981
982    variable	db		"${uChar}*"
983    variable	type		"${uChar}*"
984    variable	path		"${uChar}*"
985
986    variable	database	"//${hostOrPort}/${db}"
987    variable	index		"//${hostOrPort}/${db}\\?${search}"
988    variable	doc		"//${hostOrPort}/${db}/${type}/${path}"
989
990    #variable	schemepart	"${doc}|${index}|${database}"
991
992    variable	schemepart \
993	    "//${hostOrPort}/${db}((\\?${search})|(/${type}/${path}))?"
994
995    variable	url		"wais:${schemepart}"
996}
997
998# PROSPERO
999uri::register prospero {
1000    variable	escape \
1001        [set [namespace parent [namespace current]]::basic::escape]
1002    variable	hostOrPort \
1003        [set [namespace parent [namespace current]]::basic::hostOrPort]
1004    variable	path \
1005        [set [namespace parent [namespace current]]::ftp::path]
1006
1007    variable	charN		{[a-zA-Z0-9$_.+!*'(,)?:@&-]}
1008    variable	char		"(${charN}|$escape)"
1009
1010    variable	fieldname	"${char}*"
1011    variable	fieldvalue	"${char}*"
1012    variable	fieldspec	";${fieldname}=${fieldvalue}"
1013
1014    variable	schemepart	"//${hostOrPort}/${path}(${fieldspec})*"
1015    variable	url		"prospero:$schemepart"
1016}
1017
1018# LDAP
1019uri::register ldap {
1020    variable	hostOrPort \
1021        [set [namespace parent [namespace current]]::basic::hostOrPort]
1022
1023    # very crude parsing
1024    variable	dn		{[^?]*}
1025    variable	attrs		{[^?]*}
1026    variable	scope		"base|one|sub"
1027    variable	filter		{[^?]*}
1028    # extensions are not handled yet
1029
1030    variable	schemepart	"//${hostOrPort}(/${dn}(\?${attrs}(\?(${scope})(\?${filter})?)?)?)?"
1031    variable	url		"ldap:$schemepart"
1032}
1033
1034package provide uri 1.2.1
1035