1package URI::Heuristic;
2
3# $Id: Heuristic.pm,v 4.17 2004/01/14 13:33:44 gisle Exp $
4
5=head1 NAME
6
7URI::Heuristic - Expand URI using heuristics
8
9=head1 SYNOPSIS
10
11 use URI::Heuristic qw(uf_uristr);
12 $u = uf_uristr("perl");             # http://www.perl.com
13 $u = uf_uristr("www.sol.no/sol");   # http://www.sol.no/sol
14 $u = uf_uristr("aas");              # http://www.aas.no
15 $u = uf_uristr("ftp.funet.fi");     # ftp://ftp.funet.fi
16 $u = uf_uristr("/etc/passwd");      # file:/etc/passwd
17
18=head1 DESCRIPTION
19
20This module provides functions that expand strings into real absolute
21URIs using some built-in heuristics.  Strings that already represent
22absolute URIs (i.e. that start with a C<scheme:> part) are never modified
23and are returned unchanged.  The main use of these functions is to
24allow abbreviated URIs similar to what many web browsers allow for URIs
25typed in by the user.
26
27The following functions are provided:
28
29=over 4
30
31=item uf_uristr($str)
32
33Tries to make the argument string
34into a proper absolute URI string.  The "uf_" prefix stands for "User
35Friendly".  Under MacOS, it assumes that any string with a common URL
36scheme (http, ftp, etc.) is a URL rather than a local path.  So don't name
37your volumes after common URL schemes and expect uf_uristr() to construct
38valid file: URL's on those volumes for you, because it won't.
39
40=item uf_uri($str)
41
42Works the same way as uf_uristr() but
43returns a C<URI> object.
44
45=back
46
47=head1 ENVIRONMENT
48
49If the hostname portion of a URI does not contain any dots, then
50certain qualified guesses are made.  These guesses are governed by
51the following two environment variables:
52
53=over 10
54
55=item COUNTRY
56
57The two-letter country code (ISO 3166) for your location.  If
58the domain name of your host ends with two letters, then it is taken
59to be the default country. See also L<Locale::Country>.
60
61=item URL_GUESS_PATTERN
62
63Contains a space-separated list of URL patterns to try.  The string
64"ACME" is for some reason used as a placeholder for the host name in
65the URL provided.  Example:
66
67 URL_GUESS_PATTERN="www.ACME.no www.ACME.se www.ACME.com"
68 export URL_GUESS_PATTERN
69
70Specifying URL_GUESS_PATTERN disables any guessing rules based on
71country.  An empty URL_GUESS_PATTERN disables any guessing that
72involves host name lookups.
73
74=back
75
76=head1 COPYRIGHT
77
78Copyright 1997-1998, Gisle Aas
79
80This library is free software; you can redistribute it and/or
81modify it under the same terms as Perl itself.
82
83=cut
84
85use strict;
86
87use vars qw(@EXPORT_OK $VERSION $MY_COUNTRY %LOCAL_GUESSING $DEBUG);
88
89require Exporter;
90*import = \&Exporter::import;
91@EXPORT_OK = qw(uf_uri uf_uristr uf_url uf_urlstr);
92$VERSION = sprintf("%d.%02d", q$Revision: 4.17 $ =~ /(\d+)\.(\d+)/);
93
94sub MY_COUNTRY() {
95    for ($MY_COUNTRY) {
96	return $_ if defined;
97
98	# First try the environment.
99	$_ = $ENV{COUNTRY};
100	return $_ if defined;
101
102	# Could use LANG, LC_ALL, etc at this point, but probably too
103	# much of a wild guess.  (Catalan != Canada, etc.)
104	#
105
106	# Last bit of domain name.  This may access the network.
107	require Net::Domain;
108	my $fqdn = Net::Domain::hostfqdn();
109	$_ = lc($1) if $fqdn =~ /\.([a-zA-Z]{2})$/;
110	return $_ if defined;
111
112	# Give up.  Defined but false.
113	return ($_ = 0);
114    }
115}
116
117%LOCAL_GUESSING =
118(
119 'us' => [qw(www.ACME.gov www.ACME.mil)],
120 'uk' => [qw(www.ACME.co.uk www.ACME.org.uk www.ACME.ac.uk)],
121 'au' => [qw(www.ACME.com.au www.ACME.org.au www.ACME.edu.au)],
122 'il' => [qw(www.ACME.co.il www.ACME.org.il www.ACME.net.il)],
123 # send corrections and new entries to <gisle@aas.no>
124);
125
126
127sub uf_uristr ($)
128{
129    local($_) = @_;
130    print STDERR "uf_uristr: resolving $_\n" if $DEBUG;
131    return unless defined;
132
133    s/^\s+//;
134    s/\s+$//;
135
136    if (/^(www|web|home)\./) {
137	$_ = "http://$_";
138
139    } elsif (/^(ftp|gopher|news|wais|http|https)\./) {
140	$_ = "$1://$_";
141
142    } elsif ($^O ne "MacOS" &&
143	    (m,^/,      ||          # absolute file name
144	     m,^\.\.?/, ||          # relative file name
145	     m,^[a-zA-Z]:[/\\],)    # dosish file name
146	    )
147    {
148	$_ = "file:$_";
149
150    } elsif ($^O eq "MacOS" && m/:/) {
151        # potential MacOS file name
152	unless (m/^(ftp|gopher|news|wais|http|https|mailto):/) {
153	    require URI::file;
154	    my $a = URI::file->new($_)->as_string;
155	    $_ = ($a =~ m/^file:/) ? $a : "file:$a";
156	}
157    } elsif (/^\w+([\.\-]\w+)*\@(\w+\.)+\w{2,3}$/) {
158	$_ = "mailto:$_";
159
160    } elsif (!/^[a-zA-Z][a-zA-Z0-9.+\-]*:/) {      # no scheme specified
161	if (s/^([-\w]+(?:\.[-\w]+)*)([\/:\?\#]|$)/$2/) {
162	    my $host = $1;
163
164	    if ($host !~ /\./ && $host ne "localhost") {
165		my @guess;
166		if (exists $ENV{URL_GUESS_PATTERN}) {
167		    @guess = map { s/\bACME\b/$host/; $_ }
168		             split(' ', $ENV{URL_GUESS_PATTERN});
169		} else {
170		    if (MY_COUNTRY()) {
171			my $special = $LOCAL_GUESSING{MY_COUNTRY()};
172			if ($special) {
173			    my @special = @$special;
174			    push(@guess, map { s/\bACME\b/$host/; $_ }
175                                               @special);
176			} else {
177			    push(@guess, 'www.$host.' . MY_COUNTRY());
178			}
179		    }
180		    push(@guess, map "www.$host.$_",
181			             "com", "org", "net", "edu", "int");
182		}
183
184
185		my $guess;
186		for $guess (@guess) {
187		    print STDERR "uf_uristr: gethostbyname('$guess.')..."
188		      if $DEBUG;
189		    if (gethostbyname("$guess.")) {
190			print STDERR "yes\n" if $DEBUG;
191			$host = $guess;
192			last;
193		    }
194		    print STDERR "no\n" if $DEBUG;
195		}
196	    }
197	    $_ = "http://$host$_";
198
199	} else {
200	    # pure junk, just return it unchanged...
201
202	}
203    }
204    print STDERR "uf_uristr: ==> $_\n" if $DEBUG;
205
206    $_;
207}
208
209sub uf_uri ($)
210{
211    require URI;
212    URI->new(uf_uristr($_[0]));
213}
214
215# legacy
216*uf_urlstr = \*uf_uristr;
217
218sub uf_url ($)
219{
220    require URI::URL;
221    URI::URL->new(uf_uristr($_[0]));
222}
223
2241;
225