1#
2# $Id: Escape.pm,v 3.26 2004/04/13 15:17:27 gisle Exp $
3#
4
5package URI::Escape;
6use strict;
7
8=head1 NAME
9
10URI::Escape - Escape and unescape unsafe characters
11
12=head1 SYNOPSIS
13
14 use URI::Escape;
15 $safe = uri_escape("10% is enough\n");
16 $verysafe = uri_escape("foo", "\0-\377");
17 $str  = uri_unescape($safe);
18
19=head1 DESCRIPTION
20
21This module provides functions to escape and unescape URI strings as
22defined by RFC 2396 (and updated by RFC 2732).
23A URI consists of a restricted set of characters,
24denoted as C<uric> in RFC 2396.  The restricted set of characters
25consists of digits, letters, and a few graphic symbols chosen from
26those common to most of the character encodings and input facilities
27available to Internet users:
28
29  "A" .. "Z", "a" .. "z", "0" .. "9",
30  ";", "/", "?", ":", "@", "&", "=", "+", "$", ",", "[", "]",   # reserved
31  "-", "_", ".", "!", "~", "*", "'", "(", ")"
32
33In addition, any byte (octet) can be represented in a URI by an escape
34sequence: a triplet consisting of the character "%" followed by two
35hexadecimal digits.  A byte can also be represented directly by a
36character, using the US-ASCII character for that octet (iff the
37character is part of C<uric>).
38
39Some of the C<uric> characters are I<reserved> for use as delimiters
40or as part of certain URI components.  These must be escaped if they are
41to be treated as ordinary data.  Read RFC 2396 for further details.
42
43The functions provided (and exported by default) from this module are:
44
45=over 4
46
47=item uri_escape( $string )
48
49=item uri_escape( $string, $unsafe )
50
51Replaces each unsafe character in the $string with the corresponding
52escape sequence and returns the result.  The $string argument should
53be a string of bytes.  The uri_escape() function will croak if given a
54characters with code above 255.  Use uri_escape_utf8() if you know you
55have such chars or/and want chars in the 128 .. 255 range treated as
56UTF-8.
57
58The uri_escape() function takes an optional second argument that
59overrides the set of characters that are to be escaped.  The set is
60specified as a string that can be used in a regular expression
61character class (between [ ]).  E.g.:
62
63  "\x00-\x1f\x7f-\xff"          # all control and hi-bit characters
64  "a-z"                         # all lower case characters
65  "^A-Za-z"                     # everything not a letter
66
67The default set of characters to be escaped is all those which are
68I<not> part of the C<uric> character class shown above as well as the
69reserved characters.  I.e. the default is:
70
71  "^A-Za-z0-9\-_.!~*'()"
72
73=item uri_escape_utf8( $string )
74
75=item uri_escape_utf8( $string, $unsafe )
76
77Works like uri_escape(), but will encode chars as UTF-8 before
78escaping them.  This makes this function able do deal with characters
79with code above 255 in $string.  Note that chars in the 128 .. 255
80range will be escaped differently by this function compared to what
81uri_escape() would.  For chars in the 0 .. 127 range there is no
82difference.
83
84The call:
85
86    $uri = uri_escape_utf8($string);
87
88will be the same as:
89
90    use Encode qw(encode);
91    $uri = uri_escape(encode("UTF-8", $string));
92
93but will even work for perl-5.6 for chars in the 128 .. 255 range.
94
95=item uri_unescape($string,...)
96
97Returns a string with each %XX sequence replaced with the actual byte
98(octet).
99
100This does the same as:
101
102   $string =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
103
104but does not modify the string in-place as this RE would.  Using the
105uri_unescape() function instead of the RE might make the code look
106cleaner and is a few characters less to type.
107
108In a simple benchmark test I did,
109calling the function (instead of the inline RE above) if a few chars
110were unescaped was something like 40% slower, and something like 700% slower if none were.  If
111you are going to unescape a lot of times it might be a good idea to
112inline the RE.
113
114If the uri_unescape() function is passed multiple strings, then each
115one is returned unescaped.
116
117=back
118
119The module can also export the C<%escapes> hash, which contains the
120mapping from all 256 bytes to the corresponding escape codes.  Lookup
121in this hash is faster than evaluating C<sprintf("%%%02X", ord($byte))>
122each time.
123
124=head1 SEE ALSO
125
126L<URI>
127
128
129=head1 COPYRIGHT
130
131Copyright 1995-2004 Gisle Aas.
132
133This program is free software; you can redistribute it and/or modify
134it under the same terms as Perl itself.
135
136=cut
137
138use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
139use vars qw(%escapes);
140
141require Exporter;
142@ISA = qw(Exporter);
143@EXPORT = qw(uri_escape uri_unescape);
144@EXPORT_OK = qw(%escapes uri_escape_utf8);
145$VERSION = sprintf("%d.%02d", q$Revision: 3.26 $ =~ /(\d+)\.(\d+)/);
146
147use Carp ();
148
149# Build a char->hex map
150for (0..255) {
151    $escapes{chr($_)} = sprintf("%%%02X", $_);
152}
153
154my %subst;  # compiled patternes
155
156sub uri_escape
157{
158    my($text, $patn) = @_;
159    return undef unless defined $text;
160    if (defined $patn){
161	unless (exists  $subst{$patn}) {
162	    # Because we can't compile the regex we fake it with a cached sub
163	    (my $tmp = $patn) =~ s,/,\\/,g;
164	    eval "\$subst{\$patn} = sub {\$_[0] =~ s/([$tmp])/\$escapes{\$1} || _fail_hi(\$1)/ge; }";
165	    Carp::croak("uri_escape: $@") if $@;
166	}
167	&{$subst{$patn}}($text);
168    } else {
169	# Default unsafe characters.  RFC 2732 ^(uric - reserved)
170	$text =~ s/([^A-Za-z0-9\-_.!~*'()])/$escapes{$1} || _fail_hi($1)/ge;
171    }
172    $text;
173}
174
175sub _fail_hi {
176    my $chr = shift;
177    Carp::croak(sprintf "Can't escape \\x{%04X}, try uri_escape_utf8() instead", ord($chr));
178}
179
180sub uri_escape_utf8
181{
182    if ($] < 5.008) {
183	my $text = shift;
184	$text =~ s/([^\0-\x7F])/do {my $o = ord($1); sprintf("%c%c", 0xc0 | ($o >> 6), 0x80 | ($o & 0x3f)) }/ge;
185	return uri_escape($text, @_);
186    }
187
188    require Encode;
189    return uri_escape(Encode::encode_utf8(shift), @_);
190}
191
192sub uri_unescape
193{
194    # Note from RFC1630:  "Sequences which start with a percent sign
195    # but are not followed by two hexadecimal characters are reserved
196    # for future extension"
197    my $str = shift;
198    if (@_ && wantarray) {
199	# not executed for the common case of a single argument
200	my @str = ($str, @_);  # need to copy
201	foreach (@str) {
202	    s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
203	}
204	return @str;
205    }
206    $str =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg if defined $str;
207    $str;
208}
209
2101;
211