1# 2# $Id: Escape.pm,v 3.26 2004/04/13 15:17:27 gisle Exp $ 3# 4 5package URI::Escape; 6use strict; 7 8=head1 NAME 9 10URI::Escape - Escape and unescape unsafe characters 11 12=head1 SYNOPSIS 13 14 use URI::Escape; 15 $safe = uri_escape("10% is enough\n"); 16 $verysafe = uri_escape("foo", "\0-\377"); 17 $str = uri_unescape($safe); 18 19=head1 DESCRIPTION 20 21This module provides functions to escape and unescape URI strings as 22defined by RFC 2396 (and updated by RFC 2732). 23A URI consists of a restricted set of characters, 24denoted as C<uric> in RFC 2396. The restricted set of characters 25consists of digits, letters, and a few graphic symbols chosen from 26those common to most of the character encodings and input facilities 27available to Internet users: 28 29 "A" .. "Z", "a" .. "z", "0" .. "9", 30 ";", "/", "?", ":", "@", "&", "=", "+", "$", ",", "[", "]", # reserved 31 "-", "_", ".", "!", "~", "*", "'", "(", ")" 32 33In addition, any byte (octet) can be represented in a URI by an escape 34sequence: a triplet consisting of the character "%" followed by two 35hexadecimal digits. A byte can also be represented directly by a 36character, using the US-ASCII character for that octet (iff the 37character is part of C<uric>). 38 39Some of the C<uric> characters are I<reserved> for use as delimiters 40or as part of certain URI components. These must be escaped if they are 41to be treated as ordinary data. Read RFC 2396 for further details. 42 43The functions provided (and exported by default) from this module are: 44 45=over 4 46 47=item uri_escape( $string ) 48 49=item uri_escape( $string, $unsafe ) 50 51Replaces each unsafe character in the $string with the corresponding 52escape sequence and returns the result. The $string argument should 53be a string of bytes. The uri_escape() function will croak if given a 54characters with code above 255. Use uri_escape_utf8() if you know you 55have such chars or/and want chars in the 128 .. 255 range treated as 56UTF-8. 57 58The uri_escape() function takes an optional second argument that 59overrides the set of characters that are to be escaped. The set is 60specified as a string that can be used in a regular expression 61character class (between [ ]). E.g.: 62 63 "\x00-\x1f\x7f-\xff" # all control and hi-bit characters 64 "a-z" # all lower case characters 65 "^A-Za-z" # everything not a letter 66 67The default set of characters to be escaped is all those which are 68I<not> part of the C<uric> character class shown above as well as the 69reserved characters. I.e. the default is: 70 71 "^A-Za-z0-9\-_.!~*'()" 72 73=item uri_escape_utf8( $string ) 74 75=item uri_escape_utf8( $string, $unsafe ) 76 77Works like uri_escape(), but will encode chars as UTF-8 before 78escaping them. This makes this function able do deal with characters 79with code above 255 in $string. Note that chars in the 128 .. 255 80range will be escaped differently by this function compared to what 81uri_escape() would. For chars in the 0 .. 127 range there is no 82difference. 83 84The call: 85 86 $uri = uri_escape_utf8($string); 87 88will be the same as: 89 90 use Encode qw(encode); 91 $uri = uri_escape(encode("UTF-8", $string)); 92 93but will even work for perl-5.6 for chars in the 128 .. 255 range. 94 95=item uri_unescape($string,...) 96 97Returns a string with each %XX sequence replaced with the actual byte 98(octet). 99 100This does the same as: 101 102 $string =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; 103 104but does not modify the string in-place as this RE would. Using the 105uri_unescape() function instead of the RE might make the code look 106cleaner and is a few characters less to type. 107 108In a simple benchmark test I did, 109calling the function (instead of the inline RE above) if a few chars 110were unescaped was something like 40% slower, and something like 700% slower if none were. If 111you are going to unescape a lot of times it might be a good idea to 112inline the RE. 113 114If the uri_unescape() function is passed multiple strings, then each 115one is returned unescaped. 116 117=back 118 119The module can also export the C<%escapes> hash, which contains the 120mapping from all 256 bytes to the corresponding escape codes. Lookup 121in this hash is faster than evaluating C<sprintf("%%%02X", ord($byte))> 122each time. 123 124=head1 SEE ALSO 125 126L<URI> 127 128 129=head1 COPYRIGHT 130 131Copyright 1995-2004 Gisle Aas. 132 133This program is free software; you can redistribute it and/or modify 134it under the same terms as Perl itself. 135 136=cut 137 138use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); 139use vars qw(%escapes); 140 141require Exporter; 142@ISA = qw(Exporter); 143@EXPORT = qw(uri_escape uri_unescape); 144@EXPORT_OK = qw(%escapes uri_escape_utf8); 145$VERSION = sprintf("%d.%02d", q$Revision: 3.26 $ =~ /(\d+)\.(\d+)/); 146 147use Carp (); 148 149# Build a char->hex map 150for (0..255) { 151 $escapes{chr($_)} = sprintf("%%%02X", $_); 152} 153 154my %subst; # compiled patternes 155 156sub uri_escape 157{ 158 my($text, $patn) = @_; 159 return undef unless defined $text; 160 if (defined $patn){ 161 unless (exists $subst{$patn}) { 162 # Because we can't compile the regex we fake it with a cached sub 163 (my $tmp = $patn) =~ s,/,\\/,g; 164 eval "\$subst{\$patn} = sub {\$_[0] =~ s/([$tmp])/\$escapes{\$1} || _fail_hi(\$1)/ge; }"; 165 Carp::croak("uri_escape: $@") if $@; 166 } 167 &{$subst{$patn}}($text); 168 } else { 169 # Default unsafe characters. RFC 2732 ^(uric - reserved) 170 $text =~ s/([^A-Za-z0-9\-_.!~*'()])/$escapes{$1} || _fail_hi($1)/ge; 171 } 172 $text; 173} 174 175sub _fail_hi { 176 my $chr = shift; 177 Carp::croak(sprintf "Can't escape \\x{%04X}, try uri_escape_utf8() instead", ord($chr)); 178} 179 180sub uri_escape_utf8 181{ 182 if ($] < 5.008) { 183 my $text = shift; 184 $text =~ s/([^\0-\x7F])/do {my $o = ord($1); sprintf("%c%c", 0xc0 | ($o >> 6), 0x80 | ($o & 0x3f)) }/ge; 185 return uri_escape($text, @_); 186 } 187 188 require Encode; 189 return uri_escape(Encode::encode_utf8(shift), @_); 190} 191 192sub uri_unescape 193{ 194 # Note from RFC1630: "Sequences which start with a percent sign 195 # but are not followed by two hexadecimal characters are reserved 196 # for future extension" 197 my $str = shift; 198 if (@_ && wantarray) { 199 # not executed for the common case of a single argument 200 my @str = ($str, @_); # need to copy 201 foreach (@str) { 202 s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; 203 } 204 return @str; 205 } 206 $str =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg if defined $str; 207 $str; 208} 209 2101; 211