1package HTTP::Proxy::BodyFilter::save; 2 3use strict; 4use HTTP::Proxy; 5use HTTP::Proxy::BodyFilter; 6use vars qw( @ISA ); 7@ISA = qw( HTTP::Proxy::BodyFilter ); 8use Fcntl; 9use File::Spec; 10use File::Path; 11use Carp; 12 13sub init { 14 my $self = shift; 15 16 # options 17 my %args = ( 18 template => File::Spec->catfile( '%h', '%P' ), 19 no_host => 0, 20 no_dirs => 0, 21 cut_dirs => 0, 22 prefix => '', 23 filename => undef, 24 multiple => 1, 25 keep_old => 0, # no_clobber in wget parlance 26 timestamp => 0, 27 status => [ 200 ], 28 @_ 29 ); 30 # keep_old and timestamp can't be selected together 31 croak "Can't timestamp and keep older files at the same time" 32 if $args{keep_old} && $args{timestamp}; 33 croak "status must be an array reference" 34 unless ref($args{status}) eq 'ARRAY'; 35 croak "status must contain only HTTP codes" 36 if grep { !/^[12345]\d\d$/ } @{ $args{status} }; 37 croak "filename must be a code reference" 38 if defined $args{filename} && !UNIVERSAL::isa( $args{filename}, 'CODE' ); 39 40 $self->{"_hpbf_save_filename_code"} = $args{filename}; 41 $self->{"_hpbf_save_$_"} = $args{$_} 42 for qw( template no_host no_dirs cut_dirs prefix 43 multiple keep_old timestamp status ); 44} 45 46sub begin { 47 my ( $self, $message ) = @_; 48 49 # internal data initialisation 50 delete @{$self}{qw( _hpbf_save_filename _hpbf_save_fh )}; 51 52 my $uri = $message->isa( 'HTTP::Request' ) 53 ? $message->uri : $message->request->uri; 54 55 # save only the accepted status codes 56 if( $message->isa( 'HTTP::Response' ) ) { 57 my $code = $message->code; 58 return unless grep { $code eq $_ } @{ $self->{_hpbf_save_status} }; 59 } 60 61 my $file = ''; 62 if( defined $self->{_hpbf_save_filename_code} ) { 63 # use the user-provided callback 64 $file = $self->{_hpbf_save_filename_code}->($message); 65 unless ( defined $file and $file ne '' ) { 66 $self->proxy->log( HTTP::Proxy::FILTERS, "HTBF::save", 67 "Filter will not save $uri" ); 68 return; 69 } 70 } 71 else { 72 # set the template variables from the URI 73 my @segs = $uri->path_segments; # starts with an empty string 74 shift @segs; 75 splice(@segs, 0, $self->{_hpbf_save_cut_dirs} >= @segs 76 ? @segs - 1 : $self->{_hpbf_save_cut_dirs} ); 77 my %vars = ( 78 '%' => '%', 79 h => $self->{_hpbf_save_no_host} ? '' : $uri->host, 80 f => $segs[-1] || 'index.html', # same default as wget 81 p => $self->{_hpbf_save_no_dirs} ? $segs[-1] || 'index.html' 82 : File::Spec->catfile(@segs), 83 q => $uri->query, 84 ); 85 pop @segs; 86 $vars{d} 87 = $self->{_hpbf_save_no_dirs} ? '' 88 : @segs ? File::Spec->catfile(@segs) 89 : ''; 90 $vars{P} = $vars{p} . ( $vars{q} ? "?$vars{q}" : '' ); 91 92 # create the filename 93 $file = File::Spec->catfile( $self->{_hpbf_save_prefix} || (), 94 $self->{_hpbf_save_template} ); 95 $file =~ s/%(.)/$vars{$1}/g; 96 } 97 $file = File::Spec->rel2abs( $file ); 98 99 # create the directory 100 my $dir = File::Spec->catpath( (File::Spec->splitpath($file))[ 0, 1 ], '' ); 101 if( ! -e $dir ) { 102 eval { mkpath( $dir ) }; 103 if ($@) { 104 $self->proxy->log( HTTP::Proxy::ERROR, "HTBF::save", 105 "Unable to create directory $dir" ); 106 return; 107 } 108 $self->proxy->log( HTTP::Proxy::FILTERS, "HTBF::save", 109 "Created directory $dir" ); 110 } 111 112 # keep old file? 113 if ( -e $file ) { 114 if ( $self->{_hpbf_save_timestamp} ) { 115 # FIXME timestamp 116 } 117 elsif ( $self->{_hpbf_save_keep_old} ) { 118 $self->proxy->log( HTTP::Proxy::FILTERS, "HPBF::save", 119 "Skip saving $uri" ); 120 delete $self->{_hpbf_save_fh}; # it's a closed filehandle 121 return; 122 } 123 } 124 125 # open and lock the file 126 my ( $ext, $n, $i ) = ( "", 0 ); 127 my $flags = O_WRONLY | O_EXCL | O_CREAT; 128 while( ! sysopen( $self->{_hpbf_save_fh}, "$file$ext", $flags ) ) { 129 $self->proxy->log( HTTP::Proxy::ERROR, "HPBF::save", 130 "Too many errors opening $file$ext" ), return 131 if $i++ - $n == 10; # should be ok now 132 if( $self->{_hpbf_save_multiple} ) { 133 $ext = "." . ++$n while -e $file.$ext; 134 next; 135 } 136 else { 137 $flags = O_WRONLY | O_CREAT; 138 } 139 } 140 141 # we have an open filehandle 142 $self->{_hpbf_save_filename} = $file.$ext; 143 binmode( $self->{_hpbf_save_fh} ); # for Win32 and friends 144 $self->proxy->log( HTTP::Proxy::FILTERS, "HPBF::save", 145 "Saving $uri to $file$ext" ); 146} 147 148sub filter { 149 my ( $self, $dataref ) = @_; 150 return unless exists $self->{_hpbf_save_fh}; 151 152 # save the data to the file 153 my $res = $self->{_hpbf_save_fh}->syswrite( $$dataref ); 154 $self->proxy->log( HTTP::Proxy::ERROR, "HPBF::save", "syswrite() error: $!") 155 if ! defined $res; # FIXME error handling 156} 157 158sub end { 159 my ($self) = @_; 160 161 # close file 162 if( $self->{_hpbf_save_fh} ) { 163 $self->{_hpbf_save_fh}->close; # FIXME error handling 164 delete $self->{_hpbf_save_fh}; 165 } 166} 167 168sub will_modify { 0 } 169 1701; 171 172__END__ 173 174=head1 NAME 175 176HTTP::Proxy::BodyFilter::save - A filter that saves transfered data to a file 177 178=head1 SYNOPSIS 179 180 use HTTP::Proxy; 181 use HTTP::Proxy::BodyFilter::save; 182 183 my $proxy = HTTP::Proxy->new; 184 185 # save RFC files as we browse them 186 $proxy->push_filter( 187 path => qr!/rfc\d+.txt!, 188 mime => 'text/plain', 189 response => HTTP::Proxy::BodyFilter::save->new( 190 template => '%f', 191 prefix => 'rfc', 192 keep_old => 1, 193 ) 194 ); 195 196 $proxy->start; 197 198=head1 DESCRIPTION 199 200The HTTP::Proxy::BodyFilter::save filter can save HTTP messages (responses 201or request) bodies to files. The name of the file is determined by a 202template and the URI of the request. 203 204Simply insert this filter in a filter stack, and it will save the data 205as it flows through the proxy. Depending on where the filter is located 206in the stack, the saved data can be more or less modified. 207 208This filter I<will> create directories if it needs to! 209 210I<Note:> Remember that the default C<mime> parameter for C<push_filter()> 211is C<text/*> and that you may need to change it for other MIME types. 212 213=head2 Constructor 214 215The constructor accepts quite a few options. Most of them control 216the construction of the filename that will be used to save the 217response body. There are two options to compute this filename: 218 219=over 4 220 221=item * 222 223use a template 224 225=item * 226 227use your own filename creation routine 228 229=back 230 231The template option uses the following options: 232 233=over 4 234 235=item B<template> => I<string> 236 237The file name is build from the C<template> option. The following 238placeholders are available: 239 240 %% a percent sign 241 %h the host 242 %p the path (no leading separator) 243 %d the path (filename removed) 244 %f the filename (or 'index.html' if absent) 245 %q the query string 246 %P the path and the query string, 247 separated by '?' (if the query string is not empty) 248 249C</> in the URI path are replaced by the separator used by File::Spec. 250 251The result of the template is modified by the B<no_host>, B<no_dirs> 252and B<cut_dirs>. 253 254The default template is the local equivalent of the C<%h/%P> Unix path. 255 256=item B<no_host> => I<boolean> 257 258The C<no_host> option makes C<%h> empty. Default is I<false>. 259 260=item B<no_dirs> => I<boolean> 261 262The C<no_dirs> option removes all directories from C<%p>, C<%P> and C<%d>. 263Default is I<false>. 264 265=item B<cut_dirs> => I<number> 266 267The C<cut_dirs> options removes the first I<n> directories from the 268content of C<%p>, C<%P> and C<%d>. Default is C<0>. 269 270=item B<prefix> => I<string> 271 272The B<prefix> option prepends the given prefix to the filename 273created from the template. Default is C<"">. 274 275=back 276 277Using your own subroutine is also possible, with the following parameter: 278 279=over 4 280 281=item B<filename> => I<coderef> 282 283When the C<filename> option is used, the C<template> option and the 284other template-related options (C<no_host>, C<no_dirs>, C<cut_dirs> 285and C<prefix>) are ignored. 286 287The C<filename> option expects a reference to a subroutine. The subroutine 288will receive the HTTP::Message object and must return a string which 289is the path of the file to be created (an absolute path is recommended, 290but a relative path is accepted). 291 292Returning C<""> or C<undef> will prevent the creation of the file. 293This lets a filter decide even more precisely what to save or not, 294even though this should be done in the match subroutine (see 295HTTP::Proxy's C<push�_filte()> method). 296 297=back 298 299Other options help the filter decide where and when to save: 300 301=over 4 302 303=item B<multiple> => I<boolean> 304 305With the B<multiple> option, saving the same file in the same directory 306will result in the original copy of file being preserved and the second 307copy being named F<file.1>. If that a file is saved yet again with the same 308name, the third copy will be named F<file.2>, and so on. 309 310Default is I<true>. 311 312If B<multiple> is set to I<false> then a file will be overwritten 313by the next one with the same name. 314 315=item B<timestamp> => I<boolean> 316 317With the C<timestamp> option, the decision as to whether or not to save 318a newer copy of a file depends on the local and remote timestamp and 319size of the file. 320 321The file is saved only if the date given in the C<Last-Modified> is more 322recent than the local file's timestamp. 323 324Default is I<false>. 325 326B<This option is not implemented.> 327 328=item B<keep_old> => I<boolean> 329 330The C<keep_old> option will prevent the file to be saved if a file 331with the same name already exists. Default is I<false>. 332 333No matter if B<multiple> is set or not, the file will I<not> be saved 334if B<keep_old> is set to true. 335 336=item B<status> => \@codes 337 338The C<status> option limits the status codes for which a response body 339will be saved. The default is C<[ 200 ]>, which prevent saving error 340pages (for 404 codes). 341 342=back 343 344=head2 Examples 345 346Given a request for the L<http://search.cpan.org/dist/HTTP-Proxy/> URI, 347the filename is computed as follows, depending on the constructor 348options: 349 350 No options -> search.cpan.org/dist/HTTP-Proxy/index.html 351 352 no_host => 1 -> dist/HTTP-Proxy/index.html 353 354 no_dirs => 1 -> search.cpan.org/index.html 355 356 no_host => 1, 357 no_dirs => 1, 358 prefix => 'data' -> data/index.html 359 360 cut_dirs => 1 -> search.cpan.org/HTTP-Proxy/index.html 361 362 cut_dirs => 2 -> search.cpan.org/index.html 363 364=head1 METHODS 365 366This filter implements several methods, which are all called atuomatically: 367 368=over 4 369 370=item init() 371 372Handle all the parameters passed to the constructor to define the 373filter behaviour. 374 375=item begin() 376 377Open the file to which the data will be saved. 378 379=item filter() 380 381Save all the data that goes through to the opened file. 382 383=item end() 384 385Close the file when the whole message body has been processed. 386 387=item will_modify() 388 389This method returns a I<false> value, thus indicating to the system 390that it will not modify data passing through. 391 392=back 393 394=head1 SEE ALSO 395 396L<HTTP::Proxy>, L<HTTP::Proxy::BodyFilter>. 397 398=head1 AUTHOR 399 400Philippe "BooK" Bruhat, E<lt>book@cpan.orgE<gt>. 401 402=head1 ACKNOWLEDGMENTS 403 404Thanks to Mat Proud for asking how to store all pages which go through 405the proxy to disk, without any processing. The further discussion we 406had led to the writing of this class. 407 408Wget(1) provided the inspiration for many of the file naming options. 409 410Thanks to Nicolas Chuche for telling me about C<O_EXCL>. 411 412Thanks to Rafa�l Garcia-Suarez and David Rigaudiere for their help on 413irc while coding the nasty C<begin()> method. C<;-)> 414 415Thanks to Howard Jones for the inspiration and initial patch for the 416C<filename> option. Lucas Gonze provided a patch to make C<status> 417actually work. 418 419Thanks to Max Maischein for detecting a bug in the parameter validation 420for C<filename> (L<http://rt.cpan.org/Ticket/Display.html?id=14548>). 421 422Thanks to Mark Tilford, who found out that the 423C<filename> option was incorrectly used internally 424(L<http://rt.cpan.org/Ticket/Display.html?id=18644>). 425 426Thanks to Roland Stigge and Gunnar Wolf for 427reporting and forwarding Debian bug #433951 to CPAN RT 428(L<http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=433951>, 429L<http://rt.cpan.org/Ticket/Display.html?id=33018>). 430 431=head1 COPYRIGHT 432 433Copyright 2004-2008, Philippe Bruhat. 434 435=head1 LICENSE 436 437This module is free software; you can redistribute it or modify it under 438the same terms as Perl itself. 439 440=cut 441 442