1package HTTP::Proxy::BodyFilter::save; 2 3use strict; 4use HTTP::Proxy; 5use HTTP::Proxy::BodyFilter; 6use vars qw( @ISA ); 7@ISA = qw( HTTP::Proxy::BodyFilter ); 8use Fcntl; 9use File::Spec; 10use File::Path; 11use Carp; 12 13sub init { 14 my $self = shift; 15 16 # options 17 my %args = ( 18 template => File::Spec->catfile( '%h', '%P' ), 19 no_host => 0, 20 no_dirs => 0, 21 cut_dirs => 0, 22 prefix => '', 23 filename => undef, 24 multiple => 1, 25 keep_old => 0, # no_clobber in wget parlance 26 timestamp => 0, 27 status => [ 200 ], 28 @_ 29 ); 30 # keep_old and timestamp can't be selected together 31 croak "Can't timestamp and keep older files at the same time" 32 if $args{keep_old} && $args{timestamp}; 33 croak "status must be an array reference" 34 unless ref($args{status}) eq 'ARRAY'; 35 croak "status must contain only HTTP codes" 36 if grep { !/^[12345]\d\d$/ } @{ $args{status} }; 37 croak "filename must be a code reference" 38 if defined $args{filename} && !UNIVERSAL::isa( $args{filename}, 'CODE' ); 39 40 $self->{"_hpbf_save_filename_code"} = $args{filename}; 41 $self->{"_hpbf_save_$_"} = $args{$_} 42 for qw( template no_host no_dirs cut_dirs prefix 43 multiple keep_old timestamp status ); 44} 45 46sub begin { 47 my ( $self, $message ) = @_; 48 49 # internal data initialisation 50 delete @{$self}{qw( _hpbf_save_filename _hpbf_save_fh )}; 51 52 my $uri = $message->isa( 'HTTP::Request' ) 53 ? $message->uri : $message->request->uri; 54 55 # save only the accepted status codes 56 if( $message->isa( 'HTTP::Response' ) ) { 57 my $code = $message->code; 58 return unless grep { $code eq $_ } @{ $self->{_hpbf_save_status} }; 59 } 60 61 my $file = ''; 62 if( defined $self->{_hpbf_save_filename_code} ) { 63 # use the user-provided callback 64 $file = $self->{_hpbf_save_filename_code}->($message); 65 unless ( defined $file and $file ne '' ) { 66 $self->proxy->log( HTTP::Proxy::FILTERS, "HTBF::save", 67 "Filter will not save $uri" ); 68 return; 69 } 70 } 71 else { 72 # set the template variables from the URI 73 my @segs = $uri->path_segments; # starts with an empty string 74 shift @segs; 75 splice(@segs, 0, $self->{_hpbf_save_cut_dirs} >= @segs 76 ? @segs - 1 : $self->{_hpbf_save_cut_dirs} ); 77 my %vars = ( 78 '%' => '%', 79 h => $self->{_hpbf_save_no_host} ? '' : $uri->host, 80 f => $segs[-1] || 'index.html', # same default as wget 81 p => $self->{_hpbf_save_no_dirs} ? $segs[-1] || 'index.html' 82 : File::Spec->catfile(@segs), 83 q => $uri->query, 84 ); 85 pop @segs; 86 $vars{d} 87 = $self->{_hpbf_save_no_dirs} ? '' 88 : @segs ? File::Spec->catfile(@segs) 89 : ''; 90 $vars{P} = $vars{p} . ( $vars{q} ? "?$vars{q}" : '' ); 91 92 # create the filename 93 $file = File::Spec->catfile( $self->{_hpbf_save_prefix} || (), 94 $self->{_hpbf_save_template} ); 95 $file =~ s/%(.)/$vars{$1}/g; 96 } 97 $file = File::Spec->rel2abs( $file ); 98 99 # create the directory 100 my $dir = File::Spec->catpath( (File::Spec->splitpath($file))[ 0, 1 ], '' ); 101 if( ! -e $dir ) { 102 eval { mkpath( $dir ) }; 103 if ($@) { 104 $self->proxy->log( HTTP::Proxy::ERROR, "HTBF::save", 105 "Unable to create directory $dir" ); 106 return; 107 } 108 $self->proxy->log( HTTP::Proxy::FILTERS, "HTBF::save", 109 "Created directory $dir" ); 110 } 111 112 # keep old file? 113 if ( -e $file ) { 114 if ( $self->{_hpbf_save_timestamp} ) { 115 # FIXME timestamp 116 } 117 elsif ( $self->{_hpbf_save_keep_old} ) { 118 $self->proxy->log( HTTP::Proxy::FILTERS, "HPBF::save", 119 "Skip saving $uri" ); 120 delete $self->{_hpbf_save_fh}; # it's a closed filehandle 121 return; 122 } 123 } 124 125 # open and lock the file 126 my ( $ext, $n, $i ) = ( "", 0 ); 127 my $flags = O_WRONLY | O_EXCL | O_CREAT; 128 while( ! sysopen( $self->{_hpbf_save_fh}, "$file$ext", $flags ) ) { 129 $self->proxy->log( HTTP::Proxy::ERROR, "HPBF::save", 130 "Too many errors opening $file$ext" ), return 131 if $i++ - $n == 10; # should be ok now 132 if( $self->{_hpbf_save_multiple} ) { 133 $ext = "." . ++$n while -e $file.$ext; 134 next; 135 } 136 else { 137 $flags = O_WRONLY | O_CREAT; 138 } 139 } 140 141 # we have an open filehandle 142 $self->{_hpbf_save_filename} = $file.$ext; 143 binmode( $self->{_hpbf_save_fh} ); # for Win32 and friends 144 $self->proxy->log( HTTP::Proxy::FILTERS, "HPBF::save", 145 "Saving $uri to $file$ext" ); 146} 147 148sub filter { 149 my ( $self, $dataref ) = @_; 150 return unless exists $self->{_hpbf_save_fh}; 151 152 # save the data to the file 153 my $res = $self->{_hpbf_save_fh}->syswrite( $$dataref ); 154 $self->proxy->log( HTTP::Proxy::ERROR, "HPBF::save", "syswrite() error: $!") 155 if ! defined $res; # FIXME error handling 156} 157 158sub end { 159 my ($self) = @_; 160 161 # close file 162 if( $self->{_hpbf_save_fh} ) { 163 $self->{_hpbf_save_fh}->close; # FIXME error handling 164 delete $self->{_hpbf_save_fh}; 165 } 166} 167 168sub will_modify { 0 } 169 1701; 171 172__END__ 173 174=encoding utf8 175 176=head1 NAME 177 178HTTP::Proxy::BodyFilter::save - A filter that saves transfered data to a file 179 180=head1 SYNOPSIS 181 182 use HTTP::Proxy; 183 use HTTP::Proxy::BodyFilter::save; 184 185 my $proxy = HTTP::Proxy->new; 186 187 # save RFC files as we browse them 188 $proxy->push_filter( 189 path => qr!/rfc\d+.txt!, 190 mime => 'text/plain', 191 response => HTTP::Proxy::BodyFilter::save->new( 192 template => '%f', 193 prefix => 'rfc', 194 keep_old => 1, 195 ) 196 ); 197 198 $proxy->start; 199 200=head1 DESCRIPTION 201 202The L<HTTP::Proxy::BodyFilter::save> filter can save HTTP messages (responses 203or request) bodies to files. The name of the file is determined by a 204template and the URI of the request. 205 206Simply insert this filter in a filter stack, and it will save the data 207as it flows through the proxy. Depending on where the filter is located 208in the stack, the saved data can be more or less modified. 209 210This filter I<will> create directories if it needs to! 211 212I<Note:> Remember that the default C<mime> parameter for C<push_filter()> 213is C<text/*> and that you may need to change it for other MIME types. 214 215=head2 Constructor 216 217The constructor accepts quite a few options. Most of them control 218the construction of the filename that will be used to save the 219response body. There are two options to compute this filename: 220 221=over 4 222 223=item * 224 225use a template 226 227=item * 228 229use your own filename creation routine 230 231=back 232 233The template option uses the following options: 234 235=over 4 236 237=item B<template> => I<string> 238 239The file name is build from the C<template> option. The following 240placeholders are available: 241 242 %% a percent sign 243 %h the host 244 %p the path (no leading separator) 245 %d the path (filename removed) 246 %f the filename (or 'index.html' if absent) 247 %q the query string 248 %P the path and the query string, 249 separated by '?' (if the query string is not empty) 250 251C</> in the URI path are replaced by the separator used by File::Spec. 252 253The result of the template is modified by the B<no_host>, B<no_dirs> 254and B<cut_dirs>. 255 256The default template is the local equivalent of the C<%h/%P> Unix path. 257 258=item B<no_host> => I<boolean> 259 260The C<no_host> option makes C<%h> empty. Default is I<false>. 261 262=item B<no_dirs> => I<boolean> 263 264The C<no_dirs> option removes all directories from C<%p>, C<%P> and C<%d>. 265Default is I<false>. 266 267=item B<cut_dirs> => I<number> 268 269The C<cut_dirs> options removes the first I<n> directories from the 270content of C<%p>, C<%P> and C<%d>. Default is C<0>. 271 272=item B<prefix> => I<string> 273 274The B<prefix> option prepends the given prefix to the filename 275created from the template. Default is C<"">. 276 277=back 278 279Using your own subroutine is also possible, with the following parameter: 280 281=over 4 282 283=item B<filename> => I<coderef> 284 285When the C<filename> option is used, the C<template> option and the 286other template-related options (C<no_host>, C<no_dirs>, C<cut_dirs> 287and C<prefix>) are ignored. 288 289The C<filename> option expects a reference to a subroutine. The subroutine 290will receive the L<HTTP::Message> object and must return a string which 291is the path of the file to be created (an absolute path is recommended, 292but a relative path is accepted). 293 294Returning C<""> or C<undef> will prevent the creation of the file. 295This lets a filter decide even more precisely what to save or not, 296even though this should be done in the match subroutine (see 297L<HTTP::Proxy>'s C<push_filter()> method). 298 299=back 300 301Other options help the filter decide where and when to save: 302 303=over 4 304 305=item B<multiple> => I<boolean> 306 307With the B<multiple> option, saving the same file in the same directory 308will result in the original copy of file being preserved and the second 309copy being named F<file.1>. If that a file is saved yet again with the same 310name, the third copy will be named F<file.2>, and so on. 311 312Default is I<true>. 313 314If B<multiple> is set to I<false> then a file will be overwritten 315by the next one with the same name. 316 317=item B<timestamp> => I<boolean> 318 319With the C<timestamp> option, the decision as to whether or not to save 320a newer copy of a file depends on the local and remote timestamp and 321size of the file. 322 323The file is saved only if the date given in the C<Last-Modified> is more 324recent than the local file's timestamp. 325 326Default is I<false>. 327 328B<This option is not implemented.> 329 330=item B<keep_old> => I<boolean> 331 332The C<keep_old> option will prevent the file to be saved if a file 333with the same name already exists. Default is I<false>. 334 335No matter if B<multiple> is set or not, the file will I<not> be saved 336if B<keep_old> is set to true. 337 338=item B<status> => \@codes 339 340The C<status> option limits the status codes for which a response body 341will be saved. The default is C<[ 200 ]>, which prevent saving error 342pages (for 404 codes). 343 344=back 345 346=head2 Examples 347 348Given a request for the L<http://search.cpan.org/dist/HTTP-Proxy/> URI, 349the filename is computed as follows, depending on the constructor 350options: 351 352 No options -> search.cpan.org/dist/HTTP-Proxy/index.html 353 354 no_host => 1 -> dist/HTTP-Proxy/index.html 355 356 no_dirs => 1 -> search.cpan.org/index.html 357 358 no_host => 1, 359 no_dirs => 1, 360 prefix => 'data' -> data/index.html 361 362 cut_dirs => 1 -> search.cpan.org/HTTP-Proxy/index.html 363 364 cut_dirs => 2 -> search.cpan.org/index.html 365 366=head1 METHODS 367 368This filter implements several methods, which are all called atuomatically: 369 370=over 4 371 372=item init() 373 374Handle all the parameters passed to the constructor to define the 375filter behaviour. 376 377=item begin() 378 379Open the file to which the data will be saved. 380 381=item filter() 382 383Save all the data that goes through to the opened file. 384 385=item end() 386 387Close the file when the whole message body has been processed. 388 389=item will_modify() 390 391This method returns a I<false> value, thus indicating to the system 392that it will not modify data passing through. 393 394=back 395 396=head1 SEE ALSO 397 398L<HTTP::Proxy>, L<HTTP::Proxy::BodyFilter>. 399 400=head1 AUTHOR 401 402Philippe "BooK" Bruhat, E<lt>book@cpan.orgE<gt>. 403 404=head1 ACKNOWLEDGMENTS 405 406Thanks to Mat Proud for asking how to store all pages which go through 407the proxy to disk, without any processing. The further discussion we 408had led to the writing of this class. 409 410Wget(1) provided the inspiration for many of the file naming options. 411 412Thanks to Nicolas Chuche for telling me about C<O_EXCL>. 413 414Thanks to Rafa�l Garcia-Suarez and David Rigaudiere for their help on 415irc while coding the nasty C<begin()> method. C<;-)> 416 417Thanks to Howard Jones for the inspiration and initial patch for the 418C<filename> option. Lucas Gonze provided a patch to make C<status> 419actually work. 420 421Thanks to Max Maischein for detecting a bug in the parameter validation 422for C<filename> (L<http://rt.cpan.org/Ticket/Display.html?id=14548>). 423 424Thanks to Mark Tilford, who found out that the 425C<filename> option was incorrectly used internally 426(L<http://rt.cpan.org/Ticket/Display.html?id=18644>). 427 428Thanks to Roland Stigge and Gunnar Wolf for 429reporting and forwarding Debian bug #433951 to CPAN RT 430(L<http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=433951>, 431L<http://rt.cpan.org/Ticket/Display.html?id=33018>). 432 433=head1 COPYRIGHT 434 435Copyright 2004-2013, Philippe Bruhat. 436 437=head1 LICENSE 438 439This module is free software; you can redistribute it or modify it under 440the same terms as Perl itself. 441 442=cut 443 444