1package HTTP::Proxy::BodyFilter::save;
2
3use strict;
4use HTTP::Proxy;
5use HTTP::Proxy::BodyFilter;
6use vars qw( @ISA );
7@ISA = qw( HTTP::Proxy::BodyFilter );
8use Fcntl;
9use File::Spec;
10use File::Path;
11use Carp;
12
13sub init {
14    my $self = shift;
15
16    # options
17    my %args = (
18         template   => File::Spec->catfile( '%h', '%P' ),
19         no_host    => 0,
20         no_dirs    => 0,
21         cut_dirs   => 0,
22         prefix     => '',
23         filename   => undef,
24         multiple   => 1,
25         keep_old   => 0, # no_clobber in wget parlance
26         timestamp  => 0,
27         status     => [ 200 ],
28         @_
29    );
30    # keep_old and timestamp can't be selected together
31    croak "Can't timestamp and keep older files at the same time"
32      if $args{keep_old} && $args{timestamp};
33    croak "status must be an array reference"
34      unless ref($args{status}) eq 'ARRAY';
35    croak "status must contain only HTTP codes"
36      if grep { !/^[12345]\d\d$/ } @{ $args{status} };
37    croak "filename must be a code reference"
38      if defined $args{filename} && !UNIVERSAL::isa( $args{filename}, 'CODE' );
39
40    $self->{"_hpbf_save_filename_code"} = $args{filename};
41    $self->{"_hpbf_save_$_"} = $args{$_}
42      for qw( template no_host no_dirs cut_dirs prefix
43              multiple keep_old timestamp status );
44}
45
46sub begin {
47    my ( $self, $message ) = @_;
48
49    # internal data initialisation
50    delete @{$self}{qw( _hpbf_save_filename _hpbf_save_fh )};
51
52    my $uri = $message->isa( 'HTTP::Request' )
53            ? $message->uri : $message->request->uri;
54
55    # save only the accepted status codes
56    if( $message->isa( 'HTTP::Response' ) ) {
57        my $code = $message->code;
58        return unless grep { $code eq $_ } @{ $self->{_hpbf_save_status} };
59    }
60
61    my $file = '';
62    if( defined $self->{_hpbf_save_filename_code} ) {
63        # use the user-provided callback
64        $file = $self->{_hpbf_save_filename_code}->($message);
65        unless ( defined $file and $file ne '' ) {
66            $self->proxy->log( HTTP::Proxy::FILTERS, "HTBF::save",
67                               "Filter will not save $uri" );
68            return;
69        }
70    }
71    else {
72        # set the template variables from the URI
73        my @segs = $uri->path_segments; # starts with an empty string
74        shift @segs;
75        splice(@segs, 0, $self->{_hpbf_save_cut_dirs} >= @segs
76                         ? @segs - 1 : $self->{_hpbf_save_cut_dirs} );
77        my %vars = (
78             '%' => '%',
79             h   => $self->{_hpbf_save_no_host} ? '' : $uri->host,
80             f   => $segs[-1] || 'index.html', # same default as wget
81             p   => $self->{_hpbf_save_no_dirs} ? $segs[-1] || 'index.html'
82                                                : File::Spec->catfile(@segs),
83             q   => $uri->query,
84        );
85        pop @segs;
86        $vars{d}
87            = $self->{_hpbf_save_no_dirs} ? ''
88            : @segs                       ? File::Spec->catfile(@segs)
89            :                               '';
90        $vars{P} = $vars{p} . ( $vars{q} ? "?$vars{q}" : '' );
91
92        # create the filename
93        $file = File::Spec->catfile( $self->{_hpbf_save_prefix} || (),
94                                     $self->{_hpbf_save_template} );
95        $file =~ s/%(.)/$vars{$1}/g;
96    }
97    $file = File::Spec->rel2abs( $file );
98
99    # create the directory
100    my $dir = File::Spec->catpath( (File::Spec->splitpath($file))[ 0, 1 ], '' );
101    if( ! -e $dir ) {
102        eval { mkpath( $dir ) };
103        if ($@) {
104            $self->proxy->log( HTTP::Proxy::ERROR, "HTBF::save",
105                              "Unable to create directory $dir" );
106            return;
107        }
108        $self->proxy->log( HTTP::Proxy::FILTERS, "HTBF::save",
109                           "Created directory $dir" );
110    }
111
112    # keep old file?
113    if ( -e $file ) {
114        if ( $self->{_hpbf_save_timestamp} ) {
115            # FIXME timestamp
116        }
117        elsif ( $self->{_hpbf_save_keep_old} ) {
118            $self->proxy->log( HTTP::Proxy::FILTERS, "HPBF::save",
119                "Skip saving $uri" );
120            delete $self->{_hpbf_save_fh};    # it's a closed filehandle
121            return;
122        }
123    }
124
125    # open and lock the file
126    my ( $ext, $n, $i ) = ( "", 0 );
127    my $flags = O_WRONLY | O_EXCL | O_CREAT;
128    while( ! sysopen( $self->{_hpbf_save_fh}, "$file$ext", $flags ) ) {
129        $self->proxy->log( HTTP::Proxy::ERROR, "HPBF::save",
130                           "Too many errors opening $file$ext" ), return
131          if $i++ - $n == 10; # should be ok now
132        if( $self->{_hpbf_save_multiple} ) {
133            $ext = "." . ++$n while -e $file.$ext;
134            next;
135        }
136        else {
137            $flags = O_WRONLY | O_CREAT;
138        }
139    }
140
141    # we have an open filehandle
142    $self->{_hpbf_save_filename} = $file.$ext;
143    binmode( $self->{_hpbf_save_fh} );    # for Win32 and friends
144    $self->proxy->log( HTTP::Proxy::FILTERS, "HPBF::save",
145                       "Saving $uri to $file$ext" );
146}
147
148sub filter {
149    my ( $self, $dataref ) = @_;
150    return unless exists $self->{_hpbf_save_fh};
151
152    # save the data to the file
153    my $res = $self->{_hpbf_save_fh}->syswrite( $$dataref );
154    $self->proxy->log( HTTP::Proxy::ERROR, "HPBF::save", "syswrite() error: $!")
155      if ! defined $res;  # FIXME error handling
156}
157
158sub end {
159    my ($self) = @_;
160
161    # close file
162    if( $self->{_hpbf_save_fh} ) {
163        $self->{_hpbf_save_fh}->close; # FIXME error handling
164        delete $self->{_hpbf_save_fh};
165    }
166}
167
168sub will_modify { 0 }
169
1701;
171
172__END__
173
174=encoding utf8
175
176=head1 NAME
177
178HTTP::Proxy::BodyFilter::save - A filter that saves transfered data to a file
179
180=head1 SYNOPSIS
181
182    use HTTP::Proxy;
183    use HTTP::Proxy::BodyFilter::save;
184
185    my $proxy = HTTP::Proxy->new;
186
187    # save RFC files as we browse them
188    $proxy->push_filter(
189        path     => qr!/rfc\d+.txt!,
190        mime     => 'text/plain',
191        response => HTTP::Proxy::BodyFilter::save->new(
192            template => '%f',
193            prefix   => 'rfc',
194            keep_old => 1,
195        )
196    );
197
198    $proxy->start;
199
200=head1 DESCRIPTION
201
202The L<HTTP::Proxy::BodyFilter::save> filter can save HTTP messages (responses
203or request) bodies to files. The name of the file is determined by a
204template and the URI of the request.
205
206Simply insert this filter in a filter stack, and it will save the data
207as it flows through the proxy. Depending on where the filter is located
208in the stack, the saved data can be more or less modified.
209
210This filter I<will> create directories if it needs to!
211
212I<Note:> Remember that the default C<mime> parameter for C<push_filter()>
213is C<text/*> and that you may need to change it for other MIME types.
214
215=head2 Constructor
216
217The constructor accepts quite a few options. Most of them control
218the construction of the filename that will be used to save the
219response body. There are two options to compute this filename:
220
221=over 4
222
223=item *
224
225use a template
226
227=item *
228
229use your own filename creation routine
230
231=back
232
233The template option uses the following options:
234
235=over 4
236
237=item B<template> => I<string>
238
239The file name is build from the C<template> option. The following
240placeholders are available:
241
242    %%   a percent sign
243    %h   the host
244    %p   the path (no leading separator)
245    %d   the path (filename removed)
246    %f   the filename (or 'index.html' if absent)
247    %q   the query string
248    %P   the path and the query string,
249         separated by '?' (if the query string is not empty)
250
251C</> in the URI path are replaced by the separator used by File::Spec.
252
253The result of the template is modified by the B<no_host>, B<no_dirs>
254and B<cut_dirs>.
255
256The default template is the local equivalent of the C<%h/%P> Unix path.
257
258=item B<no_host> => I<boolean>
259
260The C<no_host> option makes C<%h> empty. Default is I<false>.
261
262=item B<no_dirs> => I<boolean>
263
264The C<no_dirs> option removes all directories from C<%p>, C<%P> and C<%d>.
265Default is I<false>.
266
267=item B<cut_dirs> => I<number>
268
269The C<cut_dirs> options removes the first I<n> directories from the
270content of C<%p>, C<%P> and C<%d>. Default is C<0>.
271
272=item B<prefix> => I<string>
273
274The B<prefix> option prepends the given prefix to the filename
275created from the template. Default is C<"">.
276
277=back
278
279Using your own subroutine is also possible, with the following parameter:
280
281=over 4
282
283=item B<filename> => I<coderef>
284
285When the C<filename> option is used, the C<template> option and the
286other template-related options (C<no_host>, C<no_dirs>, C<cut_dirs>
287and C<prefix>) are ignored.
288
289The C<filename> option expects a reference to a subroutine. The subroutine
290will receive the L<HTTP::Message> object and must return a string which
291is the path of the file to be created (an absolute path is recommended,
292but a relative path is accepted).
293
294Returning C<""> or C<undef> will prevent the creation of the file.
295This lets a filter decide even more precisely what to save or not,
296even though this should be done in the match subroutine (see
297L<HTTP::Proxy>'s C<push_filter()> method).
298
299=back
300
301Other options help the filter decide where and when to save:
302
303=over 4
304
305=item B<multiple> => I<boolean>
306
307With the B<multiple> option, saving the same file in the same directory
308will result in the original copy of file being preserved and the second
309copy being named F<file.1>. If that a file is saved yet again with the same
310name, the third copy will be named F<file.2>, and so on.
311
312Default is I<true>.
313
314If B<multiple> is set to I<false> then a file will be overwritten
315by the next one with the same name.
316
317=item B<timestamp> => I<boolean>
318
319With the C<timestamp> option, the decision as to whether or not to save
320a newer copy of a file depends on the local and remote timestamp and
321size of the file.
322
323The file is saved only if the date given in the C<Last-Modified> is more
324recent than the local file's timestamp.
325
326Default is I<false>.
327
328B<This option is not implemented.>
329
330=item B<keep_old> => I<boolean>
331
332The C<keep_old> option will prevent the file to be saved if a file
333with the same name already exists. Default is I<false>.
334
335No matter if B<multiple> is set or not, the file will I<not> be saved
336if B<keep_old> is set to true.
337
338=item B<status> => \@codes
339
340The C<status> option limits the status codes for which a response body
341will be saved. The default is C<[ 200 ]>, which prevent saving error
342pages (for 404 codes).
343
344=back
345
346=head2 Examples
347
348Given a request for the L<http://search.cpan.org/dist/HTTP-Proxy/> URI,
349the filename is computed as follows, depending on the constructor
350options:
351
352    No options          -> search.cpan.org/dist/HTTP-Proxy/index.html
353
354    no_host  => 1       -> dist/HTTP-Proxy/index.html
355
356    no_dirs  => 1       -> search.cpan.org/index.html
357
358    no_host  => 1,
359    no_dirs  => 1,
360    prefix   => 'data'  -> data/index.html
361
362    cut_dirs => 1       -> search.cpan.org/HTTP-Proxy/index.html
363
364    cut_dirs => 2       -> search.cpan.org/index.html
365
366=head1 METHODS
367
368This filter implements several methods, which are all called atuomatically:
369
370=over 4
371
372=item init()
373
374Handle all the parameters passed to the constructor to define the
375filter behaviour.
376
377=item begin()
378
379Open the file to which the data will be saved.
380
381=item filter()
382
383Save all the data that goes through to the opened file.
384
385=item end()
386
387Close the file when the whole message body has been processed.
388
389=item will_modify()
390
391This method returns a I<false> value, thus indicating to the system
392that it will not modify data passing through.
393
394=back
395
396=head1 SEE ALSO
397
398L<HTTP::Proxy>, L<HTTP::Proxy::BodyFilter>.
399
400=head1 AUTHOR
401
402Philippe "BooK" Bruhat, E<lt>book@cpan.orgE<gt>.
403
404=head1 ACKNOWLEDGMENTS
405
406Thanks to Mat Proud for asking how to store all pages which go through
407the proxy to disk, without any processing. The further discussion we
408had led to the writing of this class.
409
410Wget(1) provided the inspiration for many of the file naming options.
411
412Thanks to Nicolas Chuche for telling me about C<O_EXCL>.
413
414Thanks to Rafa�l Garcia-Suarez and David Rigaudiere for their help on
415irc while coding the nasty C<begin()> method. C<;-)>
416
417Thanks to Howard Jones for the inspiration and initial patch for the
418C<filename> option. Lucas Gonze provided a patch to make C<status>
419actually work.
420
421Thanks to Max Maischein for detecting a bug in the parameter validation
422for C<filename> (L<http://rt.cpan.org/Ticket/Display.html?id=14548>).
423
424Thanks to Mark Tilford, who found out that the
425C<filename> option was incorrectly used internally
426(L<http://rt.cpan.org/Ticket/Display.html?id=18644>).
427
428Thanks to Roland Stigge and Gunnar Wolf for
429reporting and forwarding Debian bug #433951 to CPAN RT
430(L<http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=433951>,
431L<http://rt.cpan.org/Ticket/Display.html?id=33018>).
432
433=head1 COPYRIGHT
434
435Copyright 2004-2013, Philippe Bruhat.
436
437=head1 LICENSE
438
439This module is free software; you can redistribute it or modify it under
440the same terms as Perl itself.
441
442=cut
443
444