1package HTTP::Proxy::BodyFilter::save;
2
3use strict;
4use HTTP::Proxy;
5use HTTP::Proxy::BodyFilter;
6use vars qw( @ISA );
7@ISA = qw( HTTP::Proxy::BodyFilter );
8use Fcntl;
9use File::Spec;
10use File::Path;
11use Carp;
12
13sub init {
14    my $self = shift;
15
16    # options
17    my %args = (
18         template   => File::Spec->catfile( '%h', '%P' ),
19         no_host    => 0,
20         no_dirs    => 0,
21         cut_dirs   => 0,
22         prefix     => '',
23         filename   => undef,
24         multiple   => 1,
25         keep_old   => 0, # no_clobber in wget parlance
26         timestamp  => 0,
27         status     => [ 200 ],
28         @_
29    );
30    # keep_old and timestamp can't be selected together
31    croak "Can't timestamp and keep older files at the same time"
32      if $args{keep_old} && $args{timestamp};
33    croak "status must be an array reference"
34      unless ref($args{status}) eq 'ARRAY';
35    croak "status must contain only HTTP codes"
36      if grep { !/^[12345]\d\d$/ } @{ $args{status} };
37    croak "filename must be a code reference"
38      if defined $args{filename} && !UNIVERSAL::isa( $args{filename}, 'CODE' );
39
40    $self->{"_hpbf_save_filename_code"} = $args{filename};
41    $self->{"_hpbf_save_$_"} = $args{$_}
42      for qw( template no_host no_dirs cut_dirs prefix
43              multiple keep_old timestamp status );
44}
45
46sub begin {
47    my ( $self, $message ) = @_;
48
49    # internal data initialisation
50    delete @{$self}{qw( _hpbf_save_filename _hpbf_save_fh )};
51
52    my $uri = $message->isa( 'HTTP::Request' )
53            ? $message->uri : $message->request->uri;
54
55    # save only the accepted status codes
56    if( $message->isa( 'HTTP::Response' ) ) {
57        my $code = $message->code;
58        return unless grep { $code eq $_ } @{ $self->{_hpbf_save_status} };
59    }
60
61    my $file = '';
62    if( defined $self->{_hpbf_save_filename_code} ) {
63        # use the user-provided callback
64        $file = $self->{_hpbf_save_filename_code}->($message);
65        unless ( defined $file and $file ne '' ) {
66            $self->proxy->log( HTTP::Proxy::FILTERS, "HTBF::save",
67                               "Filter will not save $uri" );
68            return;
69        }
70    }
71    else {
72        # set the template variables from the URI
73        my @segs = $uri->path_segments; # starts with an empty string
74        shift @segs;
75        splice(@segs, 0, $self->{_hpbf_save_cut_dirs} >= @segs
76                         ? @segs - 1 : $self->{_hpbf_save_cut_dirs} );
77        my %vars = (
78             '%' => '%',
79             h   => $self->{_hpbf_save_no_host} ? '' : $uri->host,
80             f   => $segs[-1] || 'index.html', # same default as wget
81             p   => $self->{_hpbf_save_no_dirs} ? $segs[-1] || 'index.html'
82                                                : File::Spec->catfile(@segs),
83             q   => $uri->query,
84        );
85        pop @segs;
86        $vars{d}
87            = $self->{_hpbf_save_no_dirs} ? ''
88            : @segs                       ? File::Spec->catfile(@segs)
89            :                               '';
90        $vars{P} = $vars{p} . ( $vars{q} ? "?$vars{q}" : '' );
91
92        # create the filename
93        $file = File::Spec->catfile( $self->{_hpbf_save_prefix} || (),
94                                     $self->{_hpbf_save_template} );
95        $file =~ s/%(.)/$vars{$1}/g;
96    }
97    $file = File::Spec->rel2abs( $file );
98
99    # create the directory
100    my $dir = File::Spec->catpath( (File::Spec->splitpath($file))[ 0, 1 ], '' );
101    if( ! -e $dir ) {
102        eval { mkpath( $dir ) };
103        if ($@) {
104            $self->proxy->log( HTTP::Proxy::ERROR, "HTBF::save",
105                              "Unable to create directory $dir" );
106            return;
107        }
108        $self->proxy->log( HTTP::Proxy::FILTERS, "HTBF::save",
109                           "Created directory $dir" );
110    }
111
112    # keep old file?
113    if ( -e $file ) {
114        if ( $self->{_hpbf_save_timestamp} ) {
115            # FIXME timestamp
116        }
117        elsif ( $self->{_hpbf_save_keep_old} ) {
118            $self->proxy->log( HTTP::Proxy::FILTERS, "HPBF::save",
119                "Skip saving $uri" );
120            delete $self->{_hpbf_save_fh};    # it's a closed filehandle
121            return;
122        }
123    }
124
125    # open and lock the file
126    my ( $ext, $n, $i ) = ( "", 0 );
127    my $flags = O_WRONLY | O_EXCL | O_CREAT;
128    while( ! sysopen( $self->{_hpbf_save_fh}, "$file$ext", $flags ) ) {
129        $self->proxy->log( HTTP::Proxy::ERROR, "HPBF::save",
130                           "Too many errors opening $file$ext" ), return
131          if $i++ - $n == 10; # should be ok now
132        if( $self->{_hpbf_save_multiple} ) {
133            $ext = "." . ++$n while -e $file.$ext;
134            next;
135        }
136        else {
137            $flags = O_WRONLY | O_CREAT;
138        }
139    }
140
141    # we have an open filehandle
142    $self->{_hpbf_save_filename} = $file.$ext;
143    binmode( $self->{_hpbf_save_fh} );    # for Win32 and friends
144    $self->proxy->log( HTTP::Proxy::FILTERS, "HPBF::save",
145                       "Saving $uri to $file$ext" );
146}
147
148sub filter {
149    my ( $self, $dataref ) = @_;
150    return unless exists $self->{_hpbf_save_fh};
151
152    # save the data to the file
153    my $res = $self->{_hpbf_save_fh}->syswrite( $$dataref );
154    $self->proxy->log( HTTP::Proxy::ERROR, "HPBF::save", "syswrite() error: $!")
155      if ! defined $res;  # FIXME error handling
156}
157
158sub end {
159    my ($self) = @_;
160
161    # close file
162    if( $self->{_hpbf_save_fh} ) {
163        $self->{_hpbf_save_fh}->close; # FIXME error handling
164        delete $self->{_hpbf_save_fh};
165    }
166}
167
168sub will_modify { 0 }
169
1701;
171
172__END__
173
174=head1 NAME
175
176HTTP::Proxy::BodyFilter::save - A filter that saves transfered data to a file
177
178=head1 SYNOPSIS
179
180    use HTTP::Proxy;
181    use HTTP::Proxy::BodyFilter::save;
182
183    my $proxy = HTTP::Proxy->new;
184
185    # save RFC files as we browse them
186    $proxy->push_filter(
187        path     => qr!/rfc\d+.txt!,
188        mime     => 'text/plain',
189        response => HTTP::Proxy::BodyFilter::save->new(
190            template => '%f',
191            prefix   => 'rfc',
192            keep_old => 1,
193        )
194    );
195
196    $proxy->start;
197
198=head1 DESCRIPTION
199
200The HTTP::Proxy::BodyFilter::save filter can save HTTP messages (responses
201or request) bodies to files. The name of the file is determined by a
202template and the URI of the request.
203
204Simply insert this filter in a filter stack, and it will save the data
205as it flows through the proxy. Depending on where the filter is located
206in the stack, the saved data can be more or less modified.
207
208This filter I<will> create directories if it needs to!
209
210I<Note:> Remember that the default C<mime> parameter for C<push_filter()>
211is C<text/*> and that you may need to change it for other MIME types.
212
213=head2 Constructor
214
215The constructor accepts quite a few options. Most of them control
216the construction of the filename that will be used to save the
217response body. There are two options to compute this filename:
218
219=over 4
220
221=item *
222
223use a template
224
225=item *
226
227use your own filename creation routine
228
229=back
230
231The template option uses the following options:
232
233=over 4
234
235=item B<template> => I<string>
236
237The file name is build from the C<template> option. The following
238placeholders are available:
239
240    %%   a percent sign
241    %h   the host
242    %p   the path (no leading separator)
243    %d   the path (filename removed)
244    %f   the filename (or 'index.html' if absent)
245    %q   the query string
246    %P   the path and the query string,
247         separated by '?' (if the query string is not empty)
248
249C</> in the URI path are replaced by the separator used by File::Spec.
250
251The result of the template is modified by the B<no_host>, B<no_dirs>
252and B<cut_dirs>.
253
254The default template is the local equivalent of the C<%h/%P> Unix path.
255
256=item B<no_host> => I<boolean>
257
258The C<no_host> option makes C<%h> empty. Default is I<false>.
259
260=item B<no_dirs> => I<boolean>
261
262The C<no_dirs> option removes all directories from C<%p>, C<%P> and C<%d>.
263Default is I<false>.
264
265=item B<cut_dirs> => I<number>
266
267The C<cut_dirs> options removes the first I<n> directories from the
268content of C<%p>, C<%P> and C<%d>. Default is C<0>.
269
270=item B<prefix> => I<string>
271
272The B<prefix> option prepends the given prefix to the filename
273created from the template. Default is C<"">.
274
275=back
276
277Using your own subroutine is also possible, with the following parameter:
278
279=over 4
280
281=item B<filename> => I<coderef>
282
283When the C<filename> option is used, the C<template> option and the
284other template-related options (C<no_host>, C<no_dirs>, C<cut_dirs>
285and C<prefix>) are ignored.
286
287The C<filename> option expects a reference to a subroutine. The subroutine
288will receive the HTTP::Message object and must return a string which
289is the path of the file to be created (an absolute path is recommended,
290but a relative path is accepted).
291
292Returning C<""> or C<undef> will prevent the creation of the file.
293This lets a filter decide even more precisely what to save or not,
294even though this should be done in the match subroutine (see
295HTTP::Proxy's C<push�_filte()> method).
296
297=back
298
299Other options help the filter decide where and when to save:
300
301=over 4
302
303=item B<multiple> => I<boolean>
304
305With the B<multiple> option, saving the same file in the same directory
306will result in the original copy of file being preserved and the second
307copy being named F<file.1>. If that a file is saved yet again with the same
308name, the third copy will be named F<file.2>, and so on.
309
310Default is I<true>.
311
312If B<multiple> is set to I<false> then a file will be overwritten
313by the next one with the same name.
314
315=item B<timestamp> => I<boolean>
316
317With the C<timestamp> option, the decision as to whether or not to save
318a newer copy of a file depends on the local and remote timestamp and
319size of the file.
320
321The file is saved only if the date given in the C<Last-Modified> is more
322recent than the local file's timestamp.
323
324Default is I<false>.
325
326B<This option is not implemented.>
327
328=item B<keep_old> => I<boolean>
329
330The C<keep_old> option will prevent the file to be saved if a file
331with the same name already exists. Default is I<false>.
332
333No matter if B<multiple> is set or not, the file will I<not> be saved
334if B<keep_old> is set to true.
335
336=item B<status> => \@codes
337
338The C<status> option limits the status codes for which a response body
339will be saved. The default is C<[ 200 ]>, which prevent saving error
340pages (for 404 codes).
341
342=back
343
344=head2 Examples
345
346Given a request for the L<http://search.cpan.org/dist/HTTP-Proxy/> URI,
347the filename is computed as follows, depending on the constructor
348options:
349
350    No options          -> search.cpan.org/dist/HTTP-Proxy/index.html
351
352    no_host  => 1       -> dist/HTTP-Proxy/index.html
353
354    no_dirs  => 1       -> search.cpan.org/index.html
355
356    no_host  => 1,
357    no_dirs  => 1,
358    prefix   => 'data'  -> data/index.html
359
360    cut_dirs => 1       -> search.cpan.org/HTTP-Proxy/index.html
361
362    cut_dirs => 2       -> search.cpan.org/index.html
363
364=head1 METHODS
365
366This filter implements several methods, which are all called atuomatically:
367
368=over 4
369
370=item init()
371
372Handle all the parameters passed to the constructor to define the
373filter behaviour.
374
375=item begin()
376
377Open the file to which the data will be saved.
378
379=item filter()
380
381Save all the data that goes through to the opened file.
382
383=item end()
384
385Close the file when the whole message body has been processed.
386
387=item will_modify()
388
389This method returns a I<false> value, thus indicating to the system
390that it will not modify data passing through.
391
392=back
393
394=head1 SEE ALSO
395
396L<HTTP::Proxy>, L<HTTP::Proxy::BodyFilter>.
397
398=head1 AUTHOR
399
400Philippe "BooK" Bruhat, E<lt>book@cpan.orgE<gt>.
401
402=head1 ACKNOWLEDGMENTS
403
404Thanks to Mat Proud for asking how to store all pages which go through
405the proxy to disk, without any processing. The further discussion we
406had led to the writing of this class.
407
408Wget(1) provided the inspiration for many of the file naming options.
409
410Thanks to Nicolas Chuche for telling me about C<O_EXCL>.
411
412Thanks to Rafa�l Garcia-Suarez and David Rigaudiere for their help on
413irc while coding the nasty C<begin()> method. C<;-)>
414
415Thanks to Howard Jones for the inspiration and initial patch for the
416C<filename> option. Lucas Gonze provided a patch to make C<status>
417actually work.
418
419Thanks to Max Maischein for detecting a bug in the parameter validation
420for C<filename> (L<http://rt.cpan.org/Ticket/Display.html?id=14548>).
421
422Thanks to Mark Tilford, who found out that the
423C<filename> option was incorrectly used internally
424(L<http://rt.cpan.org/Ticket/Display.html?id=18644>).
425
426Thanks to Roland Stigge and Gunnar Wolf for
427reporting and forwarding Debian bug #433951 to CPAN RT
428(L<http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=433951>,
429L<http://rt.cpan.org/Ticket/Display.html?id=33018>).
430
431=head1 COPYRIGHT
432
433Copyright 2004-2008, Philippe Bruhat.
434
435=head1 LICENSE
436
437This module is free software; you can redistribute it or modify it under
438the same terms as Perl itself.
439
440=cut
441
442