1package HTTP::Proxy;
2
3use HTTP::Daemon;
4use HTTP::Date qw(time2str);
5use LWP::UserAgent;
6use LWP::ConnCache;
7use Fcntl ':flock';         # import LOCK_* constants
8use IO::Select;
9use Sys::Hostname;          # hostname()
10use Carp;
11
12use strict;
13use vars qw( $VERSION $AUTOLOAD @METHODS
14             @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS );
15
16require Exporter;
17@ISA    = qw(Exporter);
18@EXPORT = ();               # no export by default
19@EXPORT_OK = qw( ERROR NONE    PROXY  STATUS PROCESS SOCKET HEADERS FILTERS
20                 DATA  CONNECT ENGINE ALL );
21%EXPORT_TAGS = ( log => [@EXPORT_OK] );    # only one tag
22
23$VERSION = '0.300';
24
25my $CRLF = "\015\012";                     # "\r\n" is not portable
26
27# standard filters
28use HTTP::Proxy::HeaderFilter::standard;
29
30# constants used for logging
31use constant ERROR   => -1;    # always log
32use constant NONE    => 0;     # never log
33use constant PROXY   => 1;     # proxy information
34use constant STATUS  => 2;     # HTTP status
35use constant PROCESS => 4;     # sub-process life (and death)
36use constant SOCKET  => 8;     # low-level connections
37use constant HEADERS => 16;    # HTTP headers
38use constant FILTERS => 32;    # Messages from filters
39use constant DATA    => 64;    # Data received by the filters
40use constant CONNECT => 128;   # Data transmitted by the CONNECT method
41use constant ENGINE  => 256;   # Internal information from the Engine
42use constant ALL     => 511;   # All of the above
43
44# modules that need those constants to be defined
45use HTTP::Proxy::Engine;
46use HTTP::Proxy::FilterStack;
47
48# Methods we can forward
49my %METHODS;
50
51# HTTP (RFC 2616)
52$METHODS{http} = [qw( CONNECT DELETE GET HEAD OPTIONS POST PUT TRACE )];
53
54# WebDAV (RFC 2518)
55$METHODS{webdav} = [
56    @{ $METHODS{http} },
57    qw( COPY LOCK MKCOL MOVE PROPFIND PROPPATCH UNLOCK )
58];
59
60# Delta-V (RFC 3253)
61$METHODS{deltav} = [
62    @{ $METHODS{webdav} },
63    qw( BASELINE-CONTROL CHECKIN CHECKOUT LABEL MERGE MKACTIVITY
64        MKWORKSPACE REPORT UNCHECKOUT UPDATE VERSION-CONTROL ),
65];
66
67# the whole method list
68@METHODS = HTTP::Proxy->known_methods();
69
70# useful regexes (from RFC 2616 BNF grammar)
71my %RX;
72$RX{token}  = qr/[-!#\$%&'*+.0-9A-Z^_`a-z|~]+/;
73$RX{mime}   = qr($RX{token}/$RX{token});
74$RX{method} = '(?:' . join ( '|', @METHODS ) . ')';
75$RX{method} = qr/$RX{method}/;
76
77sub new {
78    my $class  = shift;
79    my %params = @_;
80
81    # some defaults
82    my %defaults = (
83        agent    => undef,
84        chunk    => 4096,
85        daemon   => undef,
86        host     => 'localhost',
87        logfh    => *STDERR,
88        logmask  => NONE,
89        max_connections => 0,
90        max_keep_alive_requests => 10,
91        port     => 8080,
92        stash    => {},
93        timeout  => 60,
94        via      => hostname() . " (HTTP::Proxy/$VERSION)",
95        x_forwarded_for => 1,
96    );
97
98    # non modifiable defaults
99    my $self = bless { conn => 0, loop => 1 }, $class;
100
101    # support for deprecated stuff
102    {
103        my %convert = (
104            maxchild => 'max_clients',
105            maxconn  => 'max_connections',
106            maxserve => 'max_keep_alive_requests',
107        );
108        while( my ($old, $new) = each %convert ) {
109            if( exists $params{$old} ) {
110               $params{$new} = delete $params{$old};
111               carp "$old is deprecated, please use $new";
112            }
113        }
114    }
115
116    # get attributes
117    $self->{$_} = exists $params{$_} ? delete( $params{$_} ) : $defaults{$_}
118      for keys %defaults;
119
120    # choose an engine with the remaining parameters
121    $self->{engine} = HTTP::Proxy::Engine->new( %params, proxy => $self );
122    $self->log( PROXY, "PROXY", "Selected engine " . ref $self->{engine} );
123
124    return $self;
125}
126
127sub known_methods {
128    my ( $class, @args ) = @_;
129
130    @args = map { lc } @args ? @args : ( keys %METHODS );
131    exists $METHODS{$_} || carp "Method group $_ doesn't exist"
132        for @args;
133    my %seen;
134    return grep { !$seen{$_}++ } map { @{ $METHODS{$_} || [] } } @args;
135}
136
137sub timeout {
138    my $self = shift;
139    my $old  = $self->{timeout};
140    if (@_) {
141        $self->{timeout} = shift;
142        $self->agent->timeout( $self->{timeout} ) if $self->agent;
143    }
144    return $old;
145}
146
147sub url {
148    my $self = shift;
149    if ( not defined $self->daemon ) {
150        carp "HTTP daemon not started yet";
151        return undef;
152    }
153    return $self->daemon->url;
154}
155
156# normal accessors
157for my $attr ( qw(
158    agent chunk daemon host logfh port request response hop_headers
159    logmask via x_forwarded_for client_headers engine
160    max_connections max_keep_alive_requests
161    )
162  )
163{
164    no strict 'refs';
165    *{"HTTP::Proxy::$attr"} = sub {
166        my $self = shift;
167        my $old  = $self->{$attr};
168        $self->{$attr} = shift if @_;
169        return $old;
170      }
171}
172
173# read-only accessors
174for my $attr (qw( conn loop client_socket )) {
175    no strict 'refs';
176    *{"HTTP::Proxy::$attr"} = sub { $_[0]{$attr} }
177}
178
179sub max_clients { shift->engine->max_clients( @_ ) }
180
181# deprecated methods are still supported
182{
183    my %convert = (
184        maxchild => 'max_clients',
185        maxconn  => 'max_connections',
186        maxserve => 'max_keep_alive_requests',
187    );
188    while ( my ( $old, $new ) = each %convert ) {
189        no strict 'refs';
190        *$old = sub {
191            carp "$old is deprecated, please use $new";
192            goto \&$new;
193        };
194    }
195}
196
197sub stash {
198    my $stash = shift->{stash};
199    return $stash unless @_;
200    return $stash->{ $_[0] } if @_ == 1;
201    return $stash->{ $_[0] } = $_[1];
202}
203
204sub new_connection { ++$_[0]{conn} }
205
206sub start {
207    my $self = shift;
208
209    $self->init;
210    $SIG{INT} = $SIG{TERM} = sub { $self->{loop} = 0 };
211
212    # the main loop
213    my $engine = $self->engine;
214    $engine->start if $engine->can('start');
215    while( $self->loop ) {
216        $engine->run;
217        last if $self->max_connections && $self->conn >= $self->max_connections;
218    }
219    $engine->stop if $engine->can('stop');
220
221    $self->log( STATUS, "STATUS",
222        "Processed " . $self->conn . " connection(s)" );
223
224    return $self->conn;
225}
226
227# semi-private init method
228sub init {
229    my $self = shift;
230
231    # must be run only once
232    return if $self->{_init}++;
233
234    $self->_init_daemon if ( !defined $self->daemon );
235    $self->_init_agent  if ( !defined $self->agent );
236
237    # specific agent config
238    $self->agent->requests_redirectable( [] );
239    $self->agent->agent('');    # for TRACE support
240    $self->agent->protocols_allowed( [qw( http https ftp gopher )] );
241
242    # standard header filters
243    $self->{headers}{request}  = HTTP::Proxy::FilterStack->new;
244    $self->{headers}{response} = HTTP::Proxy::FilterStack->new;
245
246    # the same standard filter is used to handle headers
247    my $std = HTTP::Proxy::HeaderFilter::standard->new();
248    $std->proxy( $self );
249    $self->{headers}{request}->push(  [ sub { 1 }, $std ] );
250    $self->{headers}{response}->push( [ sub { 1 }, $std ] );
251
252    # standard body filters
253    $self->{body}{request}  = HTTP::Proxy::FilterStack->new(1);
254    $self->{body}{response} = HTTP::Proxy::FilterStack->new(1);
255
256    return;
257}
258
259#
260# private init methods
261#
262
263sub _init_daemon {
264    my $self = shift;
265    my %args = (
266        LocalAddr => $self->host,
267        LocalPort => $self->port,
268        ReuseAddr => 1,
269    );
270    delete $args{LocalPort} unless $self->port;    # 0 means autoselect
271    my $daemon = HTTP::Daemon->new(%args)
272      or die "Cannot initialize proxy daemon: $!";
273    $self->daemon($daemon);
274
275    return $daemon;
276}
277
278sub _init_agent {
279    my $self  = shift;
280    my $agent = LWP::UserAgent->new(
281        env_proxy  => 1,
282        keep_alive => 2,
283        parse_head => 0,
284        timeout    => $self->timeout,
285      )
286      or die "Cannot initialize proxy agent: $!";
287    $self->agent($agent);
288    return $agent;
289}
290
291# This is the internal "loop" that lets the child process process the
292# incoming connections.
293
294sub serve_connections {
295    my ( $self, $conn ) = @_;
296    my $response;
297    $self->{client_socket} = $conn;  # read-only
298    $self->log( SOCKET, "SOCKET", "New connection from " . $conn->peerhost
299                      . ":" . $conn->peerport );
300
301    my ( $last, $served ) = ( 0, 0 );
302
303    while ( $self->loop() ) {
304        my $req;
305        {
306            local $SIG{INT} = local $SIG{TERM} = 'DEFAULT';
307            $req = $conn->get_request();
308        }
309
310        $served++;
311
312        # initialisation
313        $self->request($req);
314        $self->response(undef);
315
316        # Got a request?
317        unless ( defined $req ) {
318            $self->log( SOCKET, "SOCKET",
319                "Getting request failed: " . $conn->reason )
320                if $conn->reason ne 'No more requests from this connection';
321            return;
322        }
323        $self->log( STATUS, "REQUEST", $req->method . ' '
324           . ( $req->method eq 'CONNECT' ? $req->uri->host_port : $req->uri ) );
325
326        # can we forward this method?
327        if ( !grep { $_ eq $req->method } @METHODS ) {
328            $response = HTTP::Response->new( 501, 'Not Implemented' );
329            $response->content_type( "text/plain" );
330            $response->content(
331                "Method " . $req->method . " is not supported by this proxy." );
332            $self->response($response);
333            goto SEND;
334        }
335
336        # transparent proxying support
337        if( not defined $req->uri->scheme ) {
338            if( my $host = $req->header('Host') ) {
339                 $req->uri->scheme( 'http' );
340                 $req->uri->host( $host );
341            }
342            else {
343                $response = HTTP::Response->new( 400, 'Bad request' );
344                $response->content_type( "text/plain" );
345                $response->content("Can't do transparent proxying without a Host: header.");
346                $self->response($response);
347                goto SEND;
348            }
349        }
350
351        # can we serve this protocol?
352        if ( !$self->is_protocol_supported( my $s = $req->uri->scheme ) )
353        {
354            # should this be 400 Bad Request?
355            $response = HTTP::Response->new( 501, 'Not Implemented' );
356            $response->content_type( "text/plain" );
357            $response->content("Scheme $s is not supported by this proxy.");
358            $self->response($response);
359            goto SEND;
360        }
361
362        # select the request filters
363        $self->{$_}{request}->select_filters( $req ) for qw( headers body );
364
365        # massage the request
366        $self->{headers}{request}->filter( $req->headers, $req );
367
368        # FIXME I don't know how to get the LWP::Protocol objet...
369        # NOTE: the request is always received in one piece
370        $self->{body}{request}->filter( $req->content_ref, $req, undef );
371        $self->{body}{request}->eod;    # end of data
372        $self->log( HEADERS, "REQUEST", $req->headers->as_string );
373
374        # CONNECT method is a very special case
375        if( ! defined $self->response and $req->method eq 'CONNECT' ) {
376            $last = $self->_handle_CONNECT($served);
377            return if $last;
378        }
379
380        # the header filters created a response,
381        # we won't contact the origin server
382        # FIXME should the response header and body be filtered?
383        goto SEND if defined $self->response;
384
385        # FIXME - don't forward requests to ourselves!
386
387        # pop a response
388        my ( $sent, $chunked ) = ( 0, 0 );
389        $response = $self->agent->simple_request(
390            $req,
391            sub {
392                my ( $data, $response, $proto ) = @_;
393
394                # first time, filter the headers
395                if ( !$sent ) {
396                    $sent++;
397                    $self->response( $response );
398
399                    # select the response filters
400                    $self->{$_}{response}->select_filters( $response )
401                      for qw( headers body );
402
403                    $self->{headers}{response}
404                         ->filter( $response->headers, $response );
405                    ( $last, $chunked ) =
406                      $self->_send_response_headers( $served );
407                }
408
409                # filter and send the data
410                $self->log( DATA, "DATA",
411                    "got " . length($data) . " bytes of body data" );
412                $self->{body}{response}->filter( \$data, $response, $proto );
413                if ($chunked) {
414                    printf $conn "%x$CRLF%s$CRLF", length($data), $data
415                      if length($data);    # the filter may leave nothing
416                }
417                else { print $conn $data; }
418            },
419            $self->chunk
420        );
421
422        # remove the header added by LWP::UA before it sends the response back
423        $response->remove_header('Client-Date');
424
425        # the callback is not called by LWP::UA->request
426        # in some cases (HEAD, redirect, error responses have no body)
427        if ( !$sent ) {
428            $self->response($response);
429            $self->{$_}{response}->select_filters( $response )
430              for qw( headers body );
431            $self->{headers}{response}
432                 ->filter( $response->headers, $response );
433        }
434
435        # do a last pass, in case there was something left in the buffers
436        my $data = "";    # FIXME $protocol is undef here too
437        $self->{body}{response}->filter_last( \$data, $response, undef );
438        if ( length $data ) {
439            if ($chunked) {
440                printf $conn "%x$CRLF%s$CRLF", length($data), $data;
441            }
442            else { print $conn $data; }
443        }
444
445        # last chunk
446        print $conn "0$CRLF$CRLF" if $chunked;    # no trailers either
447        $self->response($response);
448
449        # what about X-Died and X-Content-Range?
450        if( my $died = $response->header('X-Died') ) {
451            $self->log( ERROR, "ERROR", $died );
452            $sent = 0;
453            $response = HTTP::Response->new( 500, "Proxy filter error" );
454            $response->content_type( "text/plain" );
455            $response->content($died);
456            $self->response($response);
457        }
458
459      SEND:
460
461        $response = $self->response ;
462
463        # responses that weren't filtered through callbacks
464        # (empty body or error)
465        # FIXME some error response headers might not be filtered
466        if ( !$sent ) {
467            ($last, $chunked) = $self->_send_response_headers( $served );
468            my $content = $response->content;
469            if ($chunked) {
470                printf $conn "%x$CRLF%s$CRLF", length($content), $content
471                  if length($content);    # the filter may leave nothing
472                print $conn "0$CRLF$CRLF";
473            }
474            else { print $conn $content; }
475        }
476
477        # FIXME ftp, gopher
478        $conn->print( $response->content )
479          if defined $req->uri->scheme
480             and $req->uri->scheme =~ /^(?:ftp|gopher)$/
481             and $response->is_success;
482
483        $self->log( SOCKET, "SOCKET", "Connection closed by the proxy" ), last
484          if $last || $served >= $self->max_keep_alive_requests;
485    }
486    $self->log( SOCKET, "SOCKET", "Connection closed by the client" )
487      if !$last
488      and $served < $self->max_keep_alive_requests;
489    $self->log( PROCESS, "PROCESS", "Served $served requests" );
490    $conn->close;
491}
492
493# INTERNAL METHOD
494# send the response headers for the proxy
495# expects $served  (number of requests served)
496# returns $last and $chunked (last request served, chunked encoding)
497sub _send_response_headers {
498    my ( $self, $served ) = @_;
499    my ( $last, $chunked ) = ( 0, 0 );
500    my $conn = $self->client_socket;
501    my $response = $self->response;
502
503    # correct headers
504    $response->remove_header("Content-Length")
505      if $self->{body}{response}->will_modify();
506    $response->header( Server => "HTTP::Proxy/$VERSION" )
507      unless $response->header( 'Server' );
508    $response->header( Date => time2str(time) )
509      unless $response->header( 'Date' );
510
511    # this is adapted from HTTP::Daemon
512    if ( $conn->antique_client ) { $last++ }
513    else {
514        my $code = $response->code;
515        $conn->send_status_line( $code, $response->message,
516            $self->request()->protocol() );
517        if ( $code =~ /^(1\d\d|[23]04)$/ ) {
518
519            # make sure content is empty
520            $response->remove_header("Content-Length");
521            $response->content('');
522        }
523        elsif ( $response->request && $response->request->method eq "HEAD" )
524        {    # probably OK, says HTTP::Daemon
525        }
526        else {
527            if ( $conn->proto_ge("HTTP/1.1") ) {
528                $chunked++;
529                $response->push_header( "Transfer-Encoding" => "chunked" );
530                $response->push_header( "Connection"        => "close" )
531                  if $served >= $self->max_keep_alive_requests;
532            }
533            else {
534                $last++;
535                $conn->force_last_request;
536            }
537        }
538        print $conn $response->headers_as_string($CRLF);
539        print $conn $CRLF;    # separates headers and content
540    }
541    $self->log( STATUS,  "RESPONSE", $response->status_line );
542    $self->log( HEADERS, "RESPONSE", $response->headers->as_string );
543    return ($last, $chunked);
544}
545
546# INTERNAL method
547# FIXME no man-in-the-middle for now
548sub _handle_CONNECT {
549    my ($self, $served) = @_;
550    my $last = 0;
551
552    my $conn = $self->client_socket;
553    my $req  = $self->request;
554    my $upstream;
555
556    # connect upstream
557    if ( my $up = $self->agent->proxy('http') ) {
558
559        # clean up authentication info from proxy URL
560        $up =~ s{^http://[^/\@]*\@}{http://};
561
562        # forward to upstream proxy
563        $self->log( PROXY, "PROXY",
564            "Forwarding CONNECT request to next proxy: $up" );
565        my $response = $self->agent->simple_request($req);
566
567        # check the upstream proxy's response
568        my $code = $response->code;
569        if ( $code == 407 ) {    # don't forward Proxy Authentication requests
570            my $response_407 = $response->as_string;
571            $response_407 =~ s/^Client-.*$//mg;
572            $response = HTTP::Response->new(502);
573            $response->content_type("text/plain");
574            $response->content( "Upstream proxy ($up) "
575                    . "requested authentication:\n\n"
576                    . $response_407 );
577            $self->response($response);
578            return $last;
579        }
580        elsif ( $code != 200 ) {    # forward every other failure
581            $self->response($response);
582            return $last;
583        }
584
585        $upstream = $response->{client_socket};
586    }
587    else {                                  # direct connection
588        $upstream = IO::Socket::INET->new( PeerAddr => $req->uri->host_port );
589    }
590
591    # no upstream socket obtained
592    if( !$upstream ) {
593        my $response = HTTP::Response->new( 500 );
594        $response->content_type( "text/plain" );
595        $response->content( "CONNECT failed: $@");
596        $self->response($response);
597        return $last;
598    }
599
600    # send the response headers (FIXME more headers required?)
601    my $response = HTTP::Response->new(200);
602    $self->response($response);
603    $self->{$_}{response}->select_filters( $response ) for qw( headers body );
604
605    $self->_send_response_headers( $served );
606
607    # we now have a TCP connection
608    $last = 1;
609
610    my $select = IO::Select->new;
611    for ( $conn, $upstream ) {
612         $_->autoflush(1);
613         $_->blocking(0);
614         $select->add($_);
615    }
616
617    # loop while there is data
618    while ( my @ready = $select->can_read ) {
619        for (@ready) {
620            my $data = "";
621            my ($sock, $peer, $from ) = $conn eq $_
622                                      ? ( $conn, $upstream, "client" )
623                                      : ( $upstream, $conn, "server" );
624
625            # read the data
626            my $read = $sock->sysread( $data, 4096 );
627
628            # check for errors
629            if(not defined $read ) {
630                $self->log( ERROR, "CONNECT", "Read undef from $from ($!)" );
631                next;
632            }
633
634            # end of connection
635            if ( $read == 0 ) {
636                $_->close for ( $sock, $peer );
637                $select->remove( $sock, $peer );
638                $self->log( SOCKET, "CONNECT", "Connection closed by the $from" );
639                $self->log( PROCESS, "PROCESS", "Served $served requests" );
640                next;
641            }
642
643            # proxy the data
644            $self->log( CONNECT, "CONNECT", "$read bytes received from $from" );
645            $peer->syswrite($data, length $data);
646        }
647    }
648    $self->log( CONNECT, "CONNECT", "End of CONNECT proxyfication");
649    return $last;
650}
651
652sub push_filter {
653    my $self = shift;
654    my %arg  = (
655        mime   => 'text/*',
656        method => join( ',', @METHODS ),
657        scheme => 'http',
658        host   => '',
659        path   => '',
660        query  => '',
661    );
662
663    # parse parameters
664    for( my $i = 0; $i < @_ ; $i += 2 ) {
665        next if $_[$i] !~ /^(mime|method|scheme|host|path|query)$/;
666        $arg{$_[$i]} = $_[$i+1];
667        splice @_, $i, 2;
668        $i -= 2;
669    }
670    croak "Odd number of arguments" if @_ % 2;
671
672    # the proxy must be initialised
673    $self->init;
674
675    # prepare the variables for the closure
676    my ( $mime, $method, $scheme, $host, $path, $query ) =
677      @arg{qw( mime method scheme host path query )};
678
679    if ( defined $mime && $mime ne '' ) {
680        $mime =~ m!/! or croak "Invalid MIME type definition: $mime";
681        $mime =~ s/\*/$RX{token}/;    #turn it into a regex
682        $mime = qr/^$mime(?:$|\s*;?)/;
683    }
684
685    my @method = split /\s*,\s*/, $method;
686    for (@method) { croak "Invalid method: $_" if !/$RX{method}/ }
687    $method = @method ? '(?:' . join ( '|', @method ) . ')' : '';
688    $method = qr/^$method$/;
689
690    my @scheme = split /\s*,\s*/, $scheme;
691    for (@scheme) {
692        croak "Unsupported scheme: $_"
693          if !$self->is_protocol_supported($_);
694    }
695    $scheme = @scheme ? '(?:' . join ( '|', @scheme ) . ')' : '';
696    $scheme = qr/$scheme/;
697
698    $host  ||= '.*'; $host  = qr/$host/i;
699    $path  ||= '.*'; $path  = qr/$path/;
700    $query ||= '.*'; $query = qr/$query/;
701
702    # push the filter and its match method on the correct stack
703    while(@_) {
704        my ($message, $filter ) = (shift, shift);
705        croak "'$message' is not a filter stack"
706          unless $message =~ /^(request|response)$/;
707
708        croak "Not a Filter reference for filter queue $message"
709          unless ref( $filter )
710          && ( $filter->isa('HTTP::Proxy::HeaderFilter')
711            || $filter->isa('HTTP::Proxy::BodyFilter') );
712
713        my $stack;
714        $stack = 'headers' if $filter->isa('HTTP::Proxy::HeaderFilter');
715        $stack = 'body'    if $filter->isa('HTTP::Proxy::BodyFilter');
716
717        # MIME can only match on reponse
718        my $mime = $mime;
719        undef $mime if $message eq 'request';
720
721        # compute the match sub as a closure
722        # for $self, $mime, $method, $scheme, $host, $path
723        my $match = sub {
724            return 0
725              if ( defined $mime )
726              && ( $self->response->content_type || '' ) !~ $mime;
727            return 0 if ( $self->{request}->method || '' ) !~ $method;
728            return 0 if ( $self->{request}->uri->scheme    || '' ) !~ $scheme;
729            return 0 if ( $self->{request}->uri->authority || '' ) !~ $host;
730            return 0 if ( $self->{request}->uri->path      || '' ) !~ $path;
731            return 0 if ( $self->{request}->uri->query     || '' ) !~ $query;
732            return 1;    # it's a match
733        };
734
735        # push it on the corresponding FilterStack
736        $self->{$stack}{$message}->push( [ $match, $filter ] );
737        $filter->proxy( $self );
738    }
739}
740
741sub is_protocol_supported {
742    my ( $self, $scheme ) = @_;
743    my $ok = 1;
744    if ( !$self->agent->is_protocol_supported($scheme) ) {
745
746        # double check, in case a dummy scheme was added
747        # to be handled directly by a filter
748        $ok = 0;
749        $scheme eq $_ && $ok++ for @{ $self->agent->protocols_allowed };
750    }
751    $ok;
752}
753
754sub log {
755    my $self  = shift;
756    my $level = shift;
757    my $fh    = $self->logfh;
758
759    return unless $self->logmask & $level || $level == ERROR;
760
761    my ( $prefix, $msg ) = ( @_, '' );
762    my @lines = split /\n/, $msg;
763    @lines = ('') if not @lines;
764
765    flock( $fh, LOCK_EX );
766    print $fh "[" . localtime() . "] ($$) $prefix: $_\n" for @lines;
767    flock( $fh, LOCK_UN );
768}
769
7701;
771
772__END__
773
774=head1 NAME
775
776HTTP::Proxy - A pure Perl HTTP proxy
777
778=head1 SYNOPSIS
779
780    use HTTP::Proxy;
781
782    # initialisation
783    my $proxy = HTTP::Proxy->new( port => 3128 );
784
785    # alternate initialisation
786    my $proxy = HTTP::Proxy->new;
787    $proxy->port( 3128 ); # the classical accessors are here!
788
789    # this is a MainLoop-like method
790    $proxy->start;
791
792=head1 DESCRIPTION
793
794This module implements a HTTP proxy, using a L<HTTP::Daemon> to accept
795client connections, and a LWP::UserAgent to ask for the requested pages.
796
797The most interesting feature of this proxy object is its ability to
798filter the HTTP requests and responses through user-defined filters.
799
800Once the proxy is created, with the C<new()> method, it is possible
801to alter its behaviour by adding so-called "filters". This is
802done by the C<push_filter()> method. Once the filter is ready to
803run, it can be launched, with the C<start()> method. This method
804does not normally return until the proxy is killed or otherwise
805stopped.
806
807An important thing to note is that the proxy is (except when running
808the C<NoFork> engine) a I<forking> proxy: it doesn't support passing
809information between child processes, and you can count on reliable
810information passing only during a single HTTP connection (request +
811response).
812
813=head1 FILTERS
814
815You can alter the way the default L<HTTP::Proxy> works by plugging callbacks
816(filter objects, actually) at different stages of the request/response
817handling.
818
819When a request is received by the L<HTTP::Proxy> object, it is filtered through
820a standard filter that transform this request accordingly to RFC 2616
821(by adding the C<Via:> header, and a few other transformations). This is
822the default, bare minimum behaviour.
823
824The response is also filtered in the same manner. There is a total of four
825filter chains: C<request-headers>, C<request-body>, C<reponse-headers> and
826C<response-body>.
827
828You can add your own filters to the default ones with the
829C<push_filter()> method. The method pushes a filter on the appropriate
830filter stack.
831
832    $proxy->push_filter( response => $filter );
833
834The headers/body category is determined by the base class of the filter.
835There are two base classes for filters, which are
836L<HTTP::Proxy::HeaderFilter> and L<HTTP::Proxy::BodyFilter> (the names
837are self-explanatory). See the documentation of those two classes
838to find out how to write your own header or body filters.
839
840The named parameter is used to determine the request/response part.
841
842It is possible to push the same filter on the request and response
843stacks, as in the following example:
844
845    $proxy->push_filter( request => $filter, response => $filter );
846
847If several filters match the message, they will be applied in the order
848they were pushed on their filter stack.
849
850Named parameters can be used to create the match routine. They are:
851
852    method - the request method
853    scheme - the URI scheme
854    host   - the URI authority (host:port)
855    path   - the URI path
856    query  - the URI query string
857    mime   - the MIME type (for a response-body filter)
858
859The filters are applied only when all the the parameters match the
860request or the response. All these named parameters have default values,
861which are:
862
863    method => 'OPTIONS,GET,HEAD,POST,PUT,DELETE,TRACE,CONNECT'
864    scheme => 'http'
865    host   => ''
866    path   => ''
867    query  => ''
868    mime   => 'text/*'
869
870The C<mime> parameter is a glob-like string, with a required C</>
871character and a C<*> as a joker. Thus, C<*/*> matches I<all> responses,
872and C<""> those with no C<Content-Type:> header. To match any
873reponse (with or without a C<Content-Type:> header), use C<undef>.
874
875The C<mime> parameter is only meaningful with the C<response-body>
876filter stack. It is ignored if passed to any other filter stack.
877
878The C<method> and C<scheme> parameters are strings consisting of
879comma-separated values. The C<host> and C<path> parameters are regular
880expressions.
881
882A match routine is compiled by the proxy and used to check if a particular
883request or response must be filtered through a particular filter.
884
885It is also possible to push several filters on the same stack with
886the same match subroutine:
887
888    # convert italics to bold
889    $proxy->push_filter(
890        mime     => 'text/html',
891        response => HTTP::Proxy::BodyFilter::tags->new(),
892        response => HTTP::Proxy::BodyFilter::simple->new(
893            sub { ${ $_[1] } =~ s!(</?)i>!$1b>!ig }
894        )
895    );
896
897For more details regarding the creation of new filters, check the
898L<HTTP::Proxy::HeaderFilter> and L<HTTP::Proxy::BodyFilter> documentation.
899
900Here's an example of subclassing a base filter class:
901
902    # fixes a common typo ;-)
903    # but chances are that this will modify a correct URL
904    {
905        package FilterPerl;
906        use base qw( HTTP::Proxy::BodyFilter );
907
908        sub filter {
909            my ( $self, $dataref, $message, $protocol, $buffer ) = @_;
910            $$dataref =~ s/PERL/Perl/g;
911        }
912    }
913    $proxy->push_filter( response => FilterPerl->new() );
914
915Other examples can be found in the documentation for
916L<HTTP::Proxy::HeaderFilter>, L<HTTP::Proxy::BodyFilter>,
917L<HTTP::Proxy::HeaderFilter::simple>, L<HTTP::Proxy::BodyFilter::simple>.
918
919    # a simple anonymiser
920    # see eg/anonymiser.pl for the complete code
921    $proxy->push_filter(
922        mime    => undef,
923        request => HTTP::Proxy::HeaderFilter::simple->new(
924            sub { $_[1]->remove_header(qw( User-Agent From Referer Cookie )) },
925        ),
926        response => HTTP::Proxy::HeaderFilter::simple->new(
927            sub { $_[1]->remove_header(qw( Set-Cookie )); },
928        )
929    );
930
931IMPORTANT: If you use your own L<LWP::UserAgent>, you must install it
932before your calls to C<push_filter()>, otherwise
933the match method will make wrong assumptions about the schemes your
934agent supports.
935
936NOTE: It is likely that possibility of changing the agent or the daemon
937may disappear in future versions.
938
939=head1 METHODS
940
941=head2 Constructor and initialisation
942
943=over 4
944
945=item new()
946
947The C<new()> method creates a new L<HTTP::Proxy> object. All attributes can
948be passed as parameters to replace the default.
949
950Parameters that are not L<HTTP::Proxy> attributes will be ignored and
951passed to the chosen L<HTTP::Proxy::Engine> object.
952
953=item init()
954
955C<init()> initialise the proxy without starting it. It is usually not
956needed.
957
958This method is called by C<start()> if needed.
959
960=item push_filter()
961
962The C<push_filter()> method is used to add filters to the proxy.
963It is fully described in section L<FILTERS>.
964
965=back
966
967=head2 Accessors and mutators
968
969L<HTTP::Proxy> class has several accessors and mutators.
970
971Called with arguments, the accessor returns the current value.
972Called with a single argument, it sets the current value and
973returns the previous one, in case you want to keep it.
974
975If you call a read-only accessor with a parameter, this parameter
976will be ignored.
977
978The defined accessors are (in alphabetical order):
979
980=over 4
981
982=item agent
983
984The L<LWP::UserAgent> object used internally to connect to remote sites.
985
986=item chunk
987
988The chunk size for the L<LWP::UserAgent> callbacks.
989
990=item client_socket (read-only)
991
992The socket currently connected to the client. Mostly useful in filters.
993
994=item client_headers
995
996This attribute holds a reference to the client headers set up by
997L<LWP::UserAgent>
998(C<Client-Aborted>, C<Client-Bad-Header-Line>, C<Client-Date>,
999C<Client-Junk>, C<Client-Peer>, C<Client-Request-Num>,
1000C<Client-Response-Num>, C<Client-SSL-Cert-Issuer>,
1001C<Client-SSL-Cert-Subject>, C<Client-SSL-Cipher>, C<Client-SSL-Warning>,
1002C<Client-Transfer-Encoding>, C<Client-Warning>).
1003
1004They are removed by the filter L<HTTP::Proxy::HeaderFilter::standard> from
1005the request and response objects received by the proxy.
1006
1007If a filter (such as a SSL certificate verification filter) need to
1008access them, it must do it through this accessor.
1009
1010=item conn (read-only)
1011
1012The number of connections processed by this L<HTTP::Proxy> instance.
1013
1014=item daemon
1015
1016The L<HTTP::Daemon> object used to accept incoming connections.
1017(You usually never need this.)
1018
1019=item engine
1020
1021The L<HTTP::Proxy::Engine> object that manages the child processes.
1022
1023=item hop_headers
1024
1025This attribute holds a reference to the hop-by-hop headers
1026(C<Connection>, C<Keep-Alive>, C<Proxy-Authenticate>, C<Proxy-Authorization>,
1027C<TE>, C<Trailers>, C<Transfer-Encoding>, C<Upgrade>).
1028
1029They are removed by the filter HTTP::Proxy::HeaderFilter::standard from
1030the request and response objects received by the proxy.
1031
1032If a filter (such as a proxy authorisation filter) need to access them,
1033it must do it through this accessor.
1034
1035=item host
1036
1037The proxy L<HTTP::Daemon> host (default: 'localhost').
1038
1039This means that by default, the proxy answers only to clients on the
1040local machine. You can pass a specific interface address or C<"">/C<undef>
1041for any interface.
1042
1043This default prevents your proxy to be used as an anonymous proxy
1044by script kiddies.
1045
1046=item known_methods( @groups ) (read-only)
1047
1048This method returns all HTTP (and extensions to HTTP) known to
1049C<HTTP::Proxy>. Methods are grouped by type. Known method groups are:
1050C<HTTP>, C<WebDAV> and C<DeltaV>.
1051
1052Called with an empty list, this method will return all known methods.
1053This method is case-insensitive, and will C<carp()> if an unknown
1054group name is passed.
1055
1056=item logfh
1057
1058A filehandle to a logfile (default: C<*STDERR>).
1059
1060=item logmask( [$mask] )
1061
1062Be verbose in the logs (default: C<NONE>).
1063
1064Here are the various elements that can be added to the mask (their values
1065are powers of 2, starting from 0 and listed here in ascending order):
1066
1067    NONE    - Log only errors
1068    PROXY   - Proxy information
1069    STATUS  - Requested URL, reponse status and total number
1070              of connections processed
1071    PROCESS - Subprocesses information (fork, wait, etc.)
1072    SOCKET  - Information about low-level sockets
1073    HEADERS - Full request and response headers are sent along
1074    FILTERS - Filter information
1075    DATA    - Data received by the filters
1076    CONNECT - Data transmitted by the CONNECT method
1077    ENGINE  - Engine information
1078    ALL     - Log all of the above
1079
1080If you only want status and process information, you can use:
1081
1082    $proxy->logmask( STATUS | PROCESS );
1083
1084Note that all the logging constants are not exported by default, but
1085by the C<:log> tag. They can also be exported one by one.
1086
1087=item loop (read-only)
1088
1089Internal. False when the main loop is about to be broken.
1090
1091=item max_clients
1092
1093=item maxchild
1094
1095The maximum number of child process the L<HTTP::Proxy> object will spawn
1096to handle client requests (default: depends on the engine).
1097
1098This method is currently delegated to the L<HTTP::Proxy::Engine> object.
1099
1100C<maxchild> is deprecated and will disappear.
1101
1102=item max_connections
1103
1104=item maxconn
1105
1106The maximum number of TCP connections the proxy will accept before
1107returning from start(). 0 (the default) means never stop accepting
1108connections.
1109
1110C<maxconn> is deprecated.
1111
1112Note: C<max_connections> will be deprecated soon, for two reasons: 1)
1113it is more of an L<HTTP::Proxy::Engine> attribute, 2) not all engines will
1114support it.
1115
1116=item max_keep_alive_requests
1117
1118=item maxserve
1119
1120The maximum number of requests the proxy will serve in a single connection.
1121(same as C<MaxRequestsPerChild> in Apache)
1122
1123C<maxserve> is deprecated.
1124
1125=item port
1126
1127The proxy L<HTTP::Daemon> port (default: 8080).
1128
1129=item request
1130
1131The request originaly received by the proxy from the user-agent, which
1132will be modified by the request filters.
1133
1134=item response
1135
1136The response received from the origin server by the proxy. It is
1137normally C<undef> until the proxy actually receives the beginning
1138of a response from the origin server.
1139
1140If one of the request filters sets this attribute, it "short-circuits"
1141the request/response scheme, and the proxy will return this response
1142(which is NOT filtered through the response filter stacks) instead of
1143the expected origin server response. This is useful for caching (though
1144Squid does it much better) and proxy authentication, for example.
1145
1146=item stash
1147
1148The stash is a hash where filters can store data to share between them.
1149
1150The stash() method can be used to set the whole hash (with a HASH reference).
1151To access individual keys simply do:
1152
1153    $proxy->stash( 'bloop' );
1154
1155To set it, type:
1156
1157    $proxy->stash( bloop => 'owww' );
1158
1159It's also possibly to get a reference to the stash:
1160
1161    my $s = $filter->proxy->stash();
1162    $s->{bang} = 'bam';
1163
1164    # $proxy->stash( 'bang' ) will now return 'bam'
1165
1166B<Warning:> since the proxy forks for each TCP connection, the data is
1167only shared between filters in the same child process.
1168
1169=item timeout
1170
1171The timeout used by the internal L<LWP::UserAgent> (default: 60).
1172
1173=item url (read-only)
1174
1175The url where the proxy can be reached.
1176
1177=item via
1178
1179The content of the Via: header. Setting it to an empty string will
1180prevent its addition. (default: C<$hostname (HTTP::Proxy/$VERSION)>)
1181
1182=item x_forwarded_for
1183
1184If set to a true value, the proxy will send the C<X-Forwarded-For:> header.
1185(default: true)
1186
1187=back
1188
1189=head2 Connection handling methods
1190
1191=over 4
1192
1193=item start()
1194
1195This method works like Tk's C<MainLoop>: you hand over control to the
1196L<HTTP::Proxy> object you created and configured.
1197
1198If C<maxconn> is not zero, C<start()> will return after accepting
1199at most that many connections. It will return the total number of
1200connexions.
1201
1202=item serve_connections()
1203
1204This is the internal method used to handle each new TCP connection
1205to the proxy.
1206
1207=back
1208
1209=head2 Other methods
1210
1211=over 4
1212
1213=item log( $level, $prefix, $message )
1214
1215Adds C<$message> at the end of C<logfh>, if $level matches C<logmask>.
1216The C<log()> method also prints a timestamp.
1217
1218The output looks like:
1219
1220    [Thu Dec  5 12:30:12 2002] ($$) $prefix: $message
1221
1222where C<$$> is the current processus id.
1223
1224If C<$message> is a multiline string, several log lines will be output,
1225each line starting with C<$prefix>.
1226
1227=item is_protocol_supported( $scheme )
1228
1229Returns a boolean indicating if $scheme is supported by the proxy.
1230
1231This method is only used internaly.
1232
1233It is essential to allow L<HTTP::Proxy> users to create "pseudo-schemes"
1234that LWP doesn't know about, but that one of the proxy filters can handle
1235directly. New schemes are added as follows:
1236
1237    $proxy->init();    # required to get an agent
1238    $proxy->agent->protocols_allowed(
1239        [ @{ $proxy->agent->protocols_allowed }, 'myhttp' ] );
1240
1241=item new_connection()
1242
1243Increase the proxy's TCP connections counter. Only used by
1244L<HTTP::Proxy::Engine> objects.
1245
1246=back
1247
1248=head2 Apache-like attributes
1249
1250L<HTTP::Proxy> has several Apache-like attributes that control the
1251way the HTTP and TCP connections are handled.
1252
1253The following attributes control the TCP connection. They are passed to
1254the underlying L<HTTP::Proxy::Engine>, which may (or may not) use them
1255to change its behaviour.
1256
1257=over 4
1258
1259=item start_servers
1260
1261Number of child process to fork at the beginning.
1262
1263=item max_clients
1264
1265Maximum number of concurrent TCP connections (i.e. child processes).
1266
1267=item max_requests_per_child
1268
1269Maximum number of TCP connections handled by the same child process.
1270
1271=item min_spare_servers
1272
1273Minimum number of inactive child processes.
1274
1275=item max_spare_servers
1276
1277Maximum number of inactive child processes.
1278
1279=back
1280
1281Those attributes control the HTTP connection:
1282
1283=over 4
1284
1285=item keep_alive
1286
1287Support for keep alive HTTP connections.
1288
1289=item max_keep_alive_requests
1290
1291Maximum number of HTTP connections within a single TCP connection.
1292
1293=item keep_alive_timeout
1294
1295Timeout for keep-alive connection.
1296
1297=back
1298
1299=head1 EXPORTED SYMBOLS
1300
1301No symbols are exported by default. The C<:log> tag exports all the
1302logging constants.
1303
1304=head1 BUGS
1305
1306This module does not work under Windows, but I can't see why, and do not
1307have a development platform under that system. Patches and explanations
1308very welcome.
1309
1310I guess it is because C<fork()> is not well supported.
1311
1312    $proxy->maxchild(0);
1313
1314=over 4
1315
1316=item However, David Fishburn says:
1317
1318This did not work for me under WinXP - ActiveState Perl 5.6, but it DOES
1319work on WinXP ActiveState Perl 5.8.
1320
1321=back
1322
1323Several people have tried to help, but we haven't found a way to make it work
1324correctly yet.
1325
1326As from version 0.16, the default engine is L<HTTP::Proxy::Engine::NoFork>.
1327Let me know if it works better.
1328
1329=head1 SEE ALSO
1330
1331L<HTTP::Proxy::Engine>, L<HTTP::Proxy::BodyFilter>,
1332L<HTTP::Proxy::HeaderFilter>, the examples in F<eg/>.
1333
1334=head1 AUTHOR
1335
1336Philippe "BooK" Bruhat, E<lt>book@cpan.orgE<gt>.
1337
1338There is also a mailing-list: http-proxy@mongueurs.net for general
1339discussion about L<HTTP::Proxy>.
1340
1341=head1 THANKS
1342
1343Many people helped me during the development of this module, either on
1344mailing-lists, IRC or over a beer in a pub...
1345
1346So, in no particular order, thanks to the libwww-perl team for such a
1347terrific suite of modules, perl-qa (tips for testing), the French Perl
1348I<Mongueurs> (for code tricks, beers and encouragements) and my growing
1349user base... C<;-)>
1350
1351I'd like to particularly thank Dan Grigsby, who's been using
1352L<HTTP::Proxy> since 2003 (before the filter classes even existed).  He is
1353apparently making a living from a product based on L<HTTP::Proxy>. Thanks
1354a lot for your confidence in my work!
1355
1356=head1 COPYRIGHT
1357
1358Copyright 2002-2013, Philippe Bruhat.
1359
1360=head1 LICENSE
1361
1362This module is free software; you can redistribute it or modify it under
1363the same terms as Perl itself.
1364
1365=cut
1366
1367