1package HTTP::Proxy::BodyFilter::htmltext;
2
3use strict;
4use Carp;
5use HTTP::Proxy::BodyFilter;
6use vars qw( @ISA );
7@ISA = qw( HTTP::Proxy::BodyFilter );
8
9sub init {
10    croak "Parameter must be a CODE reference" unless ref $_[1] eq 'CODE';
11    $_[0]->{_filter} = $_[1];
12}
13
14sub begin { $_[0]->{js} = 0; }    # per message initialisation
15
16sub filter {
17    my ( $self, $dataref, $message, $protocol, $buffer ) = @_;
18
19    my $pos = pos($$dataref) = 0;
20  SCAN:
21    {
22        $pos = pos($$dataref);
23        $$dataref =~ /\G<\s*(?:script|style)[^>]*>/cgi    # protect
24          && do { $self->{js} = 1; redo SCAN; };
25        $$dataref =~ /\G<\s*\/\s*(?:script|style)[^>]*>/cgi    # unprotect
26          && do { $self->{js} = 0; redo SCAN; };
27        # comments are considered as text
28        # if you want comments as comments,
29        # use HTTP::Proxy::BodyFilter::htmlparser
30        $$dataref =~ /\G<!--/cg                  && redo SCAN;  # comment
31        $$dataref =~ /\G>/cg                     && redo SCAN;  # lost >
32        $$dataref =~ /\G(?=(<[^\s\/?%!a-z]))/cgi && goto TEXT;  # < in text
33        $$dataref =~ /\G(?:<[^>]*>)+/cg          && redo SCAN;  # tags
34        $$dataref =~ /\G(?:&[^\s;]*;?)+/cg       && redo SCAN;  # entities
35        $$dataref =~ /\G([^<>&]+)/cg             && do {        # text
36          TEXT:
37            redo SCAN if $self->{js};    # ignore protected
38            {
39                local $_ = $1;
40                $self->{_filter}->();
41                substr( $$dataref, $pos, length($1), $_ );
42                pos($$dataref) = $pos + length($_);
43            }
44            redo SCAN;
45        };
46    }
47}
48
491;
50
51__END__
52
53=head1 NAME
54
55HTTP::Proxy::BodyFilter::htmltext - A filter to transmogrify HTML text
56
57=head1 SYNOPSIS
58
59    use HTTP::Proxy::BodyFilter::tags;
60    use HTTP::Proxy::BodyFilter::htmltext;
61
62    # could it be any simpler?
63    $proxy->push_filter(
64        mime     => 'text/html',
65        response => HTTP::Proxy::BodyFilter::tags->new,
66        response => HTTP::Proxy::BodyFilter::htmltext->new(
67            sub { tr/a-zA-z/n-za-mN-ZA-M/ }
68        )
69    );
70
71=head1 DESCRIPTION
72
73The HTTP::Proxy::BodyFilter::htmltext is a filter spawner that
74calls the callback of your choice on any HTML text (outside
75C<< <script> >> and C<< <style> >> tags, and entities).
76
77The subroutine should modify the content of C<$_> as it sees fit.
78Simple, and terribly efficient.
79
80=head1 METHODS
81
82The filter defines the following methods, called automatically:
83
84=over 4
85
86=item init()
87
88Ensures that the filter is initialised with a CODE reference.
89
90=item begin()
91
92Per page parser initialisation.
93
94=item filter()
95
96A simple HTML parser that runs the given callback on the text contained
97in the HTML data. Please look at L<HTTP::Proxy::BodyFilter::htmlparser>
98if you need something more elaborate.
99
100=back
101
102=head1 SEE ALSO
103
104L<HTTP::Proxy>, L<HTTP::Proxy::BodyFilter>,
105L<HTTP::Proxy::BodyFilter::htmlparser>.
106
107=head1 AUTHOR
108
109Philippe "BooK" Bruhat, E<lt>book@cpan.orgE<gt>.
110
111=head1 COPYRIGHT
112
113Copyright 2003-2005, Philippe Bruhat.
114
115=head1 LICENSE
116
117This module is free software; you can redistribute it or modify it under
118the same terms as Perl itself.
119
120=cut
121
122