1package HTTP::Proxy::BodyFilter::htmltext; 2 3use strict; 4use Carp; 5use HTTP::Proxy::BodyFilter; 6use vars qw( @ISA ); 7@ISA = qw( HTTP::Proxy::BodyFilter ); 8 9sub init { 10 croak "Parameter must be a CODE reference" unless ref $_[1] eq 'CODE'; 11 $_[0]->{_filter} = $_[1]; 12} 13 14sub begin { $_[0]->{js} = 0; } # per message initialisation 15 16sub filter { 17 my ( $self, $dataref, $message, $protocol, $buffer ) = @_; 18 19 my $pos = pos($$dataref) = 0; 20 SCAN: 21 { 22 $pos = pos($$dataref); 23 $$dataref =~ /\G<\s*(?:script|style)[^>]*>/cgi # protect 24 && do { $self->{js} = 1; redo SCAN; }; 25 $$dataref =~ /\G<\s*\/\s*(?:script|style)[^>]*>/cgi # unprotect 26 && do { $self->{js} = 0; redo SCAN; }; 27 # comments are considered as text 28 # if you want comments as comments, 29 # use HTTP::Proxy::BodyFilter::htmlparser 30 $$dataref =~ /\G<!--/cg && redo SCAN; # comment 31 $$dataref =~ /\G>/cg && redo SCAN; # lost > 32 $$dataref =~ /\G(?=(<[^\s\/?%!a-z]))/cgi && goto TEXT; # < in text 33 $$dataref =~ /\G(?:<[^>]*>)+/cg && redo SCAN; # tags 34 $$dataref =~ /\G(?:&[^\s;]*;?)+/cg && redo SCAN; # entities 35 $$dataref =~ /\G([^<>&]+)/cg && do { # text 36 TEXT: 37 redo SCAN if $self->{js}; # ignore protected 38 { 39 local $_ = $1; 40 $self->{_filter}->(); 41 substr( $$dataref, $pos, length($1), $_ ); 42 pos($$dataref) = $pos + length($_); 43 } 44 redo SCAN; 45 }; 46 } 47} 48 491; 50 51__END__ 52 53=head1 NAME 54 55HTTP::Proxy::BodyFilter::htmltext - A filter to transmogrify HTML text 56 57=head1 SYNOPSIS 58 59 use HTTP::Proxy::BodyFilter::tags; 60 use HTTP::Proxy::BodyFilter::htmltext; 61 62 # could it be any simpler? 63 $proxy->push_filter( 64 mime => 'text/html', 65 response => HTTP::Proxy::BodyFilter::tags->new, 66 response => HTTP::Proxy::BodyFilter::htmltext->new( 67 sub { tr/a-zA-z/n-za-mN-ZA-M/ } 68 ) 69 ); 70 71=head1 DESCRIPTION 72 73The HTTP::Proxy::BodyFilter::htmltext is a filter spawner that 74calls the callback of your choice on any HTML text (outside 75C<< <script> >> and C<< <style> >> tags, and entities). 76 77The subroutine should modify the content of C<$_> as it sees fit. 78Simple, and terribly efficient. 79 80=head1 METHODS 81 82The filter defines the following methods, called automatically: 83 84=over 4 85 86=item init() 87 88Ensures that the filter is initialised with a CODE reference. 89 90=item begin() 91 92Per page parser initialisation. 93 94=item filter() 95 96A simple HTML parser that runs the given callback on the text contained 97in the HTML data. Please look at L<HTTP::Proxy::BodyFilter::htmlparser> 98if you need something more elaborate. 99 100=back 101 102=head1 SEE ALSO 103 104L<HTTP::Proxy>, L<HTTP::Proxy::BodyFilter>, 105L<HTTP::Proxy::BodyFilter::htmlparser>. 106 107=head1 AUTHOR 108 109Philippe "BooK" Bruhat, E<lt>book@cpan.orgE<gt>. 110 111=head1 COPYRIGHT 112 113Copyright 2003-2005, Philippe Bruhat. 114 115=head1 LICENSE 116 117This module is free software; you can redistribute it or modify it under 118the same terms as Perl itself. 119 120=cut 121 122